mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-17 00:58:13 +02:00
Nvidia implements both the same way as AMD does, so it makes sense to allow for code sharing here. Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Mel Henning <mhenning@darkrefraction.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40541>
340 lines
11 KiB
C
340 lines
11 KiB
C
/*
|
|
* Copyright 2026 Valve Corporation
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "nir.h"
|
|
#include "nir_range_analysis.h"
|
|
|
|
/* Applications like DXVK spam float control restriction flags on all ALU instructions.
|
|
* Remove signed zero, Inf, NaN preserve flags if we can prove that no inputs/outputs
|
|
* are zero/inf/NaN using range analysis.
|
|
* For signed zero, we can go a step further by back propagating when signed zero are not
|
|
* needed, which is a quite common. For example, any float comparison, cosinus, exp2, log2,
|
|
* or addition with non zero value does not care about the zero sign of the inputs. Neither
|
|
* do texture coordinates.
|
|
* Drivers can also set no_signed_zero for fragment output stores based on state, blending,
|
|
* fixed point or R11G11B10 formats do not care about the sign of zero.
|
|
*
|
|
* Future work:
|
|
* For pre raster stages, position doesn't care, and we could back propagate information from
|
|
* the FS for varyings, and interpolated varyings do not care anyway.
|
|
*/
|
|
|
|
struct opt_fp_ctrl_state {
|
|
nir_fp_analysis_state fp_class_state;
|
|
};
|
|
|
|
static inline nir_block *
|
|
block_get_loop_preheader(nir_block *block)
|
|
{
|
|
nir_cf_node *parent = block->cf_node.parent;
|
|
if (parent->type != nir_cf_node_loop)
|
|
return NULL;
|
|
if (block != nir_cf_node_cf_tree_first(parent))
|
|
return NULL;
|
|
return nir_cf_node_as_block(nir_cf_node_prev(parent));
|
|
}
|
|
|
|
|
|
static bool
|
|
src_mark_preserve_sz(nir_src *src, UNUSED void *state)
|
|
{
|
|
nir_def_instr(src->ssa)->pass_flags = true;
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
can_prop_nsz(nir_alu_instr *alu)
|
|
{
|
|
/* Only divide cares about the sign of zero even when the sign of zero
|
|
* of the output doesn't matter.
|
|
*/
|
|
switch (alu->op) {
|
|
case nir_op_fdiv:
|
|
case nir_op_frcp:
|
|
case nir_op_frsq:
|
|
case nir_op_fcopysign_pco:
|
|
return false;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
static bool
|
|
opt_alu_fp_math_ctrl(nir_alu_instr *alu, struct opt_fp_ctrl_state *state)
|
|
{
|
|
if (alu->op == nir_op_bcsel) {
|
|
src_mark_preserve_sz(&alu->src[0].src, NULL);
|
|
|
|
if (alu->instr.pass_flags) {
|
|
src_mark_preserve_sz(&alu->src[1].src, NULL);
|
|
src_mark_preserve_sz(&alu->src[2].src, NULL);
|
|
}
|
|
return false;
|
|
} else if (nir_op_is_vec_or_mov(alu->op) && !alu->instr.pass_flags) {
|
|
return false;
|
|
}
|
|
|
|
const nir_op_info *op_info = &nir_op_infos[(int)alu->op];
|
|
unsigned old_fp_math_ctrl = alu->fp_math_ctrl;
|
|
if (alu->fp_math_ctrl & nir_fp_preserve_sz_inf_nan) {
|
|
fp_class_mask class_mask = 0;
|
|
|
|
bool dest_is_float = nir_alu_type_get_base_type(op_info->output_type) == nir_type_float;
|
|
|
|
if (dest_is_float) {
|
|
class_mask |= nir_analyze_fp_class(&state->fp_class_state, &alu->def);
|
|
if (can_prop_nsz(alu) && (!alu->instr.pass_flags || !(class_mask & FP_CLASS_ANY_ZERO)))
|
|
alu->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
|
}
|
|
|
|
for (unsigned i = 0; i < op_info->num_inputs; i++) {
|
|
if (nir_alu_type_get_base_type(op_info->input_types[i]) == nir_type_float)
|
|
class_mask |= nir_analyze_fp_class(&state->fp_class_state, alu->src[i].src.ssa);
|
|
}
|
|
|
|
/* If class_mask is 0, the opcode has no float operands/definition,
|
|
* So it must be a special opcode, or operate on bfloats. Ignore these.
|
|
*/
|
|
if (class_mask) {
|
|
/* If none of the float operands or the definition can be zero/Inf/NaN,
|
|
* remove the matching fp_math_ctrl flag.
|
|
*/
|
|
if (!(class_mask & FP_CLASS_ANY_ZERO))
|
|
alu->fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
|
if (!(class_mask & FP_CLASS_ANY_INF))
|
|
alu->fp_math_ctrl &= ~nir_fp_preserve_inf;
|
|
if (!(class_mask & FP_CLASS_NAN))
|
|
alu->fp_math_ctrl &= ~nir_fp_preserve_nan;
|
|
}
|
|
}
|
|
|
|
if (alu->fp_math_ctrl & nir_fp_preserve_signed_zero) {
|
|
/* Some alu never cares about the input sign of zero. */
|
|
switch (alu->op) {
|
|
case nir_op_fabs:
|
|
case nir_op_fsat:
|
|
case nir_op_fexp2:
|
|
case nir_op_flog2:
|
|
case nir_op_fcos:
|
|
case nir_op_fcos_normalized_2_pi:
|
|
case nir_op_fmulz:
|
|
case nir_op_ffract:
|
|
break;
|
|
case nir_op_fmin: {
|
|
bool had_neg_zero = false;
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
fp_class_mask fp_class = nir_analyze_fp_class(&state->fp_class_state, alu->src[i].src.ssa);
|
|
|
|
if (fp_class & (FP_CLASS_NAN | FP_CLASS_ANY_POS | FP_CLASS_POS_ZERO)) {
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
} else if (fp_class & FP_CLASS_NEG_ZERO) {
|
|
/* If both operands can be -0.0, at least one needs to be preserved. */
|
|
if (had_neg_zero)
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
had_neg_zero = true;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
case nir_op_fmax: {
|
|
bool had_pos_zero = false;
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
fp_class_mask fp_class = nir_analyze_fp_class(&state->fp_class_state, alu->src[i].src.ssa);
|
|
|
|
if (fp_class & (FP_CLASS_NAN | FP_CLASS_ANY_NEG | FP_CLASS_NEG_ZERO)) {
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
} else if (fp_class & FP_CLASS_POS_ZERO) {
|
|
/* If both operands can be +0.0, at least one needs to be preserved. */
|
|
if (had_pos_zero)
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
had_pos_zero = true;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case nir_op_fsub:
|
|
case nir_op_fadd: {
|
|
bool had_pos_zero = false;
|
|
for (unsigned i = 0; i < 2; i++) {
|
|
fp_class_mask fp_class = nir_analyze_fp_class(&state->fp_class_state, alu->src[i].src.ssa);
|
|
|
|
bool negate = i == 1 && alu->op == nir_op_fsub;
|
|
|
|
if (fp_class & (negate ? FP_CLASS_POS_ZERO : FP_CLASS_NEG_ZERO)) {
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
} else if (fp_class & (negate ? FP_CLASS_NEG_ZERO : FP_CLASS_POS_ZERO)) {
|
|
/* If both operands can be +0.0, at least one needs to be preserved. */
|
|
if (had_pos_zero)
|
|
src_mark_preserve_sz(&alu->src[!i].src, NULL);
|
|
had_pos_zero = true;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
case nir_op_ffmaz:
|
|
src_mark_preserve_sz(&alu->src[2].src, NULL);
|
|
break;
|
|
case nir_op_ffma:
|
|
if ((nir_analyze_fp_class(&state->fp_class_state, alu->src[2].src.ssa) & FP_CLASS_NEG_ZERO) &&
|
|
!nir_alu_srcs_equal(alu, alu, 0, 1)) {
|
|
src_mark_preserve_sz(&alu->src[0].src, NULL);
|
|
src_mark_preserve_sz(&alu->src[1].src, NULL);
|
|
}
|
|
src_mark_preserve_sz(&alu->src[2].src, NULL);
|
|
break;
|
|
case nir_op_fmul:
|
|
if (nir_alu_srcs_equal(alu, alu, 0, 1))
|
|
break;
|
|
FALLTHROUGH;
|
|
default:
|
|
for (unsigned i = 0; i < op_info->num_inputs; i++)
|
|
src_mark_preserve_sz(&alu->src[i].src, NULL);
|
|
break;
|
|
}
|
|
} else {
|
|
/* Only preserve signed zeros for non float operands. */
|
|
for (unsigned i = 0; i < op_info->num_inputs; i++) {
|
|
if (nir_alu_type_get_base_type(op_info->input_types[i]) != nir_type_float)
|
|
src_mark_preserve_sz(&alu->src[i].src, NULL);
|
|
}
|
|
}
|
|
|
|
return alu->fp_math_ctrl != old_fp_math_ctrl;
|
|
}
|
|
|
|
static void
|
|
prop_tex_fp_math_ctrl(nir_tex_instr *tex)
|
|
{
|
|
for (unsigned i = 0; i < tex->num_srcs; i++) {
|
|
/* Floating point tex sources don't care about sign of zero. */
|
|
if (nir_tex_instr_src_type(tex, i) != nir_type_float)
|
|
src_mark_preserve_sz(&tex->src[i].src, NULL);
|
|
}
|
|
}
|
|
|
|
static bool
|
|
opt_intrin_fp_math_ctrl(nir_intrinsic_instr *intrin)
|
|
{
|
|
switch (intrin->intrinsic) {
|
|
case nir_intrinsic_ddx:
|
|
case nir_intrinsic_ddx_coarse:
|
|
case nir_intrinsic_ddx_fine:
|
|
case nir_intrinsic_ddy:
|
|
case nir_intrinsic_ddy_coarse:
|
|
case nir_intrinsic_ddy_fine: {
|
|
unsigned fp_math_ctrl = nir_intrinsic_fp_math_ctrl(intrin);
|
|
if ((fp_math_ctrl & nir_fp_preserve_signed_zero) == 0)
|
|
return false;
|
|
|
|
if (intrin->instr.pass_flags) {
|
|
src_mark_preserve_sz(&intrin->src[0], NULL);
|
|
return false;
|
|
} else {
|
|
fp_math_ctrl &= ~nir_fp_preserve_signed_zero;
|
|
nir_intrinsic_set_fp_math_ctrl(intrin, fp_math_ctrl);
|
|
return true;
|
|
}
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (!nir_intrinsic_has_io_semantics(intrin)) {
|
|
nir_foreach_src(&intrin->instr, src_mark_preserve_sz, NULL);
|
|
return false;
|
|
}
|
|
|
|
nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
|
|
const nir_intrinsic_info *info = &nir_intrinsic_infos[(int)intrin->intrinsic];
|
|
|
|
if (info->has_dest) {
|
|
nir_foreach_src(&intrin->instr, src_mark_preserve_sz, NULL);
|
|
|
|
/* For loads, set no signed zero flag based on gathered info. */
|
|
if (!intrin->instr.pass_flags && !sem.no_signed_zero) {
|
|
sem.no_signed_zero = 1;
|
|
nir_intrinsic_set_io_semantics(intrin, sem);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
} else {
|
|
/* For stores, propagate the signed zero information for the data source. */
|
|
for (unsigned i = sem.no_signed_zero; i < info->num_srcs; i++)
|
|
src_mark_preserve_sz(&intrin->src[i], NULL);
|
|
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static bool
|
|
opt_fp_math_ctrl_impl(nir_function_impl *impl)
|
|
{
|
|
/* Setup pass flags: Store if signed zeros are needed.
|
|
* Handle loop header phis here already:
|
|
* For now, just disable opts for the back edges.
|
|
* That could be improved, but not sure if it's worth it.
|
|
*/
|
|
nir_foreach_block_reverse(block, impl) {
|
|
nir_block *preheader = block_get_loop_preheader(block);
|
|
|
|
nir_foreach_instr_reverse(instr, block) {
|
|
instr->pass_flags = false;
|
|
if (instr->type == nir_instr_type_phi && preheader) {
|
|
nir_phi_instr *phi = nir_instr_as_phi(instr);
|
|
nir_foreach_phi_src(src, phi) {
|
|
if (src->pred != preheader)
|
|
nir_def_instr(src->src.ssa)->pass_flags = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
struct opt_fp_ctrl_state state = {0};
|
|
state.fp_class_state = nir_create_fp_analysis_state(impl);
|
|
|
|
bool progress = false;
|
|
|
|
nir_foreach_block_reverse(block, impl) {
|
|
nir_foreach_instr_reverse(instr, block) {
|
|
switch (instr->type) {
|
|
case nir_instr_type_alu:
|
|
progress |= opt_alu_fp_math_ctrl(nir_instr_as_alu(instr), &state);
|
|
break;
|
|
case nir_instr_type_tex:
|
|
prop_tex_fp_math_ctrl(nir_instr_as_tex(instr));
|
|
break;
|
|
case nir_instr_type_intrinsic:
|
|
progress |= opt_intrin_fp_math_ctrl(nir_instr_as_intrinsic(instr));
|
|
break;
|
|
case nir_instr_type_phi:
|
|
if (!instr->pass_flags)
|
|
break;
|
|
FALLTHROUGH;
|
|
default:
|
|
nir_foreach_src(instr, src_mark_preserve_sz, NULL);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
nir_free_fp_analysis_state(&state.fp_class_state);
|
|
|
|
return nir_progress(progress, impl, nir_metadata_all);
|
|
}
|
|
|
|
bool
|
|
nir_opt_fp_math_ctrl(nir_shader *shader)
|
|
{
|
|
bool progress = false;
|
|
|
|
nir_foreach_function_impl(impl, shader)
|
|
progress |= opt_fp_math_ctrl_impl(impl);
|
|
|
|
return progress;
|
|
}
|