From ee3496c5ecad1c62ad3d59f8020241a742e3b3d1 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 10 Mar 2026 12:27:09 +0100 Subject: [PATCH 1/3] nir: add no_signed_zero flag to io semantics --- src/compiler/nir/nir.h | 5 ++++- src/compiler/nir/nir_opt_vectorize_io.c | 16 ++++++++++++++++ src/compiler/nir/nir_print.c | 3 +++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index f0e4a92125c..bc316f5c317 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2054,7 +2054,10 @@ typedef struct nir_io_semantics { unsigned interp_explicit_strict : 1; /* preserve original vertex order */ /* Skip nir_validate of the intrinsic. Any new code that sets it will ba NAK'd. */ unsigned no_validate : 1; - unsigned padding; + + /* Start of the second uint. */ + unsigned no_signed_zero : 1; /* set when it does not matter whether a zero input/output is -0.0 or +0.0. */ + unsigned padding : 31; } nir_io_semantics; /* Transform feedback info for 4 outputs. 
*/ diff --git a/src/compiler/nir/nir_opt_vectorize_io.c b/src/compiler/nir/nir_opt_vectorize_io.c index f22dfc92de9..5c585a37018 100644 --- a/src/compiler/nir/nir_opt_vectorize_io.c +++ b/src/compiler/nir/nir_opt_vectorize_io.c @@ -176,6 +176,18 @@ vectorize_load(nir_intrinsic_instr *chan[8], unsigned start, unsigned count, nir_io_semantics sem = nir_intrinsic_io_semantics(new_intr); + for (unsigned i = start; i < start + count; i++) { + if (chan[i]) { + if (!nir_intrinsic_io_semantics(chan[i]).no_signed_zero) + sem.no_signed_zero = 0; + } + + if (step == merge_low_high_16_to_32 && chan[4 + i]) { + if (!nir_intrinsic_io_semantics(chan[4 + i]).no_signed_zero) + sem.no_signed_zero = 0; + } + } + if (step == vectorize_high_16_separately) { assert(start >= 4); sem.high_16bits = 1; @@ -298,6 +310,8 @@ vectorize_store(nir_intrinsic_instr *chan[8], unsigned start, unsigned count, sem.no_sysval_output = 0; if (!nir_intrinsic_io_semantics(chan[i]).no_varying) sem.no_varying = 0; + if (!nir_intrinsic_io_semantics(chan[i]).no_signed_zero) + sem.no_signed_zero = 0; } if (step == merge_low_high_16_to_32) { @@ -307,6 +321,8 @@ vectorize_store(nir_intrinsic_instr *chan[8], unsigned start, unsigned count, sem.no_sysval_output = 0; if (!nir_intrinsic_io_semantics(chan[4 + i]).no_varying) sem.no_varying = 0; + if (!nir_intrinsic_io_semantics(chan[4 + i]).no_signed_zero) + sem.no_signed_zero = 0; } /* Update the type. 
*/ diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 387190a57cd..48eb2116e7d 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -1601,6 +1601,9 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) if (io.no_validate) fprintf(fp, " no_validate"); + if (io.no_signed_zero) + fprintf(fp, " no_signed_zero"); + break; } From 25759ed3c7bad2f8571aa41e13c1a46c752246a4 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 10 Mar 2026 13:24:37 +0100 Subject: [PATCH 2/3] nir/opt_fp_math_ctrl: handle input/output no_signed_zero flag --- src/compiler/nir/nir_opt_fp_math_ctrl.c | 41 ++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/compiler/nir/nir_opt_fp_math_ctrl.c b/src/compiler/nir/nir_opt_fp_math_ctrl.c index 8dc265eeb77..99e1606875b 100644 --- a/src/compiler/nir/nir_opt_fp_math_ctrl.c +++ b/src/compiler/nir/nir_opt_fp_math_ctrl.c @@ -13,9 +13,10 @@ * needed, which is a quite common. For example, any float comparison, cosinus, exp2, log2, * or addition with non zero value does not care about the zero sign of the inputs. Neither * do texture coordinates. + * Drivers can also set no_signed_zero for fragment output stores based on state, + * fixed point or R11G11B10 formats do not care about the sign of zero. * - * Future work could also consider fragment output state, fixed point or R11G11B10 formats - * do not care about the sign of zero. + * Future work: * For pre raster stages, position doesn't care, and we could back propagate information from * the FS for varyings, and interpolated varyings do not care anyway. 
*/ @@ -214,8 +215,8 @@ prop_tex_fp_math_ctrl(nir_tex_instr *tex) } } -static void -prop_intrin_fp_math_ctrl(nir_intrinsic_instr *intrin) +static bool +opt_intrin_fp_math_ctrl(nir_intrinsic_instr *intrin) { switch (intrin->intrinsic) { case nir_intrinsic_ddx: @@ -226,11 +227,37 @@ prop_intrin_fp_math_ctrl(nir_intrinsic_instr *intrin) case nir_intrinsic_ddy_fine: if (intrin->instr.pass_flags) src_mark_preserve_sz(&intrin->src[0], NULL); - break; + return false; default: - nir_foreach_src(&intrin->instr, src_mark_preserve_sz, NULL); break; } + + if (!nir_intrinsic_has_io_semantics(intrin)) { + nir_foreach_src(&intrin->instr, src_mark_preserve_sz, NULL); + return false; + } + + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + const nir_intrinsic_info *info = &nir_intrinsic_infos[(int)intrin->intrinsic]; + + if (info->has_dest) { + nir_foreach_src(&intrin->instr, src_mark_preserve_sz, NULL); + + /* For loads, set no signed zero flag based on gathered info. */ + if (!intrin->instr.pass_flags && !sem.no_signed_zero) { + sem.no_signed_zero = 1; + nir_intrinsic_set_io_semantics(intrin, sem); + return true; + } + + return false; + } else { + /* For stores, propagate the signed zero information for the data source. 
*/ + for (unsigned i = sem.no_signed_zero; i < info->num_srcs; i++) + src_mark_preserve_sz(&intrin->src[i], NULL); + + return false; + } } static bool @@ -272,7 +299,7 @@ opt_fp_math_ctrl_impl(nir_function_impl *impl) prop_tex_fp_math_ctrl(nir_instr_as_tex(instr)); break; case nir_instr_type_intrinsic: - prop_intrin_fp_math_ctrl(nir_instr_as_intrinsic(instr)); + progress |= opt_intrin_fp_math_ctrl(nir_instr_as_intrinsic(instr)); break; case nir_instr_type_phi: if (!instr->pass_flags) From 660226b73e16a152b4c94943c4bb0776fde06e3f Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Tue, 10 Mar 2026 14:18:20 +0100 Subject: [PATCH 3/3] radv: set no_signed_zero for FS store_output when format doesn't care Or when blending is enabled, because blending doesn't require IEEE math. Foz-DB Navi21: Totals from 367 (0.32% of 114627) affected shaders: MaxWaves: 7884 -> 7876 (-0.10%); split: +0.20%, -0.30% Instrs: 354948 -> 354386 (-0.16%); split: -0.16%, +0.00% CodeSize: 1905980 -> 1903172 (-0.15%); split: -0.15%, +0.00% VGPRs: 20208 -> 20216 (+0.04%); split: -0.08%, +0.12% Latency: 1855670 -> 1854973 (-0.04%); split: -0.06%, +0.02% InvThroughput: 540792 -> 539688 (-0.20%); split: -0.20%, +0.00% PreSGPRs: 18426 -> 18366 (-0.33%) PreVGPRs: 17213 -> 17249 (+0.21%); split: -0.05%, +0.26% VALU: 258793 -> 258237 (-0.21%); split: -0.22%, +0.00% SALU: 35168 -> 35166 (-0.01%); split: -0.01%, +0.01% --- src/amd/vulkan/nir/radv_nir.h | 3 ++- .../nir/radv_nir_trim_fs_color_exports.c | 23 ++++++++++++---- src/amd/vulkan/radv_pipeline_graphics.c | 26 ++++++++++++++++--- src/amd/vulkan/radv_shader.h | 1 + 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/amd/vulkan/nir/radv_nir.h b/src/amd/vulkan/nir/radv_nir.h index e6fc3f32b32..26a6469123d 100644 --- a/src/amd/vulkan/nir/radv_nir.h +++ b/src/amd/vulkan/nir/radv_nir.h @@ -24,6 +24,7 @@ struct radv_shader_args; struct radv_shader_layout; struct radv_device; struct radv_graphics_state_key; +struct radv_ps_epilog_key; bool 
radv_nir_apply_pipeline_layout(nir_shader *shader, struct radv_device *device, const struct radv_shader_stage *stage); @@ -78,7 +79,7 @@ bool radv_nir_lower_draw_id_to_zero(nir_shader *shader); bool radv_nir_remap_color_attachment(nir_shader *shader, const struct radv_graphics_state_key *gfx_state); -bool radv_nir_trim_fs_color_exports(nir_shader *shader, uint32_t colors_needed); +bool radv_nir_trim_fs_color_exports(nir_shader *shader, const struct radv_ps_epilog_key *epilog_key); bool radv_nir_lower_printf(nir_shader *shader); diff --git a/src/amd/vulkan/nir/radv_nir_trim_fs_color_exports.c b/src/amd/vulkan/nir/radv_nir_trim_fs_color_exports.c index da4da0e993a..6ae301f34c4 100644 --- a/src/amd/vulkan/nir/radv_nir_trim_fs_color_exports.c +++ b/src/amd/vulkan/nir/radv_nir_trim_fs_color_exports.c @@ -8,11 +8,12 @@ #include "nir/nir_builder.h" #include "radv_constants.h" #include "radv_nir.h" +#include "radv_shader.h" static bool trim_fs_color_exports(nir_builder *b, nir_intrinsic_instr *intrin, void *state) { - const uint32_t colors_needed = *(uint32_t *)state; + const struct radv_ps_epilog_key *epilog_key = (const struct radv_ps_epilog_key *)state; if (intrin->intrinsic != nir_intrinsic_store_output) return false; @@ -24,14 +25,26 @@ trim_fs_color_exports(nir_builder *b, nir_intrinsic_instr *intrin, void *state) if (index < 0) return false; - const unsigned needed = (colors_needed >> (index * 4) & 0xf) >> nir_intrinsic_component(intrin); + bool progress = false; + + if (epilog_key->no_signed_zero & BITFIELD_BIT(index)) { + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + + if (!sem.no_signed_zero) { + sem.no_signed_zero = 1; + nir_intrinsic_set_io_semantics(intrin, sem); + progress = true; + } + } + + const unsigned needed = (epilog_key->colors_needed >> (index * 4) & 0xf) >> nir_intrinsic_component(intrin); const unsigned write_mask = nir_intrinsic_write_mask(intrin); const unsigned new_write_mask = write_mask & needed; if (new_write_mask == 
write_mask) - return false; + return progress; if (!new_write_mask) nir_instr_remove(&intrin->instr); @@ -42,7 +55,7 @@ trim_fs_color_exports(nir_builder *b, nir_intrinsic_instr *intrin, void *state) } bool -radv_nir_trim_fs_color_exports(nir_shader *shader, uint32_t colors_needed) +radv_nir_trim_fs_color_exports(nir_shader *shader, const struct radv_ps_epilog_key *epilog_key) { - return nir_shader_intrinsics_pass(shader, trim_fs_color_exports, nir_metadata_control_flow, &colors_needed); + return nir_shader_intrinsics_pass(shader, trim_fs_color_exports, nir_metadata_control_flow, (void *)epilog_key); } diff --git a/src/amd/vulkan/radv_pipeline_graphics.c b/src/amd/vulkan/radv_pipeline_graphics.c index dad478528d1..151eca221af 100644 --- a/src/amd/vulkan/radv_pipeline_graphics.c +++ b/src/amd/vulkan/radv_pipeline_graphics.c @@ -154,6 +154,23 @@ format_is_float32(VkFormat format) return channel >= 0 && desc->channel[channel].type == UTIL_FORMAT_TYPE_FLOAT && desc->channel[channel].size == 32; } +static bool +format_ignores_signed_zero(VkFormat format) +{ + const struct util_format_description *desc = radv_format_description(format); + + /* Unsigned float formats don't care about signed zeros. 
*/ + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT || desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) + return true; + + for (unsigned i = 0; i < desc->nr_channels; i++) { + if (desc->channel[i].pure_integer || desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) + return false; + } + + return true; +} + static bool radv_pipeline_needs_ps_epilog(const struct vk_graphics_pipeline_state *state, VkGraphicsPipelineLibraryFlagBitsEXT lib_flags) @@ -1768,7 +1785,7 @@ radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ { const struct radv_physical_device *pdev = radv_device_physical(device); const struct radv_instance *instance = radv_physical_device_instance(pdev); - unsigned col_format = 0, is_int8 = 0, is_int10 = 0, is_float32 = 0, z_format = 0; + unsigned col_format = 0, is_int8 = 0, is_int10 = 0, is_float32 = 0, z_format = 0, no_signed_zero = 0; struct radv_ps_epilog_key key; memset(&key, 0, sizeof(key)); @@ -1794,6 +1811,8 @@ radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ key.colors_needed |= comp_used << (4 * i); + if (format_ignores_signed_zero(fmt) || blend_enable) + no_signed_zero |= 1 << i; if (format_is_int8(fmt)) is_int8 |= 1 << i; if (format_is_int10(fmt)) @@ -1822,6 +1841,7 @@ radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ col_format |= (col_format & 0xf) << 4; key.color_map[1] = 1; key.colors_needed |= (key.colors_needed & 0xf) << 4; + no_signed_zero |= 0x2; } z_format = ac_get_spi_shader_z_format(state->export_depth, state->export_stencil, state->export_sample_mask, @@ -1831,6 +1851,7 @@ radv_generate_ps_epilog_key(const struct radv_device *device, const struct radv_ key.color_is_int8 = pdev->info.compiler_info.has_cb_lt16bit_int_clamp_bug ? is_int8 : 0; key.color_is_int10 = pdev->info.compiler_info.has_cb_lt16bit_int_clamp_bug ? is_int10 : 0; key.enable_mrt_output_nan_fixup = instance->drirc.debug.enable_mrt_output_nan_fixup ? 
is_float32 : 0; + key.no_signed_zero = no_signed_zero; key.colors_written = state->colors_written; key.mrt0_is_dual_src = state->mrt0_is_dual_src && key.colors_needed & 0xf; key.export_depth = state->export_depth; @@ -2836,8 +2857,7 @@ radv_graphics_shaders_compile(struct radv_device *device, struct vk_pipeline_cac /* Lower FS outputs to scalar to allow dce. */ NIR_PASS(_, stages[MESA_SHADER_FRAGMENT].nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - NIR_PASS(update_info, stages[MESA_SHADER_FRAGMENT].nir, radv_nir_trim_fs_color_exports, - gfx_state->ps.epilog.colors_needed); + NIR_PASS(update_info, stages[MESA_SHADER_FRAGMENT].nir, radv_nir_trim_fs_color_exports, &gfx_state->ps.epilog); NIR_PASS(update_info, stages[MESA_SHADER_FRAGMENT].nir, nir_opt_copy_prop); NIR_PASS(update_info, stages[MESA_SHADER_FRAGMENT].nir, nir_opt_dce); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index b3d0178b0d3..7042393e6cf 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -104,6 +104,7 @@ struct radv_ps_epilog_key { uint8_t color_is_int8; uint8_t color_is_int10; uint8_t enable_mrt_output_nan_fixup; + uint8_t no_signed_zero; uint32_t colors_needed;