From 369a3b22b494aecc2ad241b9d66f2d877fcafc18 Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Tue, 30 Dec 2025 21:56:24 +0100
Subject: [PATCH] nir/opt_uniform_subgroup: optimize uniform ddx/ddy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can't just use 0.0 as the replacement because of NaN/Inf.
But turning the intrinsic into a simple fsub should still be better
or at least equal.

Foz-DB Navi48:
Totals from 128 (0.10% of 125402) affected shaders:
MaxWaves: 3684 -> 3708 (+0.65%)
Instrs: 111150 -> 111055 (-0.09%); split: -0.20%, +0.11%
CodeSize: 587176 -> 590800 (+0.62%); split: -0.01%, +0.63%
VGPRs: 6540 -> 6480 (-0.92%)
Latency: 382775 -> 383332 (+0.15%); split: -0.15%, +0.29%
InvThroughput: 80909 -> 80530 (-0.47%); split: -0.51%, +0.04%
VClause: 1433 -> 1430 (-0.21%)
SClause: 1834 -> 1841 (+0.38%); split: -0.11%, +0.49%
Copies: 6130 -> 6096 (-0.55%); split: -1.29%, +0.73%
PreSGPRs: 7352 -> 7356 (+0.05%)
PreVGPRs: 4797 -> 4721 (-1.58%)
VALU: 71892 -> 71435 (-0.64%); split: -0.64%, +0.01%
SALU: 12665 -> 13056 (+3.09%); split: -0.06%, +3.14%

Reviewed-by: Marek Olšák
Part-of:
---
 src/compiler/nir/nir_opt_uniform_subgroup.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/compiler/nir/nir_opt_uniform_subgroup.c b/src/compiler/nir/nir_opt_uniform_subgroup.c
index f907092bc51..ae23e3d84f6 100644
--- a/src/compiler/nir/nir_opt_uniform_subgroup.c
+++ b/src/compiler/nir/nir_opt_uniform_subgroup.c
@@ -221,6 +221,21 @@ opt_uniform_subgroup_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *_s
       }
       break;
 
+   case nir_intrinsic_ddx:
+   case nir_intrinsic_ddx_coarse:
+   case nir_intrinsic_ddx_fine:
+   case nir_intrinsic_ddy:
+   case nir_intrinsic_ddy_coarse:
+   case nir_intrinsic_ddy_fine:
+      if (nir_src_is_divergent(&intrin->src[0]))
+         return false;
+
+      nir_def *x = intrin->src[0].ssa;
+      b->fp_math_ctrl = nir_fp_no_fast_math;
+      replacement = nir_fsub(b, x, x);
+      b->fp_math_ctrl = nir_fp_fast_math;
+      break;
+
    case nir_intrinsic_reduce:
    case nir_intrinsic_exclusive_scan:
    case nir_intrinsic_inclusive_scan: {