From 75a61b8e2d68ec5f9ebfd84b5a6e6a0e6bb553e0 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 19:21:59 +0200
Subject: [PATCH 1/6] nir/opt_sink: support load_global_nv

---
 src/compiler/nir/nir_opt_sink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir_opt_sink.c b/src/compiler/nir/nir_opt_sink.c
index 324dd0f29dd..9aedce02fc5 100644
--- a/src/compiler/nir/nir_opt_sink.c
+++ b/src/compiler/nir/nir_opt_sink.c
@@ -166,6 +166,7 @@ can_sink_instr(nir_instr *instr, nir_move_options options, bool *can_mov_out_of_
    if (intrin->intrinsic == nir_intrinsic_load_global ||
        intrin->intrinsic == nir_intrinsic_load_global_amd ||
+       intrin->intrinsic == nir_intrinsic_load_global_nv ||
        intrin->intrinsic == nir_intrinsic_load_ubo ||
        intrin->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
        intrin->intrinsic == nir_intrinsic_load_ssbo) {
@@ -228,6 +229,7 @@ can_sink_instr(nir_instr *instr, nir_move_options options, bool *can_mov_out_of_
    case nir_intrinsic_load_global:
    case nir_intrinsic_load_global_amd: /* = global + convergent */
+   case nir_intrinsic_load_global_nv:
       return options & nir_move_load_global;
 
    case nir_intrinsic_ldc_nv:

From ce1a530d4118603fd274621aa4b35c2d2df3a4fc Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 18:33:48 +0200
Subject: [PATCH 2/6] nir/opt_shrink_vectors: support more load_global
 intrinsics

This will prevent perf regressions when moving more lowering into
nak_nir_lower_load_store.
---
 src/compiler/nir/nir_opt_shrink_vectors.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/compiler/nir/nir_opt_shrink_vectors.c b/src/compiler/nir/nir_opt_shrink_vectors.c
index 9235accb2f4..f96da8d8315 100644
--- a/src/compiler/nir/nir_opt_shrink_vectors.c
+++ b/src/compiler/nir/nir_opt_shrink_vectors.c
@@ -377,7 +377,10 @@ opt_shrink_vectors_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
    case nir_intrinsic_load_constant:
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_global:
+   case nir_intrinsic_load_global_bounded:
    case nir_intrinsic_load_global_constant:
+   case nir_intrinsic_load_global_constant_bounded:
+   case nir_intrinsic_load_global_constant_offset:
    case nir_intrinsic_load_kernel_input:
    case nir_intrinsic_load_scratch:
    case nir_intrinsic_load_attribute_pan: {

From 356c279daa62a6f6e4b56432dfee1d6893c36748 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 17:44:06 +0200
Subject: [PATCH 3/6] nak: run nir_opt_constant_folding after
 nak_nir_lower_load_store

It seems beneficial to run constant_folding after we lower our
load/stores as it allows nir_opt_offsets to move even more constants
into the base index.

This will prevent perf regressions when moving more lowering into
nak_nir_lower_load_store.

Totals from 78165 (6.44% of 1212873) affected shaders:
CodeSize: 1834431840 -> 1780937808 (-2.92%); split: -2.92%, +0.00%
Number of GPRs: 5233739 -> 5234571 (+0.02%); split: -0.01%, +0.03%
Static cycle count: 1855933069 -> 1851091739 (-0.26%); split: -0.26%, +0.00%
Spills to memory: 12453 -> 12451 (-0.02%)
Fills from memory: 12453 -> 12451 (-0.02%)
Spills to reg: 69790 -> 69754 (-0.05%); split: -0.08%, +0.03%
Fills from reg: 57092 -> 57072 (-0.04%); split: -0.07%, +0.03%
Max warps/SM: 2572632 -> 2572196 (-0.02%); split: +0.01%, -0.02%
---
 src/nouveau/compiler/nak_nir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 27a3384f5cd..297c95e90e5 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -1307,7 +1307,8 @@ nak_postprocess_nir(nir_shader *nir,
       UNREACHABLE("Unsupported shader stage");
    }
 
-   OPT(nir, nak_nir_lower_load_store, nak);
+   if (OPT(nir, nak_nir_lower_load_store, nak))
+      OPT(nir, nir_opt_constant_folding);
 
    struct nir_opt_offsets_options nak_offset_options = {
      .max_offset_cb = nak_nir_max_imm_offset,
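Aside on patch 3: nir_opt_offsets can only fold an offset into an
intrinsic's base index once it sees an iadd with an immediate source,
and right after nak_nir_lower_load_store that immediate can still be
buried inside the freshly emitted address math. A minimal nir_builder
sketch of the shape involved; build_lowered_load() and the +16 byte
offset are hypothetical, purely for illustration:

   /* Sketch only: assumes a set-up nir_builder.  Before
    * nir_opt_constant_folding runs, the constant can hide behind casts
    * like u2u64(iadd_imm(off, 16)); folding normalizes the chain so
    * nir_opt_offsets can match iadd(x, imm) and move the 16 into the
    * load's base index, saving an IADD per load. */
   static nir_def *
   build_lowered_load(nir_builder *b, nir_def *base, nir_def *off)
   {
      nir_def *byte_off = nir_iadd_imm(b, off, 16);
      nir_def *addr = nir_iadd(b, base, nir_u2u64(b, byte_off));
      return nir_load_global(b, 4, 32, addr);
   }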
From 93758c43c3c5d2b5da3a8ef703b590802f8a1f80 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 16:37:23 +0200
Subject: [PATCH 4/6] nak: move all constant_bounded and constant_offset
 lowering to nak_nir_lower_load_store

We want to run nir_opt_sink before the lowering, and for that it's best
to add the bound-checking math after sinking the loads.

Totals:
CodeSize: 9003190576 -> 9003190080 (-0.00%); split: -0.00%, +0.00%
Static cycle count: 5001955177 -> 5001932761 (-0.00%); split: -0.00%, +0.00%

Totals from 350 (0.03% of 1212873) affected shaders:
CodeSize: 9562192 -> 9561696 (-0.01%); split: -0.02%, +0.02%
Static cycle count: 6031366 -> 6008950 (-0.37%); split: -0.40%, +0.03%
---
 src/nouveau/compiler/nak_nir.c  | 29 +++++++++++++++++--
 src/nouveau/vulkan/nvk_shader.c | 49 ---------------------------------
 2 files changed, 26 insertions(+), 52 deletions(-)

diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 297c95e90e5..9bdd405ff02 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -1029,8 +1029,6 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
       }
       case nir_intrinsic_load_global_bounded:
       case nir_intrinsic_load_global_constant_bounded: {
-         assert(nak->sm >= 73);
-
          nir_src *base = &intr->src[0];
          nir_src *offset = &intr->src[1];
          nir_src *size = &intr->src[2];
@@ -1040,7 +1038,31 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
          nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
          nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1);
          nir_def *cond = nir_ult(&b, last_byte, size->ssa);
+
+         nir_def *zero = NULL;
+         if (nak->sm < 73) {
+            zero = nir_imm_zero(&b, intr->def.num_components, intr->def.bit_size);
+            nir_push_if(&b, cond);
+         }
+
          res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond);
+         new = nir_def_as_intrinsic(res);
+
+         if (nak->sm < 73) {
+            nir_pop_if(&b, NULL);
+            res = nir_if_phi(&b, res, zero);
+         }
+
+         break;
+      }
+      case nir_intrinsic_load_global_constant_offset: {
+         nir_src *base = &intr->src[0];
+         nir_src *offset = &intr->src[1];
+
+         nir_def *address = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
+         nir_def *nir_true = nir_imm_bool(&b, true);
+
+         res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, address, nir_true);
          break;
       }
       case nir_intrinsic_load_scratch:
@@ -1080,7 +1102,8 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
       if (nir_intrinsic_has_access(intr))
          nir_intrinsic_set_access(new, nir_intrinsic_access(intr));
       if (intr->intrinsic == nir_intrinsic_load_global_constant ||
-          intr->intrinsic == nir_intrinsic_load_global_constant_bounded)
+          intr->intrinsic == nir_intrinsic_load_global_constant_bounded ||
+          intr->intrinsic == nir_intrinsic_load_global_constant_offset)
          nir_intrinsic_set_access(new, nir_intrinsic_access(new) |
                                   ACCESS_CAN_REORDER);
       if (nir_intrinsic_has_align_mul(intr))
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index 8f7f781034b..b90742c4ee7 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -244,8 +244,6 @@ static bool
 lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
                      UNUSED void *data)
 {
-   struct nvk_physical_device *pdev = data;
-
    switch (load->intrinsic) {
    case nir_intrinsic_load_ubo: {
       b->cursor = nir_before_instr(&load->instr);
@@ -275,53 +273,6 @@ lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
       return true;
    }
 
-   case nir_intrinsic_load_global_constant_bounded:
-      /* Handled inside nak_nir_lower_load_store */
-      if (pdev->info.sm >= 73)
-         return false;
-      FALLTHROUGH;
-   case nir_intrinsic_load_global_constant_offset: {
-      b->cursor = nir_before_instr(&load->instr);
-
-      nir_def *base_addr = load->src[0].ssa;
-      nir_def *offset = load->src[1].ssa;
-
-      nir_def *zero = NULL;
-      if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
-         nir_def *bound = load->src[2].ssa;
-
-         unsigned bit_size = load->def.bit_size;
-         assert(bit_size >= 8 && bit_size % 8 == 0);
-         unsigned byte_size = bit_size / 8;
-
-         zero = nir_imm_zero(b, load->num_components, bit_size);
-
-         unsigned load_size = byte_size * load->num_components;
-
-         nir_def *sat_offset =
-            nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
-         nir_def *in_bounds =
-            nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);
-
-         nir_push_if(b, in_bounds);
-      }
-
-      nir_def *val =
-         nir_load_global_constant(b, load->def.num_components,
-                                  load->def.bit_size,
-                                  nir_iadd(b, base_addr, nir_u2u64(b, offset)),
-                                  .align_mul = nir_intrinsic_align_mul(load),
-                                  .align_offset = nir_intrinsic_align_offset(load));
-
-      if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
-         nir_pop_if(b, NULL);
-         val = nir_if_phi(b, val, zero);
-      }
-
-      nir_def_rewrite_uses(&load->def, val);
-      return true;
-   }
-
    default:
       return false;
    }
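For reference, the contract the moved lowering keeps: a bounded load
returns the loaded value when the whole access lies below size and zero
otherwise; on SM73+ the check feeds the load_global_nv predicate
directly, while older parts get the if/phi around it. A scalar plain-C
model of the check, with hypothetical function and parameter names:

   #include <stdint.h>
   #include <string.h>

   /* Mirrors the last_byte/cond math in the hunk above for a single
    * 32-bit load; out-of-bounds bounded loads are defined to read zero. */
   static uint32_t
   bounded_load_u32(const uint8_t *base, uint32_t offset, uint32_t size)
   {
      uint32_t last_byte = offset + sizeof(uint32_t) - 1; /* load_size - 1 */
      if (last_byte < size) { /* nir_ult(last_byte, size) */
         uint32_t v;
         memcpy(&v, base + offset, sizeof(v));
         return v;
      }
      return 0;
   }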
From 81095eb0c024a6cb453f91b4ece16fe4a5413f3f Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 02:25:22 +0200
Subject: [PATCH 5/6] nak: sink ubo and ssbo loads before load/store lowering

Our own lowering of bounded loads adds ALU instructions that make it
unviable to run this later for ssbos and ubos, which here also include
load_global_bounded (ssbo) and load_global_constant_* (ubo).

There are some regressions, but in many cases loads get moved down into
control flow, which makes them less likely to be executed, so we can
take the hit as it's overall an improvement. Other regressions come
from RA having a harder time allocating registers around control flow.

Totals from 76135 (6.28% of 1212873) affected shaders:
CodeSize: 2042701584 -> 2040842272 (-0.09%); split: -0.30%, +0.21%
Number of GPRs: 5710526 -> 5642243 (-1.20%); split: -1.49%, +0.29%
SLM Size: 364520 -> 362628 (-0.52%); split: -0.54%, +0.02%
Static cycle count: 2318074214 -> 2368034561 (+2.16%); split: -0.17%, +2.32%
Spills to memory: 28423 -> 27656 (-2.70%); split: -3.92%, +1.22%
Fills from memory: 28423 -> 27656 (-2.70%); split: -3.92%, +1.22%
Spills to reg: 108907 -> 112461 (+3.26%); split: -3.02%, +6.28%
Fills from reg: 88071 -> 89601 (+1.74%); split: -2.90%, +4.64%
Max warps/SM: 2297172 -> 2317116 (+0.87%); split: +1.17%, -0.31%
---
 src/nouveau/compiler/nak_nir.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 9bdd405ff02..83d80bc20c4 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -1330,6 +1330,8 @@ nak_postprocess_nir(nir_shader *nir,
       UNREACHABLE("Unsupported shader stage");
    }
 
+   /* sink memory loads once before we add our own bound checking */
+   OPT(nir, nir_opt_sink, nir_move_load_ssbo | nir_move_load_ubo);
    if (OPT(nir, nak_nir_lower_load_store, nak))
       OPT(nir, nir_opt_constant_folding);
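The payoff of sinking before the lowering is easiest to see on a load
whose only use sits inside a branch; a plain-C stand-in, with all names
made up for illustration:

   #include <stdbool.h>
   #include <stdint.h>

   /* Before sinking, the load would execute unconditionally and its
    * result would stay live across the branch; nir_opt_sink moves it
    * next to its use, so it is skipped whenever !cond.  Once the
    * lowering wraps the load in bound-check ALU (and, pre-SM73, an
    * if/phi), it can no longer be moved, hence the sink runs first. */
   static uint32_t
   sunk_load(const uint32_t *buf, uint32_t idx, bool cond, uint32_t accum)
   {
      if (cond)
         accum += buf[idx];
      return accum;
   }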
From e147c095b4daaf678a4bc611a3660c949944fedf Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sun, 12 Apr 2026 19:45:43 +0200
Subject: [PATCH 6/6] nak: sink load_global

This seems to hurt smaller shaders, but it really helps some of the
worst cases we have, reducing spilling by quite a lot.

Totals from 15249 (1.26% of 1212873) affected shaders:
CodeSize: 332565024 -> 333294192 (+0.22%); split: -0.38%, +0.60%
Number of GPRs: 1020912 -> 1017649 (-0.32%); split: -2.15%, +1.83%
SLM Size: 114724 -> 90240 (-21.34%); split: -21.37%, +0.03%
Static cycle count: 538619840 -> 537174408 (-0.27%); split: -0.88%, +0.61%
Spills to memory: 46686 -> 38685 (-17.14%)
Fills from memory: 46686 -> 38685 (-17.14%)
Spills to reg: 21224 -> 20703 (-2.45%); split: -3.23%, +0.77%
Fills from reg: 19768 -> 19327 (-2.23%); split: -2.87%, +0.64%
Max warps/SM: 519248 -> 519768 (+0.10%); split: +1.10%, -1.00%
---
 src/nouveau/compiler/nak_nir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 83d80bc20c4..7b8dbb17893 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -1375,6 +1375,7 @@ nak_postprocess_nir(nir_shader *nir,
    } while (progress);
 
    OPT(nir, nir_opt_move, nir_move_comparisons | nir_move_load_ubo);
+   OPT(nir, nir_opt_sink, nir_move_load_global);
 
    if (nak->sm < 70) {
       const nir_split_conversions_options split_conv_opts = {