From f7ad45e5fced03bf152479fdf0162ba673119b6b Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Fri, 6 Mar 2026 10:45:48 +0100 Subject: [PATCH] nak: support has_load_global_bounded on turing and newer Totals: CodeSize: 9401446416 -> 8663482432 (-7.85%); split: -7.85%, +0.00% Number of GPRs: 47297665 -> 47508294 (+0.45%); split: -0.14%, +0.59% SLM Size: 1202912 -> 1203000 (+0.01%); split: -0.09%, +0.10% Static cycle count: 5984801035 -> 4714013561 (-21.23%); split: -21.24%, +0.00% Spills to memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01% Fills from memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01% Spills to reg: 184822 -> 149129 (-19.31%); split: -21.54%, +2.23% Fills from reg: 223885 -> 170692 (-23.76%); split: -25.49%, +1.73% Max warps/SM: 50642520 -> 50564740 (-0.15%); split: +0.03%, -0.19% Totals from 185510 (15.95% of 1163204) affected shaders: CodeSize: 3910084048 -> 3172120064 (-18.87%); split: -18.88%, +0.01% Number of GPRs: 10625243 -> 10835872 (+1.98%); split: -0.63%, +2.61% SLM Size: 659568 -> 659656 (+0.01%); split: -0.17%, +0.19% Static cycle count: 3920553863 -> 2649766389 (-32.41%); split: -32.42%, +0.01% Spills to memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77% Fills from memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77% Spills to reg: 109049 -> 73356 (-32.73%); split: -36.51%, +3.77% Fills from reg: 116031 -> 62838 (-45.84%); split: -49.18%, +3.34% Max warps/SM: 6885584 -> 6807804 (-1.13%); split: +0.25%, -1.38% This also helps significantly reduce shader compile times since it reduces the number of basic blocks. With Dragon Age: The Veilguard, it reduces shader compile times by around 20%. 
Reviewed-by: Mary Guillemard Reviewed-by: Faith Ekstrand Acked-by: Mel Henning Part-of: --- src/nouveau/compiler/nak/api.rs | 1 + src/nouveau/compiler/nak_nir.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs index 81bf4dc096e..7b50e6b5e80 100644 --- a/src/nouveau/compiler/nak/api.rs +++ b/src/nouveau/compiler/nak/api.rs @@ -181,6 +181,7 @@ fn nir_options(dev: &nv_device_info) -> nir_shader_compiler_options { has_pack_half_2x16_rtz: true, has_bfm: dev.sm >= 70, discard_is_demote: true, + has_load_global_bounded: dev.sm >= 73, max_unroll_iterations: 32, max_samples: 8, diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c index 2bd45879170..16e326867d7 100644 --- a/src/nouveau/compiler/nak_nir.c +++ b/src/nouveau/compiler/nak_nir.c @@ -1019,6 +1019,22 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true); break; } + case nir_intrinsic_load_global_bounded: + case nir_intrinsic_load_global_constant_bounded: { + assert(nak->sm >= 73); + + nir_src *base = &intr->src[0]; + nir_src *offset = &intr->src[1]; + nir_src *size = &intr->src[2]; + unsigned load_size = intr->def.num_components * intr->def.bit_size / 8; + + /* see addr_is_in_bounds in nir_lower_explicit_io.c */ + nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa)); + nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1); + nir_def *cond = nir_ult(&b, last_byte, size->ssa); + res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond); + break; + } case nir_intrinsic_load_scratch: res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa); break; @@ -1055,7 +1071,8 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak) if (nir_intrinsic_has_access(intr)) 
nir_intrinsic_set_access(new, nir_intrinsic_access(intr)); - if (intr->intrinsic == nir_intrinsic_load_global_constant) + if (intr->intrinsic == nir_intrinsic_load_global_constant || + intr->intrinsic == nir_intrinsic_load_global_constant_bounded) nir_intrinsic_set_access(new, nir_intrinsic_access(new) | ACCESS_CAN_REORDER); if (nir_intrinsic_has_align_mul(intr))