nak: support has_load_global_bounded on turing and newer

Totals:
CodeSize: 9401446416 -> 8663482432 (-7.85%); split: -7.85%, +0.00%
Number of GPRs: 47297665 -> 47508294 (+0.45%); split: -0.14%, +0.59%
SLM Size: 1202912 -> 1203000 (+0.01%); split: -0.09%, +0.10%
Static cycle count: 5984801035 -> 4714013561 (-21.23%); split: -21.24%, +0.00%
Spills to memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01%
Fills from memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01%
Spills to reg: 184822 -> 149129 (-19.31%); split: -21.54%, +2.23%
Fills from reg: 223885 -> 170692 (-23.76%); split: -25.49%, +1.73%
Max warps/SM: 50642520 -> 50564740 (-0.15%); split: +0.03%, -0.19%

Totals from 185510 (15.95% of 1163204) affected shaders:
CodeSize: 3910084048 -> 3172120064 (-18.87%); split: -18.88%, +0.01%
Number of GPRs: 10625243 -> 10835872 (+1.98%); split: -0.63%, +2.61%
SLM Size: 659568 -> 659656 (+0.01%); split: -0.17%, +0.19%
Static cycle count: 3920553863 -> 2649766389 (-32.41%); split: -32.42%, +0.01%
Spills to memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77%
Fills from memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77%
Spills to reg: 109049 -> 73356 (-32.73%); split: -36.51%, +3.77%
Fills from reg: 116031 -> 62838 (-45.84%); split: -49.18%, +3.34%
Max warps/SM: 6885584 -> 6807804 (-1.13%); split: +0.25%, -1.38%

This also significantly reduces shader compile times because it lowers
the number of basic blocks.  With Dragon Age: The Veilguard, shader
compile times drop by around 20%.

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40272>
This commit is contained in:
Karol Herbst 2026-03-06 10:45:48 +01:00 committed by Marge Bot
parent 7722bde53b
commit f7ad45e5fc
2 changed files with 19 additions and 1 deletions

View file

@ -181,6 +181,7 @@ fn nir_options(dev: &nv_device_info) -> nir_shader_compiler_options {
has_pack_half_2x16_rtz: true,
has_bfm: dev.sm >= 70,
discard_is_demote: true,
has_load_global_bounded: dev.sm >= 73,
max_unroll_iterations: 32,
max_samples: 8,

View file

@ -1019,6 +1019,22 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
break;
}
case nir_intrinsic_load_global_bounded:
case nir_intrinsic_load_global_constant_bounded: {
assert(nak->sm >= 73);
nir_src *base = &intr->src[0];
nir_src *offset = &intr->src[1];
nir_src *size = &intr->src[2];
unsigned load_size = intr->def.num_components * intr->def.bit_size / 8;
/* see addr_is_in_bounds in nir_lower_explicit_io.c */
nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1);
nir_def *cond = nir_ult(&b, last_byte, size->ssa);
res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond);
break;
}
case nir_intrinsic_load_scratch:
res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
break;
@ -1055,7 +1071,8 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
if (nir_intrinsic_has_access(intr))
nir_intrinsic_set_access(new, nir_intrinsic_access(intr));
if (intr->intrinsic == nir_intrinsic_load_global_constant)
if (intr->intrinsic == nir_intrinsic_load_global_constant ||
intr->intrinsic == nir_intrinsic_load_global_constant_bounded)
nir_intrinsic_set_access(new, nir_intrinsic_access(new) | ACCESS_CAN_REORDER);
if (nir_intrinsic_has_align_mul(intr))