From f7ad45e5fced03bf152479fdf0162ba673119b6b Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Fri, 6 Mar 2026 10:45:48 +0100
Subject: [PATCH] nak: support has_load_global_bounded on turing and newer

Totals:
CodeSize: 9401446416 -> 8663482432 (-7.85%); split: -7.85%, +0.00%
Number of GPRs: 47297665 -> 47508294 (+0.45%); split: -0.14%, +0.59%
SLM Size: 1202912 -> 1203000 (+0.01%); split: -0.09%, +0.10%
Static cycle count: 5984801035 -> 4714013561 (-21.23%); split: -21.24%, +0.00%
Spills to memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01%
Fills from memory: 44482 -> 45073 (+1.33%); split: -1.68%, +3.01%
Spills to reg: 184822 -> 149129 (-19.31%); split: -21.54%, +2.23%
Fills from reg: 223885 -> 170692 (-23.76%); split: -25.49%, +1.73%
Max warps/SM: 50642520 -> 50564740 (-0.15%); split: +0.03%, -0.19%

Totals from 185510 (15.95% of 1163204) affected shaders:
CodeSize: 3910084048 -> 3172120064 (-18.87%); split: -18.88%, +0.01%
Number of GPRs: 10625243 -> 10835872 (+1.98%); split: -0.63%, +2.61%
SLM Size: 659568 -> 659656 (+0.01%); split: -0.17%, +0.19%
Static cycle count: 3920553863 -> 2649766389 (-32.41%); split: -32.42%, +0.01%
Spills to memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77%
Fills from memory: 8498 -> 9089 (+6.95%); split: -8.81%, +15.77%
Spills to reg: 109049 -> 73356 (-32.73%); split: -36.51%, +3.77%
Fills from reg: 116031 -> 62838 (-45.84%); split: -49.18%, +3.34%
Max warps/SM: 6885584 -> 6807804 (-1.13%); split: +0.25%, -1.38%

This also significantly reduces shader compile times because it lowers
the number of basic blocks. With Dragon Age: The Veilguard, shader
compile times drop by around 20%.

Reviewed-by: Mary Guillemard <mary@mary.zone>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40272>
---
 src/nouveau/compiler/nak/api.rs |  1 +
 src/nouveau/compiler/nak_nir.c  | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/nouveau/compiler/nak/api.rs b/src/nouveau/compiler/nak/api.rs
index 81bf4dc096e..7b50e6b5e80 100644
--- a/src/nouveau/compiler/nak/api.rs
+++ b/src/nouveau/compiler/nak/api.rs
@@ -181,6 +181,7 @@ fn nir_options(dev: &nv_device_info) -> nir_shader_compiler_options {
         has_pack_half_2x16_rtz: true,
         has_bfm: dev.sm >= 70,
         discard_is_demote: true,
+        has_load_global_bounded: dev.sm >= 73,
 
         max_unroll_iterations: 32,
         max_samples: 8,
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 2bd45879170..16e326867d7 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -1019,6 +1019,22 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
                res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa, nir_true);
                break;
             }
+            case nir_intrinsic_load_global_bounded:
+            case nir_intrinsic_load_global_constant_bounded: {
+               assert(nak->sm >= 73);
+
+               nir_src *base = &intr->src[0];
+               nir_src *offset = &intr->src[1];
+               nir_src *size = &intr->src[2];
+               unsigned load_size = intr->def.num_components * intr->def.bit_size / 8;
+
+               /* see addr_is_in_bounds in nir_lower_explicit_io.c */
+               nir_def *addr = nir_iadd(&b, base->ssa, nir_u2u64(&b, offset->ssa));
+               nir_def *last_byte = nir_iadd_imm(&b, offset->ssa, load_size - 1);
+               nir_def *cond = nir_ult(&b, last_byte, size->ssa);
+               res = nir_load_global_nv(&b, intr->def.num_components, intr->def.bit_size, addr, cond);
+               break;
+            }
             case nir_intrinsic_load_scratch:
                res = nir_load_scratch_nv(&b, intr->def.num_components, intr->def.bit_size, addr->ssa);
                break;
@@ -1055,7 +1071,8 @@ nak_nir_lower_load_store(nir_shader *nir, const struct nak_compiler *nak)
 
             if (nir_intrinsic_has_access(intr))
                nir_intrinsic_set_access(new, nir_intrinsic_access(intr));
-            if (intr->intrinsic == nir_intrinsic_load_global_constant)
+            if (intr->intrinsic == nir_intrinsic_load_global_constant ||
+                intr->intrinsic == nir_intrinsic_load_global_constant_bounded)
                nir_intrinsic_set_access(new, nir_intrinsic_access(new) | ACCESS_CAN_REORDER);
 
             if (nir_intrinsic_has_align_mul(intr))