ir3: always use byte offset for @load/store_global_ir3
Before a7xx, ldg.a/stg.a use an offset in units of their type size, while on a7xx and later the offset is always in bytes. Currently, @load/store_global_ir3 take their offset in dwords (32 bits). This has a few downsides: offsets need an extra shl during codegen on a7xx, and addressing sub-dword-aligned addresses is only possible by doing 64-bit math on the base address.

Improve the situation by always using a byte offset for @load/store_global_ir3 and adding the offset_shift index to support type-size units pre-a7xx. While we're at it, add the base index as well to support all ldg.a/stg.a features in @load/store_global_ir3.

Supporting these reworked intrinsics consists of two parts:

- ir3_nir_lower_io_offsets legalizes the offset_shift on a6xx: for ldg.a/stg.a, the offset has to be in units of the type size, so extra shifts are inserted to accomplish this if necessary. On a7xx, offsets are always in bytes so nothing needs to be done.
- The intrinsics are emitted as ldg/stg if the offset is a small enough constant and as ldg.a/stg.a otherwise. a6xx supports an extra shift for ldg.a/stg.a that only applies to the GPR offset (not the immediate base); NIR is pattern matched at this point to extract this if possible.

All users of @load/store_global_ir3 are updated to generate the offset in units of bytes. ir3_nir_analyze_ubo_ranges is updated to take the new offset_shift into account.

Totals from 2029 (1.15% of 176266) affected shaders:

MaxWaves: 26728 -> 26660 (-0.25%); split: +0.01%, -0.26%
Instrs: 1314089 -> 1278603 (-2.70%); split: -2.72%, +0.02%
CodeSize: 2739108 -> 2633236 (-3.87%); split: -3.87%, +0.01%
NOPs: 197537 -> 200843 (+1.67%); split: -1.62%, +3.30%
MOVs: 43771 -> 44025 (+0.58%); split: -1.11%, +1.69%
Full: 31849 -> 31948 (+0.31%); split: -0.03%, +0.34%
(ss): 37965 -> 42027 (+10.70%); split: -3.47%, +14.17%
(sy): 13752 -> 13566 (-1.35%); split: -4.04%, +2.68%
(ss)-stall: 154238 -> 170353 (+10.45%); split: -1.72%, +12.16%
(sy)-stall: 804442 -> 806518 (+0.26%); split: -4.65%, +4.91%
Preamble Instrs: 326728 -> 293488 (-10.17%)
Cat0: 217926 -> 220947 (+1.39%); split: -1.58%, +2.96%
Cat1: 50182 -> 50446 (+0.53%); split: -0.97%, +1.49%
Cat2: 460987 -> 452101 (-1.93%); split: -2.26%, +0.33%
Cat3: 390696 -> 361271 (-7.53%)
Cat7: 39148 -> 38688 (-1.18%); split: -1.24%, +0.06%

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41342>
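As a quick illustration of the new addressing scheme (this helper is not part of the change; it only restates the formula documented in the intrinsic definitions below and ignores the a6xx-only shift that applies to the GPR part of the offset):

#include <stdint.h>

/* Illustrative only: the effective address described by the reworked
 * @load/store_global_ir3 intrinsics. `base` and `offset_shift` are the new
 * const indices; `offset` is the offset source, sign-extended to 64 bits
 * before the shift so the computation cannot overflow in 32 bits. */
static inline uint64_t
global_ir3_address(uint64_t address, int32_t offset, int32_t base,
                   unsigned offset_shift)
{
   return address + ((uint64_t)((int64_t)offset + base) << offset_shift);
}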
parent 6158072e6f
commit c784af5ca0
9 changed files with 288 additions and 84 deletions
@@ -1594,11 +1594,17 @@ load("shared_ir3", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE])
 # src[] = { value, address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { write_mask, align_mul, align_offset }
-store("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+# Final address is calculated as `address + ((offset + BASE) << OFFSET_SHIFT)`
+# `offset` is sign-extended to 64-bits first so the offset calculation does not
+# cause 32-bit overflows.
+# a6xx has another shift field which only applies to `offset`; this is not
+# represented here.
+store("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, OFFSET_SHIFT, BASE])
 
 # src[] = { address(vec2 of hi+lo uint32_t), offset }.
 # const_index[] = { access, align_mul, align_offset }
 # the alignment applies to the base address
-load("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], flags=[CAN_ELIMINATE])
+# Final address is calculated as for @store_global_ir3
+load("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE, OFFSET_SHIFT, BASE], flags=[CAN_ELIMINATE])
 
 # Etnaviv-specific load/global intrinsics. They take a 32-bit base address and
 # a 32-bit offset, which doesn't need to be an immediate.

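For producers of these intrinsics, the builder call now takes the offset in bytes (or, pre-a7xx, in type-size units recorded via .offset_shift). A rough sketch of the call shape, mirroring the replace_with_load_global() helper added to the tess lowering further down; b, compiler, addr and dword_offset are assumed to be in scope, with the incoming offset in dwords:

/* Sketch only: addr is the vec2 (hi/lo) address, dword_offset an offset in
 * dwords; ir3_nir_get_global_offset() (added below) picks the per-generation
 * encoding (shift the value on a7xx, record offset_shift on a6xx). */
nir_io_offset off = ir3_nir_get_global_offset(b, compiler, dword_offset, 2);
nir_def *val = nir_load_global_ir3(b, 1, 32, addr, off.def,
                                   .align_mul = 4, .align_offset = 0,
                                   .offset_shift = off.shift);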
@@ -431,6 +431,132 @@ emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
 }
 
+/* On a6xx, on top of the offset_shift that applies to the whole offset, there's
+ * a second shift that only applies to the GPR part of the offset (so not to the
+ * immediate part). We extract that here by simply pattern matching for ishl on
+ * the offset src. Returns the shift if a match is found and it fits in the
+ * 2-bit field, in which case *offset_src is set to the src of ishl and
+ * *offset_src_comp to the component of *offset_src.
+ */
+static unsigned
+parse_src_shift(struct ir3_context *ctx, nir_src **offset_src,
+                unsigned *offset_src_comp)
+{
+   *offset_src_comp = 0;
+
+   if (ctx->compiler->gen >= 7) {
+      return 0;
+   }
+
+   nir_scalar offset =
+      nir_scalar_chase_movs(nir_get_scalar((*offset_src)->ssa, 0));
+
+   if (!nir_scalar_is_alu(offset) || nir_scalar_alu_op(offset) != nir_op_ishl) {
+      return 0;
+   }
+
+   nir_scalar shift_src = nir_scalar_chase_alu_src(offset, 1);
+
+   if (!nir_scalar_is_const(shift_src)) {
+      return 0;
+   }
+
+   unsigned shift = nir_scalar_as_uint(shift_src);
+
+   if (shift >= (1 << 2)) {
+      return 0;
+   }
+
+   nir_alu_instr *offset_alu = nir_def_as_alu(offset.def);
+   *offset_src = &offset_alu->src[0].src;
+   *offset_src_comp = offset_alu->src[0].swizzle[offset.comp];
+   return shift;
+}
+
+static bool
+base_fits_ldg_stg_a(struct ir3_compiler *compiler, unsigned base)
+{
+   if (compiler->gen >= 7) {
+      return base < (1 << 8);
+   }
+
+   return base < (1 << 2);
+}
+
+/* Represents an offset for ldg/stg(.a):
+ * - src == NULL: ldg/stg base_address + imm
+ * - src != NULL:
+ *   - a6xx: ldg/stg.a base_addr + (src << src_shift) + imm
+ *   - a7xx: ldg/stg.a base_addr + src + imm
+ */
+struct ldg_stg_offset {
+   struct ir3_instruction *src;
+   struct ir3_instruction *src_shift;
+   struct ir3_instruction *imm;
+};
+
+static struct ldg_stg_offset
+ldg_stg_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+   assert(intr->intrinsic == nir_intrinsic_load_global_ir3 ||
+          intr->intrinsic == nir_intrinsic_store_global_ir3);
+
+   if (ctx->compiler->gen >= 7) {
+      assert(nir_intrinsic_offset_shift(intr) == 0);
+   } else {
+      ASSERTED unsigned bit_size =
+         intr->intrinsic == nir_intrinsic_load_global_ir3
+            ? intr->def.bit_size
+            : intr->src[0].ssa->bit_size;
+      assert(nir_intrinsic_offset_shift(intr) == ffs(bit_size / 8) - 1);
+   }
+
+   struct ldg_stg_offset offset = {};
+   nir_src *offset_src = nir_get_io_offset_src(intr);
+   int32_t base = nir_intrinsic_base(intr);
+   unsigned offset_shift = nir_intrinsic_offset_shift(intr);
+   struct ir3_builder *b = &ctx->build;
+
+   if (nir_src_is_const(*offset_src)) {
+      int32_t full_imm_offset = base + nir_src_as_int(*offset_src);
+      int32_t full_imm_offset_bytes = full_imm_offset << offset_shift;
+
+      /* ldg/stg offset immediate is 13 bits. Note that ldg/stg use byte offsets
+       * even on a6xx.
+       */
+      if (full_imm_offset_bytes < (1 << 12) &&
+          full_imm_offset_bytes >= -(1 << 12)) {
+         offset.imm = create_immed(b, full_imm_offset_bytes);
+      } else {
+         /* The immediate offset does not fit. Generate ldg/stg.a with the
+          * immediate in a GPR.
+          */
+         offset.src = create_immed(b, full_imm_offset);
+         offset.src_shift = create_immed(b, 0);
+         offset.imm = create_immed(b, 0);
+      }
+   } else {
+      if (base_fits_ldg_stg_a(ctx->compiler, base)) {
+         unsigned offset_src_comp;
+         unsigned shift = parse_src_shift(ctx, &offset_src, &offset_src_comp);
+         offset.src = ir3_get_src(ctx, offset_src)[offset_src_comp];
+         offset.src_shift = create_immed(b, shift);
+         offset.imm = create_immed(b, base);
+      } else {
+         /* This should be rare, but various passes might update
+          * base/offset_shift in a way that makes the combination illegal.
+          * Detect that here and replace base by an add.
+          */
+         offset.src = ir3_ADD_U(b, ir3_get_src(ctx, offset_src)[0], 0,
+                                create_immed(b, base), 0);
+         offset.src_shift = create_immed(b, 0);
+         offset.imm = create_immed(b, 0);
+      }
+   }
+
+   return offset;
+}
+
 static void
 emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
                                nir_intrinsic_instr *intr,

@@ -438,31 +564,19 @@ emit_intrinsic_load_global_ir3,
 {
    struct ir3_builder *b = &ctx->build;
    unsigned dest_components = nir_intrinsic_dest_components(intr);
-   struct ir3_instruction *addr, *offset;
+   struct ir3_instruction *addr;
 
    addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0]);
 
+   struct ldg_stg_offset offset = ldg_stg_offset(ctx, intr);
    struct ir3_instruction *load;
 
-   bool const_offset_in_bounds =
-      nir_src_is_const(intr->src[1]) &&
-      nir_src_as_int(intr->src[1]) < (1 << 8) &&
-      nir_src_as_int(intr->src[1]) > -(1 << 8);
-
-   if (const_offset_in_bounds) {
-      load = ir3_LDG(b, addr, 0,
-                     create_immed(b, nir_src_as_int(intr->src[1]) * 4),
-                     0, create_immed(b, dest_components), 0);
+   if (!offset.src) {
+      load = ir3_LDG(b, addr, 0, offset.imm, 0,
+                     create_immed(b, dest_components), 0);
    } else {
-      unsigned shift = ctx->compiler->gen >= 7 ? 2 : 0;
-      offset = ir3_get_src(ctx, &intr->src[1])[0];
-      if (shift) {
-         /* A7XX TODO: Move to NIR for it to be properly optimized? */
-         offset = ir3_SHL_B(b, offset, 0, create_immed(b, shift), 0);
-      }
-      load =
-         ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
-                   create_immed(b, 0), 0, create_immed(b, dest_components), 0);
+      load = ir3_LDG_A(b, addr, 0, offset.src, 0, offset.src_shift, 0,
+                       offset.imm, 0, create_immed(b, dest_components), 0);
    }
 
    load->cat6.type = type_uint_size(intr->def.bit_size);

@@ -479,33 +593,22 @@ emit_intrinsic_store_global_ir3,
                                 nir_intrinsic_instr *intr)
 {
    struct ir3_builder *b = &ctx->build;
-   struct ir3_instruction *value, *addr, *offset;
+   struct ir3_instruction *value, *addr;
    unsigned ncomp = nir_intrinsic_src_components(intr, 0);
 
    addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0]);
 
    value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);
 
+   struct ldg_stg_offset offset = ldg_stg_offset(ctx, intr);
    struct ir3_instruction *stg;
 
-   bool const_offset_in_bounds = nir_src_is_const(intr->src[2]) &&
-                                 nir_src_as_int(intr->src[2]) < (1 << 10) &&
-                                 nir_src_as_int(intr->src[2]) > -(1 << 10);
-
-   if (const_offset_in_bounds) {
-      stg = ir3_STG(b, addr, 0,
-                    create_immed(b, nir_src_as_int(intr->src[2]) * 4), 0,
-                    value, 0,
-                    create_immed(b, ncomp), 0);
+   if (!offset.src) {
+      stg = ir3_STG(b, addr, 0, offset.imm, 0, value, 0, create_immed(b, ncomp),
+                    0);
    } else {
-      offset = ir3_get_src(ctx, &intr->src[2])[0];
-      if (ctx->compiler->gen >= 7) {
-         /* A7XX TODO: Move to NIR for it to be properly optimized? */
-         offset = ir3_SHL_B(b, offset, 0, create_immed(b, 2), 0);
-      }
-      stg =
-         ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
-                   create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
+      stg = ir3_STG_A(b, addr, 0, offset.src, 0, offset.src_shift, 0,
+                      offset.imm, 0, value, 0, create_immed(b, ncomp), 0);
    }
 
    stg->cat6.type = type_uint_size(intr->src[0].ssa->bit_size);

@@ -607,7 +607,7 @@ lower_shader_clock(struct nir_builder *b, nir_intrinsic_instr *instr, void *data
       nir_def *clock_lo =
          nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 0));
       nir_def *clock_hi =
-         nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 1));
+         nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 4));
       clock = nir_vec2(b, clock_lo, clock_hi);
    }
    nir_push_else(b, NULL);

@@ -2052,3 +2052,20 @@ ir3_nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
 
    return sysval;
 }
+
+nir_io_offset
+ir3_nir_get_global_offset(nir_builder *b, struct ir3_compiler *compiler,
+                          nir_def *offset, unsigned offset_shift)
+{
+   if (compiler->gen >= 7) {
+      return (nir_io_offset){
+         .def = nir_ishl_imm(b, offset, offset_shift),
+         .shift = 0,
+      };
+   }
+
+   return (nir_io_offset){
+      .def = offset,
+      .shift = offset_shift,
+   };
+}

@@ -212,6 +212,10 @@ unsigned ir3_nir_max_offset_shift(nir_intrinsic_instr *intr, const void *data);
 gl_system_value
 ir3_nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr);
 
+nir_io_offset ir3_nir_get_global_offset(nir_builder *b,
+                                        struct ir3_compiler *compiler,
+                                        nir_def *offset, unsigned offset_shift);
+
 ENDC;
 
 #endif /* IR3_NIR_H_ */

@@ -16,18 +16,13 @@ get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
    uint32_t offset = nir_intrinsic_range_base(instr);
    uint32_t size = nir_intrinsic_range(instr);
 
-   if (instr->intrinsic == nir_intrinsic_load_global_ir3) {
-      offset *= 4;
-      size *= 4;
-   }
-
    /* If the offset is constant, the range is trivial (and NIR may not have
    * figured it out).
    */
    if (nir_src_is_const(instr->src[1])) {
       offset = nir_src_as_uint(instr->src[1]);
-      if (instr->intrinsic == nir_intrinsic_load_global_ir3)
-         offset *= 4;
+      offset <<= nir_intrinsic_offset_shift(instr);
       size = nir_intrinsic_dest_components(instr) * 4;
    }
 

@@ -297,25 +292,30 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 
    nir_def *uniform_offset = ubo_offset;
 
-   if (instr->intrinsic == nir_intrinsic_load_ubo) {
-      /* UBO offset is in bytes, but uniform offset is in units of
-       * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
-       * offset is in units of 16 bytes, so we need to multiply by 4. And
-       * also the same for the constant part of the offset:
-       */
-      const int shift = -2;
-      nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
-      if (new_offset) {
-         uniform_offset = new_offset;
-      } else {
-         uniform_offset = shift > 0
-                             ? nir_ishl_imm(b, ubo_offset, shift)
-                             : nir_ushr_imm(b, ubo_offset, -shift);
-      }
-   }
+   /* UBO/global offset is in bytes, but uniform offset is in units of
+    * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
+    * offset is in units of 16 bytes, so we need to multiply by 4. And
+    * also the same for the constant part of the offset:
+    */
+   int shift = -2;
+
+   if (instr->intrinsic == nir_intrinsic_load_global_ir3) {
+      unsigned offset_shift = nir_intrinsic_offset_shift(instr);
+      assert(offset_shift <= 2);
+
+      shift = -(2 - offset_shift);
+   }
+
+   nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, shift);
+   if (new_offset) {
+      uniform_offset = new_offset;
+   } else {
+      uniform_offset = shift > 0 ? nir_ishl_imm(b, ubo_offset, shift)
+                                 : nir_ushr_imm(b, ubo_offset, -shift);
+   }
 
    assert(!(const_offset & 0x3));
-   const_offset >>= 2;
+   const_offset >>= -shift;
 
    const int range_offset = ((int)range->offset - (int)range->start) / 4;
    const_offset += range_offset;

@@ -258,6 +258,52 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
    return true;
 }
 
+/* On a6xx, global memory is accessed in units of the type size. Legalize
+ * offset_shift to correspond to this.
+ */
+static bool
+lower_offset_for_global(nir_builder *b, nir_intrinsic_instr *intr,
+                        struct ir3_compiler *compiler)
+{
+   if (compiler->gen >= 7) {
+      assert(nir_intrinsic_offset_shift(intr) == 0);
+      return false;
+   }
+
+   unsigned bit_size = intr->intrinsic == nir_intrinsic_load_global_ir3
+                          ? intr->def.bit_size
+                          : intr->src[0].ssa->bit_size;
+
+   assert(bit_size < 64);
+
+   int shift = ffs(bit_size / 8) - 1;
+   int cur_shift = nir_intrinsic_offset_shift(intr);
+   int extra_shift = shift - cur_shift;
+
+   if (extra_shift == 0) {
+      return false;
+   }
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_src *offset_src = nir_get_io_offset_src(intr);
+   nir_io_offset new_offset = {
+      .def = ir3_nir_try_propagate_bit_shift(b, offset_src->ssa, -extra_shift),
+      .shift = shift,
+   };
+
+   if (!new_offset.def) {
+      if (extra_shift > 0) {
+         new_offset.def = nir_ushr_imm(b, offset_src->ssa, extra_shift);
+      } else {
+         new_offset.def = nir_ishl_imm(b, offset_src->ssa, -extra_shift);
+      }
+   }
+
+   nir_set_io_offset(intr, new_offset);
+   return true;
+}
+
 static bool
 lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx,
                        struct ir3_compiler *c)

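As a standalone sanity check of the unit bookkeeping in this pass and in lower_ubo_load_to_uniform above (not part of the patch, just restating its arithmetic): lower_offset_for_global targets type-size units on a6xx, and the UBO analysis then needs 2 - offset_shift further right-shifts to reach the dword units of the uniform file.

#include <assert.h>
#include <strings.h> /* ffs() */

int
main(void)
{
   /* offset_shift values lower_offset_for_global establishes on a6xx. */
   assert(ffs(8 / 8) - 1 == 0);   /*  8-bit access: byte units     */
   assert(ffs(16 / 8) - 1 == 1);  /* 16-bit access: halfword units */
   assert(ffs(32 / 8) - 1 == 2);  /* 32-bit access: dword units    */

   /* Extra shift lower_ubo_load_to_uniform applies to reach dword units. */
   assert(-(2 - 2) == 0);  /* dword-unit offset: used as-is         */
   assert(-(2 - 1) == -1); /* halfword units: one more right shift  */
   assert(-(2 - 0) == -2); /* byte units (a7xx): right shift by two */
   return 0;
}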
@@ -288,6 +334,11 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx,
             scalarize_load(intr, b);
             progress = true;
          }
+
+         if (intr->intrinsic == nir_intrinsic_load_global_ir3 ||
+             intr->intrinsic == nir_intrinsic_store_global_ir3) {
+            progress |= lower_offset_for_global(b, intr, c);
+         }
       }
 
    return progress;

@@ -8,6 +8,7 @@
 #include "ir3_nir.h"
 
 struct state {
+   struct ir3_compiler *compiler;
    uint32_t topology;
 
    struct primitive_map {

@@ -190,6 +191,33 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
    return new_intr;
 }
 
+static void
+replace_with_load_global(nir_builder *b, struct ir3_compiler *compiler,
+                         nir_intrinsic_instr *intr, nir_def *addr,
+                         nir_def *offset)
+{
+   /* Our offsets are in units of 4B. */
+   nir_io_offset global_offset =
+      ir3_nir_get_global_offset(b, compiler, offset, 2);
+   nir_def *load = nir_load_global_ir3(
+      b, intr->def.num_components, intr->def.bit_size, addr, global_offset.def,
+      .align_mul = 4, .align_offset = 0, .offset_shift = global_offset.shift);
+   nir_def_replace(&intr->def, load);
+}
+
+static void
+replace_with_store_global(nir_builder *b, struct ir3_compiler *compiler,
+                          nir_intrinsic_instr *intr, nir_def *val,
+                          nir_def *addr, nir_def *offset)
+{
+   /* Our offsets are in units of 4B. */
+   nir_io_offset global_offset =
+      ir3_nir_get_global_offset(b, compiler, offset, 2);
+   nir_store_global_ir3(b, val, addr, global_offset.def, .align_mul = 4,
+                        .align_offset = 0, .offset_shift = global_offset.shift);
+   nir_instr_remove(&intr->instr);
+}
+
 static void
 build_primitive_map(nir_shader *shader, struct primitive_map *map)
 {

@@ -577,8 +605,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                                  nir_intrinsic_io_semantics(intr).location,
                                  nir_intrinsic_component(intr), intr->src[1].ssa);
 
-         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
-                           offset, NULL);
+         replace_with_load_global(b, state->compiler, intr, address, offset);
          break;
       }
 

@@ -598,8 +625,8 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                                  nir_intrinsic_io_semantics(intr).location,
                                  nir_intrinsic_component(intr), intr->src[2].ssa);
 
-         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value,
-                           address, offset);
+         replace_with_store_global(b, state->compiler, intr, value, address,
+                                   offset);
 
          break;
       }

@@ -623,8 +650,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                                intr->src[0].ssa);
          }
 
-         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
-                           offset, NULL);
+         replace_with_load_global(b, state->compiler, intr, address, offset);
          break;
       }
 

@@ -664,10 +690,9 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
          nir_def *offset = build_tessfactor_base(
             b, location, nir_intrinsic_component(intr), state);
 
-         replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                           intr->src[0].ssa,
-                           load_tess_factor_base(b),
-                           nir_iadd(b, intr->src[1].ssa, offset));
+         replace_with_store_global(
+            b, state->compiler, intr, intr->src[0].ssa,
+            load_tess_factor_base(b), nir_iadd(b, intr->src[1].ssa, offset));
 
          if (location != VARYING_SLOT_PRIMITIVE_ID) {
             nir_pop_if(b, nif);

@@ -678,8 +703,8 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state)
                b, state, location, nir_intrinsic_component(intr),
                intr->src[1].ssa);
 
-            replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3,
-                              intr->src[0].ssa, address, offset);
+            replace_with_store_global(b, state->compiler, intr,
+                                      intr->src[0].ssa, address, offset);
          }
          break;
       }

@@ -694,7 +719,7 @@ bool
 ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
                         unsigned topology)
 {
-   struct state state = {.topology = topology};
+   struct state state = {.topology = topology, .compiler = v->compiler};
 
    if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
       mesa_logi("NIR (before tess lowering) for %s shader:",

@@ -787,8 +812,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
                                  nir_intrinsic_io_semantics(intr).location,
                                  nir_intrinsic_component(intr), intr->src[1].ssa);
 
-         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
-                           offset, NULL);
+         replace_with_load_global(b, state->compiler, intr, address, offset);
          break;
       }
 

@@ -811,8 +835,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state)
                                intr->src[0].ssa);
          }
 
-         replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address,
-                           offset, NULL);
+         replace_with_load_global(b, state->compiler, intr, address, offset);
          break;
       }
 

@@ -826,7 +849,7 @@ bool
 ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                         unsigned topology)
 {
-   struct state state = {.topology = topology};
+   struct state state = {.topology = topology, .compiler = v->compiler};
 
    if (shader_debug_enabled(shader->info.stage, shader->info.internal)) {
      mesa_logi("NIR (before tess lowering) for %s shader:",

@@ -262,8 +262,8 @@ load_tlas(nir_builder *b, nir_def *tlas,
    } else {
       return nir_load_global_ir3(b, components, 32,
                                  nir_pack_64_2x32(b, tlas),
-                                 nir_iadd_imm(b, nir_imul_imm(b, index, AS_RECORD_SIZE / 4),
-                                              offset / 4),
+                                 nir_iadd_imm(b, nir_imul_imm(b, index, AS_RECORD_SIZE),
+                                              offset),
                                  /* The required alignment of the
                                   * user-specified base from the Vulkan spec.
                                   */

@@ -1024,7 +1024,7 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
       val = nir_load_global_ir3(b, intrin->num_components,
                                 intrin->def.bit_size,
                                 nir_pack_64_2x32(b, base_addr),
-                                nir_ishr_imm(b, offset, 2),
+                                offset,
                                 .access =
                                    (enum gl_access_qualifier)(
                                       (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |

@@ -1032,7 +1032,7 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
                                 .align_mul = 16,
                                 .align_offset = 0,
                                 .range_base = 0,
-                                .range = range);
+                                .range = range * 4);
       } else {
          val =
            nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,