diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 9381f728fea..148c59804b4 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1594,11 +1594,17 @@ load("shared_ir3", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE]) # src[] = { value, address(vec2 of hi+lo uint32_t), offset }. # const_index[] = { write_mask, align_mul, align_offset } -store("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET]) +# Final address is calculated as `address + ((offset + BASE) << OFFSET_SHIFT)`. +# `offset` is sign-extended to 64-bits first so the offset calculation does not +# cause 32-bit overflows. +# a6xx has another shift field which only applies to `offset`; this is not +# represented here. +store("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, OFFSET_SHIFT, BASE]) # src[] = { address(vec2 of hi+lo uint32_t), offset }. # const_index[] = { access, align_mul, align_offset } # the alignment applies to the base address +# Final address is calculated as for @store_global_ir3 +load("global_ir3", [1, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE, OFFSET_SHIFT, BASE], flags=[CAN_ELIMINATE]) # Etnaviv-specific load/glboal intrinsics. They take a 32-bit base address and # a 32-bit offset, which doesn't need to be an immediate. diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index badc964f367..bd6df930149 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -431,6 +431,132 @@ emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, ir3_split_dest(b, dst, resinfo, 0, intr->num_components); } +/* On a6xx, on top of the offset_shift that applies to the whole offset, there's + * a second shift that only applies to the GPR part of the offset (so not to the + * immediate part). 
We extract that here by simply pattern matching for ishl on + * the offset src. Returns the shift if a match is found and it fits in the + * 2-bit field, in which case *offset_src is set to the src of ishl and + * *offset_src_comp to the component of *offset_src. + */ +static unsigned +parse_src_shift(struct ir3_context *ctx, nir_src **offset_src, + unsigned *offset_src_comp) +{ + *offset_src_comp = 0; + + if (ctx->compiler->gen >= 7) { + return 0; + } + + nir_scalar offset = + nir_scalar_chase_movs(nir_get_scalar((*offset_src)->ssa, 0)); + + if (!nir_scalar_is_alu(offset) || nir_scalar_alu_op(offset) != nir_op_ishl) { + return 0; + } + + nir_scalar shift_src = nir_scalar_chase_alu_src(offset, 1); + + if (!nir_scalar_is_const(shift_src)) { + return 0; + } + + unsigned shift = nir_scalar_as_uint(shift_src); + + if (shift >= (1 << 2)) { + return 0; + } + + nir_alu_instr *offset_alu = nir_def_as_alu(offset.def); + *offset_src = &offset_alu->src[0].src; + *offset_src_comp = offset_alu->src[0].swizzle[offset.comp]; + return shift; +} + +static bool +base_fits_ldg_stg_a(struct ir3_compiler *compiler, unsigned base) +{ + if (compiler->gen >= 7) { + return base < (1 << 8); + } + + return base < (1 << 2); +} + +/* Represents an offset for ldg/stg(.a): + * - src == NULL: ldg/stg base_address + imm + * - src != NULL: + * - a6xx: ldg/stg.a base_addr + (src << src_shift) + imm + * - a7xx: ldg/stg.a base_addr + src + imm + */ +struct ldg_stg_offset { + struct ir3_instruction *src; + struct ir3_instruction *src_shift; + struct ir3_instruction *imm; +}; + +static struct ldg_stg_offset +ldg_stg_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr) +{ + assert(intr->intrinsic == nir_intrinsic_load_global_ir3 || + intr->intrinsic == nir_intrinsic_store_global_ir3); + + if (ctx->compiler->gen >= 7) { + assert(nir_intrinsic_offset_shift(intr) == 0); + } else { + ASSERTED unsigned bit_size = + intr->intrinsic == nir_intrinsic_load_global_ir3 + ? 
intr->def.bit_size + : intr->src[0].ssa->bit_size; + assert(nir_intrinsic_offset_shift(intr) == ffs(bit_size / 8) - 1); + } + + struct ldg_stg_offset offset = {}; + nir_src *offset_src = nir_get_io_offset_src(intr); + int32_t base = nir_intrinsic_base(intr); + unsigned offset_shift = nir_intrinsic_offset_shift(intr); + struct ir3_builder *b = &ctx->build; + + if (nir_src_is_const(*offset_src)) { + int32_t full_imm_offset = base + nir_src_as_int(*offset_src); + int32_t full_imm_offset_bytes = full_imm_offset << offset_shift; + + /* ldg/stg offset immediate is 13 bits. Note that ldg/stg use byte offsets + * even on a6xx. + */ + if (full_imm_offset_bytes < (1 << 12) && + full_imm_offset_bytes >= -(1 << 12)) { + offset.imm = create_immed(b, full_imm_offset_bytes); + } else { + /* The immediate offset does not fit. Generate ldg/stg.a with the + * immediate in a GPR. + */ + offset.src = create_immed(b, full_imm_offset); + offset.src_shift = create_immed(b, 0); + offset.imm = create_immed(b, 0); + } + } else { + if (base_fits_ldg_stg_a(ctx->compiler, base)) { + unsigned offset_src_comp; + unsigned shift = parse_src_shift(ctx, &offset_src, &offset_src_comp); + offset.src = ir3_get_src(ctx, offset_src)[offset_src_comp]; + offset.src_shift = create_immed(b, shift); + offset.imm = create_immed(b, base); + } else { + /* This should be rare, but various passes might update + * base/offset_shift in a way that makes the combination illegal. + * Detect that here and replace base by an add. 
+ */ + offset.src = ir3_ADD_U(b, ir3_get_src(ctx, offset_src)[0], 0, + create_immed(b, base), 0); + offset.src_shift = create_immed(b, 0); + offset.imm = create_immed(b, 0); + } + } + + return offset; +} + static void emit_intrinsic_load_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -438,31 +564,19 @@ emit_intrinsic_load_global_ir3(struct ir3_context *ctx, { struct ir3_builder *b = &ctx->build; unsigned dest_components = nir_intrinsic_dest_components(intr); - struct ir3_instruction *addr, *offset; + struct ir3_instruction *addr; addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0]); + struct ldg_stg_offset offset = ldg_stg_offset(ctx, intr); struct ir3_instruction *load; - bool const_offset_in_bounds = - nir_src_is_const(intr->src[1]) && - nir_src_as_int(intr->src[1]) < (1 << 8) && - nir_src_as_int(intr->src[1]) > -(1 << 8); - - if (const_offset_in_bounds) { - load = ir3_LDG(b, addr, 0, - create_immed(b, nir_src_as_int(intr->src[1]) * 4), - 0, create_immed(b, dest_components), 0); + if (!offset.src) { + load = ir3_LDG(b, addr, 0, offset.imm, 0, + create_immed(b, dest_components), 0); } else { - unsigned shift = ctx->compiler->gen >= 7 ? 2 : 0; - offset = ir3_get_src(ctx, &intr->src[1])[0]; - if (shift) { - /* A7XX TODO: Move to NIR for it to be properly optimized? 
*/ - offset = ir3_SHL_B(b, offset, 0, create_immed(b, shift), 0); - } - load = - ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, - create_immed(b, 0), 0, create_immed(b, dest_components), 0); + load = ir3_LDG_A(b, addr, 0, offset.src, 0, offset.src_shift, 0, + offset.imm, 0, create_immed(b, dest_components), 0); } load->cat6.type = type_uint_size(intr->def.bit_size); @@ -479,33 +593,22 @@ emit_intrinsic_store_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr) { struct ir3_builder *b = &ctx->build; - struct ir3_instruction *value, *addr, *offset; + struct ir3_instruction *value, *addr; unsigned ncomp = nir_intrinsic_src_components(intr, 0); addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0]); value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); + struct ldg_stg_offset offset = ldg_stg_offset(ctx, intr); struct ir3_instruction *stg; - bool const_offset_in_bounds = nir_src_is_const(intr->src[2]) && - nir_src_as_int(intr->src[2]) < (1 << 10) && - nir_src_as_int(intr->src[2]) > -(1 << 10); - - if (const_offset_in_bounds) { - stg = ir3_STG(b, addr, 0, - create_immed(b, nir_src_as_int(intr->src[2]) * 4), 0, - value, 0, - create_immed(b, ncomp), 0); + if (!offset.src) { + stg = ir3_STG(b, addr, 0, offset.imm, 0, value, 0, create_immed(b, ncomp), + 0); } else { - offset = ir3_get_src(ctx, &intr->src[2])[0]; - if (ctx->compiler->gen >= 7) { - /* A7XX TODO: Move to NIR for it to be properly optimized? 
*/ - offset = ir3_SHL_B(b, offset, 0, create_immed(b, 2), 0); - } - stg = - ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, - create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0); + stg = ir3_STG_A(b, addr, 0, offset.src, 0, offset.src_shift, 0, + offset.imm, 0, value, 0, create_immed(b, ncomp), 0); } stg->cat6.type = type_uint_size(intr->src[0].ssa->bit_size); diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 200e53e6997..a04061e8106 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -607,7 +607,7 @@ lower_shader_clock(struct nir_builder *b, nir_intrinsic_instr *instr, void *data nir_def *clock_lo = nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 0)); nir_def *clock_hi = - nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 1)); + nir_load_global_ir3(b, 1, 32, base_addr, nir_imm_int(b, 4)); clock = nir_vec2(b, clock_lo, clock_hi); } nir_push_else(b, NULL); @@ -2052,3 +2052,20 @@ ir3_nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr) return sysval; } + +nir_io_offset +ir3_nir_get_global_offset(nir_builder *b, struct ir3_compiler *compiler, + nir_def *offset, unsigned offset_shift) +{ + if (compiler->gen >= 7) { + return (nir_io_offset){ + .def = nir_ishl_imm(b, offset, offset_shift), + .shift = 0, + }; + } + + return (nir_io_offset){ + .def = offset, + .shift = offset_shift, + }; +} diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 887170d68cf..4797fc3b4c1 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -212,6 +212,10 @@ unsigned ir3_nir_max_offset_shift(nir_intrinsic_instr *intr, const void *data); gl_system_value ir3_nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr); +nir_io_offset ir3_nir_get_global_offset(nir_builder *b, + struct ir3_compiler *compiler, + nir_def *offset, unsigned offset_shift); + ENDC; #endif /* IR3_NIR_H_ */ diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c 
b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 3e83573d0eb..aa6b04693f3 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -16,18 +16,13 @@ get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr, uint32_t offset = nir_intrinsic_range_base(instr); uint32_t size = nir_intrinsic_range(instr); - if (instr->intrinsic == nir_intrinsic_load_global_ir3) { - offset *= 4; - size *= 4; - } - /* If the offset is constant, the range is trivial (and NIR may not have * figured it out). */ if (nir_src_is_const(instr->src[1])) { offset = nir_src_as_uint(instr->src[1]); if (instr->intrinsic == nir_intrinsic_load_global_ir3) - offset *= 4; + offset <<= nir_intrinsic_offset_shift(instr); size = nir_intrinsic_dest_components(instr) * 4; } @@ -297,25 +292,30 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, nir_def *uniform_offset = ubo_offset; - if (instr->intrinsic == nir_intrinsic_load_ubo) { - /* UBO offset is in bytes, but uniform offset is in units of - * dwords, so we need to divide by 4 (right-shift by 2). For ldc the - * offset is in units of 16 bytes, so we need to multiply by 4. And - * also the same for the constant part of the offset: - */ - const int shift = -2; - nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2); - if (new_offset) { - uniform_offset = new_offset; - } else { - uniform_offset = shift > 0 - ? nir_ishl_imm(b, ubo_offset, shift) - : nir_ushr_imm(b, ubo_offset, -shift); - } + /* UBO/global offset is in bytes, but uniform offset is in units of + * dwords, so we need to divide by 4 (right-shift by 2). For ldc the + * offset is in units of 16 bytes, so we need to multiply by 4. 
And + * also the same for the constant part of the offset: + */ + int shift = -2; + + if (instr->intrinsic == nir_intrinsic_load_global_ir3) { + unsigned offset_shift = nir_intrinsic_offset_shift(instr); + assert(offset_shift <= 2); + + shift = -(2 - offset_shift); + } + + nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, shift); + if (new_offset) { + uniform_offset = new_offset; + } else { + uniform_offset = shift > 0 ? nir_ishl_imm(b, ubo_offset, shift) + : nir_ushr_imm(b, ubo_offset, -shift); } assert(!(const_offset & 0x3)); - const_offset >>= 2; + const_offset >>= -shift; const int range_offset = ((int)range->offset - (int)range->start) / 4; const_offset += range_offset; diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c index dd89a6e7efe..e01eb2fd336 100644 --- a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c @@ -258,6 +258,52 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b, return true; } +/* On a6xx, global memory is accessed in units of the type size. Legalize + * offset_shift to correspond to this. + */ +static bool +lower_offset_for_global(nir_builder *b, nir_intrinsic_instr *intr, + struct ir3_compiler *compiler) +{ + if (compiler->gen >= 7) { + assert(nir_intrinsic_offset_shift(intr) == 0); + return false; + } + + unsigned bit_size = intr->intrinsic == nir_intrinsic_load_global_ir3 + ? 
intr->def.bit_size + : intr->src[0].ssa->bit_size; + + assert(bit_size < 64); + + int shift = ffs(bit_size / 8) - 1; + int cur_shift = nir_intrinsic_offset_shift(intr); + int extra_shift = shift - cur_shift; + + if (extra_shift == 0) { + return false; + } + + b->cursor = nir_before_instr(&intr->instr); + + nir_src *offset_src = nir_get_io_offset_src(intr); + nir_io_offset new_offset = { + .def = ir3_nir_try_propagate_bit_shift(b, offset_src->ssa, -extra_shift), + .shift = shift, + }; + + if (!new_offset.def) { + if (extra_shift > 0) { + new_offset.def = nir_ushr_imm(b, offset_src->ssa, extra_shift); + } else { + new_offset.def = nir_ishl_imm(b, offset_src->ssa, -extra_shift); + } + } + + nir_set_io_offset(intr, new_offset); + return true; +} + static bool lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, struct ir3_compiler *c) @@ -288,6 +334,11 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, scalarize_load(intr, b); progress = true; } + + if (intr->intrinsic == nir_intrinsic_load_global_ir3 || + intr->intrinsic == nir_intrinsic_store_global_ir3) { + progress |= lower_offset_for_global(b, intr, c); + } } return progress; diff --git a/src/freedreno/ir3/ir3_nir_lower_tess.c b/src/freedreno/ir3/ir3_nir_lower_tess.c index 6b5d66a77b7..b961b92b6e5 100644 --- a/src/freedreno/ir3/ir3_nir_lower_tess.c +++ b/src/freedreno/ir3/ir3_nir_lower_tess.c @@ -8,6 +8,7 @@ #include "ir3_nir.h" struct state { + struct ir3_compiler *compiler; uint32_t topology; struct primitive_map { @@ -190,6 +191,33 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr, return new_intr; } +static void +replace_with_load_global(nir_builder *b, struct ir3_compiler *compiler, + nir_intrinsic_instr *intr, nir_def *addr, + nir_def *offset) +{ + /* Our offsets are in units of 4B. 
*/ + nir_io_offset global_offset = + ir3_nir_get_global_offset(b, compiler, offset, 2); + nir_def *load = nir_load_global_ir3( + b, intr->def.num_components, intr->def.bit_size, addr, global_offset.def, + .align_mul = 4, .align_offset = 0, .offset_shift = global_offset.shift); + nir_def_replace(&intr->def, load); +} + +static void +replace_with_store_global(nir_builder *b, struct ir3_compiler *compiler, + nir_intrinsic_instr *intr, nir_def *val, + nir_def *addr, nir_def *offset) +{ + /* Our offsets are in units of 4B. */ + nir_io_offset global_offset = + ir3_nir_get_global_offset(b, compiler, offset, 2); + nir_store_global_ir3(b, val, addr, global_offset.def, .align_mul = 4, + .align_offset = 0, .offset_shift = global_offset.shift); + nir_instr_remove(&intr->instr); +} + static void build_primitive_map(nir_shader *shader, struct primitive_map *map) { @@ -577,8 +605,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) nir_intrinsic_io_semantics(intr).location, nir_intrinsic_component(intr), intr->src[1].ssa); - replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, - offset, NULL); + replace_with_load_global(b, state->compiler, intr, address, offset); break; } @@ -598,8 +625,8 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) nir_intrinsic_io_semantics(intr).location, nir_intrinsic_component(intr), intr->src[2].ssa); - replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, value, - address, offset); + replace_with_store_global(b, state->compiler, intr, value, address, + offset); break; } @@ -623,8 +650,7 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) intr->src[0].ssa); } - replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, - offset, NULL); + replace_with_load_global(b, state->compiler, intr, address, offset); break; } @@ -664,10 +690,9 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) nir_def *offset = 
build_tessfactor_base( b, location, nir_intrinsic_component(intr), state); - replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, - intr->src[0].ssa, - load_tess_factor_base(b), - nir_iadd(b, intr->src[1].ssa, offset)); + replace_with_store_global( + b, state->compiler, intr, intr->src[0].ssa, + load_tess_factor_base(b), nir_iadd(b, intr->src[1].ssa, offset)); if (location != VARYING_SLOT_PRIMITIVE_ID) { nir_pop_if(b, nif); @@ -678,8 +703,8 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) b, state, location, nir_intrinsic_component(intr), intr->src[1].ssa); - replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, - intr->src[0].ssa, address, offset); + replace_with_store_global(b, state->compiler, intr, + intr->src[0].ssa, address, offset); } break; } @@ -694,7 +719,7 @@ bool ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology) { - struct state state = {.topology = topology}; + struct state state = {.topology = topology, .compiler = v->compiler}; if (shader_debug_enabled(shader->info.stage, shader->info.internal)) { mesa_logi("NIR (before tess lowering) for %s shader:", @@ -787,8 +812,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state) nir_intrinsic_io_semantics(intr).location, nir_intrinsic_component(intr), intr->src[1].ssa); - replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, - offset, NULL); + replace_with_load_global(b, state->compiler, intr, address, offset); break; } @@ -811,8 +835,7 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state) intr->src[0].ssa); } - replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, - offset, NULL); + replace_with_load_global(b, state->compiler, intr, address, offset); break; } @@ -826,7 +849,7 @@ bool ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology) { - struct state state = {.topology = topology}; + struct state state = 
{.topology = topology, .compiler = v->compiler}; if (shader_debug_enabled(shader->info.stage, shader->info.internal)) { mesa_logi("NIR (before tess lowering) for %s shader:", diff --git a/src/freedreno/vulkan/tu_nir_lower_ray_query.cc b/src/freedreno/vulkan/tu_nir_lower_ray_query.cc index 8c506fb8515..6dc3acf7925 100644 --- a/src/freedreno/vulkan/tu_nir_lower_ray_query.cc +++ b/src/freedreno/vulkan/tu_nir_lower_ray_query.cc @@ -262,8 +262,8 @@ load_tlas(nir_builder *b, nir_def *tlas, } else { return nir_load_global_ir3(b, components, 32, nir_pack_64_2x32(b, tlas), - nir_iadd_imm(b, nir_imul_imm(b, index, AS_RECORD_SIZE / 4), - offset / 4), + nir_iadd_imm(b, nir_imul_imm(b, index, AS_RECORD_SIZE), + offset), /* The required alignment of the * user-specified base from the Vulkan spec. */ diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 1b604c484b8..7a37e60aab5 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -1024,7 +1024,7 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data) val = nir_load_global_ir3(b, intrin->num_components, intrin->def.bit_size, nir_pack_64_2x32(b, base_addr), - nir_ishr_imm(b, offset, 2), + offset, .access = (enum gl_access_qualifier)( (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) | @@ -1032,7 +1032,7 @@ lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data) .align_mul = 16, .align_offset = 0, .range_base = 0, - .range = range); + .range = range * 4); } else { val = nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,