From 81df5175537a20133e4460c2c4d047a4fe09d8f8 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Mon, 25 Aug 2025 16:20:32 +0100
Subject: [PATCH] aco: avoid unaligned offsets when selecting load_global_amd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SMEM instructions mask off the low bits for the base and offset sources
both before and after they're added. However, NIR expects ACO to only
care about the alignment of the final address.

fossil-db (gfx1201):
Totals from 21 (0.03% of 79839) affected shaders:
Instrs: 229780 -> 229876 (+0.04%)
CodeSize: 1267724 -> 1268080 (+0.03%)
Latency: 2800924 -> 2800978 (+0.00%)
InvThroughput: 520250 -> 520256 (+0.00%)
Copies: 27878 -> 27876 (-0.01%); split: -0.01%, +0.00%
SALU: 29591 -> 29643 (+0.18%)

fossil-db (polaris10):
Totals from 3 (0.00% of 62201) affected shaders:
Latency: 2651 -> 2652 (+0.04%)
InvThroughput: 662 -> 663 (+0.15%)
PreSGPRs: 51 -> 54 (+5.88%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37301>
---
 .../aco_instruction_selection.h               |  1 +
 .../instruction_selection/aco_isel_setup.cpp  |  2 +
 .../aco_select_nir_intrinsics.cpp             | 37 ++++++++++++++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
index 9405a01b871..29e9b631585 100644
--- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h
+++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
@@ -138,6 +138,7 @@ struct isel_context {
 
    /* NIR range analysis. */
    struct hash_table* range_ht;
+   struct hash_table* numlsb_ht;
 
    Temp arg_temps[AC_MAX_ARGS];
    Operand workgroup_id[3];
diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
index 7746b05f418..d57105cec31 100644
--- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
@@ -375,6 +375,7 @@ init_context(isel_context* ctx, nir_shader* shader)
 
    /* Init NIR range analysis. */
    ctx->range_ht = _mesa_pointer_hash_table_create(NULL);
+   ctx->numlsb_ht = _mesa_pointer_hash_table_create(NULL);
 
    uint32_t options =
       shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
@@ -728,6 +729,7 @@ init_context(isel_context* ctx, nir_shader* shader)
 void
 cleanup_context(isel_context* ctx)
 {
+   _mesa_hash_table_destroy(ctx->numlsb_ht, NULL);
    _mesa_hash_table_destroy(ctx->range_ht, NULL);
 }
 
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
index 1d3c6f9b9ec..01bc8ff9774 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@@ -9,6 +9,8 @@
 #include "aco_instruction_selection.h"
 #include "aco_ir.h"
 
+#include "nir_range_analysis.h"
+
 #include "ac_descriptors.h"
 #include "ac_nir.h"
 #include "amdgfxregs.h"
@@ -281,6 +283,7 @@ struct LoadEmitInfo {
    unsigned align_mul = 0;
    unsigned align_offset = 0;
    pipe_format format;
+   nir_src* resource_src = NULL; /* should be equal to resource or NULL */
    nir_src* offset_src = NULL; /* should be equal to offset or NULL */
    isel_context* ctx;
 
@@ -350,6 +353,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          if (new_info.resource.id() && new_info.resource.size() == 2 &&
              add_might_overflow(ctx, info.offset_src, to_add)) {
             new_info.resource = add64_32(bld, new_info.resource, Operand::c32(to_add));
+            new_info.resource_src = NULL;
             offset_changed = false;
          } else if (offset.isConstant()) {
             offset = Operand::c32(offset.constantValue() + to_add);
@@ -502,6 +506,12 @@ get_smem_opcode(amd_gfx_level level, unsigned bytes, bool buffer, bool round_dow
       return {buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16, 64};
 }
 
+unsigned
+src_has_req_lsb(isel_context* ctx, nir_src* src, unsigned req)
+{
+   return src && nir_def_num_lsb_zero(ctx->numlsb_ht, nir_get_scalar(src->ssa, 0)) >= req;
+}
+
 Temp
 smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed, unsigned align)
 {
@@ -540,6 +550,30 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed
     */
    RegClass rc(RegType::sgpr, DIV_ROUND_UP(util_next_power_of_two(bytes_needed), 4u));
 
+   unsigned req_lsb_zero = bytes_needed == 1 ? 0 : (bytes_needed == 2 ? 1 : 2);
+   if (!buffer) {
+      /* We require each offset source and the final address to be aligned, so ensure at least
+       * two sources are aligned. The remaining one can then be assumed to be aligned, otherwise the
+       * final address is unaligned. */
+      // TODO: lower in NIR
+      bool addr_aligned = src_has_req_lsb(info.ctx, info.resource_src, req_lsb_zero);
+      bool offset_aligned =
+         !offset.id() || src_has_req_lsb(info.ctx, info.offset_src, req_lsb_zero);
+      bool const_aligned = !const_offset || ffs(const_offset) > req_lsb_zero;
+
+      if (!offset_aligned && (!addr_aligned || !const_aligned)) {
+         addr = add64_32(bld, addr, Operand(offset));
+         offset = Temp();
+      }
+      if (!const_aligned && (!addr_aligned || !offset_aligned)) {
+         addr = add64_32(bld, addr, Operand::c32(const_offset));
+         const_offset = 0;
+      }
+   } else {
+      /* We assume the buffer resource is also aligned. */
+      assert(!const_offset || ffs(const_offset) > req_lsb_zero);
+   }
+
    bool soe = !buffer && offset.id() && const_offset && bld.program->gfx_level >= GFX9;
    aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2 + soe, 1)};
    if (buffer) {
@@ -2334,7 +2368,8 @@ visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
    info.align_mul = nir_intrinsic_align_mul(instr);
    info.align_offset = nir_intrinsic_align_offset(instr);
    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
-   info.offset_src = &instr->src[1];
+   info.resource_src = &instr->src[0];
+   info.offset_src = offset.id() ? &instr->src[1] : NULL;
    info.cache = get_cache_flags(ctx, access, ac_access_type_load);
    info.disable_wqm = access & ACCESS_SKIP_HELPERS;