aco/isel: refactor load_shared() by directly matching NIR intrinsics to ACO opcodes

Totals from 3 (0.00% of 79839) affected shaders: (Navi48)

Instrs: 700 -> 698 (-0.29%)
CodeSize: 3860 -> 3852 (-0.21%)
Latency: 2351 -> 2349 (-0.09%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36133>
commit 1fde289539
parent 4632ee4c37
Author: Daniel Schürmann
Date:   2025-07-14 16:23:44 +02:00
Committer: Marge Bot
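For orientation before the diff: the refactor replaces the generic emit_load()/lds_load_callback() path for LDS loads with a direct mapping from the NIR load_shared intrinsic's total byte size to a single ds_read_* opcode. Below is a minimal standalone sketch of that mapping (hypothetical enum and helper names, not Mesa code; the gfx9_plus flag stands in for the gfx_level >= GFX9 check that selects the d16 variants in the diff):

// Standalone model of the new selection rule: one NIR load_shared intrinsic
// maps straight to one DS opcode chosen by total load size.
#include <cassert>

enum class DsReadOp { b128, b96, b64, b32, u16, u16_d16, u8, u8_d16 };

DsReadOp select_ds_read(unsigned bytes, bool gfx9_plus)
{
   switch (bytes) {
   case 16: return DsReadOp::b128;
   case 12: return DsReadOp::b96;
   case 8: return DsReadOp::b64;
   case 4: return DsReadOp::b32;
   case 2: return gfx9_plus ? DsReadOp::u16_d16 : DsReadOp::u16;
   case 1: return gfx9_plus ? DsReadOp::u8_d16 : DsReadOp::u8;
   default: assert(!"unsupported load_shared size"); return DsReadOp::b32;
   }
}

int main()
{
   assert(select_ds_read(12, true) == DsReadOp::b96);    // vec3 of 32-bit values
   assert(select_ds_read(2, true) == DsReadOp::u16_d16); // 16-bit load on GFX9+
   assert(select_ds_read(2, false) == DsReadOp::u16);    // pre-GFX9 fallback
}

The new code can assume a fixed size per intrinsic because of the alignment assert visible in visit_load_shared() below (align % bytes == 0, with 16-byte alignment required for the 12-byte case); the old callback's splitting cascade handled arbitrary alignment instead.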

@@ -286,7 +286,6 @@ struct LoadEmitInfo {
    ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}};
    bool split_by_component_stride = true;
-   bool readfirstlane_for_uniform = false;
    unsigned swizzle_component_size = 0;
    memory_sync_info sync;
    Temp soffset = Temp(0, s1);
@@ -451,11 +450,8 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
       /* try to p_as_uniform early so we can create more optimizable code and
        * also update allocated_vec */
       for (unsigned j = start; j < components_split; j++) {
-         if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
-            allocated_vec[j] = emit_vector_as_uniform(
-               ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())),
-               info.readfirstlane_for_uniform);
-         }
+         if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
+            allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
       }
    }
@@ -477,84 +473,13 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
       vec->definitions[0] = Definition(tmp);
       bld.insert(std::move(vec));
-      emit_vector_as_uniform(ctx, tmp, info.dst, info.readfirstlane_for_uniform);
+      bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
    } else {
       vec->definitions[0] = Definition(info.dst);
       bld.insert(std::move(vec));
    }
 }
 
-Temp
-lds_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_needed, unsigned align)
-{
-   Temp offset =
-      info.offset.regClass() == s1 ? bld.copy(bld.def(v1), info.offset) : info.offset.getTemp();
-   uint32_t const_offset = info.const_offset;
-
-   Operand m = load_lds_size_m0(bld);
-
-   bool large_ds_read = bld.program->gfx_level >= GFX7;
-   bool usable_read2 = bld.program->gfx_level >= GFX7;
-
-   bool read2 = false;
-   unsigned size = 0;
-   aco_opcode op;
-   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
-      size = 16;
-      op = aco_opcode::ds_read_b128;
-   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
-      size = 16;
-      read2 = true;
-      op = aco_opcode::ds_read2_b64;
-   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
-      size = 12;
-      op = aco_opcode::ds_read_b96;
-   } else if (bytes_needed >= 8 && align % 8 == 0) {
-      size = 8;
-      op = aco_opcode::ds_read_b64;
-   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
-      size = 8;
-      read2 = true;
-      op = aco_opcode::ds_read2_b32;
-   } else if (bytes_needed >= 4 && align % 4 == 0) {
-      size = 4;
-      op = aco_opcode::ds_read_b32;
-   } else if (bytes_needed >= 2 && align % 2 == 0) {
-      size = 2;
-      op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
-   } else {
-      size = 1;
-      op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
-   }
-
-   unsigned const_offset_unit = read2 ? size / 2u : 1u;
-   unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
-   if (const_offset > (const_offset_range - const_offset_unit)) {
-      unsigned excess = const_offset - (const_offset % const_offset_range);
-      offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
-      const_offset -= excess;
-   }
-
-   const_offset /= const_offset_unit;
-
-   RegClass rc = RegClass::get(RegType::vgpr, size);
-   Temp val = rc == info.dst.regClass() ? info.dst : bld.tmp(rc);
-   Instruction* instr;
-   if (read2)
-      instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
-   else
-      instr = bld.ds(op, Definition(val), offset, m, const_offset);
-   instr->ds().sync = info.sync;
-
-   if (m.isUndefined())
-      instr->operands.pop_back();
-
-   return val;
-}
-
-const EmitLoadParameters lds_load_params{lds_load_callback, UINT32_MAX};
-
 std::pair<aco_opcode, unsigned>
 get_smem_opcode(amd_gfx_level level, unsigned bytes, bool buffer, bool round_down)
 {
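The deleted lds_load_callback() above encodes the old strategy: pick the largest single DS access that the remaining bytes_needed, the alignment, and the gfx level permit, and let the emit_load() framework invoke the callback again for whatever remains. A condensed standalone model of that cascade (hypothetical helper, not Mesa code; the ds_read2_* pair forms are omitted for brevity):

#include <cstdio>

// Greedy chunk selection modeled on the removed if/else cascade: each entry
// requires a minimum size and alignment, mirroring ds_read_b128 ... ds_read_u8.
unsigned pick_chunk(unsigned bytes_needed, unsigned align, bool gfx7_plus)
{
   if (bytes_needed >= 16 && align % 16 == 0 && gfx7_plus) return 16; // ds_read_b128
   if (bytes_needed >= 12 && align % 16 == 0 && gfx7_plus) return 12; // ds_read_b96
   if (bytes_needed >= 8 && align % 8 == 0) return 8;                 // ds_read_b64
   if (bytes_needed >= 4 && align % 4 == 0) return 4;                 // ds_read_b32
   if (bytes_needed >= 2 && align % 2 == 0) return 2;                 // ds_read_u16
   return 1;                                                          // ds_read_u8
}

int main()
{
   // 12 bytes at 8-byte alignment: ds_read_b96 needs 16-byte alignment, so the
   // old path split this into a b64 followed by a b32.
   unsigned first = pick_chunk(12, 8, true);          // 8
   unsigned second = pick_chunk(12 - first, 8, true); // 4
   printf("%u + %u\n", first, second);                // prints "8 + 4"
}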
@@ -1019,31 +944,6 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_need
 const EmitLoadParameters global_load_params{global_load_callback, UINT32_MAX};
 
-Temp
-load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
-         Temp address, unsigned base_offset, unsigned align)
-{
-   assert(util_is_power_of_two_nonzero(align));
-
-   Builder bld(ctx->program, ctx->block);
-
-   LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
-   info.align_mul = align;
-   info.align_offset = 0;
-   info.sync = memory_sync_info(storage_shared);
-   info.const_offset = base_offset;
-   /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
-    * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
-    * to avoid copy-propagation.
-    */
-   info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
-                                    ctx->program->wave_size == 64 &&
-                                    ctx->program->workgroup_size > 64;
-   emit_load(ctx, bld, info, lds_load_params);
-
-   return dst;
-}
-
 void
 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
                  Temp src)
@@ -3103,15 +3003,62 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
 void
 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
 {
    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
    Temp dst = get_ssa_temp(ctx, &instr->def);
    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
    Builder bld(ctx->program, ctx->block);
 
    unsigned elem_size_bytes = instr->def.bit_size / 8;
    unsigned num_components = instr->def.num_components;
+   unsigned bytes = elem_size_bytes * num_components;
    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
-   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
+   assert(bytes == 12 ? align % 16 == 0 : align % bytes == 0);
+
+   Operand m = load_lds_size_m0(bld);
+
+   aco_opcode op;
+   switch (bytes) {
+   case 16: op = aco_opcode::ds_read_b128; break;
+   case 12: op = aco_opcode::ds_read_b96; break;
+   case 8: op = aco_opcode::ds_read_b64; break;
+   case 4: op = aco_opcode::ds_read_b32; break;
+   case 2:
+      op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
+      break;
+   case 1:
+      op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
+      break;
+   default: UNREACHABLE("Unsupported load_shared size");
+   }
+
+   unsigned const_offset = nir_intrinsic_base(instr);
+   unsigned const_offset_range = 65536;
+   if (const_offset >= const_offset_range) {
+      unsigned excess = const_offset - (const_offset % const_offset_range);
+      address = bld.vadd32(bld.def(v1), address, Operand::c32(excess));
+      const_offset -= excess;
+   }
+
+   Definition def = dst.regClass().type() == RegType::sgpr
+                       ? bld.def(RegClass::get(RegType::vgpr, bytes))
+                       : Definition(dst);
+   Instruction* ds = bld.ds(op, def, address, m, const_offset);
+   ds->ds().sync = memory_sync_info(storage_shared);
+   if (m.isUndefined())
+      ds->operands.pop_back();
+
+   if (def.getTemp() != dst) {
+      /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform
+       * addresses, if another wave writes LDS in between. Use v_readfirstlane instead of
+       * p_as_uniform in order to avoid copy-propagation.
+       */
+      bool readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
+                                       ctx->program->wave_size == 64 &&
+                                       ctx->program->workgroup_size > 64;
+      emit_vector_as_uniform(ctx, def.getTemp(), dst, readfirstlane_for_uniform);
+   }
+
+   emit_split_vector(ctx, dst, instr->def.num_components);
 }
 
 void
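One detail shared by the removed callback and the new visit_load_shared(): a DS instruction's immediate offset field is 16 bits wide, so a constant offset of 65536 or more is rounded down to a multiple of the range and folded into the address VGPR with an add, keeping only the remainder as the immediate. A standalone model of that arithmetic (hypothetical names, not Mesa code; the removed read2 path additionally worked in element-size units with a 255-unit range per 8-bit offset field):

#include <cassert>
#include <cstdint>

struct FoldedOffset {
   uint32_t add_to_address; // folded into the address VGPR via v_add
   uint32_t imm;            // remains in the DS instruction's offset field
};

FoldedOffset fold_ds_offset(uint32_t const_offset, uint32_t range = 65536)
{
   FoldedOffset f{0, const_offset};
   if (const_offset >= range) {
      f.add_to_address = const_offset - (const_offset % range); // round down to a multiple of range
      f.imm = const_offset % range;
   }
   return f;
}

int main()
{
   // nir_intrinsic_base() of 70000: 65536 goes into the address, 4464 stays
   // as the immediate (70000 = 1 * 65536 + 4464).
   FoldedOffset f = fold_ds_offset(70000);
   assert(f.add_to_address == 65536 && f.imm == 4464);

   // Offsets already in range are untouched.
   assert(fold_ds_offset(65535).add_to_address == 0);
}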