aco: support 8/16-bit loads in smem_combine()

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34162>
2025-12-25 08:40:11 +01:00 · 2025-03-11 14:11:45 +00:00 · 2025-03-11 14:11:45 +00:00 · 75efc218f5
commit 75efc218f5
parent 8abb787c6b
1 changed file with 22 additions and 8 deletions
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -791,7 +791,7 @@ parse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* bas
 }

 void
-skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
+skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem, uint32_t align)
 {
   bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4);
   if (soe && !smem->operands[1].isConstant())
@@ -808,10 +808,11 @@ skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
   if (bitwise_instr->opcode != aco_opcode::s_and_b32)
      return;

-   if (bitwise_instr->operands[0].constantEquals(-4) &&
+   uint32_t mask = ~(align - 1u);
+   if (bitwise_instr->operands[0].constantEquals(mask) &&
       bitwise_instr->operands[1].isOfType(op.regClass().type()))
      op.setTemp(bitwise_instr->operands[1].getTemp());
-   else if (bitwise_instr->operands[1].constantEquals(-4) &&
+   else if (bitwise_instr->operands[1].constantEquals(mask) &&
            bitwise_instr->operands[0].isOfType(op.regClass().type()))
      op.setTemp(bitwise_instr->operands[0].getTemp());
 }
@@ -819,9 +820,22 @@ skip_smem_offset_align(opt_ctx& ctx, SMEM_instruction* smem)
 void
 smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
+   uint32_t align = 4;
+   switch (instr->opcode) {
+   case aco_opcode::s_load_sbyte:
+   case aco_opcode::s_load_ubyte:
+   case aco_opcode::s_buffer_load_sbyte:
+   case aco_opcode::s_buffer_load_ubyte: align = 1; break;
+   case aco_opcode::s_load_sshort:
+   case aco_opcode::s_load_ushort:
+   case aco_opcode::s_buffer_load_sshort:
+   case aco_opcode::s_buffer_load_ushort: align = 2; break;
+   default: break;
+   }
+
   /* skip &-4 before offset additions: load((a + 16) & -4, 0) */
-   if (!instr->operands.empty())
-      skip_smem_offset_align(ctx, &instr->smem());
+   if (!instr->operands.empty() && align > 1)
+      skip_smem_offset_align(ctx, &instr->smem(), align);

   /* propagate constants and combine additions */
   if (!instr->operands.empty() && instr->operands[1].isTemp()) {
@@ -834,7 +848,7 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
         instr->operands[1] = Operand::c32(info.val);
      } else if (parse_base_offset(ctx, instr.get(), 1, &base, &offset, true) &&
                 base.regClass() == s1 && offset <= ctx.program->dev.smem_offset_max &&
-                 ctx.program->gfx_level >= GFX9 && offset % 4u == 0) {
+                 ctx.program->gfx_level >= GFX9 && offset % align == 0) {
         bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
         if (soe) {
            if (ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) &&
@@ -860,8 +874,8 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
   }

   /* skip &-4 after offset additions: load(a & -4, 16) */
-   if (!instr->operands.empty())
-      skip_smem_offset_align(ctx, &instr->smem());
+   if (!instr->operands.empty() && align > 1)
+      skip_smem_offset_align(ctx, &instr->smem(), align);
 }

 Operand