aco: lower p_constaddr into separate instructions earlier

This allows the instructions that p_constaddr is lowered into (p_constaddr_getpc, p_constaddr_addlo and s_addc_u32) to be scheduled properly and simplifies the assembler a little.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8994>
2025-12-21 13:40:16 +01:00 · 2021-02-01 12:42:38 +00:00 · 2021-02-01 12:42:38 +00:00 · 0af7ff49fd
commit 0af7ff49fd
parent ab957bb899
5 changed files with 48 additions and 45 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -1,5 +1,6 @@
 #include <vector>
 #include <algorithm>
 #include <map>
 #include "aco_ir.h"
 #include "aco_builder.h"
@ -10,11 +11,16 @@
 namespace aco {
 struct constaddr_info {
   unsigned getpc_end;
   unsigned add_literal;
 };
 struct asm_context {
   Program *program;
   enum chip_class chip_class;
   std::vector<std::pair<int, SOPP_instruction*>> branches;
-   std::vector<unsigned> constaddrs;
+   std::map<unsigned, constaddr_info> constaddrs;
   const int16_t* opcode;
   // TODO: keep track of branch instructions referring blocks
   // and, when emitting the block, correct the offset in instr
@ -45,39 +51,17 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
 void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
 {
   /* lower remaining pseudo-instructions */
-   if (instr->opcode == aco_opcode::p_constaddr) {
+   if (instr->opcode == aco_opcode::p_constaddr_getpc) {
-      unsigned dest = instr->definitions[0].physReg();
+      ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;
      unsigned offset = instr->operands[0].constantValue();
-      /* s_getpc_b64 dest[0:1] */
+      instr->opcode = aco_opcode::s_getpc_b64;
-      uint32_t encoding = (0b101111101 << 23);
+      instr->operands.pop_back();
-      uint32_t opcode = ctx.opcode[(int)aco_opcode::s_getpc_b64];
+   } else if (instr->opcode == aco_opcode::p_constaddr_addlo) {
-      if (opcode >= 55 && ctx.chip_class <= GFX9) {
+      ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;
         assert(ctx.chip_class == GFX9 && opcode < 60);
         opcode = opcode - 4;
      }
      encoding |= dest << 16;
      encoding |= opcode << 8;
      out.push_back(encoding);
-      /* s_add_u32 dest[0], dest[0], ... */
+      instr->opcode = aco_opcode::s_add_u32;
-      encoding = (0b10 << 30);
+      instr->operands[1] = Operand(0u);
-      encoding |= ctx.opcode[(int)aco_opcode::s_add_u32] << 23;
+      instr->operands[1].setFixed(PhysReg(255));
      encoding |= dest << 16;
      encoding |= dest;
      encoding |= 255 << 8;
      out.push_back(encoding);
      ctx.constaddrs.push_back(out.size());
      out.push_back(offset);
      /* s_addc_u32 dest[1], dest[1], 0 */
      encoding = (0b10 << 30);
      encoding |= ctx.opcode[(int)aco_opcode::s_addc_u32] << 23;
      encoding |= (dest + 1) << 16;
      encoding |= dest + 1;
      encoding |= 128 << 8;
      out.push_back(encoding);
      return;
   }
   uint32_t opcode = ctx.opcode[(int)instr->opcode];
@ -798,14 +782,14 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
   for (; branch_it != ctx.branches.end(); ++branch_it)
      branch_it->first += insert_count;
-   /* Find first constant address after the inserted code */
+   /* Update the locations of p_constaddr instructions */
-   auto caddr_it = std::find_if(ctx.constaddrs.begin(), ctx.constaddrs.end(), [insert_before](const int &caddr_pos) -> bool {
+   for (auto& constaddr : ctx.constaddrs) {
-      return (unsigned)caddr_pos >= insert_before;
+      constaddr_info& info = constaddr.second;
-   });
+      if (info.getpc_end >= insert_before)
-
+         info.getpc_end += insert_count;
-   /* Update the locations of constant addresses */
+      if (info.add_literal >= insert_before)
-   for (; caddr_it != ctx.constaddrs.end(); ++caddr_it)
+         info.add_literal += insert_count;
-      (*caddr_it) += insert_count;
+   }
 }
 static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
@ -928,8 +912,10 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
 void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
 {
-   for (unsigned addr : ctx.constaddrs)
+   for (auto& constaddr : ctx.constaddrs) {
-      out[addr] += (out.size() - addr + 1u) * 4u;
+      constaddr_info& info = constaddr.second;
      out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
   }
 }
 unsigned emit_program(Program* program,
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -5550,7 +5550,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
      offset = bld.vadd32(bld.def(v1), Operand(base), offset);
   Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
-                          bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
+                          bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
                          Operand(MIN2(base + range, ctx->shader->constant_data_size)),
                          Operand(desc_type));
   unsigned size = instr->dest.ssa.bit_size / 8;
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@ -1979,6 +1979,17 @@ void lower_to_hw_instr(Program* program)
                  unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
               break;
            }
            case aco_opcode::p_constaddr:
            {
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand(id));
               bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
                        Operand(reg, s1), Operand(id));
               bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),
                        Operand(reg.advance(4), s1), Operand(0u), Operand(scc, s1));
               break;
            }
            default:
               break;
            }
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@ -288,6 +288,8 @@ opcode("p_exit_early_if")
 # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
 opcode("p_bpermute")
 opcode("p_constaddr")
 # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
 SOP2 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
@ -344,6 +346,8 @@ SOP2 = {
   (  -1,   -1,   -1, 0x34, 0x34, "s_pack_hh_b32_b16"),
   (  -1,   -1,   -1, 0x2c, 0x35, "s_mul_hi_u32"),
   (  -1,   -1,   -1, 0x2d, 0x36, "s_mul_hi_i32"),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2.
   (  -1,   -1,   -1,   -1,   -1, "p_constaddr_addlo"),
 }
 for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2:
    opcode(name, gfx7, gfx9, gfx10, Format.SOP2)
@ -457,7 +461,7 @@ SOP1 = {
   (  -1,   -1,   -1,   -1, 0x47, "s_andn2_wrexec_b32"),
   (  -1,   -1,   -1,   -1, 0x49, "s_movrelsd_2_b32"),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
-   (  -1,   -1,   -1,   -1,   -1, "p_constaddr"),
+   (  -1,   -1,   -1,   -1,   -1, "p_constaddr_getpc"),
 }
 for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1:
   opcode(name, gfx7, gfx9, gfx10, Format.SOP1)
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@ -220,7 +220,9 @@ BEGIN_TEST(assembler.long_jump.constaddr)
   //>> s_getpc_b64 s[0:1]                                          ; be801f00
   //! s_add_u32 s0, s0, 0xe0                                      ; 8000ff00 000000e0
-   bld.sop1(aco_opcode::p_constaddr, Definition(PhysReg(0), s2), Operand(0u));
+   bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand(0u));
   bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
            Operand(PhysReg(0), s1), Operand(0u));
   program->blocks[2].linear_preds.push_back(0u);
   program->blocks[2].linear_preds.push_back(1u);