aco: change return type of create_instruction() to Instruction*

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28370>
2026-05-04 20:38:06 +02:00 · 2024-03-25 12:05:50 +01:00 · 2024-03-25 12:05:50 +01:00 · 9b0ebcc39b
commit 9b0ebcc39b
parent cd62f97719
17 changed files with 298 additions and 297 deletions
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -611,9 +611,9 @@ handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& i
   // TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
   if (NOPs) {
      /* create NOP */
-      aco_ptr<SALU_instruction> nop{
+      aco_ptr<Instruction> nop{
         create_instruction<SALU_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
-      nop->imm = NOPs - 1;
+      nop->salu().imm = NOPs - 1;
      new_instructions.emplace_back(std::move(nop));
   }

--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -237,7 +237,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>

      /* create ssa names for outer exec masks */
      if (info.has_discard && preds.size() > 1) {
-         aco_ptr<Pseudo_instruction> phi;
+         aco_ptr<Instruction> phi;
         for (int i = 0; i < info.num_exec_masks - 1; i++) {
            phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
                                                             Format::PSEUDO, preds.size(), 1));
@@ -251,7 +251,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>

      if (info.has_divergent_continue) {
         /* create ssa name for loop active mask */
-         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(
            aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
@@ -312,7 +312,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
            ctx.info[idx].exec.emplace_back(same, type);
         } else {
            /* create phi for loop footer */
-            aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(
               aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
            phi->definitions[0] = bld.def(bld.lm);
            for (unsigned i = 0; i < phi->operands.size(); i++)
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -993,17 +993,17 @@ emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wai
 {
   if (imm.vs != wait_imm::unset_counter) {
      assert(ctx.gfx_level >= GFX10);
-      SALU_instruction* waitcnt_vs =
+      Instruction* waitcnt_vs =
         create_instruction<SALU_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0);
      waitcnt_vs->operands[0] = Operand(sgpr_null, s1);
-      waitcnt_vs->imm = imm.vs;
+      waitcnt_vs->salu().imm = imm.vs;
      instructions.emplace_back(waitcnt_vs);
      imm.vs = wait_imm::unset_counter;
   }
   if (!imm.empty()) {
-      SALU_instruction* waitcnt =
+      Instruction* waitcnt =
         create_instruction<SALU_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
-      waitcnt->imm = imm.pack(ctx.gfx_level);
+      waitcnt->salu().imm = imm.pack(ctx.gfx_level);
      instructions.emplace_back(waitcnt);
   }
   imm = wait_imm();
@@ -1030,9 +1030,9 @@ emit_delay_alu(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
      imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
   }

-   SALU_instruction* inst =
+   Instruction* inst =
      create_instruction<SALU_instruction>(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
-   inst->imm = imm;
+   inst->salu().imm = imm;
   inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
   instructions.emplace_back(inst);
   delay = alu_delay_info();
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -380,7 +380,7 @@ emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
-   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
@@ -432,7 +432,7 @@ expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components
   if (zero_padding)
      padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));

-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
@@ -553,7 +553,7 @@ byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigne
   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
-      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
@@ -749,7 +749,7 @@ get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
-      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec_instr{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
@@ -823,7 +823,7 @@ void
 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
 {
-   aco_ptr<SALU_instruction> sop2{
+   aco_ptr<Instruction> sop2{
      create_instruction<SALU_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
@@ -1407,7 +1407,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
@@ -1484,7 +1484,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else {
-            aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
               aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
            vec->definitions[0] = Definition(dst);
            for (unsigned i = 0; i < dst.size(); ++i)
@@ -3954,7 +3954,7 @@ visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
      bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
   } else {
      assert(dst.size() != 1);
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
      if (instr->def.bit_size == 64)
         for (unsigned i = 0; i < dst.size(); i++)
@@ -3978,7 +3978,7 @@ emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
   } else if (src.size() == 1) {
      bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
   } else {
-      aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
         aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
      split->operands[0] = Operand(src);

@@ -3990,7 +3990,7 @@ emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
      Instruction* split_raw = split.get();
      ctx->block->instructions.emplace_back(std::move(split));

-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
      vec->definitions[0] = Definition(dst);
      for (unsigned i = 0; i < src.size(); i++) {
@@ -4246,7 +4246,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
         tmp[num_tmps++] = vals[i++];
      }
      if (num_tmps > 1) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
         for (unsigned j = 0; j < num_tmps; j++)
            vec->operands[j] = Operand(tmp[j]);
@@ -4272,7 +4272,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
         allocated_vec[components_split++] = tmp[0];
      } else {
         assert(tmp_size % elem_rc.bytes() == 0);
-         aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
            aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
         for (auto& def : split->definitions) {
            Temp component = bld.tmp(elem_rc);
@@ -4305,7 +4305,7 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
   int padding_bytes =
      MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);

-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
   for (unsigned i = 0; i < info.num_components; i++)
      vec->operands[i] = Operand(allocated_vec[i]);
@@ -4440,7 +4440,7 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
      op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
   }

-   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+   aco_ptr<Instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
   if (buffer) {
      if (const_offset)
         offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
@@ -4460,9 +4460,10 @@ smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned
   RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
   Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
   load->definitions[0] = Definition(val);
-   load->glc = info.glc;
-   load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   load->sync = info.sync;
+   load->smem().glc = info.glc;
+   load->smem().dlc =
+      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   load->smem().sync = info.sync;
   bld.insert(std::move(load));
   return val;
 }
@@ -4514,18 +4515,19 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
      bytes_size = 16;
      op = aco_opcode::buffer_load_dwordx4;
   }
-   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+   aco_ptr<Instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
   mubuf->operands[0] = Operand(info.resource);
   mubuf->operands[1] = vaddr;
   mubuf->operands[2] = soffset;
-   mubuf->offen = offen;
-   mubuf->idxen = idxen;
-   mubuf->glc = info.glc;
-   mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mubuf->slc = info.slc;
-   mubuf->sync = info.sync;
-   mubuf->offset = const_offset;
-   mubuf->swizzled = info.swizzle_component_size != 0;
+   mubuf->mubuf().offen = offen;
+   mubuf->mubuf().idxen = idxen;
+   mubuf->mubuf().glc = info.glc;
+   mubuf->mubuf().dlc =
+      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   mubuf->mubuf().slc = info.slc;
+   mubuf->mubuf().sync = info.sync;
+   mubuf->mubuf().offset = const_offset;
+   mubuf->mubuf().swizzled = info.swizzle_component_size != 0;
   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   mubuf->definitions[0] = Definition(val);
@@ -4581,17 +4583,18 @@ mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
      }
   }

-   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+   aco_ptr<Instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
   mubuf->operands[0] = Operand(info.resource);
   mubuf->operands[1] = vaddr;
   mubuf->operands[2] = soffset;
-   mubuf->offen = offen;
-   mubuf->idxen = idxen;
-   mubuf->glc = info.glc;
-   mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mubuf->slc = info.slc;
-   mubuf->sync = info.sync;
-   mubuf->offset = const_offset;
+   mubuf->mubuf().offen = offen;
+   mubuf->mubuf().idxen = idxen;
+   mubuf->mubuf().glc = info.glc;
+   mubuf->mubuf().dlc =
+      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   mubuf->mubuf().slc = info.slc;
+   mubuf->mubuf().sync = info.sync;
+   mubuf->mubuf().offset = const_offset;
   RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   mubuf->definitions[0] = Definition(val);
@@ -4629,11 +4632,11 @@ scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsig
   }
   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
-   aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
+   aco_ptr<Instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
   flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
   flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
-   flat->sync = info.sync;
-   flat->offset = const_offset;
+   flat->scratch().sync = info.sync;
+   flat->scratch().offset = const_offset;
   flat->definitions[0] = Definition(val);
   bld.insert(std::move(flat));

@@ -4793,21 +4796,20 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   if (use_mubuf) {
-      aco_ptr<MUBUF_instruction> mubuf{
-         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+      aco_ptr<Instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
      mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
      mubuf->operands[2] = Operand(offset);
-      mubuf->glc = info.glc;
-      mubuf->dlc = false;
-      mubuf->offset = const_offset;
-      mubuf->addr64 = addr.type() == RegType::vgpr;
-      mubuf->disable_wqm = false;
-      mubuf->sync = info.sync;
+      mubuf->mubuf().glc = info.glc;
+      mubuf->mubuf().dlc = false;
+      mubuf->mubuf().offset = const_offset;
+      mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
+      mubuf->mubuf().disable_wqm = false;
+      mubuf->mubuf().sync = info.sync;
      mubuf->definitions[0] = Definition(val);
      bld.insert(std::move(mubuf));
   } else {
-      aco_ptr<FLAT_instruction> flat{
+      aco_ptr<Instruction> flat{
         create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
      if (addr.regClass() == s2) {
         assert(global && offset.id() && offset.type() == RegType::vgpr);
@@ -4818,12 +4820,12 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign
         flat->operands[0] = Operand(addr);
         flat->operands[1] = Operand(s1);
      }
-      flat->glc = info.glc;
-      flat->dlc =
+      flat->flatlike().glc = info.glc;
+      flat->flatlike().dlc =
         info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-      flat->sync = info.sync;
+      flat->flatlike().sync = info.sync;
      assert(global || !const_offset);
-      flat->offset = const_offset;
+      flat->flatlike().offset = const_offset;
      flat->definitions[0] = Definition(val);
      bld.insert(std::move(flat));
   }
@@ -5178,7 +5180,7 @@ create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_t
      dst = bld.tmp(RegClass(reg_type, cnt * dword_size));

   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
-   aco_ptr<Pseudo_instruction> instr{
+   aco_ptr<Instruction> instr{
      create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
   instr->definitions[0] = Definition(dst);

@@ -5553,7 +5555,7 @@ emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
 {
   Builder bld(ctx->program, ctx->block);

-   aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> vec(create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
   for (unsigned i = 0; i < num_components; i++) {
      if (ctx->args->frag_pos[i].used)
@@ -5617,7 +5619,7 @@ visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
   if (instr->def.num_components == 1) {
      emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
   } else {
-      aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec(create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
      for (unsigned i = 0; i < instr->def.num_components; i++) {
         Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
@@ -5709,19 +5711,20 @@ mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne
      abort();
   }

-   aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
+   aco_ptr<Instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
   mtbuf->operands[0] = Operand(info.resource);
   mtbuf->operands[1] = vaddr;
   mtbuf->operands[2] = soffset;
-   mtbuf->offen = offen;
-   mtbuf->idxen = idxen;
-   mtbuf->glc = info.glc;
-   mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
-   mtbuf->slc = info.slc;
-   mtbuf->sync = info.sync;
-   mtbuf->offset = const_offset;
-   mtbuf->dfmt = fetch_fmt & 0xf;
-   mtbuf->nfmt = fetch_fmt >> 4;
+   mtbuf->mtbuf().offen = offen;
+   mtbuf->mtbuf().idxen = idxen;
+   mtbuf->mtbuf().glc = info.glc;
+   mtbuf->mtbuf().dlc =
+      info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
+   mtbuf->mtbuf().slc = info.slc;
+   mtbuf->mtbuf().sync = info.sync;
+   mtbuf->mtbuf().offset = const_offset;
+   mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
+   mtbuf->mtbuf().nfmt = fetch_fmt >> 4;
   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   mtbuf->definitions[0] = Definition(val);
@@ -5757,7 +5760,7 @@ visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
      unsigned num_components = instr->def.num_components;
      if (instr->def.bit_size == 64)
         num_components *= 2;
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++) {
         unsigned chan_component = (component + i) % 4;
@@ -5876,7 +5879,7 @@ visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
      if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
          start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
         std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
         unsigned arg_index =
            util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
@@ -6066,7 +6069,7 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
   if (nsa_size < coords.size()) {
      Temp coord = coords[nsa_size];
      if (coords.size() - nsa_size > 1) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};

         unsigned coord_size = 0;
@@ -6088,7 +6091,7 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v

   bool has_dst = dst.id() != 0;

-   aco_ptr<MIMG_instruction> mimg{
+   aco_ptr<Instruction> mimg{
      create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
   if (has_dst)
      mimg->definitions[0] = Definition(dst);
@@ -6100,11 +6103,9 @@ emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::v
      if (coords[i].regClass().is_linear_vgpr())
         mimg->operands[3 + i].setLateKill(true);
   }
-   mimg->strict_wqm = strict_wqm;
+   mimg->mimg().strict_wqm = strict_wqm;

-   MIMG_instruction* res = mimg.get();
-   bld.insert(std::move(mimg));
-   return res;
+   return &bld.insert(std::move(mimg))->mimg();
 }

 void
@@ -6257,8 +6258,8 @@ emit_tfe_init(Builder& bld, Temp dst)
 {
   Temp tmp = bld.tmp(dst.regClass());

-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
-      aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
+   aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
+                                                                   Format::PSEUDO, dst.size(), 1)};
   for (unsigned i = 0; i < dst.size(); i++)
      vec->operands[i] = Operand::zero();
   vec->definitions[0] = Definition(tmp);
@@ -6332,19 +6333,19 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
         default: unreachable(">4 channel buffer image load");
         }
      }
-      aco_ptr<MUBUF_instruction> load{
+      aco_ptr<Instruction> load{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
      load->operands[0] = Operand(resource);
      load->operands[1] = Operand(vindex);
      load->operands[2] = Operand::c32(0);
      load->definitions[0] = Definition(tmp);
-      load->idxen = true;
-      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
-      load->dlc =
-         load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
-      load->sync = sync;
-      load->tfe = is_sparse;
-      if (load->tfe)
+      load->mubuf().idxen = true;
+      load->mubuf().glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+      load->mubuf().dlc = load->mubuf().glc &&
+                          (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
+      load->mubuf().sync = sync;
+      load->mubuf().tfe = is_sparse;
+      if (load->mubuf().tfe)
         load->operands[3] = emit_tfe_init(bld, tmp);
      ctx->block->instructions.emplace_back(std::move(load));
   } else {
@@ -6446,7 +6447,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
         if (dmask_count == 1) {
            data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
         } else {
-            aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
               aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
            uint32_t index = 0;
            u_foreach_bit (bit, dmask) {
@@ -6480,17 +6481,17 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
         default: unreachable(">4 channel buffer image store");
         }
      }
-      aco_ptr<MUBUF_instruction> store{
+      aco_ptr<Instruction> store{
         create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
      store->operands[0] = Operand(rsrc);
      store->operands[1] = Operand(vindex);
      store->operands[2] = Operand::c32(0);
      store->operands[3] = Operand(data);
-      store->idxen = true;
-      store->glc = glc;
-      store->dlc = false;
-      store->disable_wqm = true;
-      store->sync = sync;
+      store->mubuf().idxen = true;
+      store->mubuf().glc = glc;
+      store->mubuf().dlc = false;
+      store->mubuf().disable_wqm = true;
+      store->mubuf().sync = sync;
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(store));
      return;
@@ -6634,7 +6635,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
      Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
      // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
      // implemented.");
-      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
+      aco_ptr<Instruction> mubuf{create_instruction<MUBUF_instruction>(
         is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
      mubuf->operands[0] = Operand(resource);
      mubuf->operands[1] = Operand(vindex);
@@ -6644,12 +6645,12 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
         return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
      if (return_previous)
         mubuf->definitions[0] = def;
-      mubuf->offset = 0;
-      mubuf->idxen = true;
-      mubuf->glc = return_previous;
-      mubuf->dlc = false; /* Not needed for atomics */
-      mubuf->disable_wqm = true;
-      mubuf->sync = sync;
+      mubuf->mubuf().offset = 0;
+      mubuf->mubuf().idxen = true;
+      mubuf->mubuf().glc = return_previous;
+      mubuf->mubuf().dlc = false; /* Not needed for atomics */
+      mubuf->mubuf().disable_wqm = true;
+      mubuf->mubuf().sync = sync;
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
      if (return_previous && cmpswap)
@@ -6728,18 +6729,17 @@ visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
   for (unsigned i = 0; i < write_count; i++) {
      aco_opcode op = get_buffer_store_op(write_datas[i].bytes());

-      aco_ptr<MUBUF_instruction> store{
-         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+      aco_ptr<Instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
      store->operands[0] = Operand(rsrc);
      store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
      store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
      store->operands[3] = Operand(write_datas[i]);
-      store->offset = offsets[i];
-      store->offen = (offset.type() == RegType::vgpr);
-      store->glc = glc;
-      store->dlc = false;
-      store->disable_wqm = true;
-      store->sync = sync;
+      store->mubuf().offset = offsets[i];
+      store->mubuf().offen = (offset.type() == RegType::vgpr);
+      store->mubuf().glc = glc;
+      store->mubuf().dlc = false;
+      store->mubuf().disable_wqm = true;
+      store->mubuf().sync = sync;
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(store));
   }
@@ -6767,7 +6767,7 @@ visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
   Temp dst = get_ssa_temp(ctx, &instr->def);

   aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
-   aco_ptr<MUBUF_instruction> mubuf{
+   aco_ptr<Instruction> mubuf{
      create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
   mubuf->operands[0] = Operand(rsrc);
   mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
@@ -6777,12 +6777,12 @@ visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
      return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
   if (return_previous)
      mubuf->definitions[0] = def;
-   mubuf->offset = 0;
-   mubuf->offen = (offset.type() == RegType::vgpr);
-   mubuf->glc = return_previous;
-   mubuf->dlc = false; /* Not needed for atomics */
-   mubuf->disable_wqm = true;
-   mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
+   mubuf->mubuf().offset = 0;
+   mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
+   mubuf->mubuf().glc = return_previous;
+   mubuf->mubuf().dlc = false; /* Not needed for atomics */
+   mubuf->mubuf().disable_wqm = true;
+   mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
   ctx->program->needs_exact = true;
   ctx->block->instructions.emplace_back(std::move(mubuf));
   if (return_previous && cmpswap)
@@ -6901,7 +6901,7 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
         default: unreachable("store_global not implemented for this size.");
         }

-         aco_ptr<FLAT_instruction> flat{
+         aco_ptr<Instruction> flat{
            create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
         if (write_address.regClass() == s2) {
            assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
@@ -6913,12 +6913,12 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
            flat->operands[1] = Operand(s1);
         }
         flat->operands[2] = Operand(write_datas[i]);
-         flat->glc = glc;
-         flat->dlc = false;
+         flat->flatlike().glc = glc;
+         flat->flatlike().dlc = false;
         assert(global || !write_const_offset);
-         flat->offset = write_const_offset;
-         flat->disable_wqm = true;
-         flat->sync = sync;
+         flat->flatlike().offset = write_const_offset;
+         flat->flatlike().disable_wqm = true;
+         flat->flatlike().sync = sync;
         ctx->program->needs_exact = true;
         ctx->block->instructions.emplace_back(std::move(flat));
      } else {
@ -6928,19 +6928,18 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)

         Temp rsrc = get_gfx6_global_rsrc(bld, write_address);

-         aco_ptr<MUBUF_instruction> mubuf{
-            create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
+         aco_ptr<Instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
         mubuf->operands[0] = Operand(rsrc);
         mubuf->operands[1] =
            write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
         mubuf->operands[2] = Operand(write_offset);
         mubuf->operands[3] = Operand(write_datas[i]);
-         mubuf->glc = glc;
-         mubuf->dlc = false;
-         mubuf->offset = write_const_offset;
-         mubuf->addr64 = write_address.type() == RegType::vgpr;
-         mubuf->disable_wqm = true;
-         mubuf->sync = sync;
+         mubuf->mubuf().glc = glc;
+         mubuf->mubuf().dlc = false;
+         mubuf->mubuf().offset = write_const_offset;
+         mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
+         mubuf->mubuf().disable_wqm = true;
+         mubuf->mubuf().sync = sync;
         ctx->program->needs_exact = true;
         ctx->block->instructions.emplace_back(std::move(mubuf));
      }
@ -7029,7 +7028,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
      }

      aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
-      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
+      aco_ptr<Instruction> flat{create_instruction<FLAT_instruction>(
         op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
      if (addr.regClass() == s2) {
         assert(global && offset.id() && offset.type() == RegType::vgpr);
@ -7043,12 +7042,12 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
      flat->operands[2] = Operand(data);
      if (return_previous)
         flat->definitions[0] = Definition(dst);
-      flat->glc = return_previous;
-      flat->dlc = false; /* Not needed for atomics */
+      flat->flatlike().glc = return_previous;
+      flat->flatlike().dlc = false; /* Not needed for atomics */
      assert(global || !const_offset);
-      flat->offset = const_offset;
-      flat->disable_wqm = true;
-      flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
+      flat->flatlike().offset = const_offset;
+      flat->flatlike().disable_wqm = true;
+      flat->flatlike().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(flat));
   } else {
@ -7061,7 +7060,7 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)

      aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;

-      aco_ptr<MUBUF_instruction> mubuf{
+      aco_ptr<Instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
      mubuf->operands[0] = Operand(rsrc);
      mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
@ -7071,12 +7070,12 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
         return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
      if (return_previous)
         mubuf->definitions[0] = def;
-      mubuf->glc = return_previous;
-      mubuf->dlc = false;
-      mubuf->offset = const_offset;
-      mubuf->addr64 = addr.type() == RegType::vgpr;
-      mubuf->disable_wqm = true;
-      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
+      mubuf->mubuf().glc = return_previous;
+      mubuf->mubuf().dlc = false;
+      mubuf->mubuf().offset = const_offset;
+      mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
+      mubuf->mubuf().disable_wqm = true;
+      mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
      ctx->program->needs_exact = true;
      ctx->block->instructions.emplace_back(std::move(mubuf));
      if (return_previous && cmpswap)
@ -7473,7 +7472,7 @@ visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
      offset = 0;
   }

-   aco_ptr<DS_instruction> ds;
+   aco_ptr<Instruction> ds;
   ds.reset(
      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
   ds->operands[0] = Operand(address);
@ -7485,10 +7484,10 @@ visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
         std::swap(ds->operands[1], ds->operands[2]);
   }
   ds->operands[num_operands - 1] = m;
-   ds->offset0 = offset;
+   ds->ds().offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
-   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
+   ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);

   if (m.isUndefined())
      ds->operands.pop_back();
@ -7916,7 +7915,7 @@ emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned
   if (clobber_vcc)
      defs[num_defs++] = bld.def(bld.lm, vcc);

-   Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
+   Instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
      aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
   reduce->operands[0] = Operand(src);
   /* setup_reduce_temp will update these undef operands if needed */
@ -7924,8 +7923,8 @@ emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned
   reduce->operands[2] = Operand(v1.as_linear());
   std::copy(defs, defs + num_defs, reduce->definitions.begin());

-   reduce->reduce_op = op;
-   reduce->cluster_size = cluster_size;
+   reduce->reduction().reduce_op = op;
+   reduce->reduction().cluster_size = cluster_size;
   bld.insert(std::move(reduce));

   return dst.getTemp();
@ -8111,7 +8110,7 @@ create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt*
 {
   Builder bld(ctx->program, ctx->block);

-   aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> exp{create_instruction<Pseudo_instruction>(
      aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
   for (unsigned i = 0; i < 4; i++) {
      exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
@ -9149,7 +9148,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
         bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
      ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);

-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
      unsigned write_mask = nir_intrinsic_write_mask(instr);

@ -9209,36 +9208,37 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)

      const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;

-      aco_ptr<Export_instruction> exp{
+      aco_ptr<Instruction> exp{
         create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};

-      exp->dest = target;
-      exp->enabled_mask = write_mask;
-      exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
+      exp->exp().dest = target;
+      exp->exp().enabled_mask = write_mask;
+      exp->exp().compressed = flags & AC_EXP_FLAG_COMPRESSED;

      /* ACO may reorder position/mrt export instructions, then mark done for last
       * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
       * exports here and leave it to ACO.
       */
      if (target == V_008DFC_SQ_EXP_PRIM)
-         exp->done = flags & AC_EXP_FLAG_DONE;
+         exp->exp().done = flags & AC_EXP_FLAG_DONE;
      else
-         exp->done = false;
+         exp->exp().done = false;

      /* ACO may reorder mrt export instructions, then mark valid mask for last
       * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
       * exports here and leave it to ACO.
       */
      if (target > V_008DFC_SQ_EXP_NULL)
-         exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
+         exp->exp().valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
      else
-         exp->valid_mask = false;
+         exp->exp().valid_mask = false;

-      exp->row_en = row_en;
+      exp->exp().row_en = row_en;

      /* Compressed export uses two bits for a channel. */
-      uint32_t channel_mask =
-         exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
+      uint32_t channel_mask = exp->exp().compressed
+                                 ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0)
+                                 : write_mask;

      Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
      for (unsigned i = 0; i < 4; i++) {
@ -9287,7 +9287,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
      if (it != ctx->allocated_vec.end())
         num_src = src.bytes() / it->second[0].bytes();

-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_start_linear_vgpr, Format::PSEUDO, num_src + !!begin_size, 1)};

      if (begin_size)
@ -9730,15 +9730,15 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
         }
      }

-      aco_ptr<MUBUF_instruction> mubuf{
+      aco_ptr<Instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
      mubuf->operands[0] = Operand(resource);
      mubuf->operands[1] = Operand(coords[0]);
      mubuf->operands[2] = Operand::c32(0);
      mubuf->definitions[0] = Definition(tmp_dst);
-      mubuf->idxen = true;
-      mubuf->tfe = instr->is_sparse;
-      if (mubuf->tfe)
+      mubuf->mubuf().idxen = true;
+      mubuf->mubuf().tfe = instr->is_sparse;
+      if (mubuf->mubuf().tfe)
         mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
      ctx->block->instructions.emplace_back(std::move(mubuf));

@ -10008,7 +10008,7 @@ get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
 void
 visit_phi(isel_context* ctx, nir_phi_instr* instr)
 {
-   aco_ptr<Pseudo_instruction> phi;
+   aco_ptr<Instruction> phi;
   Temp dst = get_ssa_temp(ctx, &instr->def);
   assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);

@ -10111,7 +10111,7 @@ visit_undef(isel_context* ctx, nir_undef_instr* instr)
   if (dst.size() == 1) {
      Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
   } else {
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
+      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
      for (unsigned i = 0; i < dst.size(); i++)
         vec->operands[i] = Operand::zero();
@ -10210,7 +10210,7 @@ end_loop(isel_context* ctx, loop_context* lc)
   /* trim linear phis in loop header */
   for (auto&& instr : loop_entry->instructions) {
      if (instr->opcode == aco_opcode::p_linear_phi) {
-         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
+         aco_ptr<Instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
         new_phi->definitions[0] = instr->definitions[0];
         for (unsigned i = 0; i < new_phi->operands.size(); i++)
            new_phi->operands[i] = instr->operands[i];
@ -10484,13 +10484,14 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,

   /* branch to linear then block */
   assert(cond.regClass() == ctx->program->lane_mask);
-   aco_ptr<Pseudo_branch_instruction> branch;
+   aco_ptr<Instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
                                                              Format::PSEUDO_BRANCH, 1, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->operands[0] = Operand(cond);
-   branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
-                                      sel_ctrl == nir_selection_control_divergent_always_taken;
+   branch->branch().selection_control_remove =
+      sel_ctrl == nir_selection_control_flatten ||
+      sel_ctrl == nir_selection_control_divergent_always_taken;
   ctx->block->instructions.push_back(std::move(branch));

   ic->BB_if_idx = ctx->block->index;
@ -10528,7 +10529,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic,
   Block* BB_then_logical = ctx->block;
   append_logical_end(BB_then_logical);
   /* branch from logical then block to invert block */
-   aco_ptr<Pseudo_branch_instruction> branch;
+   aco_ptr<Instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
@ -10561,8 +10562,9 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic,
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
-   branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
-                                      sel_ctrl == nir_selection_control_divergent_always_taken;
+   branch->branch().selection_control_remove =
+      sel_ctrl == nir_selection_control_flatten ||
+      sel_ctrl == nir_selection_control_divergent_always_taken;
   ctx->block->instructions.push_back(std::move(branch));

   ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
@ -10593,7 +10595,7 @@ end_divergent_if(isel_context* ctx, if_context* ic)
   append_logical_end(BB_else_logical);

   /* branch from logical else block to endif block */
-   aco_ptr<Pseudo_branch_instruction> branch;
+   aco_ptr<Instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
@ -10650,7 +10652,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_uniform;

-   aco_ptr<Pseudo_branch_instruction> branch;
+   aco_ptr<Instruction> branch;
   aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
   branch.reset(
      create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
@ -10687,7 +10689,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
   if (!ic->uniform_has_then_branch) {
      append_logical_end(BB_then);
      /* branch from then block to endif block */
-      aco_ptr<Pseudo_branch_instruction> branch;
+      aco_ptr<Instruction> branch;
      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                                 Format::PSEUDO_BRANCH, 0, 1));
      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
@ -10719,7 +10721,7 @@ end_uniform_if(isel_context* ctx, if_context* ic)
   if (!ctx->cf_info.has_branch) {
      append_logical_end(BB_else);
      /* branch from then block to endif block */
-      aco_ptr<Pseudo_branch_instruction> branch;
+      aco_ptr<Instruction> branch;
      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                                 Format::PSEUDO_BRANCH, 0, 1));
      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
@ -10747,7 +10749,7 @@ visit_if(isel_context* ctx, nir_if* if_stmt)
 {
   Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
   Builder bld(ctx->program, ctx->block);
-   aco_ptr<Pseudo_branch_instruction> branch;
+   aco_ptr<Instruction> branch;
   if_context ic;

   if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
@ -11138,7 +11140,7 @@ create_fs_jump_to_epilog(isel_context* ctx)

   Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));

-   aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> jump{create_instruction<Pseudo_instruction>(
      aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
   jump->operands[0] = Operand(continue_pc);
   for (unsigned i = 0; i < exports.size(); i++) {
@ -11192,8 +11194,8 @@ passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
 static void
 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
 {
-   aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
-      aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
+   aco_ptr<Instruction> end{create_instruction<Pseudo_instruction>(aco_opcode::p_end_with_regs,
+                                                                   Format::PSEUDO, regs.size(), 0)};

   for (unsigned i = 0; i < regs.size(); i++)
      end->operands[i] = regs[i];
@ -11240,7 +11242,7 @@ create_tcs_jump_to_epilog(isel_context* ctx)

   Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));

-   aco_ptr<Pseudo_instruction> jump{
+   aco_ptr<Instruction> jump{
      create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 14, 0)};
   jump->operands[0] = Operand(continue_pc);
   jump->operands[1] = ring_offsets;
@ -11374,7 +11376,7 @@ create_fs_end_for_epilog(isel_context* ctx)
   ctx->program->needs_exact = true;
 }

-Pseudo_instruction*
+Instruction*
 add_startpgm(struct isel_context* ctx)
 {
   unsigned def_count = 0;
@ -11388,7 +11390,7 @@ add_startpgm(struct isel_context* ctx)
         def_count++;
   }

-   Pseudo_instruction* startpgm =
+   Instruction* startpgm =
      create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
   ctx->block->instructions.emplace_back(startpgm);
   for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
@ -11478,7 +11480,7 @@ fix_ls_vgpr_init_bug(isel_context* ctx)
 }

 void
-split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
+split_arguments(isel_context* ctx, Instruction* startpgm)
 {
   /* Split all arguments except for the first (ring_offsets) and the last
    * (exec) so that the dead channels don't stay live throughout the program.
@ -11645,7 +11647,7 @@ insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
   for (unsigned i = 0; i < ctx.args->arg_count; i++)
      src_count += !!BITSET_TEST(ctx.output_args, i);

-   Pseudo_instruction* ret =
+   Instruction* ret =
      create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
   ctx.block->instructions.emplace_back(ret);

@ -11682,7 +11684,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
      init_context(&ctx, nir);
      setup_fp_mode(&ctx, nir);

-      Pseudo_instruction* startpgm = add_startpgm(&ctx);
+      Instruction* startpgm = add_startpgm(&ctx);
      append_logical_start(ctx.block);
      split_arguments(&ctx, startpgm);
      visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
@ -11839,7 +11841,7 @@ create_merged_jump_to_epilog(isel_context* ctx)
   Temp continue_pc =
      convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));

-   aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
+   aco_ptr<Instruction> jump{create_instruction<Pseudo_instruction>(
      aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
   jump->operands[0] = Operand(continue_pc);
   for (unsigned i = 0; i < regs.size(); i++) {
@ -11884,7 +11886,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons

   if (need_startpgm) {
      /* Needs to be after init_context() for FS. */
-      Pseudo_instruction* startpgm = add_startpgm(&ctx);
+      Instruction* startpgm = add_startpgm(&ctx);
      append_logical_start(ctx.block);

      if (ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs &&
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -1671,7 +1671,7 @@ struct instr_deleter_functor {
 template <typename T> using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;

 template <typename T>
-T*
+Instruction*
 create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
 {
@ -1679,7 +1679,7 @@ create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
      sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   void* data = instruction_buffer->allocate(size, alignof(uint32_t));
   memset(data, 0, size);
-   T* inst = (T*)data;
+   Instruction* inst = (Instruction*)data;

   inst->opcode = opcode;
   inst->format = format;
--- a/src/amd/compiler/aco_lower_phis.cpp
+++ b/src/amd/compiler/aco_lower_phis.cpp
@ -108,8 +108,8 @@ get_output(Program* program, unsigned block_idx, ssa_state* state)
   }

   /* create phi */
-   aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
-      aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
+   aco_ptr<Instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
+                                                                   Format::PSEUDO, num_preds, 1)};
   for (unsigned i = 0; i < num_preds; i++)
      phi->operands[i] = state->outputs[block.linear_preds[i]];
   phi->definitions[0] = Definition(output.getTemp());
@ -347,8 +347,8 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block,

   unsigned num_preds = block->linear_preds.size();
   if (phi->operands.size() != num_preds) {
-      Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(
-         aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
+      Instruction* new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
+                                                                  Format::PSEUDO, num_preds, 1)};
      new_phi->definitions[0] = phi->definitions[0];
      phi.reset(new_phi);
   } else {
--- a/src/amd/compiler/aco_lower_to_cssa.cpp
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@ -424,8 +424,8 @@ emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
      // TODO: this should be restricted to a feasible number of registers
      // and otherwise use a temporary to avoid having to reload more (spilled)
      // variables than we have registers.
-      aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
-         aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
+      aco_ptr<Instruction> copy{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
+                                                                       Format::PSEUDO, num, num)};
      it = ltg.begin();
      for (unsigned i = 0; i < num; i++) {
         while (it->second.cp.def.regClass().type() != type)
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@ -601,13 +601,13 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c

   if (src.regClass() == v1b) {
      if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
-         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
+         aco_ptr<Instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin8 || reduce_op == imax8;
-         sdwa->sel[0] = SubdwordSel(1, 0, sext);
-         sdwa->dst_sel = SubdwordSel::dword;
+         sdwa->sdwa().sel[0] = SubdwordSel(1, 0, sext);
+         sdwa->sdwa().dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;
@ -624,13 +624,13 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c
      bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
                        reduce_op == umin16 || reduce_op == umax16;
      if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
-         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
+         aco_ptr<Instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
-         sdwa->sel[0] = SubdwordSel(2, 0, sext);
-         sdwa->dst_sel = SubdwordSel::dword;
+         sdwa->sdwa().sel[0] = SubdwordSel(2, 0, sext);
+         sdwa->sdwa().dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->gfx_level <= GFX7 ||
                 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
@ -2259,7 +2259,7 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
   instr->mimg().strict_wqm = false;

   if ((3 + num_vaddr) > instr->operands.size()) {
-      MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
+      Instruction* new_instr = create_instruction<MIMG_instruction>(
         instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
      std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
                new_instr->definitions.begin());
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@ -905,7 +905,7 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
               smem.operands.back() = Operand(base);
            }
         } else {
-            SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
+            Instruction* new_instr = create_instruction<SMEM_instruction>(
               smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
            new_instr->operands[0] = smem.operands[0];
            new_instr->operands[1] = Operand::c32(offset);
@ -914,11 +914,11 @@ smem_combine(opt_ctx& ctx, aco_ptr<Instruction>& instr)
            new_instr->operands.back() = Operand(base);
            if (!smem.definitions.empty())
               new_instr->definitions[0] = smem.definitions[0];
-            new_instr->sync = smem.sync;
-            new_instr->glc = smem.glc;
-            new_instr->dlc = smem.dlc;
-            new_instr->nv = smem.nv;
-            new_instr->disable_wqm = smem.disable_wqm;
+            new_instr->smem().sync = smem.sync;
+            new_instr->smem().glc = smem.glc;
+            new_instr->smem().dlc = smem.dlc;
+            new_instr->smem().nv = smem.nv;
+            new_instr->smem().disable_wqm = smem.disable_wqm;
            instr.reset(new_instr);
         }
      }
@ -2312,10 +2312,10 @@ combine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
   case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
   }
   bool needs_vop3 = num_sgprs > 1 || (opsel[0] && op[0].type() != RegType::vgpr);
-   VALU_instruction* new_instr = create_instruction<VALU_instruction>(
+   Instruction* new_instr = create_instruction<VALU_instruction>(
      new_op, needs_vop3 ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);

-   new_instr->opsel = opsel;
+   new_instr->valu().opsel = opsel;
   new_instr->operands[0] = copy_operand(ctx, Operand(op[0]));
   new_instr->operands[1] = copy_operand(ctx, Operand(op[1]));
   new_instr->definitions[0] = instr->definitions[0];
@ -2381,13 +2381,13 @@ combine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
      return false;

   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
-   VALU_instruction* new_instr = create_instruction<VALU_instruction>(
+   Instruction* new_instr = create_instruction<VALU_instruction>(
      new_op, cmp->isVOP3() ? asVOP3(Format::VOPC) : Format::VOPC, 2, 1);
-   new_instr->neg = cmp_valu.neg;
-   new_instr->abs = cmp_valu.abs;
-   new_instr->clamp = cmp_valu.clamp;
-   new_instr->omod = cmp_valu.omod;
-   new_instr->opsel = cmp_valu.opsel;
+   new_instr->valu().neg = cmp_valu.neg;
+   new_instr->valu().abs = cmp_valu.abs;
+   new_instr->valu().clamp = cmp_valu.clamp;
+   new_instr->valu().omod = cmp_valu.omod;
+   new_instr->valu().opsel = cmp_valu.opsel;
   new_instr->operands[0] = copy_operand(ctx, cmp->operands[0]);
   new_instr->operands[1] = copy_operand(ctx, cmp->operands[1]);
   new_instr->definitions[0] = instr->definitions[0];
@ -2701,12 +2701,12 @@ create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr
                    Operand operands[3], uint8_t neg, uint8_t abs, uint8_t opsel, bool clamp,
                    unsigned omod)
 {
-   VALU_instruction* new_instr = create_instruction<VALU_instruction>(opcode, Format::VOP3, 3, 1);
-   new_instr->neg = neg;
-   new_instr->abs = abs;
-   new_instr->clamp = clamp;
-   new_instr->omod = omod;
-   new_instr->opsel = opsel;
+   Instruction* new_instr = create_instruction<VALU_instruction>(opcode, Format::VOP3, 3, 1);
+   new_instr->valu().neg = neg;
+   new_instr->valu().abs = abs;
+   new_instr->valu().clamp = clamp;
+   new_instr->valu().omod = omod;
+   new_instr->valu().opsel = opsel;
   new_instr->operands[0] = operands[0];
   new_instr->operands[1] = operands[1];
   new_instr->operands[2] = operands[2];
@ -3746,7 +3746,7 @@ combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
         ctx.uses[instr->operands[i].tempId()]--;

         aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
-         aco_ptr<VALU_instruction> new_instr{
+         aco_ptr<Instruction> new_instr{
            create_instruction<VALU_instruction>(mad_op, Format::VOP3, 3, 1)};
         for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
            new_instr->operands[op_idx] = ops[op_idx];
@ -3930,23 +3930,23 @@ combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)

      /* turn mul + packed add into v_pk_fma_f16 */
      aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
-      aco_ptr<VALU_instruction> fma{create_instruction<VALU_instruction>(mad, Format::VOP3P, 3, 1)};
+      aco_ptr<Instruction> fma{create_instruction<VALU_instruction>(mad, Format::VOP3P, 3, 1)};
      fma->operands[0] = copy_operand(ctx, mul_instr->operands[0]);
      fma->operands[1] = copy_operand(ctx, mul_instr->operands[1]);
      fma->operands[2] = instr->operands[add_op_idx];
-      fma->clamp = vop3p->clamp;
-      fma->neg_lo = mul_neg_lo;
-      fma->neg_hi = mul_neg_hi;
-      fma->opsel_lo = mul_opsel_lo;
-      fma->opsel_hi = mul_opsel_hi;
-      propagate_swizzles(fma.get(), vop3p->opsel_lo[1 - add_op_idx],
+      fma->valu().clamp = vop3p->clamp;
+      fma->valu().neg_lo = mul_neg_lo;
+      fma->valu().neg_hi = mul_neg_hi;
+      fma->valu().opsel_lo = mul_opsel_lo;
+      fma->valu().opsel_hi = mul_opsel_hi;
+      propagate_swizzles(&fma->valu(), vop3p->opsel_lo[1 - add_op_idx],
                         vop3p->opsel_hi[1 - add_op_idx]);
-      fma->opsel_lo[2] = vop3p->opsel_lo[add_op_idx];
-      fma->opsel_hi[2] = vop3p->opsel_hi[add_op_idx];
-      fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
-      fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
-      fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
-      fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
+      fma->valu().opsel_lo[2] = vop3p->opsel_lo[add_op_idx];
+      fma->valu().opsel_hi[2] = vop3p->opsel_hi[add_op_idx];
+      fma->valu().neg_lo[2] = vop3p->neg_lo[add_op_idx];
+      fma->valu().neg_hi[2] = vop3p->neg_hi[add_op_idx];
+      fma->valu().neg_lo[1] = fma->valu().neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
+      fma->valu().neg_hi[1] = fma->valu().neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
      fma->definitions[0] = instr->definitions[0];
      fma->pass_flags = instr->pass_flags;
      instr = std::move(fma);
@ -3995,26 +3995,26 @@ to_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr)

   bool is_add = instr->opcode != aco_opcode::v_mul_f32;

-   aco_ptr<VALU_instruction> vop3p{
+   aco_ptr<Instruction> vop3p{
      create_instruction<VALU_instruction>(aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1)};

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      vop3p->operands[is_add + i] = instr->operands[i];
-      vop3p->neg_lo[is_add + i] = instr->valu().neg[i];
-      vop3p->neg_hi[is_add + i] = instr->valu().abs[i];
+      vop3p->valu().neg_lo[is_add + i] = instr->valu().neg[i];
+      vop3p->valu().neg_hi[is_add + i] = instr->valu().abs[i];
   }
   if (instr->opcode == aco_opcode::v_mul_f32) {
      vop3p->operands[2] = Operand::zero();
-      vop3p->neg_lo[2] = true;
+      vop3p->valu().neg_lo[2] = true;
   } else if (is_add) {
      vop3p->operands[0] = Operand::c32(0x3f800000);
      if (instr->opcode == aco_opcode::v_sub_f32)
-         vop3p->neg_lo[2] ^= true;
+         vop3p->valu().neg_lo[2] ^= true;
      else if (instr->opcode == aco_opcode::v_subrev_f32)
-         vop3p->neg_lo[1] ^= true;
+         vop3p->valu().neg_lo[1] ^= true;
   }
   vop3p->definitions[0] = instr->definitions[0];
-   vop3p->clamp = instr->valu().clamp;
+   vop3p->valu().clamp = instr->valu().clamp;
   vop3p->pass_flags = instr->pass_flags;
   instr = std::move(vop3p);

@ -4418,7 +4418,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
            neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;

         aco_ptr<Instruction> add_instr = std::move(instr);
-         aco_ptr<VALU_instruction> mad;
+         aco_ptr<Instruction> mad;
         if (add_instr->isVOP3P() || mul_instr->isVOP3P()) {
            assert(!omod);
            assert(!opsel);
@ -4448,14 +4448,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)

         for (unsigned i = 0; i < 3; i++) {
            mad->operands[i] = op[i];
-            mad->neg[i] = neg[i];
-            mad->abs[i] = abs[i];
+            mad->valu().neg[i] = neg[i];
+            mad->valu().abs[i] = abs[i];
         }
-         mad->omod = omod;
-         mad->clamp = clamp;
-         mad->opsel_lo = opsel_lo;
-         mad->opsel_hi = opsel_hi;
-         mad->opsel = opsel;
+         mad->valu().omod = omod;
+         mad->valu().clamp = clamp;
+         mad->valu().opsel_lo = opsel_lo;
+         mad->valu().opsel_hi = opsel_hi;
+         mad->valu().opsel = opsel;
         mad->definitions[0] = add_instr->definitions[0];
         mad->definitions[0].setPrecise(add_instr->definitions[0].isPrecise() ||
                                        mul_instr->definitions[0].isPrecise());
@ -4481,7 +4481,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
            ctx.uses[instr->operands[i].tempId()]--;
            ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;

-            aco_ptr<VALU_instruction> new_instr{
+            aco_ptr<Instruction> new_instr{
               create_instruction<VALU_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
            new_instr->operands[0] = Operand::zero();
            new_instr->operands[1] = instr->operands[!i];
@ -4805,7 +4805,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
            if (op.isTemp())
               ctx.uses[op.tempId()]++;

-            aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> extract{create_instruction<Pseudo_instruction>(
               aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
            extract->operands[0] = op;
            extract->definitions[0] = instr->definitions[idx];
@ -4818,7 +4818,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
      if (!done && num_used == 1 &&
          instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
          split_offset % instr->definitions[idx].bytes() == 0) {
-         aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
+         aco_ptr<Instruction> extract{create_instruction<Pseudo_instruction>(
            aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
         extract->operands[0] = instr->operands[0];
         extract->operands[1] =
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@ -109,7 +109,7 @@ setup_reduce_temp(Program* program)

         if ((int)last_top_level_block_idx != inserted_at) {
            reduceTmp = program->allocateTmp(reduceTmp.regClass());
-            aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> create{create_instruction<Pseudo_instruction>(
               aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
            create->definitions[0] = Definition(reduceTmp);
            /* find the right place to insert this definition */
@ -154,7 +154,7 @@ setup_reduce_temp(Program* program)

         if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
            vtmp = program->allocateTmp(vtmp.regClass());
-            aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
+            aco_ptr<Instruction> create{create_instruction<Pseudo_instruction>(
               aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
            create->definitions[0] = Definition(vtmp);
            if (last_top_level_block_idx == block.index) {
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@ -2885,7 +2885,7 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti
   if (parallelcopy.empty())
      return;

-   aco_ptr<Pseudo_instruction> pc;
+   aco_ptr<Instruction> pc;
   pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                   parallelcopy.size(), parallelcopy.size()));
   bool linear_vgpr = false;
@ -2935,8 +2935,8 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti

      handle_pseudo(ctx, tmp_file, pc.get());
   } else {
-      pc->needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
-      pc->tmp_in_scc = false;
+      pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
+      pc->pseudo().tmp_in_scc = false;
   }

   instructions.emplace_back(std::move(pc));
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@ -686,9 +686,8 @@ create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
   get_vopd_opcode_operands(x, x_info, swap_x, &x_op, &num_operands, operands);
   get_vopd_opcode_operands(y, y_info, swap_y, &y_op, &num_operands, operands + num_operands);

-   VOPD_instruction* instr =
-      create_instruction<VOPD_instruction>(x_op, Format::VOPD, num_operands, 2);
-   instr->opy = y_op;
+   Instruction* instr = create_instruction<VOPD_instruction>(x_op, Format::VOPD, num_operands, 2);
+   instr->vopd().opy = y_op;
   instr->definitions[0] = x->definitions[0];
   instr->definitions[1] = y->definitions[0];
   std::copy(operands, operands + num_operands, instr->operands.begin());
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@ -376,7 +376,7 @@ do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id)
      res->definitions[0] = Definition(new_name);
      return res;
   } else {
-      aco_ptr<Pseudo_instruction> reload{
+      aco_ptr<Instruction> reload{
         create_instruction<Pseudo_instruction>(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
      reload->operands[0] = Operand::c32(spill_id);
      reload->definitions[0] = Definition(new_name);
@ -845,7 +845,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
         for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx])
            ctx.add_interference(def_spill_id, pair.second);

-         aco_ptr<Pseudo_instruction> spill{
+         aco_ptr<Instruction> spill{
            create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
         spill->operands[0] = spill_op;
         spill->operands[1] = Operand::c32(def_spill_id);
@ -915,7 +915,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
            ctx.renames[pred_idx].erase(rename_it);
         }

-         aco_ptr<Pseudo_instruction> spill{
+         aco_ptr<Instruction> spill{
            create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
         spill->operands[0] = Operand(var);
         spill->operands[1] = Operand::c32(pair.second);
@ -1054,7 +1054,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
      if (!is_same) {
         /* the variable was renamed differently in the predecessors: we have to create a phi */
         aco_opcode opcode = pair.first.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
-         aco_ptr<Pseudo_instruction> phi{
+         aco_ptr<Instruction> phi{
            create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
         rename = ctx.program->allocateTmp(pair.first.regClass());
         for (unsigned i = 0; i < phi->operands.size(); i++) {
@ -1229,7 +1229,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
            }

            /* add spill to new instructions */
-            aco_ptr<Pseudo_instruction> spill{
+            aco_ptr<Instruction> spill{
               create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
            spill->operands[0] = Operand(to_spill);
            spill->operands[1] = Operand::c32(spill_id);
@ -1757,7 +1757,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
-                  aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
+                  aco_ptr<Instruction> create{create_instruction<Pseudo_instruction>(
                     aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
                  create->definitions[0] = Definition(linear_vgpr);
                  /* find the right place to insert this definition */
@ -1774,7 +1774,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
               }

               /* spill sgpr: just add the vgpr temp to operands */
-               Pseudo_instruction* spill =
+               Instruction* spill =
                  create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 3, 0);
               spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
               spill->operands[0].setLateKill(true);
@ -1798,7 +1798,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
               if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) {
                  Temp linear_vgpr = ctx.program->allocateTmp(v1.as_linear());
                  vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr;
-                  aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
+                  aco_ptr<Instruction> create{create_instruction<Pseudo_instruction>(
                     aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
                  create->definitions[0] = Definition(linear_vgpr);
                  /* find the right place to insert this definition */
@ -1815,8 +1815,8 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
               }

               /* reload sgpr: just add the vgpr temp to operands */
-               Pseudo_instruction* reload = create_instruction<Pseudo_instruction>(
-                  aco_opcode::p_reload, Format::PSEUDO, 2, 1);
+               Instruction* reload = create_instruction<Pseudo_instruction>(aco_opcode::p_reload,
+                                                                            Format::PSEUDO, 2, 1);
               reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
               reload->operands[0].setLateKill(true);
               reload->operands[1] = Operand::c32(spill_slot % ctx.wave_size);
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@ -97,7 +97,7 @@ insert_parallelcopies(ssa_elimination_ctx& ctx)
      }

      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
-      aco_ptr<Pseudo_instruction> pc{
+      aco_ptr<Instruction> pc{
         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                logical_phi_info.size(), logical_phi_info.size())};
      unsigned i = 0;
@ -107,7 +107,7 @@ insert_parallelcopies(ssa_elimination_ctx& ctx)
         i++;
      }
      /* this shouldn't be needed since we're only copying vgprs */
-      pc->tmp_in_scc = false;
+      pc->pseudo().tmp_in_scc = false;
      block.instructions.insert(it, std::move(pc));
   }

@ -122,7 +122,7 @@ insert_parallelcopies(ssa_elimination_ctx& ctx)
      --it;
      assert((*it)->isBranch());
      PhysReg scratch_sgpr = (*it)->definitions[0].physReg();
-      aco_ptr<Pseudo_instruction> pc{
+      aco_ptr<Instruction> pc{
         create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                linear_phi_info.size(), linear_phi_info.size())};
      unsigned i = 0;
@ -131,9 +131,9 @@ insert_parallelcopies(ssa_elimination_ctx& ctx)
         pc->operands[i] = phi_info.op;
         i++;
      }
-      pc->tmp_in_scc = block.scc_live_out;
-      pc->scratch_sgpr = scratch_sgpr;
-      pc->needs_scratch_reg = true;
+      pc->pseudo().tmp_in_scc = block.scc_live_out;
+      pc->pseudo().scratch_sgpr = scratch_sgpr;
+      pc->pseudo().needs_scratch_reg = true;
      block.instructions.insert(it, std::move(pc));
   }
 }
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@ -269,7 +269,7 @@ BEGIN_TEST(assembler.v_add3)

      //~gfx9>> v_add3_u32 v0, 0, 0, 0 ; d1ff0000 02010080
      //~gfx10>> v_add3_u32 v0, 0, 0, 0 ; d76d0000 02010080
-      aco_ptr<VALU_instruction> add3{
+      aco_ptr<Instruction> add3{
         create_instruction<VALU_instruction>(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
      add3->operands[0] = Operand::zero();
      add3->operands[1] = Operand::zero();
@ -288,13 +288,13 @@ BEGIN_TEST(assembler.v_add3_clamp)

      //~gfx9>> integer addition + clamp ; d1ff8000 02010080
      //~gfx10>> integer addition + clamp ; d76d8000 02010080
-      aco_ptr<VALU_instruction> add3{
+      aco_ptr<Instruction> add3{
         create_instruction<VALU_instruction>(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
      add3->operands[0] = Operand::zero();
      add3->operands[1] = Operand::zero();
      add3->operands[2] = Operand::zero();
      add3->definitions[0] = Definition(PhysReg(0), v1);
-      add3->clamp = 1;
+      add3->valu().clamp = 1;
      bld.insert(std::move(add3));

      finish_assembler_test();
--- a/src/amd/compiler/tests/test_hard_clause.cpp
+++ b/src/amd/compiler/tests/test_hard_clause.cpp
@ -69,7 +69,7 @@ create_global()
 static void
 create_mimg(bool nsa, Temp desc = Temp(0, s8))
 {
-   aco_ptr<MIMG_instruction> mimg{
+   aco_ptr<Instruction> mimg{
      create_instruction<MIMG_instruction>(aco_opcode::image_sample, Format::MIMG, 5, 1)};
   mimg->definitions[0] = Definition(PhysReg(256), v1);
   mimg->operands[0] = Operand(desc);
@ -78,8 +78,8 @@ create_mimg(bool nsa, Temp desc = Temp(0, s8))
   mimg->operands[2] = Operand(v1);
   for (unsigned i = 0; i < 2; i++)
      mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
-   mimg->dmask = 0x1;
-   mimg->dim = ac_image_2d;
+   mimg->mimg().dmask = 0x1;
+   mimg->mimg().dim = ac_image_2d;

   bld.insert(std::move(mimg));
 }
--- a/src/amd/compiler/tests/test_insert_nops.cpp
+++ b/src/amd/compiler/tests/test_insert_nops.cpp
@ -42,7 +42,7 @@ create_mubuf_store(PhysReg src = PhysReg(256))
 void
 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
 {
-   aco_ptr<MIMG_instruction> mimg{
+   aco_ptr<Instruction> mimg{
      create_instruction<MIMG_instruction>(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
   mimg->definitions[0] = Definition(PhysReg(256), v1);
   mimg->operands[0] = Operand(PhysReg(0), s8);
@ -50,8 +50,8 @@ create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
   mimg->operands[2] = Operand(v1);
   for (unsigned i = 0; i < addrs; i++)
      mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
-   mimg->dmask = 0x1;
-   mimg->dim = ac_image_2d;
+   mimg->mimg().dmask = 0x1;
+   mimg->mimg().dim = ac_image_2d;

   assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);