aco: use new disable_wqm for mimg

Foz-DB GFX1201:
Totals from 88 (0.11% of 80251) affected shaders:
Instrs: 81954 -> 82218 (+0.32%); split: -0.02%, +0.34%
CodeSize: 451824 -> 452880 (+0.23%); split: -0.02%, +0.25%
Latency: 308818 -> 308746 (-0.02%); split: -0.05%, +0.02%
VClause: 1324 -> 1318 (-0.45%)
Copies: 2795 -> 2784 (-0.39%)
PreSGPRs: 4029 -> 4035 (+0.15%)
SALU: 6563 -> 6809 (+3.75%); split: -0.15%, +3.90%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35970>
2026-01-27 05:30:24 +01:00 · 2025-07-06 21:21:00 +02:00 · 2025-07-06 21:21:00 +02:00 · 0e66f2b2cc
commit 0e66f2b2cc
parent 922f559c3c
8 changed files with 56 additions and 31 deletions
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@ -62,11 +62,7 @@ struct exec_ctx {
 bool
 needs_exact(aco_ptr<Instruction>& instr)
 {
-   if (instr->isMIMG()) {
-      return instr->mimg().disable_wqm;
-   } else {
-      return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11;
-   }
+   return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11;
 }

 WQMState
@ -420,6 +416,8 @@ remove_disable_wqm(Instruction* instr)
      instr->mtbuf().disable_wqm = false;
   } else if (instr->isFlatLike()) {
      instr->flatlike().disable_wqm = false;
+   } else if (instr->isMIMG()) {
+      instr->mimg().disable_wqm = false;
   }

   /* Remove the two masks so that the assembler doesn't need to handle them. */
@ -843,6 +841,8 @@ instr_disables_wqm(Instruction* instr)
      return instr->mtbuf().disable_wqm;
   } else if (instr->isFlatLike()) {
      return instr->flatlike().disable_wqm;
+   } else if (instr->isMIMG()) {
+      return instr->mimg().disable_wqm;
   }

   return false;
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@ -2157,9 +2157,18 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
 {
   Operand linear_vgpr = instr->operands[3];

+   bool disable_wqm = instr->mimg().disable_wqm;
+   Operand exact_mask;
+   Operand wqm_mask;
+   if (disable_wqm) {
+      exact_mask = instr_exact_mask(instr.get());
+      wqm_mask = instr_wqm_mask(instr.get());
+   }
+
   unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
   unsigned vaddr_size = linear_vgpr.size();
-   unsigned num_copied_vgprs = instr->operands.size() - 4;
+   unsigned non_mask_operands = instr->operands.size() - (2 * disable_wqm);
+   unsigned num_copied_vgprs = non_mask_operands - 4;
   nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
                 ? nsa_size
                 : 0;
@ -2180,7 +2189,7 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
   } else {
      PhysReg reg = linear_vgpr.physReg();
      std::map<PhysReg, copy_operation> copy_operations;
-      for (unsigned i = 4; i < instr->operands.size(); i++) {
+      for (unsigned i = 4; i < non_mask_operands; i++) {
         Operand arg = instr->operands[i];
         Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
         copy_operations[def.physReg()] = {arg, def, def.bytes()};
@ -2193,10 +2202,11 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
   }

   instr->mimg().strict_wqm = false;
+   unsigned new_op_count = 3 + num_vaddr + (2 * disable_wqm);

-   if ((3 + num_vaddr) > instr->operands.size()) {
+   if (new_op_count > instr->operands.size()) {
      Instruction* new_instr =
-         create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
+         create_instruction(instr->opcode, Format::MIMG, new_op_count, instr->definitions.size());
      std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
                new_instr->definitions.begin());
      new_instr->operands[0] = instr->operands[0];
@ -2206,10 +2216,15 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
             sizeof(MIMG_instruction) - sizeof(Instruction));
      instr.reset(new_instr);
   } else {
-      while (instr->operands.size() > (3 + num_vaddr))
+      while (instr->operands.size() > new_op_count)
         instr->operands.pop_back();
   }
   std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
+
+   if (disable_wqm) {
+      instr_exact_mask(instr.get()) = exact_mask;
+      instr_wqm_mask(instr.get()) = wqm_mask;
+   }
 }

 } /* end namespace */
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@ -111,7 +111,7 @@ struct PhysRegIterator {
 struct vector_info {
   vector_info() : is_weak(false), num_parts(0), parts(NULL) {}
   vector_info(Instruction* instr, unsigned start = 0, bool weak = false)
-       : is_weak(weak), num_parts(instr->operands.size() - start),
+       : is_weak(weak), num_parts(instr->operands.size() - start - (instr_disables_wqm(instr) * 2)),
         parts(instr->operands.begin() + start)
   {
      if (parts[0].isVectorAligned()) {
@ -3065,7 +3065,8 @@ get_affinities(ra_ctx& ctx)
                    !instr->mimg().strict_wqm) {

            bool is_vector = false;
-            for (unsigned i = 3, vector_begin = 3; i < instr->operands.size(); i++) {
+            unsigned op_count = instr->operands.size() - (instr->mimg().disable_wqm * 2);
+            for (unsigned i = 3, vector_begin = 3; i < op_count; i++) {
               if (is_vector || instr->operands[i].isVectorAligned())
                  ctx.vectors[instr->operands[i].tempId()] = vector_info(instr.get(), vector_begin);
               else if (ctx.program->gfx_level < GFX12 && !instr->operands[3].isVectorAligned())
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@ -859,13 +859,15 @@ validate_ir(Program* program)
                     instr.get());
            }

+            unsigned non_mask_ops = instr->operands.size() - (instr->mimg().disable_wqm * 2);
+
            if (instr->mimg().strict_wqm) {
               check(instr->operands[3].hasRegClass() &&
                        instr->operands[3].regClass().is_linear_vgpr(),
                     "MIMG operands[3] must be temp linear VGPR.", instr.get());

               unsigned total_size = 0;
-               for (unsigned i = 4; i < instr->operands.size(); i++) {
+               for (unsigned i = 4; i < non_mask_ops; i++) {
                  check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
                        "MIMG operands[4+] (VADDR) must be v1", instr.get());
                  total_size += instr->operands[i].bytes();
@ -873,19 +875,18 @@ validate_ir(Program* program)
               check(total_size <= instr->operands[3].bytes(),
                     "MIMG operands[4+] must fit within operands[3].", instr.get());
            } else {
-               check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
+               check(non_mask_ops == 4 || program->gfx_level >= GFX10,
                     "NSA is only supported on GFX10+", instr.get());
-               for (unsigned i = 3; i < instr->operands.size(); i++) {
+               for (unsigned i = 3; i < non_mask_ops; i++) {
                  check(instr->operands[i].hasRegClass() &&
                           instr->operands[i].regClass().type() == RegType::vgpr,
                        "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
-                  if (instr->operands.size() > 4) {
+                  if (non_mask_ops > 4) {
                     if (program->gfx_level < GFX11) {
                        check(instr->operands[i].regClass() == v1,
                              "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
                     } else {
-                        unsigned num_scalar =
-                           program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4;
+                        unsigned num_scalar = program->gfx_level >= GFX12 ? (non_mask_ops - 4) : 4;
                        if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
                            instr->opcode != aco_opcode::image_bvh64_intersect_ray &&
                            instr->opcode != aco_opcode::image_bvh_dual_intersect_ray &&
--- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h
+++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
@ -228,7 +228,8 @@ void emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component,
                           Temp dst, Temp prim_mask, bool high_16bits);
 std::vector<Temp> emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked);
 MIMG_instruction* emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc,
-                            Operand samp, std::vector<Temp> coords, Operand vdata = Operand(v1));
+                            Operand samp, std::vector<Temp> coords, bool disable_wqm,
+                            Operand vdata = Operand(v1));
 Operand emit_tfe_init(Builder& bld, Temp dst);
 struct aco_export_mrt {
   Operand out[4];
--- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
@ -498,7 +498,7 @@ emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)

 MIMG_instruction*
 emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operand samp,
-          std::vector<Temp> coords, Operand vdata)
+          std::vector<Temp> coords, bool disable_wqm, Operand vdata)
 {
   bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;

@ -541,7 +541,8 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operan
      coords.resize(nsa_size + 1);
   }

-   aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), dsts.size())};
+   aco_ptr<Instruction> mimg{
+      create_instruction(op, Format::MIMG, 3 + coords.size() + disable_wqm * 2, dsts.size())};
   for (unsigned i = 0; i < dsts.size(); ++i)
      mimg->definitions[i] = Definition(dsts[i]);
   mimg->operands[0] = Operand(rsrc);
@ -549,6 +550,14 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operan
   mimg->operands[2] = vdata;
   for (unsigned i = 0; i < coords.size(); i++)
      mimg->operands[3 + i] = Operand(coords[i]);
+
+   if (disable_wqm) {
+      instr_exact_mask(mimg.get()) = Operand();
+      instr_wqm_mask(mimg.get()) = Operand();
+      mimg->mimg().disable_wqm = true;
+      bld.program->needs_exact = true;
+   }
+
   mimg->mimg().strict_wqm = strict_wqm;

   return &bld.insert(std::move(mimg))->mimg();
--- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
@ -338,7 +338,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
         Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
         Temp size = bld.tmp(v2);
         MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, {size}, resource,
-                                           Operand(s4), std::vector<Temp>{tg4_lod});
+                                           Operand(s4), std::vector<Temp>{tg4_lod}, false);
         tex->dim = dim;
         tex->dmask = 0x3;
         tex->da = da;
@ -494,7 +494,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
                         ? aco_opcode::image_load
                         : aco_opcode::image_load_mip;
      Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
-      MIMG_instruction* tex = emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, vdata);
+      MIMG_instruction* tex =
+         emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, false, vdata);
      if (instr->op == nir_texop_fragment_mask_fetch_amd)
         tex->dim = da ? ac_image_2darray : ac_image_2d;
      else
@ -674,7 +675,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)

   Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
   MIMG_instruction* tex =
-      emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, vdata);
+      emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, false, vdata);
   tex->dim = dim;
   tex->dmask = dmask & 0xf;
   tex->da = da;
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@ -2087,7 +2087,8 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
      }

      Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
-      MIMG_instruction* load = emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, vdata);
+      MIMG_instruction* load =
+         emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, false, vdata);
      load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
      load->a16 = instr->src[1].ssa->bit_size == 16;
      load->d16 = d16;
@ -2230,7 +2231,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
   aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;

   MIMG_instruction* store =
-      emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, Operand(data));
+      emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, true, Operand(data));
   store->cache = cache;
   store->a16 = instr->src[1].ssa->bit_size == 16;
   store->d16 = d16;
@ -2239,9 +2240,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
   ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
   store->dim = sdim;
   store->da = should_declare_array(sdim);
-   store->disable_wqm = true;
   store->sync = sync;
-   ctx->program->needs_exact = true;
   return;
 }

@ -2389,7 +2388,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
   if (return_previous)
      tmps = {(cmpswap ? bld.tmp(data.regClass()) : dst)};
   MIMG_instruction* mimg =
-      emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, Operand(data));
+      emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, true, Operand(data));
   mimg->cache = get_atomic_cache_flags(ctx, return_previous);
   mimg->dmask = (1 << data.size()) - 1;
   mimg->a16 = instr->src[1].ssa->bit_size == 16;
@ -2397,9 +2396,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
   ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
   mimg->dim = sdim;
   mimg->da = should_declare_array(sdim);
-   mimg->disable_wqm = true;
   mimg->sync = sync;
-   ctx->program->needs_exact = true;
   if (return_previous && cmpswap)
      bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmps[0], Operand::zero());
   return;