From 0e66f2b2cc4c50833bcf04e6da2c3dca158e721f Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Sun, 6 Jul 2025 21:21:00 +0200
Subject: [PATCH] aco: use new disable_wqm for mimg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

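Image stores and image atomics now request disable_wqm through emit_mimg(),
which appends the exact and WQM mask operands to the instruction, instead of
being special-cased in needs_exact(). Instruction lowering, register
allocation and the validator exclude these two mask operands when counting
VADDR operands.

Foz-DB GFX1201: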
Totals from 88 (0.11% of 80251) affected shaders:
Instrs: 81954 -> 82218 (+0.32%); split: -0.02%, +0.34%
CodeSize: 451824 -> 452880 (+0.23%); split: -0.02%, +0.25%
Latency: 308818 -> 308746 (-0.02%); split: -0.05%, +0.02%
VClause: 1324 -> 1318 (-0.45%)
Copies: 2795 -> 2784 (-0.39%)
PreSGPRs: 4029 -> 4035 (+0.15%)
SALU: 6563 -> 6809 (+3.75%); split: -0.15%, +3.90%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35970>
---
 src/amd/compiler/aco_insert_exec_mask.cpp     | 10 ++++----
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 25 +++++++++++++++----
 src/amd/compiler/aco_register_allocation.cpp  |  5 ++--
 src/amd/compiler/aco_validate.cpp             | 13 +++++-----
 .../aco_instruction_selection.h               |  3 ++-
 .../aco_isel_helpers.cpp                      | 13 ++++++++--
 .../instruction_selection/aco_select_nir.cpp  |  7 +++---
 .../aco_select_nir_intrinsics.cpp             | 11 +++-----
 8 files changed, 56 insertions(+), 31 deletions(-)
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index f382647f197..001455bc5ff 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -62,11 +62,7 @@ struct exec_ctx {
 bool
 needs_exact(aco_ptr<Instruction>& instr)
 {
-   if (instr->isMIMG()) {
-      return instr->mimg().disable_wqm;
-   } else {
-      return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11;
-   }
+   return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11;
 }
 
 WQMState
@@ -420,6 +416,8 @@ remove_disable_wqm(Instruction* instr)
       instr->mtbuf().disable_wqm = false;
    } else if (instr->isFlatLike()) {
       instr->flatlike().disable_wqm = false;
+   } else if (instr->isMIMG()) {
+      instr->mimg().disable_wqm = false;
    }
 
    /* Remove the two masks so that the assembler doesn't need to handle them. */
@@ -843,6 +841,8 @@ instr_disables_wqm(Instruction* instr)
       return instr->mtbuf().disable_wqm;
    } else if (instr->isFlatLike()) {
       return instr->flatlike().disable_wqm;
+   } else if (instr->isMIMG()) {
+      return instr->mimg().disable_wqm;
    }
 
    return false;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index df32dd7099b..e3f91501850 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2157,9 +2157,18 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
 {
    Operand linear_vgpr = instr->operands[3];
 
+   bool disable_wqm = instr->mimg().disable_wqm;
+   Operand exact_mask;
+   Operand wqm_mask;
+   if (disable_wqm) {
+      exact_mask = instr_exact_mask(instr.get());
+      wqm_mask = instr_wqm_mask(instr.get());
+   }
+
    unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
    unsigned vaddr_size = linear_vgpr.size();
-   unsigned num_copied_vgprs = instr->operands.size() - 4;
+   unsigned non_mask_operands = instr->operands.size() - (2 * disable_wqm);
+   unsigned num_copied_vgprs = non_mask_operands - 4;
    nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
                  ? nsa_size
                  : 0;
@@ -2180,7 +2189,7 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
    } else {
       PhysReg reg = linear_vgpr.physReg();
       std::map<PhysReg, copy_operation> copy_operations;
-      for (unsigned i = 4; i < instr->operands.size(); i++) {
+      for (unsigned i = 4; i < non_mask_operands; i++) {
          Operand arg = instr->operands[i];
          Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
          copy_operations[def.physReg()] = {arg, def, def.bytes()};
@@ -2193,10 +2202,11 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
    }
 
    instr->mimg().strict_wqm = false;
+   unsigned new_op_count = 3 + num_vaddr + (2 * disable_wqm);
 
-   if ((3 + num_vaddr) > instr->operands.size()) {
+   if (new_op_count > instr->operands.size()) {
       Instruction* new_instr =
-         create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
+         create_instruction(instr->opcode, Format::MIMG, new_op_count, instr->definitions.size());
       std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
                 new_instr->definitions.begin());
       new_instr->operands[0] = instr->operands[0];
@@ -2206,10 +2216,15 @@ lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
              sizeof(MIMG_instruction) - sizeof(Instruction));
       instr.reset(new_instr);
    } else {
-      while (instr->operands.size() > (3 + num_vaddr))
+      while (instr->operands.size() > new_op_count)
          instr->operands.pop_back();
    }
    std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
+
+   if (disable_wqm) {
+      instr_exact_mask(instr.get()) = exact_mask;
+      instr_wqm_mask(instr.get()) = wqm_mask;
+   }
 }
 
 } /* end namespace */
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index ce03b82757a..316e7fd7729 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -111,7 +111,7 @@ struct PhysRegIterator {
 struct vector_info {
    vector_info() : is_weak(false), num_parts(0), parts(NULL) {}
    vector_info(Instruction* instr, unsigned start = 0, bool weak = false)
-       : is_weak(weak), num_parts(instr->operands.size() - start),
+       : is_weak(weak), num_parts(instr->operands.size() - start - (instr_disables_wqm(instr) * 2)),
          parts(instr->operands.begin() + start)
    {
       if (parts[0].isVectorAligned()) {
@@ -3065,7 +3065,8 @@ get_affinities(ra_ctx& ctx)
                     !instr->mimg().strict_wqm) {
 
             bool is_vector = false;
-            for (unsigned i = 3, vector_begin = 3; i < instr->operands.size(); i++) {
+            unsigned op_count = instr->operands.size() - (instr->mimg().disable_wqm * 2);
+            for (unsigned i = 3, vector_begin = 3; i < op_count; i++) {
                if (is_vector || instr->operands[i].isVectorAligned())
                   ctx.vectors[instr->operands[i].tempId()] = vector_info(instr.get(), vector_begin);
                else if (ctx.program->gfx_level < GFX12 && !instr->operands[3].isVectorAligned())
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index f3a2a92e32b..c1ee9296579 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -859,13 +859,15 @@ validate_ir(Program* program)
                      instr.get());
             }
 
+            unsigned non_mask_ops = instr->operands.size() - (instr->mimg().disable_wqm * 2);
+
             if (instr->mimg().strict_wqm) {
                check(instr->operands[3].hasRegClass() &&
                         instr->operands[3].regClass().is_linear_vgpr(),
                      "MIMG operands[3] must be temp linear VGPR.", instr.get());
 
                unsigned total_size = 0;
-               for (unsigned i = 4; i < instr->operands.size(); i++) {
+               for (unsigned i = 4; i < non_mask_ops; i++) {
                   check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
                         "MIMG operands[4+] (VADDR) must be v1", instr.get());
                   total_size += instr->operands[i].bytes();
@@ -873,19 +875,18 @@ validate_ir(Program* program)
                check(total_size <= instr->operands[3].bytes(),
                      "MIMG operands[4+] must fit within operands[3].", instr.get());
             } else {
-               check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
+               check(non_mask_ops == 4 || program->gfx_level >= GFX10,
                      "NSA is only supported on GFX10+", instr.get());
-               for (unsigned i = 3; i < instr->operands.size(); i++) {
+               for (unsigned i = 3; i < non_mask_ops; i++) {
                   check(instr->operands[i].hasRegClass() &&
                            instr->operands[i].regClass().type() == RegType::vgpr,
                         "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
-                  if (instr->operands.size() > 4) {
+                  if (non_mask_ops > 4) {
                      if (program->gfx_level < GFX11) {
                         check(instr->operands[i].regClass() == v1,
                               "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
                      } else {
-                        unsigned num_scalar =
-                           program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4;
+                        unsigned num_scalar = program->gfx_level >= GFX12 ? (non_mask_ops - 4) : 4;
                         if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
                             instr->opcode != aco_opcode::image_bvh64_intersect_ray &&
                             instr->opcode != aco_opcode::image_bvh_dual_intersect_ray &&
diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
index cb2504e3c66..73ece52ad89 100644
--- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h
+++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
@@ -228,7 +228,8 @@ void emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component,
                            Temp dst, Temp prim_mask, bool high_16bits);
 std::vector<Temp> emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked);
 MIMG_instruction* emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc,
-                            Operand samp, std::vector<Temp> coords, Operand vdata = Operand(v1));
+                            Operand samp, std::vector<Temp> coords, bool disable_wqm,
+                            Operand vdata = Operand(v1));
 Operand emit_tfe_init(Builder& bld, Temp dst);
 struct aco_export_mrt {
    Operand out[4];
diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
index c753168e757..01b1744da6c 100644
--- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
@@ -498,7 +498,7 @@ emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
 
 MIMG_instruction*
 emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operand samp,
-          std::vector<Temp> coords, Operand vdata)
+          std::vector<Temp> coords, bool disable_wqm, Operand vdata)
 {
    bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
 
@@ -541,7 +541,8 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operan
       coords.resize(nsa_size + 1);
    }
 
-   aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), dsts.size())};
+   aco_ptr<Instruction> mimg{
+      create_instruction(op, Format::MIMG, 3 + coords.size() + disable_wqm * 2, dsts.size())};
    for (unsigned i = 0; i < dsts.size(); ++i)
       mimg->definitions[i] = Definition(dsts[i]);
    mimg->operands[0] = Operand(rsrc);
@@ -549,6 +550,14 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector<Temp> dsts, Temp rsrc, Operan
    mimg->operands[2] = vdata;
    for (unsigned i = 0; i < coords.size(); i++)
       mimg->operands[3 + i] = Operand(coords[i]);
+
+   if (disable_wqm) {
+      instr_exact_mask(mimg.get()) = Operand();
+      instr_wqm_mask(mimg.get()) = Operand();
+      mimg->mimg().disable_wqm = true;
+      bld.program->needs_exact = true;
+   }
+
    mimg->mimg().strict_wqm = strict_wqm;
 
    return &bld.insert(std::move(mimg))->mimg();
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
index 17ccbc3f91c..84bc23ddb01 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
@@ -338,7 +338,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
          Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
          Temp size = bld.tmp(v2);
          MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, {size}, resource,
-                                           Operand(s4), std::vector<Temp>{tg4_lod});
+                                           Operand(s4), std::vector<Temp>{tg4_lod}, false);
          tex->dim = dim;
          tex->dmask = 0x3;
          tex->da = da;
@@ -494,7 +494,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
                          ? aco_opcode::image_load
                          : aco_opcode::image_load_mip;
       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
-      MIMG_instruction* tex = emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, vdata);
+      MIMG_instruction* tex =
+         emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, false, vdata);
       if (instr->op == nir_texop_fragment_mask_fetch_amd)
          tex->dim = da ? ac_image_2darray : ac_image_2d;
       else
@@ -674,7 +675,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr)
 
    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
    MIMG_instruction* tex =
-      emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, vdata);
+      emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, false, vdata);
    tex->dim = dim;
    tex->dmask = dmask & 0xf;
    tex->da = da;
diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
index 95a744c948d..f208ada6c06 100644
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@@ -2087,7 +2087,8 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
       }
 
       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
-      MIMG_instruction* load = emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, vdata);
+      MIMG_instruction* load =
+         emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, false, vdata);
       load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
       load->a16 = instr->src[1].ssa->bit_size == 16;
       load->d16 = d16;
@@ -2230,7 +2231,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
 
    MIMG_instruction* store =
-      emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, Operand(data));
+      emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, true, Operand(data));
    store->cache = cache;
    store->a16 = instr->src[1].ssa->bit_size == 16;
    store->d16 = d16;
@@ -2239,9 +2240,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
    store->dim = sdim;
    store->da = should_declare_array(sdim);
-   store->disable_wqm = true;
    store->sync = sync;
-   ctx->program->needs_exact = true;
    return;
 }
 
@@ -2389,7 +2388,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
    if (return_previous)
       tmps = {(cmpswap ? bld.tmp(data.regClass()) : dst)};
    MIMG_instruction* mimg =
-      emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, Operand(data));
+      emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, true, Operand(data));
    mimg->cache = get_atomic_cache_flags(ctx, return_previous);
    mimg->dmask = (1 << data.size()) - 1;
    mimg->a16 = instr->src[1].ssa->bit_size == 16;
@@ -2397,9 +2396,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
    mimg->dim = sdim;
    mimg->da = should_declare_array(sdim);
-   mimg->disable_wqm = true;
    mimg->sync = sync;
-   ctx->program->needs_exact = true;
    if (return_previous && cmpswap)
       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmps[0], Operand::zero());
    return;