From 0e66f2b2cc4c50833bcf04e6da2c3dca158e721f Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Sun, 6 Jul 2025 21:21:00 +0200 Subject: [PATCH] aco: use new disable_wqm for mimg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB GFX1201: Totals from 88 (0.11% of 80251) affected shaders: Instrs: 81954 -> 82218 (+0.32%); split: -0.02%, +0.34% CodeSize: 451824 -> 452880 (+0.23%); split: -0.02%, +0.25% Latency: 308818 -> 308746 (-0.02%); split: -0.05%, +0.02% VClause: 1324 -> 1318 (-0.45%) Copies: 2795 -> 2784 (-0.39%) PreSGPRs: 4029 -> 4035 (+0.15%) SALU: 6563 -> 6809 (+3.75%); split: -0.15%, +3.90% Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_exec_mask.cpp | 10 ++++---- src/amd/compiler/aco_lower_to_hw_instr.cpp | 25 +++++++++++++++---- src/amd/compiler/aco_register_allocation.cpp | 5 ++-- src/amd/compiler/aco_validate.cpp | 13 +++++----- .../aco_instruction_selection.h | 3 ++- .../aco_isel_helpers.cpp | 13 ++++++++-- .../instruction_selection/aco_select_nir.cpp | 7 +++--- .../aco_select_nir_intrinsics.cpp | 11 +++----- 8 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index f382647f197..001455bc5ff 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -62,11 +62,7 @@ struct exec_ctx { bool needs_exact(aco_ptr& instr) { - if (instr->isMIMG()) { - return instr->mimg().disable_wqm; - } else { - return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11; - } + return instr->isEXP() || instr->opcode == aco_opcode::p_dual_src_export_gfx11; } WQMState @@ -420,6 +416,8 @@ remove_disable_wqm(Instruction* instr) instr->mtbuf().disable_wqm = false; } else if (instr->isFlatLike()) { instr->flatlike().disable_wqm = false; + } else if (instr->isMIMG()) { + instr->mimg().disable_wqm = false; } /* Remove the two masks so that the assembler doesn't need to handle them. */ @@ -843,6 +841,8 @@ instr_disables_wqm(Instruction* instr) return instr->mtbuf().disable_wqm; } else if (instr->isFlatLike()) { return instr->flatlike().disable_wqm; + } else if (instr->isMIMG()) { + return instr->mimg().disable_wqm; } return false; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index df32dd7099b..e3f91501850 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2157,9 +2157,18 @@ lower_image_sample(lower_context* ctx, aco_ptr& instr) { Operand linear_vgpr = instr->operands[3]; + bool disable_wqm = instr->mimg().disable_wqm; + Operand exact_mask; + Operand wqm_mask; + if (disable_wqm) { + exact_mask = instr_exact_mask(instr.get()); + wqm_mask = instr_wqm_mask(instr.get()); + } + unsigned nsa_size = ctx->program->dev.max_nsa_vgprs; unsigned vaddr_size = linear_vgpr.size(); - unsigned num_copied_vgprs = instr->operands.size() - 4; + unsigned non_mask_operands = instr->operands.size() - (2 * disable_wqm); + unsigned num_copied_vgprs = non_mask_operands - 4; nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size) ? nsa_size : 0; @@ -2180,7 +2189,7 @@ lower_image_sample(lower_context* ctx, aco_ptr& instr) } else { PhysReg reg = linear_vgpr.physReg(); std::map copy_operations; - for (unsigned i = 4; i < instr->operands.size(); i++) { + for (unsigned i = 4; i < non_mask_operands; i++) { Operand arg = instr->operands[i]; Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes())); copy_operations[def.physReg()] = {arg, def, def.bytes()}; @@ -2193,10 +2202,11 @@ lower_image_sample(lower_context* ctx, aco_ptr& instr) } instr->mimg().strict_wqm = false; + unsigned new_op_count = 3 + num_vaddr + (2 * disable_wqm); - if ((3 + num_vaddr) > instr->operands.size()) { + if (new_op_count > instr->operands.size()) { Instruction* new_instr = - create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size()); + create_instruction(instr->opcode, Format::MIMG, new_op_count, instr->definitions.size()); std::copy(instr->definitions.cbegin(), instr->definitions.cend(), new_instr->definitions.begin()); new_instr->operands[0] = instr->operands[0]; @@ -2206,10 +2216,15 @@ lower_image_sample(lower_context* ctx, aco_ptr& instr) sizeof(MIMG_instruction) - sizeof(Instruction)); instr.reset(new_instr); } else { - while (instr->operands.size() > (3 + num_vaddr)) + while (instr->operands.size() > new_op_count) instr->operands.pop_back(); } std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3)); + + if (disable_wqm) { + instr_exact_mask(instr.get()) = exact_mask; + instr_wqm_mask(instr.get()) = wqm_mask; + } } } /* end namespace */ diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index ce03b82757a..316e7fd7729 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -111,7 +111,7 @@ struct PhysRegIterator { struct vector_info { vector_info() : is_weak(false), num_parts(0), parts(NULL) {} vector_info(Instruction* instr, unsigned start = 0, bool weak = false) - : is_weak(weak), num_parts(instr->operands.size() - start), + : is_weak(weak), num_parts(instr->operands.size() - start - (instr_disables_wqm(instr) * 2)), parts(instr->operands.begin() + start) { if (parts[0].isVectorAligned()) { @@ -3065,7 +3065,8 @@ get_affinities(ra_ctx& ctx) !instr->mimg().strict_wqm) { bool is_vector = false; - for (unsigned i = 3, vector_begin = 3; i < instr->operands.size(); i++) { + unsigned op_count = instr->operands.size() - (instr->mimg().disable_wqm * 2); + for (unsigned i = 3, vector_begin = 3; i < op_count; i++) { if (is_vector || instr->operands[i].isVectorAligned()) ctx.vectors[instr->operands[i].tempId()] = vector_info(instr.get(), vector_begin); else if (ctx.program->gfx_level < GFX12 && !instr->operands[3].isVectorAligned()) diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index f3a2a92e32b..c1ee9296579 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -859,13 +859,15 @@ validate_ir(Program* program) instr.get()); } + unsigned non_mask_ops = instr->operands.size() - (instr->mimg().disable_wqm * 2); + if (instr->mimg().strict_wqm) { check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().is_linear_vgpr(), "MIMG operands[3] must be temp linear VGPR.", instr.get()); unsigned total_size = 0; - for (unsigned i = 4; i < instr->operands.size(); i++) { + for (unsigned i = 4; i < non_mask_ops; i++) { check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1, "MIMG operands[4+] (VADDR) must be v1", instr.get()); total_size += instr->operands[i].bytes(); @@ -873,19 +875,18 @@ validate_ir(Program* program) check(total_size <= instr->operands[3].bytes(), "MIMG operands[4+] must fit within operands[3].", instr.get()); } else { - check(instr->operands.size() == 4 || program->gfx_level >= GFX10, + check(non_mask_ops == 4 || program->gfx_level >= GFX10, "NSA is only supported on GFX10+", instr.get()); - for (unsigned i = 3; i < instr->operands.size(); i++) { + for (unsigned i = 3; i < non_mask_ops; i++) { check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr, "MIMG operands[3+] (VADDR) must be VGPR", instr.get()); - if (instr->operands.size() > 4) { + if (non_mask_ops > 4) { if (program->gfx_level < GFX11) { check(instr->operands[i].regClass() == v1, "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get()); } else { - unsigned num_scalar = - program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4; + unsigned num_scalar = program->gfx_level >= GFX12 ? (non_mask_ops - 4) : 4; if (instr->opcode != aco_opcode::image_bvh_intersect_ray && instr->opcode != aco_opcode::image_bvh64_intersect_ray && instr->opcode != aco_opcode::image_bvh_dual_intersect_ray && diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index cb2504e3c66..73ece52ad89 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -228,7 +228,8 @@ void emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, Temp dst, Temp prim_mask, bool high_16bits); std::vector emit_pack_v1(isel_context* ctx, const std::vector& unpacked); MIMG_instruction* emit_mimg(Builder& bld, aco_opcode op, std::vector dsts, Temp rsrc, - Operand samp, std::vector coords, Operand vdata = Operand(v1)); + Operand samp, std::vector coords, bool disable_wqm, + Operand vdata = Operand(v1)); Operand emit_tfe_init(Builder& bld, Temp dst); struct aco_export_mrt { Operand out[4]; diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp index c753168e757..01b1744da6c 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp @@ -498,7 +498,7 @@ emit_pack_v1(isel_context* ctx, const std::vector& unpacked) MIMG_instruction* emit_mimg(Builder& bld, aco_opcode op, std::vector dsts, Temp rsrc, Operand samp, - std::vector coords, Operand vdata) + std::vector coords, bool disable_wqm, Operand vdata) { bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load; @@ -541,7 +541,8 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector dsts, Temp rsrc, Operan coords.resize(nsa_size + 1); } - aco_ptr mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), dsts.size())}; + aco_ptr mimg{ + create_instruction(op, Format::MIMG, 3 + coords.size() + disable_wqm * 2, dsts.size())}; for (unsigned i = 0; i < dsts.size(); ++i) mimg->definitions[i] = Definition(dsts[i]); mimg->operands[0] = Operand(rsrc); @@ -549,6 +550,14 @@ emit_mimg(Builder& bld, aco_opcode op, std::vector dsts, Temp rsrc, Operan mimg->operands[2] = vdata; for (unsigned i = 0; i < coords.size(); i++) mimg->operands[3 + i] = Operand(coords[i]); + + if (disable_wqm) { + instr_exact_mask(mimg.get()) = Operand(); + instr_wqm_mask(mimg.get()) = Operand(); + mimg->mimg().disable_wqm = true; + bld.program->needs_exact = true; + } + mimg->mimg().strict_wqm = strict_wqm; return &bld.insert(std::move(mimg))->mimg(); diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp index 17ccbc3f91c..84bc23ddb01 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp @@ -338,7 +338,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero()); Temp size = bld.tmp(v2); MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, {size}, resource, - Operand(s4), std::vector{tg4_lod}); + Operand(s4), std::vector{tg4_lod}, false); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -494,7 +494,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) ? aco_opcode::image_load : aco_opcode::image_load_mip; Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); - MIMG_instruction* tex = emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, vdata); + MIMG_instruction* tex = + emit_mimg(bld, op, {tmp_dst}, resource, Operand(s4), args, false, vdata); if (instr->op == nir_texop_fragment_mask_fetch_amd) tex->dim = da ? ac_image_2darray : ac_image_2d; else @@ -674,7 +675,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); MIMG_instruction* tex = - emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, vdata); + emit_mimg(bld, opcode, {tmp_dst}, resource, Operand(sampler), args, false, vdata); tex->dim = dim; tex->dmask = dmask & 0xf; tex->da = da; diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 95a744c948d..f208ada6c06 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -2087,7 +2087,8 @@ visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) } Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); - MIMG_instruction* load = emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, vdata); + MIMG_instruction* load = + emit_mimg(bld, opcode, {tmp}, resource, Operand(s4), coords, false, vdata); load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD); load->a16 = instr->src[1].ssa->bit_size == 16; load->d16 = d16; @@ -2230,7 +2231,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip; MIMG_instruction* store = - emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, Operand(data)); + emit_mimg(bld, opcode, {}, resource, Operand(s4), coords, true, Operand(data)); store->cache = cache; store->a16 = instr->src[1].ssa->bit_size == 16; store->d16 = d16; @@ -2239,9 +2240,7 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); store->dim = sdim; store->da = should_declare_array(sdim); - store->disable_wqm = true; store->sync = sync; - ctx->program->needs_exact = true; return; } @@ -2389,7 +2388,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) if (return_previous) tmps = {(cmpswap ? bld.tmp(data.regClass()) : dst)}; MIMG_instruction* mimg = - emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, Operand(data)); + emit_mimg(bld, image_op, tmps, resource, Operand(s4), coords, true, Operand(data)); mimg->cache = get_atomic_cache_flags(ctx, return_previous); mimg->dmask = (1 << data.size()) - 1; mimg->a16 = instr->src[1].ssa->bit_size == 16; @@ -2397,9 +2396,7 @@ visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array); mimg->dim = sdim; mimg->da = should_declare_array(sdim); - mimg->disable_wqm = true; mimg->sync = sync; - ctx->program->needs_exact = true; if (return_previous && cmpswap) bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmps[0], Operand::zero()); return;