From f01cac835f95bcc65b2ca0afeed4f29a6bbb64f0 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 3 May 2024 12:04:59 +0100 Subject: [PATCH] aco/stats: support GFX12 in collect_preasm_stats() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 21 --------------------- src/amd/compiler/aco_ir.cpp | 15 +++++++++++++++ src/amd/compiler/aco_ir.h | 11 +++++++++++ src/amd/compiler/aco_statistics.cpp | 19 +++++++++++++------ 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 277228f6b83..9369240c17f 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -74,12 +74,6 @@ enum counter_type : uint8_t { wait_counters = BITFIELD_MASK(wait_type_num), }; -enum vmem_type : uint8_t { - vmem_nosampler = 1 << 0, - vmem_sampler = 1 << 1, - vmem_bvh = 1 << 2, -}; - /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different * wave if there is an ALU stall. 
Hence we have an instruction (s_delay_alu) to signal * that we should switch to a different wave and contains info on dependencies as to @@ -349,21 +343,6 @@ struct wait_ctx { } }; -uint8_t -get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) -{ - if (instr->opcode == aco_opcode::image_bvh64_intersect_ray) - return vmem_bvh; - else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) - return vmem_sampler; - else if (instr->isMIMG() && !instr->operands[1].isUndefined() && - instr->operands[1].regClass() == s4) - return vmem_sampler; - else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) - return vmem_nosampler; - return 0; -} - wait_event get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type) { diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index efe4611c9d5..a987bfa33ad 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1414,6 +1414,21 @@ get_op_fixed_to_def(Instruction* instr) return -1; } +uint8_t +get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) +{ + if (instr->opcode == aco_opcode::image_bvh64_intersect_ray) + return vmem_bvh; + else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) + return vmem_sampler; + else if (instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4) + return vmem_sampler; + else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) + return vmem_nosampler; + return 0; +} + bool dealloc_vgprs(Program* program) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 97049361859..527ee8abb8d 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1781,6 +1781,17 @@ unsigned get_operand_size(aco_ptr<Instruction>& instr, unsigned index); bool should_form_clause(const Instruction* a, const Instruction* b); +enum vmem_type : uint8_t { + vmem_nosampler = 1 << 0, + vmem_sampler = 1 << 1, + vmem_bvh = 1 << 2, +}; + +/* VMEM 
instructions of the same type return in-order. For GFX12+, this determines which counter + * is used. + */ +uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr); + enum block_kind { /* uniform indicates that leaving this block, * all actives lanes stay active */ diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index 9260ed94b18..31b5829ee2b 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -288,10 +288,11 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr) else info[wait_type_vs] = 320; } else if (instr->isSMEM()) { + wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm; if (instr->definitions.empty()) { - info[wait_type_lgkm] = 200; + info[type] = 200; } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */ - info[wait_type_lgkm] = 1; + info[type] = 1; } else { bool likely_desc_load = instr->operands[0].size() == 2; bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); @@ -299,15 +300,21 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr) instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); if (likely_desc_load || const_offset) - info[wait_type_lgkm] = 30; /* likely to hit L0 cache */ + info[type] = 30; /* likely to hit L0 cache */ else - info[wait_type_lgkm] = 200; + info[type] = 200; } } else if (instr->isDS()) { info[wait_type_lgkm] = 20; + } else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) { + info[wait_type_vs] = 320; } else if (instr->isVMEM()) { - wait_type type = - instr->definitions.empty() && gfx_level >= GFX10 ? wait_type_vs : wait_type_vm; + uint8_t vm_type = get_vmem_type(gfx_level, instr.get()); + wait_type type = wait_type_vm; + if (gfx_level >= GFX12 && vm_type == vmem_bvh) + type = wait_type_bvh; + else if (gfx_level >= GFX12 && vm_type == vmem_sampler) + type = wait_type_sample; info[type] = 320; }