From fae2a85d57a49bfbd4efb05ea1d4e53071c2ffd2 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 12 Apr 2024 12:33:56 +0100 Subject: [PATCH] aco/gfx12: implement subgroup shader clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 9 +++++++++ src/amd/compiler/aco_lower_to_hw_instr.cpp | 11 +++++++++++ src/amd/compiler/aco_opcodes.py | 2 ++ src/amd/compiler/aco_scheduler.cpp | 1 + 4 files changed, 23 insertions(+) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index d0b3b3deec9..9264427aae9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8977,6 +8977,15 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_shader_clock: { Temp dst = get_ssa_temp(ctx, &instr->def); if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP && + ctx->options->gfx_level >= GFX12) { + Temp hi0 = bld.tmp(s1); + Temp hi1 = bld.tmp(s1); + Temp lo = bld.tmp(s1); + bld.pseudo(aco_opcode::p_shader_cycles_hi_lo_hi, Definition(hi0), Definition(lo), Definition(hi1)); + Temp hi_eq = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), hi0, hi1); + lo = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), lo, Operand::zero(), bld.scc(hi_eq)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi1); + } else if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP && ctx->options->gfx_level >= GFX10_3) { /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */ Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 00ebab3d082..a4d4682be36 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2726,6 +2726,17 @@ lower_to_hw_instr(Program* program) end_with_regs_block_index = block->index; break; } + case aco_opcode::p_shader_cycles_hi_lo_hi: { + unsigned shader_cycles_lo = 29; + unsigned shader_cycles_hi = 30; + bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[0], + ((32 - 1) << 11) | shader_cycles_hi); + bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[1], + ((32 - 1) << 11) | shader_cycles_lo); + bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[2], + ((32 - 1) << 11) | shader_cycles_hi); + break; + } default: break; } } else if (instr->isBranch()) { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 2087783b33b..9f23d595d78 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -445,6 +445,8 @@ insn("p_dual_src_export_gfx11") # shader to pass arguments to next part. insn("p_end_with_regs") +insn("p_shader_cycles_hi_lo_hi") + # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { ("s_add_u32", dst(1, SCC), src(1, 1), op(0x00)), diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 3bb119f5e43..bc471c5316e 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -596,6 +596,7 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards) /* don't move non-reorderable instructions */ if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime || instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 || + instr->opcode == aco_opcode::p_shader_cycles_hi_lo_hi || instr->opcode == aco_opcode::p_init_scratch || instr->opcode == aco_opcode::p_jump_to_epilog || instr->opcode == aco_opcode::s_sendmsg_rtn_b32 ||