From 7f092cbd91c7df88f4de4ad6d8994d514b6bb119 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 20 Sep 2024 10:47:13 +0100 Subject: [PATCH] aco: workaround hazards in emit_long_jump fossil-db (navi31): Totals from 29 (0.04% of 79395) affected shaders: CodeSize: 17612888 -> 17615096 (+0.01%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Backport-to: 24.2 Part-of: --- src/amd/compiler/aco_assembler.cpp | 64 +++++++++++++++-------- src/amd/compiler/tests/test_assembler.cpp | 23 +++++--- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index e7d91d3cbca..9f50c3f5982 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -1513,6 +1513,21 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards, { Builder bld(ctx.program); + auto emit = [&](Instruction *instr_ptr, uint32_t *size=NULL) { + aco_ptr instr(instr_ptr); + emit_instruction(ctx, out, instr.get()); + + if (size) + *size = out.size(); + + /* VALUMaskWriteHazard and VALUReadSGPRHazard needs sa_sdst=0 wait */ + if (ctx.gfx_level >= GFX11 && !instr_ptr->definitions.empty() && + instr_ptr->definitions[0].physReg() != scc) { + instr.reset(bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe)); + emit_instruction(ctx, out, instr.get()); + } + }; + Definition def; if (branch->definitions.empty()) { assert(ctx.program->blocks[branch->imm].kind & block_kind_discard_early_exit); @@ -1526,8 +1541,7 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards, Definition def_tmp_hi(def.physReg().advance(4), s1); Operand op_tmp_hi(def.physReg().advance(4), s1); - aco_ptr instr; - + size_t conditional_br_imm = 0; if (branch->opcode != aco_opcode::s_branch) { /* for conditional branches, skip the long jump if the condition is false */ aco_opcode inv; @@ -1540,36 +1554,39 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards, case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break; default: unreachable("Unhandled long jump."); } - unsigned size = ctx.gfx_level >= GFX12 ? 7 : 6; - instr.reset(bld.sopp(inv, size)); + aco_ptr instr; + instr.reset(bld.sopp(inv, 0)); emit_sopp_instruction(ctx, out, instr.get(), true); + conditional_br_imm = out.size() - 1; } + if (ctx.gfx_level == GFX10) /* VMEMtoScalarWriteHazard needs vm_vsrc=0 wait */ + emit(bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3)); + /* create the new PC and stash SCC in the LSB */ - instr.reset(bld.sop1(aco_opcode::s_getpc_b64, def).instr); - emit_instruction(ctx, out, instr.get()); + uint32_t getpc_loc, add_literal_loc; + emit(bld.sop1(aco_opcode::s_getpc_b64, def), &getpc_loc); + if (ctx.gfx_level >= GFX12) + emit(bld.sop1(aco_opcode::s_sext_i32_i16, def_tmp_hi, op_tmp_hi)); + emit(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::literal32(0)), + &add_literal_loc); - if (ctx.gfx_level >= GFX12) { - instr.reset(bld.sop1(aco_opcode::s_sext_i32_i16, def_tmp_hi, op_tmp_hi).instr); - emit_instruction(ctx, out, instr.get()); - } - - instr.reset( - bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::literal32(0)).instr); - emit_instruction(ctx, out, instr.get()); - branch->pass_flags = out.size(); + branch->pass_flags = getpc_loc | (add_literal_loc << 16); /* s_addc_u32 for high 32 bits not needed because the program is in a 32-bit VA range */ /* restore SCC and clear the LSB of the new PC */ - instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr); - emit_instruction(ctx, out, instr.get()); - instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr); - emit_instruction(ctx, out, instr.get()); + emit(bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), op_tmp_lo, Operand::zero())); + emit(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero())); /* create the s_setpc_b64 to jump */ - instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)).instr); - emit_instruction(ctx, out, instr.get()); + emit(bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2))); + + if (branch->opcode != aco_opcode::s_branch) { + uint32_t imm = out.size() - conditional_br_imm - 1; + assert(imm != 0x3f || ctx.gfx_level != GFX10); + out[conditional_br_imm] |= imm; + } } void @@ -1598,9 +1615,10 @@ fix_branches(asm_context& ctx, std::vector& out) } if (branch.second->pass_flags) { - int after_getpc = branch.first + branch.second->pass_flags - 2; + uint32_t after_getpc = branch.first + (branch.second->pass_flags & 0xffff); + uint32_t add_literal = branch.first + (branch.second->pass_flags >> 16) - 1; offset = (int)ctx.program->blocks[branch.second->imm].offset - after_getpc; - out[branch.first + branch.second->pass_flags - 1] = offset * 4; + out[add_literal] = offset * 4; } else { out[branch.first] &= 0xffff0000u; out[branch.first] |= (uint16_t)offset; diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp index 322000482b4..293f37120ab 100644 --- a/src/amd/compiler/tests/test_assembler.cpp +++ b/src/amd/compiler/tests/test_assembler.cpp @@ -60,6 +60,7 @@ BEGIN_TEST(assembler.long_jump.unconditional_forwards) return; //!BB0: + //! s_waitcnt_depctr 0xffe3 ; bfa3ffe3 //! s_getpc_b64 s[0:1] ; be801f00 //! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014 //! s_bitcmp1_b32 s0, 0 ; bf0d8000 @@ -91,12 +92,17 @@ BEGIN_TEST(assembler.long_jump.conditional_forwards) //! BB0: //! s_cbranch_scc1 BB1 ; $_ + //~gfx10! s_waitcnt_depctr 0xffe3 ; $_ //! s_getpc_b64 s[0:1] ; $_ + //~gfx12! s_wait_alu 0xfffe ; $_ //~gfx12! s_sext_i32_i16 s1, s1 ; $_ + //~gfx12! s_wait_alu 0xfffe ; $_ //~gfx10! s_addc_u32 s0, s0, 0x20014 ; $_ $_ - //~gfx12! s_add_co_ci_u32 s0, s0, 0x20014 ; $_ $_ + //~gfx12! s_add_co_ci_u32 s0, s0, 0x20028 ; $_ $_ + //~gfx12! s_wait_alu 0xfffe ; $_ //! s_bitcmp1_b32 s0, 0 ; $_ //! s_bitset0_b32 s0, 0 ; $_ + //~gfx12! s_wait_alu 0xfffe ; $_ //! s_setpc_b64 s[0:1] ; $_ bld.sopp(aco_opcode::s_cbranch_scc0, Definition(PhysReg(0), s2), 2); @@ -130,8 +136,9 @@ BEGIN_TEST(assembler.long_jump.unconditional_backwards) for (unsigned i = 0; i < INT16_MAX + 1; i++) bld.sopp(aco_opcode::s_nop, 0); + //! s_waitcnt_depctr 0xffe3 ; bfa3ffe3 //! s_getpc_b64 s[0:1] ; be801f00 - //! s_addc_u32 s0, s0, 0xfffdfffc ; 8200ff00 fffdfffc + //! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8 //! s_bitcmp1_b32 s0, 0 ; bf0d8000 //! s_bitset0_b32 s0, 0 ; be801b80 //! s_setpc_b64 s[0:1] ; be802000 @@ -157,9 +164,10 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards) for (unsigned i = 0; i < INT16_MAX + 1; i++) bld.sopp(aco_opcode::s_nop, 0); - //! s_cbranch_execz BB1 ; bf880006 + //! s_cbranch_execz BB1 ; bf880007 + //! s_waitcnt_depctr 0xffe3 ; bfa3ffe3 //! s_getpc_b64 s[0:1] ; be801f00 - //! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8 + //! s_addc_u32 s0, s0, 0xfffdfff4 ; 8200ff00 fffdfff4 //! s_bitcmp1_b32 s0, 0 ; bf0d8000 //! s_bitset0_b32 s0, 0 ; be801b80 //! s_setpc_b64 s[0:1] ; be802000 @@ -175,7 +183,7 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards) finish_assembler_test(); END_TEST -BEGIN_TEST(assembler.long_jump .3f) +BEGIN_TEST(assembler.long_jump._3f) if (!setup_cs(NULL, (amd_gfx_level)GFX10)) return; @@ -184,7 +192,7 @@ BEGIN_TEST(assembler.long_jump .3f) //! s_nop 0 ; bf800000 bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1); - for (unsigned i = 0; i < 0x3f - 6; i++) // a unconditional long jump is 6 dwords + for (unsigned i = 0; i < 0x3f - 7; i++) // an unconditional long jump is 7 dwords bld.vop1(aco_opcode::v_nop); bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2); @@ -231,7 +239,8 @@ BEGIN_TEST(assembler.long_jump.discard_early_exit) return; //! BB0: - //! s_cbranch_scc1 BB1 ; bf850006 + //! s_cbranch_scc1 BB1 ; bf850007 + //! s_waitcnt_depctr 0xffe3 ; bfa3ffe3 //! s_getpc_b64 s[0:1] ; be801f00 //! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014 //! s_bitcmp1_b32 s0, 0 ; bf0d8000