mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 13:38:06 +02:00
aco: workaround hazards in emit_long_jump
fossil-db (navi31): Totals from 29 (0.04% of 79395) affected shaders: CodeSize: 17612888 -> 17615096 (+0.01%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Backport-to: 24.2 Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31316>
This commit is contained in:
parent
9fb97085d1
commit
7f092cbd91
2 changed files with 57 additions and 30 deletions
|
|
@ -1513,6 +1513,21 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards,
|
||||||
{
|
{
|
||||||
Builder bld(ctx.program);
|
Builder bld(ctx.program);
|
||||||
|
|
||||||
|
auto emit = [&](Instruction *instr_ptr, uint32_t *size=NULL) {
|
||||||
|
aco_ptr<Instruction> instr(instr_ptr);
|
||||||
|
emit_instruction(ctx, out, instr.get());
|
||||||
|
|
||||||
|
if (size)
|
||||||
|
*size = out.size();
|
||||||
|
|
||||||
|
/* VALUMaskWriteHazard and VALUReadSGPRHazard needs sa_sdst=0 wait */
|
||||||
|
if (ctx.gfx_level >= GFX11 && !instr_ptr->definitions.empty() &&
|
||||||
|
instr_ptr->definitions[0].physReg() != scc) {
|
||||||
|
instr.reset(bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe));
|
||||||
|
emit_instruction(ctx, out, instr.get());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
Definition def;
|
Definition def;
|
||||||
if (branch->definitions.empty()) {
|
if (branch->definitions.empty()) {
|
||||||
assert(ctx.program->blocks[branch->imm].kind & block_kind_discard_early_exit);
|
assert(ctx.program->blocks[branch->imm].kind & block_kind_discard_early_exit);
|
||||||
|
|
@ -1526,8 +1541,7 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards,
|
||||||
Definition def_tmp_hi(def.physReg().advance(4), s1);
|
Definition def_tmp_hi(def.physReg().advance(4), s1);
|
||||||
Operand op_tmp_hi(def.physReg().advance(4), s1);
|
Operand op_tmp_hi(def.physReg().advance(4), s1);
|
||||||
|
|
||||||
aco_ptr<Instruction> instr;
|
size_t conditional_br_imm = 0;
|
||||||
|
|
||||||
if (branch->opcode != aco_opcode::s_branch) {
|
if (branch->opcode != aco_opcode::s_branch) {
|
||||||
/* for conditional branches, skip the long jump if the condition is false */
|
/* for conditional branches, skip the long jump if the condition is false */
|
||||||
aco_opcode inv;
|
aco_opcode inv;
|
||||||
|
|
@ -1540,36 +1554,39 @@ emit_long_jump(asm_context& ctx, SALU_instruction* branch, bool backwards,
|
||||||
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
|
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
|
||||||
default: unreachable("Unhandled long jump.");
|
default: unreachable("Unhandled long jump.");
|
||||||
}
|
}
|
||||||
unsigned size = ctx.gfx_level >= GFX12 ? 7 : 6;
|
aco_ptr<Instruction> instr;
|
||||||
instr.reset(bld.sopp(inv, size));
|
instr.reset(bld.sopp(inv, 0));
|
||||||
emit_sopp_instruction(ctx, out, instr.get(), true);
|
emit_sopp_instruction(ctx, out, instr.get(), true);
|
||||||
|
conditional_br_imm = out.size() - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx.gfx_level == GFX10) /* VMEMtoScalarWriteHazard needs vm_vsrc=0 wait */
|
||||||
|
emit(bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3));
|
||||||
|
|
||||||
/* create the new PC and stash SCC in the LSB */
|
/* create the new PC and stash SCC in the LSB */
|
||||||
instr.reset(bld.sop1(aco_opcode::s_getpc_b64, def).instr);
|
uint32_t getpc_loc, add_literal_loc;
|
||||||
emit_instruction(ctx, out, instr.get());
|
emit(bld.sop1(aco_opcode::s_getpc_b64, def), &getpc_loc);
|
||||||
|
if (ctx.gfx_level >= GFX12)
|
||||||
|
emit(bld.sop1(aco_opcode::s_sext_i32_i16, def_tmp_hi, op_tmp_hi));
|
||||||
|
emit(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::literal32(0)),
|
||||||
|
&add_literal_loc);
|
||||||
|
|
||||||
if (ctx.gfx_level >= GFX12) {
|
branch->pass_flags = getpc_loc | (add_literal_loc << 16);
|
||||||
instr.reset(bld.sop1(aco_opcode::s_sext_i32_i16, def_tmp_hi, op_tmp_hi).instr);
|
|
||||||
emit_instruction(ctx, out, instr.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
instr.reset(
|
|
||||||
bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::literal32(0)).instr);
|
|
||||||
emit_instruction(ctx, out, instr.get());
|
|
||||||
branch->pass_flags = out.size();
|
|
||||||
|
|
||||||
/* s_addc_u32 for high 32 bits not needed because the program is in a 32-bit VA range */
|
/* s_addc_u32 for high 32 bits not needed because the program is in a 32-bit VA range */
|
||||||
|
|
||||||
/* restore SCC and clear the LSB of the new PC */
|
/* restore SCC and clear the LSB of the new PC */
|
||||||
instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
|
emit(bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), op_tmp_lo, Operand::zero()));
|
||||||
emit_instruction(ctx, out, instr.get());
|
emit(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()));
|
||||||
instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr);
|
|
||||||
emit_instruction(ctx, out, instr.get());
|
|
||||||
|
|
||||||
/* create the s_setpc_b64 to jump */
|
/* create the s_setpc_b64 to jump */
|
||||||
instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)).instr);
|
emit(bld.sop1(aco_opcode::s_setpc_b64, Operand(def.physReg(), s2)));
|
||||||
emit_instruction(ctx, out, instr.get());
|
|
||||||
|
if (branch->opcode != aco_opcode::s_branch) {
|
||||||
|
uint32_t imm = out.size() - conditional_br_imm - 1;
|
||||||
|
assert(imm != 0x3f || ctx.gfx_level != GFX10);
|
||||||
|
out[conditional_br_imm] |= imm;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -1598,9 +1615,10 @@ fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (branch.second->pass_flags) {
|
if (branch.second->pass_flags) {
|
||||||
int after_getpc = branch.first + branch.second->pass_flags - 2;
|
uint32_t after_getpc = branch.first + (branch.second->pass_flags & 0xffff);
|
||||||
|
uint32_t add_literal = branch.first + (branch.second->pass_flags >> 16) - 1;
|
||||||
offset = (int)ctx.program->blocks[branch.second->imm].offset - after_getpc;
|
offset = (int)ctx.program->blocks[branch.second->imm].offset - after_getpc;
|
||||||
out[branch.first + branch.second->pass_flags - 1] = offset * 4;
|
out[add_literal] = offset * 4;
|
||||||
} else {
|
} else {
|
||||||
out[branch.first] &= 0xffff0000u;
|
out[branch.first] &= 0xffff0000u;
|
||||||
out[branch.first] |= (uint16_t)offset;
|
out[branch.first] |= (uint16_t)offset;
|
||||||
|
|
|
||||||
|
|
@ -60,6 +60,7 @@ BEGIN_TEST(assembler.long_jump.unconditional_forwards)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
//!BB0:
|
//!BB0:
|
||||||
|
//! s_waitcnt_depctr 0xffe3 ; bfa3ffe3
|
||||||
//! s_getpc_b64 s[0:1] ; be801f00
|
//! s_getpc_b64 s[0:1] ; be801f00
|
||||||
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
|
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
|
||||||
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
||||||
|
|
@ -91,12 +92,17 @@ BEGIN_TEST(assembler.long_jump.conditional_forwards)
|
||||||
|
|
||||||
//! BB0:
|
//! BB0:
|
||||||
//! s_cbranch_scc1 BB1 ; $_
|
//! s_cbranch_scc1 BB1 ; $_
|
||||||
|
//~gfx10! s_waitcnt_depctr 0xffe3 ; $_
|
||||||
//! s_getpc_b64 s[0:1] ; $_
|
//! s_getpc_b64 s[0:1] ; $_
|
||||||
|
//~gfx12! s_wait_alu 0xfffe ; $_
|
||||||
//~gfx12! s_sext_i32_i16 s1, s1 ; $_
|
//~gfx12! s_sext_i32_i16 s1, s1 ; $_
|
||||||
|
//~gfx12! s_wait_alu 0xfffe ; $_
|
||||||
//~gfx10! s_addc_u32 s0, s0, 0x20014 ; $_ $_
|
//~gfx10! s_addc_u32 s0, s0, 0x20014 ; $_ $_
|
||||||
//~gfx12! s_add_co_ci_u32 s0, s0, 0x20014 ; $_ $_
|
//~gfx12! s_add_co_ci_u32 s0, s0, 0x20028 ; $_ $_
|
||||||
|
//~gfx12! s_wait_alu 0xfffe ; $_
|
||||||
//! s_bitcmp1_b32 s0, 0 ; $_
|
//! s_bitcmp1_b32 s0, 0 ; $_
|
||||||
//! s_bitset0_b32 s0, 0 ; $_
|
//! s_bitset0_b32 s0, 0 ; $_
|
||||||
|
//~gfx12! s_wait_alu 0xfffe ; $_
|
||||||
//! s_setpc_b64 s[0:1] ; $_
|
//! s_setpc_b64 s[0:1] ; $_
|
||||||
bld.sopp(aco_opcode::s_cbranch_scc0, Definition(PhysReg(0), s2), 2);
|
bld.sopp(aco_opcode::s_cbranch_scc0, Definition(PhysReg(0), s2), 2);
|
||||||
|
|
||||||
|
|
@ -130,8 +136,9 @@ BEGIN_TEST(assembler.long_jump.unconditional_backwards)
|
||||||
for (unsigned i = 0; i < INT16_MAX + 1; i++)
|
for (unsigned i = 0; i < INT16_MAX + 1; i++)
|
||||||
bld.sopp(aco_opcode::s_nop, 0);
|
bld.sopp(aco_opcode::s_nop, 0);
|
||||||
|
|
||||||
|
//! s_waitcnt_depctr 0xffe3 ; bfa3ffe3
|
||||||
//! s_getpc_b64 s[0:1] ; be801f00
|
//! s_getpc_b64 s[0:1] ; be801f00
|
||||||
//! s_addc_u32 s0, s0, 0xfffdfffc ; 8200ff00 fffdfffc
|
//! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8
|
||||||
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
||||||
//! s_bitset0_b32 s0, 0 ; be801b80
|
//! s_bitset0_b32 s0, 0 ; be801b80
|
||||||
//! s_setpc_b64 s[0:1] ; be802000
|
//! s_setpc_b64 s[0:1] ; be802000
|
||||||
|
|
@ -157,9 +164,10 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards)
|
||||||
for (unsigned i = 0; i < INT16_MAX + 1; i++)
|
for (unsigned i = 0; i < INT16_MAX + 1; i++)
|
||||||
bld.sopp(aco_opcode::s_nop, 0);
|
bld.sopp(aco_opcode::s_nop, 0);
|
||||||
|
|
||||||
//! s_cbranch_execz BB1 ; bf880006
|
//! s_cbranch_execz BB1 ; bf880007
|
||||||
|
//! s_waitcnt_depctr 0xffe3 ; bfa3ffe3
|
||||||
//! s_getpc_b64 s[0:1] ; be801f00
|
//! s_getpc_b64 s[0:1] ; be801f00
|
||||||
//! s_addc_u32 s0, s0, 0xfffdfff8 ; 8200ff00 fffdfff8
|
//! s_addc_u32 s0, s0, 0xfffdfff4 ; 8200ff00 fffdfff4
|
||||||
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
||||||
//! s_bitset0_b32 s0, 0 ; be801b80
|
//! s_bitset0_b32 s0, 0 ; be801b80
|
||||||
//! s_setpc_b64 s[0:1] ; be802000
|
//! s_setpc_b64 s[0:1] ; be802000
|
||||||
|
|
@ -175,7 +183,7 @@ BEGIN_TEST(assembler.long_jump.conditional_backwards)
|
||||||
finish_assembler_test();
|
finish_assembler_test();
|
||||||
END_TEST
|
END_TEST
|
||||||
|
|
||||||
BEGIN_TEST(assembler.long_jump .3f)
|
BEGIN_TEST(assembler.long_jump._3f)
|
||||||
if (!setup_cs(NULL, (amd_gfx_level)GFX10))
|
if (!setup_cs(NULL, (amd_gfx_level)GFX10))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
@ -184,7 +192,7 @@ BEGIN_TEST(assembler.long_jump .3f)
|
||||||
//! s_nop 0 ; bf800000
|
//! s_nop 0 ; bf800000
|
||||||
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1);
|
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 1);
|
||||||
|
|
||||||
for (unsigned i = 0; i < 0x3f - 6; i++) // a unconditional long jump is 6 dwords
|
for (unsigned i = 0; i < 0x3f - 7; i++) // an unconditional long jump is 7 dwords
|
||||||
bld.vop1(aco_opcode::v_nop);
|
bld.vop1(aco_opcode::v_nop);
|
||||||
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
|
bld.sopp(aco_opcode::s_branch, Definition(PhysReg(0), s2), 2);
|
||||||
|
|
||||||
|
|
@ -231,7 +239,8 @@ BEGIN_TEST(assembler.long_jump.discard_early_exit)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
//! BB0:
|
//! BB0:
|
||||||
//! s_cbranch_scc1 BB1 ; bf850006
|
//! s_cbranch_scc1 BB1 ; bf850007
|
||||||
|
//! s_waitcnt_depctr 0xffe3 ; bfa3ffe3
|
||||||
//! s_getpc_b64 s[0:1] ; be801f00
|
//! s_getpc_b64 s[0:1] ; be801f00
|
||||||
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
|
//! s_addc_u32 s0, s0, 0x20014 ; 8200ff00 00020014
|
||||||
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
//! s_bitcmp1_b32 s0, 0 ; bf0d8000
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue