mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 11:08:03 +02:00
aco: handle FPAtomicToDenormModeHazard
This hazard is quite unlikely to happen, but it might be possible, and it is relatively simple to work around.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35884>
This commit is contained in:
parent
d9e5e8f5fc
commit
34f1a8f707
2 changed files with 161 additions and 3 deletions
|
|
@ -133,6 +133,7 @@ struct NOP_ctx_gfx10 {
|
|||
std::bitset<128> sgprs_read_by_VMEM_store;
|
||||
std::bitset<128> sgprs_read_by_DS;
|
||||
std::bitset<128> sgprs_read_by_SMEM;
|
||||
int waits_since_fp_atomic = 3;
|
||||
|
||||
void join(const NOP_ctx_gfx10& other)
|
||||
{
|
||||
|
|
@ -148,6 +149,7 @@ struct NOP_ctx_gfx10 {
|
|||
sgprs_read_by_DS |= other.sgprs_read_by_DS;
|
||||
sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
|
||||
sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
|
||||
waits_since_fp_atomic = std::min(waits_since_fp_atomic, other.waits_since_fp_atomic);
|
||||
}
|
||||
|
||||
bool operator==(const NOP_ctx_gfx10& other)
|
||||
|
|
@ -160,7 +162,8 @@ struct NOP_ctx_gfx10 {
|
|||
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
|
||||
sgprs_read_by_DS == other.sgprs_read_by_DS &&
|
||||
sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
|
||||
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
|
||||
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM &&
|
||||
waits_since_fp_atomic == other.waits_since_fp_atomic;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -867,6 +870,50 @@ instr_is_branch(const aco_ptr<Instruction>& instr)
|
|||
instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
|
||||
}
|
||||
|
||||
bool
|
||||
instr_is_vmem_fp_atomic(const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (!instr->isVMEM() && !instr->isFlatLike())
|
||||
return false;
|
||||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::buffer_atomic_fcmpswap:
|
||||
case aco_opcode::buffer_atomic_fmin:
|
||||
case aco_opcode::buffer_atomic_fmax:
|
||||
case aco_opcode::buffer_atomic_fcmpswap_x2:
|
||||
case aco_opcode::buffer_atomic_fmin_x2:
|
||||
case aco_opcode::buffer_atomic_fmax_x2:
|
||||
case aco_opcode::buffer_atomic_add_f32:
|
||||
case aco_opcode::buffer_atomic_pk_add_f16:
|
||||
case aco_opcode::buffer_atomic_pk_add_bf16:
|
||||
case aco_opcode::image_atomic_fcmpswap:
|
||||
case aco_opcode::image_atomic_fmin:
|
||||
case aco_opcode::image_atomic_fmax:
|
||||
case aco_opcode::image_atomic_pk_add_f16:
|
||||
case aco_opcode::image_atomic_pk_add_bf16:
|
||||
case aco_opcode::image_atomic_add_flt:
|
||||
case aco_opcode::flat_atomic_fcmpswap:
|
||||
case aco_opcode::flat_atomic_fmin:
|
||||
case aco_opcode::flat_atomic_fmax:
|
||||
case aco_opcode::flat_atomic_fcmpswap_x2:
|
||||
case aco_opcode::flat_atomic_fmin_x2:
|
||||
case aco_opcode::flat_atomic_fmax_x2:
|
||||
case aco_opcode::flat_atomic_add_f32:
|
||||
case aco_opcode::flat_atomic_pk_add_f16:
|
||||
case aco_opcode::flat_atomic_pk_add_bf16:
|
||||
case aco_opcode::global_atomic_fcmpswap:
|
||||
case aco_opcode::global_atomic_fmin:
|
||||
case aco_opcode::global_atomic_fmax:
|
||||
case aco_opcode::global_atomic_fcmpswap_x2:
|
||||
case aco_opcode::global_atomic_fmin_x2:
|
||||
case aco_opcode::global_atomic_fmax_x2:
|
||||
case aco_opcode::global_atomic_add_f32:
|
||||
case aco_opcode::global_atomic_pk_add_f16:
|
||||
case aco_opcode::global_atomic_pk_add_bf16: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
|
||||
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
|
|
@ -886,6 +933,27 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
|
|||
sa_sdst = instr->salu().imm & 0x1;
|
||||
}
|
||||
|
||||
/* FPAtomicToDenormModeHazard */
|
||||
if (instr->opcode == aco_opcode::s_denorm_mode && ctx.waits_since_fp_atomic < 3) {
|
||||
bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
|
||||
ctx.waits_since_fp_atomic = 3;
|
||||
} else if (instr->isVALU() || instr->opcode == aco_opcode::s_waitcnt ||
|
||||
instr->opcode == aco_opcode::s_waitcnt_vscnt ||
|
||||
instr->opcode == aco_opcode::s_waitcnt_vmcnt ||
|
||||
instr->opcode == aco_opcode::s_waitcnt_expcnt ||
|
||||
instr->opcode == aco_opcode::s_waitcnt_lgkmcnt ||
|
||||
instr->opcode == aco_opcode::s_wait_idle) {
|
||||
ctx.waits_since_fp_atomic = 3;
|
||||
} else if (instr_is_vmem_fp_atomic(instr)) {
|
||||
ctx.waits_since_fp_atomic = 0;
|
||||
} else {
|
||||
ctx.waits_since_fp_atomic += get_wait_states(instr);
|
||||
ctx.waits_since_fp_atomic = std::min(ctx.waits_since_fp_atomic, 3);
|
||||
}
|
||||
|
||||
if (state.program->gfx_level != GFX10)
|
||||
return; /* no other hazards/bugs to mitigate */
|
||||
|
||||
/* VMEMtoScalarWriteHazard
|
||||
* Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
|
||||
* in-between.
|
||||
|
|
@ -1052,6 +1120,13 @@ resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
|
|||
|
||||
size_t prev_count = new_instructions.size();
|
||||
|
||||
/* FPAtomicToDenormModeHazard */
|
||||
if (ctx.waits_since_fp_atomic < 3)
|
||||
bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
|
||||
|
||||
if (state.program->gfx_level != GFX10)
|
||||
return; /* no other hazards/bugs to mitigate */
|
||||
|
||||
/* VcmpxPermlaneHazard */
|
||||
if (ctx.has_VOPC_write_exec) {
|
||||
ctx.has_VOPC_write_exec = false;
|
||||
|
|
@ -2004,8 +2079,6 @@ insert_NOPs(Program* program)
|
|||
|
||||
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
|
||||
initial_ctx);
|
||||
} else if (program->gfx_level >= GFX10_3) {
|
||||
; /* no hazards/bugs to mitigate */
|
||||
} else if (program->gfx_level >= GFX10) {
|
||||
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -2016,6 +2016,17 @@ BEGIN_TEST(insert_nops.setpc_gfx10)
|
|||
bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
|
||||
Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
|
||||
finish_insert_nops_test(false);
|
||||
|
||||
/* FPAtomicToDenormModeHazard */
|
||||
//>> p_unit_test 10
|
||||
//! flat_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_nop imm:2
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
|
||||
bld.flat(aco_opcode::flat_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
finish_insert_nops_test(false);
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_nops.setpc_gfx11)
|
||||
|
|
@ -2252,3 +2263,77 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
|
|||
|
||||
finish_insert_nops_test(true);
|
||||
END_TEST
|
||||
|
||||
/* Exercises FPAtomicToDenormModeHazard handling: each sub-case emits a
 * global_atomic_fmin followed (directly or after some filler) by
 * s_denorm_mode, and the "//!" lines give the instruction stream expected
 * after NOP insertion. The whole sequence is run for GFX10 and GFX10_3.
 */
BEGIN_TEST(insert_nops.fpatomic_to_denorm_mode)
|
||||
for (amd_gfx_level lvl : {GFX10, GFX10_3}) {
|
||||
/* Skip this level when setup_cs() returns false. */
if (!setup_cs(NULL, lvl))
|
||||
continue;
|
||||
|
||||
/* Case 0: s_denorm_mode directly after the atomic; an "s_nop imm:2" is
 * expected in between.
 */
//>> p_unit_test 0
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_nop imm:2
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
/* Case 1: one s_nop (imm 0) already separates the two; an additional
 * "s_nop imm:1" is expected before s_denorm_mode.
 */
//! p_unit_test 1
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_nop
|
||||
//! s_nop imm:1
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_nop, 0);
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
/* Cases 2-5: the expected output equals the emitted input, i.e. no extra
 * NOP is inserted.
 */
// VALU, waitcnt or enough wait states mitigates the hazard
|
||||
/* Case 2: a VALU instruction (v_nop) sits between the atomic and
 * s_denorm_mode.
 */
//! p_unit_test 2
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! v_nop
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_nop);
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
/* Case 3: an s_waitcnt with immediate 0 sits in between. */
//! p_unit_test 3
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0)
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_waitcnt, 0);
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
/* Case 4: a single "s_nop 2" is already present in the input. */
//! p_unit_test 4
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_nop imm:2
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_nop, 2);
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
/* Case 5: three separate "s_nop 0" instructions are already present. */
//! p_unit_test 5
|
||||
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
|
||||
//! s_nop
|
||||
//! s_nop
|
||||
//! s_nop
|
||||
//! s_denorm_mode imm:42
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
|
||||
Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_nop, 0);
|
||||
bld.sopp(aco_opcode::s_nop, 0);
|
||||
bld.sopp(aco_opcode::s_nop, 0);
|
||||
bld.sopp(aco_opcode::s_denorm_mode, 42);
|
||||
|
||||
finish_insert_nops_test();
|
||||
}
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue