aco: handle FPAtomicToDenormModeHazard
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

This is quite unlikely to happen, but I guess it might be possible and
it's relatively simple to work around.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35884>
This commit is contained in:
Rhys Perry 2025-07-01 15:20:54 +01:00 committed by Marge Bot
parent d9e5e8f5fc
commit 34f1a8f707
2 changed files with 161 additions and 3 deletions

View file

@ -133,6 +133,7 @@ struct NOP_ctx_gfx10 {
std::bitset<128> sgprs_read_by_VMEM_store;
std::bitset<128> sgprs_read_by_DS;
std::bitset<128> sgprs_read_by_SMEM;
int waits_since_fp_atomic = 3;
void join(const NOP_ctx_gfx10& other)
{
@ -148,6 +149,7 @@ struct NOP_ctx_gfx10 {
sgprs_read_by_DS |= other.sgprs_read_by_DS;
sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
waits_since_fp_atomic = std::min(waits_since_fp_atomic, other.waits_since_fp_atomic);
}
bool operator==(const NOP_ctx_gfx10& other)
@ -160,7 +162,8 @@ struct NOP_ctx_gfx10 {
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
sgprs_read_by_DS == other.sgprs_read_by_DS &&
sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM &&
waits_since_fp_atomic == other.waits_since_fp_atomic;
}
};
@ -867,6 +870,50 @@ instr_is_branch(const aco_ptr<Instruction>& instr)
instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}
/* Reports whether instr is one of the floating-point atomic opcodes listed below
 * (buffer, image, flat and global variants). Instructions that are neither VMEM
 * nor FLAT-like are rejected before the opcode is inspected.
 */
bool
instr_is_vmem_fp_atomic(const aco_ptr<Instruction>& instr)
{
   const bool is_memory_format = instr->isVMEM() || instr->isFlatLike();
   if (!is_memory_format)
      return false;

   bool is_fp_atomic = false;
   switch (instr->opcode) {
   /* buffer_* */
   case aco_opcode::buffer_atomic_fcmpswap:
   case aco_opcode::buffer_atomic_fmin:
   case aco_opcode::buffer_atomic_fmax:
   case aco_opcode::buffer_atomic_fcmpswap_x2:
   case aco_opcode::buffer_atomic_fmin_x2:
   case aco_opcode::buffer_atomic_fmax_x2:
   case aco_opcode::buffer_atomic_add_f32:
   case aco_opcode::buffer_atomic_pk_add_f16:
   case aco_opcode::buffer_atomic_pk_add_bf16:
   /* image_* */
   case aco_opcode::image_atomic_fcmpswap:
   case aco_opcode::image_atomic_fmin:
   case aco_opcode::image_atomic_fmax:
   case aco_opcode::image_atomic_pk_add_f16:
   case aco_opcode::image_atomic_pk_add_bf16:
   case aco_opcode::image_atomic_add_flt:
   /* flat_* */
   case aco_opcode::flat_atomic_fcmpswap:
   case aco_opcode::flat_atomic_fmin:
   case aco_opcode::flat_atomic_fmax:
   case aco_opcode::flat_atomic_fcmpswap_x2:
   case aco_opcode::flat_atomic_fmin_x2:
   case aco_opcode::flat_atomic_fmax_x2:
   case aco_opcode::flat_atomic_add_f32:
   case aco_opcode::flat_atomic_pk_add_f16:
   case aco_opcode::flat_atomic_pk_add_bf16:
   /* global_* */
   case aco_opcode::global_atomic_fcmpswap:
   case aco_opcode::global_atomic_fmin:
   case aco_opcode::global_atomic_fmax:
   case aco_opcode::global_atomic_fcmpswap_x2:
   case aco_opcode::global_atomic_fmin_x2:
   case aco_opcode::global_atomic_fmax_x2:
   case aco_opcode::global_atomic_add_f32:
   case aco_opcode::global_atomic_pk_add_f16:
   case aco_opcode::global_atomic_pk_add_bf16:
      is_fp_atomic = true;
      break;
   default:
      /* Any other opcode is not treated as a floating-point atomic. */
      break;
   }
   return is_fp_atomic;
}
void
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
@ -886,6 +933,27 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
sa_sdst = instr->salu().imm & 0x1;
}
/* FPAtomicToDenormModeHazard */
if (instr->opcode == aco_opcode::s_denorm_mode && ctx.waits_since_fp_atomic < 3) {
bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
ctx.waits_since_fp_atomic = 3;
} else if (instr->isVALU() || instr->opcode == aco_opcode::s_waitcnt ||
instr->opcode == aco_opcode::s_waitcnt_vscnt ||
instr->opcode == aco_opcode::s_waitcnt_vmcnt ||
instr->opcode == aco_opcode::s_waitcnt_expcnt ||
instr->opcode == aco_opcode::s_waitcnt_lgkmcnt ||
instr->opcode == aco_opcode::s_wait_idle) {
ctx.waits_since_fp_atomic = 3;
} else if (instr_is_vmem_fp_atomic(instr)) {
ctx.waits_since_fp_atomic = 0;
} else {
ctx.waits_since_fp_atomic += get_wait_states(instr);
ctx.waits_since_fp_atomic = std::min(ctx.waits_since_fp_atomic, 3);
}
if (state.program->gfx_level != GFX10)
return; /* no other hazards/bugs to mitigate */
/* VMEMtoScalarWriteHazard
* Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
* in-between.
@ -1052,6 +1120,13 @@ resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
size_t prev_count = new_instructions.size();
/* FPAtomicToDenormModeHazard */
if (ctx.waits_since_fp_atomic < 3)
bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
if (state.program->gfx_level != GFX10)
return; /* no other hazards/bugs to mitigate */
/* VcmpxPermlaneHazard */
if (ctx.has_VOPC_write_exec) {
ctx.has_VOPC_write_exec = false;
@ -2004,8 +2079,6 @@ insert_NOPs(Program* program)
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
initial_ctx);
} else if (program->gfx_level >= GFX10_3) {
; /* no hazards/bugs to mitigate */
} else if (program->gfx_level >= GFX10) {
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
} else {

View file

@ -2016,6 +2016,17 @@ BEGIN_TEST(insert_nops.setpc_gfx10)
bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
finish_insert_nops_test(false);
/* FPAtomicToDenormModeHazard */
//>> p_unit_test 10
//! flat_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_nop imm:2
//! s_waitcnt_depctr vm_vsrc(0)
create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.flat(aco_opcode::flat_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
finish_insert_nops_test(false);
END_TEST
BEGIN_TEST(insert_nops.setpc_gfx11)
@ -2252,3 +2263,77 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
finish_insert_nops_test(true);
END_TEST
BEGIN_TEST(insert_nops.fpatomic_to_denorm_mode)
/* Exercises the FPAtomicToDenormModeHazard handling: each case emits a
 * global_atomic_fmin followed (directly or after other instructions) by
 * s_denorm_mode. The //>> and //! lines describe the expected instruction
 * stream for each case (presumably matched by the test framework - confirm).
 * The same sequence is built for GFX10 and GFX10_3; a level for which
 * setup_cs() returns false is skipped.
 */
for (amd_gfx_level lvl : {GFX10, GFX10_3}) {
if (!setup_cs(NULL, lvl))
continue;
/* Case 0: s_denorm_mode directly after the atomic; an s_nop imm:2 is expected
 * between them.
 */
//>> p_unit_test 0
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_nop imm:2
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_denorm_mode, 42);
/* Case 1: one s_nop 0 already sits in between; an additional s_nop imm:1 is
 * expected before s_denorm_mode.
 */
//! p_unit_test 1
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_nop
//! s_nop imm:1
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_nop, 0);
bld.sopp(aco_opcode::s_denorm_mode, 42);
// VALU, waitcnt or enough wait states mitigates the hazard
/* Case 2: a VALU instruction (v_nop) in between; no s_nop is expected. */
//! p_unit_test 2
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! v_nop
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_nop);
bld.sopp(aco_opcode::s_denorm_mode, 42);
/* Case 3: an s_waitcnt in between; no s_nop is expected. */
//! p_unit_test 3
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0)
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_waitcnt, 0);
bld.sopp(aco_opcode::s_denorm_mode, 42);
/* Case 4: the program already contains an s_nop 2 in between; the expected
 * output holds only that s_nop, i.e. nothing extra is inserted.
 */
//! p_unit_test 4
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_nop imm:2
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_nop, 2);
bld.sopp(aco_opcode::s_denorm_mode, 42);
/* Case 5: three separate s_nop 0 instructions in between; the expected output
 * holds only those three, i.e. nothing extra is inserted.
 */
//! p_unit_test 5
//! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
//! s_nop
//! s_nop
//! s_nop
//! s_denorm_mode imm:42
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_nop, 0);
bld.sopp(aco_opcode::s_nop, 0);
bld.sopp(aco_opcode::s_nop, 0);
bld.sopp(aco_opcode::s_denorm_mode, 42);
finish_insert_nops_test();
}
END_TEST