aco: mitigate FPAtomicToDenormModeHazard in the GFX10 NOP-insertion pass

What the hunks below do:
  * NOP_ctx_gfx10 gains `waits_since_fp_atomic`, initialised to 3 (the value
    the pass treats as "nothing pending"). join() keeps the smaller of the
    two contexts' values and operator== compares it.
  * instr_is_vmem_fp_atomic() recognises the buffer/image/flat/global
    floating-point atomic opcodes.
  * handle_instruction_gfx10() emits `s_nop (3 - waits - 1)` in front of an
    s_denorm_mode that is reached with fewer than 3 wait states since such an
    atomic. A VALU instruction or any s_waitcnt*/s_wait_idle sets the counter
    back to 3; other instructions add get_wait_states(), clamped to 3.
  * resolve_all_gfx10() emits the same s_nop when the counter is below 3.
  * Both functions return early for gfx_level != GFX10 after this check, and
    insert_NOPs() no longer skips the pass for gfx_level >= GFX10_3.
  * Tests: one extra case in insert_nops.setpc_gfx10 and a new
    insert_nops.fpatomic_to_denorm_mode test run for GFX10 and GFX10_3.

NOTE(review): this patch text was recovered from a copy in which all line
breaks and indentation were collapsed and template argument lists had been
dropped. The `<Instruction>` / `<aco_ptr<Instruction>>` /
`<NOP_ctx_gfx1x, handle_instruction_gfx1x, resolve_all_gfx1x>` arguments, the
3-space indentation and the wrapped `mitigate_hazards<...>(program,
initial_ctx)` context line are restored from ACO conventions - confirm them
against the target tree before applying (or apply with whitespace tolerance).

diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 8d7a37461fa..a69847a0592 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -133,6 +133,7 @@ struct NOP_ctx_gfx10 {
    std::bitset<128> sgprs_read_by_VMEM_store;
    std::bitset<128> sgprs_read_by_DS;
    std::bitset<128> sgprs_read_by_SMEM;
+   int waits_since_fp_atomic = 3;
 
    void join(const NOP_ctx_gfx10& other)
    {
@@ -148,6 +149,7 @@ struct NOP_ctx_gfx10 {
       sgprs_read_by_DS |= other.sgprs_read_by_DS;
       sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
       sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
+      waits_since_fp_atomic = std::min(waits_since_fp_atomic, other.waits_since_fp_atomic);
    }
 
    bool operator==(const NOP_ctx_gfx10& other)
@@ -160,7 +162,8 @@ struct NOP_ctx_gfx10 {
              sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
              sgprs_read_by_DS == other.sgprs_read_by_DS &&
              sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
-             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
+             sgprs_read_by_SMEM == other.sgprs_read_by_SMEM &&
+             waits_since_fp_atomic == other.waits_since_fp_atomic;
    }
 };
 
@@ -867,6 +870,50 @@ instr_is_branch(const aco_ptr<Instruction>& instr)
           instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
 }
 
+bool
+instr_is_vmem_fp_atomic(const aco_ptr<Instruction>& instr)
+{
+   if (!instr->isVMEM() && !instr->isFlatLike())
+      return false;
+
+   switch (instr->opcode) {
+   case aco_opcode::buffer_atomic_fcmpswap:
+   case aco_opcode::buffer_atomic_fmin:
+   case aco_opcode::buffer_atomic_fmax:
+   case aco_opcode::buffer_atomic_fcmpswap_x2:
+   case aco_opcode::buffer_atomic_fmin_x2:
+   case aco_opcode::buffer_atomic_fmax_x2:
+   case aco_opcode::buffer_atomic_add_f32:
+   case aco_opcode::buffer_atomic_pk_add_f16:
+   case aco_opcode::buffer_atomic_pk_add_bf16:
+   case aco_opcode::image_atomic_fcmpswap:
+   case aco_opcode::image_atomic_fmin:
+   case aco_opcode::image_atomic_fmax:
+   case aco_opcode::image_atomic_pk_add_f16:
+   case aco_opcode::image_atomic_pk_add_bf16:
+   case aco_opcode::image_atomic_add_flt:
+   case aco_opcode::flat_atomic_fcmpswap:
+   case aco_opcode::flat_atomic_fmin:
+   case aco_opcode::flat_atomic_fmax:
+   case aco_opcode::flat_atomic_fcmpswap_x2:
+   case aco_opcode::flat_atomic_fmin_x2:
+   case aco_opcode::flat_atomic_fmax_x2:
+   case aco_opcode::flat_atomic_add_f32:
+   case aco_opcode::flat_atomic_pk_add_f16:
+   case aco_opcode::flat_atomic_pk_add_bf16:
+   case aco_opcode::global_atomic_fcmpswap:
+   case aco_opcode::global_atomic_fmin:
+   case aco_opcode::global_atomic_fmax:
+   case aco_opcode::global_atomic_fcmpswap_x2:
+   case aco_opcode::global_atomic_fmin_x2:
+   case aco_opcode::global_atomic_fmax_x2:
+   case aco_opcode::global_atomic_add_f32:
+   case aco_opcode::global_atomic_pk_add_f16:
+   case aco_opcode::global_atomic_pk_add_bf16: return true;
+   default: return false;
+   }
+}
+
 void
 handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                          std::vector<aco_ptr<Instruction>>& new_instructions)
@@ -886,6 +933,27 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
       sa_sdst = instr->salu().imm & 0x1;
    }
 
+   /* FPAtomicToDenormModeHazard */
+   if (instr->opcode == aco_opcode::s_denorm_mode && ctx.waits_since_fp_atomic < 3) {
+      bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
+      ctx.waits_since_fp_atomic = 3;
+   } else if (instr->isVALU() || instr->opcode == aco_opcode::s_waitcnt ||
+              instr->opcode == aco_opcode::s_waitcnt_vscnt ||
+              instr->opcode == aco_opcode::s_waitcnt_vmcnt ||
+              instr->opcode == aco_opcode::s_waitcnt_expcnt ||
+              instr->opcode == aco_opcode::s_waitcnt_lgkmcnt ||
+              instr->opcode == aco_opcode::s_wait_idle) {
+      ctx.waits_since_fp_atomic = 3;
+   } else if (instr_is_vmem_fp_atomic(instr)) {
+      ctx.waits_since_fp_atomic = 0;
+   } else {
+      ctx.waits_since_fp_atomic += get_wait_states(instr);
+      ctx.waits_since_fp_atomic = std::min(ctx.waits_since_fp_atomic, 3);
+   }
+
+   if (state.program->gfx_level != GFX10)
+      return; /* no other hazards/bugs to mitigate */
+
    /* VMEMtoScalarWriteHazard
     * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
     * in-between.
@@ -1052,6 +1120,13 @@ resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
 
    size_t prev_count = new_instructions.size();
 
+   /* FPAtomicToDenormModeHazard */
+   if (ctx.waits_since_fp_atomic < 3)
+      bld.sopp(aco_opcode::s_nop, 3 - ctx.waits_since_fp_atomic - 1);
+
+   if (state.program->gfx_level != GFX10)
+      return; /* no other hazards/bugs to mitigate */
+
    /* VcmpxPermlaneHazard */
    if (ctx.has_VOPC_write_exec) {
       ctx.has_VOPC_write_exec = false;
@@ -2004,8 +2079,6 @@ insert_NOPs(Program* program)
 
       mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
                                                                                    initial_ctx);
-   } else if (program->gfx_level >= GFX10_3) {
-      ; /* no hazards/bugs to mitigate */
    } else if (program->gfx_level >= GFX10) {
       mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
    } else {
diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp
index 96b372bbe81..d4ce6ec43ec 100644
--- a/src/amd/compiler/tests/test_insert_nops.cpp
+++ b/src/amd/compiler/tests/test_insert_nops.cpp
@@ -2016,6 +2016,17 @@ BEGIN_TEST(insert_nops.setpc_gfx10)
    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
    finish_insert_nops_test(false);
+
+   /* FPAtomicToDenormModeHazard */
+   //>> p_unit_test 10
+   //! flat_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+   //! s_nop imm:2
+   //! s_waitcnt_depctr vm_vsrc(0)
+   create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
+   bld.flat(aco_opcode::flat_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+            Operand(PhysReg(256), v1));
+   finish_insert_nops_test(false);
 END_TEST
 
 BEGIN_TEST(insert_nops.setpc_gfx11)
@@ -2252,3 +2263,77 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
 
    finish_insert_nops_test(true);
 END_TEST
+
+BEGIN_TEST(insert_nops.fpatomic_to_denorm_mode)
+   for (amd_gfx_level lvl : {GFX10, GFX10_3}) {
+      if (!setup_cs(NULL, lvl))
+         continue;
+
+      //>> p_unit_test 0
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! s_nop imm:2
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      //! p_unit_test 1
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! s_nop
+      //! s_nop imm:1
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.sopp(aco_opcode::s_nop, 0);
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      // VALU, waitcnt or enough wait states mitigates the hazard
+      //! p_unit_test 2
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! v_nop
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.vop1(aco_opcode::v_nop);
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      //! p_unit_test 3
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! s_waitcnt expcnt(0) lgkmcnt(0) vmcnt(0)
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.sopp(aco_opcode::s_waitcnt, 0);
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      //! p_unit_test 4
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! s_nop imm:2
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.sopp(aco_opcode::s_nop, 2);
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      //! p_unit_test 5
+      //! global_atomic_fmin %0:v[0-1], s1: undef, %0:v[0]
+      //! s_nop
+      //! s_nop
+      //! s_nop
+      //! s_denorm_mode imm:42
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
+      bld.global(aco_opcode::global_atomic_fmin, Operand(PhysReg(256), v2), Operand(s1),
+                 Operand(PhysReg(256), v1));
+      bld.sopp(aco_opcode::s_nop, 0);
+      bld.sopp(aco_opcode::s_nop, 0);
+      bld.sopp(aco_opcode::s_nop, 0);
+      bld.sopp(aco_opcode::s_denorm_mode, 42);
+
+      finish_insert_nops_test();
+   }
+END_TEST