diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md
index 518946e1e84..d59ca9fde5d 100644
--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@@ -320,3 +320,13 @@ transcendental instructions in-between.
 
 Mitigated by:
 A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
+
+### VALUPartialForwardingHazard
+
+Triggered by:
+A VALU instruction reading two VGPRs: one written before an exec write by SALU and one after. To
+trigger, there must be less than 3 VALU between the first and second VGPR writes and less than 5
+VALU between the second VGPR write and the current instruction.
+
+Mitigated by:
+A va_vdst=0 wait: `s_waitcnt_depctr 0x0fff`
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index bc956a0f616..f4ce028c0f3 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -1075,6 +1075,167 @@ handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
    return global_state.wait_vdst;
 }
 
+/* Progress of the backwards search for VALUPartialForwardingHazard. The search starts at the
+ * reading VALU and walks towards earlier instructions, so the states normally advance in the
+ * order nothing_written -> written_after_exec_write -> exec_written.
+ */
+enum VALUPartialForwardingHazardState : uint8_t {
+   nothing_written,          /* no write to a read VGPR has been seen yet */
+   written_after_exec_write, /* a VGPR write was seen, now looking for an SALU exec write */
+   exec_written,             /* the exec write was seen, now looking for an older VGPR write */
+};
+
+/* Search-wide state: the result and the loop headers that have already been entered. */
+struct VALUPartialForwardingHazardGlobalState {
+   bool hazard_found = false;
+   std::set<unsigned> loop_headers_visited;
+};
+
+/* State of the search along one path through the program.
+ * NOTE(review): presumably copied by search_backwards where paths fork - confirm there.
+ */
+struct VALUPartialForwardingHazardBlockState {
+   /* initialized by number of VGPRs read by VALU, decrement when encountered to return early */
+   uint8_t num_vgprs_read = 0;
+   /* one bit per VGPR read by the VALU; cleared once a write to that VGPR has been seen */
+   BITSET_DECLARE(vgprs_read, 256) = {0};
+   enum VALUPartialForwardingHazardState state = nothing_written;
+   /* number of VALU seen so far, i.e. between the current position and the reading VALU */
+   unsigned num_valu_since_read = 0;
+   /* number of VALU seen since the write currently chosen as the second (later) write */
+   unsigned num_valu_since_write = 0;
+};
+
+/* Per-instruction callback for search_backwards.
+ *
+ * Updates block_state for instr and sets global_state.hazard_found if a write to a read VGPR is
+ * met in the exec_written state with less than 3 VALU counted since the chosen second write.
+ * Returns true when nothing more can be learned on this path (hazard found, instr is a
+ * va_vdst=0 wait, the distance limit is reached or every read VGPR has been seen being written)
+ * and false otherwise.
+ */
+bool
+handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
+                                            VALUPartialForwardingHazardBlockState& block_state,
+                                            aco_ptr<Instruction>& instr)
+{
+   if (instr->isSALU() && !instr->definitions.empty()) {
+      if (block_state.state == written_after_exec_write && instr_writes_exec(instr))
+         block_state.state = exec_written;
+   } else if (instr->isVALU() || instr->isVINTERP_INREG()) {
+      bool vgpr_write = false;
+      for (Definition& def : instr->definitions) {
+         if (def.physReg().reg() < 256)
+            continue;
+
+         for (unsigned i = 0; i < def.size(); i++) {
+            unsigned reg = def.physReg().reg() - 256 + i;
+            if (!BITSET_TEST(block_state.vgprs_read, reg))
+               continue;
+
+            if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
+               global_state.hazard_found = true;
+               return true;
+            }
+
+            BITSET_CLEAR(block_state.vgprs_read, reg);
+            block_state.num_vgprs_read--;
+            vgpr_write = true;
+         }
+      }
+
+      if (vgpr_write) {
+         /* If the state is nothing_written: the check below should ensure that this write is
+          * close enough to the read.
+          *
+          * If the state is exec_written: the current choice of second write has failed. Reset and
+          * try with the current write as the second one, if it's close enough to the read.
+          *
+          * If the state is written_after_exec_write: a further second write would be better, if
+          * it's close enough to the read.
+          */
+         if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
+            block_state.state = written_after_exec_write;
+            block_state.num_valu_since_write = 0;
+         } else {
+            block_state.num_valu_since_write++;
+         }
+      } else {
+         block_state.num_valu_since_write++;
+      }
+
+      block_state.num_valu_since_read++;
+   } else if (parse_vdst_wait(instr) == 0) {
+      return true;
+   }
+
+   if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
+      return true; /* Hazard not possible at this distance. */
+   if (block_state.num_vgprs_read == 0)
+      return true; /* All VGPRs have been written and a hazard was never found. */
+
+   return false;
+}
+
+/* Per-block callback for search_backwards. Returns false for a loop header that has already been
+ * visited (presumably so that the search does not cycle through a loop) and true otherwise.
+ */
+bool
+handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
+                                            VALUPartialForwardingHazardBlockState& block_state,
+                                            Block* block)
+{
+   if (block->kind & block_kind_loop_header) {
+      if (global_state.loop_headers_visited.count(block->index))
+         return false;
+      global_state.loop_headers_visited.insert(block->index);
+   }
+
+   return true;
+}
+
+/* Returns true if a va_vdst=0 wait has to be inserted before instr to mitigate
+ * VALUPartialForwardingHazard.
+ */
+bool
+handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
+{
+   /* VALUPartialForwardingHazard
+    * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
+    * For the hazard, there must be less than 3 VALU between the first and second VGPR writes.
+    * There also must be less than 5 VALU between the second VGPR write and the current instruction.
+    */
+   if (state.program->wave_size != 64 || (!instr->isVALU() && !instr->isVINTERP_INREG()))
+      return false;
+
+   /* Cheap upper bound on the number of VGPRs read: only VGPR operands (reg >= 256) count. */
+   unsigned num_vgprs = 0;
+   for (Operand& op : instr->operands)
+      num_vgprs += op.physReg().reg() >= 256 ? op.size() : 0;
+   if (num_vgprs <= 1)
+      return false; /* early exit */
+
+   VALUPartialForwardingHazardBlockState block_state;
+
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      Operand& op = instr->operands[i];
+      if (op.physReg().reg() < 256)
+         continue;
+      for (unsigned j = 0; j < op.size(); j++)
+         BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
+   }
+   block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);
+
+   if (block_state.num_vgprs_read <= 1)
+      return false; /* early exit */
+
+   VALUPartialForwardingHazardGlobalState global_state;
+   search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
+                    &handle_valu_partial_forwarding_hazard_block,
+                    &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
+   return global_state.hazard_found;
+}
+
 void
 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
                          std::vector<aco_ptr<Instruction>>& new_instructions)
@@ -1125,6 +1286,11 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
       }
    }
 
+   if (va_vdst > 0 && handle_valu_partial_forwarding_hazard(state, instr)) {
+      bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
+      va_vdst = 0;
+   }
+
    va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
    if (va_vdst == 0) {
       ctx.valu_since_wr_by_trans.reset();
diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp
index 41f8b62f08c..bedf267af4f 100644
--- a/src/amd/compiler/tests/test_insert_nops.cpp
+++ b/src/amd/compiler/tests/test_insert_nops.cpp
@@ -654,3 +654,269 @@ BEGIN_TEST(insert_nops.valu_trans_use)
 
    finish_insert_nops_test();
 END_TEST
+
+BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   /* Basic case. */
+   //>> p_unit_test 0
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   /* We should consider both the closest and further VALU after the exec write. */
+   //! p_unit_test 1
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //; for i in range(2): insert_pattern('v_nop')
+   //! v1: %0:v[2] = v_mov_b32 2
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
+   bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
+
+   //! p_unit_test 2
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! v1: %0:v[2] = v_mov_b32 2
+   //; for i in range(4): insert_pattern('v_nop')
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
+   for (unsigned i = 0; i < 4; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
+
+   /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
+    * counted towards the distance between the first and second writes.
+    */
+   //! p_unit_test 3
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //; for i in range(2): insert_pattern('v_nop')
+   //! v1: %0:v[2] = v_mov_b32 2
+   //; for i in range(3): insert_pattern('v_nop')
+   //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
+   for (unsigned i = 0; i < 3; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
+
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+
+   finish_insert_nops_test();
+END_TEST
+
+BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   //>> p_unit_test 0
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   //! p_unit_test 1
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 0
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! s2: %0:exec = s_mov_b64 -1
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   finish_insert_nops_test();
+END_TEST
+
+BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
+    * than interesting one).
+    */
+   //>> p_unit_test 0
+   //! s_cbranch_scc1 block:BB2
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
+   bld.sopp(aco_opcode::s_cbranch_scc1, 2);
+
+   //! BB1
+   //! /* logical preds: / linear preds: BB0, / kind: */
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v_nop
+   //! s_branch block:BB3
+   bld.reset(program->create_and_insert_block());
+   program->blocks[0].linear_succs.push_back(1);
+   program->blocks[1].linear_preds.push_back(0);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.sopp(aco_opcode::s_branch, 3);
+
+   //! BB2
+   //! /* logical preds: / linear preds: BB0, / kind: */
+   //! v1: %0:v[0] = v_mov_b32 0
+   bld.reset(program->create_and_insert_block());
+   program->blocks[0].linear_succs.push_back(2);
+   program->blocks[2].linear_preds.push_back(0);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+
+   //! BB3
+   //! /* logical preds: / linear preds: BB1, BB2, / kind: */
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.reset(program->create_and_insert_block());
+   program->blocks[1].linear_succs.push_back(3);
+   program->blocks[2].linear_succs.push_back(3);
+   program->blocks[3].linear_preds.push_back(1);
+   program->blocks[3].linear_preds.push_back(2);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
+    * VALU writes after exec).
+    */
+   //! p_unit_test 1
+   //! s_cbranch_scc1 block:BB5
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
+   bld.sopp(aco_opcode::s_cbranch_scc1, 5);
+
+   //! BB4
+   //! /* logical preds: / linear preds: BB3, / kind: */
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //; for i in range(2): insert_pattern('v_nop')
+   //! v1: %0:v[1] = v_mov_b32 1
+   //! v_nop
+   //! s_branch block:BB6
+   bld.reset(program->create_and_insert_block());
+   program->blocks[3].linear_succs.push_back(4);
+   program->blocks[4].linear_preds.push_back(3);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_nop);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   bld.vop1(aco_opcode::v_nop);
+   bld.sopp(aco_opcode::s_branch, 6);
+
+   //! BB5
+   //! /* logical preds: / linear preds: BB3, / kind: */
+   //! v1: %0:v[1] = v_mov_b32 1
+   bld.reset(program->create_and_insert_block());
+   program->blocks[3].linear_succs.push_back(5);
+   program->blocks[5].linear_preds.push_back(3);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+
+   //! BB6
+   //! /* logical preds: / linear preds: BB4, BB5, / kind: */
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.reset(program->create_and_insert_block());
+   program->blocks[4].linear_succs.push_back(6);
+   program->blocks[5].linear_succs.push_back(6);
+   program->blocks[6].linear_preds.push_back(4);
+   program->blocks[6].linear_preds.push_back(5);
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
+    * VALU writes after exec).
+    */
+   //! p_unit_test 2
+   //! s_cbranch_scc1 block:BB8
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
+   bld.sopp(aco_opcode::s_cbranch_scc1, 8);
+
+   //! BB7
+   //! /* logical preds: / linear preds: BB6, / kind: */
+   //! v1: %0:v[0] = v_mov_b32 0
+   //! s2: %0:exec = s_mov_b64 -1
+   //! v1: %0:v[1] = v_mov_b32 1
+   //; for i in range(4): insert_pattern('v_nop')
+   //! s_branch block:BB9
+   bld.reset(program->create_and_insert_block());
+   program->blocks[6].linear_succs.push_back(7);
+   program->blocks[7].linear_preds.push_back(6);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   for (unsigned i = 0; i < 4; i++)
+      bld.vop1(aco_opcode::v_nop);
+   bld.sopp(aco_opcode::s_branch, 9);
+
+   //! BB8
+   //! /* logical preds: / linear preds: BB6, / kind: */
+   //! v1: %0:v[1] = v_mov_b32 1
+   //; for i in range(5): insert_pattern('v_nop')
+   bld.reset(program->create_and_insert_block());
+   program->blocks[6].linear_succs.push_back(8);
+   program->blocks[8].linear_preds.push_back(6);
+   bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
+   for (unsigned i = 0; i < 5; i++)
+      bld.vop1(aco_opcode::v_nop);
+
+   //! BB9
+   //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
+   //! s_waitcnt_depctr va_vdst(0)
+   //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
+   bld.reset(program->create_and_insert_block());
+   program->blocks[7].linear_succs.push_back(9);
+   program->blocks[8].linear_succs.push_back(9);
+   program->blocks[9].linear_preds.push_back(7);
+   program->blocks[9].linear_preds.push_back(8);
+   bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
+            Operand(PhysReg(257), v1));
+
+   finish_insert_nops_test();
+END_TEST