diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 6e7cf49f337..09f2d63b4e3 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -103,7 +103,8 @@ needs_exact(aco_ptr& instr) * emitted inside the same block, the main FS will always jump to the PS * epilog without considering the exec mask. */ - return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog; + return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog || + instr->opcode == aco_opcode::p_dual_src_export_gfx11; } } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 393527574a9..18ed836c437 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1857,7 +1857,8 @@ inline bool is_dead(const std::vector& uses, const Instruction* instr) { if (instr->definitions.empty() || instr->isBranch() || - instr->opcode == aco_opcode::p_init_scratch) + instr->opcode == aco_opcode::p_init_scratch || + instr->opcode == aco_opcode::p_dual_src_export_gfx11) return false; if (std::any_of(instr->definitions.begin(), instr->definitions.end(), diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index ac18c044589..dfb45dd8966 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2432,6 +2432,85 @@ lower_to_hw_instr(Program* program) } break; } + case aco_opcode::p_dual_src_export_gfx11: { + PhysReg dst0 = instr->definitions[0].physReg(); + PhysReg dst1 = instr->definitions[1].physReg(); + Definition tmp = instr->definitions[2]; + Definition exec_tmp = instr->definitions[3]; + Definition clobber_vcc = instr->definitions[4]; + Definition clobber_scc = instr->definitions[5]; + + assert(tmp.regClass() == v1); + assert(exec_tmp.regClass() == bld.lm); + assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc); + assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc); + + bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm), + Operand(exec, bld.lm)); + bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc, + Operand(exec, bld.lm)); + + uint8_t enabled_channels = 0; + Operand mrt0[4], mrt1[4]; + + bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1), + Operand::c32(0x55555555)); + if (ctx.program->wave_size == 64) + bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1), + Operand::c32(0x55555555)); + + for (unsigned i = 0; i < 4; i++) { + if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) { + mrt0[i] = instr->operands[i]; + mrt1[i] = instr->operands[i + 4]; + continue; + } + + Operand src0 = instr->operands[i]; + Operand src1 = instr->operands[i + 4]; + + /* Swap odd, even lanes of mrt0. */ + Builder::Result ret = + bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0); + for (unsigned j = 0; j < 8; j++) { + ret.instr->dpp8().lane_sel[j] = j ^ 1; + } + + /* Swap even lanes between mrt0 and mrt1. */ + bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1, + Operand(clobber_vcc.physReg(), bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1), + Operand(clobber_vcc.physReg(), bld.lm)); + + /* Swap odd, even lanes of mrt0 again. */ + ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), + Operand(tmp.physReg(), v1)); + for (unsigned j = 0; j < 8; j++) { + ret.instr->dpp8().lane_sel[j] = j ^ 1; + } + + mrt0[i] = Operand(dst0, v1); + mrt1[i] = Operand(dst1, v1); + + enabled_channels |= 1 << i; + + dst0 = dst0.advance(4); + dst1 = dst1.advance(4); + } + + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), + Operand(exec_tmp.physReg(), bld.lm)); + + /* Force export all channels when everything is undefined. */ + if (!enabled_channels) + enabled_channels = 0xf; + + bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels, + V_008DFC_SQ_EXP_MRT + 21, false); + bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels, + V_008DFC_SQ_EXP_MRT + 22, false); + break; + } default: break; } } else if (instr->isBranch()) { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index cd89b402707..2c11cf255b8 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -340,6 +340,9 @@ opcode("p_jump_to_epilog") #dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0 opcode("p_interp_gfx11") +# performs dual source MRTs swizzling and emits exports on GFX11 +opcode("p_dual_src_export_gfx11") + # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index f4c219aca2a..3c90ce66297 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -673,7 +673,8 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand) case aco_opcode::v_readfirstlane_b32: case aco_opcode::p_extract: case aco_opcode::p_insert: return operand != 0; - case aco_opcode::p_interp_gfx11: return false; + case aco_opcode::p_interp_gfx11: + case aco_opcode::p_dual_src_export_gfx11: return false; default: return true; } } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index d0367e7ffee..b2aa99df1ac 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -262,6 +262,7 @@ validate_ir(Program* program) bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() || instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_jump_to_epilog || + instr->opcode == aco_opcode::p_dual_src_export_gfx11 || (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) || @@ -526,6 +527,26 @@ validate_ir(Program* program) instr->operands[i].isUndefined(), "Other operands of p_jump_to_epilog must be VGPRs or undef", instr.get()); } + } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) { + check(instr->definitions.size() == 6, + "p_dual_src_export_gfx11 must have 6 definitions", instr.get()); + check(instr->definitions[2].getTemp().type() == RegType::vgpr && + instr->definitions[2].getTemp().size() == 1, + "Third definition of p_dual_src_export_gfx11 must be a v1", instr.get()); + check(instr->definitions[3].getTemp().type() == RegType::sgpr && + instr->definitions[3].getTemp().size() == 2, + "Fourth definition of p_dual_src_export_gfx11 must be a s2", instr.get()); + check(instr->definitions[4].physReg() == vcc, + "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get()); + check(instr->definitions[5].physReg() == scc, + "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get()); + check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands", + instr.get()); + for (unsigned i = 0; i < instr->operands.size(); i++) { + check(instr->operands[i].getTemp().type() == RegType::vgpr || + instr->operands[i].isUndefined(), + "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get()); + } } break; }