aco: add p_dual_src_export_gfx11 for dual source blending on GFX11

Dual source blending must be in strict WQM mode.

Cc: 22.3 mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19643>
This commit is contained in:
Samuel Pitoiset 2022-11-16 15:18:54 +01:00 committed by Marge Bot
parent e93de8a75e
commit bb90d29660
6 changed files with 109 additions and 3 deletions

View file

@ -103,7 +103,8 @@ needs_exact(aco_ptr<Instruction>& instr)
* emitted inside the same block, the main FS will always jump to the PS * emitted inside the same block, the main FS will always jump to the PS
* epilog without considering the exec mask. * epilog without considering the exec mask.
*/ */
return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog; return instr->isEXP() || instr->opcode == aco_opcode::p_jump_to_epilog ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11;
} }
} }

View file

@ -1857,7 +1857,8 @@ inline bool
is_dead(const std::vector<uint16_t>& uses, const Instruction* instr) is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
{ {
if (instr->definitions.empty() || instr->isBranch() || if (instr->definitions.empty() || instr->isBranch() ||
instr->opcode == aco_opcode::p_init_scratch) instr->opcode == aco_opcode::p_init_scratch ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11)
return false; return false;
if (std::any_of(instr->definitions.begin(), instr->definitions.end(), if (std::any_of(instr->definitions.begin(), instr->definitions.end(),

View file

@ -2432,6 +2432,85 @@ lower_to_hw_instr(Program* program)
} }
break; break;
} }
case aco_opcode::p_dual_src_export_gfx11: {
/* Lowers the p_dual_src_export_gfx11 pseudo-instruction: the two sets of
 * four channel operands (operands[0..3] and operands[4..7]) are swizzled
 * across lanes into the dst0/dst1 registers and then written out with two
 * export instructions.
 *
 * Definition layout used below:
 *   [0], [1] - destination registers receiving the swizzled channels
 *   [2]      - v1 scratch VGPR
 *   [3]      - lane-mask sized SGPR used to save exec
 *   [4]      - vcc, used to hold the lane-select mask
 *   [5]      - scc, clobbered by s_wqm
 */
PhysReg dst0 = instr->definitions[0].physReg();
PhysReg dst1 = instr->definitions[1].physReg();
Definition tmp = instr->definitions[2];
Definition exec_tmp = instr->definitions[3];
Definition clobber_vcc = instr->definitions[4];
Definition clobber_scc = instr->definitions[5];
assert(tmp.regClass() == v1);
assert(exec_tmp.regClass() == bld.lm);
assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
/* Save exec, then switch exec to whole-quad mode for the swizzle; the
 * original exec is restored before the exports are emitted. s_wqm also
 * writes scc, hence the clobber_scc definition.
 */
bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
Operand(exec, bld.lm));
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
Operand(exec, bld.lm));
/* Bit i is set once channel i has at least one defined source. */
uint8_t enabled_channels = 0;
Operand mrt0[4], mrt1[4];
/* Build the per-lane select mask in vcc: 0x55555555 has every even bit set,
 * i.e. one bit for each even-numbered lane. Wave64 needs the same pattern in
 * the second dword of vcc as well.
 */
bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
Operand::c32(0x55555555));
if (ctx.program->wave_size == 64)
bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
Operand::c32(0x55555555));
for (unsigned i = 0; i < 4; i++) {
/* Channel is undefined in both sources: forward the undefined operands to
 * the exports unchanged and leave the channel disabled.
 */
if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
mrt0[i] = instr->operands[i];
mrt1[i] = instr->operands[i + 4];
continue;
}
Operand src0 = instr->operands[i];
Operand src1 = instr->operands[i + 4];
/* Swap odd, even lanes of mrt0. */
Builder::Result ret =
bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1), src0);
/* lane_sel[j] = j ^ 1 makes every lane read from its neighbour within the
 * pair (0<->1, 2<->3, ...).
 */
for (unsigned j = 0; j < 8; j++) {
ret.instr->dpp8().lane_sel[j] = j ^ 1;
}
/* Swap even lanes between mrt0 and mrt1. */
/* Both selects use the same vcc mask with the two value operands in
 * opposite order, so for each lane they pick complementary sources. The
 * first result goes to the scratch VGPR, the second directly to dst1.
 */
bld.vop2(aco_opcode::v_cndmask_b32, tmp, Operand(dst0, v1), src1,
Operand(clobber_vcc.physReg(), bld.lm));
bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src1, Operand(dst0, v1),
Operand(clobber_vcc.physReg(), bld.lm));
/* Swap odd, even lanes of mrt0 again. */
ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, Definition(dst0, v1),
Operand(tmp.physReg(), v1));
for (unsigned j = 0; j < 8; j++) {
ret.instr->dpp8().lane_sel[j] = j ^ 1;
}
mrt0[i] = Operand(dst0, v1);
mrt1[i] = Operand(dst1, v1);
enabled_channels |= 1 << i;
/* Step both destinations to the register used for the next channel
 * (advance(4) is the same step used above to reach the second dword of vcc).
 */
dst0 = dst0.advance(4);
dst1 = dst1.advance(4);
}
/* Restore the exec mask that was saved before entering WQM. */
bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
Operand(exec_tmp.physReg(), bld.lm));
/* Force export all channels when everything is undefined. */
if (!enabled_channels)
enabled_channels = 0xf;
/* Emit one export per swizzled set, to targets MRT + 21 and MRT + 22.
 * NOTE(review): presumably these are the GFX11 dual-source blend export
 * targets - confirm against the ISA documentation.
 */
bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
V_008DFC_SQ_EXP_MRT + 21, false);
bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
V_008DFC_SQ_EXP_MRT + 22, false);
break;
}
default: break; default: break;
} }
} else if (instr->isBranch()) { } else if (instr->isBranch()) {

View file

@ -340,6 +340,9 @@ opcode("p_jump_to_epilog")
#dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0 #dst0=result, dst1=exec_tmp, dst2=clobber_scc, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
opcode("p_interp_gfx11") opcode("p_interp_gfx11")
# performs dual source MRTs swizzling and emits exports on GFX11
opcode("p_dual_src_export_gfx11")
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
SOP2 = { SOP2 = {
# GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name

View file

@ -673,7 +673,8 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
case aco_opcode::v_readfirstlane_b32: case aco_opcode::v_readfirstlane_b32:
case aco_opcode::p_extract: case aco_opcode::p_extract:
case aco_opcode::p_insert: return operand != 0; case aco_opcode::p_insert: return operand != 0;
case aco_opcode::p_interp_gfx11: return false; case aco_opcode::p_interp_gfx11:
case aco_opcode::p_dual_src_export_gfx11: return false;
default: return true; default: return true;
} }
} }

View file

@ -262,6 +262,7 @@ validate_ir(Program* program)
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() || bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_create_vector ||
instr->opcode == aco_opcode::p_jump_to_epilog || instr->opcode == aco_opcode::p_jump_to_epilog ||
instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
(instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) || (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
@ -526,6 +527,26 @@ validate_ir(Program* program)
instr->operands[i].isUndefined(), instr->operands[i].isUndefined(),
"Other operands of p_jump_to_epilog must be VGPRs or undef", instr.get()); "Other operands of p_jump_to_epilog must be VGPRs or undef", instr.get());
} }
} else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
/* Validate the shape of p_dual_src_export_gfx11: six definitions (the
 * third a v1, the fourth an s2, the fifth fixed to vcc, the sixth fixed
 * to scc) and eight operands, each of which is a VGPR or undefined.
 * The first two definitions are not constrained by these checks.
 */
check(instr->definitions.size() == 6,
"p_dual_src_export_gfx11 must have 6 definitions", instr.get());
check(instr->definitions[2].getTemp().type() == RegType::vgpr &&
instr->definitions[2].getTemp().size() == 1,
"Third definition of p_dual_src_export_gfx11 must be a v1", instr.get());
/* NOTE(review): this requires a fixed size of 2 SGPRs, while the lowering
 * of this opcode asserts the lane-mask register class (bld.lm) for the same
 * definition - confirm the two agree when the wave size is 32.
 */
check(instr->definitions[3].getTemp().type() == RegType::sgpr &&
instr->definitions[3].getTemp().size() == 2,
"Fourth definition of p_dual_src_export_gfx11 must be a s2", instr.get());
check(instr->definitions[4].physReg() == vcc,
"Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
check(instr->definitions[5].physReg() == scc,
"Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
instr.get());
/* Every source channel must live in a VGPR unless it is undefined. */
for (unsigned i = 0; i < instr->operands.size(); i++) {
check(instr->operands[i].getTemp().type() == RegType::vgpr ||
instr->operands[i].isUndefined(),
"Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
}
} }
break; break;
} }