mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 05:18:08 +02:00
aco: Fix reductions on GFX10.
Fixes p_reduce (all cluster sizes), p_inclusive_scan and p_exclusive_scan with all reduction operations. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
parent
cd04b63c00
commit
3865448012
3 changed files with 95 additions and 18 deletions
|
|
@ -841,7 +841,7 @@ enum ReduceOp {
|
|||
* Operand(2): vector temporary
|
||||
* Definition(0): result
|
||||
* Definition(1): scalar temporary
|
||||
* Definition(2): scalar identity temporary
|
||||
* Definition(2): scalar identity temporary (not used to store identity on GFX10)
|
||||
* Definition(3): scc clobber
|
||||
* Definition(4): vcc clobber
|
||||
*
|
||||
|
|
|
|||
|
|
@ -85,6 +85,22 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, Ph
|
|||
}
|
||||
}
|
||||
|
||||
void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
|
||||
aco_opcode op, Format format, bool clobber_vcc, unsigned size)
|
||||
{
|
||||
aco_ptr<Instruction> instr;
|
||||
if (format == Format::VOP3)
|
||||
instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
|
||||
else
|
||||
instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
|
||||
instr->operands[0] = Operand(src0, src0.reg >= 256 ? v1 : s1);
|
||||
instr->operands[1] = Operand(src1, v1);
|
||||
instr->definitions[0] = Definition(dst, v1);
|
||||
if (clobber_vcc)
|
||||
instr->definitions[1] = Definition(vcc, s2);
|
||||
ctx->instructions.emplace_back(std::move(instr));
|
||||
}
|
||||
|
||||
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
|
||||
{
|
||||
switch (op) {
|
||||
|
|
@ -236,12 +252,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
Operand vcndmask_identity[2] = {identity[0], identity[1]};
|
||||
|
||||
/* First, copy the source to tmp and set inactive lanes to the identity */
|
||||
// note: this clobbers SCC!
|
||||
bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
|
||||
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */
|
||||
if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
|
||||
/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
|
||||
* except on GFX10, where v_writelane_b32 can take a literal. */
|
||||
if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
|
||||
identity[i] = Operand(PhysReg{sitmp+i}, s1);
|
||||
|
||||
|
|
@ -283,6 +299,16 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
exec_restored = true;
|
||||
emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
|
||||
dst_written = true;
|
||||
} else if (ctx->program->chip_class >= GFX10) {
|
||||
assert(cluster_size == 64);
|
||||
/* GFX10+ doesn't support row_bcast15 and row_bcast31 */
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
|
||||
emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
|
||||
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
|
||||
} else {
|
||||
assert(cluster_size == 64);
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
|
|
@ -292,11 +318,38 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
}
|
||||
break;
|
||||
case aco_opcode::p_exclusive_scan:
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
|
||||
dpp_wf_sr1, 0xf, 0xf, true, src.size());
|
||||
if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
|
||||
/* shift rows right */
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), dpp_row_sr(1), 0xf, 0xf, true);
|
||||
}
|
||||
|
||||
/* fill in the gaps in rows 1 and 3 */
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
|
||||
Definition(PhysReg{vtmp+i}, v1),
|
||||
Operand(PhysReg{tmp+i}, v1),
|
||||
Operand(0xffffffffu), Operand(0xffffffffu)).instr;
|
||||
static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
|
||||
}
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
|
||||
|
||||
/* fill in the gap in row 2 */
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
|
||||
}
|
||||
std::swap(tmp, vtmp);
|
||||
} else {
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
|
||||
dpp_wf_sr1, 0xf, 0xf, true, src.size());
|
||||
}
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
|
||||
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
|
||||
if (ctx->program->chip_class < GFX10)
|
||||
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
|
||||
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
|
||||
identity[i], Operand(0u));
|
||||
}
|
||||
|
|
@ -312,10 +365,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
|
||||
if (ctx->program->chip_class >= GFX10) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
|
||||
Definition(PhysReg{vtmp+i}, v1),
|
||||
Operand(PhysReg{tmp+i}, v1),
|
||||
Operand(0xffffffffu), Operand(0xffffffffu)).instr;
|
||||
static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
|
||||
}
|
||||
emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
|
||||
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
|
||||
} else {
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
|
||||
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
|
||||
dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid reduction mode");
|
||||
|
|
|
|||
|
|
@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
|
|||
}
|
||||
|
||||
/* same as before, except for the vector temporary instead of the reduce temporary */
|
||||
unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
|
||||
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
|
||||
op == fmin64 || op == fmax64;
|
||||
if (program->chip_class >= GFX10 && cluster_size == 64)
|
||||
need_vtmp = true;
|
||||
|
||||
need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32;
|
||||
need_vtmp |= cluster_size == 32;
|
||||
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
|
||||
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
|
||||
vtmp = {program->allocateId(), vtmp.regClass()};
|
||||
|
|
@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
|
|||
instr->definitions[1] = bld.def(s2);
|
||||
|
||||
/* scalar identity temporary */
|
||||
if (instr->opcode == aco_opcode::p_exclusive_scan &&
|
||||
(op == imin32 || op == imin64 ||
|
||||
op == imax32 || op == imax64 ||
|
||||
op == fmin32 || op == fmin64 ||
|
||||
op == fmax32 || op == fmax64 ||
|
||||
op == fmul64)) {
|
||||
bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
|
||||
if (instr->opcode == aco_opcode::p_exclusive_scan) {
|
||||
need_sitmp |=
|
||||
(op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
|
||||
op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
|
||||
op == fmul64);
|
||||
}
|
||||
if (need_sitmp) {
|
||||
instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size()));
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue