aco: Fix reductions on GFX10.

Fixes p_reduce (all cluster sizes), p_inclusive_scan and p_exclusive_scan
with all reduction operations.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
Rhys Perry 2019-09-12 19:28:52 +01:00
parent cd04b63c00
commit 3865448012
3 changed files with 95 additions and 18 deletions

View file

@ -841,7 +841,7 @@ enum ReduceOp {
* Operand(2): vector temporary
* Definition(0): result
* Definition(1): scalar temporary
* Definition(2): scalar identity temporary
* Definition(2): scalar identity temporary (not used to store identity on GFX10)
* Definition(3): scc clobber
* Definition(4): vcc clobber
*

View file

@ -85,6 +85,22 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, Ph
}
}
/**
 * Append one plain (non-DPP) two-source vector ALU instruction,
 * `dst = op(src0, src1)`, to the instruction list being built.
 *
 * @param ctx          lowering context; the new instruction is pushed onto
 *                     ctx->instructions.
 * @param dst          first register of the destination (VGPR class).
 * @param src0         first register of source 0. Register numbers >= 256 are
 *                     given a VGPR class, lower numbers an SGPR class.
 * @param src1         first register of source 1 (VGPR class).
 * @param op           opcode to emit.
 * @param format       Format::VOP3 creates a VOP3A_instruction; any other
 *                     value creates a VOP2_instruction.
 * @param clobber_vcc  when true, a second definition marking vcc (s2) as
 *                     written is attached.
 * @param size         number of dwords occupied by each of dst/src0/src1.
 */
void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
             aco_opcode op, Format format, bool clobber_vcc, unsigned size)
{
   /* Derive the register classes from `size` instead of hard-coding one
    * dword: callers pass src.size(), which is 2 for the 64-bit reduction
    * ops, and the operands/definitions must then describe both registers. */
   const RegClass vgpr_rc = RegClass(RegType::vgpr, size);
   const RegClass src0_rc = src0.reg >= 256 ? vgpr_rc : RegClass(RegType::sgpr, size);
   const unsigned num_defs = clobber_vcc ? 2 : 1;

   aco_ptr<Instruction> instr;
   if (format == Format::VOP3)
      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, num_defs));
   else
      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, num_defs));

   instr->operands[0] = Operand(src0, src0_rc);
   instr->operands[1] = Operand(src1, vgpr_rc);
   instr->definitions[0] = Definition(dst, vgpr_rc);
   if (clobber_vcc)
      instr->definitions[1] = Definition(vcc, s2);

   ctx->instructions.emplace_back(std::move(instr));
}
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
{
switch (op) {
@ -236,12 +252,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
Operand vcndmask_identity[2] = {identity[0], identity[1]};
/* First, copy the source to tmp and set inactive lanes to the identity */
// note: this clobbers SCC!
bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
for (unsigned i = 0; i < src.size(); i++) {
/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */
if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
/* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
* except on GFX10, where v_writelane_b32 can take a literal. */
if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
identity[i] = Operand(PhysReg{sitmp+i}, s1);
@ -283,6 +299,16 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
exec_restored = true;
emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
dst_written = true;
} else if (ctx->program->chip_class >= GFX10) {
assert(cluster_size == 64);
/* GFX10+ doesn't support row_bcast15 and row_bcast31 */
for (unsigned i = 0; i < src.size(); i++)
bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
for (unsigned i = 0; i < src.size(); i++)
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
} else {
assert(cluster_size == 64);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
@ -292,11 +318,38 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
}
break;
case aco_opcode::p_exclusive_scan:
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
dpp_wf_sr1, 0xf, 0xf, true, src.size());
if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
/* shift rows right */
for (unsigned i = 0; i < src.size(); i++) {
bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), dpp_row_sr(1), 0xf, 0xf, true);
}
/* fill in the gaps in rows 1 and 3 */
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
for (unsigned i = 0; i < src.size(); i++) {
Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
Definition(PhysReg{vtmp+i}, v1),
Operand(PhysReg{tmp+i}, v1),
Operand(0xffffffffu), Operand(0xffffffffu)).instr;
static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
}
bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
/* fill in the gap in row 2 */
for (unsigned i = 0; i < src.size(); i++) {
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
}
std::swap(tmp, vtmp);
} else {
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
dpp_wf_sr1, 0xf, 0xf, true, src.size());
}
for (unsigned i = 0; i < src.size(); i++) {
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
if (ctx->program->chip_class < GFX10)
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
identity[i], Operand(0u));
}
@ -312,10 +365,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
if (ctx->program->chip_class >= GFX10) {
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
for (unsigned i = 0; i < src.size(); i++) {
Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
Definition(PhysReg{vtmp+i}, v1),
Operand(PhysReg{tmp+i}, v1),
Operand(0xffffffffu), Operand(0xffffffffu)).instr;
static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
}
emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
for (unsigned i = 0; i < src.size(); i++)
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
} else {
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
}
break;
default:
unreachable("Invalid reduction mode");

View file

@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
}
/* same as before, except for the vector temporary instead of the reduce temporary */
unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64;
if (program->chip_class >= GFX10 && cluster_size == 64)
need_vtmp = true;
need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32;
need_vtmp |= cluster_size == 32;
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = {program->allocateId(), vtmp.regClass()};
@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
instr->definitions[1] = bld.def(s2);
/* scalar identity temporary */
if (instr->opcode == aco_opcode::p_exclusive_scan &&
(op == imin32 || op == imin64 ||
op == imax32 || op == imax64 ||
op == fmin32 || op == fmin64 ||
op == fmax32 || op == fmax64 ||
op == fmul64)) {
bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
if (instr->opcode == aco_opcode::p_exclusive_scan) {
need_sitmp |=
(op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
op == fmul64);
}
if (need_sitmp) {
instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size()));
}