mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-27 04:30:37 +02:00
aco: Emulate Wave64 bpermute on GFX11.
Similar to emit_gfx10_wave64_bpermute, but uses the new v_permlane64_b32 instruction to swap data between wave halves. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20293>
This commit is contained in:
parent
853e76f007
commit
db5c3f170f
6 changed files with 81 additions and 4 deletions
|
|
@ -228,7 +228,9 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
|
|||
return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2),
|
||||
bld.def(s1, scc), index_x4, input_data, same_half);
|
||||
} else {
|
||||
unreachable("emit_bpermute does not yet support GFX11+");
|
||||
return bld.pseudo(aco_opcode::p_bpermute_gfx11w64, bld.def(v1), bld.def(s2),
|
||||
bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
|
||||
same_half);
|
||||
}
|
||||
} else {
|
||||
/* GFX8-9 or GFX10 wave32: bpermute works normally */
|
||||
|
|
|
|||
|
|
@ -838,6 +838,67 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c
|
|||
}
|
||||
}
|
||||
|
||||
/* Lowers a p_bpermute_gfx11w64 pseudo instruction into real instructions,
 * all of which are emitted through bld.
 *
 * Layout of instr that this function reads (and checks with asserts):
 *   definitions[0]: result VGPR (v1)
 *   definitions[1]: temporary that holds the saved EXEC (lane mask class)
 *   definitions[2]: SCC, clobbered by the EXEC save
 *   operands[0]:    linear VGPR temporary (v1.as_linear())
 *   operands[1]:    index * 4 (v1)
 *   operands[2]:    input data (VGPR, at most 4 bytes)
 *   operands[3]:    same-half lane mask
 */
void
|
||||
emit_gfx11_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
|
||||
{
|
||||
/* Emulates proper bpermute on GFX11 in wave64 mode.
|
||||
*
|
||||
* Similar to emit_gfx10_wave64_bpermute, but uses the new
|
||||
* v_permlane64_b32 instruction to swap data between lo and hi halves.
|
||||
*/
|
||||
|
||||
/* This lowering is only valid for GFX11+ programs compiled as wave64. */
assert(program->gfx_level >= GFX11);
|
||||
assert(program->wave_size == 64);
|
||||
|
||||
/* Unpack the pseudo instruction's definitions and operands into named locals. */
Definition dst = instr->definitions[0];
|
||||
Definition tmp_exec = instr->definitions[1];
|
||||
Definition clobber_scc = instr->definitions[2];
|
||||
Operand tmp_op = instr->operands[0];
|
||||
Operand index_x4 = instr->operands[1];
|
||||
Operand input_data = instr->operands[2];
|
||||
Operand same_half = instr->operands[3];
|
||||
|
||||
/* Check that every definition/operand has the register class the sequence below relies on. */
assert(dst.regClass() == v1);
|
||||
assert(tmp_exec.regClass() == bld.lm);
|
||||
assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
|
||||
assert(same_half.regClass() == bld.lm);
|
||||
assert(tmp_op.regClass() == v1.as_linear());
|
||||
assert(index_x4.regClass() == v1);
|
||||
assert(input_data.regClass().type() == RegType::vgpr);
|
||||
assert(input_data.bytes() <= 4);
|
||||
|
||||
/* tmp_def names the same physical register and class as tmp_op, so the
 * temporary can be written (tmp_def) and read back (tmp_op) below.
 */
Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
|
||||
|
||||
/* Permute the input within the same half-wave. */
|
||||
bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
|
||||
|
||||
/* Save EXEC and enable all lanes. */
|
||||
/* The previous EXEC is kept in tmp_exec; SCC is written as a side effect (clobber_scc). */
bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
|
||||
Operand::c32(-1u), Operand(exec, s2));
|
||||
|
||||
/* Copy input data from other half to current half's linear VGPR. */
|
||||
bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
|
||||
|
||||
/* Permute the input from the other half-wave, write to linear VGPR. */
|
||||
/* Source (tmp_op) and destination (tmp_def) are the same register: the
 * swapped data is consumed and replaced by the permute result.
 */
bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
|
||||
|
||||
/* Restore saved EXEC. */
|
||||
bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
|
||||
|
||||
/* Select correct permute result. */
|
||||
/* Per-lane choice between the other-half result (tmp_op) and the same-half
 * result already in dst, controlled by same_half.
 * NOTE(review): presumably lanes with same_half set keep dst - confirm
 * against the v_cndmask_b32 operand order.
 */
bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
|
||||
same_half);
|
||||
|
||||
/* RA assumes that the result is always in the low part of the register, so we have to shift,
|
||||
* if it's not there already.
|
||||
*/
|
||||
/* The shift amount in bits is the byte offset of input_data within its register, times 8. */
if (input_data.physReg().byte()) {
|
||||
unsigned right_shift = input_data.physReg().byte() * 8;
|
||||
bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
|
||||
Operand(dst.physReg(), dst.regClass()));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
emit_gfx10_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
|
||||
{
|
||||
|
|
@ -2202,6 +2263,10 @@ lower_to_hw_instr(Program* program)
|
|||
emit_gfx10_wave64_bpermute(program, instr, bld);
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_bpermute_gfx11w64: {
|
||||
emit_gfx11_wave64_bpermute(program, instr, bld);
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_constaddr: {
|
||||
unsigned id = instr->definitions[0].tempId();
|
||||
PhysReg reg = instr->definitions[0].physReg();
|
||||
|
|
|
|||
|
|
@ -325,6 +325,11 @@ opcode("p_bpermute_gfx6")
|
|||
# operands: index * 4, input data, same half (bool)
|
||||
opcode("p_bpermute_gfx10w64")
|
||||
|
||||
# simulates proper bpermute behavior on GFX11 in wave64 mode
|
||||
# definitions: result VGPR, temp EXEC, clobbered SCC
|
||||
# operands: linear VGPR, index * 4, input data, same half (bool)
|
||||
opcode("p_bpermute_gfx11w64")
|
||||
|
||||
# creates a lane mask where only the first active lane is selected
|
||||
opcode("p_elect")
|
||||
|
||||
|
|
|
|||
|
|
@ -675,6 +675,7 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
|
|||
case aco_opcode::p_insert: return operand != 0;
|
||||
case aco_opcode::p_bpermute_gfx6:
|
||||
case aco_opcode::p_bpermute_gfx10w64:
|
||||
case aco_opcode::p_bpermute_gfx11w64:
|
||||
case aco_opcode::p_interp_gfx11:
|
||||
case aco_opcode::p_dual_src_export_gfx11: return false;
|
||||
default: return true;
|
||||
|
|
|
|||
|
|
@ -45,7 +45,8 @@ setup_reduce_temp(Program* program)
|
|||
std::vector<bool> hasReductions(program->blocks.size());
|
||||
for (Block& block : program->blocks) {
|
||||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||
if (instr->opcode == aco_opcode::p_interp_gfx11) {
|
||||
if (instr->opcode == aco_opcode::p_interp_gfx11 ||
|
||||
instr->opcode == aco_opcode::p_bpermute_gfx11w64) {
|
||||
maxSize = MAX2(maxSize, 1);
|
||||
hasReductions[block.index] = true;
|
||||
} else if (instr->format == Format::PSEUDO_REDUCTION) {
|
||||
|
|
@ -95,7 +96,8 @@ setup_reduce_temp(Program* program)
|
|||
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
|
||||
Instruction* instr = (*it).get();
|
||||
if (instr->format != Format::PSEUDO_REDUCTION &&
|
||||
instr->opcode != aco_opcode::p_interp_gfx11)
|
||||
instr->opcode != aco_opcode::p_interp_gfx11 &&
|
||||
instr->opcode != aco_opcode::p_bpermute_gfx11w64)
|
||||
continue;
|
||||
|
||||
reduceTmp_in_loop |= block.loop_nest_depth > 0;
|
||||
|
|
@ -169,7 +171,8 @@ setup_reduce_temp(Program* program)
|
|||
if (need_vtmp)
|
||||
instr->operands[2] = Operand(vtmp);
|
||||
} else {
|
||||
assert(instr->opcode == aco_opcode::p_interp_gfx11);
|
||||
assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
|
||||
instr->opcode == aco_opcode::p_bpermute_gfx11w64);
|
||||
instr->operands[0] = Operand(reduceTmp);
|
||||
instr->operands[0].setLateKill(true);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -264,6 +264,7 @@ validate_ir(Program* program)
|
|||
instr->opcode == aco_opcode::p_jump_to_epilog ||
|
||||
instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
|
||||
(instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
|
||||
(instr->opcode == aco_opcode::p_bpermute_gfx11w64 && i == 0) ||
|
||||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
|
||||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
|
||||
(instr->isScratch() && i == 0);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue