aco: Split opcodes for GFX6 and GFX10 emulated bpermute.

Different sequences are emitted for these, so it makes sense to
have different opcodes too.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20293>
This commit is contained in:
Timur Kristóf 2022-12-13 09:39:30 +01:00 committed by Marge Bot
parent 614348f28b
commit 640e801651
4 changed files with 20 additions and 13 deletions

View file

@ -199,8 +199,8 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
index_op.setLateKill(true);
input_data.setLateKill(true);
return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
index_op, input_data);
return bld.pseudo(aco_opcode::p_bpermute_gfx6, bld.def(v1), bld.def(bld.lm),
bld.def(bld.lm, vcc), index_op, input_data);
} else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
/* GFX10 wave64 mode: emulate full-wave bpermute */
@ -223,7 +223,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
* Note that these have twice the allocation granularity of normal VGPRs. */
ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2), bld.def(s1, scc),
index_x4, input_data, same_half);
} else {
/* GFX8-9 or GFX10 wave32: bpermute works normally */

View file

@ -2193,13 +2193,12 @@ lower_to_hw_instr(Program* program)
}
break;
}
case aco_opcode::p_bpermute: {
if (ctx.program->gfx_level <= GFX7)
emit_gfx6_bpermute(program, instr, bld);
else if (ctx.program->gfx_level >= GFX10 && ctx.program->wave_size == 64)
emit_gfx10_wave64_bpermute(program, instr, bld);
else
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
case aco_opcode::p_bpermute_gfx6: {
emit_gfx6_bpermute(program, instr, bld);
break;
}
case aco_opcode::p_bpermute_gfx10w64: {
emit_gfx10_wave64_bpermute(program, instr, bld);
break;
}
case aco_opcode::p_constaddr: {

View file

@ -315,8 +315,15 @@ opcode("p_demote_to_helper")
opcode("p_is_helper")
opcode("p_exit_early_if")
# simulates proper bpermute behavior when it's unsupported, e.g. GFX10 wave64
opcode("p_bpermute")
# simulates proper bpermute behavior on GFX6-7
# definitions: result VGPR, temp EXEC, clobbered VCC
# operands: index, input data
opcode("p_bpermute_gfx6")
# simulates proper full-wave bpermute behavior on GFX10+ in wave64 mode
# definitions: result VGPR, temp EXEC, clobbered SCC
# operands: index * 4, input data, same half (bool)
opcode("p_bpermute_gfx10w64")
# creates a lane mask where only the first active lane is selected
opcode("p_elect")

View file

@ -673,7 +673,8 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
case aco_opcode::v_readfirstlane_b32:
case aco_opcode::p_extract:
case aco_opcode::p_insert: return operand != 0;
case aco_opcode::p_bpermute:
case aco_opcode::p_bpermute_gfx6:
case aco_opcode::p_bpermute_gfx10w64:
case aco_opcode::p_interp_gfx11:
case aco_opcode::p_dual_src_export_gfx11: return false;
default: return true;