aco: handle SGPR limitations when applying extract

We were already doing this, but it was missing in a few places.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31762>
This commit is contained in:
Rhys Perry 2024-10-17 16:56:53 +01:00 committed by Marge Bot
parent 07e28dad75
commit d3ac69f79b
2 changed files with 31 additions and 1 deletion

View file

@ -1057,6 +1057,10 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i
(instr->opcode == aco_opcode::s_pack_hl_b32_b16 && idx == 1))) {
return true;
} else if (instr->opcode == aco_opcode::p_extract) {
if (ctx.program->gfx_level < GFX9 && !info.instr->operands[0].isOfType(RegType::vgpr) &&
instr->definitions[0].regClass().is_subdword())
return false;
SubdwordSel instrSel = parse_extract(instr.get());
/* the outer offset must be within extracted range */
@ -1088,7 +1092,7 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
ctx.info[tmp.id()].label &= ~label_insert;
if (sel.size() == 4) {
if (sel.size() == 4 && tmp.type() == instr->operands[idx].regClass().type()) {
/* full dword selection */
} else if ((instr->opcode == aco_opcode::v_cvt_f32_u32 ||
instr->opcode == aco_opcode::v_cvt_f32_i32) &&
@ -1104,6 +1108,8 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
(sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
/* The undesirable upper bits are already shifted out. */
if (!instr->isVOP3() && !info.instr->operands[0].isOfType(RegType::vgpr))
instr->format = asVOP3(instr->format);
return;
} else if (instr->opcode == aco_opcode::v_mul_u32_u24 && ctx.program->gfx_level >= GFX10 &&
!instr->usesModifiers() && sel.size() == 2 && !sel.sign_extend() &&

View file

@ -634,3 +634,27 @@ BEGIN_TEST(optimize.sdwa.special_case_valu)
finish_opt_test();
END_TEST
/* Optimizer test, set up for GFX8, covering extracts whose source ends up being an SGPR.
 * The program input %a is an SGPR (s1); both cases below extract from a VGPR copy of it,
 * while the expected output refers to %a directly. */
BEGIN_TEST(optimize.sdwa.extract_sgpr_limits)
//>> s1: %a = p_startpgm
if (!setup_cs("s1", GFX8))
return;
Temp a = inputs[0];
/* VGPR copy of the SGPR input; this is what the extracts below are built on. */
Temp a_vgpr = bld.copy(bld.def(v1), a);
/* Case 0: a left shift by 16 applied to a 16-bit extract. The shift already pushes out the
 * bits the extract would have cleared, so the extract is expected to disappear. That leaves
 * the SGPR %a as the shifted operand, so the optimizer should make this VOP3. */
//! v1: %res0 = v_lshlrev_b32 16, %a
//! p_unit_test 0, %res0
writeout(
0, bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16), ext_ushort(a_vgpr, 0)));
/* Case 1: unsupported on GFX8. The outer p_extract has a sub-dword (v1b) result and the
 * inner extract's source is the SGPR %a, so the two extracts are expected to stay separate
 * rather than being merged into one. */
//! v1: %res1_tmp = p_extract %a, 0, 16, 0
//! v1b: %res1 = p_extract %res1_tmp, 0, 8, 0
//! p_unit_test 1, %res1
writeout(1, bld.pseudo(aco_opcode::p_extract, bld.def(v1b), ext_ushort(a_vgpr, 0),
Operand::c32(0), Operand::c32(8), Operand::c32(false)));
finish_opt_test();
END_TEST