aco/lower_to_hw: Fix SGPR Operand RegClasses for pack_2x16

Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39107>
This commit is contained in:
Daniel Schürmann 2026-01-02 11:31:24 +01:00 committed by Marge Bot
parent 9f5996ae8a
commit b087bf2fbf

View file

@ -1613,6 +1613,12 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
bool can_use_vop3 = ctx->program->gfx_level >= GFX10 ||
(!lo.isLiteral() && !hi.isLiteral() && !both_ops_are_sgpr);
/* Fix SGPR operands RegClasses. */
bool opsel_lo = lo.physReg().byte();
bool opsel_hi = hi.physReg().byte();
lo = lo.isOfType(RegType::sgpr) ? Operand(PhysReg(lo.physReg().reg()), lo.regClass()) : lo;
hi = hi.isOfType(RegType::sgpr) ? Operand(PhysReg(hi.physReg().reg()), hi.regClass()) : hi;
/* v_pack_b32_f16 can be used for bit exact copies if:
* - fp16 input denorms are enabled, otherwise they get flushed to zero
* - signalling input NaNs are kept, which is the case with IEEE_MODE=0
@ -1625,12 +1631,13 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
if (can_use_pack) {
Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
/* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
instr->valu().opsel[0] = opsel_lo;
instr->valu().opsel[1] = opsel_hi;
return;
}
/* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 && can_use_vop3 &&
if (opsel_lo && !opsel_hi && can_use_vop3 &&
(!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
ctx->program->gfx_level >= GFX10)))) {
if (hi.isConstant())
@ -1649,8 +1656,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
ops[i] =
ops[i].isConstant() ? Operand::c32((int32_t)(int16_t)ops[i].constantValue()) : ops[i];
swiz[i * 2 + 0] = i * 4 + ops[i].physReg().byte();
swiz[i * 2 + 1] = i * 4 + ops[i].physReg().byte() + 1;
unsigned opsel = i ? opsel_hi : opsel_lo;
swiz[i * 2 + 0] = i * 4 + opsel * 2;
swiz[i * 2 + 1] = i * 4 + opsel * 2 + 1;
if (ops[i].isLiteral()) {
Operand b0 = Operand::c32((int32_t)(int8_t)ops[i].constantValue());
@ -1681,7 +1689,7 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
if (lo.isConstant()) {
/* move hi and zero low bits */
if (hi.physReg().byte() == 0)
if (!opsel_hi)
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
else
bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
@ -1692,7 +1700,7 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
}
if (hi.isConstant()) {
/* move lo and zero high bits */
if (lo.physReg().byte() == 2)
if (opsel_lo)
bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
else if (ctx->program->gfx_level >= GFX11)
bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
@ -1706,36 +1714,39 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
if (lo.physReg().reg() == def.physReg().reg()) {
/* lo is in the high bits of def */
assert(lo.physReg().byte() == 2);
assert(opsel_lo);
bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
lo.setFixed(def.physReg());
} else if (hi.physReg() == def.physReg()) {
lo = Operand(def.physReg(), v2b);
} else if (hi.physReg().reg() == def.physReg().reg()) {
/* hi is in the low bits of def */
assert(hi.physReg().byte() == 0);
assert(!opsel_hi);
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
hi.setFixed(def.physReg().advance(2));
hi = Operand(def.physReg().advance(2), v2b);
} else if (ctx->program->gfx_level >= GFX8) {
/* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
* op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
* op.physReg().byte() == def.physReg().byte() and the other half will be overwritten.
*/
assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
Operand& op = lo.physReg().byte() == 0 ? lo : hi;
assert(!opsel_lo || opsel_hi);
Operand& op = !opsel_lo ? lo : hi;
PhysReg reg = def.physReg().advance(op.physReg().byte());
bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
op.setFixed(reg);
op = Operand(reg, v2b);
}
/* either hi or lo are already placed correctly */
/* Either hi or lo are already placed correctly. */
bool opsel = lo.physReg() == def.physReg() ? opsel_hi : opsel_lo;
Operand op = lo.physReg() == def.physReg() ? hi : lo;
def = lo.physReg() == def.physReg() ? def_hi : def_lo;
if (ctx->program->gfx_level >= GFX11) {
if (lo.physReg().reg() == def.physReg().reg())
emit_v_mov_b16(bld, def_hi, hi);
else
emit_v_mov_b16(bld, def_lo, lo);
Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, def, op);
instr->valu().opsel[0] = opsel;
instr->valu().opsel[3] = def.physReg().byte();
if (op.isOfType(RegType::sgpr) && opsel)
instr->format = asVOP3(instr->format);
} else {
if (lo.physReg().reg() == def.physReg().reg())
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
else
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
Instruction* instr = bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
if (op.isOfType(RegType::sgpr))
instr->sdwa().sel[0] = opsel ? SubdwordSel::uword1 : SubdwordSel::uword0;
}
}