aco/gfx11+: don't use VOP3 v_swap_b16

v_swap_b16 is not officially supported as VOP3, so it can't be used with v128-255.
Tests show that VOP3 appears to work correctly, but according to AMD that should
not be relied on.

https://github.com/llvm/llvm-project/pull/100442#discussion_r1703929676

Foz-DB Navi31:
Totals from 6 (0.01% of 79395) affected shaders:
Instrs: 64799 -> 65932 (+1.75%)
CodeSize: 360180 -> 368440 (+2.29%)
Latency: 1364648 -> 1365922 (+0.09%)
InvThroughput: 635843 -> 636475 (+0.10%)
Copies: 14766 -> 15698 (+6.31%)
VALU: 38743 -> 39675 (+2.41%)

Fixes: 80b8bbf0c5 ("aco/gfx11: use v_swap_b16")

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30515>
This commit is contained in:
Georg Lehmann 2024-08-05 14:21:19 +02:00 committed by Marge Bot
parent 796b3ab23d
commit e0818cb87b

View file

@ -1408,9 +1408,28 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
if (def.bytes() == 2) {
Operand def_as_op = Operand(def.physReg(), def.regClass());
Definition op_as_def = Definition(op.physReg(), op.regClass());
Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[3] = def.physReg().byte();
/* v_swap_b16 is not officially supported as VOP3, so it can't be used with v128-255.
* Tests show that VOP3 appears to work correctly, but according to AMD that should
* not be relied on.
*/
if (def.physReg() < (256 + 128) && op.physReg() < (256 + 128)) {
Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[3] = def.physReg().byte();
} else {
Instruction* instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[1] = def_as_op.physReg().byte();
instr->valu().opsel[3] = def.physReg().byte();
instr = bld.vop3(aco_opcode::v_xor_b16, op_as_def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[1] = def_as_op.physReg().byte();
instr->valu().opsel[3] = op_as_def.physReg().byte();
instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[1] = def_as_op.physReg().byte();
instr->valu().opsel[3] = def.physReg().byte();
}
} else {
PhysReg op_half = op.physReg();
op_half.reg_b &= ~1;