aco/gfx12: VOPD src0/1 are src bank compatible if they are the same vgpr

fossil-db (gfx1201):
Totals from 66518 (83.80% of 79377) affected shaders:
Instrs: 36939667 -> 36656685 (-0.77%); split: -0.79%, +0.02%
CodeSize: 220575208 -> 220201764 (-0.17%); split: -0.21%, +0.04%
Latency: 258919732 -> 258137974 (-0.30%); split: -0.35%, +0.05%
InvThroughput: 49911351 -> 49643836 (-0.54%); split: -0.55%, +0.02%
VClause: 788661 -> 788430 (-0.03%); split: -0.04%, +0.01%
SClause: 1176416 -> 1176263 (-0.01%); split: -0.02%, +0.01%
VALU: 18014058 -> 17818119 (-1.09%); split: -1.10%, +0.01%
VOPD: 4926983 -> 5122922 (+3.98%); split: +4.01%, -0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246>
This commit is contained in:
Rhys Perry 2025-03-26 16:01:29 +00:00 committed by Marge Bot
parent 3446f2059d
commit 4fcf2eb1d7
2 changed files with 128 additions and 9 deletions

View file

@ -37,6 +37,7 @@ struct VOPDInfo {
uint16_t is_commutative : 1;
aco_opcode op = aco_opcode::num_opcodes;
uint32_t literal = 0;
uint8_t port_vgprs[2] = {0, 0};
};
struct InstrInfo {
@ -190,8 +191,11 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
op = Operand::get_const(ctx.program->gfx_level, util_bitreverse(op.constantValue()), 4);
unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
if (op.isOfType(RegType::vgpr))
if (op.isOfType(RegType::vgpr)) {
info.src_banks |= 1 << (port * 4 + (op.physReg().reg() & bank_mask[port]));
if (port < 2)
info.port_vgprs[port] = op.physReg().reg();
}
/* Check all operands because of fmaak/fmamk. */
if (op.isLiteral()) {
@ -225,12 +229,29 @@ are_src_banks_compatible(enum amd_gfx_level gfx_level, const VOPDInfo& a, const
}
uint16_t a_src_banks = a.src_banks;
uint8_t a_port_vgprs[2] = {a.port_vgprs[0], a.port_vgprs[1]};
if (swap) {
uint16_t src0 = a.src_banks & 0xf;
uint16_t src1 = a.src_banks & 0xf0;
uint16_t src2 = a.src_banks & 0x300;
a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
std::swap(a_port_vgprs[0], a_port_vgprs[1]);
}
/* On GFX12+, we can skip checking a src0/src1 port if both SRCx and SRCy use the same VGPR and
* the same sized operand.
*/
if (gfx_level >= GFX12) {
bool a_is_dot2cc =
a.op == aco_opcode::v_dual_dot2acc_f32_f16 || a.op == aco_opcode::v_dual_dot2acc_f32_bf16;
bool b_is_dot2cc =
b.op == aco_opcode::v_dual_dot2acc_f32_f16 || b.op == aco_opcode::v_dual_dot2acc_f32_bf16;
if (a_port_vgprs[0] == b.port_vgprs[0] && a_is_dot2cc == b_is_dot2cc)
a_src_banks &= ~0xf;
if (a_port_vgprs[1] == b.port_vgprs[1] && a_is_dot2cc == b_is_dot2cc)
a_src_banks &= ~0xf0;
}
return (a_src_banks & b.src_banks) == 0;
}

View file

@ -99,8 +99,9 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
*/
bld.reset(program->create_and_insert_block());
//>> p_unit_test 1
//! v1: %0:v[0] = v_mov_b32 %0:v[2]
//! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
@ -108,8 +109,9 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
bld.reset(program->create_and_insert_block());
//>> p_unit_test 2
//! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//! v1: %0:v[0] = v_mov_b32 %0:v[2]
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
@ -117,7 +119,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
bld.reset(program->create_and_insert_block());
//>> p_unit_test 3
//! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
@ -125,7 +128,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
bld.reset(program->create_and_insert_block());
//>> p_unit_test 4
//! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
@ -134,7 +138,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
/* The v_add_u32 should be OPY, not OPX. */
bld.reset(program->create_and_insert_block());
//>> p_unit_test 5
//! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
@ -142,7 +147,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
bld.reset(program->create_and_insert_block());
//>> p_unit_test 6
//! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
@ -191,3 +197,95 @@ BEGIN_TEST(vopd_sched.war)
finish_schedule_vopd_test();
}
END_TEST
/* Checks how pairs of VALU instructions that read the same VGPR are (or are not) combined into
 * v_dual_* instructions. Every case is built once for GFX11 and once for GFX12. Lines tagged
 * gfx11/gfx12 are the expected output for that level only; untagged "!" lines are presumably
 * expected on both levels (TODO confirm against the test framework).
 */
BEGIN_TEST(vopd_sched.same_vgpr)
for (amd_gfx_level gfx : {GFX11, GFX12}) {
/* Skip this gfx level when setup_cs() reports failure. */
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
continue;
/* Physical registers 256..259 are printed as v[0]..v[3] in the expected output below. */
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v2{258};
PhysReg reg_v3{259};
/* Case 0: both adds read v[2] as src0. The gfx11 expectation has the operands of one add
 * swapped, while the gfx12 expectation keeps v[2] as src0 of both halves.
 */
//>> p_unit_test 0
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v0, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v1, v1));
/* Case 1: both adds read v[2] as src1. Again only the gfx11 expectation swaps the operands of
 * one add.
 */
//>> p_unit_test 1
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
Operand(reg_v2, v1));
/* Case 2: both adds read v[2] as src0 and v[3] as src1. The gfx12 expectation leaves both
 * halves in their original operand order.
 */
//>> p_unit_test 2
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[3] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
/* Case 3: every source of both adds is v[2]. The adds are expected to stay separate on gfx11
 * and to be combined on gfx12.
 */
//>> p_unit_test 3
//~gfx11! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
//~gfx11! v1: %0:v[1] = v_add_f32 %0:v[2], %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
/* the sources can't be swapped because src0 is an SGPR */
/* Both multiplies read v[2] as src1: expected to stay separate on gfx11, combined on gfx12. */
//>> p_unit_test 4
//~gfx11! v1: %0:v[1] = v_mul_f32 %0:s[1], %0:v[2]
//~gfx11! v1: %0:v[0] = v_mul_f32 %0:s[2], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:s[2], %0:v[2] :: v1: %0:v[1] = v_dual_mul_f32 %0:s[1], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(PhysReg(1), s1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(PhysReg(2), s1),
Operand(reg_v2, v1));
/* fmamk uses src2 for the second source, which doesn't allow the same VGPR */
/* The expectation lines are untagged: the two fmamk instructions are expected to stay
 * separate regardless of gfx level.
 */
//>> p_unit_test 5
//! v1: %0:v[0] = v_fmamk_f32 %0:v[0], %0:v[2], 0x80
//! v1: %0:v[1] = v_fmamk_f32 %0:v[1], %0:v[2], 0x80
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
Operand(reg_v2, v1), Operand::literal32(128));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
Operand(reg_v2, v1), Operand::literal32(128));
/* the two sources have to be the same size */
/* An add and a dot2c that both read only v[2]; expected to stay separate (untagged lines).
 * NOTE(review): this v_dot2c_f32_f16 is built without the third (accumulator) operand that
 * case 7 passes - confirm this is intentional and that the case still exercises the
 * operand-size check rather than failing to combine for an unrelated reason.
 */
//>> p_unit_test 6
//! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
//! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
/* Case 7: two dot2c instructions with identical v[2] sources and their own destination as
 * the third operand. Expected to stay separate on gfx11 and to become a
 * v_dual_dot2acc_f32_f16 pair on gfx12.
 */
//>> p_unit_test 7
//~gfx11! v1: %0:v[0] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[0]
//~gfx11! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[1]
//~gfx12! v1: %0:v[1] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1), Operand(reg_v0, v1));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1), Operand(reg_v1, v1));
/* Presumably runs the VOPD scheduling and compares the result with the expectations above;
 * the helper is defined outside this view - TODO confirm.
 */
finish_schedule_vopd_test();
}
END_TEST