mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 16:08:04 +02:00
aco/gfx12: VOPD src0/1 are src bank compatible if they are the same vgpr
fossil-db (gfx1201):
Totals from 66518 (83.80% of 79377) affected shaders:
Instrs: 36939667 -> 36656685 (-0.77%); split: -0.79%, +0.02%
CodeSize: 220575208 -> 220201764 (-0.17%); split: -0.21%, +0.04%
Latency: 258919732 -> 258137974 (-0.30%); split: -0.35%, +0.05%
InvThroughput: 49911351 -> 49643836 (-0.54%); split: -0.55%, +0.02%
VClause: 788661 -> 788430 (-0.03%); split: -0.04%, +0.01%
SClause: 1176416 -> 1176263 (-0.01%); split: -0.02%, +0.01%
VALU: 18014058 -> 17818119 (-1.09%); split: -1.10%, +0.01%
VOPD: 4926983 -> 5122922 (+3.98%); split: +4.01%, -0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246>
This commit is contained in:
parent
3446f2059d
commit
4fcf2eb1d7
2 changed files with 128 additions and 9 deletions
|
|
@ -37,6 +37,7 @@ struct VOPDInfo {
|
|||
uint16_t is_commutative : 1;
|
||||
aco_opcode op = aco_opcode::num_opcodes;
|
||||
uint32_t literal = 0;
|
||||
uint8_t port_vgprs[2] = {0, 0};
|
||||
};
|
||||
|
||||
struct InstrInfo {
|
||||
|
|
@ -190,8 +191,11 @@ get_vopd_info(const SchedILPContext& ctx, const Instruction* instr)
|
|||
op = Operand::get_const(ctx.program->gfx_level, util_bitreverse(op.constantValue()), 4);
|
||||
|
||||
unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
|
||||
if (op.isOfType(RegType::vgpr))
|
||||
if (op.isOfType(RegType::vgpr)) {
|
||||
info.src_banks |= 1 << (port * 4 + (op.physReg().reg() & bank_mask[port]));
|
||||
if (port < 2)
|
||||
info.port_vgprs[port] = op.physReg().reg();
|
||||
}
|
||||
|
||||
/* Check all operands because of fmaak/fmamk. */
|
||||
if (op.isLiteral()) {
|
||||
|
|
@ -225,12 +229,29 @@ are_src_banks_compatible(enum amd_gfx_level gfx_level, const VOPDInfo& a, const
|
|||
}
|
||||
|
||||
uint16_t a_src_banks = a.src_banks;
|
||||
uint8_t a_port_vgprs[2] = {a.port_vgprs[0], a.port_vgprs[1]};
|
||||
if (swap) {
|
||||
uint16_t src0 = a.src_banks & 0xf;
|
||||
uint16_t src1 = a.src_banks & 0xf0;
|
||||
uint16_t src2 = a.src_banks & 0x300;
|
||||
a_src_banks = (src0 << 4) | (src1 >> 4) | src2;
|
||||
std::swap(a_port_vgprs[0], a_port_vgprs[1]);
|
||||
}
|
||||
|
||||
/* On GFX12+, we can skip checking a src0/src1 port if both SRCx and SRCy use the same VGPR and
|
||||
* the same sized operand.
|
||||
*/
|
||||
if (gfx_level >= GFX12) {
|
||||
bool a_is_dot2cc =
|
||||
a.op == aco_opcode::v_dual_dot2acc_f32_f16 || a.op == aco_opcode::v_dual_dot2acc_f32_bf16;
|
||||
bool b_is_dot2cc =
|
||||
b.op == aco_opcode::v_dual_dot2acc_f32_f16 || b.op == aco_opcode::v_dual_dot2acc_f32_bf16;
|
||||
if (a_port_vgprs[0] == b.port_vgprs[0] && a_is_dot2cc == b_is_dot2cc)
|
||||
a_src_banks &= ~0xf;
|
||||
if (a_port_vgprs[1] == b.port_vgprs[1] && a_is_dot2cc == b_is_dot2cc)
|
||||
a_src_banks &= ~0xf0;
|
||||
}
|
||||
|
||||
return (a_src_banks & b.src_banks) == 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -99,8 +99,9 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
*/
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:v[2]
|
||||
//! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
|
||||
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
|
|
@ -108,8 +109,9 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 2
|
||||
//! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:v[2]
|
||||
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v3, v1));
|
||||
|
|
@ -117,7 +119,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 3
|
||||
//! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
|
||||
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
|
|
@ -125,7 +128,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 4
|
||||
//! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
|
||||
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v3, v1));
|
||||
|
|
@ -134,7 +138,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
/* The v_add_u32 should be OPY, not OPX. */
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 5
|
||||
//! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
|
||||
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v3, v1), Operand::zero());
|
||||
|
|
@ -142,7 +147,8 @@ BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
|
|||
|
||||
bld.reset(program->create_and_insert_block());
|
||||
//>> p_unit_test 6
|
||||
//! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
|
||||
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
|
|
@ -191,3 +197,95 @@ BEGIN_TEST(vopd_sched.war)
|
|||
finish_schedule_vopd_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
/* Exercises VOPD pairing of two VALU instructions whose src0 and/or src1 read
 * the same VGPR. The body runs once for GFX11 and once for GFX12; wherever the
 * expected output differs between the two, the expectation lines carry a
 * gfx11/gfx12 prefix.
 */
BEGIN_TEST(vopd_sched.same_vgpr)
|
||||
for (amd_gfx_level gfx : {GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
|
||||
continue;
|
||||
|
||||
/* Registers used below; they show up as v[0]..v[3] in the expectation lines. */
PhysReg reg_v0{256};
|
||||
PhysReg reg_v1{257};
|
||||
PhysReg reg_v2{258};
|
||||
PhysReg reg_v3{259};
|
||||
|
||||
/* Both adds read v2 as src0; src1 differs (v0 vs. v1). GFX11 is expected to
 * pair them with the operands of one add swapped, GFX12 without any swap.
 */
//>> p_unit_test 0
|
||||
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v0, v1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v1, v1));
|
||||
|
||||
/* Mirror of test 0: both adds read v2 as src1; src0 differs (v0 vs. v1). */
//>> p_unit_test 1
|
||||
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
|
||||
Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
|
||||
Operand(reg_v2, v1));
|
||||
|
||||
/* Both adds read v2 as src0 and v3 as src1. GFX11 is expected to swap the
 * operands of one add; GFX12 is expected to keep both as written.
 */
//>> p_unit_test 2
|
||||
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[3] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v3, v1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v3, v1));
|
||||
|
||||
/* Every source of both adds is v2, so swapping operands changes nothing.
 * GFX11 is expected to leave the adds unpaired; GFX12 is expected to pair them.
 */
//>> p_unit_test 3
|
||||
//~gfx11! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
|
||||
//~gfx11! v1: %0:v[1] = v_add_f32 %0:v[2], %0:v[2]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[2]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1));
|
||||
|
||||
/* The sources can't be swapped because src0 is an SGPR. Both multiplies read
 * v2 as src1, so GFX11 is expected to leave them unpaired while GFX12 is
 * expected to pair them.
 */
//>> p_unit_test 4
|
||||
//~gfx11! v1: %0:v[1] = v_mul_f32 %0:s[1], %0:v[2]
|
||||
//~gfx11! v1: %0:v[0] = v_mul_f32 %0:s[2], %0:v[2]
|
||||
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:s[2], %0:v[2] :: v1: %0:v[1] = v_dual_mul_f32 %0:s[1], %0:v[2]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(PhysReg(1), s1),
|
||||
Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(PhysReg(2), s1),
|
||||
Operand(reg_v2, v1));
|
||||
|
||||
/* fmamk uses src2 for the second source, which doesn't allow the same VGPR.
 * The expectation lines have no gfx prefix and list both instructions unpaired.
 */
//>> p_unit_test 5
|
||||
//! v1: %0:v[0] = v_fmamk_f32 %0:v[0], %0:v[2], 0x80
|
||||
//! v1: %0:v[1] = v_fmamk_f32 %0:v[1], %0:v[2], 0x80
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
|
||||
Operand(reg_v2, v1), Operand::literal32(128));
|
||||
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
|
||||
Operand(reg_v2, v1), Operand::literal32(128));
|
||||
|
||||
/* The two sources have to be the same size: v_add_f32 and v_dot2c_f32_f16 both
 * read v2 on both sources, yet the expectation lines list them unpaired.
 */
//>> p_unit_test 6
|
||||
//! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
|
||||
//! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1));
|
||||
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1));
|
||||
|
||||
/* Counterpart of test 6: both instructions are v_dot2c_f32_f16 reading v2 on
 * both sources. GFX11 is expected to leave them unpaired; GFX12 is expected to
 * pair them.
 */
//>> p_unit_test 7
|
||||
//~gfx11! v1: %0:v[0] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[0]
|
||||
//~gfx11! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[1]
|
||||
//~gfx12! v1: %0:v[1] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v0, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1), Operand(reg_v0, v1));
|
||||
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
|
||||
Operand(reg_v2, v1), Operand(reg_v1, v1));
|
||||
|
||||
finish_schedule_vopd_test();
|
||||
}
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue