mesa/src/amd/compiler/tests/test_scheduler.cpp
Rhys Perry 4fcf2eb1d7 aco/gfx12: VOPD src0/1 are src bank compatible if they are the same vgpr
fossil-db (gfx1201):
Totals from 66518 (83.80% of 79377) affected shaders:
Instrs: 36939667 -> 36656685 (-0.77%); split: -0.79%, +0.02%
CodeSize: 220575208 -> 220201764 (-0.17%); split: -0.21%, +0.04%
Latency: 258919732 -> 258137974 (-0.30%); split: -0.35%, +0.05%
InvThroughput: 49911351 -> 49643836 (-0.54%); split: -0.55%, +0.02%
VClause: 788661 -> 788430 (-0.03%); split: -0.04%, +0.01%
SClause: 1176416 -> 1176263 (-0.01%); split: -0.02%, +0.01%
VALU: 18014058 -> 17818119 (-1.09%); split: -1.10%, +0.01%
VOPD: 4926983 -> 5122922 (+3.98%); split: +4.01%, -0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34246>
2025-04-17 14:00:29 +00:00

291 lines
14 KiB
C++

/*
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "helpers.h"
using namespace aco;
BEGIN_TEST(vopd_sched.commutative)
if (!setup_cs(NULL, GFX11, CHIP_UNKNOWN, "", 32))
return;
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v2{258};
PhysReg reg_v3{259};
//>> p_unit_test 0
//! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
/* Neither of these opcodes are commutative. */
bld.reset(program->create_and_insert_block());
//>> p_unit_test 1
//! v1: %0:v[0] = v_fmamk_f32 %0:v[2], %0:v[3], 0
//! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
/* We have to change the opcode for subtractions. */
bld.reset(program->create_and_insert_block());
//>> p_unit_test 2
//! v1: %0:v[1] = v_dual_subrev_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
bld.vop2(aco_opcode::v_sub_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 3
//! v1: %0:v[0] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[1] = v_dual_sub_f32 %0:v[3], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop2(aco_opcode::v_subrev_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
/* If we have to move the second instruction into OPY instead of OPX, then swapping must still be
* correct. */
bld.reset(program->create_and_insert_block());
//>> p_unit_test 4
//! v1: %0:v[0] = v_dual_mul_f32 %0:v[3], %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 5
//! v1: %0:v[0] = v_dual_mul_f32 %0:v[3], %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
finish_schedule_vopd_test();
END_TEST
BEGIN_TEST(vopd_sched.mov_to_add_bfrev)
for (amd_gfx_level gfx : {GFX11, GFX12}) {
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
continue;
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v2{258};
PhysReg reg_v3{259};
//>> p_unit_test 0
//~gfx11! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v1, v1), Operand(reg_v2, v1));
/* We can't turn the v_mov_b32 into a v_add_u32 because then both instructions would be
* OPY-only.
*/
bld.reset(program->create_and_insert_block());
//>> p_unit_test 1
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 2
//~gfx11! v1: %0:v[1] = v_lshlrev_b32 %0:v[2], %0:v[3]
//~gfx11! v1: %0:v[0] = v_mov_b32 %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_lshlrev_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 3
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 4
//~gfx11! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[3], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_and_b32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_and_b32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
/* The v_add_u32 should be OPY, not OPX. */
bld.reset(program->create_and_insert_block());
//>> p_unit_test 5
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.reset(program->create_and_insert_block());
//>> p_unit_test 6
//~gfx11! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_add_nc_u32 0, %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_fmamk_f32 %0:v[2], %0:v[3], 0 :: v1: %0:v[0] = v_dual_mov_b32 %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v0, v1), Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1), Operand::zero());
//>> p_unit_test 7
//! v1: %0:v[1] = v_dual_mov_b32 %0:v[2] :: v1: %0:v[0] = v_dual_mov_b32 0x3c000000
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop1(aco_opcode::v_bfrev_b32, Definition(reg_v0, v1), Operand::c32(60));
bld.vop1(aco_opcode::v_mov_b32, Definition(reg_v1, v1), Operand(reg_v2, v1));
finish_schedule_vopd_test();
}
END_TEST
BEGIN_TEST(vopd_sched.war)
for (amd_gfx_level gfx : {GFX11, GFX12}) {
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
continue;
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v3{259};
//>> p_unit_test 0
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:v[1], %0:v[3] :: v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(reg_v1, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
Operand(reg_v1, v1));
/* We can't use OPX for the v_mul_f32 because of the WaR, but we also can't use OPX for the
* v_add_u32 because that opcode is OPY-only. */
//>> p_unit_test 1
//~gfx11! v1: %0:v[1] = v_dual_mul_f32 %0:v[3], %0:v[1] :: v1: %0:v[0] = v_dual_add_nc_u32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[0] = v_add_u32 %0:v[1], %0:v[3]
//~gfx12! v1: %0:v[1] = v_mul_f32 %0:v[3], %0:v[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2(aco_opcode::v_add_u32, Definition(reg_v0, v1), Operand(reg_v1, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(reg_v3, v1),
Operand(reg_v1, v1));
finish_schedule_vopd_test();
}
END_TEST
BEGIN_TEST(vopd_sched.same_vgpr)
for (amd_gfx_level gfx : {GFX11, GFX12}) {
if (!setup_cs(NULL, gfx, CHIP_UNKNOWN, "", 32))
continue;
PhysReg reg_v0{256};
PhysReg reg_v1{257};
PhysReg reg_v2{258};
PhysReg reg_v3{259};
//>> p_unit_test 0
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v0, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v1, v1));
//>> p_unit_test 1
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[1], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[0], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
Operand(reg_v2, v1));
//>> p_unit_test 2
//~gfx11! v1: %0:v[1] = v_dual_add_f32 %0:v[3], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[3] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[3]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v3, v1));
//>> p_unit_test 3
//~gfx11! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
//~gfx11! v1: %0:v[1] = v_add_f32 %0:v[2], %0:v[2]
//~gfx12! v1: %0:v[1] = v_dual_add_f32 %0:v[2], %0:v[2] :: v1: %0:v[0] = v_dual_add_f32 %0:v[2], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
/* the sources can't be swapped because src0 is an SGPR */
//>> p_unit_test 4
//~gfx11! v1: %0:v[1] = v_mul_f32 %0:s[1], %0:v[2]
//~gfx11! v1: %0:v[0] = v_mul_f32 %0:s[2], %0:v[2]
//~gfx12! v1: %0:v[0] = v_dual_mul_f32 %0:s[2], %0:v[2] :: v1: %0:v[1] = v_dual_mul_f32 %0:s[1], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v1, v1), Operand(PhysReg(1), s1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_mul_f32, Definition(reg_v0, v1), Operand(PhysReg(2), s1),
Operand(reg_v2, v1));
/* fmamk uses src2 for the second source, which doesn't allow the same VGPR */
//>> p_unit_test 5
//! v1: %0:v[0] = v_fmamk_f32 %0:v[0], %0:v[2], 0x80
//! v1: %0:v[1] = v_fmamk_f32 %0:v[1], %0:v[2], 0x80
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v0, v1), Operand(reg_v0, v1),
Operand(reg_v2, v1), Operand::literal32(128));
bld.vop2(aco_opcode::v_fmamk_f32, Definition(reg_v1, v1), Operand(reg_v1, v1),
Operand(reg_v2, v1), Operand::literal32(128));
/* the two sources have to be the same size */
//>> p_unit_test 6
//! v1: %0:v[0] = v_add_f32 %0:v[2], %0:v[2]
//! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop2(aco_opcode::v_add_f32, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1));
//>> p_unit_test 7
//~gfx11! v1: %0:v[0] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[0]
//~gfx11! v1: %0:v[1] = v_dot2c_f32_f16 %0:v[2], %0:v[2], %0:v[1]
//~gfx12! v1: %0:v[1] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[1] :: v1: %0:v[0] = v_dual_dot2acc_f32_f16 %0:v[2], %0:v[2], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v0, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1), Operand(reg_v0, v1));
bld.vop2(aco_opcode::v_dot2c_f32_f16, Definition(reg_v1, v1), Operand(reg_v2, v1),
Operand(reg_v2, v1), Operand(reg_v1, v1));
finish_schedule_vopd_test();
}
END_TEST