aco: don't consider gfx1153 to have point sample acceleration
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
This commit is contained in:
Rhys Perry 2025-05-14 15:17:21 +01:00
parent f10b49781d
commit 86ccceb4de
7 changed files with 86 additions and 84 deletions

View file

@@ -51,7 +51,7 @@ get_type(Program* program, aco_ptr<Instruction>& instr)
if (program->gfx_level >= GFX11) {
if (instr->isMIMG()) {
uint8_t vmem_type = get_vmem_type(program->gfx_level, instr.get());
uint8_t vmem_type = get_vmem_type(program->gfx_level, program->family, instr.get());
switch (vmem_type) {
case vmem_bvh: return clause_bvh;
case vmem_sampler: return clause_mimg_sample;

View file

@@ -1628,9 +1628,10 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
} else {
uint8_t vmem_type = state.program->gfx_level >= GFX12
? get_vmem_type(state.program->gfx_level, instr.get())
: vmem_nosampler;
uint8_t vmem_type =
state.program->gfx_level >= GFX12
? get_vmem_type(state.program->gfx_level, state.program->family, instr.get())
: vmem_nosampler;
std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
if (vmem_type == vmem_sampler)
vgprs = &ctx.vgpr_used_by_vmem_sample;

View file

@@ -366,7 +366,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
* We can do this for GFX12 and different types for GFX11 if we know that the two
* VMEM loads do not write the same register half or the same lanes.
*/
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
if (vmem_type) {
wait_event event = get_vmem_event(ctx, instr, vmem_type);
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
@@ -740,7 +740,7 @@ gen(Instruction* instr, wait_ctx& ctx)
case Format::MIMG:
case Format::GLOBAL:
case Format::SCRATCH: {
uint8_t type = get_vmem_type(ctx.gfx_level, instr);
uint8_t type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
wait_event ev = get_vmem_event(ctx, instr, type);
uint32_t mask = ev == event_vmem ? get_vmem_mask(ctx, instr) : 0;

View file

@@ -1452,7 +1452,7 @@ get_tied_defs(Instruction* instr)
}
uint8_t
get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr)
{
if (instr->opcode == aco_opcode::image_bvh_intersect_ray ||
instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
@@ -1463,10 +1463,10 @@ get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
return vmem_sampler;
} else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4) {
bool point_sample_accel =
gfx_level == GFX11_5 && (instr->opcode == aco_opcode::image_sample ||
instr->opcode == aco_opcode::image_sample_l ||
instr->opcode == aco_opcode::image_sample_lz);
bool point_sample_accel = gfx_level == GFX11_5 && family != CHIP_GFX1153 &&
(instr->opcode == aco_opcode::image_sample ||
instr->opcode == aco_opcode::image_sample_l ||
instr->opcode == aco_opcode::image_sample_lz);
return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
} else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
return vmem_nosampler;

View file

@@ -1916,7 +1916,7 @@ enum vmem_type : uint8_t {
/* VMEM instructions of the same type return in-order. For GFX12+, this determines which counter
* is used.
*/
uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr);
uint8_t get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr);
/* For all of the counters, the maximum value means no wait.
* Some of the counters are larger than their bit field,

View file

@@ -263,7 +263,7 @@ BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
}
static std::array<unsigned, wait_type_num>
get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
get_wait_counter_info(Program* program, aco_ptr<Instruction>& instr)
{
/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
* depends a lot on the situation. */
@@ -276,12 +276,12 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
info[wait_type_exp] = 13;
} else if (instr->isFlatLike()) {
info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
if (!instr->definitions.empty() || gfx_level < GFX10)
if (!instr->definitions.empty() || program->gfx_level < GFX10)
info[wait_type_vm] = 320;
else
info[wait_type_vs] = 320;
} else if (instr->isSMEM()) {
wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
wait_type type = program->gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
if (instr->definitions.empty()) {
info[type] = 200;
} else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
@@ -299,14 +299,14 @@ get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
}
} else if (instr->isDS()) {
info[wait_type_lgkm] = 20;
} else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) {
} else if (instr->isVMEM() && instr->definitions.empty() && program->gfx_level >= GFX10) {
info[wait_type_vs] = 320;
} else if (instr->isVMEM()) {
uint8_t vm_type = get_vmem_type(gfx_level, instr.get());
uint8_t vm_type = get_vmem_type(program->gfx_level, program->family, instr.get());
wait_type type = wait_type_vm;
if (gfx_level >= GFX12 && vm_type == vmem_bvh)
if (program->gfx_level >= GFX12 && vm_type == vmem_bvh)
type = wait_type_bvh;
else if (gfx_level >= GFX12 && vm_type == vmem_sampler)
else if (program->gfx_level >= GFX12 && vm_type == vmem_sampler)
type = wait_type_sample;
info[type] = 320;
}
@@ -328,8 +328,7 @@ get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
imm.exp = wait_imm::unset_counter;
} else {
/* If an instruction increases a counter, it waits for it to be below maximum first. */
std::array<unsigned, wait_type_num> wait_info =
get_wait_counter_info(program->gfx_level, instr);
std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
wait_imm max = wait_imm::max(program->gfx_level);
for (unsigned i = 0; i < wait_type_num; i++) {
if (wait_info[i])
@@ -418,7 +417,7 @@ BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
mem_ops[i].pop_front();
}
std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program->gfx_level, instr);
std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program, instr);
for (unsigned i = 0; i < wait_type_num; i++) {
if (wait_info[i])
mem_ops[i].push_back(cur_cycle + wait_info[i]);

View file

@@ -341,78 +341,80 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
END_TEST
BEGIN_TEST(insert_waitcnt.waw.point_sample_accel)
if (!setup_cs(NULL, GFX11_5))
return;
for (radeon_family family : {CHIP_GFX1150, CHIP_GFX1153}) {
if (!setup_cs(NULL, GFX11_5, family, family == CHIP_GFX1153 ? "_3" : "_0"))
continue;
Definition def_v4(PhysReg(260), v1);
Operand op_v0(PhysReg(256), v1);
Operand desc_s4(PhysReg(0), s4);
Operand desc_s8(PhysReg(8), s8);
Definition def_v4(PhysReg(260), v1);
Operand op_v0(PhysReg(256), v1);
Operand desc_s4(PhysReg(0), s4);
Operand desc_s8(PhysReg(8), s8);
/* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM sample
* instructions. */
/* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM
* sample instructions. */
//>> p_unit_test 0
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 0
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//~gfx11_5_0! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 1
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 1
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//~gfx11_5_0! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 2
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 2
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 3
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 3
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//~gfx11_5_0! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 4
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
//>> p_unit_test 4
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
//>> p_unit_test 5
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 5
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
//! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
//>> p_unit_test 5
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
//>> p_unit_test 5
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
//! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
finish_waitcnt_test();
finish_waitcnt_test();
}
END_TEST
BEGIN_TEST(insert_waitcnt.vmem)