aco/gfx115: consider point sample acceleration

Like 15428e0d786939a5c7629a9978947c8a9112ce96 in LLVM.

fossil-db (gfx1150):
Totals from 909 (1.14% of 79653) affected shaders:
Instrs: 5840489 -> 5840705 (+0.00%); split: -0.00%, +0.00%
CodeSize: 31133460 -> 31134296 (+0.00%); split: -0.00%, +0.00%
Latency: 52982280 -> 53438577 (+0.86%); split: -0.00%, +0.86%
InvThroughput: 10841454 -> 10942682 (+0.93%); split: -0.00%, +0.93%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Backport-to: 25.0
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34935>
This commit is contained in:
Rhys Perry 2025-05-08 18:17:41 +01:00 committed by Marge Bot
parent cbd85acf9a
commit 171920ceed
3 changed files with 94 additions and 8 deletions

View file

@ -287,8 +287,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
if (vmem_type && ctx.gfx_level < GFX12) {
wait_event event = get_vmem_event(ctx, instr, vmem_type);
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
if ((it->second.events & ctx.info->events[type]) == event &&
(type != wait_type_vm || it->second.vmem_types == vmem_type))
bool event_matches = (it->second.events & ctx.info->events[type]) == event;
/* wait_type_vm/counter_vm can have several different vmem_types */
bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
util_bitcount(vmem_type) == 1);
if (event_matches && type_matches)
reg_imm[type] = wait_imm::unset_counter;
}

View file

@ -75,6 +75,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
case GFX10: program->family = CHIP_NAVI10; break;
case GFX10_3: program->family = CHIP_NAVI21; break;
case GFX11: program->family = CHIP_NAVI31; break;
case GFX11_5: program->family = CHIP_GFX1150; break;
case GFX12: program->family = CHIP_GFX1200; break;
default: program->family = CHIP_UNKNOWN; break;
}
@ -1460,15 +1461,20 @@ uint8_t
get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
{
if (instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
instr->opcode == aco_opcode::image_bvh8_intersect_ray)
instr->opcode == aco_opcode::image_bvh8_intersect_ray) {
return vmem_bvh;
else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
} else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) {
return vmem_sampler;
else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4)
return vmem_sampler;
else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
} else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4) {
bool point_sample_accel =
gfx_level == GFX11_5 && (instr->opcode == aco_opcode::image_sample ||
instr->opcode == aco_opcode::image_sample_l ||
instr->opcode == aco_opcode::image_sample_lz);
return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
} else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
return vmem_nosampler;
}
return 0;
}

View file

@ -340,6 +340,81 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
}
END_TEST
BEGIN_TEST(insert_waitcnt.waw.point_sample_accel)
   /* Write-after-write tests on GFX11.5: each sub-case issues two MIMG
    * instructions that both write v[4] and checks whether an
    * s_waitcnt vmcnt(0) is expected between them. */
   if (!setup_cs(NULL, GFX11_5))
      return;

   Definition def_v4(PhysReg(260), v1);
   Operand op_v0(PhysReg(256), v1);
   Operand desc_s4(PhysReg(0), s4);
   Operand desc_s8(PhysReg(8), s8);

   /* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM sample
    * instructions. */

   /* Case 0: image_sample followed by image_sample -> wait expected. */
   //>> p_unit_test 0
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   //! s_waitcnt vmcnt(0)
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);

   /* Case 1: image_sample_b followed by image_sample -> wait expected. */
   //>> p_unit_test 1
   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   //! s_waitcnt vmcnt(0)
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);

   /* Case 2: image_load followed by image_sample -> wait expected. */
   //>> p_unit_test 2
   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
   //! s_waitcnt vmcnt(0)
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);

   /* Case 3: image_sample followed by image_sample_b -> wait expected. */
   //>> p_unit_test 3
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   //! s_waitcnt vmcnt(0)
   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);

   /* Case 4: image_sample followed by image_load -> wait expected. */
   //>> p_unit_test 4
   //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   //! s_waitcnt vmcnt(0)
   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
   bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);

   /* Case 5: two image_sample_b in a row -> no wait expected. */
   //>> p_unit_test 5
   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);

   /* Case 6: two image_load in a row -> no wait expected. Uses its own label
    * so the check pattern cannot be confused with case 5. */
   //>> p_unit_test 6
   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
   //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
   bld.reset(program->create_and_insert_block());
   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);
   bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0);

   finish_waitcnt_test();
END_TEST
BEGIN_TEST(insert_waitcnt.vmem)
if (!setup_cs(NULL, GFX12))
return;