From 171920ceed59b018cfa637ff5fd022d39aeef105 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 8 May 2025 18:17:41 +0100 Subject: [PATCH] aco/gfx115: consider point sample acceleration Like 15428e0d786939a5c7629a9978947c8a9112ce96 in LLVM. fossil-db (gfx1150): Totals from 909 (1.14% of 79653) affected shaders: Instrs: 5840489 -> 5840705 (+0.00%); split: -0.00%, +0.00% CodeSize: 31133460 -> 31134296 (+0.00%); split: -0.00%, +0.00% Latency: 52982280 -> 53438577 (+0.86%); split: -0.00%, +0.86% InvThroughput: 10841454 -> 10942682 (+0.93%); split: -0.00%, +0.93% Signed-off-by: Rhys Perry Backport-to: 25.0 Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 9 ++- src/amd/compiler/aco_ir.cpp | 18 +++-- .../compiler/tests/test_insert_waitcnt.cpp | 75 +++++++++++++++++++ 3 files changed, 94 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 6916a0579e7..62cbaaadf30 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -287,8 +287,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr) if (vmem_type && ctx.gfx_level < GFX12) { wait_event event = get_vmem_event(ctx, instr, vmem_type); wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1); - if ((it->second.events & ctx.info->events[type]) == event && - (type != wait_type_vm || it->second.vmem_types == vmem_type)) + + bool event_matches = (it->second.events & ctx.info->events[type]) == event; + /* wait_type_vm/counter_vm can have several different vmem_types */ + bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type && + util_bitcount(vmem_type) == 1); + + if (event_matches && type_matches) reg_imm[type] = wait_imm::unset_counter; } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index ba5dc303e34..513e928ed6d 100644 --- a/src/amd/compiler/aco_ir.cpp +++ 
b/src/amd/compiler/aco_ir.cpp @@ -75,6 +75,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info, case GFX10: program->family = CHIP_NAVI10; break; case GFX10_3: program->family = CHIP_NAVI21; break; case GFX11: program->family = CHIP_NAVI31; break; + case GFX11_5: program->family = CHIP_GFX1150; break; case GFX12: program->family = CHIP_GFX1200; break; default: program->family = CHIP_UNKNOWN; break; } @@ -1460,15 +1461,20 @@ uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) { if (instr->opcode == aco_opcode::image_bvh64_intersect_ray || - instr->opcode == aco_opcode::image_bvh8_intersect_ray) + instr->opcode == aco_opcode::image_bvh8_intersect_ray) { return vmem_bvh; - else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) + } else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) { return vmem_sampler; - else if (instr->isMIMG() && !instr->operands[1].isUndefined() && - instr->operands[1].regClass() == s4) - return vmem_sampler; - else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) + } else if (instr->isMIMG() && !instr->operands[1].isUndefined() && + instr->operands[1].regClass() == s4) { + bool point_sample_accel = + gfx_level == GFX11_5 && (instr->opcode == aco_opcode::image_sample || + instr->opcode == aco_opcode::image_sample_l || + instr->opcode == aco_opcode::image_sample_lz); + return vmem_sampler | (point_sample_accel ? 
vmem_nosampler : 0); + } else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) { return vmem_nosampler; + } return 0; } diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index c69772c5d05..8afecdbb879 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -340,6 +340,81 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types) } END_TEST +BEGIN_TEST(insert_waitcnt.waw.point_sample_accel) + if (!setup_cs(NULL, GFX11_5)) + return; + + Definition def_v4(PhysReg(260), v1); + Operand op_v0(PhysReg(256), v1); + Operand desc_s4(PhysReg(0), s4); + Operand desc_s8(PhysReg(8), s8); + + /* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM sample + * instructions. */ + + //>> p_unit_test 0 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 1 + //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 2 + //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + //! s_waitcnt vmcnt(0) + //! 
v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 3 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 4 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); + + //>> p_unit_test 5 + //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 6 + //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + //! 
v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); + bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); + + finish_waitcnt_test(); +END_TEST + BEGIN_TEST(insert_waitcnt.vmem) if (!setup_cs(NULL, GFX12)) return;