amd: add ac_cu_info::has_point_sample_accel flag and use in ACO

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38701>
2026-05-05 18:18:06 +02:00 · 2025-11-27 13:27:00 +01:00 · 2025-11-27 13:27:00 +01:00 · 7b7bdb76ab
commit 7b7bdb76ab
parent cfb745592d
8 changed files with 17 additions and 11 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -304,6 +304,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
   cu_info->has_lds_bank_count_16 = info->family == CHIP_KABINI || info->family == CHIP_STONEY;
   cu_info->has_sram_ecc_enabled = info->family == CHIP_VEGA20 || info->family == CHIP_MI100 ||
                                   info->family == CHIP_MI200 || info->family == CHIP_GFX940;
+   cu_info->has_point_sample_accel = info->family == CHIP_STRIX1 || info->family == CHIP_STRIX_HALO ||
+                                     info->family == CHIP_KRACKAN1;
   cu_info->has_fast_fma32 = info->gfx_level >= GFX9 || info->family == CHIP_TAHITI ||
                             info->family == CHIP_HAWAII || info->family == CHIP_CARRIZO;
   cu_info->has_fma_mix = info->gfx_level >= GFX10 ||
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -47,6 +47,8 @@ struct ac_cu_info {
   /* Flags */
   bool has_lds_bank_count_16 : 1;
   bool has_sram_ecc_enabled : 1;
+   /* Whether image_sample* instructions can be either a sampler or a no-sampler access. */
+   bool has_point_sample_accel : 1;
   bool has_fast_fma32 : 1;
   /* Whether chips support fused v_fma_mix* instructions.
    * Otherwise, unfused v_mad_mix* is available on GFX9.
--- a/src/amd/compiler/aco_form_hard_clauses.cpp
+++ b/src/amd/compiler/aco_form_hard_clauses.cpp
@ -51,7 +51,7 @@ get_type(Program* program, aco_ptr<Instruction>& instr)

   if (program->gfx_level >= GFX11) {
      if (instr->isMIMG()) {
-         uint8_t vmem_type = get_vmem_type(program->gfx_level, program->family, instr.get());
+         uint8_t vmem_type = get_vmem_type(instr.get(), program->dev.has_point_sample_accel);
         switch (vmem_type) {
         case vmem_bvh: return clause_bvh;
         case vmem_sampler: return clause_mimg_sample;
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@ -1687,7 +1687,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
      } else {
         uint8_t vmem_type =
            state.program->gfx_level >= GFX12
-               ? get_vmem_type(state.program->gfx_level, state.program->family, instr.get())
+               ? get_vmem_type(instr.get(), state.program->dev.has_point_sample_accel)
               : vmem_nosampler;
         std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
         if (vmem_type == vmem_sampler)
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@ -408,7 +408,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
          * We can do this for GFX12 and different types for GFX11 if we know that the two
          * VMEM loads do not write the same register half or the same lanes.
          */
-         uint8_t vmem_type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
+         uint8_t vmem_type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
         if (vmem_type) {
            wait_event event = get_vmem_event(ctx, instr, vmem_type);
            wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
@ -834,7 +834,7 @@ gen(Instruction* instr, wait_ctx& ctx)
   case Format::MIMG:
   case Format::GLOBAL:
   case Format::SCRATCH: {
-      uint8_t type = get_vmem_type(ctx.gfx_level, ctx.program->family, instr);
+      uint8_t type = get_vmem_type(instr, ctx.program->dev.has_point_sample_accel);
      wait_event ev = get_vmem_event(ctx, instr, type);
      uint32_t mask = ev == event_vmem ? get_vmem_mask(ctx, instr) : 0;

--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -107,6 +107,7 @@ init_program(Program* program, Stage stage, const struct aco_shader_info* info,
   program->dev.xnack_enabled = false;

   program->dev.sram_ecc_enabled = options->cu_info->has_sram_ecc_enabled;
+   program->dev.has_point_sample_accel = options->cu_info->has_point_sample_accel;

   program->dev.has_fast_fma32 = options->cu_info->has_fast_fma32;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
@ -1470,7 +1471,7 @@ get_tied_defs(Instruction* instr)
 }

 uint8_t
-get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr)
+get_vmem_type(Instruction* instr, bool has_point_sample_accel)
 {
   if (instr->opcode == aco_opcode::image_bvh_intersect_ray ||
       instr->opcode == aco_opcode::image_bvh64_intersect_ray ||
@ -1481,10 +1482,10 @@ get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr)
      return vmem_sampler;
   } else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
              instr->operands[1].regClass() == s4) {
-      bool point_sample_accel = gfx_level == GFX11_5 && family != CHIP_GFX1153 &&
-                                (instr->opcode == aco_opcode::image_sample ||
-                                 instr->opcode == aco_opcode::image_sample_l ||
-                                 instr->opcode == aco_opcode::image_sample_lz);
+      bool point_sample_accel =
+         has_point_sample_accel && (instr->opcode == aco_opcode::image_sample ||
+                                    instr->opcode == aco_opcode::image_sample_l ||
+                                    instr->opcode == aco_opcode::image_sample_lz);
      return vmem_sampler | (point_sample_accel ? vmem_nosampler : 0);
   } else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal()) {
      return vmem_nosampler;
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -2054,7 +2054,7 @@ enum vmem_type : uint8_t {
 /* VMEM instructions of the same type return in-order. For GFX12+, this determines which counter
 * is used.
 */
-uint8_t get_vmem_type(amd_gfx_level gfx_level, radeon_family family, Instruction* instr);
+uint8_t get_vmem_type(Instruction* instr, bool has_point_sample_accel);

 /* For all of the counters, the maximum value means no wait.
 * Some of the counters are larger than their bit field,
@ -2249,6 +2249,7 @@ struct DeviceInfo {
   bool fused_mad_mix = false;
   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
+   bool has_point_sample_accel = false;

   int32_t scratch_global_offset_min;
   int32_t scratch_global_offset_max;
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@ -320,7 +320,7 @@ get_wait_counter_info(Program* program, aco_ptr<Instruction>& instr)
   } else if (instr->isVMEM() && instr->definitions.empty() && program->gfx_level >= GFX10) {
      info[wait_type_vs] = 320;
   } else if (instr->isVMEM()) {
-      uint8_t vm_type = get_vmem_type(program->gfx_level, program->family, instr.get());
+      uint8_t vm_type = get_vmem_type(instr.get(), program->dev.has_point_sample_accel);
      wait_type type = wait_type_vm;
      if (program->gfx_level >= GFX12 && vm_type == vmem_bvh)
         type = wait_type_bvh;