ac/llvm: use new s_wait instructions and split the existing ones for gfx12

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29007>
This commit is contained in:
Marek Olšák 2023-04-27 03:34:01 -04:00
parent 12bca6123a
commit a6c46509cc
5 changed files with 66 additions and 46 deletions

View file

@@ -2066,44 +2066,60 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
if (!wait_flags)
return;
unsigned expcnt = 7;
unsigned lgkmcnt = 63;
unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
unsigned vscnt = 63;
if (ctx->gfx_level >= GFX12) {
if (wait_flags & AC_WAIT_DS)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_KM)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_EXP)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_LOAD)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_STORE)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_SAMPLE)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
if (wait_flags & AC_WAIT_BVH)
ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
} else {
unsigned expcnt = 7;
unsigned lgkmcnt = 63;
unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
unsigned vscnt = 63;
if (wait_flags & AC_WAIT_EXP)
expcnt = 0;
if (wait_flags & AC_WAIT_LGKM)
lgkmcnt = 0;
if (wait_flags & AC_WAIT_VLOAD)
vmcnt = 0;
if (wait_flags & AC_WAIT_VSTORE) {
if (ctx->gfx_level >= GFX10)
vscnt = 0;
else
if (wait_flags & AC_WAIT_EXP)
expcnt = 0;
if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
lgkmcnt = 0;
if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
vmcnt = 0;
if (wait_flags & AC_WAIT_STORE) {
if (ctx->gfx_level >= GFX10)
vscnt = 0;
else
vmcnt = 0;
}
/* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
if (vscnt == 0) {
assert(!(wait_flags & AC_WAIT_EXP));
LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
return;
}
unsigned simm16;
if (ctx->gfx_level >= GFX11)
simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
else
simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
LLVMValueRef args[1] = {
LLVMConstInt(ctx->i32, simm16, false),
};
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
/* There is no intrinsic for vscnt(0), so use a fence. */
if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
vscnt == 0) {
assert(!(wait_flags & AC_WAIT_EXP));
LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
return;
}
unsigned simm16;
if (ctx->gfx_level >= GFX11)
simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
else
simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
LLVMValueRef args[1] = {
LLVMConstInt(ctx->i32, simm16, false),
};
ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,

View file

@@ -30,10 +30,14 @@ enum
AC_ADDR_SPACE_CONST_32BIT = 6, /* same as CONST, but the pointer type has 32 bits */
};
#define AC_WAIT_LGKM (1 << 0) /* LDS, GDS, constant, message */
#define AC_WAIT_VLOAD (1 << 1) /* VMEM load/sample instructions */
#define AC_WAIT_VSTORE (1 << 2) /* VMEM store instructions */
#define AC_WAIT_EXP (1 << 3) /* EXP instructions */
/* Fine-grained wait flags for GFX12, older chips merge them. */
#define AC_WAIT_DS (1 << 0) /* s_wait_dscnt (GFX12) or lgkmcnt (GFX6-11): LDS (all gens), GDS (GFX6-11) */
#define AC_WAIT_KM (1 << 1) /* s_wait_kmcnt (GFX12) or lgkmcnt (GFX6-11): SMEM, message */
#define AC_WAIT_EXP (1 << 2) /* s_wait_expcnt: Exports */
#define AC_WAIT_LOAD (1 << 3) /* s_wait_loadcnt (GFX12) or vmcnt (GFX6-11): VMEM loads */
#define AC_WAIT_STORE (1 << 4) /* s_wait_storecnt (GFX12) or vscnt (GFX10-11) or vmcnt (GFX6-9): VMEM stores */
#define AC_WAIT_SAMPLE (1 << 5) /* s_wait_samplecnt (GFX12) or vmcnt (GFX6-11): VMEM sample/gather */
#define AC_WAIT_BVH (1 << 6) /* s_wait_bvhcnt (GFX12) or vmcnt (GFX6-11): BVH */
struct ac_llvm_flow;
struct ac_llvm_compiler;

View file

@@ -3208,9 +3208,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
unsigned wait_flags = 0;
if (modes & (nir_var_mem_global | nir_var_mem_ssbo | nir_var_image))
wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
wait_flags |= AC_WAIT_LOAD | AC_WAIT_STORE;
if (modes & nir_var_mem_shared)
wait_flags |= AC_WAIT_LGKM;
wait_flags |= AC_WAIT_DS;
if (wait_flags)
ac_build_waitcnt(&ctx->ac, wait_flags);
@@ -3624,7 +3624,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
/* Set release=0 to start a GDS mutex. Set done=0 because it's not the last one. */
ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32,
args, ARRAY_SIZE(args), 0);
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
LLVMValueRef global_count[4];
LLVMValueRef count_vec = get_src(ctx, instr->src[1]);
@@ -3644,7 +3644,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
}
}
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
/* Set release=1 to end a GDS mutex. Set done=1 because it's the last one. */
args[6] = args[7] = ctx->ac.i1true;

View file

@@ -351,7 +351,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, const struct radv_nir
* and contains a barrier, it will wait there and then
* reach s_endpgm.
*/
ac_build_waitcnt(&ctx.ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx.ac, AC_WAIT_DS);
ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
}

View file

@@ -704,7 +704,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
if (!shader->key.ge.opt.same_patch_vertices ||
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
/* If both input and output patches are wholly in one wave, we don't need a barrier.
* That's true when both VS and TCS have the same number of patch vertices and
@@ -715,7 +715,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
} else if (ctx->stage == MESA_SHADER_GEOMETRY) {
ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM);
ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
}