radeonsi: add si_cp_acquire_mem helper and clean up its usage for gfx6-9

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
2026-05-05 22:38:05 +02:00 · 2024-08-06 23:41:17 -04:00 · 2024-08-06 23:41:17 -04:00 · a42d9db1b6
commit a42d9db1b6
parent 1d5ffb13d6
3 changed files with 98 additions and 71 deletions
--- a/src/gallium/drivers/radeonsi/si_cp_utils.c
+++ b/src/gallium/drivers/radeonsi/si_cp_utils.c
@ -131,3 +131,56 @@ void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf
   si_cp_release_mem_pws(sctx, cs, event_type, gcr_cntl);
   si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags);
 }
+
+/* Flush caches with ACQUIRE_MEM or SURFACE_SYNC; this optionally waits for idle on older chips.
+ * "engine" (V_580_CP_PFP or V_580_CP_ME) selects PFP or ME sync; "gcr_cntl" must be non-zero.
+ */
+void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
+                       unsigned engine)
+{
+   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
+   assert(gcr_cntl);
+
+   if (sctx->gfx_level >= GFX10) {
+      /* TODO: GFX10+ is not implemented yet; no packet is emitted on this path. */
+   } else {
+      bool compute_ib = !sctx->has_graphics;
+
+      /* Setting bit 31 seems problematic with GFX7 (see #4764), so it is skipped there. */
+      if (sctx->gfx_level != GFX7)
+         gcr_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
+
+      if (sctx->gfx_level == GFX9 || compute_ib) {
+         /* GFX9, or no graphics: flush caches and wait for the caches to assert idle. */
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+         radeon_emit(gcr_cntl);      /* CP_COHER_CNTL */
+         radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
+         radeon_emit(0xffffff);      /* CP_COHER_SIZE_HI */
+         radeon_emit(0);             /* CP_COHER_BASE */
+         radeon_emit(0);             /* CP_COHER_BASE_HI */
+         radeon_emit(0x0000000A);    /* POLL_INTERVAL */
+         radeon_end();
+      } else {
+         /* ACQUIRE_MEM is only required on the compute ring, so SURFACE_SYNC is used here. */
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
+         radeon_emit(gcr_cntl);      /* CP_COHER_CNTL */
+         radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
+         radeon_emit(0);             /* CP_COHER_BASE */
+         radeon_emit(0x0000000A);    /* POLL_INTERVAL */
+         radeon_end();
+      }
+
+      /* ACQUIRE_MEM & SURFACE_SYNC roll the context if the current context is busy. */
+      if (!compute_ib)
+         sctx->context_roll = true;
+
+      if (engine == V_580_CP_PFP) { /* the caller requested the sync in PFP */
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+         radeon_emit(0);
+         radeon_end();
+      }
+   }
+}
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@ -684,43 +684,6 @@ void si_emit_ts(struct si_context *sctx, struct si_resource* buffer, unsigned in
                        EOP_DATA_SEL_TIMESTAMP, buffer, va, 0, PIPE_QUERY_TIMESTAMP);
 }

-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
-{
-   bool compute_ib = !sctx->has_graphics;
-
-   assert(sctx->gfx_level <= GFX9);
-
-   /* This seems problematic with GFX7 (see #4764) */
-   if (sctx->gfx_level != GFX7)
-      cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
-
-   radeon_begin(cs);
-
-   if (sctx->gfx_level == GFX9 || compute_ib) {
-      /* Flush caches and wait for the caches to assert idle. */
-      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
-      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
-      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
-      radeon_emit(0xffffff);      /* CP_COHER_SIZE_HI */
-      radeon_emit(0);             /* CP_COHER_BASE */
-      radeon_emit(0);             /* CP_COHER_BASE_HI */
-      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
-   } else {
-      /* ACQUIRE_MEM is only required on a compute ring. */
-      radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
-      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
-      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
-      radeon_emit(0);             /* CP_COHER_BASE */
-      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
-   }
-   radeon_end();
-
-   /* ACQUIRE_MEM has an implicit context roll if the current context
-    * is busy. */
-   if (!compute_ib)
-      sctx->context_roll = true;
-}
-
 static struct si_resource *si_get_wait_mem_scratch_bo(struct si_context *ctx,
                                                      struct radeon_cmdbuf *cs, bool is_secure)
 {
@ -1135,27 +1098,24 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
      }
   }

-   /* GFX6-GFX8 only:
-    *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
-    *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
+   /* GFX6-GFX8 only: When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC waits
+    * for idle, so it should be last.
    *
-    * cp_coher_cntl should contain all necessary flags except TC and PFP flags
-    * at this point.
+    * cp_coher_cntl should contain everything except TC flags at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
-   if (flags & SI_CONTEXT_INV_L2 || (sctx->gfx_level <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
-      /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
-       * WB must be set on GFX8+ when TC_ACTION is set.
-       */
-      si_emit_surface_sync(sctx, cs,
-                           cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
-                              S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8));
-      cp_coher_cntl = 0;
+   unsigned engine = flags & SI_CONTEXT_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME;
+
+   if (flags & SI_CONTEXT_INV_L2 || (sctx->gfx_level <= GFX7 && flags & SI_CONTEXT_WB_L2)) {
+      /* Invalidate L1 & L2. WB must be set on GFX8+ when TC_ACTION is set. */
+      si_cp_acquire_mem(sctx, cs,
+                        cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+                        S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8), engine);
      sctx->num_L2_invalidates++;
   } else {
-      /* L1 invalidation and L2 writeback must be done separately,
-       * because both operations can't be done together.
+      /* L1 invalidation and L2 writeback must be done separately, because both operations can't
+       * be done together.
       */
      if (flags & SI_CONTEXT_WB_L2) {
         /* WB = write-back
@ -1163,29 +1123,43 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
+          *
+          * If we get here, the only flag that can't be executed together with WB_L2 is VMEM cache
+          * invalidation.
          */
-         si_emit_surface_sync(
-            sctx, cs,
-            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
+         bool last_acquire_mem = !(flags & SI_CONTEXT_INV_VCACHE);
+
+         si_cp_acquire_mem(sctx, cs,
+                           cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) |
+                           S_0301F0_TC_NC_ACTION_ENA(1),
+                           /* If this is not the last ACQUIRE_MEM, flush in ME.
+                            * We only want to synchronize with PFP in the last ACQUIRE_MEM. */
+                           last_acquire_mem ? engine : V_580_CP_ME);
+
+         if (last_acquire_mem)
+            flags &= ~SI_CONTEXT_PFP_SYNC_ME;
         cp_coher_cntl = 0;
         sctx->num_L2_writebacks++;
      }
-      if (flags & SI_CONTEXT_INV_VCACHE) {
-         /* Invalidate per-CU VMEM L1. */
-         si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
-         cp_coher_cntl = 0;
+
+      if (flags & SI_CONTEXT_INV_VCACHE)
+         cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
+
+      /* If there are still some cache flags left... */
+      if (cp_coher_cntl) {
+         si_cp_acquire_mem(sctx, cs, cp_coher_cntl, engine);
+         flags &= ~SI_CONTEXT_PFP_SYNC_ME;
      }
-   }

-   /* If TC flushes haven't cleared this... */
-   if (cp_coher_cntl)
-      si_emit_surface_sync(sctx, cs, cp_coher_cntl);
-
-   if (flags & SI_CONTEXT_PFP_SYNC_ME) {
-      radeon_begin(cs);
-      radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-      radeon_emit(0);
-      radeon_end();
+      /* This might be needed even without any cache flags, such as when doing buffer stores
+       * to an index buffer.
+       */
+      if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+         radeon_emit(0);
+         radeon_end();
+      }
   }

   if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1568,6 +1568,8 @@ void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
 void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
                                   unsigned event_type, unsigned gcr_cntl, unsigned stage_sel,
                                   unsigned sqtt_flush_flags);
+void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
+                       unsigned engine);

 /* si_debug.c */
 void si_gather_context_rolls(struct si_context *sctx);
@ -1610,8 +1612,6 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
 void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
 void si_trace_emit(struct si_context *sctx);
 void si_emit_ts(struct si_context *sctx, struct si_resource* buffer, unsigned int offset);
-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
-                          unsigned cp_coher_cntl);
 void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
 void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
 /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement