radeonsi: use si_barrier_{before,after}_internal_op for CP DMA

This makes the CP DMA code simpler and reuses the logic we use for internal
compute shaders.

The only thing the barrier functions can't handle is
"!cp_dma_use_L2 -> SI_CONTEXT_INV_L2": to stay usable everywhere, the barrier
functions have to assume that only compute shader coherency is required, so
the CP DMA code handles that L2 invalidation itself.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
Author: Marek Olšák
Date:   2024-08-22 10:44:38 -04:00
parent 0526ea067d
commit 88eb1be2fc
3 changed files with 51 additions and 59 deletions
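
For quick reference, this is the resulting call pattern in si_cp_dma_clear_buffer,
condensed from the diff below (the clear loop is elided; si_cp_dma_copy_buffer is
analogous, with two barrier buffers). All names are taken from the diff:

   /* The barrier helpers only assume compute-shader coherency, so the CP DMA
    * path still invalidates L2 itself when it doesn't go through L2. */
   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
      sctx->flags |= SI_CONTEXT_INV_L2;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
   }

   /* Describe the destination range once; the barrier helpers handle the rest. */
   struct pipe_shader_buffer barrier_buffer;
   barrier_buffer.buffer = dst;
   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);

   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
   /* ... emit the CP DMA clear packets ... */
   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);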


@@ -57,12 +57,12 @@ static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource
    }
 }

-static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
-                                          unsigned num_buffers,
-                                          const struct pipe_shader_buffer *buffers,
-                                          unsigned writable_buffers_mask,
-                                          unsigned num_images,
-                                          const struct pipe_image_view *images)
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images)
 {
    for (unsigned i = 0; i < num_images; i++) {
       /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
@@ -95,12 +95,12 @@ static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flag
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }

-static void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
-                                         unsigned num_buffers,
-                                         const struct pipe_shader_buffer *buffers,
-                                         unsigned writable_buffers_mask,
-                                         unsigned num_images,
-                                         const struct pipe_image_view *images)
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images)
 {
    if (flags & SI_OP_SYNC_AFTER) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;


@@ -17,7 +17,6 @@
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT (1 << 1)
 #define CP_DMA_CLEAR (1 << 2)
-#define CP_DMA_PFP_SYNC_ME (1 << 3)

 static bool cp_dma_use_L2(struct si_context *sctx)
 {
@@ -96,14 +95,6 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       radeon_emit(command);
    }
    radeon_end();

-   /* CP DMA is executed in ME, but index buffers are read by PFP.
-    * This ensures that ME (CP DMA) is idle before PFP starts fetching
-    * indices. If we wanted to execute CP DMA in PFP, this packet
-    * should precede it.
-    */
-   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME)
-      si_cp_pfp_sync_me(cs);
 }

 void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
@@ -146,7 +137,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * is written to memory.
     */
    if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size)
-      *packet_flags |= CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME;
+      *packet_flags |= CP_DMA_SYNC;
 }

 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
@@ -160,28 +151,23 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
    assert(size && size % 4 == 0);

-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }

-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   struct pipe_shader_buffer barrier_buffer;
+   barrier_buffer.buffer = dst;
+   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
+   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);

-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);

    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR;
@@ -206,9 +192,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       va += byte_count;
    }

-   if (cp_dma_use_L2(sctx))
-      sdst->TC_L2_dirty = true;
+   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);

    sctx->num_cp_dma_calls++;
 }
@@ -262,6 +246,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    assert(size);
    assert(dst && src);

+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
+
+   struct pipe_shader_buffer barrier_buffers[2];
+   barrier_buffers[0].buffer = dst;
+   barrier_buffers[0].buffer_offset = MIN2(dst_offset, UINT32_MAX);
+   barrier_buffers[0].buffer_size = MIN2(size, UINT32_MAX);
+   barrier_buffers[1].buffer = src;
+   barrier_buffers[1].buffer_offset = MIN2(src_offset, UINT32_MAX);
+   barrier_buffers[1].buffer_size = MIN2(size, UINT32_MAX);
+
+   si_barrier_before_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
+
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range.
@@ -305,23 +304,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
       }
    }

-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    /* This is the main part doing the copying. Src is always aligned. */
    uint64_t main_dst_offset = dst_offset + skipped_size;
    uint64_t main_src_offset = src_offset + skipped_size;
@@ -376,9 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (realign_size)
       si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);

-   if (cp_dma_use_L2(sctx))
-      si_resource(dst)->TC_L2_dirty = true;
+   si_barrier_after_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);

    sctx->num_cp_dma_calls++;
 }


@@ -1474,6 +1474,18 @@ void si_destroy_compute(struct si_compute *program);
 #define SI_OP_FAIL_IF_SLOW (1 << 9)
 #define SI_OP_IS_NESTED (1 << 10)

+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images);
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images);
 bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *shader, unsigned flags, unsigned num_buffers,