From 88eb1be2fcf30dca7d0fa50a53e0bea3bc7ee656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Thu, 22 Aug 2024 10:44:38 -0400
Subject: [PATCH] radeonsi: use si_barrier_{before,after}_internal_op for CP DMA

This makes the CP DMA code simpler and reuses the logic we use for
internal compute shaders.

The only thing that can't be handled in the barrier functions is
"!cp_dma_use_L2 -> SI_CONTEXT_INV_L2", because the barrier functions
should assume that only compute shader coherency is required (so that
they stay usable everywhere); the CP DMA code has to deal with that
itself. (The resulting call pattern is sketched after the diff.)

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 .../drivers/radeonsi/si_compute_blit.c   | 24 +++---
 src/gallium/drivers/radeonsi/si_cp_dma.c | 74 +++++++------------
 src/gallium/drivers/radeonsi/si_pipe.h   | 12 +++
 3 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 83d6a9e212a..6146592444e 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -57,12 +57,12 @@ static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource
    }
 }
 
-static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
-                                          unsigned num_buffers,
-                                          const struct pipe_shader_buffer *buffers,
-                                          unsigned writable_buffers_mask,
-                                          unsigned num_images,
-                                          const struct pipe_image_view *images)
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images)
 {
    for (unsigned i = 0; i < num_images; i++) {
       /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
@@ -95,12 +95,12 @@ static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flag
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
-static void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
-                                         unsigned num_buffers,
-                                         const struct pipe_shader_buffer *buffers,
-                                         unsigned writable_buffers_mask,
-                                         unsigned num_images,
-                                         const struct pipe_image_view *images)
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images)
 {
    if (flags & SI_OP_SYNC_AFTER) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index ff1a09babfc..60331bb20ec 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -17,7 +17,6 @@
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT (1 << 1)
 #define CP_DMA_CLEAR (1 << 2)
-#define CP_DMA_PFP_SYNC_ME (1 << 3)
 
 static bool cp_dma_use_L2(struct si_context *sctx)
 {
@@ -96,14 +95,6 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       radeon_emit(command);
    }
    radeon_end();
-
-   /* CP DMA is executed in ME, but index buffers are read by PFP.
-    * This ensures that ME (CP DMA) is idle before PFP starts fetching
-    * indices. If we wanted to execute CP DMA in PFP, this packet
-    * should precede it.
-    */
-   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME)
-      si_cp_pfp_sync_me(cs);
 }
 
 void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
@@ -146,7 +137,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * is written to memory.
     */
    if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size)
-      *packet_flags |= CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME;
+      *packet_flags |= CP_DMA_SYNC;
 }
 
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
@@ -160,28 +151,23 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
    assert(size && size % 4 == 0);
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   struct pipe_shader_buffer barrier_buffer;
+   barrier_buffer.buffer = dst;
+   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
+   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);
 
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
 
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
 
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR;
@@ -206,9 +192,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       va += byte_count;
    }
 
-   if (cp_dma_use_L2(sctx))
-      sdst->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
@@ -262,6 +246,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    assert(size);
    assert(dst && src);
 
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
+
+   struct pipe_shader_buffer barrier_buffers[2];
+   barrier_buffers[0].buffer = dst;
+   barrier_buffers[0].buffer_offset = MIN2(dst_offset, UINT32_MAX);
+   barrier_buffers[0].buffer_size = MIN2(size, UINT32_MAX);
+   barrier_buffers[1].buffer = src;
+   barrier_buffers[1].buffer_offset = MIN2(src_offset, UINT32_MAX);
+   barrier_buffers[1].buffer_size = MIN2(size, UINT32_MAX);
+
+   si_barrier_before_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
+
    /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
@@ -305,23 +304,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
       }
    }
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    /* This is the main part doing the copying. Src is always aligned. */
    uint64_t main_dst_offset = dst_offset + skipped_size;
    uint64_t main_src_offset = src_offset + skipped_size;
@@ -376,9 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (realign_size)
       si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);
 
-   if (cp_dma_use_L2(sctx))
-      si_resource(dst)->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 859f9ffd4f3..8b44fabc85f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1474,6 +1474,18 @@ void si_destroy_compute(struct si_compute *program);
 #define SI_OP_FAIL_IF_SLOW (1 << 9)
 #define SI_OP_IS_NESTED (1 << 10)
 
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images);
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images);
 bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *shader, unsigned flags, unsigned num_buffers,
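
For reference, the call pattern that the hunks above establish in
si_cp_dma_clear_buffer looks roughly like this (a simplified sketch; the CP DMA
packet loop, flag handling, and per-chip details are omitted):

   /* CP DMA may bypass L2, and the barrier helpers only assume compute-shader
    * coherency, so the caller still invalidates L2 itself when needed. */
   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
      sctx->flags |= SI_CONTEXT_INV_L2;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
   }

   /* Describe the destination range so the barrier helpers know what to sync. */
   struct pipe_shader_buffer barrier_buffer = {
      .buffer = dst,
      .buffer_offset = MIN2(offset, UINT32_MAX),
      .buffer_size = MIN2(size, UINT32_MAX),
   };

   /* Wait and flush as requested by user_flags; the destination buffer is
    * writable (mask 0x1), and there are no images. */
   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);

   /* ... emit the CP DMA clear packets ... */

   /* Make the CP DMA write visible to whatever consumes the buffer next. */
   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);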