diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 83d6a9e212a..6146592444e 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -57,12 +57,12 @@ static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource
    }
 }
 
-static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
-                                          unsigned num_buffers,
-                                          const struct pipe_shader_buffer *buffers,
-                                          unsigned writable_buffers_mask,
-                                          unsigned num_images,
-                                          const struct pipe_image_view *images)
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images)
 {
    for (unsigned i = 0; i < num_images; i++) {
       /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
@@ -95,12 +95,12 @@ static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flag
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
-static void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
-                                         unsigned num_buffers,
-                                         const struct pipe_shader_buffer *buffers,
-                                         unsigned writable_buffers_mask,
-                                         unsigned num_images,
-                                         const struct pipe_image_view *images)
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images)
 {
    if (flags & SI_OP_SYNC_AFTER) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index ff1a09babfc..60331bb20ec 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -17,7 +17,6 @@
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT (1 << 1)
 #define CP_DMA_CLEAR (1 << 2)
-#define CP_DMA_PFP_SYNC_ME (1 << 3)
 
 static bool cp_dma_use_L2(struct si_context *sctx)
 {
@@ -96,14 +95,6 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       radeon_emit(command);
    }
    radeon_end();
-
-   /* CP DMA is executed in ME, but index buffers are read by PFP.
-    * This ensures that ME (CP DMA) is idle before PFP starts fetching
-    * indices. If we wanted to execute CP DMA in PFP, this packet
-    * should precede it.
-    */
-   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME)
-      si_cp_pfp_sync_me(cs);
 }
 
 void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
@@ -146,7 +137,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * is written to memory.
     */
    if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size)
-      *packet_flags |= CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME;
+      *packet_flags |= CP_DMA_SYNC;
 }
 
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
@@ -160,28 +151,23 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
    assert(size && size % 4 == 0);
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   struct pipe_shader_buffer barrier_buffer;
+   barrier_buffer.buffer = dst;
+   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
+   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);
 
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
 
    /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
 
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR;
@@ -206,9 +192,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       va += byte_count;
    }
 
-   if (cp_dma_use_L2(sctx))
-      sdst->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
@@ -262,6 +246,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    assert(size);
    assert(dst && src);
 
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
+
+   struct pipe_shader_buffer barrier_buffers[2];
+   barrier_buffers[0].buffer = dst;
+   barrier_buffers[0].buffer_offset = MIN2(dst_offset, UINT32_MAX);
+   barrier_buffers[0].buffer_size = MIN2(size, UINT32_MAX);
+   barrier_buffers[1].buffer = src;
+   barrier_buffers[1].buffer_offset = MIN2(src_offset, UINT32_MAX);
+   barrier_buffers[1].buffer_size = MIN2(size, UINT32_MAX);
+
+   si_barrier_before_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
+
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range.
@@ -305,23 +304,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
       }
    }
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    /* This is the main part doing the copying. Src is always aligned. */
    uint64_t main_dst_offset = dst_offset + skipped_size;
    uint64_t main_src_offset = src_offset + skipped_size;
@@ -376,9 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (realign_size)
       si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);
 
-   if (cp_dma_use_L2(sctx))
-      si_resource(dst)->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 859f9ffd4f3..8b44fabc85f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1474,6 +1474,18 @@ void si_destroy_compute(struct si_compute *program);
 #define SI_OP_FAIL_IF_SLOW (1 << 9)
 #define SI_OP_IS_NESTED (1 << 10)
 
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images);
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images);
 bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *shader, unsigned flags, unsigned num_buffers,
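
Usage note for reviewers: the pattern this patch converts CP DMA to — describe every buffer the internal operation touches, emit the before-barrier, record the operation, then emit the after-barrier — is the same one other internal ops in si_compute_blit.c already use, which is why the two helpers are now exported. Below is a minimal sketch of that calling pattern, modeled on the new si_cp_dma_clear_buffer body; the wrapper function and its name are hypothetical, and only the barrier calls and the pipe_shader_buffer setup are taken from the patch itself.

```c
/* Reviewer sketch (not part of the patch): how a caller uses the newly
 * exported barrier helpers. Assumes the usual radeonsi includes
 * (si_pipe.h provides the declarations added above). */
static void example_internal_clear(struct si_context *sctx, struct pipe_resource *dst,
                                   uint64_t offset, uint64_t size, unsigned user_flags)
{
   /* Describe the one buffer the operation writes. pipe_shader_buffer
    * fields are 32-bit, so clamp the 64-bit range like the patch does. */
   struct pipe_shader_buffer barrier_buffer;
   barrier_buffer.buffer = dst;
   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);

   /* Flush/wait according to the SI_OP_SYNC_*_BEFORE bits in user_flags.
    * Bit 0 of the writable mask (0x1) marks barrier_buffer as written. */
   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);

   /* ... emit the actual work here (CP DMA packets, a compute dispatch, ...) ... */

   /* Make the result visible to later draws/dispatches when
    * SI_OP_SYNC_AFTER is set. */
   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
}
```

This centralizes the per-call SI_OP_SYNC_*_BEFORE partial flushes, cache invalidation, and TC_L2_dirty bookkeeping that si_cp_dma_clear_buffer and si_cp_dma_copy_buffer previously open-coded; only the CP-DMA-specific L2 invalidation for the non-L2 path remains inline before the barrier.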