diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 83d6a9e212a..6146592444e 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -57,12 +57,12 @@ static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource
    }
 }
 
-static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
-                                          unsigned num_buffers,
-                                          const struct pipe_shader_buffer *buffers,
-                                          unsigned writable_buffers_mask,
-                                          unsigned num_images,
-                                          const struct pipe_image_view *images)
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images)
 {
    for (unsigned i = 0; i < num_images; i++) {
       /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */
@@ -95,12 +95,12 @@ static void si_barrier_before_internal_op(struct si_context *sctx, unsigned flag
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 
-static void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
-                                         unsigned num_buffers,
-                                         const struct pipe_shader_buffer *buffers,
-                                         unsigned writable_buffers_mask,
-                                         unsigned num_images,
-                                         const struct pipe_image_view *images)
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images)
 {
    if (flags & SI_OP_SYNC_AFTER) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index ff1a09babfc..60331bb20ec 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -17,7 +17,6 @@
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT (1 << 1)
 #define CP_DMA_CLEAR (1 << 2)
-#define CP_DMA_PFP_SYNC_ME (1 << 3)
 
 static bool cp_dma_use_L2(struct si_context *sctx)
 {
@@ -96,14 +95,6 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       radeon_emit(command);
    }
    radeon_end();
-
-   /* CP DMA is executed in ME, but index buffers are read by PFP.
-    * This ensures that ME (CP DMA) is idle before PFP starts fetching
-    * indices. If we wanted to execute CP DMA in PFP, this packet
-    * should precede it.
-    */
-   if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME)
-      si_cp_pfp_sync_me(cs);
 }
 
 void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
@@ -146,7 +137,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * is written to memory.
     */
    if (user_flags & SI_OP_SYNC_AFTER && byte_count == remaining_size)
-      *packet_flags |= CP_DMA_SYNC | CP_DMA_PFP_SYNC_ME;
+      *packet_flags |= CP_DMA_SYNC;
 }
 
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
@@ -160,28 +151,23 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
    assert(size && size % 4 == 0);
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+   struct pipe_shader_buffer barrier_buffer;
+   barrier_buffer.buffer = dst;
+   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
+   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);
 
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
 
    /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
    util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
 
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR;
@@ -206,9 +192,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       va += byte_count;
    }
 
-   if (cp_dma_use_L2(sctx))
-      sdst->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
@@ -262,6 +246,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    assert(size);
    assert(dst && src);
 
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE) && !cp_dma_use_L2(sctx)) {
+      sctx->flags |= SI_CONTEXT_INV_L2;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
+
+   struct pipe_shader_buffer barrier_buffers[2];
+   barrier_buffers[0].buffer = dst;
+   barrier_buffers[0].buffer_offset = MIN2(dst_offset, UINT32_MAX);
+   barrier_buffers[0].buffer_size = MIN2(size, UINT32_MAX);
+   barrier_buffers[1].buffer = src;
+   barrier_buffers[1].buffer_offset = MIN2(src_offset, UINT32_MAX);
+   barrier_buffers[1].buffer_size = MIN2(size, UINT32_MAX);
+
+   si_barrier_before_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
+
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range.
@@ -305,23 +304,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
       }
    }
 
-   if (user_flags & SI_OP_SYNC_GE_BEFORE)
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-
-   if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-   }
-
-   if (sctx->flags)
-      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
-
    /* This is the main part doing the copying. Src is always aligned. */
    uint64_t main_dst_offset = dst_offset + skipped_size;
    uint64_t main_src_offset = src_offset + skipped_size;
@@ -376,9 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (realign_size)
       si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);
 
-   if (cp_dma_use_L2(sctx))
-      si_resource(dst)->TC_L2_dirty = true;
-
+   si_barrier_after_internal_op(sctx, user_flags, 2, barrier_buffers, 0x1, 0, NULL);
    sctx->num_cp_dma_calls++;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 859f9ffd4f3..8b44fabc85f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1474,6 +1474,18 @@ void si_destroy_compute(struct si_compute *program);
 #define SI_OP_FAIL_IF_SLOW (1 << 9)
 #define SI_OP_IS_NESTED (1 << 10)
 
+void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags,
+                                   unsigned num_buffers,
+                                   const struct pipe_shader_buffer *buffers,
+                                   unsigned writable_buffers_mask,
+                                   unsigned num_images,
+                                   const struct pipe_image_view *images);
+void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags,
+                                  unsigned num_buffers,
+                                  const struct pipe_shader_buffer *buffers,
+                                  unsigned writable_buffers_mask,
+                                  unsigned num_images,
+                                  const struct pipe_image_view *images);
 bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *shader, unsigned flags, unsigned num_buffers,
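
Usage note for reviewers: the pattern this patch converts CP DMA to — describe every buffer the internal operation touches, emit the before-barrier, record the operation, then emit the after-barrier — is the same one other internal ops in si_compute_blit.c already use, which is why the two helpers are now exported. Below is a minimal sketch of that calling pattern, modeled on the new si_cp_dma_clear_buffer body; the wrapper function and its name are hypothetical, and only the barrier calls and the pipe_shader_buffer setup are taken from the patch itself.

```c
/* Reviewer sketch (not part of the patch): how a caller uses the newly
 * exported barrier helpers. Assumes the usual radeonsi includes
 * (si_pipe.h provides the declarations added above). */
static void example_internal_clear(struct si_context *sctx, struct pipe_resource *dst,
                                   uint64_t offset, uint64_t size, unsigned user_flags)
{
   /* Describe the one buffer the operation writes. pipe_shader_buffer
    * fields are 32-bit, so clamp the 64-bit range like the patch does. */
   struct pipe_shader_buffer barrier_buffer;
   barrier_buffer.buffer = dst;
   barrier_buffer.buffer_offset = MIN2(offset, UINT32_MAX);
   barrier_buffer.buffer_size = MIN2(size, UINT32_MAX);

   /* Flush/wait according to the SI_OP_SYNC_*_BEFORE bits in user_flags.
    * Bit 0 of the writable mask (0x1) marks barrier_buffer as written. */
   si_barrier_before_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);

   /* ... emit the actual work here (CP DMA packets, a compute dispatch, ...) ... */

   /* Make the result visible to later draws/dispatches when
    * SI_OP_SYNC_AFTER is set. */
   si_barrier_after_internal_op(sctx, user_flags, 1, &barrier_buffer, 0x1, 0, NULL);
}
```

This centralizes the per-call SI_OP_SYNC_*_BEFORE partial flushes, cache invalidation, and TC_L2_dirty bookkeeping that si_cp_dma_clear_buffer and si_cp_dma_copy_buffer previously open-coded; only the CP-DMA-specific L2 invalidation for the non-L2 path remains inline before the barrier.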