diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 7836cdb467a..0e14a6be9e2 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -70,6 +70,9 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    if (flags & SI_OP_SYNC_CS_BEFORE)
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
+   if (!(flags & SI_OP_CS_IMAGE))
+      sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+
    /* Invalidate L0-L1 caches. */
    /* sL0 is never invalidated, because src resources don't use it. */
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
@@ -107,7 +110,7 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
          sctx->flags |= SI_CONTEXT_INV_VCACHE;
       } else {
          /* Make sure buffer stores are visible to all CUs. */
-         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 5a379d9da48..26f9f3a2311 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -197,10 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(size && size % 4 == 0);
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
@@ -340,10 +340,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    }
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 6d19dd18d55..35af5c9d4d2 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -569,6 +569,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
 
    assert(sctx->chip_class <= GFX9);
 
+   cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
+
    radeon_begin(cs);
 
    if (sctx->chip_class == GFX9 || compute_ib) {
@@ -749,21 +751,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
 
    /* Ignore fields that only modify the behavior of other fields. */
    if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+      unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
+
       /* Flush caches and wait for the caches to assert idle.
        * The cache flush is executed in the ME, but the PFP waits
        * for completion.
        */
       radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      radeon_emit(cs, 0);          /* CP_COHER_CNTL */
+      radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
       radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
       radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
       radeon_emit(cs, 0);          /* CP_COHER_BASE */
       radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
       radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
       radeon_emit(cs, gcr_cntl);   /* GCR_CNTL */
-   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
-      /* We need to ensure that PFP waits as well. */
+   } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      /* Synchronize PFP with ME. (this stalls PFP) */
       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
       radeon_emit(cs, 0);
    }
@@ -953,23 +956,11 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
       si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
    }
 
-   /* Make sure ME is idle (it executes most packets) before continuing.
-    * This prevents read-after-write hazards between PFP and ME.
-    */
-   if (sctx->has_graphics &&
-       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
-                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
-      radeon_begin(cs);
-      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-      radeon_emit(cs, 0);
-      radeon_end();
-   }
-
    /* GFX6-GFX8 only:
     *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
     *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
     *
-    * cp_coher_cntl should contain all necessary flags except TC flags
+    * cp_coher_cntl should contain all necessary flags except TC and PFP flags
     * at this point.
     *
     * GFX6-GFX7 don't support L2 write-back.
@@ -1011,6 +1002,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
    if (cp_coher_cntl)
       si_emit_surface_sync(sctx, cs, cp_coher_cntl);
 
+   if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      radeon_begin(cs);
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+      radeon_end();
+   }
+
    if (is_barrier)
       si_prim_discard_signal_next_compute_ib_start(sctx);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 691f1b9f865..ec0c54223ac 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -98,6 +98,9 @@ extern "C" {
 #define SI_CONTEXT_CS_PARTIAL_FLUSH   (1 << 14)
 #define SI_CONTEXT_VGT_FLUSH          (1 << 15)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
+/* PFP waits for ME to finish. Used to sync for index and indirect buffers and render
+ * condition. It's typically set when doing a VS/PS/CS partial flush for buffers. */
+#define SI_CONTEXT_PFP_SYNC_ME        (1 << 17)
 
 #define SI_PREFETCH_LS (1 << 1)
 #define SI_PREFETCH_HS (1 << 2)
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 4aa83fae3b6..898997fe968 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -1582,7 +1582,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
       }
 
       sctx->b.launch_grid(&sctx->b, &grid);
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
    }
 
    si_restore_qbo_state(sctx, &saved_state);
diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c
index dcd40c12913..e4b745eb6e5 100644
--- a/src/gallium/drivers/radeonsi/si_sqtt.c
+++ b/src/gallium/drivers/radeonsi/si_sqtt.c
@@ -383,7 +383,7 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
 
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                   SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                  SI_CONTEXT_INV_L2;
+                  SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_inhibit_clockgating(sctx, cs, true);
@@ -426,7 +426,7 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
 
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                   SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                  SI_CONTEXT_INV_L2;
+                  SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_emit_thread_trace_stop(sctx, cs, family);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index cc4c9f6da1d..a1b86633c4a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5001,7 +5001,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
    /* Subsequent commands must wait for all shader invocations to
     * complete. */
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  SI_CONTEXT_PFP_SYNC_ME;
 
    if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 5303ad5e83d..d5173aaf440 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -112,7 +112,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
 
       /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
       if (sctx->screen->use_ngg_streamout) {
-         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
          /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
@@ -122,7 +122,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
          */
          wait_now = true;
       } else {
-         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
 
@@ -133,7 +133,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       if (sctx->screen->use_ngg_streamout)
          si_allocate_gds(sctx);
 
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                     SI_CONTEXT_PFP_SYNC_ME;
    }
 
    /* Streamout buffers must be bound in 2 places: