radeonsi: handle deferred cache flushes as a state (si_atom)

This allows us to remove a little bit of code from si_draw and enables
removing more code in the future.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24732>
Marek Olšák 2023-07-16 10:38:20 -04:00 committed by Marge Bot
parent c3129b2b83
commit 1e4b539042
16 changed files with 117 additions and 27 deletions
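
In short: instead of every caller invoking sctx->emit_cache_flush directly, flush bits are recorded in sctx->flags and the new cache_flush atom is marked dirty, so the flush is emitted together with the other dirty states; callers that must flush immediately use the new si_emit_cache_flush_direct helper. A minimal sketch of the two idioms, using only identifiers introduced by this diff (surrounding radeonsi context assumed):

   /* Deferred: record the flush bits and mark the cache_flush atom dirty;
    * the flush is emitted later with the rest of the dirty atoms (e.g. at draw). */
   sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

   /* Immediate: emit the flush now; si_emit_cache_flush_direct also clears the
    * atom's dirty bit so the flush is not emitted a second time. */
   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   si_emit_cache_flush_direct(sctx);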


@@ -496,16 +496,20 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture
    /* Required before and after FMASK and DCC_DECOMPRESS. */
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
    si_blitter_begin(sctx, SI_DECOMPRESS);
    util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend);
    si_blitter_end(sctx);
    if (custom_blend == sctx->custom_blend_fmask_decompress ||
-       custom_blend == sctx->custom_blend_dcc_decompress)
+       custom_blend == sctx->custom_blend_dcc_decompress) {
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
    /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass
     * separately because FMASK decompression doesn't eliminate DCC fast clear. This makes
@@ -1036,6 +1040,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf
 {
    /* Required before and after CB_RESOLVE. */
    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    si_blitter_begin(
       sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));


@@ -55,6 +55,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_INV_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
       if (info[i].is_dcc_msaa) {
@@ -83,6 +85,8 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    /* GFX6-8: CB and DB don't use L2. */
    if (sctx->gfx_level <= GFX8)
       sctx->flags |= SI_CONTEXT_WB_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
@@ -1162,8 +1166,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
       }
-      if (needs_db_flush)
+      if (needs_db_flush) {
          sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }
    if (unlikely(sctx->sqtt_enabled)) {


@@ -947,8 +947,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
                            info->block[0] * info->block[1] * info->block[2] > 256;
-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
    if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
       return;
@@ -976,6 +978,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
       if (sctx->gfx_level <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) {
          sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
          si_resource(info->indirect)->TC_L2_dirty = false;
       }
    }
@@ -1024,7 +1027,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    /* Registers that are not read from memory should be set before this: */
    if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    if (sctx->has_graphics && si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
       sctx->atoms.s.render_cond.emit(sctx, -1);
@@ -1060,8 +1063,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;
-   if (cs_regalloc_hang)
+   if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
 }
 void si_destroy_compute(struct si_compute *program)


@@ -163,6 +163,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
    if (sctx->num_hw_pipestat_streamout_queries)
       sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    if (!(flags & SI_OP_CS_RENDER_COND_ENABLE))
       sctx->render_cond_enabled = false;
@@ -213,6 +216,9 @@ static void si_launch_grid_internal(struct si_context *sctx, const struct pipe_g
          sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
@@ -220,8 +226,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
                                    unsigned writeable_bitmask)
 {
-   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
    /* Save states. */
    struct pipe_shader_buffer saved_sb[3] = {};
@@ -243,8 +251,10 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf
    /* Do cache flushing at the end. */
    if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) {
-      if (flags & SI_OP_SYNC_AFTER)
+      if (flags & SI_OP_SYNC_AFTER) {
         sctx->flags |= SI_CONTEXT_WB_L2;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    } else {
       while (writeable_bitmask)
          si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true;


@@ -144,7 +144,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
     * Also wait for the previous CP DMA operations.
     */
    if (*is_first && sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    if (user_flags & SI_OP_SYNC_CPDMA_BEFORE && *is_first && !(*packet_flags & CP_DMA_CLEAR))
       *packet_flags |= CP_DMA_RAW_WAIT;
@@ -192,6 +192,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
    }
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
       unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
@@ -330,6 +333,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
      sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    /* This is the main part doing the copying. Src is always aligned. */
    main_dst_offset = dst_offset + skipped_size;
    main_src_offset = src_offset + skipped_size;


@@ -1926,7 +1926,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
     * descriptors directly in memory, in case the GPU is using them.
     */
    sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
-   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   si_emit_cache_flush_direct(sctx);
    util_dynarray_foreach (&sctx->resident_tex_handles, struct si_texture_handle *, tex_handle) {
       unsigned desc_slot = (*tex_handle)->desc_slot;
@@ -1950,6 +1950,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
    /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */
    sctx->flags |= SI_CONTEXT_INV_SCACHE;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    sctx->bindless_descriptors_dirty = false;
 }


@@ -108,7 +108,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    /* Wait for draw calls to finish if needed. */
    if (wait_flags) {
       ctx->flags |= wait_flags;
-      ctx->emit_cache_flush(ctx, &ctx->gfx_cs);
+      si_emit_cache_flush_direct(ctx);
    }
    ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;
@@ -396,6 +396,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
       ctx->flags |= SI_CONTEXT_VGT_FLUSH;
+   si_mark_atom_dirty(ctx, &ctx->atoms.s.cache_flush);
    if (ctx->screen->attribute_ring) {
       radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring,
                                 RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS);
@@ -658,6 +660,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    unsigned cb_db_event = 0;
    unsigned flags = ctx->flags;
+   if (!flags)
+      return;
    if (!ctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
@@ -911,10 +916,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    ctx->flags = 0;
 }
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
 {
    uint32_t flags = sctx->flags;
+   if (!flags)
+      return;
    if (!sctx->has_graphics) {
       /* Only process compute flags. */
       flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |


@@ -611,7 +611,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    if (sctx->gfx_level >= GFX10)
       sctx->emit_cache_flush = gfx10_emit_cache_flush;
    else
-      sctx->emit_cache_flush = si_emit_cache_flush;
+      sctx->emit_cache_flush = gfx6_emit_cache_flush;
    sctx->b.emit_string_marker = si_emit_string_marker;
    sctx->b.set_debug_callback = si_set_debug_callback;


@@ -1531,7 +1531,7 @@ void si_trace_emit(struct si_context *sctx);
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                           unsigned cp_coher_cntl);
 void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
 /* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement
  * optimizations without affecting the normal draw_vbo functions perf.
  */
@@ -1851,6 +1851,8 @@ static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
@@ -1876,6 +1878,8 @@ static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned
       /* GFX6-GFX8 */
       sctx->flags |= SI_CONTEXT_INV_L2;
    }
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
@@ -2116,6 +2120,23 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim,
    }
 }
+/* There are 3 ways to flush caches and all of them are correct.
+ *
+ * 1) sctx->flags |= ...;
+ *    si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); // deferred
+ *
+ * 2) sctx->flags |= ...;
+ *    si_emit_cache_flush_direct(sctx); // immediate
+ *
+ * 3) sctx->flags |= ...;
+ *    sctx->emit_cache_flush(sctx, cs); // immediate (2 is better though)
+ */
+static inline void si_emit_cache_flush_direct(struct si_context *sctx)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+   sctx->dirty_atoms &= ~SI_ATOM_BIT(cache_flush);
+}
 #define PRINT_ERR(fmt, args...) \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)


@@ -859,9 +859,11 @@ static void si_update_hw_pipeline_stats(struct si_context *sctx, unsigned type,
       if (diff == 1 && sctx->num_hw_pipestat_streamout_queries == 1) {
          sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       } else if (diff == -1 && sctx->num_hw_pipestat_streamout_queries == 0) {
          sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
          sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       }
    }
 }
@@ -1569,6 +1571,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
    }
    sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
       if (query->b.type != PIPE_QUERY_TIMESTAMP) {
@@ -1664,6 +1667,7 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que
       /* Settings this in the render cond atom is too late,
        * so set it here. */
       sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       sctx->render_cond_enabled = old_render_cond_enabled;
    }


@@ -1494,11 +1494,13 @@ static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
      if (sctx->num_hw_pipestat_streamout_queries) {
         sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
         sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
      }
   } else {
      if (sctx->num_hw_pipestat_streamout_queries) {
         sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
         sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
      }
   }
@@ -2893,6 +2895,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
    }
    sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    /* u_blitter doesn't invoke depth decompression when it does multiple
     * blits in a row, but the only case when it matters for DB is when
@@ -2910,6 +2913,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
        * Flushing DB metadata works around the problem.
        */
       sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
    /* Take the maximum of the old and new count. If the new count is lower,
@@ -5390,6 +5394,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
    /* Indirect buffers use TC L2 on GFX9, but not older hw. */
    if (sctx->screen->info.gfx_level <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
       sctx->flags |= SI_CONTEXT_WB_L2;
+   si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
 }
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
@@ -5402,6 +5408,11 @@ static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
    return si_create_blend_state_mode(&sctx->b, &blend, mode);
 }
+static void si_emit_cache_flush_state(struct si_context *sctx, unsigned index)
+{
+   sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+}
 void si_init_state_compute_functions(struct si_context *sctx)
 {
    sctx->b.create_sampler_state = si_create_sampler_state;
@@ -5434,6 +5445,7 @@ void si_init_state_functions(struct si_context *sctx)
    sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
    sctx->atoms.s.clip_state.emit = si_emit_clip_state;
    sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
+   sctx->atoms.s.cache_flush.emit = si_emit_cache_flush_state;
    sctx->b.create_blend_state = si_create_blend_state;
    sctx->b.bind_blend_state = si_bind_blend_state;


@@ -208,6 +208,7 @@ union si_state_atoms {
       struct si_atom ngg_cull_state;
       struct si_atom vgt_pipeline_state;
       struct si_atom tess_io_layout;
+      struct si_atom cache_flush;
    } s;
    struct si_atom array[sizeof(struct si_atoms_s) / sizeof(struct si_atom)];
 };


@@ -833,8 +833,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
       if (GFX_VERSION == GFX7 &&
           sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
           num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
-                                                              instance_count, 2, sctx->patch_vertices))
+                                                              instance_count, 2, sctx->patch_vertices)) {
          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+      }
    }
    return ia_multi_vgt_param;
@@ -2086,6 +2088,7 @@ static void si_draw(struct pipe_context *ctx,
         /* GFX8 reads index buffers through TC L2, so it doesn't
          * need this. */
         sctx->flags |= SI_CONTEXT_WB_L2;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
         si_resource(indexbuf)->TC_L2_dirty = false;
      }
   }
@@ -2098,12 +2101,14 @@ static void si_draw(struct pipe_context *ctx,
      if (GFX_VERSION <= GFX8) {
         if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->buffer)->TC_L2_dirty = false;
         }
         if (indirect->indirect_draw_count &&
             si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
            sctx->flags |= SI_CONTEXT_WB_L2;
+           si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
            si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
         }
      }
@@ -2260,18 +2265,17 @@ static void si_draw(struct pipe_context *ctx,
    /* Emit all states except possibly render condition. */
    si_emit_rasterizer_prim_state<GFX_VERSION, HAS_GS, NGG, IS_BLIT>(sctx);
-   si_emit_all_states(sctx, masked_atoms);
-   /* Emit draw states. */
-   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
-      (sctx, index_size);
+   /* This must be done before si_emit_all_states because it can set cache flush flags. */
    si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
       (sctx, indirect, prim, index_size, instance_count, primitive_restart,
        info->restart_index, min_direct_count);
-   if (sctx->flags)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
-   /* <-- CUs are idle here if we waited. */
+   /* This emits states and flushes caches. */
+   si_emit_all_states(sctx, masked_atoms);
+   /* <-- CUs are idle here if the cache_flush state waited. */
+   /* This must be done after si_emit_all_states, which can affect this. */
+   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_BLIT, HAS_PAIRS>
+      (sctx, index_size);
    /* If we haven't emitted the render condition state (because it depends on cache flushes),
    * do it now.
@@ -2328,6 +2332,7 @@ static void si_draw(struct pipe_context *ctx,
        (GFX_VERSION == GFX8 && (sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI))) &&
       si_get_strmout_en(sctx)) {
      sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
+     si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
   }
   if (unlikely(IS_BLIT && sctx->decompression_enabled)) {


@@ -3427,6 +3427,8 @@ bool si_update_ngg(struct si_context *sctx)
     */
    if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) {
       sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
       if (sctx->gfx_level == GFX10) {
          /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);


@@ -115,6 +115,9 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
                     SI_CONTEXT_PFP_SYNC_ME;
    }
+   if (sctx->flags)
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    /* Streamout buffers must be bound in 2 places:
     * 1) in VGT by setting the VGT_STRMOUT registers
     * 2) as shader resources
@@ -193,7 +196,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
    if (wait_now)
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
 }
 static void si_flush_vgt_streamout(struct si_context *sctx)
@@ -309,7 +312,7 @@ void si_emit_streamout_end(struct si_context *sctx)
    if (sctx->gfx_level >= GFX11) {
       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
       sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
-      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+      si_emit_cache_flush_direct(sctx);
    } else {
       si_flush_vgt_streamout(sctx);
    }
@@ -326,6 +329,7 @@ void si_emit_streamout_end(struct si_context *sctx)
                         COPY_DATA_REG, NULL,
                         (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
         sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+        si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
      } else {
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));


@@ -153,7 +153,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
         sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                        SI_CONTEXT_FLUSH_AND_INV_CB |
                        SI_CONTEXT_FLUSH_AND_INV_DB;
-        sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+        si_emit_cache_flush_direct(sctx);
         struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
         ctx->begin_query(ctx, q);
@@ -217,7 +217,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
            sctx->flags |= SI_CONTEXT_INV_VCACHE |
                           (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                           SI_CONTEXT_CS_PARTIAL_FLUSH;
-           sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
+           si_emit_cache_flush_direct(sctx);
         }
         ctx->end_query(ctx, q);