diff --git a/src/gallium/drivers/radeonsi/gfx11_query.c b/src/gallium/drivers/radeonsi/gfx11_query.c index a2495b40eb5..f92768ef53d 100644 --- a/src/gallium/drivers/radeonsi/gfx11_query.c +++ b/src/gallium/drivers/radeonsi/gfx11_query.c @@ -353,10 +353,8 @@ static void gfx11_sh_query_get_result_resource(struct si_context *sctx, struct s grid.grid[2] = 1; /* TODO: Range-invalidate GL2 */ - if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) { - sctx->barrier_flags |= SI_BARRIER_INV_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + si_set_barrier_flags(sctx, SI_BARRIER_INV_L2); struct gfx11_sh_query_buffer *qbuf = query->first; for (;;) { diff --git a/src/gallium/drivers/radeonsi/si_barrier.c b/src/gallium/drivers/radeonsi/si_barrier.c index d59cbe9673c..43a9c336477 100644 --- a/src/gallium/drivers/radeonsi/si_barrier.c +++ b/src/gallium/drivers/radeonsi/si_barrier.c @@ -521,6 +521,11 @@ void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags, unsigned num_images, const struct pipe_image_view *images) { + unsigned new_barriers; + + /* Invalidate the VMEM cache only. The SMEM cache isn't used by shader buffers. */ + new_barriers = SI_BARRIER_INV_VMEM; + for (unsigned i = 0; i < num_images; i++) { /* The driver doesn't decompress resources automatically for internal blits, so do it manually. */ si_decompress_subresource(&sctx->b, images[i].resource, PIPE_MASK_RGBAZS, @@ -551,12 +556,12 @@ void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags, if (!si_is_buffer_idle(sctx, buf, RADEON_USAGE_WRITE | (writable_buffers_mask & BITFIELD_BIT(i) ? 
RADEON_USAGE_READ : 0))) { if (buf->bind_history & ps_mask) - sctx->barrier_flags |= SI_BARRIER_SYNC_PS; + new_barriers |= SI_BARRIER_SYNC_PS; else - sctx->barrier_flags |= SI_BARRIER_SYNC_VS; + new_barriers |= SI_BARRIER_SYNC_VS; if (buf->bind_history & cs_mask) - sctx->barrier_flags |= SI_BARRIER_SYNC_CS; + new_barriers |= SI_BARRIER_SYNC_CS; } } @@ -571,13 +576,11 @@ void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags, if (!si_is_buffer_idle(sctx, img, RADEON_USAGE_WRITE | (writable ? RADEON_USAGE_READ : 0))) { si_make_CB_shader_coherent(sctx, images[i].resource->nr_samples, true, ((struct si_texture*)images[i].resource)->surface.u.gfx9.color.dcc.pipe_aligned); - sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; + new_barriers |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; } } - /* Invalidate the VMEM cache only. The SMEM cache isn't used by shader buffers. */ - sctx->barrier_flags |= SI_BARRIER_INV_VMEM; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, new_barriers); } void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags, @@ -587,18 +590,18 @@ void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags, unsigned num_images, const struct pipe_image_view *images) { - sctx->barrier_flags |= SI_BARRIER_SYNC_CS; + unsigned new_barriers = SI_BARRIER_SYNC_CS; if (num_images) { /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */ - sctx->barrier_flags |= sctx->gfx_level <= GFX8 ? SI_BARRIER_WB_L2 : 0; + new_barriers |= sctx->gfx_level <= GFX8 ? SI_BARRIER_WB_L2 : 0; /* Make sure image stores are visible to all CUs. */ - sctx->barrier_flags |= SI_BARRIER_INV_VMEM; + new_barriers |= SI_BARRIER_INV_VMEM; } /* Make sure buffer stores are visible to all CUs and also as index/indirect buffers. 
*/ if (num_buffers) - sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | SI_BARRIER_PFP_SYNC_ME; + new_barriers |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | SI_BARRIER_PFP_SYNC_ME; /* We must set L2_cache_dirty for buffers because: * - GFX6,12: CP DMA doesn't use L2. @@ -618,13 +621,13 @@ void si_barrier_after_internal_op(struct si_context *sctx, unsigned flags, images[i].access & PIPE_IMAGE_ACCESS_WRITE && (sctx->screen->always_allow_dcc_stores || images[i].access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)) { - sctx->barrier_flags |= SI_BARRIER_INV_L2; + new_barriers |= SI_BARRIER_INV_L2; break; } } } - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, new_barriers); } static void si_set_dst_src_barrier_buffers(struct pipe_shader_buffer *buffers, @@ -664,6 +667,7 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; + unsigned new_barriers; /* Ignore PIPE_BARRIER_UPDATE_BUFFER - it synchronizes against updates like buffer_subdata. */ /* Ignore PIPE_BARRIER_UPDATE_TEXTURE - it synchronizes against updates like texture_subdata. */ @@ -677,10 +681,10 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) if (!flags) return; - sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; + new_barriers = SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM; + new_barriers |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM; /* VMEM cache contents are written back to L2 automatically at the end of waves, but * the contents of other VMEM caches might still be stale. 
@@ -689,47 +693,47 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) */ if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) - sctx->barrier_flags |= SI_BARRIER_INV_VMEM; + new_barriers |= SI_BARRIER_INV_VMEM; /* Unlike LLVM, ACO may use SMEM for SSBOs and global access. */ if (sctx->screen->use_aco && (flags & (PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER))) - sctx->barrier_flags |= SI_BARRIER_INV_SMEM; + new_barriers |= SI_BARRIER_INV_SMEM; if (flags & (PIPE_BARRIER_INDEX_BUFFER | PIPE_BARRIER_INDIRECT_BUFFER)) - sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME; + new_barriers |= SI_BARRIER_PFP_SYNC_ME; /* Index buffers use L2 since GFX8 */ if (flags & PIPE_BARRIER_INDEX_BUFFER && (sctx->gfx_level <= GFX7 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) - sctx->barrier_flags |= SI_BARRIER_WB_L2; + new_barriers |= SI_BARRIER_WB_L2; /* Indirect buffers use L2 since GFX9. */ if (flags & PIPE_BARRIER_INDIRECT_BUFFER && (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope)) - sctx->barrier_flags |= SI_BARRIER_WB_L2; + new_barriers |= SI_BARRIER_WB_L2; /* MSAA color images are flushed in si_decompress_textures when needed. * Shaders never write to depth/stencil images. 
*/ if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) { - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB; + new_barriers |= SI_BARRIER_SYNC_AND_INV_CB; if (sctx->gfx_level >= GFX10 && sctx->gfx_level < GFX12) { if (sctx->screen->info.tcc_rb_non_coherent) - sctx->barrier_flags |= SI_BARRIER_INV_L2; + new_barriers |= SI_BARRIER_INV_L2; else /* We don't know which shaders do image stores with DCC: */ - sctx->barrier_flags |= SI_BARRIER_INV_L2_METADATA; + new_barriers |= SI_BARRIER_INV_L2_METADATA; } else if (sctx->gfx_level == GFX9) { /* We have to invalidate L2 for MSAA and when DCC can have pipe_aligned=0. */ - sctx->barrier_flags |= SI_BARRIER_INV_L2; + new_barriers |= SI_BARRIER_INV_L2; } else if (sctx->gfx_level <= GFX8) { /* CB doesn't use L2 on GFX6-8. */ - sctx->barrier_flags |= SI_BARRIER_WB_L2; + new_barriers |= SI_BARRIER_WB_L2; } } - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, new_barriers); } static void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct si_texture *tex) @@ -752,10 +756,8 @@ static void si_set_sampler_depth_decompress_mask(struct si_context *sctx, struct void si_fb_barrier_before_rendering(struct si_context *sctx) { /* Wait for all shaders because all image loads must finish before CB/DB can write there. 
*/ - if (sctx->framebuffer.state.nr_cbufs || sctx->framebuffer.state.zsbuf.texture) { - sctx->barrier_flags |= SI_BARRIER_SYNC_CS | SI_BARRIER_SYNC_PS; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (sctx->framebuffer.state.nr_cbufs || sctx->framebuffer.state.zsbuf.texture) + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_CS | SI_BARRIER_SYNC_PS); } void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags) @@ -823,8 +825,7 @@ void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags) * * This seems to fix them: */ - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_INV_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_DB | SI_BARRIER_INV_L2); } } else if (sctx->gfx_level == GFX9) { /* It appears that DB metadata "leaks" in a sequence of: @@ -833,14 +834,16 @@ void si_fb_barrier_after_rendering(struct si_context *sctx, unsigned flags) * - render with DEPTH_BEFORE_SHADER=1 * Flushing DB metadata works around the problem. */ - sctx->barrier_flags |= SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_EVENT_FLUSH_AND_INV_DB_META); } } } void si_barrier_before_image_fast_clear(struct si_context *sctx, unsigned types) { + /* Invalidate the VMEM cache because we always use compute. */ + unsigned new_barriers = SI_BARRIER_INV_VMEM; + /* Flush caches and wait for idle. */ if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC)) { si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, @@ -853,26 +856,23 @@ void si_barrier_before_image_fast_clear(struct si_context *sctx, unsigned types) sctx->framebuffer.DB_has_shader_readable_metadata); } - /* Invalidate the VMEM cache because we always use compute. */ - sctx->barrier_flags |= SI_BARRIER_INV_VMEM; - /* GFX6-8: CB and DB don't use L2. 
*/ if (sctx->gfx_level <= GFX8) - sctx->barrier_flags |= SI_BARRIER_INV_L2; + new_barriers |= SI_BARRIER_INV_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, new_barriers); } void si_barrier_after_image_fast_clear(struct si_context *sctx) { /* Wait for idle. */ - sctx->barrier_flags |= SI_BARRIER_SYNC_CS; + unsigned new_barriers = SI_BARRIER_SYNC_CS; /* GFX6-8: CB and DB don't use L2. */ if (sctx->gfx_level <= GFX8) - sctx->barrier_flags |= SI_BARRIER_WB_L2; + new_barriers |= SI_BARRIER_WB_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, new_barriers); } void si_init_barrier_functions(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 8540cac5797..70e1c5a5549 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -501,10 +501,8 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture /* Required before and after FMASK and DCC_DECOMPRESS. 
*/ if (custom_blend == sctx->custom_blend_fmask_decompress || - custom_blend == sctx->custom_blend_dcc_decompress) { - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + custom_blend == sctx->custom_blend_dcc_decompress) + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_CB); si_blitter_begin(sctx, SI_DECOMPRESS); util_blitter_custom_color(sctx->blitter, &cbsurf, custom_blend); @@ -512,8 +510,7 @@ static void si_blit_decompress_color(struct si_context *sctx, struct si_texture if (custom_blend == sctx->custom_blend_fmask_decompress || custom_blend == sctx->custom_blend_dcc_decompress) { - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_CB); } /* When running FMASK decompression with DCC, we need to run the "eliminate fast clear" pass @@ -1044,8 +1041,7 @@ static void si_do_CB_resolve(struct si_context *sctx, const struct pipe_blit_inf enum pipe_format format) { /* Required before and after CB_RESOLVE. */ - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_CB); si_blitter_begin( sctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 909eead75d7..06b821f9e55 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -1152,8 +1152,7 @@ static void gfx6_clear(struct pipe_context *ctx, unsigned buffers, if ((zstex->depth_clear_value[level] != 0) != (depth != 0)) { /* ZRANGE_PRECISION register of a bound surface will change so we * must flush the DB caches. 
*/ - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_DB); } /* Update DB_DEPTH_CLEAR. */ zstex->depth_clear_value[level] = depth; @@ -1187,10 +1186,8 @@ static void gfx6_clear(struct pipe_context *ctx, unsigned buffers, /* TODO: This hack fixes dEQP-GLES[23].functional.fragment_ops.random.* on Navi31. * The root cause is unknown. */ - if (sctx->gfx_level == GFX11 || sctx->gfx_level == GFX11_5) { - sctx->barrier_flags |= SI_BARRIER_SYNC_VS; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (sctx->gfx_level == GFX11 || sctx->gfx_level == GFX11_5) + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_VS); } if (unlikely(sctx->sqtt_enabled)) { diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 0018d5197e1..b60ade08cc4 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -838,10 +838,8 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug && info->block[0] * info->block[1] * info->block[2] > 256; - if (cs_regalloc_hang) { - sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (cs_regalloc_hang) + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS); si_check_dirty_buffers_textures(sctx); @@ -875,8 +873,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info /* Indirect buffers are read through L2 on GFX9-GFX11, but not other hw. 
*/ if ((sctx->gfx_level <= GFX8 || sscreen->info.cp_sdma_ge_use_system_memory_scope) && si_resource(info->indirect)->L2_cache_dirty) { - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); si_resource(info->indirect)->L2_cache_dirty = false; } } @@ -974,10 +971,8 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info if (unlikely(sctx->perfetto_enabled)) trace_si_end_compute(&sctx->trace, info->grid[0], info->grid[1], info->grid[2]); - if (cs_regalloc_hang) { - sctx->barrier_flags |= SI_BARRIER_SYNC_CS; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (cs_regalloc_hang) + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_CS); } void si_destroy_compute(struct si_compute *program) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 486d9baf922..a75ffee215f 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -15,10 +15,8 @@ static void si_compute_begin_internal(struct si_context *sctx, bool render_condition_enabled) { sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_START; - if (sctx->num_hw_pipestat_streamout_queries) { - sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_STOP; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (sctx->num_hw_pipestat_streamout_queries) + si_set_barrier_flags(sctx, SI_BARRIER_EVENT_PIPELINESTAT_STOP); if (!render_condition_enabled) sctx->render_cond_enabled = false; @@ -33,10 +31,8 @@ static void si_compute_begin_internal(struct si_context *sctx, bool render_condi static void si_compute_end_internal(struct si_context *sctx) { sctx->barrier_flags &= ~SI_BARRIER_EVENT_PIPELINESTAT_STOP; - if (sctx->num_hw_pipestat_streamout_queries) { - sctx->barrier_flags |= SI_BARRIER_EVENT_PIPELINESTAT_START; - si_mark_atom_dirty(sctx, 
&sctx->atoms.s.barrier); - } + if (sctx->num_hw_pipestat_streamout_queries) + si_set_barrier_flags(sctx, SI_BARRIER_EVENT_PIPELINESTAT_START); sctx->render_cond_enabled = sctx->render_cond; sctx->blitter_running = false; @@ -345,8 +341,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) assert(sctx->gfx_level < GFX12); /* Flush and wait for CB before retiling DCC. */ - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_CB); /* Set the DCC buffer. */ assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 7f71ab0469e..04913e05036 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -153,10 +153,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope); assert(size && size % 4 == 0); - if (!sctx->screen->info.cp_dma_use_L2) { - sctx->barrier_flags |= SI_BARRIER_INV_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (!sctx->screen->info.cp_dma_use_L2) + si_set_barrier_flags(sctx, SI_BARRIER_INV_L2); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping @@ -236,10 +234,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, assert(size); assert(dst && src); - if (!sctx->screen->info.cp_dma_use_L2) { - sctx->barrier_flags |= SI_BARRIER_INV_L2; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (!sctx->screen->info.cp_dma_use_L2) + si_set_barrier_flags(sctx, SI_BARRIER_INV_L2); /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping diff --git 
a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index a717d2081d4..643ff066acf 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1890,6 +1890,7 @@ static void si_upload_bindless_descriptors(struct si_context *sctx) (*img_handle)->desc_dirty = false; } + assert(sctx->dirty_atoms & si_get_atom_bit(sctx, &sctx->atoms.s.barrier)); /* Invalidate scalar L0 because the cache doesn't know that L2 changed. */ sctx->barrier_flags |= SI_BARRIER_INV_SMEM; diff --git a/src/gallium/drivers/radeonsi/si_mesh_shader.c b/src/gallium/drivers/radeonsi/si_mesh_shader.c index cdf050b4b6e..3861f883941 100644 --- a/src/gallium/drivers/radeonsi/si_mesh_shader.c +++ b/src/gallium/drivers/radeonsi/si_mesh_shader.c @@ -772,8 +772,7 @@ static void handle_indirect_resource(struct si_context *sctx, struct si_resource /* Indirect buffers are read through L2 on GFX9-GFX11, but not other hw. */ if (sscreen->info.cp_sdma_ge_use_system_memory_scope && res->L2_cache_dirty) { - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); res->L2_cache_dirty = false; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0693ee01c56..4910999a8ae 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1372,6 +1372,13 @@ struct si_context { #define SI_FB_BARRIER_SYNC_DB BITFIELD_BIT(1) #define SI_FB_BARRIER_SYNC_ALL BITFIELD_RANGE(0, 2) +static void si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom); +static inline void si_set_barrier_flags(struct si_context *sctx, unsigned flags) +{ + sctx->barrier_flags |= flags; + si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); +} + void si_barrier_before_internal_op(struct si_context *sctx, unsigned flags, unsigned 
num_buffers, const struct pipe_shader_buffer *buffers, @@ -2256,15 +2263,17 @@ si_set_rasterized_prim(struct si_context *sctx, enum mesa_prim rast_prim, } } -/* There are 3 ways to flush caches and all of them are correct. +/* There are 4 ways to flush caches and all of them are correct. * - * 1) sctx->flags |= ...; + * 1) si_set_barrier_flags(sctx, ...); // deferred + * + * 2) sctx->barrier_flags |= ...; // multiple times * si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); // deferred * - * 2) sctx->flags |= ...; + * 3) sctx->barrier_flags |= ...; * si_emit_barrier_direct(sctx); // immediate * - * 3) sctx->flags |= ...; + * 4) sctx->barrier_flags |= ...; - * sctx->emit_barrier(sctx, cs); // immediate (2 is better though) + * sctx->emit_barrier(sctx, cs); // immediate (3 is better though) */ static inline void si_emit_barrier_direct(struct si_context *sctx) diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index efdf172c8bd..9eb97379447 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -1610,9 +1610,8 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q break; } - sctx->barrier_flags |= SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | - (sctx->gfx_level <= GFX8 ? SI_BARRIER_INV_L2 : 0); - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_INV_SMEM | SI_BARRIER_INV_VMEM | + (sctx->gfx_level <= GFX8 ? SI_BARRIER_INV_L2 : 0)); for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { if (query->b.type != PIPE_QUERY_TIMESTAMP) { @@ -1708,10 +1707,8 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que /* Settings this in the render cond atom is too late, * so set it here. 
*/ - if (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope) { - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); - } + if (sctx->gfx_level <= GFX8 || sctx->screen->info.cp_sdma_ge_use_system_memory_scope) + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); sctx->render_cond_enabled = old_render_cond_enabled; } diff --git a/src/gallium/drivers/radeonsi/si_shader_binary.c b/src/gallium/drivers/radeonsi/si_shader_binary.c index 963b90cb6ba..eff38f0da58 100644 --- a/src/gallium/drivers/radeonsi/si_shader_binary.c +++ b/src/gallium/drivers/radeonsi/si_shader_binary.c @@ -196,7 +196,7 @@ static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shad si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset, binary_size); si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging); - upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2; + si_set_barrier_flags(upload_ctx, SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2); #if 0 /* debug: validate whether the copy was successful */ uint32_t *dst_binary = malloc(binary_size); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 6f93cb85b2e..d612aca0a06 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -2264,8 +2264,7 @@ static void si_draw(struct pipe_context *ctx, index_size = 2; /* GFX6-7 don't read index buffers through L2. 
*/ - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); si_resource(indexbuf)->L2_cache_dirty = false; } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) { struct pipe_resource *release_buf = NULL; @@ -2293,8 +2292,7 @@ static void si_draw(struct pipe_context *ctx, si_resource(indexbuf)->L2_cache_dirty) { /* GFX8-GFX11.5 reads index buffers through L2, so it doesn't * need this. */ - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); si_resource(indexbuf)->L2_cache_dirty = false; } } @@ -2308,15 +2306,13 @@ static void si_draw(struct pipe_context *ctx, /* Indirect buffers use L2 on GFX9-GFX11.5, but not other hw. */ if (GFX_VERSION <= GFX8 || GFX_VERSION == GFX12) { if (indirect->buffer && si_resource(indirect->buffer)->L2_cache_dirty) { - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); si_resource(indirect->buffer)->L2_cache_dirty = false; } if (indirect->indirect_draw_count && si_resource(indirect->indirect_draw_count)->L2_cache_dirty) { - sctx->barrier_flags |= SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_WB_L2 | SI_BARRIER_PFP_SYNC_ME); si_resource(indirect->indirect_draw_count)->L2_cache_dirty = false; } } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 1cea01a6856..0eef2f2d6de 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -3782,8 +3782,7 @@ bool si_update_ngg(struct si_context *sctx) * pointers are 
set. */ if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) { - sctx->barrier_flags |= SI_BARRIER_EVENT_VGT_FLUSH; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_EVENT_VGT_FLUSH); if (sctx->gfx_level == GFX10) { /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */ diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 372e50520e1..3097ac7e5fb 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -247,9 +247,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ /* All readers of the streamout targets need to be finished before we can * start writing to them. */ - sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS | - SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS | SI_BARRIER_PFP_SYNC_ME); } else { si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false); si_set_streamout_enable(sctx, false); @@ -379,8 +377,7 @@ void si_emit_streamout_end(struct si_context *sctx) COPY_DATA_REG, NULL, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i); /* For DrawTF reading buf_filled_size: */ - sctx->barrier_flags |= SI_BARRIER_PFP_SYNC_ME; - si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); + si_set_barrier_flags(sctx, SI_BARRIER_PFP_SYNC_ME); } else { uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; diff --git a/src/gallium/drivers/radeonsi/si_test_blit_perf.c b/src/gallium/drivers/radeonsi/si_test_blit_perf.c index f9391412c1c..84dcaa13df1 100644 --- a/src/gallium/drivers/radeonsi/si_test_blit_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_blit_perf.c @@ -541,7 +541,7 @@ void si_test_blit_perf(struct si_screen *sscreen) case METHOD_DEFAULT: if (test_flavor == TEST_FB_CLEAR) { 
ctx->clear(ctx, PIPE_CLEAR_COLOR, NULL, clear_color, 0, 0); - sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_INV_L2; + si_set_barrier_flags(sctx, SI_BARRIER_SYNC_AND_INV_CB | SI_BARRIER_INV_L2); } else { ctx->clear_render_target(ctx, &surf_templ, clear_color, dst_box.x, dst_box.y, diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 45707f448aa..678f4dc3721 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -257,7 +257,7 @@ void si_test_dma_perf(struct si_screen *sscreen) si_barrier_after_simple_buffer_op(sctx, 0, dst, src); } - sctx->barrier_flags |= SI_BARRIER_INV_L2; + si_set_barrier_flags(sctx, SI_BARRIER_INV_L2); } ctx->end_query(ctx, q);