panvk/csf: Fix BY_REGION dependencies

When only the tile buffers are touched, the dependency can be handled at
the draw level with DCD_FLAGS_2. But as soon as one side of the
dependency has side effects that could impact the other side, we need to
split the render pass and insert a real barrier, with a proper flush of
the read-only L1 caches.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Aksel Hjerpbakk <aksel.hjerpbakk@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38950>
commit aa3c8e6fb0 (parent a00f6ee033)
Boris Brezillon, 2025-12-15 11:15:18 +01:00
3 changed files with 63 additions and 32 deletions
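
For context, here is a hedged sketch (not part of the patch) of the kind of
VK_DEPENDENCY_BY_REGION_BIT barriers this change distinguishes, using the
standard Vulkan synchronization2 API; the command buffer handle and the
surrounding render-pass recording are assumed. Barrier A only orders
tile-buffer accesses and can stay at the draw level; barrier B has a memory
side effect on the source side, so the render pass now gets split instead of
the barrier being dropped.

#include <vulkan/vulkan.h>

/* Barrier A: tile-buffer accesses only -- expressible at the draw level
 * (DCD_FLAGS_2) without closing the render pass. */
static const VkMemoryBarrier2 tile_buffer_only = {
   .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
   .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
   .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
   .dstStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
   .dstAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT,
};

/* Barrier B: a storage write feeding a sampled read -- the texture cache
 * can't be flushed per draw, so a real barrier (and a render pass split)
 * is needed even with VK_DEPENDENCY_BY_REGION_BIT set. */
static const VkMemoryBarrier2 has_side_effects = {
   .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
   .srcStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
   .srcAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
   .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
   .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
};

/* Recorded between two draws of the same render pass. */
static void record_by_region_barrier(VkCommandBuffer cmd_buf,
                                     const VkMemoryBarrier2 *barrier)
{
   const VkDependencyInfo dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = barrier,
   };

   vkCmdPipelineBarrier2(cmd_buf, &dep_info);
}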

@@ -4,16 +4,8 @@ asan-dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,
 # New failures with VKCTS 1.4.4.0
 dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash
 dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.sampled_image,Crash
-dEQP-VK.image.general_layout.memory_barrier.fragment.read_write.shader_read_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write,Fail
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.sampled_image,Crash
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.read_write.shader_read_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write,Fail
 afbcp-spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail

@@ -198,9 +198,6 @@ dEQP-VK.glsl.swizzles.vector_swizzles.mediump_vec4_x_fragment
 dEQP-VK.glsl.arrays.constructor.int4_fragment
 dEQP-VK.glsl.arrays.declaration.implicit_size_int_ivec3_fragment
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write
 dEQP-VK.image.store.with_format.2d_array.r8_snorm_single_layer
 dEQP-VK.image.subresource_layout.2d.4_levels.r8g8b8a8_uint_offset
 dEQP-VK.image.texel_view_compatible.graphic.extended.1d_image.texture_read.astc_5x4_unorm_block.r32g32b32a32_sint

@@ -386,6 +386,35 @@ add_memory_dependency(struct panvk_cache_flush_info *cache_flush,
    }
 }
 
+static bool
+frag_subqueue_needs_sidefx_barrier(VkAccessFlags2 src_access,
+                                   VkAccessFlags2 dst_access)
+{
+   bool src_reads_mem = src_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                                      VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                      VK_ACCESS_2_MEMORY_READ_BIT);
+   bool dst_reads_mem = dst_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                                      VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                      VK_ACCESS_2_MEMORY_READ_BIT);
+   bool src_writes_mem = src_access & (VK_ACCESS_2_MEMORY_WRITE_BIT |
+                                       VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+   bool dst_writes_mem = dst_access & (VK_ACCESS_2_MEMORY_WRITE_BIT |
+                                       VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+
+   /* If there's no read -> write, write -> write or write -> read
+    * memory dependency, we can skip, otherwise we have to split the
+    * render pass. We could possibly add the dependency at the draw level,
+    * using extra bits in the DCD2 flags to encode storage reads/writes and
+    * adding extra WAIT/WAIT_RESOURCE shader side, but we can't flush the
+    * texture cache, so it wouldn't work for SAMPLED_READ. Let's keep things
+    * simple and consider any side effect as requiring a split, until this
+    * proves to be a real bottleneck.
+    */
+   return (src_reads_mem && dst_writes_mem) ||
+          (src_writes_mem && dst_writes_mem) ||
+          (src_writes_mem && dst_reads_mem);
+}
+
 static bool
 should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                          VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
@@ -406,16 +435,20 @@ should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT))
       return true;
 
-   /* split if the fragment subqueue self-waits with a feedback loop, because
-    * we lower subpassLoad to texelFetch
-    */
-   if ((wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
-        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) &&
-       (src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
-                      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
-       (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
-      return true;
+   if (wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
+       BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) {
+      /* split if the fragment subqueue self-waits with a feedback loop, because
+       * we lower subpassLoad to texelFetch
+       */
+      if ((src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
+                         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
+          (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
+         return true;
+
+      if (frag_subqueue_needs_sidefx_barrier(src_access, dst_access))
+         return true;
+   }
 
    return false;
 }
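
As a quick sanity check of the policy above, here is a hedged, self-contained
mirror of the predicate's decision table. The real logic is
frag_subqueue_needs_sidefx_barrier() in the hunk above; needs_split() below is
only an illustrative stand-in, not driver code.

#include <stdbool.h>
#include <stdio.h>
#include <vulkan/vulkan.h>

/* Illustrative copy of the classification: split on RAW, WAW and WAR
 * hazards that go through memory rather than the tile buffers. */
static bool needs_split(VkAccessFlags2 src, VkAccessFlags2 dst)
{
   const VkAccessFlags2 mem_reads = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
                                    VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                    VK_ACCESS_2_MEMORY_READ_BIT;
   const VkAccessFlags2 mem_writes = VK_ACCESS_2_MEMORY_WRITE_BIT |
                                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT;

   return ((src & mem_reads) && (dst & mem_writes)) ||  /* read -> write  */
          ((src & mem_writes) && (dst & mem_writes)) || /* write -> write */
          ((src & mem_writes) && (dst & mem_reads));    /* write -> read  */
}

int main(void)
{
   /* Storage write then sampled read: the render pass must be split. */
   printf("%d\n", needs_split(VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                              VK_ACCESS_2_SHADER_SAMPLED_READ_BIT)); /* 1 */
   /* Attachment write then attachment read: tile buffers only, no split. */
   printf("%d\n", needs_split(VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
                              VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT)); /* 0 */
   return 0;
}

Note that the memory sets include the generic VK_ACCESS_2_MEMORY_READ_BIT and
VK_ACCESS_2_MEMORY_WRITE_BIT, so a fully generic barrier is conservatively
treated as having side effects.
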
@@ -433,13 +466,29 @@ collect_cache_flush_info(enum panvk_subqueue_id subqueue,
    add_memory_dependency(cache_flush, src_access, dst_access);
 }
 
+static bool
+can_skip_barrier(struct panvk_cmd_buffer *cmdbuf, const VkDependencyInfo *info,
+                 struct panvk_sync_scope src, struct panvk_sync_scope dst)
+{
+   bool inside_rp = cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
+   bool by_region = info->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT;
+
+   if (inside_rp && by_region &&
+       !frag_subqueue_needs_sidefx_barrier(src.access, dst.access))
+      return true;
+
+   return false;
+}
+
 static void
-collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
+collect_cs_deps(struct panvk_cmd_buffer *cmdbuf, const VkDependencyInfo *info,
                 struct panvk_sync_scope src, struct panvk_sync_scope dst,
                 struct panvk_cs_deps *deps)
 {
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   if (can_skip_barrier(cmdbuf, info, src, dst))
+      return;
+
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
    uint32_t wait_masks[PANVK_SUBQUEUE_COUNT] = {0};
 
    add_execution_dependency(wait_masks, src.stages, dst.stages);
@@ -581,7 +630,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                  VK_QUEUE_FAMILY_IGNORED,
                                  barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
    }
 
    for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
@@ -593,7 +642,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                  barrier->dstQueueFamilyIndex,
                                  barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
    }
 
    for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
@@ -608,7 +657,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                  barrier->dstQueueFamilyIndex,
                                  barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
 
       if (barrier_stage == PANVK_BARRIER_STAGE_FIRST && transition.stages)
          out->needs_layout_transitions = true;
@@ -736,13 +785,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                     const VkDependencyInfo *pDependencyInfo)
 {
    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
 
-   /* Intra render pass barriers can be skipped iff we're inside a render
-    * pass. */
-   if ((cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) &&
-       (pDependencyInfo->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT))
-      return;
-
    struct panvk_cs_deps deps = {0};
 
    panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST, pDependencyInfo, &deps, false);