diff --git a/src/panfrost/ci/panfrost-g610-fails.txt b/src/panfrost/ci/panfrost-g610-fails.txt
index 82cd44030f8..87f826cce24 100644
--- a/src/panfrost/ci/panfrost-g610-fails.txt
+++ b/src/panfrost/ci/panfrost-g610-fails.txt
@@ -4,16 +4,8 @@ asan-dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,
 # New failures with VKCTS 1.4.4.0
 dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash
 dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.sampled_image,Crash
-dEQP-VK.image.general_layout.memory_barrier.fragment.read_write.shader_read_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write,Fail
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write,Fail
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.sampled_image,Crash
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.read_write.shader_read_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write,Fail
-asan-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write,Fail
 
 afbcp-spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail
 
diff --git a/src/panfrost/ci/panfrost-g610-flakes.txt b/src/panfrost/ci/panfrost-g610-flakes.txt
index a70deb17e7e..bfc5509094f 100644
--- a/src/panfrost/ci/panfrost-g610-flakes.txt
+++ b/src/panfrost/ci/panfrost-g610-flakes.txt
@@ -198,9 +198,6 @@ dEQP-VK.glsl.swizzles.vector_swizzles.mediump_vec4_x_fragment
 dEQP-VK.glsl.arrays.constructor.int4_fragment
 dEQP-VK.glsl.arrays.declaration.implicit_size_int_ivec3_fragment
 
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.shader_read_write
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.sampled_read_storage_write
-dEQP-VK.image.general_layout.memory_barrier.fragment.write_read.storage_read_storage_write
 dEQP-VK.image.store.with_format.2d_array.r8_snorm_single_layer
 dEQP-VK.image.subresource_layout.2d.4_levels.r8g8b8a8_uint_offset
 dEQP-VK.image.texel_view_compatible.graphic.extended.1d_image.texture_read.astc_5x4_unorm_block.r32g32b32a32_sint
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index 749a1c2f505..20580e5b232 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -386,6 +386,35 @@ add_memory_dependency(struct panvk_cache_flush_info *cache_flush,
    }
 }
 
+static bool
+frag_subqueue_needs_sidefx_barrier(VkAccessFlags2 src_access,
+                                   VkAccessFlags2 dst_access)
+{
+   bool src_reads_mem = src_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                                      VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                      VK_ACCESS_2_MEMORY_READ_BIT);
+   bool dst_reads_mem = dst_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                                      VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                      VK_ACCESS_2_MEMORY_READ_BIT);
+   bool src_writes_mem = src_access & (VK_ACCESS_2_MEMORY_WRITE_BIT |
+                                       VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+   bool dst_writes_mem = dst_access & (VK_ACCESS_2_MEMORY_WRITE_BIT |
+                                       VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+
+   /* If there's no read -> write, write -> write or write -> read
+    * memory dependency, we can skip, otherwise we have to split the
+    * render pass. We could possibly add the dependency at the draw level,
+    * using extra bits in the DCD2 flags to encode storage reads/writes and
+    * adding extra WAIT/WAIT_RESOURCE shader side, but we can't flush the
+    * texture cache, so it wouldn't work for SAMPLED_READ. Let's keep things
+    * simple and consider any side effect as requiring a split, until this
+    * proves to be a real bottleneck.
+    */
+   return (src_reads_mem && dst_writes_mem) ||
+          (src_writes_mem && dst_writes_mem) ||
+          (src_writes_mem && dst_reads_mem);
+}
+
 static bool
 should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                          VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
@@ -406,15 +435,19 @@ should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT))
       return true;
 
-   /* split if the fragment subqueue self-waits with a feedback loop, because
-    * we lower subpassLoad to texelFetch
-    */
-   if ((wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
-        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) &&
-       (src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
-                      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
-       (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
-      return true;
+   if (wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
+       BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) {
+      /* split if the fragment subqueue self-waits with a feedback loop, because
+       * we lower subpassLoad to texelFetch
+       */
+      if ((src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
+                         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
+          (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
+         return true;
+
+      if (frag_subqueue_needs_sidefx_barrier(src_access, dst_access))
+         return true;
+   }
 
    return false;
 }
@@ -433,13 +466,29 @@ collect_cache_flush_info(enum panvk_subqueue_id subqueue,
    add_memory_dependency(cache_flush, src_access, dst_access);
 }
 
+static bool
+can_skip_barrier(struct panvk_cmd_buffer *cmdbuf, const VkDependencyInfo *info,
+                 struct panvk_sync_scope src, struct panvk_sync_scope dst)
+{
+   bool inside_rp = cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
+   bool by_region = info->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT;
+
+   if (inside_rp && by_region &&
+       !frag_subqueue_needs_sidefx_barrier(src.access, dst.access))
+      return true;
+
+   return false;
+}
+
 static void
-collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
+collect_cs_deps(struct panvk_cmd_buffer *cmdbuf, const VkDependencyInfo *info,
                 struct panvk_sync_scope src, struct panvk_sync_scope dst,
                 struct panvk_cs_deps *deps)
 {
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   if (can_skip_barrier(cmdbuf, info, src, dst))
+      return;
 
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
    uint32_t wait_masks[PANVK_SUBQUEUE_COUNT] = {0};
 
    add_execution_dependency(wait_masks, src.stages, dst.stages);
@@ -581,7 +630,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                           VK_QUEUE_FAMILY_IGNORED,
                                           barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
    }
 
    for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
@@ -593,7 +642,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                           barrier->dstQueueFamilyIndex,
                                           barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
    }
 
    for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
@@ -608,7 +657,7 @@ panvk_per_arch(add_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                                           barrier->dstQueueFamilyIndex,
                                           barrier_stage);
 
-      collect_cs_deps(cmdbuf, src, dst, out);
+      collect_cs_deps(cmdbuf, in, src, dst, out);
 
       if (barrier_stage == PANVK_BARRIER_STAGE_FIRST && transition.stages)
          out->needs_layout_transitions = true;
@@ -736,13 +785,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                     const VkDependencyInfo *pDependencyInfo)
 {
    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
-
-   /* Intra render pass barriers can be skipped iff we're inside a render
-    * pass. */
-   if ((cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) &&
-       (pDependencyInfo->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT))
-      return;
-
   struct panvk_cs_deps deps = {0};
   panvk_per_arch(add_cs_deps)(cmdbuf, PANVK_BARRIER_STAGE_FIRST,
                               pDependencyInfo, &deps, false);
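For reference, here is a minimal standalone sketch (not driver code) of the hazard classification the new frag_subqueue_needs_sidefx_barrier() helper applies; needs_split() and the access pairs in main() are illustrative assumptions that mirror the patch, so the split decision can be sanity-checked outside the driver.

/* Standalone sketch: reimplements the read/write classification used by
 * frag_subqueue_needs_sidefx_barrier() in the patch above. Build with a
 * plain C compiler plus the Vulkan headers (only VK_ACCESS_2_* constants
 * are used, no Vulkan functions are called). */
#include <stdbool.h>
#include <stdio.h>
#include <vulkan/vulkan.h>

static bool
needs_split(VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   const VkAccessFlags2 reads = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
                                VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                VK_ACCESS_2_MEMORY_READ_BIT;
   const VkAccessFlags2 writes = VK_ACCESS_2_MEMORY_WRITE_BIT |
                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT;
   bool src_r = src_access & reads, dst_r = dst_access & reads;
   bool src_w = src_access & writes, dst_w = dst_access & writes;

   /* Same hazard rule as the patch: any read -> write, write -> write or
    * write -> read dependency on shader-visible memory forces a split. */
   return (src_r && dst_w) || (src_w && dst_w) || (src_w && dst_r);
}

int
main(void)
{
   /* storage write -> sampled read: must split, since the texture cache
    * can't be flushed in the middle of a render pass. */
   printf("write->read: %d\n",
          needs_split(VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                      VK_ACCESS_2_SHADER_SAMPLED_READ_BIT));
   /* color attachment write -> input attachment read: no memory side
    * effects, so this helper alone does not force a split; the separate
    * feedback-loop check in should_split_render_pass() covers it. */
   printf("att-write->input-read: %d\n",
          needs_split(VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
                      VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT));
   return 0;
}

With this rule, a write_read.sampled_read_storage_write style barrier splits the render pass (first printf prints 1), while a plain attachment feedback-loop barrier is left to the existing input-attachment check (second printf prints 0), which is why the by-region fast path in CmdPipelineBarrier2 had to move behind can_skip_barrier().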