diff --git a/src/panfrost/ci/panfrost-g610-fails.txt b/src/panfrost/ci/panfrost-g610-fails.txt index c9737cffa88..47ecb716962 100644 --- a/src/panfrost/ci/panfrost-g610-fails.txt +++ b/src/panfrost/ci/panfrost-g610-fails.txt @@ -324,105 +324,3 @@ dEQP-VK.wsi.xlib.swapchain.simulate_oom.image_format,Crash dEQP-VK.wsi.xlib.swapchain.simulate_oom.min_image_count,Crash dEQP-VK.wsi.xlib.swapchain.simulate_oom.pre_transform,Crash dEQP-VK.wsi.xlib.swapchain.simulate_oom.present_mode,Crash - -# introduced by multivews, but the failure actually seems unrelated... -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed0_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed1_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed10_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed11_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed12_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed13_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed14_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed15_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed16_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed17_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed18_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed19_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed2_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed20_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed21_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed22_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed23_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed24_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed25_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed26_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed27_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed28_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed29_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed3_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed30_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed31_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed32_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed33_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed34_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed35_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed36_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed37_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed38_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed39_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed4_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed40_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed41_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed42_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed43_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed44_multiview,Crash 
-dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed45_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed46_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed47_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed48_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed49_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed5_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed50_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed51_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed52_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed53_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed54_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed55_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed56_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed57_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed58_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed59_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed6_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed60_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed61_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed62_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed63_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed64_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed65_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed66_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed67_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed68_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed69_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed7_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed70_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed71_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed72_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed73_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed74_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed75_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed76_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed77_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed78_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed79_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed8_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed80_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed81_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed82_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed83_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed84_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed85_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed86_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed87_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed88_multiview,Crash 
-dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed89_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed9_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed90_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed91_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed92_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed93_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed94_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed95_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed96_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed97_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed98_multiview,Crash -dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed99_multiview,Crash diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 9cd31bf4462..404181e5a0b 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -85,6 +85,11 @@ get_fbd_size(bool has_zs_ext, uint32_t rt_count) (TILER_OOM_CTX_FIELD_OFFSET(fbds) + \ (PANVK_IR_##_pass##_PASS * sizeof(uint64_t))) +struct panvk_cs_occlusion_query { + uint64_t next; + uint64_t syncobj; +}; + struct panvk_cs_subqueue_context { uint64_t syncobjs; uint32_t iter_sb; @@ -93,6 +98,7 @@ struct panvk_cs_subqueue_context { struct panvk_cs_desc_ringbuf desc_ringbuf; uint64_t tiler_heap; uint64_t geom_buf; + uint64_t oq_chain; } render; struct { uint32_t counter; @@ -428,9 +434,8 @@ void panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf, const VkDependencyInfo *in, struct panvk_cs_deps *out); -void panvk_per_arch(cmd_prepare_exec_cmd_for_draws)( - struct panvk_cmd_buffer *primary, - struct panvk_cmd_buffer *secondary); +VkResult panvk_per_arch(cmd_prepare_exec_cmd_for_draws)( + struct panvk_cmd_buffer *primary, struct panvk_cmd_buffer *secondary); void panvk_per_arch(cmd_inherit_render_state)( struct panvk_cmd_buffer *cmdbuf, diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 4f56c22b88e..00f57c0b0e8 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -1320,6 +1320,85 @@ prepare_ds(struct panvk_cmd_buffer *cmdbuf) return VK_SUCCESS; } +static VkResult +wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf) +{ + uint64_t last_syncobj = cmdbuf->state.gfx.render.oq.last; + + if (!last_syncobj) + return VK_SUCCESS; + + uint64_t prev_oq_node = cmdbuf->state.gfx.render.oq.chain; + struct panfrost_ptr new_oq_node = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, sizeof(struct panvk_cs_occlusion_query), 8); + + if (!new_oq_node.gpu) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + cmdbuf->state.gfx.render.oq.chain = new_oq_node.gpu; + + struct panvk_cs_occlusion_query *oq = new_oq_node.cpu; + + *oq = (struct panvk_cs_occlusion_query){ + .syncobj = last_syncobj, + .next = prev_oq_node, + }; + + /* If we already had an OQ in the chain, we don't need to initialize the + * oq_chain field in the subqueue ctx. */ + if (prev_oq_node) + return VK_SUCCESS; + + /* If we're a secondary cmdbuf inside a render pass, we let the primary + * cmdbuf link the OQ chain. 
*/ + if (cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) + return VK_SUCCESS; + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + struct cs_index oq_node_reg = cs_scratch_reg64(b, 0); + + cs_move64_to(b, oq_node_reg, new_oq_node.gpu); + + /* If we're resuming, we need to link with the previous oq_chain, if any. */ + if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) { + struct cs_index prev_oq_node_reg = cs_scratch_reg64(b, 2); + + cs_load64_to( + b, prev_oq_node_reg, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); + cs_wait_slot(b, SB_ID(LS), false); + cs_store64(b, prev_oq_node_reg, oq_node_reg, + offsetof(struct panvk_cs_occlusion_query, next)); + cs_wait_slot(b, SB_ID(LS), false); + } + + cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); + cs_wait_slot(b, SB_ID(LS), false); + return VK_SUCCESS; +} + +static VkResult +prepare_oq(struct panvk_cmd_buffer *cmdbuf) +{ + if (!gfx_state_dirty(cmdbuf, OQ) || + cmdbuf->state.gfx.occlusion_query.syncobj == + cmdbuf->state.gfx.render.oq.last) + return VK_SUCCESS; + + VkResult result = wrap_prev_oq(cmdbuf); + if (result) + return result; + + struct cs_builder *b = + panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER); + cs_move64_to(b, cs_sr_reg64(b, 46), cmdbuf->state.gfx.occlusion_query.ptr); + + cmdbuf->state.gfx.render.oq.last = + cmdbuf->state.gfx.occlusion_query.syncobj; + return VK_SUCCESS; +} + static void prepare_dcd(struct panvk_cmd_buffer *cmdbuf) { @@ -1596,13 +1675,13 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) if (result != VK_SUCCESS) return result; + result = prepare_oq(cmdbuf); + if (result != VK_SUCCESS) + return result; + prepare_dcd(cmdbuf); prepare_vp(cmdbuf); prepare_tiler_primitive_size(cmdbuf); - - if (gfx_state_dirty(cmdbuf, OQ)) - cs_move64_to(b, cs_sr_reg64(b, 46), - cmdbuf->state.gfx.occlusion_query.ptr); } clear_dirty_after_draw(cmdbuf); @@ -1700,19 +1779,21 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) cs_req_res(b, 0); } -void +VkResult panvk_per_arch(cmd_prepare_exec_cmd_for_draws)( struct panvk_cmd_buffer *primary, struct panvk_cmd_buffer *secondary) { - VkResult result; + if (!(secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) + return VK_SUCCESS; - if ((secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) && - !inherits_render_ctx(primary)) { - result = get_render_ctx(primary); + if (!inherits_render_ctx(primary)) { + VkResult result = get_render_ctx(primary); if (result != VK_SUCCESS) - return; + return result; } + + return prepare_oq(primary); } VKAPI_ATTR void VKAPI_CALL @@ -2012,7 +2093,6 @@ panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer, if (!resuming) panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo); - } static void @@ -2160,6 +2240,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing; struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; /* Reserve a scoreboard for the fragment job. 
*/ panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); @@ -2262,6 +2343,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_index iter_sb = cs_scratch_reg32(b, 2); struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); struct cs_index add_val = cs_scratch_reg64(b, 4); + struct cs_index add_val_lo = cs_scratch_reg32(b, 4); struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6); struct cs_index release_sz = cs_scratch_reg32(b, 8); @@ -2270,6 +2352,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_index completed_bottom = cs_scratch_reg64(b, 12); struct cs_index cur_tiler = cs_sr_reg64(b, 38); struct cs_index tiler_count = cs_sr_reg32(b, 47); + struct cs_index oq_chain = cs_scratch_reg64(b, 10); + struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10); + struct cs_index oq_chain_hi = cs_scratch_reg32(b, 11); + struct cs_index oq_syncobj = cs_scratch_reg64(b, 12); cs_move64_to(b, add_val, 1); cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), @@ -2314,6 +2400,39 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, \ ringbuf_sync_addr, async); \ } \ + if (has_oq_chain) { \ + struct cs_index flush_id = oq_chain_lo; \ + cs_move32_to(b, flush_id, 0); \ + cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, \ + MALI_CS_FLUSH_MODE_CLEAN, false, flush_id, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_FLUSH))); \ + cs_load64_to( \ + b, oq_chain, cs_subqueue_ctx_reg(b), \ + offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \ + cs_wait_slot(b, SB_ID(LS), false); \ + /* We use oq_syncobj as a placeholder to reset the oq_chain. */ \ + cs_move64_to(b, oq_syncobj, 0); \ + cs_store64( \ + b, oq_syncobj, cs_subqueue_ctx_reg(b), \ + offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \ + cs_wait_slot(b, SB_ID(LS), false); \ + cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \ + cs_load64_to(b, oq_syncobj, oq_chain, \ + offsetof(struct panvk_cs_occlusion_query, syncobj)); \ + cs_wait_slot(b, SB_ID(LS), false); \ + cs_load64_to(b, oq_chain, oq_chain, \ + offsetof(struct panvk_cs_occlusion_query, next)); \ + cs_wait_slot(b, SB_ID(LS), false); \ + cs_sync32_set( \ + b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \ + cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \ + cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_lo) \ + cs_continue(b); \ + cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_hi) \ + cs_continue(b); \ + cs_break(b); \ + } \ + } \ cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, \ async); \ cs_move32_to(b, iter_sb, next_iter_sb(x)); \ @@ -2372,6 +2491,7 @@ panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT; + VkResult result; if (!suspending) { struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; @@ -2380,7 +2500,16 @@ panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer) clear |= fbinfo->rts[i].clear; if (clear && !inherits_render_ctx(cmdbuf)) { - VkResult result = get_fb_descs(cmdbuf); + result = get_fb_descs(cmdbuf); + if (result != VK_SUCCESS) + return; + } + + /* Flush the last occlusion query before ending the render pass if + * this query has ended while we were inside the render pass. 
*/ + if (cmdbuf->state.gfx.render.oq.last != + cmdbuf->state.gfx.occlusion_query.syncobj) { + result = wrap_prev_oq(cmdbuf); if (result != VK_SUCCESS) return; } @@ -2400,6 +2529,7 @@ panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer) memset(&cmdbuf->state.gfx.render.fbds, 0, sizeof(cmdbuf->state.gfx.render.fbds)); + memset(&cmdbuf->state.gfx.render.oq, 0, sizeof(cmdbuf->state.gfx.render.oq)); cmdbuf->state.gfx.render.tiler = 0; /* If we're not suspending, we need to resolve attachments. */ diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c index 45404cbb66f..0d99f5cb842 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c @@ -44,6 +44,45 @@ * 0 and does not need to wait. */ +static void +reset_oq_batch(struct cs_builder *b, struct cs_index addr, + struct cs_index zero_regs, uint32_t query_count) +{ + const uint32_t regs_per_query = 2; + const uint32_t queries_per_batch = zero_regs.size / regs_per_query; + uint32_t remaining_queries = query_count; + + assert(zero_regs.size > 2 && ALIGN_POT(zero_regs.size, 2) == zero_regs.size); + + if (query_count > queries_per_batch * 4) { + struct cs_index counter = cs_reg32(b, zero_regs.reg + zero_regs.size - 1); + struct cs_index new_zero_regs = + cs_reg_tuple(b, zero_regs.reg, zero_regs.size - 2); + const uint32_t adjusted_queries_per_batch = + new_zero_regs.size / regs_per_query; + uint32_t full_batches = query_count / adjusted_queries_per_batch; + + cs_move32_to(b, counter, full_batches); + cs_while(b, MALI_CS_CONDITION_GREATER, counter) { + cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size), 0); + cs_add64(b, addr, addr, new_zero_regs.size * sizeof(uint32_t)); + cs_add32(b, counter, counter, -1); + } + + remaining_queries = + query_count - (full_batches * adjusted_queries_per_batch); + } + + for (uint32_t i = 0; i < remaining_queries; i += queries_per_batch) { + struct cs_index new_zero_regs = cs_reg_tuple( + b, zero_regs.reg, + MIN2(remaining_queries - i, queries_per_batch) * regs_per_query); + + cs_store(b, new_zero_regs, addr, BITFIELD_MASK(new_zero_regs.size), + i * sizeof(uint32_t)); + } +} + static void panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, @@ -54,19 +93,35 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, /* Wait on deferred sync to ensure all prior query operations have * completed */ - cs_wait_slots(b, SB_MASK(DEFERRED_SYNC), false); + cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false); - struct cs_index addr = cs_scratch_reg64(b, 0); - struct cs_index zero32 = cs_scratch_reg32(b, 2); - cs_move32_to(b, zero32, 0); + struct cs_index addr = cs_scratch_reg64(b, 16); + struct cs_index zero_regs = cs_scratch_reg_tuple(b, 0, 16); - /* Mark all query syncobj as not available */ - for (uint32_t query = first_query; query < first_query + query_count; - query++) { - cs_move64_to(b, addr, panvk_query_available_dev_addr(pool, query)); - cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, zero32, addr, - cs_defer(SB_IMM_MASK, SB_ID(DEFERRED_SYNC))); - } + for (uint32_t i = 0; i < zero_regs.size; i += 2) + cs_move64_to(b, cs_scratch_reg64(b, i), 0); + + /* Zero all query syncobjs so they report non-available. We don't use + * cs_sync32_set() because no-one is waiting on this syncobj with + * cs_sync32_wait(). The only reason we use a syncobj is so we can + * defer the signalling in the issue_fragment_jobs() path. 
*/ + cs_move64_to(b, addr, panvk_query_available_dev_addr(pool, first_query)); + reset_oq_batch(b, addr, zero_regs, query_count); + + cs_move64_to(b, addr, panvk_query_report_dev_addr(pool, first_query)); + reset_oq_batch(b, addr, zero_regs, query_count); + + /* reset_oq_batch() only does the stores, we need to flush those explicitly + * here. */ + cs_wait_slot(b, SB_ID(LS), false); + + /* We flush the caches to make the new value visible to the CPU. */ + struct cs_index flush_id = cs_scratch_reg32(b, 0); + + cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false, + flush_id, + cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); + cs_wait_slot(b, SB_ID(IMM_FLUSH), false); } static void @@ -77,6 +132,8 @@ panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd, uint64_t report_addr = panvk_query_report_dev_addr(pool, query); cmd->state.gfx.occlusion_query.ptr = report_addr; + cmd->state.gfx.occlusion_query.syncobj = + panvk_query_available_dev_addr(pool, query); cmd->state.gfx.occlusion_query.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT ? MALI_OCCLUSION_MODE_COUNTER : MALI_OCCLUSION_MODE_PREDICATE; @@ -102,34 +159,79 @@ static void panvk_cmd_end_occlusion_query(struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, uint32_t query) { - /* Ensure all RUN_FRAGMENT are encoded before this */ - panvk_per_arch(cmd_flush_draws)(cmd); + uint64_t syncobj_addr = panvk_query_available_dev_addr(pool, query); + cmd->state.gfx.occlusion_query.ptr = 0; + cmd->state.gfx.occlusion_query.syncobj = 0; cmd->state.gfx.occlusion_query.mode = MALI_OCCLUSION_MODE_DISABLED; gfx_state_set_dirty(cmd, OQ); - struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT); + /* If the render pass is active, we let EndRendering take care of the + * occlusion query end when the fragment job is issued. */ + if (cmd->state.gfx.render.oq.last == syncobj_addr) + return; - struct cs_index flush_id = cs_scratch_reg32(b, 0); - cs_move32_to(b, flush_id, 0); + struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT); + struct cs_index oq_syncobj = cs_scratch_reg64(b, 0); + struct cs_index val = cs_scratch_reg32(b, 2); /* OQ accumulates sample counts to the report which is on a cached memory. * Wait for the accumulation and flush the caches. */ + cs_move32_to(b, val, 0); cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, false, - flush_id, - cs_defer(SB_ALL_ITERS_MASK, SB_ID(DEFERRED_FLUSH))); + val, cs_defer(SB_ALL_ITERS_MASK, SB_ID(DEFERRED_FLUSH))); - struct cs_index sync_addr = cs_scratch_reg64(b, 0); - struct cs_index seqno = cs_scratch_reg32(b, 2); - cs_move32_to(b, seqno, 1); - - /* We wait on any previous flush, and defer on sync */ - cs_move64_to(b, sync_addr, panvk_query_available_dev_addr(pool, query)); - cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, seqno, sync_addr, + /* Signal the query syncobj after the flush is effective. */ + cs_move32_to(b, val, 1); + cs_move64_to(b, oq_syncobj, panvk_query_available_dev_addr(pool, query)); + cs_sync32_set(b, true, MALI_CS_SYNC_SCOPE_CSG, val, oq_syncobj, cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); } +static void +copy_oq_result_batch(struct cs_builder *b, + VkQueryResultFlags flags, + struct cs_index dst_addr, + VkDeviceSize dst_stride, + struct cs_index res_addr, + struct cs_index avail_addr, + struct cs_index scratch_regs, + uint32_t query_count) +{ + uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 
2 : 1; + uint32_t regs_per_copy = + res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0); + + assert(query_count <= scratch_regs.size / regs_per_copy); + + for (uint32_t i = 0; i < query_count; i++) { + struct cs_index res = + cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), res_size); + struct cs_index avail = cs_reg32(b, res.reg + res_size); + + cs_load_to(b, res, res_addr, BITFIELD_MASK(res.size), + i * sizeof(uint64_t)); + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32)); + } + + /* Flush the loads. */ + cs_wait_slot(b, SB_ID(LS), false); + + for (uint32_t i = 0; i < query_count; i++) { + struct cs_index store_src = + cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy); + + cs_store(b, store_src, dst_addr, BITFIELD_MASK(regs_per_copy), + i * dst_stride); + } + + /* Flush the stores. */ + cs_wait_slot(b, SB_ID(LS), false); +} + static void panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd, struct panvk_query_pool *pool, @@ -138,63 +240,63 @@ panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd, VkDeviceSize stride, VkQueryResultFlags flags) { - unsigned result_stride = - flags & VK_QUERY_RESULT_64_BIT ? sizeof(uint64_t) : sizeof(uint32_t); - struct cs_builder *b = panvk_get_cs_builder(cmd, PANVK_SUBQUEUE_FRAGMENT); - struct cs_index scratch_addr0 = cs_scratch_reg64(b, 0); - struct cs_index scratch_addr1 = cs_scratch_reg64(b, 2); - struct cs_index scratch_val0 = cs_scratch_reg32(b, 4); - struct cs_index available = cs_scratch_reg32(b, 5); - struct cs_index write_results = cs_scratch_reg32(b, 6); - struct cs_index scratch_val2 = cs_scratch_reg64(b, 8); - for (uint32_t query = first_query; query < first_query + query_count; - query++) { - cs_move64_to(b, scratch_addr0, - panvk_query_available_dev_addr(pool, query)); + /* Wait for occlusion query syncobjs to be signalled. */ + if (flags & VK_QUERY_RESULT_WAIT_BIT) + cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false); - if (flags & VK_QUERY_RESULT_WAIT_BIT) { - cs_move32_to(b, scratch_val0, 0); + uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 2 : 1; + uint32_t regs_per_copy = + res_size + ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? 1 : 0); - /* Wait on the sync object of the current query */ - cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, scratch_val0, - scratch_addr0); + struct cs_index dst_addr = cs_scratch_reg64(b, 16); + struct cs_index res_addr = cs_scratch_reg64(b, 14); + struct cs_index avail_addr = cs_scratch_reg64(b, 12); + struct cs_index counter = cs_scratch_reg32(b, 11); + struct cs_index scratch_regs = cs_scratch_reg_tuple(b, 0, 11); + uint32_t queries_per_batch = scratch_regs.size / regs_per_copy; - /* After the wait, all subqueues are available */ - cs_move32_to(b, available, 1); - } else { - cs_move64_to(b, scratch_addr0, - panvk_query_available_dev_addr(pool, query)); - cs_load32_to(b, available, scratch_addr0, 0); - cs_wait_slot(b, SB_ID(LS), false); + /* Store offset is a 16-bit signed integer, so we might be limited by the + * stride here. */ + queries_per_batch = MIN2(((1u << 15) / stride) + 1, queries_per_batch); + + /* Stop unrolling the loop when it takes more than 2 steps to copy the + * queries. 
*/ + if (query_count > 2 * queries_per_batch) { + uint32_t copied_query_count = + query_count - (query_count % queries_per_batch); + + cs_move32_to(b, counter, copied_query_count); + cs_move64_to(b, dst_addr, dst_buffer_addr); + cs_move64_to(b, res_addr, panvk_query_report_dev_addr(pool, first_query)); + cs_move64_to(b, avail_addr, + panvk_query_available_dev_addr(pool, first_query)); + cs_while(b, MALI_CS_CONDITION_GREATER, counter) { + copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, + scratch_regs, queries_per_batch); + + cs_add32(b, counter, counter, -queries_per_batch); + cs_add64(b, dst_addr, dst_addr, queries_per_batch * stride); + cs_add64(b, res_addr, res_addr, queries_per_batch * sizeof(uint64_t)); + cs_add64(b, avail_addr, avail_addr, + queries_per_batch * sizeof(uint64_t)); } - cs_add32(b, write_results, available, - (flags & VK_QUERY_RESULT_PARTIAL_BIT) != 0); + dst_buffer_addr += stride * copied_query_count; + first_query += copied_query_count; + query_count -= copied_query_count; + } - assert(pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION); - - cs_if(b, MALI_CS_CONDITION_GREATER, write_results) { - cs_move64_to(b, scratch_addr0, - panvk_query_report_dev_addr(pool, query)); - cs_move64_to(b, scratch_addr1, dst_buffer_addr + query * stride); - - if (flags & VK_QUERY_RESULT_64_BIT) { - cs_load64_to(b, scratch_val2, scratch_addr0, 0); - cs_wait_slot(b, SB_ID(LS), false); - cs_store64(b, scratch_val2, scratch_addr1, 0); - } else { - cs_load32_to(b, scratch_val0, scratch_addr0, 0); - cs_wait_slot(b, SB_ID(LS), false); - cs_store32(b, scratch_val0, scratch_addr1, 0); - } - - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - cs_store32(b, available, scratch_addr1, result_stride); - - cs_wait_slot(b, SB_ID(LS), false); - } + for (uint32_t i = 0; i < query_count; i += queries_per_batch) { + cs_move64_to(b, dst_addr, dst_buffer_addr + (i * stride)); + cs_move64_to(b, res_addr, + panvk_query_report_dev_addr(pool, i + first_query)); + cs_move64_to(b, avail_addr, + panvk_query_available_dev_addr(pool, i + first_query)); + copy_oq_result_batch(b, flags, dst_addr, stride, res_addr, avail_addr, + scratch_regs, + MIN2(queries_per_batch, query_count - i)); } } diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h index 20ce456f646..842ca6b8927 100644 --- a/src/panfrost/vulkan/panvk_cmd_draw.h +++ b/src/panfrost/vulkan/panvk_cmd_draw.h @@ -76,6 +76,16 @@ struct panvk_rendering_state { /* When a secondary command buffer has to flush draws, it disturbs the * inherited context, and the primary command buffer needs to know. */ bool invalidate_inherited_ctx; + + struct { + /* != 0 if the render pass contains one or more occlusion queries to + * signal. */ + uint64_t chain; + + /* Points to the syncobj of the last occlusion query that was passed + * to a draw. */ + uint64_t last; + } oq; #endif }; diff --git a/src/panfrost/vulkan/panvk_cmd_oq.h b/src/panfrost/vulkan/panvk_cmd_oq.h index 33ed61b992e..efe2eeddbcd 100644 --- a/src/panfrost/vulkan/panvk_cmd_oq.h +++ b/src/panfrost/vulkan/panvk_cmd_oq.h @@ -13,6 +13,9 @@ #include "genxml/gen_macros.h" struct panvk_occlusion_query_state { +#if PAN_ARCH >= 10 + uint64_t syncobj; +#endif uint64_t ptr; enum mali_occlusion_mode mode; };
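
Note: the oq_chain consumed by the fragment subqueue in issue_fragment_jobs() is a singly linked list of panvk_cs_occlusion_query nodes, rooted at render.oq_chain in the subqueue context and grown one node per ended query by wrap_prev_oq(). Below is a minimal host-side C sketch of the equivalent walk, purely for illustration: the real logic is emitted as CS instructions, and signal_syncobj() is a hypothetical stand-in for the deferred cs_sync32_set().

#include <stdint.h>

/* Illustrative only: mirrors the CS-side loop, assuming the node layout from
 * panvk_cmd_buffer.h and a hypothetical signal_syncobj() callback. */
struct panvk_cs_occlusion_query {
   uint64_t next;    /* GPU address of the next node; 0 terminates the chain */
   uint64_t syncobj; /* GPU address of the query availability syncobj */
};

static void
signal_oq_chain(uint64_t *oq_chain_head,
                void (*signal_syncobj)(uint64_t syncobj_addr))
{
   /* Detach the chain so the next render pass starts with an empty list. */
   uint64_t node_addr = *oq_chain_head;
   *oq_chain_head = 0;

   /* Walk the list and signal each query's availability syncobj. The CS
    * version defers each signal on the cache-flush scoreboard so the
    * accumulated sample counts are clean in memory before availability
    * is reported. */
   while (node_addr != 0) {
      const struct panvk_cs_occlusion_query *oq =
         (const struct panvk_cs_occlusion_query *)(uintptr_t)node_addr;

      signal_syncobj(oq->syncobj);
      node_addr = oq->next;
   }
}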