From f75569734e8c31bf8681b01f6069afe0a56e2fa0 Mon Sep 17 00:00:00 2001
From: Christoph Pillmayer
Date: Tue, 13 May 2025 13:52:16 +0000
Subject: [PATCH] panvk: Remove explicit LS waits

cs_builder now tracks pending loads/stores on the LS scoreboard slot
and waits on it automatically when a register with a pending access is
used, so the explicit cs_wait_slot() calls on SB_ID(LS)/ls_sb_slot are
redundant. Keep only the points where pending memory accesses must
actually be flushed, and express those with cs_flush_stores() and
cs_flush_loads().

Reviewed-by: Boris Brezillon
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Part-of: 
---
 src/panfrost/genxml/cs_builder.h              | 19 +++----
 src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c |  6 +-
 .../vulkan/csf/panvk_vX_cmd_dispatch.c        |  9 +--
 src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c   | 57 +++++--------
 src/panfrost/vulkan/csf/panvk_vX_cmd_event.c  |  2 -
 .../vulkan/csf/panvk_vX_cmd_precomp.c         |  6 +-
 src/panfrost/vulkan/csf/panvk_vX_cmd_query.c  |  9 +--
 .../vulkan/csf/panvk_vX_exception_handler.c   |  4 --
 8 files changed, 31 insertions(+), 81 deletions(-)

diff --git a/src/panfrost/genxml/cs_builder.h b/src/panfrost/genxml/cs_builder.h
index 10e1d0aa4c8..9171818a3d9 100644
--- a/src/panfrost/genxml/cs_builder.h
+++ b/src/panfrost/genxml/cs_builder.h
@@ -1937,7 +1937,6 @@ cs_exception_handler_end(struct cs_builder *b,
 
       cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
                    handler->ctx.dump_addr_offset);
-      cs_wait_slot(b, handler->ctx.ls_sb_slot);
 
       for (unsigned i = 0; i < num_ranges; ++i) {
          unsigned reg_count = util_bitcount(masks[i]);
@@ -1946,7 +1945,7 @@
          offset += reg_count * 4;
       }
 
-      cs_wait_slot(b, handler->ctx.ls_sb_slot);
+      cs_flush_stores(b);
    }
 
    /* Now that the preamble is emitted, we can flush the instructions we have in
@@ -1959,7 +1958,6 @@
 
       cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
                    handler->ctx.dump_addr_offset);
-      cs_wait_slot(b, handler->ctx.ls_sb_slot);
 
       for (unsigned i = 0; i < num_ranges; ++i) {
         unsigned reg_count = util_bitcount(masks[i]);
@@ -1968,7 +1966,7 @@
        offset += reg_count * 4;
      }
 
-     cs_wait_slot(b, handler->ctx.ls_sb_slot);
+     cs_flush_loads(b);
   }
 
   /* Fill the rest of the buffer with NOPs. */
@@ -2005,10 +2003,9 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
    * access. Use cs_trace_field_offset() to get an offset taking this
    * pre-increment into account.
    */
   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
-  cs_wait_slot(b, ctx->ls_sb_slot);
   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
 #define cs_trace_field_offset(__type, __field) \
@@ -2044,7 +2041,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
 
   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
            cs_trace_field_offset(run_fragment, sr));
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
 
 #if PAN_ARCH >= 12
@@ -2087,7 +2084,7 @@ cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
               cs_trace_field_offset(run_idvs2, sr[i]));
   cs_store(b, cs_reg_tuple(b, 64, 2), tracebuf_addr, BITFIELD_MASK(2),
            cs_trace_field_offset(run_idvs2, sr[64]));
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
 #else
 struct cs_run_idvs_trace {
@@ -2130,7 +2127,7 @@ cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
               cs_trace_field_offset(run_idvs, sr[i]));
   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
            cs_trace_field_offset(run_idvs, sr[48]));
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
 #endif
 
@@ -2166,7 +2163,7 @@ cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
 
 static inline void
@@ -2197,5 +2194,5 @@ cs_trace_run_compute_indirect(struct cs_builder *b,
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
-  cs_wait_slot(b, ctx->ls_sb_slot);
+  cs_flush_stores(b);
 }
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index 1efe2ea35f1..a69f564e101 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -144,7 +144,6 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
       cs_move32_to(b, one, 1);
       cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                    offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
-      cs_wait_slot(b, SB_ID(LS));
       cs_add64(b, debug_sync_addr, debug_sync_addr,
                sizeof(struct panvk_cs_sync32) * subqueue);
       cs_load32_to(b, error, debug_sync_addr,
@@ -162,7 +161,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
          /* Overwrite the sync error with the first error we encountered. */
          cs_store32(b, error, debug_sync_addr,
                     offsetof(struct panvk_cs_sync32, error));
-         cs_wait_slot(b, SB_ID(LS));
+         cs_flush_stores(b);
       }
    }
 }
@@ -589,7 +588,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
 
          cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                       offsetof(struct panvk_cs_subqueue_context, syncobjs));
-         cs_wait_slot(b, SB_ID(LS));
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
@@ -607,7 +605,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
 
         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
-        cs_wait_slot(b, SB_ID(LS));
        cs_add64(b, sync_addr, sync_addr,
                 sizeof(struct panvk_cs_sync64) * j);
        cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
@@ -628,7 +625,6 @@ panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
 
   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
-  cs_wait_slot(b, SB_ID(LS));
 
   cs_match(b, iter_sb, cmp_scratch) {
 #define CASE(x) \
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index 75e6b0ff74b..90aece4bae3 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -227,10 +227,9 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
   if (shader->info.tls_size) {
      cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
      cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
-     cs_wait_slot(b, SB_ID(LS));
      cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
      cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
-     cs_wait_slot(b, SB_ID(LS));
+     cs_flush_stores(b);
   }
 
   cs_update_compute_ctx(b) {
@@ -279,7 +278,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
                 cs_scratch_reg64(b, 0), BITFIELD_MASK(3), 0);
      cs_move64_to(b, cs_scratch_reg64(b, 0),
                   cmdbuf->state.compute.push_uniforms);
-     cs_wait_slot(b, SB_ID(LS));
 
      if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
         cs_store32(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X),
@@ -302,7 +300,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
                      shader, sysval_offset(compute, num_work_groups.z)));
      }
 
-     cs_wait_slot(b, SB_ID(LS));
+     cs_flush_stores(b);
   } else {
      cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X),
                   info->direct.wg_count.x);
@@ -345,7 +343,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
-  cs_wait_slot(b, SB_ID(LS));
 
   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
@@ -369,7 +366,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 
   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
 
   clear_dirty_after_dispatch(cmdbuf);
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 7f473e9e9c9..026a5a0dd9f 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -769,7 +769,6 @@ cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
   cs_load64_to(
      b, ringbuf_sync, cs_subqueue_ctx_reg(b),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
-  cs_wait_slot(b, SB_ID(LS));
 
   /* Wait for the other end to release memory. */
   cs_move32_to(b, sz_reg, size - 1);
@@ -793,7 +792,6 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
      b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
      BITFIELD_MASK(3),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
-  cs_wait_slot(b, SB_ID(LS));
 
   /* Update the relative position and absolute address. */
   cs_add32(b, ptr_lo, ptr_lo, size);
@@ -813,7 +811,7 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
      b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
      BITFIELD_MASK(3),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 }
 
 static VkResult
@@ -927,8 +925,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
   cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
   cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
 
-  cs_wait_slot(b, SB_ID(LS));
-
   /* Take care of the tiler desc with layer_offset=0 outside of the loop. */
   cs_move32_to(b, cs_scratch_reg32(b, 4),
                MIN2(cmdbuf->state.gfx.render.layer_count - 1,
@@ -942,8 +938,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
      cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
               BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
 
-  cs_wait_slot(b, SB_ID(LS));
-
   uint32_t remaining_layers =
      td_count > 1
         ? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
@@ -970,7 +964,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
                  BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
         cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
                  BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
-        cs_wait_slot(b, SB_ID(LS));
 
         cs_update_vt_ctx(b)
            cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
@@ -1006,8 +999,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
         cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
                  BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
 
-        cs_wait_slot(b, SB_ID(LS));
-
         cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
                  MAX_LAYERS_PER_TILER_DESC << 8);
@@ -1018,6 +1009,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
      }
   }
 
+  /* Flush all stores to tiler_ctx_addr. */
+  cs_flush_stores(b);
+
   /* Then we change the scoreboard slot used for iterators. */
   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
 
@@ -1230,7 +1224,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
      cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context,
                            render.desc_ringbuf.ptr));
-     cs_wait_slot(b, SB_ID(LS));
      cs_add64(b, dst_fbd_ptr, cur_tiler, pan_size(TILER_CONTEXT) * td_count);
   }
@@ -1258,16 +1251,17 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
            cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), pass_src_fbd_ptr,
                       BITFIELD_MASK(16), fbd_off);
         }
-        cs_wait_slot(b, SB_ID(LS));
         cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
                  BITFIELD_MASK(16), fbd_off);
-        cs_wait_slot(b, SB_ID(LS));
      }
      cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
      cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
      cs_add32(b, pass_count, pass_count, -1);
   }
 
+  /* Finish stores to pass_dst_fbd_ptr. */
+  cs_flush_stores(b);
+
   cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
   cs_update_frag_ctx(b)
      cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
@@ -1608,15 +1602,13 @@ wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
      cs_load64_to(
         b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
         offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
-     cs_wait_slot(b, SB_ID(LS));
      cs_store64(b, prev_oq_node_reg, oq_node_reg,
                 offsetof(struct panvk_cs_occlusion_query, next));
-     cs_wait_slot(b, SB_ID(LS));
   }
 
   cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
             offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 
   return VK_SUCCESS;
 }
@@ -2260,9 +2252,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
      cs_move32_to(b, max_draw_count, draw->indirect.draw_count);
      cs_move64_to(b, draw_params_addr, draw->indirect.count_buffer_dev_addr);
      cs_load32_to(b, draw_count, draw_params_addr, 0);
-
-     /* wait for draw_count to load from buffer */
-     cs_wait_slot(b, SB_ID(LS));
      cs_umin32(b, draw_count, draw_count, max_draw_count);
   } else {
      cs_move32_to(b, draw_count, draw->indirect.draw_count);
@@ -2283,9 +2272,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
                draw_params_addr, reg_mask, 0);
   }
 
-  /* Wait for the SR33-37 indirect buffer load. */
-  cs_wait_slot(b, SB_ID(LS));
-
   if (patch_faus) {
      if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
         cs_store32(b, cs_sr_reg32(b, IDVS, VERTEX_OFFSET), vs_fau_addr,
@@ -2298,10 +2284,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
                    shader_remapped_sysval_offset(
                       vs, sysval_offset(graphics, vs.base_instance)));
      }
-
-     /* Wait for the store using SR-37 as src to finish, so we can
-      * overwrite it. */
-     cs_wait_slot(b, SB_ID(LS));
   }
 
   if (patch_attribs != 0) {
@@ -2318,7 +2300,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
 
         cs_load32_to(b, attrib_offset, vs_drv_set,
                      pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t)));
-        cs_wait_slot(b, SB_ID(LS));
 
        /* Emulated immediate multiply: we walk the bits in
        * base_instance, and accumulate (stride << bit_pos) if the bit
@@ -2349,7 +2330,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
 
        cs_store32(b, attrib_offset, vs_drv_set,
                   pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t)));
-       cs_wait_slot(b, SB_ID(LS));
+       cs_flush_stores(b);
      }
   }
 }
@@ -2629,7 +2610,6 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf)
      cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
                 BITFIELD_MASK(3),
                 offsetof(struct panvk_cs_subqueue_context, syncobjs));
-     cs_wait_slot(b, SB_ID(LS));
 
      /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
       * skip an ADD operation on the syncobjs pointer. */
@@ -2660,14 +2640,13 @@
 
      cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
                 offsetof(struct panvk_cs_subqueue_context, iter_sb));
-     cs_wait_slot(b, SB_ID(LS));
+     cs_flush_stores(b);
 
      /* Update the vertex seqno. */
      ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
   } else {
      cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, render));
-     cs_wait_slot(b, SB_ID(LS));
   }
 }
@@ -2682,7 +2661,6 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
 
   cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
               offsetof(struct panvk_cs_subqueue_context, syncobjs));
-  cs_wait_slot(b, SB_ID(LS));
 
   cs_add64(b, vt_sync_point,
            cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
@@ -2741,7 +2719,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
 
   cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
              TILER_OOM_CTX_FIELD_OFFSET(layer_count));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 }
 
 static VkResult
@@ -2828,7 +2806,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
   cs_load32_to(
      b, counter, cs_subqueue_ctx_reg(b),
      offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
-  cs_wait_slot(b, SB_ID(LS));
   cs_if(b, MALI_CS_CONDITION_GREATER, counter)
      cs_update_frag_ctx(b)
         cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
@@ -2896,8 +2873,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
                           render.desc_ringbuf.syncobj));
   }
 
-  cs_wait_slot(b, SB_ID(LS));
-
   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
   cs_move32_to(b, tiler_count, td_count);
@@ -2909,12 +2884,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
      cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); \
   if (td_count == 1) { \
      cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
-     cs_wait_slot(b, SB_ID(LS)); \
      cs_finish_fragment(b, true, completed_top, completed_bottom, async); \
   } else if (td_count > 1) { \
      cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \
        cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
-       cs_wait_slot(b, SB_ID(LS)); \
        cs_finish_fragment(b, false, completed_top, completed_bottom, \
                           async); \
        cs_update_frag_ctx(b) \
@@ -2937,20 +2910,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
        cs_load64_to( \
           b, oq_chain, cs_subqueue_ctx_reg(b), \
           offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
-       cs_wait_slot(b, SB_ID(LS)); \
+       /* For WAR dependency on subqueue_context.render.oq_chain. */ \
+       cs_flush_loads(b); \
        /* We use oq_syncobj as a placeholder to reset the oq_chain. */ \
        cs_move64_to(b, oq_syncobj, 0); \
        cs_store64( \
           b, oq_syncobj, cs_subqueue_ctx_reg(b), \
           offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
-       cs_wait_slot(b, SB_ID(LS)); \
        cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \
           cs_load64_to(b, oq_syncobj, oq_chain, \
                        offsetof(struct panvk_cs_occlusion_query, syncobj)); \
-          cs_wait_slot(b, SB_ID(LS)); \
           cs_load64_to(b, oq_chain, oq_chain, \
                        offsetof(struct panvk_cs_occlusion_query, next)); \
-          cs_wait_slot(b, SB_ID(LS)); \
           cs_sync32_set( \
              b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \
              cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \
@@ -2976,7 +2947,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
 
   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 
   /* Update the ring buffer position. */
   if (free_render_descs) {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c
index 61519b6d820..48034f1dffc 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c
@@ -41,7 +41,6 @@ panvk_per_arch(CmdResetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event,
                (i * sizeof(struct panvk_cs_sync32)));
      cs_load32_to(b, seqno, sync_addr,
                   offsetof(struct panvk_cs_sync32, seqno));
-     cs_wait_slot(b, SB_ID(LS));
 
      cs_match(b, seqno, cmp_scratch) {
         cs_case(b, 0) {
@@ -83,7 +82,6 @@ panvk_per_arch(CmdSetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event,
                (i * sizeof(struct panvk_cs_sync32)));
      cs_load32_to(b, seqno, sync_addr,
                   offsetof(struct panvk_cs_sync32, seqno));
-     cs_wait_slot(b, SB_ID(LS));
 
      cs_match(b, seqno, cmp_scratch) {
         cs_case(b, 0) {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
index c160ab3f68d..110ac5cb670 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
@@ -95,10 +95,9 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
   if (shader->info.tls_size) {
      cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
      cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
-     cs_wait_slot(b, SB_ID(LS));
      cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
      cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
-     cs_wait_slot(b, SB_ID(LS));
+     cs_flush_stores(b);
   }
 
   cs_update_compute_ctx(b) {
@@ -156,7 +155,6 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
-  cs_wait_slot(b, SB_ID(LS));
 
   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
@@ -180,7 +178,7 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
 
   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 
   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
index 4566288c6eb..fcc0793e027 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
@@ -113,7 +113,7 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd,
   /* reset_oq_batch() only does the stores, we need to flush those explicitly
    * here. */
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 
   /* We flush the caches to make the new value visible to the CPU. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);
@@ -152,7 +152,7 @@ panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd,
   cs_move64_to(b, report_addr_gpu, report_addr);
   cs_move64_to(b, clear_value, 0);
   cs_store64(b, clear_value, report_addr_gpu, 0);
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 }
 
 static void
@@ -218,9 +218,6 @@ copy_oq_result_batch(struct cs_builder *b,
      cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32));
   }
 
-  /* Flush the loads. */
-  cs_wait_slot(b, SB_ID(LS));
-
   for (uint32_t i = 0; i < query_count; i++) {
      struct cs_index store_src =
        cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy);
@@ -230,7 +227,7 @@ copy_oq_result_batch(struct cs_builder *b,
   }
 
   /* Flush the stores. */
-  cs_wait_slot(b, SB_ID(LS));
+  cs_flush_stores(b);
 }
 
 static void
diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
index 8cf19875ff6..106fc471b0d 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
@@ -77,7 +77,6 @@ generate_tiler_oom_handler(struct panvk_device *dev,
    * rendering has already been triggered */
   cs_load32_to(&b, counter, subqueue_ctx,
                TILER_OOM_CTX_FIELD_OFFSET(counter));
-  cs_wait_slot(&b, SB_ID(LS));
   cs_if(&b, MALI_CS_CONDITION_GREATER, counter)
 
      cs_load64_to(&b, fbd_ptr, subqueue_ctx,
@@ -88,7 +87,6 @@ generate_tiler_oom_handler(struct panvk_device *dev,
 
      cs_load32_to(&b, layer_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(layer_count));
-     cs_wait_slot(&b, SB_ID(LS));
      cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
         cs_trace_run_fragment(&b, &tracing_ctx,
                               cs_scratch_reg_tuple(&b, 8, 4),
@@ -109,12 +107,10 @@ generate_tiler_oom_handler(struct panvk_device *dev,
      cs_load32_to(&b, td_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(td_count));
      cs_move64_to(&b, zero, 0);
-     cs_wait_slot(&b, SB_ID(LS));
 
      cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) {
        /* Load completed chunks */
        cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4);
-       cs_wait_slot(&b, SB_ID(LS));
 
        cs_finish_fragment(&b, false, completed_top, completed_bottom,
                           cs_now());
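Illustration (not part of the patch): every hunk above applies the same idiom,
sketched below. The sketch assumes, as the patch relies on, that cs_builder
tracks pending loads/stores on the LS scoreboard slot and waits automatically
when a register with a pending load is read, so only memory visibility still
needs an explicit flush. The helper name and its parameters are hypothetical.

   /* Hypothetical sketch, not driver code: increment a 32-bit counter in
    * memory with the cs_builder API as used after this patch. */
   static void
   sketch_bump_counter(struct cs_builder *b, struct cs_index addr,
                       struct cs_index val)
   {
      cs_load32_to(b, val, addr, 0);
      /* Pre-patch, an explicit cs_wait_slot(b, SB_ID(LS)) was required here
       * before reading `val`; the builder now inserts that wait itself. */
      cs_add32(b, val, val, 1);
      cs_store32(b, val, addr, 0);
      /* Only the visibility of the store still has to be requested
       * explicitly, which cs_flush_stores() expresses directly. */
      cs_flush_stores(b);
   }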