panvk: Remove explicit LS waits

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34808>
Christoph Pillmayer 2025-05-13 13:52:16 +00:00 committed by Marge Bot
parent c28497c355
commit f75569734e
8 changed files with 31 additions and 81 deletions
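The same mechanical change runs through all eight files: blanket cs_wait_slot() waits on the load/store (LS) scoreboard slot are deleted, and call sites that still need ordering use the cs_flush_loads()/cs_flush_stores() helpers instead. A minimal before/after sketch of the pattern, assembled from calls that appear in the hunks below (an illustration of the pattern, not a verbatim excerpt):

   /* Before: each LOAD/STORE was followed by an explicit wait on the LS
    * scoreboard slot, stalling even when nothing depended on it yet. */
   cs_store32(b, error, debug_sync_addr,
              offsetof(struct panvk_cs_sync32, error));
   cs_wait_slot(b, SB_ID(LS));

   /* After: flush pending stores only; pending loads are flushed
    * separately with cs_flush_loads(b). */
   cs_store32(b, error, debug_sync_addr,
              offsetof(struct panvk_cs_sync32, error));
   cs_flush_stores(b);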


@@ -1937,7 +1937,6 @@ cs_exception_handler_end(struct cs_builder *b,
cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
handler->ctx.dump_addr_offset);
cs_wait_slot(b, handler->ctx.ls_sb_slot);
for (unsigned i = 0; i < num_ranges; ++i) {
unsigned reg_count = util_bitcount(masks[i]);
@@ -1946,7 +1945,7 @@ cs_exception_handler_end(struct cs_builder *b,
offset += reg_count * 4;
}
cs_wait_slot(b, handler->ctx.ls_sb_slot);
cs_flush_stores(b);
}
/* Now that the preamble is emitted, we can flush the instructions we have in
@@ -1959,7 +1958,6 @@ cs_exception_handler_end(struct cs_builder *b,
cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
handler->ctx.dump_addr_offset);
cs_wait_slot(b, handler->ctx.ls_sb_slot);
for (unsigned i = 0; i < num_ranges; ++i) {
unsigned reg_count = util_bitcount(masks[i]);
@@ -1968,7 +1966,7 @@ cs_exception_handler_end(struct cs_builder *b,
offset += reg_count * 4;
}
cs_wait_slot(b, handler->ctx.ls_sb_slot);
cs_flush_loads(b);
}
/* Fill the rest of the buffer with NOPs. */
@@ -2005,10 +2003,9 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
* access. Use cs_trace_field_offset() to get an offset taking this
* pre-increment into account. */
cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
cs_wait_slot(b, ctx->ls_sb_slot);
cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}
#define cs_trace_field_offset(__type, __field) \
@@ -2044,7 +2041,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
cs_trace_field_offset(run_fragment, sr));
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}
#if PAN_ARCH >= 12
@@ -2087,7 +2084,7 @@ cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_idvs2, sr[i]));
cs_store(b, cs_reg_tuple(b, 64, 2), tracebuf_addr, BITFIELD_MASK(2),
cs_trace_field_offset(run_idvs2, sr[64]));
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}
#else
struct cs_run_idvs_trace {
@@ -2130,7 +2127,7 @@ cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_idvs, sr[i]));
cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
cs_trace_field_offset(run_idvs, sr[48]));
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}
#endif
@@ -2166,7 +2163,7 @@ cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_compute, sr[i]));
cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
cs_trace_field_offset(run_compute, sr[32]));
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}
static inline void
@@ -2197,5 +2194,5 @@ cs_trace_run_compute_indirect(struct cs_builder *b,
cs_trace_field_offset(run_compute, sr[i]));
cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
cs_trace_field_offset(run_compute, sr[32]));
cs_wait_slot(b, ctx->ls_sb_slot);
cs_flush_stores(b);
}


@@ -144,7 +144,6 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
cs_move32_to(b, one, 1);
cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, debug_sync_addr, debug_sync_addr,
sizeof(struct panvk_cs_sync32) * subqueue);
cs_load32_to(b, error, debug_sync_addr,
@@ -162,7 +161,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
/* Overwrite the sync error with the first error we encountered. */
cs_store32(b, error, debug_sync_addr,
offsetof(struct panvk_cs_sync32, error));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
}
}
@@ -589,7 +588,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
cs_move64_to(b, add_val, 1);
cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
@@ -607,7 +605,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);
cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
@@ -628,7 +625,6 @@ panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS));
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \


@@ -227,10 +227,9 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
if (shader->info.tls_size) {
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS));
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
cs_update_compute_ctx(b) {
@@ -279,7 +278,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_scratch_reg64(b, 0), BITFIELD_MASK(3), 0);
cs_move64_to(b, cs_scratch_reg64(b, 0),
cmdbuf->state.compute.push_uniforms);
cs_wait_slot(b, SB_ID(LS));
if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
cs_store32(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X),
@@ -302,7 +300,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
shader, sysval_offset(compute, num_work_groups.z)));
}
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
} else {
cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X),
info->direct.wg_count.x);
@@ -345,7 +343,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
@@ -369,7 +366,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
clear_dirty_after_dispatch(cmdbuf);


@@ -769,7 +769,6 @@ cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
cs_load64_to(
b, ringbuf_sync, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
cs_wait_slot(b, SB_ID(LS));
/* Wait for the other end to release memory. */
cs_move32_to(b, sz_reg, size - 1);
@@ -793,7 +792,6 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS));
/* Update the relative position and absolute address. */
cs_add32(b, ptr_lo, ptr_lo, size);
@@ -813,7 +811,7 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
static VkResult
@@ -927,8 +925,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
cs_wait_slot(b, SB_ID(LS));
/* Take care of the tiler desc with layer_offset=0 outside of the loop. */
cs_move32_to(b, cs_scratch_reg32(b, 4),
MIN2(cmdbuf->state.gfx.render.layer_count - 1,
@@ -942,8 +938,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS));
uint32_t remaining_layers =
td_count > 1
? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
@@ -970,7 +964,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS));
cs_update_vt_ctx(b)
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
@@ -1006,8 +999,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS));
cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
MAX_LAYERS_PER_TILER_DESC << 8);
@@ -1018,6 +1009,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
}
}
/* Flush all stores to tiler_ctx_addr. */
cs_flush_stores(b);
/* Then we change the scoreboard slot used for iterators. */
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@@ -1230,7 +1224,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context,
render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, dst_fbd_ptr, cur_tiler,
pan_size(TILER_CONTEXT) * td_count);
}
@@ -1258,16 +1251,17 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16),
pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off);
}
cs_wait_slot(b, SB_ID(LS));
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
cs_wait_slot(b, SB_ID(LS));
}
cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
cs_add32(b, pass_count, pass_count, -1);
}
/* Finish stores to pass_dst_fbd_ptr. */
cs_flush_stores(b);
cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
cs_update_frag_ctx(b)
cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
@@ -1608,15 +1602,13 @@ wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
cs_load64_to(
b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
cs_wait_slot(b, SB_ID(LS));
cs_store64(b, prev_oq_node_reg, oq_node_reg,
offsetof(struct panvk_cs_occlusion_query, next));
cs_wait_slot(b, SB_ID(LS));
}
cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
return VK_SUCCESS;
}
@@ -2260,9 +2252,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
cs_move32_to(b, max_draw_count, draw->indirect.draw_count);
cs_move64_to(b, draw_params_addr, draw->indirect.count_buffer_dev_addr);
cs_load32_to(b, draw_count, draw_params_addr, 0);
/* wait for draw_count to load from buffer */
cs_wait_slot(b, SB_ID(LS));
cs_umin32(b, draw_count, draw_count, max_draw_count);
} else {
cs_move32_to(b, draw_count, draw->indirect.draw_count);
@@ -2283,9 +2272,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
draw_params_addr, reg_mask, 0);
}
/* Wait for the SR33-37 indirect buffer load. */
cs_wait_slot(b, SB_ID(LS));
if (patch_faus) {
if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
cs_store32(b, cs_sr_reg32(b, IDVS, VERTEX_OFFSET), vs_fau_addr,
@@ -2298,10 +2284,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
shader_remapped_sysval_offset(
vs, sysval_offset(graphics, vs.base_instance)));
}
/* Wait for the store using SR-37 as src to finish, so we can
* overwrite it. */
cs_wait_slot(b, SB_ID(LS));
}
if (patch_attribs != 0) {
@@ -2318,7 +2300,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
cs_load32_to(b, attrib_offset, vs_drv_set,
pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t)));
cs_wait_slot(b, SB_ID(LS));
/* Emulated immediate multiply: we walk the bits in
* base_instance, and accumulate (stride << bit_pos) if the bit
@@ -2349,7 +2330,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
cs_store32(b, attrib_offset, vs_drv_set,
pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t)));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
}
}
@@ -2629,7 +2610,6 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf)
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
/* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
* skip an ADD operation on the syncobjs pointer. */
@@ -2660,14 +2640,13 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf)
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
/* Update the vertex seqno. */
++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
} else {
cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render));
cs_wait_slot(b, SB_ID(LS));
}
}
@@ -2682,7 +2661,6 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, vt_sync_point,
cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
@@ -2741,7 +2719,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
static VkResult
@@ -2828,7 +2806,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_load32_to(
b, counter, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
cs_wait_slot(b, SB_ID(LS));
cs_if(b, MALI_CS_CONDITION_GREATER, counter)
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
@@ -2896,8 +2873,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
render.desc_ringbuf.syncobj));
}
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
cs_move32_to(b, tiler_count, td_count);
@@ -2909,12 +2884,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); \
if (td_count == 1) { \
cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
cs_wait_slot(b, SB_ID(LS)); \
cs_finish_fragment(b, true, completed_top, completed_bottom, async); \
} else if (td_count > 1) { \
cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \
cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
cs_wait_slot(b, SB_ID(LS)); \
cs_finish_fragment(b, false, completed_top, completed_bottom, \
async); \
cs_update_frag_ctx(b) \
@@ -2937,20 +2910,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_load64_to( \
b, oq_chain, cs_subqueue_ctx_reg(b), \
offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
cs_wait_slot(b, SB_ID(LS)); \
/* For WAR dependency on subqueue_context.render.oq_chain. */ \
cs_flush_loads(b); \
/* We use oq_syncobj as a placeholder to reset the oq_chain. */ \
cs_move64_to(b, oq_syncobj, 0); \
cs_store64( \
b, oq_syncobj, cs_subqueue_ctx_reg(b), \
offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
cs_wait_slot(b, SB_ID(LS)); \
cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \
cs_load64_to(b, oq_syncobj, oq_chain, \
offsetof(struct panvk_cs_occlusion_query, syncobj)); \
cs_wait_slot(b, SB_ID(LS)); \
cs_load64_to(b, oq_chain, oq_chain, \
offsetof(struct panvk_cs_occlusion_query, next)); \
cs_wait_slot(b, SB_ID(LS)); \
cs_sync32_set( \
b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \
cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \
@@ -2976,7 +2947,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
/* Update the ring buffer position. */
if (free_render_descs) {


@@ -41,7 +41,6 @@ panvk_per_arch(CmdResetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event,
(i * sizeof(struct panvk_cs_sync32)));
cs_load32_to(b, seqno, sync_addr,
offsetof(struct panvk_cs_sync32, seqno));
cs_wait_slot(b, SB_ID(LS));
cs_match(b, seqno, cmp_scratch) {
cs_case(b, 0) {
@@ -83,7 +82,6 @@ panvk_per_arch(CmdSetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event,
(i * sizeof(struct panvk_cs_sync32)));
cs_load32_to(b, seqno, sync_addr,
offsetof(struct panvk_cs_sync32, seqno));
cs_wait_slot(b, SB_ID(LS));
cs_match(b, seqno, cmp_scratch) {
cs_case(b, 0) {


@@ -95,10 +95,9 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
if (shader->info.tls_size) {
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS));
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
cs_update_compute_ctx(b) {
@@ -156,7 +155,6 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
cs_wait_slot(b, SB_ID(LS));
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
@@ -180,7 +178,7 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, iter_sb));
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;


@@ -113,7 +113,7 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd,
/* reset_oq_batch() only does the stores, we need to flush those explicitly
* here. */
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
/* We flush the caches to make the new value visible to the CPU. */
struct cs_index flush_id = cs_scratch_reg32(b, 0);
@@ -152,7 +152,7 @@ panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd,
cs_move64_to(b, report_addr_gpu, report_addr);
cs_move64_to(b, clear_value, 0);
cs_store64(b, clear_value, report_addr_gpu, 0);
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
static void
@@ -218,9 +218,6 @@ copy_oq_result_batch(struct cs_builder *b,
cs_load32_to(b, avail, avail_addr, i * sizeof(struct panvk_cs_sync32));
}
/* Flush the loads. */
cs_wait_slot(b, SB_ID(LS));
for (uint32_t i = 0; i < query_count; i++) {
struct cs_index store_src =
cs_reg_tuple(b, scratch_regs.reg + (i * regs_per_copy), regs_per_copy);
@@ -230,7 +227,7 @@ copy_oq_result_batch(struct cs_builder *b,
}
/* Flush the stores. */
cs_wait_slot(b, SB_ID(LS));
cs_flush_stores(b);
}
static void


@@ -77,7 +77,6 @@ generate_tiler_oom_handler(struct panvk_device *dev,
* rendering has already been triggered */
cs_load32_to(&b, counter, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(counter));
cs_wait_slot(&b, SB_ID(LS));
cs_if(&b, MALI_CS_CONDITION_GREATER, counter)
cs_load64_to(&b, fbd_ptr, subqueue_ctx,
@@ -88,7 +87,6 @@ generate_tiler_oom_handler(struct panvk_device *dev,
cs_load32_to(&b, layer_count, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
cs_wait_slot(&b, SB_ID(LS));
cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
cs_trace_run_fragment(&b, &tracing_ctx, cs_scratch_reg_tuple(&b, 8, 4),
@@ -109,12 +107,10 @@ generate_tiler_oom_handler(struct panvk_device *dev,
cs_load32_to(&b, td_count, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(td_count));
cs_move64_to(&b, zero, 0);
cs_wait_slot(&b, SB_ID(LS));
cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) {
/* Load completed chunks */
cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4);
cs_wait_slot(&b, SB_ID(LS));
cs_finish_fragment(&b, false, completed_top, completed_bottom,
cs_now());
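
For context, a plausible shape for the flush helpers themselves, assuming the builder tracks whether any LS access is still outstanding; the pending_stores and ls_sb_slot fields below are hypothetical names, not the real cs_builder layout:

   static inline void
   cs_flush_stores(struct cs_builder *b)
   {
      /* No store outstanding: emitting a wait would stall for nothing. */
      if (!b->pending_stores) /* hypothetical tracking flag */
         return;

      cs_wait_slot(b, b->ls_sb_slot); /* hypothetical slot field */
      b->pending_stores = false;
   }

cs_flush_loads() would mirror this for loads, which is what allows dropping a wait after a load when only a later store actually needs ordering.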