From 53f780ec916d1cd4773af6cec40c3149042d852f Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Tue, 13 May 2025 12:56:25 +0200 Subject: [PATCH] panfrost: Remove progress_increment from all CS builders Progression logic has been deprecated since v11 and we don't plan to use it. Let's get rid of all increment logic on all instructions. Signed-off-by: Mary Guillemard Acked-by: Boris Brezillon Part-of: --- src/gallium/drivers/panfrost/pan_csf.c | 52 ++++----- src/gallium/drivers/panfrost/pan_precomp.c | 3 +- src/panfrost/genxml/cs_builder.h | 107 ++++++++---------- src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c | 22 ++-- .../vulkan/csf/panvk_vX_cmd_dispatch.c | 16 +-- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 93 ++++++++------- src/panfrost/vulkan/csf/panvk_vX_cmd_event.c | 4 +- .../vulkan/csf/panvk_vX_cmd_precomp.c | 10 +- src/panfrost/vulkan/csf/panvk_vX_cmd_query.c | 14 +-- .../vulkan/csf/panvk_vX_exception_handler.c | 17 ++- src/panfrost/vulkan/csf/panvk_vX_utrace.c | 8 +- 11 files changed, 164 insertions(+), 182 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_csf.c b/src/gallium/drivers/panfrost/pan_csf.c index 9cf97d3db23..b40b18a7bee 100644 --- a/src/gallium/drivers/panfrost/pan_csf.c +++ b/src/gallium/drivers/panfrost/pan_csf.c @@ -148,7 +148,7 @@ csf_oom_handler_init(struct panfrost_context *ctx) /* Use different framebuffer descriptor depending on whether incremental * rendering has already been triggered */ cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter)); - cs_wait_slot(&b, 0, false); + cs_wait_slot(&b, 0); cs_if(&b, MALI_CS_CONDITION_GREATER, counter) { cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx, FBD_OFFSET(MIDDLE)); @@ -164,12 +164,12 @@ csf_oom_handler_init(struct panfrost_context *ctx) FIELD_OFFSET(bbox_max)); cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0); cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0); - cs_wait_slot(&b, 0, false); + cs_wait_slot(&b, 0); /* Run the fragment job and wait */ cs_select_sb_entries_for_async_ops(&b, 3); - cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); - cs_wait_slot(&b, 3, false); + cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); + cs_wait_slot(&b, 3); /* Increment counter */ cs_add32(&b, counter, counter, 1); @@ -177,9 +177,9 @@ csf_oom_handler_init(struct panfrost_context *ctx) /* Load completed chunks */ cs_load64_to(&b, tiler_ctx, tiler_oom_ctx, FIELD_OFFSET(tiler_desc)); - cs_wait_slot(&b, 0, false); + cs_wait_slot(&b, 0); cs_load_to(&b, completed_chunks, tiler_ctx, BITFIELD_MASK(4), 10 * 4); - cs_wait_slot(&b, 0, false); + cs_wait_slot(&b, 0); cs_finish_fragment(&b, false, completed_top, completed_bottom, cs_now()); @@ -195,7 +195,7 @@ csf_oom_handler_init(struct panfrost_context *ctx) MALI_CS_OTHER_FLUSH_MODE_INVALIDATE, flush_id, cs_defer(0, 0)); - cs_wait_slot(&b, 0, false); + cs_wait_slot(&b, 0); cs_select_sb_entries_for_async_ops(&b, 2); } @@ -347,7 +347,7 @@ csf_emit_batch_end(struct panfrost_batch *batch) struct cs_builder *b = batch->csf.cs.builder; /* Barrier to let everything finish */ - cs_wait_slots(b, BITFIELD_MASK(8), false); + cs_wait_slots(b, BITFIELD_MASK(8)); if (dev->debug & PAN_DBG_SYNC) { /* Get the CS state */ @@ -367,7 +367,7 @@ csf_emit_batch_end(struct panfrost_batch *batch) cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_OTHER_FLUSH_MODE_INVALIDATE, flush_id, cs_defer(0, 0)); - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); /* Finish the command stream */ if 
(!cs_is_valid(batch->csf.cs.builder)) @@ -821,8 +821,8 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, if (batch->draw_count > 0) { /* Finish tiling and wait for IDVS and tiling */ - cs_finish_tiling(b, false); - cs_wait_slot(b, 2, false); + cs_finish_tiling(b); + cs_wait_slot(b, 2); cs_vt_end(b, cs_now()); } @@ -841,7 +841,7 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, if (batch->draw_count > 0) { struct cs_index counter = cs_reg32(b, 78); cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0); - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); cs_if(b, MALI_CS_CONDITION_GREATER, counter) { cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), GET_FBD(oom_ctx, LAST).gpu); @@ -849,8 +849,8 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, } /* Run the fragment job and wait */ - cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); - cs_wait_slot(b, 2, false); + cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER); + cs_wait_slot(b, 2); /* Gather freed heap chunks and add them to the heap context free list * so they can be re-used next time the tiler heap runs out of chunks. */ @@ -862,7 +862,7 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch, cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.valhall.desc); cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4), 40); - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now()); } } @@ -950,7 +950,7 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch, cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0); /* Wait for the load */ - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); /* Copy to FAU */ for (unsigned i = 0; i < 3; ++i) { @@ -962,7 +962,7 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch, } /* Wait for the stores */ - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); /* Use run_compute with a set task axis instead of run_compute_indirect as * run_compute_indirect has been found to cause intermittent hangs. This * this is somewhat offset by run_compute being a native instruction. 
*/ unsigned task_axis = MALI_TASK_AXIS_X; cs_run_compute(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg), task_axis, - false, cs_shader_res_sel(0, 0, 0, 0)); + cs_shader_res_sel(0, 0, 0, 0)); } else { /* Set size in workgroups per dimension immediately */ cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), info->grid[0]); @@ -1008,7 +1008,7 @@ GENX(csf_launch_grid)(struct panfrost_batch *batch, assert(task_axis <= MALI_TASK_AXIS_Z); assert(task_increment > 0); - cs_run_compute(b, task_increment, task_axis, false, + cs_run_compute(b, task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); } } @@ -1050,10 +1050,10 @@ GENX(csf_launch_xfb)(struct panfrost_batch *batch, csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX, batch->rsd[PIPE_SHADER_VERTEX]); /* force a barrier to avoid read/write sync issues with buffers */ - cs_wait_slot(b, 2, false); + cs_wait_slot(b, 2); /* XXX: Choose correctly */ - cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0)); + cs_run_compute(b, 1, MALI_TASK_AXIS_Z, cs_shader_res_sel(0, 0, 0, 0)); } static void @@ -1364,10 +1364,10 @@ GENX(csf_launch_draw)(struct panfrost_batch *batch, } #if PAN_ARCH >= 12 - cs_run_idvs2(b, flags_override, false, true, drawid, + cs_run_idvs2(b, flags_override, true, drawid, MALI_IDVS_SHADING_MODE_EARLY); #else - cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0), + cs_run_idvs(b, flags_override, true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), drawid); #endif } @@ -1409,12 +1409,12 @@ GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch, cs_move32_to(b, cs_sr_reg32(b, IDVS, INDEX_BUFFER_SIZE), 0); } - cs_wait_slot(b, 0, false); + cs_wait_slot(b, 0); #if PAN_ARCH >= 12 - cs_run_idvs2(b, flags_override, false, true, drawid, + cs_run_idvs2(b, flags_override, true, drawid, MALI_IDVS_SHADING_MODE_EARLY); #else - cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0), + cs_run_idvs(b, flags_override, true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), drawid); #endif diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index 22133415ec1..75f5c2638e4 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -386,7 +386,6 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch, assert(task_axis <= MALI_TASK_AXIS_Z); assert(task_increment > 0); - cs_run_compute(b, task_increment, task_axis, false, - cs_shader_res_sel(0, 0, 0, 0)); + cs_run_compute(b, task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); #endif } diff --git a/src/panfrost/genxml/cs_builder.h b/src/panfrost/genxml/cs_builder.h index 08ab313e82f..df1af320165 100644 --- a/src/panfrost/genxml/cs_builder.h +++ b/src/panfrost/genxml/cs_builder.h @@ -598,7 +598,7 @@ cs_flush_block_instrs(struct cs_builder *b) sizeof(uint64_t)); /* Drop the prev_load_ip_target value and replace it by the final - * IP. */ + * IP. 
*/ *instr &= ~BITFIELD64_MASK(32); *instr |= ip; @@ -1110,13 +1110,12 @@ cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm) } static inline void -cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc) +cs_wait_slots(struct cs_builder *b, unsigned wait_mask) { struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker; cs_emit(b, WAIT, I) { I.wait_mask = wait_mask; - I.progress_increment = progress_inc; } /* We don't do advanced tracking of cs_defer(), and assume that @@ -1130,11 +1129,11 @@ cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc) } static inline void -cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc) +cs_wait_slot(struct cs_builder *b, unsigned slot) { assert(slot < 8 && "invalid slot"); - cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc); + cs_wait_slots(b, BITFIELD_BIT(slot)); } struct cs_shader_res_sel { @@ -1154,13 +1153,11 @@ cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd) static inline void cs_run_compute(struct cs_builder *b, unsigned task_increment, - enum mali_task_axis task_axis, bool progress_inc, - struct cs_shader_res_sel res_sel) + enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_COMPUTE, I) { I.task_increment = task_increment; I.task_axis = task_axis; - I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; @@ -1170,12 +1167,11 @@ cs_run_compute(struct cs_builder *b, unsigned task_increment, #if PAN_ARCH == 10 static inline void -cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc, +cs_run_tiling(struct cs_builder *b, uint32_t flags_override, struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_TILING, I) { I.flags_override = flags_override; - I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; @@ -1186,13 +1182,12 @@ cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc, #if PAN_ARCH >= 12 static inline void -cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool progress_inc, - bool malloc_enable, struct cs_index draw_id, +cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool malloc_enable, + struct cs_index draw_id, enum mali_idvs_shading_mode vertex_shading_mode) { cs_emit(b, RUN_IDVS2, I) { I.flags_override = flags_override; - I.progress_increment = progress_inc; I.malloc_enable = malloc_enable; I.vertex_shading_mode = vertex_shading_mode; @@ -1206,13 +1201,12 @@ cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool progress_inc, } #else static inline void -cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc, - bool malloc_enable, struct cs_shader_res_sel varying_sel, +cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable, + struct cs_shader_res_sel varying_sel, struct cs_shader_res_sel frag_sel, struct cs_index draw_id) { cs_emit(b, RUN_IDVS, I) { I.flags_override = flags_override; - I.progress_increment = progress_inc; I.malloc_enable = malloc_enable; if (draw_id.type == CS_INDEX_UNDEF) { @@ -1242,31 +1236,29 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc, static inline void cs_run_fragment(struct cs_builder *b, bool enable_tem, - enum mali_tile_render_order tile_order, bool progress_inc) + enum mali_tile_render_order tile_order) { cs_emit(b, RUN_FRAGMENT, I) { I.enable_tem = enable_tem; I.tile_order = tile_order; - 
I.progress_increment = progress_inc; } } static inline void cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override, - bool progress_inc, struct cs_index dcd) + struct cs_index dcd) { cs_emit(b, RUN_FULLSCREEN, I) { I.flags_override = flags_override; - I.progress_increment = progress_inc; I.dcd = cs_src64(b, dcd); } } static inline void -cs_finish_tiling(struct cs_builder *b, bool progress_inc) +cs_finish_tiling(struct cs_builder *b) { cs_emit(b, FINISH_TILING, I) - I.progress_increment = progress_inc; + ; } static inline void @@ -1596,11 +1588,10 @@ cs_progress_load(struct cs_builder *b, struct cs_index dst) static inline void cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task, - bool progress_inc, struct cs_shader_res_sel res_sel) + struct cs_shader_res_sel res_sel) { cs_emit(b, RUN_COMPUTE_INDIRECT, I) { I.workgroups_per_task = wg_per_task; - I.progress_increment = progress_inc; I.srt_select = res_sel.srt; I.spd_select = res_sel.spd; I.tsd_select = res_sel.tsd; @@ -1924,7 +1915,7 @@ cs_exception_handler_end(struct cs_builder *b, cs_load64_to(b, addr_reg, handler->ctx.ctx_reg, handler->ctx.dump_addr_offset); - cs_wait_slot(b, handler->ctx.ls_sb_slot, false); + cs_wait_slot(b, handler->ctx.ls_sb_slot); for (unsigned i = 0; i < num_ranges; ++i) { unsigned reg_count = util_bitcount(masks[i]); @@ -1933,7 +1924,7 @@ cs_exception_handler_end(struct cs_builder *b, offset += reg_count * 4; } - cs_wait_slot(b, handler->ctx.ls_sb_slot, false); + cs_wait_slot(b, handler->ctx.ls_sb_slot); } /* Now that the preamble is emitted, we can flush the instructions we have in @@ -1946,7 +1937,7 @@ cs_exception_handler_end(struct cs_builder *b, cs_load64_to(b, addr_reg, handler->ctx.ctx_reg, handler->ctx.dump_addr_offset); - cs_wait_slot(b, handler->ctx.ls_sb_slot, false); + cs_wait_slot(b, handler->ctx.ls_sb_slot); for (unsigned i = 0; i < num_ranges; ++i) { unsigned reg_count = util_bitcount(masks[i]); @@ -1955,7 +1946,7 @@ cs_exception_handler_end(struct cs_builder *b, offset += reg_count * 4; } - cs_wait_slot(b, handler->ctx.ls_sb_slot, false); + cs_wait_slot(b, handler->ctx.ls_sb_slot); } /* Fill the rest of the buffer with NOPs. */ @@ -1992,10 +1983,10 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx, * access. Use cs_trace_field_offset() to get an offset taking this * pre-increment into account. */ cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size); cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } #define cs_trace_field_offset(__type, __field) \ @@ -2010,10 +2001,10 @@ struct cs_run_fragment_trace { static inline void cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, bool enable_tem, - enum mali_tile_render_order tile_order, bool progress_inc) + enum mali_tile_render_order tile_order) { if (likely(!ctx->enabled)) { - cs_run_fragment(b, enable_tem, tile_order, progress_inc); + cs_run_fragment(b, enable_tem, tile_order); return; } @@ -2026,12 +2017,12 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx, /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. 
*/ cs_load_ip_to(b, data); - cs_run_fragment(b, enable_tem, tile_order, progress_inc); + cs_run_fragment(b, enable_tem, tile_order); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip)); cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7), cs_trace_field_offset(run_fragment, sr)); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } #if PAN_ARCH >= 12 @@ -2045,12 +2036,11 @@ struct cs_run_idvs2_trace { static inline void cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, uint32_t flags_override, - bool progress_inc, bool malloc_enable, - struct cs_index draw_id, + bool malloc_enable, struct cs_index draw_id, enum mali_idvs_shading_mode vertex_shading_mode) { if (likely(!ctx->enabled)) { - cs_run_idvs2(b, flags_override, progress_inc, malloc_enable, draw_id, + cs_run_idvs2(b, flags_override, malloc_enable, draw_id, vertex_shading_mode); return; } @@ -2063,8 +2053,7 @@ cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx, /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. */ cs_load_ip_to(b, data); - cs_run_idvs2(b, flags_override, progress_inc, malloc_enable, draw_id, - vertex_shading_mode); + cs_run_idvs2(b, flags_override, malloc_enable, draw_id, vertex_shading_mode); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs2, ip)); if (draw_id.type != CS_INDEX_UNDEF) @@ -2076,7 +2065,7 @@ cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx, cs_trace_field_offset(run_idvs2, sr[i])); cs_store(b, cs_reg_tuple(b, 64, 2), tracebuf_addr, BITFIELD_MASK(2), cs_trace_field_offset(run_idvs2, sr[64])); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } #else struct cs_run_idvs_trace { @@ -2089,27 +2078,25 @@ struct cs_run_idvs_trace { static inline void cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, uint32_t flags_override, - bool progress_inc, bool malloc_enable, - struct cs_shader_res_sel varying_sel, + bool malloc_enable, struct cs_shader_res_sel varying_sel, struct cs_shader_res_sel frag_sel, struct cs_index draw_id) { if (likely(!ctx->enabled)) { - cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel, - frag_sel, draw_id); + cs_run_idvs(b, flags_override, malloc_enable, varying_sel, frag_sel, + draw_id); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); - cs_trace_preamble(b, ctx, scratch_regs, - sizeof(struct cs_run_idvs_trace)); + cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_idvs_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. 
*/ cs_load_ip_to(b, data); - cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel, - frag_sel, draw_id); + cs_run_idvs(b, flags_override, malloc_enable, varying_sel, frag_sel, + draw_id); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip)); if (draw_id.type != CS_INDEX_UNDEF) @@ -2121,7 +2108,7 @@ cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx, cs_trace_field_offset(run_idvs, sr[i])); cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13), cs_trace_field_offset(run_idvs, sr[48])); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } #endif @@ -2133,24 +2120,23 @@ struct cs_run_compute_trace { static inline void cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, unsigned task_increment, - enum mali_task_axis task_axis, bool progress_inc, + enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel) { if (likely(!ctx->enabled)) { - cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel); + cs_run_compute(b, task_increment, task_axis, res_sel); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); - cs_trace_preamble(b, ctx, scratch_regs, - sizeof(struct cs_run_compute_trace)); + cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. */ cs_load_ip_to(b, data); - cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel); + cs_run_compute(b, task_increment, task_axis, res_sel); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip)); for (unsigned i = 0; i < 32; i += 16) @@ -2158,31 +2144,30 @@ cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx, cs_trace_field_offset(run_compute, sr[i])); cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8), cs_trace_field_offset(run_compute, sr[32])); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } static inline void cs_trace_run_compute_indirect(struct cs_builder *b, const struct cs_tracing_ctx *ctx, struct cs_index scratch_regs, - unsigned wg_per_task, bool progress_inc, + unsigned wg_per_task, struct cs_shader_res_sel res_sel) { if (likely(!ctx->enabled)) { - cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel); + cs_run_compute_indirect(b, wg_per_task, res_sel); return; } struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg); struct cs_index data = cs_reg64(b, scratch_regs.reg + 2); - cs_trace_preamble(b, ctx, scratch_regs, - sizeof(struct cs_run_compute_trace)); + cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace)); /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP * won't point to the right instruction. 
*/ cs_load_ip_to(b, data); - cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel); + cs_run_compute_indirect(b, wg_per_task, res_sel); cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip)); for (unsigned i = 0; i < 32; i += 16) @@ -2190,5 +2175,5 @@ cs_trace_run_compute_indirect(struct cs_builder *b, cs_trace_field_offset(run_compute, sr[i])); cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8), cs_trace_field_offset(run_compute, sr[32])); - cs_wait_slot(b, ctx->ls_sb_slot, false); + cs_wait_slot(b, ctx->ls_sb_slot); } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c index 08db1b56469..571561c7459 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c @@ -128,11 +128,11 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) struct cs_index flush_id = cs_scratch_reg32(b, 0); cs_move32_to(b, flush_id, 0); - cs_wait_slots(b, SB_ALL_MASK, false); + cs_wait_slots(b, SB_ALL_MASK); cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_OTHER_FLUSH_MODE_NONE, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); - cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(b, SB_ID(IMM_FLUSH)); /* If we're in sync/trace more, we signal the debug object. */ if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) { @@ -144,12 +144,12 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) cs_move32_to(b, one, 1); cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, debug.syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, debug_sync_addr, debug_sync_addr, sizeof(struct panvk_cs_sync32) * subqueue); cs_load32_to(b, error, debug_sync_addr, offsetof(struct panvk_cs_sync32, error)); - cs_wait_slots(b, SB_ALL_MASK, false); + cs_wait_slots(b, SB_ALL_MASK); if (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, one, debug_sync_addr, cs_now()); @@ -162,7 +162,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) /* Overwrite the sync error with the first error we encountered. */ cs_store32(b, error, debug_sync_addr, offsetof(struct panvk_cs_sync32, error)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } } } @@ -566,7 +566,7 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i]; if (deps.src[i].wait_sb_mask) - cs_wait_slots(b, deps.src[i].wait_sb_mask, false); + cs_wait_slots(b, deps.src[i].wait_sb_mask); struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush; if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE || @@ -577,7 +577,7 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, cs_move32_to(b, flush_id, 0); cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); - cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(b, SB_ID(IMM_FLUSH)); } /* If no one waits on us, there's no point signaling the sync object. 
*/ @@ -589,7 +589,7 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i); cs_move64_to(b, add_val, 1); cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, @@ -607,7 +607,7 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer, cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j); cs_add64(b, wait_val, cs_progress_seqno_reg(b, j), @@ -628,12 +628,12 @@ panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf, cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, iter_sb)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_match(b, iter_sb, cmp_scratch) { #define CASE(x) \ cs_case(b, x) { \ - cs_wait_slot(b, SB_ITER(x), false); \ + cs_wait_slot(b, SB_ITER(x)); \ cs_select_sb_entries_for_async_ops(b, SB_ITER(x)); \ } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index 547e640e288..02e822ae577 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -227,10 +227,10 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) if (shader->info.tls_size) { cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu); cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_move64_to(b, cs_scratch_reg64(b, 0), tsd); cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } cs_update_compute_ctx(b) { @@ -279,7 +279,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_scratch_reg64(b, 0), BITFIELD_MASK(3), 0); cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.compute.push_uniforms); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); if (shader_uses_sysval(shader, compute, num_work_groups.x)) { cs_store32(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), @@ -302,7 +302,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) shader, sysval_offset(compute, num_work_groups.z))); } - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } else { cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_X), info->direct.wg_count.x); @@ -326,7 +326,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) * this is somewhat offset by run_compute being a native instruction. 
*/ unsigned task_axis = MALI_TASK_AXIS_X; cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - wg_per_task, task_axis, false, + wg_per_task, task_axis, cs_shader_res_sel(0, 0, 0, 0)); } else { unsigned task_axis = MALI_TASK_AXIS_X; @@ -334,7 +334,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) panvk_per_arch(calculate_task_axis_and_increment)( shader, phys_dev, &task_axis, &task_increment); cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - task_increment, task_axis, false, + task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); } cs_req_res(b, 0); @@ -347,7 +347,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), BITFIELD_MASK(3), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); @@ -371,7 +371,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, iter_sb)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; clear_dirty_after_dispatch(cmdbuf); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 49ce920bbe3..97247d6b885 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -769,7 +769,7 @@ cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size) cs_load64_to( b, ringbuf_sync, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Wait for the other end to release memory. */ cs_move32_to(b, sz_reg, size - 1); @@ -793,7 +793,7 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size, b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b), BITFIELD_MASK(3), offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Update the relative position and absolute address. */ cs_add32(b, ptr_lo, ptr_lo, size); @@ -813,7 +813,7 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size, b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b), BITFIELD_MASK(3), offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } static VkResult @@ -927,7 +927,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) cs_move64_to(b, cs_scratch_reg64(b, 12), 0); cs_move64_to(b, cs_scratch_reg64(b, 14), 0); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Take care of the tiler desc with layer_offset=0 outside of the loop. 
*/ cs_move32_to(b, cs_scratch_reg32(b, 4), @@ -942,7 +942,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr, BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); uint32_t remaining_layers = td_count > 1 @@ -970,7 +970,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64); cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr, BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_update_vt_ctx(b) cs_add64(b, tiler_ctx_addr, tiler_ctx_addr, @@ -1006,7 +1006,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr, BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4), MAX_LAYERS_PER_TILER_DESC << 8); @@ -1230,7 +1230,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, dst_fbd_ptr, cur_tiler, pan_size(TILER_CONTEXT) * td_count); } @@ -1258,10 +1258,10 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off); } - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr, BITFIELD_MASK(16), fbd_off); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset); cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset); @@ -1608,15 +1608,15 @@ wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf) cs_load64_to( b, prev_oq_node_reg, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_store64(b, prev_oq_node_reg, oq_node_reg, offsetof(struct panvk_cs_occlusion_query, next)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); return VK_SUCCESS; } @@ -2084,11 +2084,11 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) { #if PAN_ARCH >= 12 cs_trace_run_idvs2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - flags_override.opaque[0], false, true, cs_undef(), + flags_override.opaque[0], true, cs_undef(), MALI_IDVS_SHADING_MODE_EARLY); #else cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - flags_override.opaque[0], false, true, + flags_override.opaque[0], true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), cs_undef()); #endif @@ -2107,11 +2107,11 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) } else { #if PAN_ARCH >= 12 cs_trace_run_idvs2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - flags_override.opaque[0], false, true, cs_undef(), - MALI_IDVS_SHADING_MODE_EARLY); + flags_override.opaque[0], true, cs_undef(), + MALI_IDVS_SHADING_MODE_EARLY); #else cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - 
flags_override.opaque[0], false, true, + flags_override.opaque[0], true, cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), cs_undef()); #endif @@ -2264,7 +2264,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, cs_load32_to(b, draw_count, draw_params_addr, 0); /* wait for draw_count to load from buffer */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_umin32(b, draw_count, draw_count, max_draw_count); } else { cs_move32_to(b, draw_count, draw->indirect.draw_count); @@ -2288,7 +2288,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, } /* Wait for the SR33-37 indirect buffer load. */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); if (patch_faus) { if (shader_uses_sysval(vs, graphics, vs.first_vertex)) { @@ -2305,7 +2305,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, /* Wait for the store using SR-37 as src to finish, so we can * overwrite it. */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } if (patch_attribs != 0) { @@ -2322,7 +2322,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, cs_load32_to(b, attrib_offset, vs_drv_set, pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t))); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Emulated immediate multiply: we walk the bits in * base_instance, and accumulate (stride << bit_pos) if the bit @@ -2353,7 +2353,7 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, cs_store32(b, attrib_offset, vs_drv_set, pan_size(ATTRIBUTE) * i + (2 * sizeof(uint32_t))); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } } } @@ -2368,13 +2368,12 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, #if PAN_ARCH >= 12 cs_trace_run_idvs2(b, tracing_ctx, tracing_scratch_regs, - flags_override.opaque[0], false, true, draw_id, - MALI_IDVS_SHADING_MODE_EARLY); + flags_override.opaque[0], true, draw_id, + MALI_IDVS_SHADING_MODE_EARLY); #else - cs_trace_run_idvs(b, tracing_ctx, tracing_scratch_regs, - flags_override.opaque[0], false, true, - cs_shader_res_sel(0, 0, 1, 0), - cs_shader_res_sel(2, 2, 2, 0), draw_id); + cs_trace_run_idvs( + b, tracing_ctx, tracing_scratch_regs, flags_override.opaque[0], true, + cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0), draw_id); #endif cs_add32(b, draw_count, draw_count, -1); @@ -2627,7 +2626,7 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf) if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) { /* Flush the tiling operations and signal the internal sync object. */ cs_req_res(b, CS_TILER_RES); - cs_finish_tiling(b, false); + cs_finish_tiling(b); cs_req_res(b, 0); struct cs_index sync_addr = cs_scratch_reg64(b, 0); @@ -2638,7 +2637,7 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf) cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), BITFIELD_MASK(3), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to * skip an ADD operation on the syncobjs pointer. */ @@ -2669,14 +2668,14 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf) cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, iter_sb)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Update the vertex seqno. 
*/ ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point; } else { cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, render)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } } @@ -2691,7 +2690,7 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf) cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, vt_sync_point, cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER), @@ -2750,7 +2749,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) cs_store32(b, layer_count, cs_subqueue_ctx_reg(b), TILER_OOM_CTX_FIELD_OFFSET(layer_count)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } static VkResult @@ -2837,7 +2836,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_load32_to( b, counter, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_if(b, MALI_CS_CONDITION_GREATER, counter) cs_update_frag_ctx(b) cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER), @@ -2854,7 +2853,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_flush_caches(b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE, MALI_CS_OTHER_FLUSH_MODE_INVALIDATE, length_reg, cs_defer(0x0, SB_ID(IMM_FLUSH))); - cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(b, SB_ID(IMM_FLUSH)); } cs_req_res(b, CS_FRAG_RES); @@ -2864,7 +2863,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf)); cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) { cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); + false, MALI_TILE_RENDER_ORDER_Z_ORDER); cs_add32(b, layer_count, layer_count, -1); cs_update_frag_ctx(b) @@ -2873,7 +2872,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } } else { cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); + false, MALI_TILE_RENDER_ORDER_Z_ORDER); } cs_req_res(b, 0); @@ -2907,7 +2906,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) render.desc_ringbuf.syncobj)); } - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); @@ -2920,12 +2919,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); \ if (td_count == 1) { \ cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ cs_finish_fragment(b, true, completed_top, completed_bottom, async); \ } else if (td_count > 1) { \ cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \ cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ cs_finish_fragment(b, false, completed_top, completed_bottom, \ async); \ cs_update_frag_ctx(b) \ @@ -2948,20 +2947,20 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_load64_to( \ b, oq_chain, cs_subqueue_ctx_reg(b), \ offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ /* We use oq_syncobj as a placeholder to reset the oq_chain. 
*/ \ cs_move64_to(b, oq_syncobj, 0); \ cs_store64( \ b, oq_syncobj, cs_subqueue_ctx_reg(b), \ offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \ cs_load64_to(b, oq_syncobj, oq_chain, \ offsetof(struct panvk_cs_occlusion_query, syncobj)); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ cs_load64_to(b, oq_chain, oq_chain, \ offsetof(struct panvk_cs_occlusion_query, next)); \ - cs_wait_slot(b, SB_ID(LS), false); \ + cs_wait_slot(b, SB_ID(LS)); \ cs_sync32_set( \ b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \ cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \ @@ -2987,7 +2986,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, iter_sb)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* Update the ring buffer position. */ if (free_render_descs) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c index ddfcff534a0..61519b6d820 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_event.c @@ -41,7 +41,7 @@ panvk_per_arch(CmdResetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event, (i * sizeof(struct panvk_cs_sync32))); cs_load32_to(b, seqno, sync_addr, offsetof(struct panvk_cs_sync32, seqno)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_match(b, seqno, cmp_scratch) { cs_case(b, 0) { @@ -83,7 +83,7 @@ panvk_per_arch(CmdSetEvent2)(VkCommandBuffer commandBuffer, VkEvent _event, (i * sizeof(struct panvk_cs_sync32))); cs_load32_to(b, seqno, sync_addr, offsetof(struct panvk_cs_sync32, seqno)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_match(b, seqno, cmp_scratch) { cs_case(b, 0) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index aafe0917fee..e1c5411e1d3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -95,10 +95,10 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, if (shader->info.tls_size) { cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu); cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_move64_to(b, cs_scratch_reg64(b, 0), tsd); cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } cs_update_compute_ctx(b) { @@ -146,7 +146,7 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, panvk_per_arch(calculate_task_axis_and_increment)( shader, phys_dev, &task_axis, &task_increment); cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), - task_increment, task_axis, false, + task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); cs_req_res(b, 0); @@ -158,7 +158,7 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), BITFIELD_MASK(3), offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); @@ -182,7 +182,7 @@ panvk_per_arch(dispatch_precomp)(struct 
panvk_precomp_ctx *ctx, cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, iter_sb)); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c index 1cd444d5205..4566288c6eb 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c @@ -93,7 +93,7 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, /* Wait on deferred sync to ensure all prior query operations have * completed */ - cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false); + cs_wait_slot(b, SB_ID(DEFERRED_SYNC)); struct cs_index addr = cs_scratch_reg64(b, 16); struct cs_index zero_regs = cs_scratch_reg_tuple(b, 0, 16); @@ -113,7 +113,7 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, /* reset_oq_batch() only does the stores, we need to flush those explicitly * here. */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); /* We flush the caches to make the new value visible to the CPU. */ struct cs_index flush_id = cs_scratch_reg32(b, 0); @@ -121,7 +121,7 @@ panvk_cmd_reset_occlusion_queries(struct panvk_cmd_buffer *cmd, cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_OTHER_FLUSH_MODE_NONE, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); - cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(b, SB_ID(IMM_FLUSH)); } static void @@ -152,7 +152,7 @@ panvk_cmd_begin_occlusion_query(struct panvk_cmd_buffer *cmd, cs_move64_to(b, report_addr_gpu, report_addr); cs_move64_to(b, clear_value, 0); cs_store64(b, clear_value, report_addr_gpu, 0); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } static void @@ -219,7 +219,7 @@ copy_oq_result_batch(struct cs_builder *b, } /* Flush the loads. */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); for (uint32_t i = 0; i < query_count; i++) { struct cs_index store_src = @@ -230,7 +230,7 @@ copy_oq_result_batch(struct cs_builder *b, } /* Flush the stores. */ - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } static void @@ -245,7 +245,7 @@ panvk_copy_occlusion_query_results(struct panvk_cmd_buffer *cmd, /* Wait for occlusion query syncobjs to be signalled. */ if (flags & VK_QUERY_RESULT_WAIT_BIT) - cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false); + cs_wait_slot(b, SB_ID(DEFERRED_SYNC)); uint32_t res_size = (flags & VK_QUERY_RESULT_64_BIT) ? 
2 : 1; uint32_t regs_per_copy = diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index 0819419ed21..cac8873a061 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -76,7 +76,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, * rendering has already been triggered */ cs_load32_to(&b, counter, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(counter)); - cs_wait_slot(&b, SB_ID(LS), false); + cs_wait_slot(&b, SB_ID(LS)); cs_if(&b, MALI_CS_CONDITION_GREATER, counter) cs_load64_to(&b, fbd_ptr, subqueue_ctx, @@ -87,19 +87,18 @@ generate_tiler_oom_handler(struct panvk_device *dev, cs_load32_to(&b, layer_count, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(layer_count)); - cs_wait_slot(&b, SB_ID(LS), false); + cs_wait_slot(&b, SB_ID(LS)); cs_req_res(&b, CS_FRAG_RES); cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) { - cs_trace_run_fragment(&b, &tracing_ctx, - cs_scratch_reg_tuple(&b, 8, 4), false, - MALI_TILE_RENDER_ORDER_Z_ORDER, false); + cs_trace_run_fragment(&b, &tracing_ctx, cs_scratch_reg_tuple(&b, 8, 4), + false, MALI_TILE_RENDER_ORDER_Z_ORDER); cs_add32(&b, layer_count, layer_count, -1); cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size); } cs_req_res(&b, 0); /* Wait for all iter scoreboards for simplicity. */ - cs_wait_slots(&b, SB_ALL_ITERS_MASK, false); + cs_wait_slots(&b, SB_ALL_ITERS_MASK); /* Increment counter */ cs_add32(&b, counter, counter, 1); @@ -111,12 +110,12 @@ generate_tiler_oom_handler(struct panvk_device *dev, cs_load32_to(&b, td_count, subqueue_ctx, TILER_OOM_CTX_FIELD_OFFSET(td_count)); cs_move64_to(&b, zero, 0); - cs_wait_slot(&b, SB_ID(LS), false); + cs_wait_slot(&b, SB_ID(LS)); cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) { /* Load completed chunks */ cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4); - cs_wait_slot(&b, SB_ID(LS), false); + cs_wait_slot(&b, SB_ID(LS)); cs_finish_fragment(&b, false, completed_top, completed_bottom, cs_now()); @@ -136,7 +135,7 @@ generate_tiler_oom_handler(struct panvk_device *dev, MALI_CS_OTHER_FLUSH_MODE_INVALIDATE, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); - cs_wait_slot(&b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(&b, SB_ID(IMM_FLUSH)); } assert(cs_is_valid(&b)); diff --git a/src/panfrost/vulkan/csf/panvk_vX_utrace.c b/src/panfrost/vulkan/csf/panvk_vX_utrace.c index 412588f3894..5d38534c6a3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_utrace.c +++ b/src/panfrost/vulkan/csf/panvk_vX_utrace.c @@ -31,7 +31,7 @@ cmd_copy_data(struct cs_builder *b, uint64_t dst_addr, uint64_t src_addr, assert((dst_addr | src_addr | size) % sizeof(uint32_t) == 0); /* wait for timestamp writes */ - cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false); + cs_wait_slot(b, SB_ID(DEFERRED_SYNC)); /* Depending on where this is called from, we could potentially use SR * registers or copy with a compute job. 
@@ -52,7 +52,7 @@ cmd_copy_data(struct cs_builder *b, uint64_t dst_addr, uint64_t src_addr, const struct cs_index reg = cs_scratch_reg_tuple(b, 4, count); cs_load_to(b, reg, src_addr_reg, BITFIELD_MASK(count), offset); - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); cs_store(b, reg, dst_addr_reg, BITFIELD_MASK(count), offset); copy_count -= count; @@ -64,7 +64,7 @@ cmd_copy_data(struct cs_builder *b, uint64_t dst_addr, uint64_t src_addr, size -= offset; } - cs_wait_slot(b, SB_ID(LS), false); + cs_wait_slot(b, SB_ID(LS)); } static struct cs_builder * @@ -170,7 +170,7 @@ panvk_per_arch(utrace_clone_finish_builder)(struct cs_builder *b) cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_NONE, MALI_CS_OTHER_FLUSH_MODE_NONE, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); - cs_wait_slot(b, SB_ID(IMM_FLUSH), false); + cs_wait_slot(b, SB_ID(IMM_FLUSH)); cs_finish(b); }
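For out-of-tree CS emission code, here is a minimal before/after sketch of the call-site migration this patch implies. The call site below is hypothetical (the function name, slot index, and tile order are illustrative, and "b" is assumed to be an initialized cs_builder); the post-patch signatures are the ones declared in cs_builder.h above.

#include "cs_builder.h"

/* Hypothetical fragment-job sequence showing how callers change. */
static void
emit_fragment_pass_sketch(struct cs_builder *b)
{
   /* Before this patch, each of these helpers took a trailing
    * progress_increment bool, which in-tree call sites always passed
    * as false:
    *
    *   cs_finish_tiling(b, false);
    *   cs_wait_slot(b, 2, false);
    *   cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
    *
    * After this patch, the trailing argument is simply dropped: */
   cs_finish_tiling(b);
   cs_wait_slot(b, 2);
   cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
}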