From 626e9e41799787a670f8ae3f4293ec72f75130a9 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 12 Feb 2025 13:06:39 +0100 Subject: [PATCH] panvk: Request resources during subqueue init REQ_RES is quite an expensive operation, so calling it before and after each RUN-command slows down RUN-command heavy workloads. This commit moves the REQ_RES calls to init_subqueue(). Reviewed-by: John Anthony Reviewed-by: Boris Brezillon Tested-by: Heiko Stuebner Part-of: --- src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c | 2 -- src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 10 ---------- src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c | 2 -- .../vulkan/csf/panvk_vX_exception_handler.c | 2 -- src/panfrost/vulkan/csf/panvk_vX_queue.c | 17 +++++++++++++++++ 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index 02e822ae577..75e6b0ff74b 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -315,7 +315,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE); - cs_req_res(b, CS_COMPUTE_RES); if (indirect) { /* Use run_compute with a set task axis instead of run_compute_indirect as * run_compute_indirect has been found to cause intermittent hangs. 
This @@ -337,7 +336,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); } - cs_req_res(b, 0); struct cs_index sync_addr = cs_scratch_reg64(b, 0); struct cs_index iter_sb = cs_scratch_reg32(b, 2); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 97247d6b885..7f473e9e9c9 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -2074,7 +2074,6 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); - cs_req_res(b, CS_IDVS_RES); if (idvs_count > 1) { struct cs_index counter_reg = cs_scratch_reg32(b, 17); struct cs_index tiler_ctx_addr = cs_sr_reg64(b, IDVS, TILER_CTX); @@ -2116,7 +2115,6 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw) cs_shader_res_sel(2, 2, 2, 0), cs_undef()); #endif } - cs_req_res(b, 0); } VkResult @@ -2276,8 +2274,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr); cs_move32_to(b, draw_id, 0); - cs_req_res(b, CS_IDVS_RES); - cs_while(b, MALI_CS_CONDITION_GREATER, draw_count) { cs_update_vt_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, IDVS, GLOBAL_ATTRIBUTE_OFFSET), 0); @@ -2400,8 +2396,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf, } } } - - cs_req_res(b, 0); } VKAPI_ATTR void VKAPI_CALL @@ -2625,9 +2619,7 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf) if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) { /* Flush the tiling operations and signal the internal sync object. 
*/ - cs_req_res(b, CS_TILER_RES); cs_finish_tiling(b); - cs_req_res(b, 0); struct cs_index sync_addr = cs_scratch_reg64(b, 0); struct cs_index iter_sb = cs_scratch_reg32(b, 2); @@ -2856,7 +2848,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_wait_slot(b, SB_ID(IMM_FLUSH)); } - cs_req_res(b, CS_FRAG_RES); if (cmdbuf->state.gfx.render.layer_count > 1) { struct cs_index layer_count = cs_reg32(b, 47); @@ -2874,7 +2865,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), false, MALI_TILE_RENDER_ORDER_Z_ORDER); } - cs_req_res(b, 0); struct cs_index sync_addr = cs_scratch_reg64(b, 0); struct cs_index iter_sb = cs_scratch_reg32(b, 2); diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index e1c5411e1d3..c160ab3f68d 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -140,7 +140,6 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE); - cs_req_res(b, CS_COMPUTE_RES); unsigned task_axis = MALI_TASK_AXIS_X; unsigned task_increment = 0; panvk_per_arch(calculate_task_axis_and_increment)( @@ -148,7 +147,6 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), task_increment, task_axis, cs_shader_res_sel(0, 0, 0, 0)); - cs_req_res(b, 0); struct cs_index sync_addr = cs_scratch_reg64(b, 0); struct cs_index iter_sb = cs_scratch_reg32(b, 2); diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index cac8873a061..2baf0016591 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -89,14 +89,12 @@ generate_tiler_oom_handler(struct panvk_device *dev, 
TILER_OOM_CTX_FIELD_OFFSET(layer_count)); cs_wait_slot(&b, SB_ID(LS)); - cs_req_res(&b, CS_FRAG_RES); cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) { cs_trace_run_fragment(&b, &tracing_ctx, cs_scratch_reg_tuple(&b, 8, 4), false, MALI_TILE_RENDER_ORDER_Z_ORDER); cs_add32(&b, layer_count, layer_count, -1); cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size); } - cs_req_res(&b, 0); /* Wait for all iter scoreboards for simplicity. */ cs_wait_slots(&b, SB_ALL_ITERS_MASK); diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c index 7d7007fa57f..3c39caeed2e 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c @@ -446,6 +446,23 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) cs_heap_set(&b, heap_ctx_addr); } + /* Request resources for each subqueue during initialization, as the req_res + * is an expensive operation which should be called sparingly. */ + switch (subqueue) { + case PANVK_SUBQUEUE_VERTEX_TILER: + cs_req_res(&b, CS_IDVS_RES | CS_TILER_RES); + break; + case PANVK_SUBQUEUE_FRAGMENT: + cs_req_res(&b, CS_FRAG_RES); + break; + case PANVK_SUBQUEUE_COMPUTE: + cs_req_res(&b, CS_COMPUTE_RES); + break; + default: + unreachable("Unknown subqueue"); + break; + } + cs_finish(&b); assert(cs_is_valid(&b));