diff --git a/src/panfrost/genxml/cs_builder.h b/src/panfrost/genxml/cs_builder.h
index a324307b985..69c55e31dd8 100644
--- a/src/panfrost/genxml/cs_builder.h
+++ b/src/panfrost/genxml/cs_builder.h
@@ -218,6 +218,9 @@ struct cs_builder {
    /* ralloc context used for cs_maybe allocations */
    void *maybe_ctx;
 
+   /* Mask of resources required by this CS. */
+   uint32_t req_resource_mask;
+
    /* Temporary storage for inner blocks that need to be built
     * and copied in one monolithic sequence of instructions with no
     * jump in the middle.
@@ -1511,6 +1514,13 @@ cs_shader_res_sel(uint8_t srt, uint8_t fau, uint8_t spd, uint8_t tsd)
    };
 }
 
+enum cs_res_id {
+   CS_COMPUTE_RES = BITFIELD_BIT(0),
+   CS_FRAG_RES = BITFIELD_BIT(1),
+   CS_TILER_RES = BITFIELD_BIT(2),
+   CS_IDVS_RES = BITFIELD_BIT(3),
+};
+
 static inline void
 cs_run_compute(struct cs_builder *b, unsigned task_increment,
                enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel)
@@ -1518,6 +1528,8 @@ cs_run_compute(struct cs_builder *b, unsigned task_increment,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_COMPUTE_RES;
+
    cs_emit(b, RUN_COMPUTE, I) {
       I.task_increment = task_increment;
       I.task_axis = task_axis;
@@ -1536,6 +1548,8 @@ cs_run_tiling(struct cs_builder *b, uint32_t flags_override,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_TILER_RES;
+
    cs_emit(b, RUN_TILING, I) {
       I.flags_override = flags_override;
      I.srt_select = res_sel.srt;
@@ -1555,6 +1569,8 @@ cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_IDVS_RES;
+
    cs_emit(b, RUN_IDVS2, I) {
       I.flags_override = flags_override;
       I.malloc_enable = malloc_enable;
@@ -1577,6 +1593,8 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_IDVS_RES;
+
    cs_emit(b, RUN_IDVS, I) {
       I.flags_override = flags_override;
       I.malloc_enable = malloc_enable;
@@ -1613,6 +1631,8 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_FRAG_RES;
+
    cs_emit(b, RUN_FRAGMENT, I) {
       I.enable_tem = enable_tem;
       I.tile_order = tile_order;
@@ -1626,6 +1646,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_TILER_RES;
+
    cs_emit(b, RUN_FULLSCREEN, I) {
       I.flags_override = flags_override;
       I.dcd = cs_src64(b, dcd);
@@ -1635,6 +1657,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
 static inline void
 cs_finish_tiling(struct cs_builder *b)
 {
+   b->req_resource_mask |= CS_TILER_RES;
+
    cs_emit(b, FINISH_TILING, I)
       ;
 }
@@ -1947,13 +1971,6 @@ cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
    }
 }
 
-enum cs_res_id {
-   CS_COMPUTE_RES = BITFIELD_BIT(0),
-   CS_FRAG_RES = BITFIELD_BIT(1),
-   CS_TILER_RES = BITFIELD_BIT(2),
-   CS_IDVS_RES = BITFIELD_BIT(3),
-};
-
 static inline void
 cs_req_res(struct cs_builder *b, uint32_t res_mask)
 {
@@ -2053,6 +2070,8 @@ cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
    /* Staging regs */
    cs_flush_loads(b);
 
+   b->req_resource_mask |= CS_COMPUTE_RES;
+
    cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
       I.workgroups_per_task = wg_per_task;
       I.srt_select = res_sel.srt;
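With this change, every cs_run_*() and cs_finish_tiling() emit path records the
resources it depends on, so a finished command stream carries its own
requirement mask instead of relying on an up-front cs_req_res(). A minimal
sketch of the resulting usage pattern (not part of the patch; the helper name
is hypothetical, only the cs_* calls come from this header, and it assumes the
mask starts at zero after cs_builder_init()):

static uint32_t
build_and_report(struct cs_builder *b, unsigned task_increment,
                 enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel)
{
   cs_run_compute(b, task_increment, task_axis, res_sel); /* ORs in CS_COMPUTE_RES */
   cs_finish_tiling(b);                                   /* ORs in CS_TILER_RES */
   cs_finish(b);

   /* The caller can now request exactly what the stream needs, after the
    * fact, instead of emitting REQ_RESOURCE while recording. */
   return b->req_resource_mask; /* == CS_COMPUTE_RES | CS_TILER_RES here */
}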
diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h
index c305184b8af..5431674b135 100644
--- a/src/panfrost/vulkan/csf/panvk_queue.h
+++ b/src/panfrost/vulkan/csf/panvk_queue.h
@@ -44,6 +44,17 @@ struct panvk_subqueue {
     */
    struct panvk_priv_mem regs_save;
 
+   struct {
+      /* Mask of resources requested by this subqueue. */
+      uint32_t mask;
+      /* Address and size of the linear buffer containing REQ_RESOURCE. */
+      uint32_t cs_buffer_size;
+      uint64_t cs_buffer_addr;
+      /* Backing allocation. */
+      struct panvk_priv_mem buf;
+   } req_resource;
+
    struct {
       struct pan_kmod_bo *bo;
       uint64_t size;
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index cf94312c70d..c2ca98c6be6 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -1105,6 +1105,7 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
       cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
       cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
       cs_call(prim_b, addr, size);
+      prim_b->req_resource_mask |= sec_b->req_resource_mask;
 
       struct u_trace *prim_ut = &primary->utrace.uts[j];
       struct u_trace *sec_ut = &secondary->utrace.uts[j];
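The single added line above is what keeps the mask sound across secondary
command buffers: cs_call() branches into the secondary's stream without
replaying its cs_run_*() helpers, so the primary must inherit the union of
everything its secondaries recorded. An illustrative check of that invariant
(sketch only; `sec_bs` and `sec_count` are hypothetical stand-ins, not names
from the patch):

static void
assert_inherited_resources(const struct cs_builder *prim_b,
                           const struct cs_builder **sec_bs, uint32_t sec_count)
{
   for (uint32_t i = 0; i < sec_count; i++) {
      /* Every resource a called secondary needs must already be in the
       * primary's mask after CmdExecuteCommands(). */
      assert((prim_b->req_resource_mask & sec_bs[i]->req_resource_mask) ==
             sec_bs[i]->req_resource_mask);
   }
}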
diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
index f073ca9e9e4..cbcace40c73 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c
@@ -302,6 +302,7 @@ static void
 finish_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
 {
    panvk_pool_free_mem(&queue->subqueues[subqueue].context);
+   panvk_pool_free_mem(&queue->subqueues[subqueue].req_resource.buf);
    panvk_pool_free_mem(&queue->subqueues[subqueue].regs_save);
    finish_subqueue_tracing(queue, subqueue);
 }
@@ -328,6 +329,21 @@ init_utrace(struct panvk_gpu_queue *queue)
    return VK_SUCCESS;
 }
 
+static uint32_t
+get_resource_mask(enum panvk_subqueue_id subqueue)
+{
+   switch (subqueue) {
+   case PANVK_SUBQUEUE_VERTEX_TILER:
+      return CS_IDVS_RES | CS_TILER_RES;
+   case PANVK_SUBQUEUE_FRAGMENT:
+      return CS_FRAG_RES;
+   case PANVK_SUBQUEUE_COMPUTE:
+      return CS_COMPUTE_RES;
+   default:
+      UNREACHABLE("Unknown subqueue");
+   }
+}
+
 static VkResult
 init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
 {
@@ -353,14 +369,43 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
       }
    }
 
-   alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
-   alloc_info.alignment = 64;
-
    /* When tracing is enabled, we want to use a non-cached pool, so we can get
    * up-to-date context even if the CS crashed in the middle.
    */
   struct panvk_pool *mempool = PANVK_DEBUG(TRACE) ?
      &dev->mempools.rw_nc : &dev->mempools.rw;
 
+   alloc_info.size = sizeof(uint64_t);
+   alloc_info.alignment = 64;
+   subq->req_resource.buf = panvk_pool_alloc_mem(mempool, alloc_info);
+   if (!panvk_priv_mem_host_addr(subq->req_resource.buf))
+      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to create a req_resource buffer");
+
+   struct cs_builder b;
+   const struct drm_panthor_csif_info *csif_info =
+      panthor_kmod_get_csif_props(dev->kmod.dev);
+
+   struct cs_buffer root_cs = {
+      .cpu = panvk_priv_mem_host_addr(subq->req_resource.buf),
+      .gpu = panvk_priv_mem_dev_addr(subq->req_resource.buf),
+      .capacity = 1,
+   };
+   struct cs_builder_conf conf = {
+      .nr_registers = csif_info->cs_reg_count,
+      .nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
+      .ls_sb_slot = SB_ID(LS),
+   };
+
+   cs_builder_init(&b, &conf, root_cs);
+   cs_req_res(&b, get_resource_mask(subqueue));
+   cs_finish(&b);
+   assert(cs_is_valid(&b));
+   subq->req_resource.cs_buffer_size = cs_root_chunk_size(&b);
+   subq->req_resource.cs_buffer_addr = cs_root_chunk_gpu_addr(&b);
+
+   alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
+   alloc_info.alignment = 64;
+
    subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
    if (!panvk_priv_mem_host_addr(subq->context))
       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
@@ -382,19 +427,16 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
    };
 
    /* We use the geometry buffer for our temporary CS buffer. */
-   struct cs_buffer root_cs = {
+   root_cs = (struct cs_buffer){
       .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
       .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
       .capacity = 64 * 1024 / sizeof(uint64_t),
    };
-   const struct drm_panthor_csif_info *csif_info =
-      panthor_kmod_get_csif_props(dev->kmod.dev);
-   const struct cs_builder_conf conf = {
+   conf = (struct cs_builder_conf){
       .nr_registers = csif_info->cs_reg_count,
       .nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
       .ls_sb_slot = SB_ID(LS),
    };
-   struct cs_builder b;
 
    assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
@@ -440,24 +482,6 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
       cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
       cs_heap_set(&b, heap_ctx_addr);
    }
-
-   /* Request resources for each subqueue during initialization, as the req_res
-    * is an expensive operation which should be called sparingly. */
-   switch (subqueue) {
-   case PANVK_SUBQUEUE_VERTEX_TILER:
-      cs_req_res(&b, CS_IDVS_RES | CS_TILER_RES);
-      break;
-   case PANVK_SUBQUEUE_FRAGMENT:
-      cs_req_res(&b, CS_FRAG_RES);
-      break;
-   case PANVK_SUBQUEUE_COMPUTE:
-      cs_req_res(&b, CS_COMPUTE_RES);
-      break;
-   default:
-      UNREACHABLE("Unknown subqueue");
-      break;
-   }
-
    cs_finish(&b);
    assert(cs_is_valid(&b));
@@ -708,6 +732,7 @@ struct panvk_queue_submit {
    uint32_t qsubmit_count;
    uint32_t wait_queue_mask;
    uint32_t signal_queue_mask;
+   uint32_t req_resource_subqueue_mask;
 
    struct drm_panthor_queue_submit *qsubmits;
    struct drm_panthor_sync_op *wait_ops;
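The submit path in the next hunk only appends an extra REQ_RESOURCE stream when
a command buffer needs a resource bit the subqueue has not requested yet, and
it then bumps the subqueue straight to its full allowed set so the expensive
request happens at most once per subqueue. A self-contained walkthrough of that
bit logic (standalone C mirroring the masks defined in cs_builder.h; not part
of the patch):

#include <assert.h>
#include <stdint.h>

#define BITFIELD_BIT(b) (1u << (b))

enum cs_res_id { /* mirrors cs_builder.h */
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

int main(void)
{
   uint32_t subqueue_mask = 0;                    /* fresh vertex-tiler subqueue */
   uint32_t allowed = CS_IDVS_RES | CS_TILER_RES; /* get_resource_mask() result */

   /* First submit runs IDVS work: a bit is missing, so one extra
    * REQ_RESOURCE qsubmit is scheduled and the subqueue jumps to the full
    * allowed set. */
   uint32_t cmdbuf_mask = CS_IDVS_RES;
   assert(cmdbuf_mask & ~subqueue_mask);
   subqueue_mask = allowed;

   /* Later submits that stay within the allowed set never trigger another
    * request. */
   cmdbuf_mask = CS_TILER_RES;
   assert(!(cmdbuf_mask & ~subqueue_mask));
   return 0;
}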
@@ -769,6 +794,16 @@ panvk_queue_submit_init_storage(
       submit->qsubmit_count++;
 
+      struct panvk_subqueue *subq = &submit->queue->subqueues[j];
+      /* Check if we need a resource the subqueue has not yet requested. */
+      if (b->req_resource_mask & (~subq->req_resource.mask)) {
+         /* Ensure we do not need a resource not expected for this subqueue.
+          */
+         assert(!(b->req_resource_mask & (~get_resource_mask(j))));
+         submit->qsubmit_count++;
+         submit->req_resource_subqueue_mask |= BITFIELD_BIT(j);
+         subq->req_resource.mask = get_resource_mask(j);
+      }
+
       struct u_trace *ut = &cmdbuf->utrace.uts[j];
       if (submit->process_utrace && u_trace_has_points(ut)) {
          submit->utrace.queue_mask |= BITFIELD_BIT(j);
@@ -893,6 +928,27 @@ panvk_queue_submit_init_utrace(struct panvk_queue_submit *submit,
    }
 }
 
+static void
+panvk_queue_submit_init_req_resource(struct panvk_queue_submit *submit)
+{
+   if (!submit->req_resource_subqueue_mask)
+      return;
+
+   struct panvk_device *dev = submit->dev;
+   uint32_t flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);
+
+   u_foreach_bit(i, submit->req_resource_subqueue_mask) {
+      struct panvk_subqueue *subq = &submit->queue->subqueues[i];
+      submit->qsubmits[submit->qsubmit_count++] =
+         (struct drm_panthor_queue_submit){
+            .queue_index = i,
+            .stream_size = subq->req_resource.cs_buffer_size,
+            .stream_addr = subq->req_resource.cs_buffer_addr,
+            .latest_flush = flush_id,
+         };
+   }
+}
+
 static void
 panvk_queue_submit_init_waits(struct panvk_queue_submit *submit,
                               const struct vk_queue_submit *vk_submit)
@@ -1190,6 +1246,7 @@ panvk_per_arch(gpu_queue_submit)(struct vk_queue *vk_queue, struct vk_queue_subm
    panvk_queue_submit_init(&submit, vk_queue);
    panvk_queue_submit_init_storage(&submit, vk_submit, &stack_storage);
    panvk_queue_submit_init_utrace(&submit, vk_submit);
+   panvk_queue_submit_init_req_resource(&submit);
    panvk_queue_submit_init_waits(&submit, vk_submit);
    panvk_queue_submit_init_cmdbufs(&submit, vk_submit);
    panvk_queue_submit_init_signals(&submit, vk_submit);
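A note on the sizing used in init_subqueue() above: CSF command-stream
instructions are 64-bit words (which is also why the temporary CS buffer
borrowed from the geometry buffer is sized as 64 * 1024 / sizeof(uint64_t)
instructions), so a buffer of sizeof(uint64_t) with .capacity = 1 holds exactly
the single REQ_RESOURCE instruction, whose GPU address is then replayed by
panvk_queue_submit_init_req_resource(). A trivial standalone check of that
arithmetic (illustration only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint32_t cs_insn_size = sizeof(uint64_t); /* one 64-bit CS instruction */
   const uint32_t capacity = 1;                    /* instructions, not bytes */

   /* alloc_info.size for the per-subqueue req_resource buffer. */
   assert(capacity * cs_insn_size == 8);
   return 0;
}

Because subq->req_resource.mask only ever grows (it jumps to the full
get_resource_mask() set on the first miss), steady-state submissions append no
extra qsubmits at all.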