panvk: Only call req_res when required

Rather than calling REQ_RES during subqueue_init, only call it at
submit time, the first time we see a command buffer that requires the
use of specific resources.

This ensures queues processing compute-only workloads (i.e. not actually
requiring tiler/fragment resources) don't preempt vertex/fragment work
on other queues due to resource congestion.

Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37890>
Lars-Ivar Hesselberg Simonsen, 2025-10-15 13:20:15 +02:00 (committed by Marge Bot)
parent 849d41dbf8
commit e8c0fdbc45
4 changed files with 121 additions and 33 deletions
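As a rough sketch of the mechanism described above (using the types and fields introduced in the diff below; the helper name itself is hypothetical): init_subqueue now builds a tiny CS per subqueue containing a single REQ_RESOURCE for that subqueue's expected resource set, and the submit path only queues that stream ahead of a command buffer the first time the builder's accumulated req_resource_mask exceeds what the subqueue has already requested. Condensed, the submit-time check looks roughly like this:

/* Hypothetical condensed helper mirroring the submit-time check added in
 * this commit; the real logic lives in panvk_queue_submit_init_storage()
 * and panvk_queue_submit_init_req_resource(). */
static void
sketch_lazy_req_res(struct panvk_queue_submit *submit,
                    struct panvk_subqueue *subq,
                    const struct cs_builder *b,
                    enum panvk_subqueue_id id)
{
   /* Only act when the command buffer needs a resource this subqueue has
    * not requested from the firmware yet. */
   if (b->req_resource_mask & ~subq->req_resource.mask) {
      /* A command buffer must never require resources outside the set
       * expected for its subqueue. */
      assert(!(b->req_resource_mask & ~get_resource_mask(id)));

      /* Reserve an extra queue-submit slot for the pre-built REQ_RESOURCE
       * stream and remember which subqueues need it... */
      submit->qsubmit_count++;
      submit->req_resource_subqueue_mask |= BITFIELD_BIT(id);
      /* ...then mark the full resource set as requested so later
       * submissions on this subqueue skip the extra stream. */
      subq->req_resource.mask = get_resource_mask(id);
   }
}

panvk_queue_submit_init_req_resource() then turns each bit in req_resource_subqueue_mask into an extra drm_panthor_queue_submit pointing at the subqueue's pre-built REQ_RESOURCE buffer, so the expensive firmware request is issued at most once per subqueue over the lifetime of the queue.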


@@ -218,6 +218,9 @@ struct cs_builder {
/* ralloc context used for cs_maybe allocations */
void *maybe_ctx;
/* Mask of resources required by this CS. */
uint32_t req_resource_mask;
/* Temporary storage for inner blocks that need to be built
* and copied in one monolithic sequence of instructions with no
* jump in the middle.
@@ -1511,6 +1514,13 @@ cs_shader_res_sel(uint8_t srt, uint8_t fau, uint8_t spd, uint8_t tsd)
};
}
enum cs_res_id {
CS_COMPUTE_RES = BITFIELD_BIT(0),
CS_FRAG_RES = BITFIELD_BIT(1),
CS_TILER_RES = BITFIELD_BIT(2),
CS_IDVS_RES = BITFIELD_BIT(3),
};
static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel)
@@ -1518,6 +1528,8 @@ cs_run_compute(struct cs_builder *b, unsigned task_increment,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_COMPUTE_RES;
cs_emit(b, RUN_COMPUTE, I) {
I.task_increment = task_increment;
I.task_axis = task_axis;
@@ -1536,6 +1548,8 @@ cs_run_tiling(struct cs_builder *b, uint32_t flags_override,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_TILER_RES;
cs_emit(b, RUN_TILING, I) {
I.flags_override = flags_override;
I.srt_select = res_sel.srt;
@@ -1555,6 +1569,8 @@ cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_IDVS_RES;
cs_emit(b, RUN_IDVS2, I) {
I.flags_override = flags_override;
I.malloc_enable = malloc_enable;
@@ -1577,6 +1593,8 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_IDVS_RES;
cs_emit(b, RUN_IDVS, I) {
I.flags_override = flags_override;
I.malloc_enable = malloc_enable;
@@ -1613,6 +1631,8 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_FRAG_RES;
cs_emit(b, RUN_FRAGMENT, I) {
I.enable_tem = enable_tem;
I.tile_order = tile_order;
@@ -1626,6 +1646,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_TILER_RES;
cs_emit(b, RUN_FULLSCREEN, I) {
I.flags_override = flags_override;
I.dcd = cs_src64(b, dcd);
@@ -1635,6 +1657,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
static inline void
cs_finish_tiling(struct cs_builder *b)
{
b->req_resource_mask |= CS_TILER_RES;
cs_emit(b, FINISH_TILING, I)
;
}
@@ -1947,13 +1971,6 @@ cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
}
}
enum cs_res_id {
CS_COMPUTE_RES = BITFIELD_BIT(0),
CS_FRAG_RES = BITFIELD_BIT(1),
CS_TILER_RES = BITFIELD_BIT(2),
CS_IDVS_RES = BITFIELD_BIT(3),
};
static inline void
cs_req_res(struct cs_builder *b, uint32_t res_mask)
{
@@ -2053,6 +2070,8 @@ cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_COMPUTE_RES;
cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
I.workgroups_per_task = wg_per_task;
I.srt_select = res_sel.srt;


@@ -44,6 +44,17 @@ struct panvk_subqueue {
*/
struct panvk_priv_mem regs_save;
struct {
/* Mask of resources requested by this subqueue. */
uint32_t mask;
/* Address and size of the linear buffer containing REQ_RESOURCE. */
uint32_t cs_buffer_size;
uint64_t cs_buffer_addr;
/* Allocation */
struct panvk_priv_mem buf;
} req_resource;
struct {
struct pan_kmod_bo *bo;
uint64_t size;


@@ -1105,6 +1105,7 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
cs_call(prim_b, addr, size);
prim_b->req_resource_mask |= sec_b->req_resource_mask;
struct u_trace *prim_ut = &primary->utrace.uts[j];
struct u_trace *sec_ut = &secondary->utrace.uts[j];


@@ -302,6 +302,7 @@ static void
finish_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
{
panvk_pool_free_mem(&queue->subqueues[subqueue].context);
panvk_pool_free_mem(&queue->subqueues[subqueue].req_resource.buf);
panvk_pool_free_mem(&queue->subqueues[subqueue].regs_save);
finish_subqueue_tracing(queue, subqueue);
}
@@ -328,6 +329,21 @@ init_utrace(struct panvk_gpu_queue *queue)
return VK_SUCCESS;
}
static uint32_t
get_resource_mask(enum panvk_subqueue_id subqueue)
{
switch (subqueue) {
case PANVK_SUBQUEUE_VERTEX_TILER:
return CS_IDVS_RES | CS_TILER_RES;
case PANVK_SUBQUEUE_FRAGMENT:
return CS_FRAG_RES;
case PANVK_SUBQUEUE_COMPUTE:
return CS_COMPUTE_RES;
default:
UNREACHABLE("Unknown subqueue");
}
}
static VkResult
init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
{
@@ -353,14 +369,43 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
}
}
alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
alloc_info.alignment = 64;
/* When tracing is enabled, we want to use a non-cached pool, so can get
* up-to-date context even if the CS crashed in the middle. */
struct panvk_pool *mempool =
PANVK_DEBUG(TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;
alloc_info.size = sizeof(uint64_t);
alloc_info.alignment = 64;
subq->req_resource.buf = panvk_pool_alloc_mem(mempool, alloc_info);
if (!panvk_priv_mem_host_addr(subq->req_resource.buf))
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a req_resource buffer");
struct cs_builder b;
const struct drm_panthor_csif_info *csif_info =
panthor_kmod_get_csif_props(dev->kmod.dev);
struct cs_buffer root_cs = {
.cpu = panvk_priv_mem_host_addr(subq->req_resource.buf),
.gpu = panvk_priv_mem_dev_addr(subq->req_resource.buf),
.capacity = 1,
};
struct cs_builder_conf conf = {
.nr_registers = csif_info->cs_reg_count,
.nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
.ls_sb_slot = SB_ID(LS),
};
cs_builder_init(&b, &conf, root_cs);
cs_req_res(&b, get_resource_mask(subqueue));
cs_finish(&b);
assert(cs_is_valid(&b));
subq->req_resource.cs_buffer_size = cs_root_chunk_size(&b);
subq->req_resource.cs_buffer_addr = cs_root_chunk_gpu_addr(&b);
alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
alloc_info.alignment = 64;
subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
if (!panvk_priv_mem_host_addr(subq->context))
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
@@ -382,19 +427,16 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
};
/* We use the geometry buffer for our temporary CS buffer. */
struct cs_buffer root_cs = {
root_cs = (struct cs_buffer){
.cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
.gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
.capacity = 64 * 1024 / sizeof(uint64_t),
};
const struct drm_panthor_csif_info *csif_info =
panthor_kmod_get_csif_props(dev->kmod.dev);
const struct cs_builder_conf conf = {
conf = (struct cs_builder_conf){
.nr_registers = csif_info->cs_reg_count,
.nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
.ls_sb_slot = SB_ID(LS),
};
struct cs_builder b;
assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
@@ -440,24 +482,6 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
cs_heap_set(&b, heap_ctx_addr);
}
/* Request resources for each subqueue during initialization, as the req_res
* is an expensive operation which should be called sparingly. */
switch (subqueue) {
case PANVK_SUBQUEUE_VERTEX_TILER:
cs_req_res(&b, CS_IDVS_RES | CS_TILER_RES);
break;
case PANVK_SUBQUEUE_FRAGMENT:
cs_req_res(&b, CS_FRAG_RES);
break;
case PANVK_SUBQUEUE_COMPUTE:
cs_req_res(&b, CS_COMPUTE_RES);
break;
default:
UNREACHABLE("Unknown subqueue");
break;
}
cs_finish(&b);
assert(cs_is_valid(&b));
@@ -708,6 +732,7 @@ struct panvk_queue_submit {
uint32_t qsubmit_count;
uint32_t wait_queue_mask;
uint32_t signal_queue_mask;
uint32_t req_resource_subqueue_mask;
struct drm_panthor_queue_submit *qsubmits;
struct drm_panthor_sync_op *wait_ops;
@@ -769,6 +794,16 @@ panvk_queue_submit_init_storage(
submit->qsubmit_count++;
struct panvk_subqueue *subq = &submit->queue->subqueues[j];
/* If we need a resource the subqueue has not requested yet. */
if (b->req_resource_mask & (~subq->req_resource.mask)) {
/* Ensure we do not need a resource not expected for this subqueue. */
assert(!(b->req_resource_mask & (~get_resource_mask(j))));
submit->qsubmit_count++;
submit->req_resource_subqueue_mask |= BITFIELD_BIT(j);
subq->req_resource.mask = get_resource_mask(j);
}
struct u_trace *ut = &cmdbuf->utrace.uts[j];
if (submit->process_utrace && u_trace_has_points(ut)) {
submit->utrace.queue_mask |= BITFIELD_BIT(j);
@@ -893,6 +928,27 @@ panvk_queue_submit_init_utrace(struct panvk_queue_submit *submit,
}
}
static void
panvk_queue_submit_init_req_resource(struct panvk_queue_submit *submit)
{
if (!submit->req_resource_subqueue_mask)
return;
struct panvk_device *dev = submit->dev;
uint32_t flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);
u_foreach_bit(i, submit->req_resource_subqueue_mask) {
struct panvk_subqueue *subq = &submit->queue->subqueues[i];
submit->qsubmits[submit->qsubmit_count++] =
(struct drm_panthor_queue_submit){
.queue_index = i,
.stream_size = subq->req_resource.cs_buffer_size,
.stream_addr = subq->req_resource.cs_buffer_addr,
.latest_flush = flush_id,
};
}
}
static void
panvk_queue_submit_init_waits(struct panvk_queue_submit *submit,
const struct vk_queue_submit *vk_submit)
@@ -1190,6 +1246,7 @@ panvk_per_arch(gpu_queue_submit)(struct vk_queue *vk_queue, struct vk_queue_subm
panvk_queue_submit_init(&submit, vk_queue);
panvk_queue_submit_init_storage(&submit, vk_submit, &stack_storage);
panvk_queue_submit_init_utrace(&submit, vk_submit);
panvk_queue_submit_init_req_resource(&submit);
panvk_queue_submit_init_waits(&submit, vk_submit);
panvk_queue_submit_init_cmdbufs(&submit, vk_submit);
panvk_queue_submit_init_signals(&submit, vk_submit);