mirror of https://gitlab.freedesktop.org/mesa/mesa.git
panvk: Only call req_res when required
Rather than calling REQ_RES during subqueue_init, only call it during
submit the first time we see a command buffer that requires the use of
specific resources. This ensures queues processing compute-only
workloads (i.e. not actually requiring tiler/fragment resources) don't
preempt vertex/fragment work on other queues due to resource congestion.

Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37890>
parent 849d41dbf8
commit e8c0fdbc45
4 changed files with 121 additions and 33 deletions
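The idea of the change can be illustrated with a small standalone sketch (this is not the actual panvk code; the types, names, and printf reporting below are simplified placeholders): command-stream builders accumulate a req_resource_mask while RUN_*-style instructions are recorded, and at submit time the queue appends a REQ_RESOURCE submission only for resources the subqueue has not requested yet.

/*
 * Minimal sketch (assumed/simplified, not Mesa APIs) of the lazy
 * resource-request pattern introduced by this commit.
 */
#include <stdint.h>
#include <stdio.h>

enum res_id {
   RES_COMPUTE = 1u << 0,
   RES_FRAG    = 1u << 1,
   RES_TILER   = 1u << 2,
   RES_IDVS    = 1u << 3,
};

struct builder  { uint32_t req_resource_mask; };   /* one per command buffer */
struct subqueue { uint32_t requested_mask; };      /* persists across submits */

/* Each RUN_*-style helper records the resources it needs. */
static void run_compute(struct builder *b)
{
   b->req_resource_mask |= RES_COMPUTE;
}

/* At submit time, request only what the subqueue is still missing. */
static void submit(struct subqueue *q, const struct builder *b)
{
   uint32_t missing = b->req_resource_mask & ~q->requested_mask;

   if (missing) {
      /* In panvk this corresponds to appending an extra queue submission
       * that points at a pre-built CS containing REQ_RESOURCE. */
      printf("emit REQ_RESOURCE for mask 0x%x\n", (unsigned)missing);
      q->requested_mask |= missing;
   } else {
      printf("nothing new: skip REQ_RESOURCE\n");
   }
}

int main(void)
{
   struct subqueue compute_queue = {0};
   struct builder cb1 = {0}, cb2 = {0};

   run_compute(&cb1);
   submit(&compute_queue, &cb1);   /* first compute work: request once */

   run_compute(&cb2);
   submit(&compute_queue, &cb2);   /* already requested: no REQ_RESOURCE */
   return 0;
}

A compute-only queue therefore never requests tiler or fragment resources, which is what keeps it from preempting vertex/fragment work on the other subqueues.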
@@ -218,6 +218,9 @@ struct cs_builder {
   /* ralloc context used for cs_maybe allocations */
   void *maybe_ctx;

   /* Mask of resources required by this CS. */
   uint32_t req_resource_mask;

   /* Temporary storage for inner blocks that need to be built
    * and copied in one monolithic sequence of instructions with no
    * jump in the middle.
@@ -1511,6 +1514,13 @@ cs_shader_res_sel(uint8_t srt, uint8_t fau, uint8_t spd, uint8_t tsd)
   };
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
               enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel)
@@ -1518,6 +1528,8 @@ cs_run_compute(struct cs_builder *b, unsigned task_increment,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_COMPUTE_RES;

   cs_emit(b, RUN_COMPUTE, I) {
      I.task_increment = task_increment;
      I.task_axis = task_axis;
@@ -1536,6 +1548,8 @@ cs_run_tiling(struct cs_builder *b, uint32_t flags_override,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_TILER_RES;

   cs_emit(b, RUN_TILING, I) {
      I.flags_override = flags_override;
      I.srt_select = res_sel.srt;
@@ -1555,6 +1569,8 @@ cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_IDVS_RES;

   cs_emit(b, RUN_IDVS2, I) {
      I.flags_override = flags_override;
      I.malloc_enable = malloc_enable;
@@ -1577,6 +1593,8 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_IDVS_RES;

   cs_emit(b, RUN_IDVS, I) {
      I.flags_override = flags_override;
      I.malloc_enable = malloc_enable;
@@ -1613,6 +1631,8 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_FRAG_RES;

   cs_emit(b, RUN_FRAGMENT, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
@@ -1626,6 +1646,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_TILER_RES;

   cs_emit(b, RUN_FULLSCREEN, I) {
      I.flags_override = flags_override;
      I.dcd = cs_src64(b, dcd);
@@ -1635,6 +1657,8 @@ cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
static inline void
cs_finish_tiling(struct cs_builder *b)
{
   b->req_resource_mask |= CS_TILER_RES;

   cs_emit(b, FINISH_TILING, I)
      ;
}

@@ -1947,13 +1971,6 @@ cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
   }
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_req_res(struct cs_builder *b, uint32_t res_mask)
{
@@ -2053,6 +2070,8 @@ cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
   /* Staging regs */
   cs_flush_loads(b);

   b->req_resource_mask |= CS_COMPUTE_RES;

   cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
      I.workgroups_per_task = wg_per_task;
      I.srt_select = res_sel.srt;
@@ -44,6 +44,17 @@ struct panvk_subqueue {
    */
   struct panvk_priv_mem regs_save;

   struct {
      /* Mask of resources requested by this subqueue. */
      uint32_t mask;
      /* Address and size of the linear buffer containing REQ_RESOURCE. */
      uint32_t cs_buffer_size;
      uint64_t cs_buffer_addr;
      /* Allocation */
      struct panvk_priv_mem buf;
   } req_resource;

   struct {
      struct pan_kmod_bo *bo;
      uint64_t size;
@@ -1105,6 +1105,7 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
      cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
      cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
      cs_call(prim_b, addr, size);
      prim_b->req_resource_mask |= sec_b->req_resource_mask;

      struct u_trace *prim_ut = &primary->utrace.uts[j];
      struct u_trace *sec_ut = &secondary->utrace.uts[j];
@@ -302,6 +302,7 @@ static void
finish_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
{
   panvk_pool_free_mem(&queue->subqueues[subqueue].context);
   panvk_pool_free_mem(&queue->subqueues[subqueue].req_resource.buf);
   panvk_pool_free_mem(&queue->subqueues[subqueue].regs_save);
   finish_subqueue_tracing(queue, subqueue);
}
@@ -328,6 +329,21 @@ init_utrace(struct panvk_gpu_queue *queue)
   return VK_SUCCESS;
}

static uint32_t
get_resource_mask(enum panvk_subqueue_id subqueue)
{
   switch (subqueue) {
   case PANVK_SUBQUEUE_VERTEX_TILER:
      return CS_IDVS_RES | CS_TILER_RES;
   case PANVK_SUBQUEUE_FRAGMENT:
      return CS_FRAG_RES;
   case PANVK_SUBQUEUE_COMPUTE:
      return CS_COMPUTE_RES;
   default:
      UNREACHABLE("Unknown subqueue");
   }
}

static VkResult
init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
{
@@ -353,14 +369,43 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
      }
   }

   alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
   alloc_info.alignment = 64;

   /* When tracing is enabled, we want to use a non-cached pool, so can get
    * up-to-date context even if the CS crashed in the middle. */
   struct panvk_pool *mempool =
      PANVK_DEBUG(TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;

   alloc_info.size = sizeof(uint64_t);
   alloc_info.alignment = 64;
   subq->req_resource.buf = panvk_pool_alloc_mem(mempool, alloc_info);
   if (!panvk_priv_mem_host_addr(subq->req_resource.buf))
      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "Failed to create a req_resource buffer");

   struct cs_builder b;
   const struct drm_panthor_csif_info *csif_info =
      panthor_kmod_get_csif_props(dev->kmod.dev);

   struct cs_buffer root_cs = {
      .cpu = panvk_priv_mem_host_addr(subq->req_resource.buf),
      .gpu = panvk_priv_mem_dev_addr(subq->req_resource.buf),
      .capacity = 1,
   };
   struct cs_builder_conf conf = {
      .nr_registers = csif_info->cs_reg_count,
      .nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
      .ls_sb_slot = SB_ID(LS),
   };

   cs_builder_init(&b, &conf, root_cs);
   cs_req_res(&b, get_resource_mask(subqueue));
   cs_finish(&b);
   assert(cs_is_valid(&b));
   subq->req_resource.cs_buffer_size = cs_root_chunk_size(&b);
   subq->req_resource.cs_buffer_addr = cs_root_chunk_gpu_addr(&b);

   alloc_info.size = sizeof(struct panvk_cs_subqueue_context);
   alloc_info.alignment = 64;

   subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
   if (!panvk_priv_mem_host_addr(subq->context))
      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
@@ -382,19 +427,16 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
   };

   /* We use the geometry buffer for our temporary CS buffer. */
   struct cs_buffer root_cs = {
   root_cs = (struct cs_buffer){
      .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
      .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
      .capacity = 64 * 1024 / sizeof(uint64_t),
   };
   const struct drm_panthor_csif_info *csif_info =
      panthor_kmod_get_csif_props(dev->kmod.dev);
   const struct cs_builder_conf conf = {
   conf = (struct cs_builder_conf){
      .nr_registers = csif_info->cs_reg_count,
      .nr_kernel_registers = MAX2(csif_info->unpreserved_cs_reg_count, 4),
      .ls_sb_slot = SB_ID(LS),
   };
   struct cs_builder b;

   assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);

@@ -440,24 +482,6 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue)
      cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
      cs_heap_set(&b, heap_ctx_addr);
   }

   /* Request resources for each subqueue during initialization, as the req_res
    * is an expensive operation which should be called sparingly. */
   switch (subqueue) {
   case PANVK_SUBQUEUE_VERTEX_TILER:
      cs_req_res(&b, CS_IDVS_RES | CS_TILER_RES);
      break;
   case PANVK_SUBQUEUE_FRAGMENT:
      cs_req_res(&b, CS_FRAG_RES);
      break;
   case PANVK_SUBQUEUE_COMPUTE:
      cs_req_res(&b, CS_COMPUTE_RES);
      break;
   default:
      UNREACHABLE("Unknown subqueue");
      break;
   }

   cs_finish(&b);

   assert(cs_is_valid(&b));
@@ -708,6 +732,7 @@ struct panvk_queue_submit {
   uint32_t qsubmit_count;
   uint32_t wait_queue_mask;
   uint32_t signal_queue_mask;
   uint32_t req_resource_subqueue_mask;

   struct drm_panthor_queue_submit *qsubmits;
   struct drm_panthor_sync_op *wait_ops;
@@ -769,6 +794,16 @@ panvk_queue_submit_init_storage(
      submit->qsubmit_count++;

      struct panvk_subqueue *subq = &submit->queue->subqueues[j];
      /* If we need a resource the subqueue has not requested yet. */
      if (b->req_resource_mask & (~subq->req_resource.mask)) {
         /* Ensure we do not need a resource not expected for this subqueue. */
         assert(!(b->req_resource_mask & (~get_resource_mask(j))));
         submit->qsubmit_count++;
         submit->req_resource_subqueue_mask |= BITFIELD_BIT(j);
         subq->req_resource.mask = get_resource_mask(j);
      }

      struct u_trace *ut = &cmdbuf->utrace.uts[j];
      if (submit->process_utrace && u_trace_has_points(ut)) {
         submit->utrace.queue_mask |= BITFIELD_BIT(j);
@@ -893,6 +928,27 @@ panvk_queue_submit_init_utrace(struct panvk_queue_submit *submit,
   }
}

static void
panvk_queue_submit_init_req_resource(struct panvk_queue_submit *submit)
{
   if (!submit->req_resource_subqueue_mask)
      return;

   struct panvk_device *dev = submit->dev;
   uint32_t flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);

   u_foreach_bit(i, submit->req_resource_subqueue_mask) {
      struct panvk_subqueue *subq = &submit->queue->subqueues[i];
      submit->qsubmits[submit->qsubmit_count++] =
         (struct drm_panthor_queue_submit){
            .queue_index = i,
            .stream_size = subq->req_resource.cs_buffer_size,
            .stream_addr = subq->req_resource.cs_buffer_addr,
            .latest_flush = flush_id,
         };
   }
}

static void
panvk_queue_submit_init_waits(struct panvk_queue_submit *submit,
                              const struct vk_queue_submit *vk_submit)
@@ -1190,6 +1246,7 @@ panvk_per_arch(gpu_queue_submit)(struct vk_queue *vk_queue, struct vk_queue_subm
   panvk_queue_submit_init(&submit, vk_queue);
   panvk_queue_submit_init_storage(&submit, vk_submit, &stack_storage);
   panvk_queue_submit_init_utrace(&submit, vk_submit);
   panvk_queue_submit_init_req_resource(&submit);
   panvk_queue_submit_init_waits(&submit, vk_submit);
   panvk_queue_submit_init_cmdbufs(&submit, vk_submit);
   panvk_queue_submit_init_signals(&submit, vk_submit);