From d60c6883176eaa731216c87caa3b1dbbbfd358e0 Mon Sep 17 00:00:00 2001 From: Olivia Lee Date: Fri, 28 Mar 2025 00:24:07 -0700 Subject: [PATCH] panvk/csf: set up shared register dump regions for cs functions The tiler OOM exception handler allocated a region of memory to dump saved/restored registers. To allow defining more functions in the future, we now allocate a register dump region for each subqueue that can hold the largest number of registers needed by any function executed on that subqueue. This does mean that we cannot have function calls more than one deep. If we ever need nested function calls, we will have to consider a real stack. Signed-off-by: Olivia Lee Tested-by: Mary Guillemard Reviewed-by: Mary Guillemard Reviewed-by: Ryan Mckeever Part-of: --- src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 2 +- src/panfrost/vulkan/csf/panvk_queue.h | 7 +++- .../vulkan/csf/panvk_vX_exception_handler.c | 9 ++--- src/panfrost/vulkan/csf/panvk_vX_queue.c | 33 +++++++++---------- src/panfrost/vulkan/panvk_device.h | 5 ++- src/panfrost/vulkan/panvk_vX_device.c | 14 ++++++++ 6 files changed, 46 insertions(+), 24 deletions(-) diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 33b79e14061..4863d04c2e2 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -94,6 +94,7 @@ struct panvk_cs_subqueue_context { uint64_t syncobjs; uint32_t iter_sb; uint32_t pad; + uint64_t reg_dump_addr; struct { struct panvk_cs_desc_ringbuf desc_ringbuf; uint64_t tiler_heap; @@ -105,7 +106,6 @@ struct panvk_cs_subqueue_context { uint64_t fbds[PANVK_IR_PASS_COUNT]; uint32_t td_count; uint32_t layer_count; - uint64_t reg_dump_addr; } tiler_oom_ctx; struct { uint64_t syncobjs; diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h index 09041a748e6..1d39b05c2bf 100644 --- a/src/panfrost/vulkan/csf/panvk_queue.h +++ b/src/panfrost/vulkan/csf/panvk_queue.h @@ -38,6 
+38,12 @@ struct panvk_subqueue { struct panvk_priv_mem context; uint32_t *reg_file; + /* Memory to save/restore CS registers in functions/exception handlers. + * Because registers are dumped to a fixed address rather than a moving + * stack pointer, nested function/exception handler calls are not supported. + */ + struct panvk_priv_mem regs_save; + struct { struct pan_kmod_bo *bo; size_t size; @@ -68,7 +74,6 @@ struct panvk_queue { struct panvk_desc_ringbuf render_desc_ringbuf; struct panvk_priv_mem syncobjs; struct panvk_priv_mem debug_syncobjs; - struct panvk_priv_mem tiler_oom_regs_save; struct { struct vk_sync *sync; diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c index 3b4f1238a7e..5a746db7737 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -47,7 +47,8 @@ generate_tiler_oom_handler(struct panvk_device *dev, struct cs_function handler; struct cs_function_ctx handler_ctx = { .ctx_reg = cs_subqueue_ctx_reg(&b), - .dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr), + .dump_addr_offset = + offsetof(struct panvk_cs_subqueue_context, reg_dump_addr), }; struct cs_tracing_ctx tracing_ctx = { .enabled = tracing_enabled, @@ -172,10 +173,10 @@ panvk_per_arch(init_tiler_oom)(struct panvk_device *device) /* All handlers must have the same length */ assert(idx == 0 || handler_length == device->tiler_oom.handler_stride); - assert(idx == 0 || - dump_region_size == device->tiler_oom.dump_region_size); device->tiler_oom.handler_stride = handler_length; - device->tiler_oom.dump_region_size = dump_region_size; + device->dump_region_size[PANVK_SUBQUEUE_FRAGMENT] = + MAX2(device->dump_region_size[PANVK_SUBQUEUE_FRAGMENT], + dump_region_size); } } diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c index 7f02bc1a520..86e50d98366 100644 --- 
a/src/panfrost/vulkan/csf/panvk_vX_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c @@ -321,6 +321,7 @@ static void finish_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) { panvk_pool_free_mem(&queue->subqueues[subqueue].context); + panvk_pool_free_mem(&queue->subqueues[subqueue].regs_save); finish_subqueue_tracing(queue, subqueue); } @@ -362,10 +363,20 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) if (result != VK_SUCCESS) return result; - struct panvk_pool_alloc_info alloc_info = { - .size = sizeof(struct panvk_cs_subqueue_context), - .alignment = 64, - }; + struct panvk_pool_alloc_info alloc_info; + + if (dev->dump_region_size[subqueue]) { + alloc_info.size = dev->dump_region_size[subqueue]; + alloc_info.alignment = sizeof(uint32_t); + subq->regs_save = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_host_addr(subq->regs_save)) { + return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate register save area"); + } + } + + alloc_info.size = sizeof(struct panvk_cs_subqueue_context); + alloc_info.alignment = 64; /* When tracing is enabled, we want to use a non-cached pool, so can get * up-to-date context even if the CS crashed in the middle. */ @@ -385,8 +396,7 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) .debug.syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs), .debug.tracebuf.cs = subq->tracebuf.addr.dev, .iter_sb = 0, - .tiler_oom_ctx.reg_dump_addr = - panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save), + .reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save), }; /* We use the geometry buffer for our temporary CS buffer. 
*/ @@ -524,7 +534,6 @@ cleanup_queue(struct panvk_queue *queue) finish_render_desc_ringbuf(queue); - panvk_pool_free_mem(&queue->tiler_oom_regs_save); panvk_pool_free_mem(&queue->debug_syncobjs); panvk_pool_free_mem(&queue->syncobjs); } @@ -561,16 +570,6 @@ init_queue(struct panvk_queue *queue) } } - alloc_info.size = dev->tiler_oom.dump_region_size; - alloc_info.alignment = sizeof(uint32_t); - queue->tiler_oom_regs_save = - panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); - if (!panvk_priv_mem_host_addr(queue->tiler_oom_regs_save)) { - result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, - "Failed to allocate tiler oom register save area"); - goto err_cleanup_queue; - } - result = init_render_desc_ringbuf(queue); if (result != VK_SUCCESS) goto err_cleanup_queue; diff --git a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h index 33c1f78a973..cda025c2a91 100644 --- a/src/panfrost/vulkan/panvk_device.h +++ b/src/panfrost/vulkan/panvk_device.h @@ -51,7 +51,6 @@ struct panvk_device { struct { struct panvk_priv_bo *handlers_bo; uint32_t handler_stride; - uint32_t dump_region_size; } tiler_oom; struct vk_meta_device meta; @@ -62,6 +61,10 @@ struct panvk_device { struct panvk_pool exec; } mempools; + /* For each subqueue, maximum size of the register dump region needed by + * exception handlers or functions */ + uint32_t *dump_region_size; + struct vk_device_dispatch_table cmd_dispatch; struct panvk_queue *queues[PANVK_MAX_QUEUE_FAMILIES]; diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index aa89d7a09fd..f4397f6d058 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -321,6 +321,18 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, panvk_device_init_mempools(device); +#if PAN_ARCH >= 10 + /* The only reason this is a heap allocation is that PANVK_SUBQUEUE_COUNT + * isn't available in the header */ + 
device->dump_region_size = + vk_zalloc(&device->vk.alloc, PANVK_SUBQUEUE_COUNT * sizeof(uint32_t), + alignof(uint32_t), VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device->dump_region_size) { + result = panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto err_free_priv_bos; + } +#endif + #if PAN_ARCH <= 9 result = panvk_priv_bo_create( device, 128 * 1024 * 1024, @@ -423,6 +435,7 @@ err_free_priv_bos: panvk_priv_bo_unref(device->sample_positions); panvk_priv_bo_unref(device->tiler_heap); panvk_device_cleanup_mempools(device); + vk_free(&device->vk.alloc, device->dump_region_size); pan_kmod_vm_destroy(device->kmod.vm); util_vma_heap_finish(&device->as.heap); simple_mtx_destroy(&device->as.lock); @@ -462,6 +475,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, panvk_priv_bo_unref(device->tiler_heap); panvk_priv_bo_unref(device->sample_positions); panvk_device_cleanup_mempools(device); + vk_free(&device->vk.alloc, device->dump_region_size); pan_kmod_vm_destroy(device->kmod.vm); util_vma_heap_finish(&device->as.heap); simple_mtx_destroy(&device->as.lock);