diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index cc620944de5..0f5dd00addc 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -100,7 +100,12 @@ struct panvk_cs_subqueue_context {
       uint32_t layer_count;
       mali_ptr reg_dump_addr;
    } tiler_oom_ctx;
-   uint64_t debug_syncobjs;
+   struct {
+      uint64_t syncobjs;
+      struct {
+         uint64_t cs;
+      } tracebuf;
+   } debug;
 } __attribute__((aligned(64)));
 
 struct panvk_cache_flush_info {
@@ -249,6 +254,8 @@ struct panvk_cs_state {
    /* Sync point relative to the beginning of the command buffer.
    * Needs to be offset with the subqueue sync point. */
    int32_t relative_sync_point;
+
+   struct cs_tracing_ctx tracing;
 };
 
 static inline struct panvk_cs_reg_upd_context *
diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h
index c9347ad7098..484b0c7ccb3 100644
--- a/src/panfrost/vulkan/csf/panvk_queue.h
+++ b/src/panfrost/vulkan/csf/panvk_queue.h
@@ -37,11 +37,21 @@ struct panvk_tiler_heap {
 struct panvk_subqueue {
    struct panvk_priv_mem context;
    uint32_t *reg_file;
+
+   struct {
+      struct pan_kmod_bo *bo;
+      size_t size;
+      struct {
+         uint64_t dev;
+         void *host;
+      } addr;
+   } tracebuf;
 };
 
 struct panvk_desc_ringbuf {
    struct panvk_priv_mem syncobj;
    struct pan_kmod_bo *bo;
+   size_t size;
    struct {
       uint64_t dev;
       void *host;
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index 4e6dc29f06c..10ce47bbaec 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -138,7 +138,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
       cs_move32_to(b, one, 1);
 
       cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
-                   offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
+                   offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
       cs_wait_slot(b, SB_ID(LS), false);
       cs_add64(b, debug_sync_addr, debug_sync_addr,
                sizeof(struct panvk_cs_sync32) * subqueue);
@@ -679,6 +679,7 @@ init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
    };
 
    for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
+      struct cs_builder *b = &cmdbuf->state.cs[i].builder;
       /* Lazy allocation of the root CS. */
       struct cs_buffer root_cs = {0};
 
@@ -701,7 +702,17 @@ init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
          conf.reg_perm = cs_reg_perm;
       }
 
-      cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
+      cs_builder_init(b, &conf, root_cs);
+
+      if (instance->debug_flags & PANVK_DEBUG_TRACE) {
+         cmdbuf->state.cs[i].tracing = (struct cs_tracing_ctx){
+            .enabled = true,
+            .ctx_reg = cs_subqueue_ctx_reg(b),
+            .tracebuf_addr_offset =
+               offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
+            .ls_sb_slot = SB_ID(LS),
+         };
+      }
    }
 }
 
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index 45fc5d252b1..a348895e71b 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -168,6 +168,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       &cmdbuf->state.compute.desc_state;
    struct panvk_shader_desc_state *cs_desc_state =
       &cmdbuf->state.compute.cs.desc;
+   const struct cs_tracing_ctx *tracing_ctx =
+      &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
 
    struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
    if (!tsd.gpu)
@@ -349,16 +351,18 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
 
    cs_req_res(b, CS_COMPUTE_RES);
-   if (indirect)
-      cs_run_compute_indirect(b, wg_per_task, false,
-                              cs_shader_res_sel(0, 0, 0, 0));
-   else {
+   if (indirect) {
+      cs_trace_run_compute_indirect(b, tracing_ctx,
+                                    cs_scratch_reg_tuple(b, 0, 4), wg_per_task,
+                                    false, cs_shader_res_sel(0, 0, 0, 0));
+   } else {
       unsigned task_axis = MALI_TASK_AXIS_X;
       unsigned task_increment = 0;
 
       calculate_task_axis_and_increment(shader, phys_dev, &task_axis,
                                         &task_increment);
-      cs_run_compute(b, task_increment, task_axis, false,
-                     cs_shader_res_sel(0, 0, 0, 0));
+      cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                           task_increment, task_axis, false,
+                           cs_shader_res_sel(0, 0, 0, 0));
    }
    cs_req_res(b, 0);
 
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index be642a808f0..9696db30dbc 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -688,7 +688,8 @@ cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
 }
 
 static void
-cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
+cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
+                                bool wrap_around)
 {
    struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
    struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
@@ -703,12 +704,15 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
    /* Update the relative position and absolute address. */
    cs_add32(b, ptr_lo, ptr_lo, size);
    cs_add32(b, pos, pos, size);
-   cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
 
    /* Wrap-around. */
-   cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
-      cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
-      cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
+   if (likely(wrap_around)) {
+      cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
+
+      cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
+         cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
+         cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
+      }
    }
 
    cs_store(
@@ -740,6 +744,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
    struct panvk_physical_device *phys_dev =
       to_panvk_physical_device(cmdbuf->vk.base.device->physical);
+   struct panvk_instance *instance =
+      to_panvk_instance(phys_dev->vk.instance);
+   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
    struct panfrost_tiler_features tiler_features =
       panfrost_query_tiler_features(&phys_dev->kmod.props);
    bool simul_use =
@@ -797,7 +804,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
                              render.desc_ringbuf.ptr));
       }
 
-      cs_render_desc_ringbuf_move_ptr(b, descs_sz);
+      cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
    } else {
       cs_update_vt_ctx(b) {
          cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
@@ -1661,6 +1668,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
 static void
 panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
 {
+   const struct cs_tracing_ctx *tracing_ctx =
+      &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
    struct cs_builder *b =
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@@ -1718,11 +1727,12 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
       cs_move32_to(b, counter_reg, idvs_count);
       cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
-         cs_run_idvs(b, flags_override.opaque[0], false, true,
-                     cs_shader_res_sel(0, 0, 1, 0),
-                     cs_shader_res_sel(2, 2, 2, 0), cs_undef());
+         cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                           flags_override.opaque[0], false, true,
+                           cs_shader_res_sel(0, 0, 1, 0),
+                           cs_shader_res_sel(2, 2, 2, 0), cs_undef());
 
-         cs_add32(b, counter_reg, counter_reg, -1);
+         cs_add32(b, counter_reg, counter_reg, -1);
 
          cs_update_vt_ctx(b) {
            cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
                     pan_size(TILER_CONTEXT));
@@ -1734,9 +1744,10 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
                   -(idvs_count * pan_size(TILER_CONTEXT)));
       }
    } else {
-      cs_run_idvs(b, flags_override.opaque[0], false, true,
-                  cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
-                  cs_undef());
+      cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                        flags_override.opaque[0], false, true,
+                        cs_shader_res_sel(0, 0, 1, 0),
+                        cs_shader_res_sel(2, 2, 2, 0), cs_undef());
    }
    cs_req_res(b, 0);
 }
@@ -1803,6 +1814,8 @@ static void
 panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
                         struct panvk_draw_info *draw)
 {
+   const struct cs_tracing_ctx *tracing_ctx =
+      &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
    struct cs_builder *b =
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@@ -1852,9 +1865,10 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
    cs_wait_slot(b, SB_ID(LS), false);
 
    cs_req_res(b, CS_IDVS_RES);
-   cs_run_idvs(b, flags_override.opaque[0], false, true,
-               cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
-               cs_undef());
+   cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                     flags_override.opaque[0], false, true,
+                     cs_shader_res_sel(0, 0, 1, 0),
+                     cs_shader_res_sel(2, 2, 2, 0), cs_undef());
 
    cs_req_res(b, 0);
 }
@@ -2147,6 +2161,8 @@ static VkResult
 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
 {
    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   const struct cs_tracing_ctx *tracing_ctx =
+      &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
    struct cs_builder *b =
       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
@@ -2234,13 +2250,16 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
 
       cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
       cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
-         cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+         cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                               false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+
          cs_add32(b, layer_count, layer_count, -1);
         cs_update_frag_ctx(b)
            cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
      }
   } else {
-      cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+      cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
+                            false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
   }
   cs_req_res(b, 0);
@@ -2318,8 +2337,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    cs_wait_slot(b, SB_ID(LS), false);
 
    /* Update the ring buffer position. */
-   if (free_render_descs)
-      cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf));
+   if (free_render_descs) {
+      cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
+                                      !tracing_ctx->enabled);
+   }
 
    /* Update the frag seqno. */
    ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
index 93a57fd4f46..013494654c1 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c
@@ -25,7 +25,8 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
 
 static size_t
 generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
-                           uint32_t rt_count, uint32_t *dump_region_size)
+                           uint32_t rt_count, bool tracing_enabled,
+                           uint32_t *dump_region_size)
 {
    assert(rt_count >= 1 && rt_count <= MAX_RTS);
    uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count);
@@ -44,6 +45,13 @@ generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
       .dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr),
       .ls_sb_slot = SB_ID(LS),
    };
+   struct cs_tracing_ctx tracing_ctx = {
+      .enabled = tracing_enabled,
+      .ctx_reg = cs_subqueue_ctx_reg(&b),
+      .tracebuf_addr_offset =
+         offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
+      .ls_sb_slot = SB_ID(LS),
+   };
 
    cs_exception_handler_def(&b, &handler, handler_ctx) {
       struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@@ -79,7 +87,12 @@
 
       cs_req_res(&b, CS_FRAG_RES);
       cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
-         cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+         if (tracing_enabled)
+            cs_trace_run_fragment(&b, &tracing_ctx,
+                                  cs_scratch_reg_tuple(&b, 8, 4), false,
+                                  MALI_TILE_RENDER_ORDER_Z_ORDER, false);
+         else
+            cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
         cs_add32(&b, layer_count, layer_count, -1);
         cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size);
      }
@@ -135,6 +148,9 @@ generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
 VkResult
 panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
 {
+   struct panvk_instance *instance =
+      to_panvk_instance(device->vk.physical->instance);
+   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
    VkResult result = panvk_priv_bo_create(
       device, TILER_OOM_HANDLER_MAX_SIZE * 2 * MAX_RTS, 0,
       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &device->tiler_oom.handlers_bo);
@@ -157,7 +173,7 @@ panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
 
       uint32_t dump_region_size;
       size_t handler_length = generate_tiler_oom_handler(
-         handler_mem, zs_ext, rt_count, &dump_region_size);
+         handler_mem, zs_ext, rt_count, tracing_enabled, &dump_region_size);
 
       /* All handlers must have the same length */
       assert(idx == 0 || handler_length == device->tiler_oom.handler_stride);
diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c
index df3a9c23bce..ca94bfeddb4 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_queue.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c
@@ -17,20 +17,29 @@
 #include "vk_drm_syncobj.h"
 #include "vk_log.h"
 
+#define MIN_DESC_TRACEBUF_SIZE (128 * 1024)
+#define DEFAULT_DESC_TRACEBUF_SIZE (2 * 1024 * 1024)
+#define MIN_CS_TRACEBUF_SIZE (512 * 1024)
+#define DEFAULT_CS_TRACEBUF_SIZE (2 * 1024 * 1024)
+
 static void
 finish_render_desc_ringbuf(struct panvk_queue *queue)
 {
    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
+   struct panvk_instance *instance =
+      to_panvk_instance(dev->vk.physical->instance);
+   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
    struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
 
    panvk_pool_free_mem(&ringbuf->syncobj);
 
    if (dev->debug.decode_ctx && ringbuf->addr.dev) {
       pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
-                            RENDER_DESC_RINGBUF_SIZE);
-      pandecode_inject_free(dev->debug.decode_ctx,
-                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
-                            RENDER_DESC_RINGBUF_SIZE);
+                            ringbuf->size);
+      if (!tracing_enabled)
+         pandecode_inject_free(dev->debug.decode_ctx,
+                               ringbuf->addr.dev + ringbuf->size,
+                               ringbuf->size);
    }
 
    if (ringbuf->addr.dev) {
@@ -38,7 +47,7 @@ finish_render_desc_ringbuf(struct panvk_queue *queue)
          .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
          .va = {
             .start = ringbuf->addr.dev,
-            .size = RENDER_DESC_RINGBUF_SIZE * 2,
+            .size = ringbuf->size * (tracing_enabled ? 1 : 2),
          },
       };
 
@@ -47,14 +56,13 @@ finish_render_desc_ringbuf(struct panvk_queue *queue)
       assert(!ret);
 
       simple_mtx_lock(&dev->as.lock);
-      util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev,
-                         RENDER_DESC_RINGBUF_SIZE * 2);
+      util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev, ringbuf->size * 2);
       simple_mtx_unlock(&dev->as.lock);
    }
 
    if (ringbuf->addr.host) {
       ASSERTED int ret =
-         os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
+         os_munmap(ringbuf->addr.host, ringbuf->size);
       assert(!ret);
    }
 
@@ -65,21 +73,35 @@ static VkResult
 init_render_desc_ringbuf(struct panvk_queue *queue)
 {
    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
+   struct panvk_instance *instance =
+      to_panvk_instance(dev->vk.physical->instance);
+   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
    uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
    struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
-   const size_t size = RENDER_DESC_RINGBUF_SIZE;
    uint64_t dev_addr = 0;
    VkResult result;
    int ret;
 
-   ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
+   if (tracing_enabled) {
+      ringbuf->size = debug_get_num_option("PANVK_DESC_TRACEBUF_SIZE",
+                                           DEFAULT_DESC_TRACEBUF_SIZE);
+      flags |= PAN_KMOD_BO_FLAG_GPU_UNCACHED;
+      assert(ringbuf->size > MIN_DESC_TRACEBUF_SIZE &&
+             util_is_power_of_two_nonzero(ringbuf->size));
+   } else {
+      ringbuf->size = RENDER_DESC_RINGBUF_SIZE;
+   }
+
+   ringbuf->bo =
+      pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, ringbuf->size, flags);
    if (!ringbuf->bo)
       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                           "Failed to create a descriptor ring buffer context");
 
    if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
-      ringbuf->addr.host = pan_kmod_bo_mmap(
-         ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
+      ringbuf->addr.host =
+         pan_kmod_bo_mmap(ringbuf->bo, 0, ringbuf->size, PROT_READ | PROT_WRITE,
+                          MAP_SHARED, NULL);
       if (ringbuf->addr.host == MAP_FAILED) {
          result = panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
                                "Failed to CPU map ringbuf BO");
@@ -91,7 +113,8 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
     * boundary when accessing the mapping. This way we can encode the wraparound
     * using 32-bit operations. */
    simple_mtx_lock(&dev->as.lock);
-   dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);
+   dev_addr =
+      util_vma_heap_alloc(&dev->as.heap, ringbuf->size * 2, ringbuf->size * 2);
    simple_mtx_unlock(&dev->as.lock);
 
    if (!dev_addr) {
@@ -106,7 +129,7 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
          .type = PAN_KMOD_VM_OP_TYPE_MAP,
          .va = {
             .start = dev_addr,
-            .size = RENDER_DESC_RINGBUF_SIZE,
+            .size = ringbuf->size,
          },
          .map = {
            .bo = ringbuf->bo,
@@ -116,8 +139,8 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
      {
         .type = PAN_KMOD_VM_OP_TYPE_MAP,
         .va = {
-           .start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
-           .size = RENDER_DESC_RINGBUF_SIZE,
+           .start = dev_addr + ringbuf->size,
+           .size = ringbuf->size,
         },
         .map = {
           .bo = ringbuf->bo,
@@ -126,8 +149,10 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
      },
   };
 
+   /* If tracing is enabled, we keep the second part of the mapping unmapped
+    * to serve as a guard region. */
    ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
-                          ARRAY_SIZE(vm_ops));
+                          tracing_enabled ? 1 : ARRAY_SIZE(vm_ops));
    if (ret) {
       result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                             "Failed to GPU map ringbuf BO");
@@ -138,10 +163,11 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
 
    if (dev->debug.decode_ctx) {
       pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
-                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
-      pandecode_inject_mmap(dev->debug.decode_ctx,
-                            ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
-                            ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
+                            ringbuf->addr.host, ringbuf->size, NULL);
+      if (!tracing_enabled)
+         pandecode_inject_mmap(dev->debug.decode_ctx,
+                               ringbuf->addr.dev + ringbuf->size,
+                               ringbuf->addr.host, ringbuf->size, NULL);
    }
 
    struct panvk_pool_alloc_info alloc_info = {
@@ -168,7 +194,7 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
 err_finish_ringbuf:
    if (dev_addr && !ringbuf->addr.dev) {
       simple_mtx_lock(&dev->as.lock);
-      util_vma_heap_free(&dev->as.heap, dev_addr, size * 2);
+      util_vma_heap_free(&dev->as.heap, dev_addr, ringbuf->size * 2);
       simple_mtx_unlock(&dev->as.lock);
    }
 
@@ -176,6 +202,143 @@ err_finish_ringbuf:
    return result;
 }
 
+static void
+finish_subqueue_tracing(struct panvk_queue *queue,
+                        enum panvk_subqueue_id subqueue)
+{
+   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
+   struct panvk_subqueue *subq = &queue->subqueues[subqueue];
+
+   if (subq->tracebuf.addr.dev) {
+      size_t pgsize = getpagesize();
+
+      pandecode_inject_free(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
+                            subq->tracebuf.size);
+
+      struct pan_kmod_vm_op op = {
+         .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
+         .va = {
+            .start = subq->tracebuf.addr.dev,
+            .size = subq->tracebuf.size,
+         },
+      };
+
+      ASSERTED int ret =
+         pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
+      assert(!ret);
+
+      simple_mtx_lock(&dev->as.lock);
+      util_vma_heap_free(&dev->as.heap, subq->tracebuf.addr.dev,
+                         subq->tracebuf.size + pgsize);
+      simple_mtx_unlock(&dev->as.lock);
+   }
+
+   if (subq->tracebuf.addr.host) {
+      ASSERTED int ret =
+         os_munmap(subq->tracebuf.addr.host, subq->tracebuf.size);
+      assert(!ret);
+   }
+
+   pan_kmod_bo_put(subq->tracebuf.bo);
+
+   vk_free(&dev->vk.alloc, subq->reg_file);
+}
+
+static VkResult
+init_subqueue_tracing(struct panvk_queue *queue,
+                      enum panvk_subqueue_id subqueue)
+{
+   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
+   struct panvk_subqueue *subq = &queue->subqueues[subqueue];
+   struct panvk_instance *instance =
+      to_panvk_instance(dev->vk.physical->instance);
+   unsigned debug = instance->debug_flags;
+   uint64_t dev_addr;
+
+   if (!(debug & PANVK_DEBUG_TRACE))
+      return VK_SUCCESS;
+
+   subq->reg_file =
+      vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
+                VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!subq->reg_file)
+      return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
+                          "Failed to allocate reg file cache");
+
+   subq->tracebuf.size = debug_get_num_option("PANVK_CS_TRACEBUF_SIZE",
+                                              DEFAULT_CS_TRACEBUF_SIZE);
+   assert(subq->tracebuf.size > MIN_CS_TRACEBUF_SIZE &&
+          util_is_power_of_two_nonzero(subq->tracebuf.size));
+
+   subq->tracebuf.bo =
+      pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, subq->tracebuf.size,
+                        PAN_KMOD_BO_FLAG_GPU_UNCACHED);
+   if (!subq->tracebuf.bo)
+      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to create a CS tracebuf");
+
+   subq->tracebuf.addr.host =
+      pan_kmod_bo_mmap(subq->tracebuf.bo, 0, subq->tracebuf.size,
+                       PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
+   if (subq->tracebuf.addr.host == MAP_FAILED) {
+      subq->tracebuf.addr.host = NULL;
+      return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
+                          "Failed to CPU map tracebuf");
+   }
+
+   /* Add a guard page. */
+   size_t pgsize = getpagesize();
+   simple_mtx_lock(&dev->as.lock);
+   dev_addr =
+      util_vma_heap_alloc(&dev->as.heap, subq->tracebuf.size + pgsize, pgsize);
+   simple_mtx_unlock(&dev->as.lock);
+
+   if (!dev_addr)
+      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to allocate virtual address for tracebuf");
+
+   struct pan_kmod_vm_op vm_op = {
+      .type = PAN_KMOD_VM_OP_TYPE_MAP,
+      .va = {
+         .start = dev_addr,
+         .size = subq->tracebuf.size,
+      },
+      .map = {
+         .bo = subq->tracebuf.bo,
+         .bo_offset = 0,
+      },
+   };
+
+   /* Only the tracebuf itself is mapped: the extra page at the end of the
+    * VA range is left unmapped and acts as a guard page. */
+   int ret =
+      pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &vm_op, 1);
+   if (ret) {
+      simple_mtx_lock(&dev->as.lock);
+      util_vma_heap_free(&dev->as.heap, dev_addr, subq->tracebuf.size + pgsize);
+      simple_mtx_unlock(&dev->as.lock);
+      return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to GPU map tracebuf BO");
+   }
+
+   subq->tracebuf.addr.dev = dev_addr;
+
+   if (dev->debug.decode_ctx) {
+      pandecode_inject_mmap(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
+                            subq->tracebuf.addr.host, subq->tracebuf.size,
+                            NULL);
+   }
+
+   return VK_SUCCESS;
+}
+
+static void
+finish_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
+{
+   panvk_pool_free_mem(&queue->subqueues[subqueue].context);
+   finish_subqueue_tracing(queue, subqueue);
+}
+
 static VkResult
 init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
 {
@@ -188,21 +351,21 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
    unsigned debug = instance->debug_flags;
    struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
 
-   if (debug & PANVK_DEBUG_TRACE) {
-      subq->reg_file =
-         vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
-                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-      if (!subq->reg_file)
-         return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
-                             "Failed to allocate reg file cache");
-   }
+   VkResult result = init_subqueue_tracing(queue, subqueue);
+   if (result != VK_SUCCESS)
+      return result;
 
    struct panvk_pool_alloc_info alloc_info = {
       .size = sizeof(struct panvk_cs_subqueue_context),
       .alignment = 64,
    };
 
-   subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
+   /* When tracing is enabled, we want to use a non-cached pool, so we can get
+    * an up-to-date context even if the CS crashed in the middle. */
+   struct panvk_pool *mempool =
+      (debug & PANVK_DEBUG_TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;
+
+   subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
    if (!panvk_priv_mem_host_addr(subq->context))
       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                           "Failed to create a queue context");
@@ -212,7 +375,8 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
 
    *cs_ctx = (struct panvk_cs_subqueue_context){
       .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
-      .debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
+      .debug.syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
+      .debug.tracebuf.cs = subq->tracebuf.addr.dev,
       .iter_sb = 0,
       .tiler_oom_ctx.reg_dump_addr =
          panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save),
@@ -302,10 +466,11 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
                          "SyncobjWait failed: %m");
 
    if (debug & PANVK_DEBUG_TRACE) {
-      pandecode_interpret_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
-                             qsubmit.stream_size,
-                             phys_dev->kmod.props.gpu_prod_id, subq->reg_file);
-      pandecode_next_frame(dev->debug.decode_ctx);
+      pandecode_user_msg(dev->debug.decode_ctx, "Init subqueue %d binary\n\n",
+                         subqueue);
+      pandecode_cs_binary(dev->debug.decode_ctx, qsubmit.stream_addr,
+                          qsubmit.stream_size,
+                          phys_dev->kmod.props.gpu_prod_id);
    }
 
    return VK_SUCCESS;
@@ -314,12 +479,8 @@
 static void
 cleanup_queue(struct panvk_queue *queue)
 {
-   struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
-
-   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
-      panvk_pool_free_mem(&queue->subqueues[i].context);
-      vk_free(&dev->vk.alloc, queue->subqueues[i].reg_file);
-   }
+   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
+      finish_subqueue(queue, i);
 
    finish_render_desc_ringbuf(queue);
 
@@ -334,6 +495,7 @@ init_queue(struct panvk_queue *queue)
    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
    struct panvk_instance *instance =
       to_panvk_instance(dev->vk.physical->instance);
+   unsigned debug = instance->debug_flags;
    VkResult result;
 
    struct panvk_pool_alloc_info alloc_info = {
@@ -379,6 +541,9 @@ init_queue(struct panvk_queue *queue)
          goto err_cleanup_queue;
    }
 
+   if (debug & PANVK_DEBUG_TRACE)
+      pandecode_next_frame(dev->debug.decode_ctx);
+
    return VK_SUCCESS;
 
 err_cleanup_queue:
@@ -718,9 +883,27 @@ static VkResult
 panvk_queue_submit_ioctl(struct panvk_queue_submit *submit)
 {
    const struct panvk_device *dev = submit->dev;
+   const struct panvk_instance *instance = submit->instance;
    struct panvk_queue *queue = submit->queue;
    int ret;
 
+   if (instance->debug_flags & PANVK_DEBUG_TRACE) {
+      /* If we're tracing, we need to reset the desc ringbufs and the CS
+       * tracebuf. */
+      for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
+         struct panvk_cs_subqueue_context *ctx =
+            panvk_priv_mem_host_addr(queue->subqueues[i].context);
+
+         if (ctx->render.desc_ringbuf.ptr) {
+            ctx->render.desc_ringbuf.ptr = queue->render_desc_ringbuf.addr.dev;
+            ctx->render.desc_ringbuf.pos = 0;
+         }
+
+         if (ctx->debug.tracebuf.cs)
+            ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
+      }
+   }
+
    struct drm_panthor_group_submit gsubmit = {
       .group_handle = queue->group_handle,
       .queue_submits =
@@ -781,16 +964,36 @@ panvk_queue_submit_process_debug(const struct panvk_queue_submit *submit)
       if (!qsubmit->stream_size)
         continue;
 
-      uint32_t subqueue = qsubmit->queue_index;
+      pandecode_user_msg(decode_ctx, "CS %d on subqueue %d binaries\n\n", i,
+                         qsubmit->queue_index);
+      pandecode_cs_binary(decode_ctx, qsubmit->stream_addr,
+                          qsubmit->stream_size, props->gpu_prod_id);
+      pandecode_user_msg(decode_ctx, "\n");
+   }
 
-      simple_mtx_lock(&decode_ctx->lock);
-      pandecode_dump_file_open(decode_ctx);
-      pandecode_log(decode_ctx, "CS%d\n", subqueue);
-      simple_mtx_unlock(&decode_ctx->lock);
+   for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
+      struct panvk_cs_subqueue_context *ctx =
+         panvk_priv_mem_host_addr(queue->subqueues[i].context);
 
-      pandecode_interpret_cs(decode_ctx, qsubmit->stream_addr,
-                             qsubmit->stream_size, props->gpu_prod_id,
-                             queue->subqueues[subqueue].reg_file);
+      size_t trace_size =
+         ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
+      if (!trace_size)
+         continue;
+
+      assert(
+         trace_size <= queue->subqueues[i].tracebuf.size ||
+         !"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
+
+      assert(
+         !ctx->render.desc_ringbuf.ptr ||
+         ctx->render.desc_ringbuf.pos <= queue->render_desc_ringbuf.size ||
+         !"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
+
+      mali_ptr trace = queue->subqueues[i].tracebuf.addr.dev;
+
+      pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n", i);
+      pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_prod_id);
+      pandecode_user_msg(decode_ctx, "\n");
    }
 }
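
Editor's note, not part of the patch: the render-descriptor ring buffer above relies on mapping the same BO twice, back to back, so a 32-bit position can run past the end of the first mapping and be folded back into range with a single conditional subtract (see cs_render_desc_ringbuf_move_ptr() and the two PAN_KMOD_VM_OP_TYPE_MAP ops). The sketch below is a CPU-side illustration of that same trick using memfd_create()/mmap(); the names and sizes are illustrative only, not driver code.

/* Minimal sketch of the "double mapping" ring-buffer trick. */
#define _GNU_SOURCE
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
   const size_t size = 1u << 20; /* one mapping, page aligned */
   const int fd = memfd_create("ringbuf", 0);
   assert(fd >= 0);
   assert(ftruncate(fd, size) == 0);

   /* Reserve 2 * size of contiguous VA, then map the same pages twice,
    * back to back (the GPU equivalent of the two MAP vm_ops above). */
   uint8_t *base = mmap(NULL, size * 2, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   assert(base != MAP_FAILED);
   assert(mmap(base, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
               fd, 0) == base);
   assert(mmap(base + size, size, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_FIXED, fd, 0) == base + size);

   /* A copy starting 4 bytes before the end spills into the second mapping,
    * i.e. lands at the start of the underlying buffer. */
   uint32_t pos = size - 4;
   memcpy(base + pos, "wrapped!", 8);

   /* Fold the 32-bit position back into [0, size) with one conditional
    * subtract, which is all the CS-side helper has to do. */
   pos += 8;
   if (pos >= size)
      pos -= size;

   printf("pos = %u, start of buffer = \"%.4s\"\n", pos,
          (const char *)base); /* prints: pos = 4, start of buffer = "ped!" */
   return 0;
}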