panvk/csf: Use event-based CS tracing

Use the new event-based tracing system to capture IDVS/COMPUTE/FRAGMENT
jobs and their context.

When tracing is enabled, the descriptor ring buffer is replaced by
a bigger linear buffer so that descriptors are not recycled before
we get a chance to decode the trace.

If a trace buffer is too small and an OOB access is detected, the
driver suggests allocating a bigger buffer through the
PANVK_{DESC,CS}_TRACEBUF_SIZE env vars.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32284>
Author: Boris Brezillon, 2024-11-21 18:39:17 +01:00 (committed by Marge Bot)
Parent: bf05842a8d
Commit: bd49fa68b0
7 changed files with 353 additions and 82 deletions

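For context, both trace buffers are sized through the environment when tracing is enabled (PANVK_DEBUG=trace), e.g. PANVK_CS_TRACEBUF_SIZE=8388608. Below is a condensed sketch of the sizing logic added by this commit, using the helper and macro names that appear in the diff; the wrapper function name is made up for illustration.

/* Hypothetical wrapper around the sizing logic added below: the size comes
 * from PANVK_CS_TRACEBUF_SIZE, defaults to DEFAULT_CS_TRACEBUF_SIZE (2 MiB),
 * and must be a power of two larger than MIN_CS_TRACEBUF_SIZE (512 KiB). */
static size_t
pick_cs_tracebuf_size(void)
{
   size_t size = debug_get_num_option("PANVK_CS_TRACEBUF_SIZE",
                                      DEFAULT_CS_TRACEBUF_SIZE);

   assert(size > MIN_CS_TRACEBUF_SIZE && util_is_power_of_two_nonzero(size));
   return size;
}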

@ -100,7 +100,12 @@ struct panvk_cs_subqueue_context {
uint32_t layer_count;
mali_ptr reg_dump_addr;
} tiler_oom_ctx;
uint64_t debug_syncobjs;
struct {
uint64_t syncobjs;
struct {
uint64_t cs;
} tracebuf;
} debug;
} __attribute__((aligned(64)));
struct panvk_cache_flush_info {
@ -249,6 +254,8 @@ struct panvk_cs_state {
/* Sync point relative to the beginning of the command buffer.
* Needs to be offset with the subqueue sync point. */
int32_t relative_sync_point;
struct cs_tracing_ctx tracing;
};
static inline struct panvk_cs_reg_upd_context *

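The new debug.tracebuf.cs field is the CS-side write pointer into the trace buffer: the subqueue advances it as it appends events, and the host reads it back after a submit to know how much trace data to decode. A minimal host-side sketch of that read-back (helper name hypothetical; the real check lives in the submit path further down):

/* How much trace data a subqueue wrote, derived from the advanced write
 * pointer minus the buffer base (mirrors the submit-time logic below). */
static size_t
cs_trace_bytes_written(const struct panvk_cs_subqueue_context *ctx,
                       uint64_t tracebuf_dev_addr)
{
   return ctx->debug.tracebuf.cs - tracebuf_dev_addr;
}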

@ -37,11 +37,21 @@ struct panvk_tiler_heap {
struct panvk_subqueue {
struct panvk_priv_mem context;
uint32_t *reg_file;
struct {
struct pan_kmod_bo *bo;
size_t size;
struct {
uint64_t dev;
void *host;
} addr;
} tracebuf;
};
struct panvk_desc_ringbuf {
struct panvk_priv_mem syncobj;
struct pan_kmod_bo *bo;
size_t size;
struct {
uint64_t dev;
void *host;


@ -138,7 +138,7 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
cs_move32_to(b, one, 1);
cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, debug_sync_addr, debug_sync_addr,
sizeof(struct panvk_cs_sync32) * subqueue);
@ -679,6 +679,7 @@ init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
};
for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
struct cs_builder *b = &cmdbuf->state.cs[i].builder;
/* Lazy allocation of the root CS. */
struct cs_buffer root_cs = {0};
@ -701,7 +702,17 @@ init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
conf.reg_perm = cs_reg_perm;
}
cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
cs_builder_init(b, &conf, root_cs);
if (instance->debug_flags & PANVK_DEBUG_TRACE) {
cmdbuf->state.cs[i].tracing = (struct cs_tracing_ctx){
.enabled = true,
.ctx_reg = cs_subqueue_ctx_reg(b),
.tracebuf_addr_offset =
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
.ls_sb_slot = SB_ID(LS),
};
}
}
}

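The run call sites below switch to cs_trace_run_*() unconditionally, passing the per-subqueue tracing context plus four scratch registers; the wrappers live in the shared CS builder code, which is not part of this diff. The sketch below is only an assumption about their shape: presumably they fall back to the plain cs_run_*() when the context is not enabled, since the command-buffer call sites use them even when PANVK_DEBUG_TRACE is off.

/* Speculative sketch, not the actual implementation: parameter names are
 * placeholders mirroring the three arguments used at the call sites. */
static inline void
sketch_trace_run_fragment(struct cs_builder *b,
                          const struct cs_tracing_ctx *ctx,
                          struct cs_index scratch_regs,
                          bool flag_a, unsigned tile_order, bool flag_b)
{
   if (!ctx->enabled) {
      cs_run_fragment(b, flag_a, tile_order, flag_b);
      return;
   }

   /* Assumed behavior: use scratch_regs to load the trace write pointer from
    * ctx->ctx_reg + ctx->tracebuf_addr_offset, store an event describing the
    * job and its context, advance the pointer, then issue the job itself. */
   cs_run_fragment(b, flag_a, tile_order, flag_b);
}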

@ -168,6 +168,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
&cmdbuf->state.compute.desc_state;
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!tsd.gpu)
@ -349,16 +351,18 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
cs_req_res(b, CS_COMPUTE_RES);
if (indirect)
cs_run_compute_indirect(b, wg_per_task, false,
cs_shader_res_sel(0, 0, 0, 0));
else {
if (indirect) {
cs_trace_run_compute_indirect(b, tracing_ctx,
cs_scratch_reg_tuple(b, 0, 4), wg_per_task,
false, cs_shader_res_sel(0, 0, 0, 0));
} else {
unsigned task_axis = MALI_TASK_AXIS_X;
unsigned task_increment = 0;
calculate_task_axis_and_increment(shader, phys_dev, &task_axis,
&task_increment);
cs_run_compute(b, task_increment, task_axis, false,
cs_shader_res_sel(0, 0, 0, 0));
cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
task_increment, task_axis, false,
cs_shader_res_sel(0, 0, 0, 0));
}
cs_req_res(b, 0);


@ -688,7 +688,8 @@ cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
}
static void
cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
bool wrap_around)
{
struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
@ -703,12 +704,15 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
/* Update the relative position and absolute address. */
cs_add32(b, ptr_lo, ptr_lo, size);
cs_add32(b, pos, pos, size);
cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
/* Wrap-around. */
cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
if (likely(wrap_around)) {
cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
}
}
cs_store(
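With tracing enabled the wrap-around is compiled out, so the descriptor area degenerates into a linear allocator over the larger trace buffer and old descriptors are never overwritten before decode. A small host-side model of the pointer update emitted above (helper name made up, sizes in bytes):

/* Model of cs_render_desc_ringbuf_move_ptr(): the buffer is mapped twice
 * back-to-back, so the absolute pointer can always advance by `size` and is
 * pulled back one buffer length once the relative position crosses the end.
 * With wrap_around == false (tracing) the pointer just keeps advancing. */
static void
model_desc_ringbuf_move_ptr(uint64_t *ptr, uint32_t *pos, uint32_t buf_size,
                            uint32_t size, bool wrap_around)
{
   *ptr += size;
   *pos += size;

   if (wrap_around && *pos >= buf_size) {
      *ptr -= buf_size;
      *pos -= buf_size;
   }
}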
@ -740,6 +744,9 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_instance *instance =
to_panvk_instance(phys_dev->vk.instance);
bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
struct panfrost_tiler_features tiler_features =
panfrost_query_tiler_features(&phys_dev->kmod.props);
bool simul_use =
@ -797,7 +804,7 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
render.desc_ringbuf.ptr));
}
cs_render_desc_ringbuf_move_ptr(b, descs_sz);
cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
} else {
cs_update_vt_ctx(b) {
cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
@ -1661,6 +1668,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
static void
panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
{
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@ -1718,11 +1727,12 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
cs_move32_to(b, counter_reg, idvs_count);
cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
cs_run_idvs(b, flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
cs_add32(b, counter_reg, counter_reg, -1);
cs_add32(b, counter_reg, counter_reg, -1);
cs_update_vt_ctx(b) {
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
pan_size(TILER_CONTEXT));
@ -1734,9 +1744,10 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
-(idvs_count * pan_size(TILER_CONTEXT)));
}
} else {
cs_run_idvs(b, flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
cs_undef());
cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
}
cs_req_res(b, 0);
}
@ -1803,6 +1814,8 @@ static void
panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
struct panvk_draw_info *draw)
{
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@ -1852,9 +1865,10 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
cs_wait_slot(b, SB_ID(LS), false);
cs_req_res(b, CS_IDVS_RES);
cs_run_idvs(b, flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
cs_undef());
cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
cs_req_res(b, 0);
}
@ -2147,6 +2161,8 @@ static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
@ -2234,13 +2250,16 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_add32(b, layer_count, layer_count, -1);
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
}
} else {
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
}
cs_req_res(b, 0);
@ -2318,8 +2337,10 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_wait_slot(b, SB_ID(LS), false);
/* Update the ring buffer position. */
if (free_render_descs)
cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf));
if (free_render_descs) {
cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
!tracing_ctx);
}
/* Update the frag seqno. */
++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;


@ -25,7 +25,8 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
static size_t
generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
uint32_t rt_count, uint32_t *dump_region_size)
uint32_t rt_count, bool tracing_enabled,
uint32_t *dump_region_size)
{
assert(rt_count >= 1 && rt_count <= MAX_RTS);
uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count);
@ -44,6 +45,12 @@ generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
.dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr),
.ls_sb_slot = SB_ID(LS),
};
struct cs_tracing_ctx tracing_ctx = {
.ctx_reg = cs_subqueue_ctx_reg(&b),
.tracebuf_addr_offset =
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
.ls_sb_slot = SB_ID(LS),
};
cs_exception_handler_def(&b, &handler, handler_ctx) {
struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@ -79,7 +86,12 @@ generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
cs_req_res(&b, CS_FRAG_RES);
cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
if (tracing_enabled)
cs_trace_run_fragment(&b, &tracing_ctx,
cs_scratch_reg_tuple(&b, 8, 4), false,
MALI_TILE_RENDER_ORDER_Z_ORDER, false);
else
cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_add32(&b, layer_count, layer_count, -1);
cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size);
}
@ -135,6 +147,9 @@ generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
VkResult
panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
{
struct panvk_instance *instance =
to_panvk_instance(device->vk.physical->instance);
bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
VkResult result = panvk_priv_bo_create(
device, TILER_OOM_HANDLER_MAX_SIZE * 2 * MAX_RTS, 0,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &device->tiler_oom.handlers_bo);
@ -157,7 +172,7 @@ panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
uint32_t dump_region_size;
size_t handler_length = generate_tiler_oom_handler(
handler_mem, zs_ext, rt_count, &dump_region_size);
handler_mem, zs_ext, rt_count, tracing_enabled, &dump_region_size);
/* All handlers must have the same length */
assert(idx == 0 || handler_length == device->tiler_oom.handler_stride);


@ -17,20 +17,29 @@
#include "vk_drm_syncobj.h"
#include "vk_log.h"
#define MIN_DESC_TRACEBUF_SIZE (128 * 1024)
#define DEFAULT_DESC_TRACEBUF_SIZE (2 * 1024 * 1024)
#define MIN_CS_TRACEBUF_SIZE (512 * 1024)
#define DEFAULT_CS_TRACEBUF_SIZE (2 * 1024 * 1024)
static void
finish_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
panvk_pool_free_mem(&ringbuf->syncobj);
if (dev->debug.decode_ctx && ringbuf->addr.dev) {
pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
RENDER_DESC_RINGBUF_SIZE);
pandecode_inject_free(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
RENDER_DESC_RINGBUF_SIZE);
ringbuf->size);
if (!tracing_enabled)
pandecode_inject_free(dev->debug.decode_ctx,
ringbuf->addr.dev + ringbuf->size,
ringbuf->size);
}
if (ringbuf->addr.dev) {
@ -38,7 +47,7 @@ finish_render_desc_ringbuf(struct panvk_queue *queue)
.type = PAN_KMOD_VM_OP_TYPE_UNMAP,
.va = {
.start = ringbuf->addr.dev,
.size = RENDER_DESC_RINGBUF_SIZE * 2,
.size = ringbuf->size * (tracing_enabled ? 2 : 1),
},
};
@ -47,14 +56,13 @@ finish_render_desc_ringbuf(struct panvk_queue *queue)
assert(!ret);
simple_mtx_lock(&dev->as.lock);
util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev,
RENDER_DESC_RINGBUF_SIZE * 2);
util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev, ringbuf->size * 2);
simple_mtx_unlock(&dev->as.lock);
}
if (ringbuf->addr.host) {
ASSERTED int ret =
os_munmap(ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE);
os_munmap(ringbuf->addr.host, ringbuf->size);
assert(!ret);
}
@ -65,21 +73,35 @@ static VkResult
init_render_desc_ringbuf(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
const size_t size = RENDER_DESC_RINGBUF_SIZE;
uint64_t dev_addr = 0;
VkResult result;
int ret;
ringbuf->bo = pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, size, flags);
if (tracing_enabled) {
ringbuf->size = debug_get_num_option("PANVK_DESC_TRACEBUF_SIZE",
DEFAULT_DESC_TRACEBUF_SIZE);
flags |= PAN_KMOD_BO_FLAG_GPU_UNCACHED;
assert(ringbuf->size > MIN_DESC_TRACEBUF_SIZE &&
util_is_power_of_two_nonzero(ringbuf->size));
} else {
ringbuf->size = RENDER_DESC_RINGBUF_SIZE;
}
ringbuf->bo =
pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, ringbuf->size, flags);
if (!ringbuf->bo)
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a descriptor ring buffer context");
if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
ringbuf->addr.host = pan_kmod_bo_mmap(
ringbuf->bo, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
ringbuf->addr.host =
pan_kmod_bo_mmap(ringbuf->bo, 0, ringbuf->size, PROT_READ | PROT_WRITE,
MAP_SHARED, NULL);
if (ringbuf->addr.host == MAP_FAILED) {
result = panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
"Failed to CPU map ringbuf BO");
@ -91,7 +113,8 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
* boundary when accessing the mapping. This way we can encode the wraparound
* using 32-bit operations. */
simple_mtx_lock(&dev->as.lock);
dev_addr = util_vma_heap_alloc(&dev->as.heap, size * 2, size * 2);
dev_addr =
util_vma_heap_alloc(&dev->as.heap, ringbuf->size * 2, ringbuf->size * 2);
simple_mtx_unlock(&dev->as.lock);
if (!dev_addr) {
@ -106,7 +129,7 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr,
.size = RENDER_DESC_RINGBUF_SIZE,
.size = ringbuf->size,
},
.map = {
.bo = ringbuf->bo,
@ -116,8 +139,8 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
{
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr + RENDER_DESC_RINGBUF_SIZE,
.size = RENDER_DESC_RINGBUF_SIZE,
.start = dev_addr + ringbuf->size,
.size = ringbuf->size,
},
.map = {
.bo = ringbuf->bo,
@ -126,8 +149,10 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
},
};
/* If tracing is enabled, we keep the second part of the mapping unmapped
* to serve as a guard region. */
ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
ARRAY_SIZE(vm_ops));
tracing_enabled ? 1 : ARRAY_SIZE(vm_ops));
if (ret) {
result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to GPU map ringbuf BO");
@ -138,10 +163,11 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
if (dev->debug.decode_ctx) {
pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
pandecode_inject_mmap(dev->debug.decode_ctx,
ringbuf->addr.dev + RENDER_DESC_RINGBUF_SIZE,
ringbuf->addr.host, RENDER_DESC_RINGBUF_SIZE, NULL);
ringbuf->addr.host, ringbuf->size, NULL);
if (!tracing_enabled)
pandecode_inject_mmap(dev->debug.decode_ctx,
ringbuf->addr.dev + ringbuf->size,
ringbuf->addr.host, ringbuf->size, NULL);
}
struct panvk_pool_alloc_info alloc_info = {
@ -168,7 +194,7 @@ init_render_desc_ringbuf(struct panvk_queue *queue)
err_finish_ringbuf:
if (dev_addr && !ringbuf->addr.dev) {
simple_mtx_lock(&dev->as.lock);
util_vma_heap_free(&dev->as.heap, dev_addr, size * 2);
util_vma_heap_free(&dev->as.heap, dev_addr, ringbuf->size * 2);
simple_mtx_unlock(&dev->as.lock);
}
@ -176,6 +202,143 @@ err_finish_ringbuf:
return result;
}
static void
finish_subqueue_tracing(struct panvk_queue *queue,
enum panvk_subqueue_id subqueue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_subqueue *subq = &queue->subqueues[subqueue];
if (subq->tracebuf.addr.dev) {
size_t pgsize = getpagesize();
pandecode_inject_free(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
subq->tracebuf.size);
struct pan_kmod_vm_op op = {
.type = PAN_KMOD_VM_OP_TYPE_UNMAP,
.va = {
.start = subq->tracebuf.addr.dev,
.size = subq->tracebuf.size,
},
};
ASSERTED int ret =
pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
assert(!ret);
simple_mtx_lock(&dev->as.lock);
util_vma_heap_free(&dev->as.heap, subq->tracebuf.addr.dev,
subq->tracebuf.size + pgsize);
simple_mtx_unlock(&dev->as.lock);
}
if (subq->tracebuf.addr.host) {
ASSERTED int ret =
os_munmap(subq->tracebuf.addr.host, subq->tracebuf.size);
assert(!ret);
}
pan_kmod_bo_put(subq->tracebuf.bo);
vk_free(&dev->vk.alloc, subq->reg_file);
}
static VkResult
init_subqueue_tracing(struct panvk_queue *queue,
enum panvk_subqueue_id subqueue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_subqueue *subq = &queue->subqueues[subqueue];
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
uint64_t dev_addr;
if (!(debug & PANVK_DEBUG_TRACE))
return VK_SUCCESS;
subq->reg_file =
vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!subq->reg_file)
return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
"Failed to allocate reg file cache");
subq->tracebuf.size = debug_get_num_option("PANVK_CS_TRACEBUF_SIZE",
DEFAULT_CS_TRACEBUF_SIZE);
assert(subq->tracebuf.size > MIN_CS_TRACEBUF_SIZE &&
util_is_power_of_two_nonzero(subq->tracebuf.size));
subq->tracebuf.bo =
pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, subq->tracebuf.size,
PAN_KMOD_BO_FLAG_GPU_UNCACHED);
if (!subq->tracebuf.bo)
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a CS tracebuf");
subq->tracebuf.addr.host =
pan_kmod_bo_mmap(subq->tracebuf.bo, 0, subq->tracebuf.size,
PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
if (subq->tracebuf.addr.host == MAP_FAILED) {
subq->tracebuf.addr.host = NULL;
return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
"Failed to CPU map tracebuf");
}
/* Add a guard page. */
size_t pgsize = getpagesize();
simple_mtx_lock(&dev->as.lock);
dev_addr =
util_vma_heap_alloc(&dev->as.heap, subq->tracebuf.size + pgsize, pgsize);
simple_mtx_unlock(&dev->as.lock);
if (!dev_addr)
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to allocate virtual address for tracebuf");
struct pan_kmod_vm_op vm_op = {
.type = PAN_KMOD_VM_OP_TYPE_MAP,
.va = {
.start = dev_addr,
.size = subq->tracebuf.size,
},
.map = {
.bo = subq->tracebuf.bo,
.bo_offset = 0,
},
};
/* The extra page reserved in the VA range above is left unmapped to serve
* as a guard region. */
int ret =
pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &vm_op, 1);
if (ret) {
simple_mtx_lock(&dev->as.lock);
util_vma_heap_free(&dev->as.heap, dev_addr, subq->tracebuf.size + pgsize);
simple_mtx_unlock(&dev->as.lock);
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to GPU map ringbuf BO");
}
subq->tracebuf.addr.dev = dev_addr;
if (dev->debug.decode_ctx) {
pandecode_inject_mmap(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
subq->tracebuf.addr.host, subq->tracebuf.size,
NULL);
}
return VK_SUCCESS;
}
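The VA reservation above is one page larger than the buffer, and only the buffer itself is bound, so a CS that writes past the end of its trace buffer hits unmapped VA instead of corrupting neighbouring allocations. A hypothetical helper showing how such a fault address maps back to the guard region:

/* Hypothetical check, not present in the driver: a faulting GPU address in
 * [dev_addr + size, dev_addr + size + pgsize) means the CS overflowed its
 * trace buffer and PANVK_CS_TRACEBUF_SIZE should be raised. */
static bool
fault_is_tracebuf_overflow(uint64_t fault_addr, uint64_t dev_addr,
                           size_t size, size_t pgsize)
{
   return fault_addr >= dev_addr + size &&
          fault_addr < dev_addr + size + pgsize;
}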
static void
finish_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
panvk_pool_free_mem(&queue->subqueues[subqueue].context);
finish_subqueue_tracing(queue, subqueue);
}
static VkResult
init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
{
@ -188,21 +351,21 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
unsigned debug = instance->debug_flags;
struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
if (debug & PANVK_DEBUG_TRACE) {
subq->reg_file =
vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!subq->reg_file)
return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
"Failed to allocate reg file cache");
}
VkResult result = init_subqueue_tracing(queue, subqueue);
if (result != VK_SUCCESS)
return result;
struct panvk_pool_alloc_info alloc_info = {
.size = sizeof(struct panvk_cs_subqueue_context),
.alignment = 64,
};
subq->context = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
/* When tracing is enabled, we want to use a non-cached pool, so we can get
* up-to-date context even if the CS crashed in the middle. */
struct panvk_pool *mempool =
(debug & PANVK_DEBUG_TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;
subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
if (!panvk_priv_mem_host_addr(subq->context))
return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Failed to create a queue context");
@ -212,7 +375,8 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
*cs_ctx = (struct panvk_cs_subqueue_context){
.syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
.debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
.debug.syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
.debug.tracebuf.cs = subq->tracebuf.addr.dev,
.iter_sb = 0,
.tiler_oom_ctx.reg_dump_addr =
panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save),
@ -302,10 +466,11 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
"SyncobjWait failed: %m");
if (debug & PANVK_DEBUG_TRACE) {
pandecode_interpret_cs(dev->debug.decode_ctx, qsubmit.stream_addr,
qsubmit.stream_size,
phys_dev->kmod.props.gpu_prod_id, subq->reg_file);
pandecode_next_frame(dev->debug.decode_ctx);
pandecode_user_msg(dev->debug.decode_ctx, "Init subqueue %d binary\n\n",
subqueue);
pandecode_cs_binary(dev->debug.decode_ctx, qsubmit.stream_addr,
qsubmit.stream_size,
phys_dev->kmod.props.gpu_prod_id);
}
return VK_SUCCESS;
@ -314,12 +479,8 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
static void
cleanup_queue(struct panvk_queue *queue)
{
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
panvk_pool_free_mem(&queue->subqueues[i].context);
vk_free(&dev->vk.alloc, queue->subqueues[i].reg_file);
}
for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
finish_subqueue(queue, i);
finish_render_desc_ringbuf(queue);
@ -334,6 +495,7 @@ init_queue(struct panvk_queue *queue)
struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
struct panvk_instance *instance =
to_panvk_instance(dev->vk.physical->instance);
unsigned debug = instance->debug_flags;
VkResult result;
struct panvk_pool_alloc_info alloc_info = {
@ -379,6 +541,9 @@ init_queue(struct panvk_queue *queue)
goto err_cleanup_queue;
}
if (debug & PANVK_DEBUG_TRACE)
pandecode_next_frame(dev->debug.decode_ctx);
return VK_SUCCESS;
err_cleanup_queue:
@ -718,9 +883,27 @@ static VkResult
panvk_queue_submit_ioctl(struct panvk_queue_submit *submit)
{
const struct panvk_device *dev = submit->dev;
const struct panvk_instance *instance = submit->instance;
struct panvk_queue *queue = submit->queue;
int ret;
if (instance->debug_flags & PANVK_DEBUG_TRACE) {
/* If we're tracing, we need to reset the desc ringbufs and the CS
* tracebuf. */
for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
struct panvk_cs_subqueue_context *ctx =
panvk_priv_mem_host_addr(queue->subqueues[i].context);
if (ctx->render.desc_ringbuf.ptr) {
ctx->render.desc_ringbuf.ptr = queue->render_desc_ringbuf.addr.dev;
ctx->render.desc_ringbuf.pos = 0;
}
if (ctx->debug.tracebuf.cs)
ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
}
}
struct drm_panthor_group_submit gsubmit = {
.group_handle = queue->group_handle,
.queue_submits =
@ -781,16 +964,36 @@ panvk_queue_submit_process_debug(const struct panvk_queue_submit *submit)
if (!qsubmit->stream_size)
continue;
uint32_t subqueue = qsubmit->queue_index;
pandecode_user_msg(decode_ctx, "CS %d on subqueue %d binaries\n\n", i,
qsubmit->queue_index);
pandecode_cs_binary(decode_ctx, qsubmit->stream_addr,
qsubmit->stream_size, props->gpu_prod_id);
pandecode_user_msg(decode_ctx, "\n");
}
simple_mtx_lock(&decode_ctx->lock);
pandecode_dump_file_open(decode_ctx);
pandecode_log(decode_ctx, "CS%d\n", subqueue);
simple_mtx_unlock(&decode_ctx->lock);
for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
struct panvk_cs_subqueue_context *ctx =
panvk_priv_mem_host_addr(queue->subqueues[i].context);
pandecode_interpret_cs(decode_ctx, qsubmit->stream_addr,
qsubmit->stream_size, props->gpu_prod_id,
queue->subqueues[subqueue].reg_file);
size_t trace_size =
ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
if (!trace_size)
continue;
assert(
trace_size <= queue->subqueues[i].tracebuf.size ||
!"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
assert(
!ctx->render.desc_ringbuf.ptr ||
ctx->render.desc_ringbuf.pos <= queue->render_desc_ringbuf.size ||
!"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
mali_ptr trace = queue->subqueues[i].tracebuf.addr.dev;
pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n", i);
pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_prod_id);
pandecode_user_msg(decode_ctx, "\n");
}
}