From bf05842a8ddf192c0e46cbb3e7bbe25b7a08c761 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Thu, 21 Nov 2024 18:31:48 +0100
Subject: [PATCH] pan/cs: Add an event-based tracing mechanism

Interpreting the command buffer only really works if everything is
static, but panvk has started to make extensive use of loops and
conditionals that depend on memory values updated by the command
stream itself. This makes it impossible to walk back to the original
state in order to replay the CS actions.

Move away from this approach in favor of an event-based tracing
mechanism that records particular CS commands and their context at
execution time. Of course, that means the auxiliary descriptors
shouldn't be recycled until the traces are decoded, but that's more
tractable: we just need to turn the descriptor ring buffers into
linear buffers with a guard page and crash on OOB, with a message
suggesting that the user tweak the maximum trace buffer sizes.

Signed-off-by: Boris Brezillon
Reviewed-by: Mary Guillemard
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Part-of:
---
 src/panfrost/lib/genxml/cs_builder.h    | 176 ++++++++++++++++++++++++
 src/panfrost/lib/genxml/decode.h        |   2 +
 src/panfrost/lib/genxml/decode_common.c |  17 +++
 src/panfrost/lib/genxml/decode_csf.c    | 107 ++++++++++++++
 src/panfrost/lib/wrap.h                 |   3 +
 5 files changed, 305 insertions(+)

diff --git a/src/panfrost/lib/genxml/cs_builder.h b/src/panfrost/lib/genxml/cs_builder.h
index 86a27e0180c..3a8ba777496 100644
--- a/src/panfrost/lib/genxml/cs_builder.h
+++ b/src/panfrost/lib/genxml/cs_builder.h
@@ -1880,3 +1880,179 @@ cs_exception_handler_end(struct cs_builder *b,
       cs_exception_handler_start(__b, __handler, __ctx);                      \
       __ehandler != NULL;                                                     \
       cs_exception_handler_end(__b, __handler), __ehandler = NULL)
+
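+/* Tracing context. ctx_reg is a 64-bit register pair pointing to a
+ * driver-owned context, and tracebuf_addr_offset is the offset, inside that
+ * context, of the trace buffer write pointer that cs_trace_preamble() loads
+ * and advances. ls_sb_slot is the scoreboard slot used to wait on LOAD/STORE
+ * completion. */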
+struct cs_tracing_ctx {
+   bool enabled;
+   struct cs_index ctx_reg;
+   unsigned tracebuf_addr_offset;
+   uint8_t ls_sb_slot;
+};
+
+static inline void
+cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                  struct cs_index scratch_regs, unsigned trace_size)
+{
+   assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
+          trace_size < INT16_MAX);
+   assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+
+   /* We always update the tracebuf position first, so we can easily detect
+    * OOB access. Use cs_trace_field_offset() to get an offset taking this
+    * pre-increment into account.
+    */
+   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
+   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
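+/* Because the preamble bumps the trace buffer pointer by trace_size before
+ * any field is written, the offsets passed to the store instructions are
+ * negative, relative to the end of the current record. For instance, with
+ * the 64-byte cs_run_fragment_trace defined below,
+ * cs_trace_field_offset(run_fragment, ip) is 0 - 64 = -64 and
+ * cs_trace_field_offset(run_fragment, sr) is 8 - 64 = -56. */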
+#define cs_trace_field_offset(__type, __field)                                \
+   (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                  \
+             sizeof(struct cs_##__type##_trace))
+
+struct cs_run_fragment_trace {
+   uint64_t ip;
+   uint32_t sr[7];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                      struct cs_index scratch_regs, bool enable_tem,
+                      enum mali_tile_render_order tile_order, bool progress_inc)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_fragment(b, enable_tem, tile_order, progress_inc);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_fragment_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction. */
+   cs_load_ip_to(b, data);
+   cs_run_fragment(b, enable_tem, tile_order, progress_inc);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));
+
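+   /* RUN_FRAGMENT only sources its state from registers r40-r46, so that is
+    * the only range worth capturing; the decoder copies it back to the same
+    * register window. */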
+   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
+            cs_trace_field_offset(run_fragment, sr));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+struct cs_run_idvs_trace {
+   uint64_t ip;
+   uint32_t draw_id;
+   uint32_t pad;
+   uint32_t sr[61];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                  struct cs_index scratch_regs, uint32_t flags_override,
+                  bool progress_inc, bool malloc_enable,
+                  struct cs_shader_res_sel varying_sel,
+                  struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
+                  frag_sel, draw_id);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_idvs_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction.
+    */
+   cs_load_ip_to(b, data);
+   cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
+               frag_sel, draw_id);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));
+
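+   /* The draw ID is only captured when the draw actually passes one; at
+    * decode time it is re-injected into the register file based on the
+    * instruction's draw_id_register_enable flag. */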
+   if (draw_id.type != CS_INDEX_UNDEF)
+      cs_store32(b, draw_id, tracebuf_addr,
+                 cs_trace_field_offset(run_idvs, draw_id));
+
+   for (unsigned i = 0; i < 48; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_idvs, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
+            cs_trace_field_offset(run_idvs, sr[48]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+struct cs_run_compute_trace {
+   uint64_t ip;
+   uint32_t sr[40];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                     struct cs_index scratch_regs, unsigned task_increment,
+                     enum mali_task_axis task_axis, bool progress_inc,
+                     struct cs_shader_res_sel res_sel)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_compute_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction. */
+   cs_load_ip_to(b, data);
+   cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
+
+   for (unsigned i = 0; i < 32; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_compute, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
+            cs_trace_field_offset(run_compute, sr[32]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+static inline void
+cs_trace_run_compute_indirect(struct cs_builder *b,
+                              const struct cs_tracing_ctx *ctx,
+                              struct cs_index scratch_regs,
+                              unsigned wg_per_task, bool progress_inc,
+                              struct cs_shader_res_sel res_sel)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_compute_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction.
+    */
+   cs_load_ip_to(b, data);
+   cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
+
+   for (unsigned i = 0; i < 32; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_compute, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
+            cs_trace_field_offset(run_compute, sr[32]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h
index 41c57db5e0c..be6b5ce4a58 100644
--- a/src/panfrost/lib/genxml/decode.h
+++ b/src/panfrost/lib/genxml/decode.h
@@ -134,6 +134,8 @@ void pandecode_interpret_cs_v10(struct pandecode_context *ctx, mali_ptr queue,
                                 uint32_t size, unsigned gpu_id, uint32_t *regs);
 void pandecode_cs_binary_v10(struct pandecode_context *ctx, mali_ptr bin,
                              uint32_t bin_size, unsigned gpu_id);
+void pandecode_cs_trace_v10(struct pandecode_context *ctx, mali_ptr trace,
+                            uint32_t trace_size, unsigned gpu_id);
 
 /* Logging infrastructure */
 static void
diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c
index 1db94b8847d..c096c9d9687 100644
--- a/src/panfrost/lib/genxml/decode_common.c
+++ b/src/panfrost/lib/genxml/decode_common.c
@@ -450,6 +450,23 @@ pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr bin_gpu_va,
    simple_mtx_unlock(&ctx->lock);
 }
 
+void
+pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
+                   uint32_t size, unsigned gpu_id)
+{
+   simple_mtx_lock(&ctx->lock);
+
+   switch (pan_arch(gpu_id)) {
+   case 10:
+      pandecode_cs_trace_v10(ctx, trace_gpu_va, size, gpu_id);
+      break;
+   default:
+      unreachable("Unsupported architecture");
+   }
+
+   simple_mtx_unlock(&ctx->lock);
+}
+
 void
 pandecode_shader_disassemble(struct pandecode_context *ctx, mali_ptr shader_ptr,
                              unsigned gpu_id)
diff --git a/src/panfrost/lib/genxml/decode_csf.c b/src/panfrost/lib/genxml/decode_csf.c
index 65994ab3cb2..5aa5bf162ec 100644
--- a/src/panfrost/lib/genxml/decode_csf.c
+++ b/src/panfrost/lib/genxml/decode_csf.c
@@ -30,6 +30,9 @@
 #include "decode.h"
 
 #if PAN_ARCH >= 10
+
+#include "genxml/cs_builder.h"
+
 /* Limit for Mali-G610. -1 because we're not including the active frame */
 #define MAX_CALL_STACK_DEPTH (8 - 1)
 
@@ -1498,6 +1501,14 @@ print_cs_binary(struct pandecode_context *ctx, mali_ptr bin,
          break;
       }
 
+      case MALI_CS_OPCODE_RUN_IDVS:
+      case MALI_CS_OPCODE_RUN_FRAGMENT:
+      case MALI_CS_OPCODE_RUN_COMPUTE:
+      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
+         fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
+                 bin + (i * sizeof(uint64_t)));
+         break;
+
       default:
          break;
       }
@@ -1533,4 +1544,100 @@ GENX(pandecode_cs_binary)(struct pandecode_context *ctx, mali_ptr bin,
 
    pandecode_map_read_write(ctx);
 }
+
+void
+GENX(pandecode_cs_trace)(struct pandecode_context *ctx, mali_ptr trace,
+                         uint32_t trace_size, unsigned gpu_id)
+{
+   pandecode_dump_file_open(ctx);
+
+   void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);
+
+   while (trace_size > 0) {
+      uint32_t regs[256] = {0};
+      uint64_t *ip = trace_data;
+
+      uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));
+
+      /* Mali-G610 has 96 registers. Other devices are not supported yet; we
+       * can make this configurable later, when we encounter new Malis.
+       */
+      struct queue_ctx qctx = {
+         .nr_regs = 96,
+         .regs = regs,
+         .ip = instr,
+         .end = instr + 1,
+         .gpu_id = gpu_id,
+      };
+
+      pandecode_make_indent(ctx);
+      print_cs_instr(ctx->dump_stream, *instr);
+      fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);
+
+      pan_unpack(instr, CS_BASE, base);
+
+      switch (base.opcode) {
+      case MALI_CS_OPCODE_RUN_IDVS: {
+         struct cs_run_idvs_trace *idvs_trace = trace_data;
+
+         assert(trace_size >= sizeof(*idvs_trace));
+         pan_unpack(instr, CS_RUN_IDVS, I);
+         memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));
+
+         if (I.draw_id_register_enable)
+            regs[I.draw_id] = idvs_trace->draw_id;
+
+         pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = idvs_trace + 1;
+         trace_size -= sizeof(*idvs_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_FRAGMENT: {
+         struct cs_run_fragment_trace *frag_trace = trace_data;
+
+         assert(trace_size >= sizeof(*frag_trace));
+         pan_unpack(instr, CS_RUN_FRAGMENT, I);
+         memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
+         pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = frag_trace + 1;
+         trace_size -= sizeof(*frag_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_COMPUTE: {
+         struct cs_run_compute_trace *comp_trace = trace_data;
+
+         assert(trace_size >= sizeof(*comp_trace));
+         pan_unpack(instr, CS_RUN_COMPUTE, I);
+         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
+         pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = comp_trace + 1;
+         trace_size -= sizeof(*comp_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
+         struct cs_run_compute_trace *comp_trace = trace_data;
+
+         assert(trace_size >= sizeof(*comp_trace));
+         pan_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
+         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
+         pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = comp_trace + 1;
+         trace_size -= sizeof(*comp_trace);
+         break;
+      }
+
+      default:
+         assert(!"Invalid trace packet");
+         break;
+      }
+
+      pandecode_log(ctx, "\n");
+   }
+
+   fflush(ctx->dump_stream);
+   pandecode_map_read_write(ctx);
+}
 #endif
diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h
index 753be2da28a..2dfabf3f562 100644
--- a/src/panfrost/lib/wrap.h
+++ b/src/panfrost/lib/wrap.h
@@ -68,6 +68,9 @@ void pandecode_interpret_cs(struct pandecode_context *ctx,
 void pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr binary_gpu_va,
                          uint32_t size, unsigned gpu_id);
 
+void pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
+                        uint32_t size, unsigned gpu_id);
+
 void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va,
                               unsigned gpu_id);
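
Usage sketch (hypothetical driver-side glue: only cs_tracing_ctx,
cs_trace_run_fragment() and pandecode_cs_trace() come from this patch; the
register numbers, context layout and tile-order enum value are illustrative):

   struct cs_tracing_ctx tracing = {
      .enabled = true,
      /* Hypothetical: r90:r91 point to a driver context whose first 8 bytes
       * hold the trace buffer write pointer. */
      .ctx_reg = cs_reg64(b, 90),
      .tracebuf_addr_offset = 0,
      .ls_sb_slot = 0,
   };

   /* Emit a traced fragment job; r86-r89 serve as the scratch registers
    * (even base register, at least four registers, as cs_trace_preamble()
    * asserts). */
   cs_trace_run_fragment(b, &tracing, cs_reg_tuple(b, 86, 4), false,
                         MALI_TILE_RENDER_ORDER_Z_ORDER, false);

   /* Once the queue has idled, decode the accumulated records. */
   pandecode_cs_trace(pandecode_ctx, tracebuf_gpu_va, tracebuf_size, gpu_id);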