diff --git a/src/panfrost/lib/genxml/cs_builder.h b/src/panfrost/lib/genxml/cs_builder.h
index 86a27e0180c..3a8ba777496 100644
--- a/src/panfrost/lib/genxml/cs_builder.h
+++ b/src/panfrost/lib/genxml/cs_builder.h
@@ -1880,3 +1880,179 @@ cs_exception_handler_end(struct cs_builder *b,
         cs_exception_handler_start(__b, __handler, __ctx);                   \
         __ehandler != NULL;                                                  \
         cs_exception_handler_end(__b, __handler), __ehandler = NULL)
+
+struct cs_tracing_ctx {
+   bool enabled;
+   struct cs_index ctx_reg;
+   unsigned tracebuf_addr_offset;
+   uint8_t ls_sb_slot;
+};
+
+static inline void
+cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                  struct cs_index scratch_regs, unsigned trace_size)
+{
+   assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
+          trace_size < INT16_MAX);
+   assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+
+   /* We always update the tracebuf position first, so we can easily detect
+    * OOB access. Use cs_trace_field_offset() to get an offset taking this
+    * pre-increment into account. */
+   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
+   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+#define cs_trace_field_offset(__type, __field)                               \
+   (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                 \
+             sizeof(struct cs_##__type##_trace))
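+
+/* For example, with the cs_run_fragment_trace layout below (padded to 64
+ * bytes), cs_trace_field_offset(run_fragment, ip) evaluates to
+ * 0 - 64 = -64, i.e. the store lands at the start of the slot that
+ * cs_trace_preamble() just reserved with its pre-increment. */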
+
+struct cs_run_fragment_trace {
+   uint64_t ip;
+   uint32_t sr[7];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                      struct cs_index scratch_regs, bool enable_tem,
+                      enum mali_tile_render_order tile_order,
+                      bool progress_inc)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_fragment(b, enable_tem, tile_order, progress_inc);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_fragment_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction. */
+   cs_load_ip_to(b, data);
+   cs_run_fragment(b, enable_tem, tile_order, progress_inc);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));
+
+   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
+            cs_trace_field_offset(run_fragment, sr));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+struct cs_run_idvs_trace {
+   uint64_t ip;
+   uint32_t draw_id;
+   uint32_t pad;
+   uint32_t sr[61];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                  struct cs_index scratch_regs, uint32_t flags_override,
+                  bool progress_inc, bool malloc_enable,
+                  struct cs_shader_res_sel varying_sel,
+                  struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
+                  frag_sel, draw_id);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_idvs_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction.
+    */
+   cs_load_ip_to(b, data);
+   cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
+               frag_sel, draw_id);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));
+
+   if (draw_id.type != CS_INDEX_UNDEF)
+      cs_store32(b, draw_id, tracebuf_addr,
+                 cs_trace_field_offset(run_idvs, draw_id));
+
+   for (unsigned i = 0; i < 48; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_idvs, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
+            cs_trace_field_offset(run_idvs, sr[48]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
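+
+/* The store loops above and below copy registers in chunks of 16; this
+ * assumes a single cs_store() can write at most 16 consecutive registers,
+ * which is what the BITFIELD_MASK(16) write masks here rely on. */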
+
+struct cs_run_compute_trace {
+   uint64_t ip;
+   uint32_t sr[40];
+} __attribute__((aligned(64)));
+
+static inline void
+cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
+                     struct cs_index scratch_regs, unsigned task_increment,
+                     enum mali_task_axis task_axis, bool progress_inc,
+                     struct cs_shader_res_sel res_sel)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_compute_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction. */
+   cs_load_ip_to(b, data);
+   cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
+
+   for (unsigned i = 0; i < 32; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_compute, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
+            cs_trace_field_offset(run_compute, sr[32]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
+
+static inline void
+cs_trace_run_compute_indirect(struct cs_builder *b,
+                              const struct cs_tracing_ctx *ctx,
+                              struct cs_index scratch_regs,
+                              unsigned wg_per_task, bool progress_inc,
+                              struct cs_shader_res_sel res_sel)
+{
+   if (likely(!ctx->enabled)) {
+      cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
+      return;
+   }
+
+   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
+   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
+
+   cs_trace_preamble(b, ctx, scratch_regs,
+                     sizeof(struct cs_run_compute_trace));
+
+   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
+    * won't point to the right instruction.
+    */
+   cs_load_ip_to(b, data);
+   cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
+   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
+
+   for (unsigned i = 0; i < 32; i += 16)
+      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
+               cs_trace_field_offset(run_compute, sr[i]));
+   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
+            cs_trace_field_offset(run_compute, sr[32]));
+   cs_wait_slot(b, ctx->ls_sb_slot, false);
+}
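+
+/* Hypothetical usage sketch; the register/slot choices are illustrative,
+ * not mandated by this API, and wg_per_task/res_sel stand for the caller's
+ * existing values:
+ *
+ *    struct cs_tracing_ctx tctx = {
+ *       .enabled = true,
+ *       .ctx_reg = cs_reg64(b, 60),
+ *       .tracebuf_addr_offset = 0,
+ *       .ls_sb_slot = 0,
+ *    };
+ *    struct cs_index scratch = cs_reg_tuple(b, 70, 4);
+ *
+ *    cs_trace_run_compute_indirect(b, &tctx, scratch, wg_per_task,
+ *                                  false, res_sel);
+ */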
diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h
index 41c57db5e0c..be6b5ce4a58 100644
--- a/src/panfrost/lib/genxml/decode.h
+++ b/src/panfrost/lib/genxml/decode.h
@@ -134,6 +134,8 @@ void pandecode_interpret_cs_v10(struct pandecode_context *ctx, mali_ptr queue,
                                 uint32_t size, unsigned gpu_id, uint32_t *regs);
 void pandecode_cs_binary_v10(struct pandecode_context *ctx, mali_ptr bin,
                              uint32_t bin_size, unsigned gpu_id);
+void pandecode_cs_trace_v10(struct pandecode_context *ctx, mali_ptr trace,
+                            uint32_t trace_size, unsigned gpu_id);
 
 /* Logging infrastructure */
 static void
diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c
index 1db94b8847d..c096c9d9687 100644
--- a/src/panfrost/lib/genxml/decode_common.c
+++ b/src/panfrost/lib/genxml/decode_common.c
@@ -450,6 +450,23 @@ pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr bin_gpu_va,
    simple_mtx_unlock(&ctx->lock);
 }
 
+void
+pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
+                   uint32_t size, unsigned gpu_id)
+{
+   simple_mtx_lock(&ctx->lock);
+
+   switch (pan_arch(gpu_id)) {
+   case 10:
+      pandecode_cs_trace_v10(ctx, trace_gpu_va, size, gpu_id);
+      break;
+   default:
+      unreachable("Unsupported architecture");
+   }
+
+   simple_mtx_unlock(&ctx->lock);
+}
+
 void
 pandecode_shader_disassemble(struct pandecode_context *ctx, mali_ptr shader_ptr,
                              unsigned gpu_id)
diff --git a/src/panfrost/lib/genxml/decode_csf.c b/src/panfrost/lib/genxml/decode_csf.c
index 65994ab3cb2..5aa5bf162ec 100644
--- a/src/panfrost/lib/genxml/decode_csf.c
+++ b/src/panfrost/lib/genxml/decode_csf.c
@@ -30,6 +30,9 @@
 #include "decode.h"
 
 #if PAN_ARCH >= 10
+
+#include "genxml/cs_builder.h"
+
 /* Limit for Mali-G610. -1 because we're not including the active frame */
 #define MAX_CALL_STACK_DEPTH (8 - 1)
 
@@ -1498,6 +1501,14 @@ print_cs_binary(struct pandecode_context *ctx, mali_ptr bin,
          break;
       }
 
+      case MALI_CS_OPCODE_RUN_IDVS:
+      case MALI_CS_OPCODE_RUN_FRAGMENT:
+      case MALI_CS_OPCODE_RUN_COMPUTE:
+      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
+         fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
+                 bin + (i * sizeof(uint64_t)));
+         break;
+
       default:
          break;
       }
@@ -1533,4 +1544,100 @@ GENX(pandecode_cs_binary)(struct pandecode_context *ctx, mali_ptr bin,
 
    pandecode_map_read_write(ctx);
 }
+
+void
+GENX(pandecode_cs_trace)(struct pandecode_context *ctx, mali_ptr trace,
+                         uint32_t trace_size, unsigned gpu_id)
+{
+   pandecode_dump_file_open(ctx);
+
+   void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);
+
+   while (trace_size > 0) {
+      uint32_t regs[256] = {};
+      uint64_t *ip = trace_data;
+
+      uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));
+
+      /* Mali-G610 has 96 registers. Other devices are not supported yet; we
+       * can make this configurable when we encounter new Malis.
+       */
+      struct queue_ctx qctx = {
+         .nr_regs = 96,
+         .regs = regs,
+         .ip = instr,
+         .end = instr + 1,
+         .gpu_id = gpu_id,
+      };
+
+      pandecode_make_indent(ctx);
+      print_cs_instr(ctx->dump_stream, *instr);
+      fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);
+
+      pan_unpack(instr, CS_BASE, base);
+
+      switch (base.opcode) {
+      case MALI_CS_OPCODE_RUN_IDVS: {
+         struct cs_run_idvs_trace *idvs_trace = trace_data;
+
+         assert(trace_size >= sizeof(*idvs_trace));
+         pan_unpack(instr, CS_RUN_IDVS, I);
+         memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));
+
+         if (I.draw_id_register_enable)
+            regs[I.draw_id] = idvs_trace->draw_id;
+
+         pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = idvs_trace + 1;
+         trace_size -= sizeof(*idvs_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_FRAGMENT: {
+         struct cs_run_fragment_trace *frag_trace = trace_data;
+
+         assert(trace_size >= sizeof(*frag_trace));
+         pan_unpack(instr, CS_RUN_FRAGMENT, I);
+         memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
+         pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = frag_trace + 1;
+         trace_size -= sizeof(*frag_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_COMPUTE: {
+         struct cs_run_compute_trace *comp_trace = trace_data;
+
+         assert(trace_size >= sizeof(*comp_trace));
+         pan_unpack(instr, CS_RUN_COMPUTE, I);
+         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
+         pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = comp_trace + 1;
+         trace_size -= sizeof(*comp_trace);
+         break;
+      }
+
+      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
+         struct cs_run_compute_trace *comp_trace = trace_data;
+
+         assert(trace_size >= sizeof(*comp_trace));
+         pan_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
+         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
+         pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
+         trace_data = comp_trace + 1;
+         trace_size -= sizeof(*comp_trace);
+         break;
+      }
+
+      default:
+         assert(!"Invalid trace packet");
+         break;
+      }
+
+      pandecode_log(ctx, "\n");
+   }
+
+   fflush(ctx->dump_stream);
+   pandecode_map_read_write(ctx);
+}
 #endif
diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h
index 753be2da28a..2dfabf3f562 100644
--- a/src/panfrost/lib/wrap.h
+++ b/src/panfrost/lib/wrap.h
@@ -68,6 +68,9 @@ void pandecode_interpret_cs(struct pandecode_context *ctx,
 
 void pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr binary_gpu_va,
                          uint32_t size, unsigned gpu_id);
 
+void pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
+                        uint32_t size, unsigned gpu_id);
+
 void pandecode_abort_on_fault(struct pandecode_context *ctx,
                               uint64_t jc_gpu_va, unsigned gpu_id);