pan/cs: Add an event-based tracing mechanism

Interpreting the command buffer only really works if everything is
static, but panvk has started to make extensive use of loops and
conditionals that depend on memory values updated by the command
stream itself. This makes it impossible to walk back to the original
state in order to replay the CS actions.

Move away from this approach in favor of an event-based tracing
mechanism that records particular CS commands and their context at
execution time. Of course, that means the auxiliary descriptors
shouldn't be recycled until the traces are decoded, but that's more
tractable: we just need to turn the descriptor ring buffers into
linear buffers with a guard page and crash on OOB, with a message
suggesting that the user tweak the maximum trace buffer sizes.
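
To illustrate the intended usage (a hedged sketch, not part of this
commit: register numbers, offsets and flags below are placeholders
rather than panvk's actual layout), an emitter sets up a cs_tracing_ctx
once and then calls the cs_trace_run_*() wrappers instead of the plain
cs_run_*() helpers:

   /* Illustrative sketch only: all values below are placeholders. */
   struct cs_tracing_ctx tracing = {
      .enabled = true,
      /* Register pair holding a pointer to the per-queue context. */
      .ctx_reg = cs_reg64(b, 90),
      /* Offset of the trace buffer write pointer in that context. */
      .tracebuf_addr_offset = 0x40,
      /* Scoreboard slot used for load/store operations. */
      .ls_sb_slot = 0,
   };

   /* Drop-in replacement for cs_run_fragment(): when tracing is enabled,
    * it records the IP and r40-r46 in the trace buffer. The four scratch
    * registers must not overlap the registers being traced. */
   cs_trace_run_fragment(b, &tracing, cs_reg_tuple(b, 86, 4),
                         false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);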

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32284>
Boris Brezillon 2024-11-21 18:31:48 +01:00 committed by Marge Bot
parent 4e5f75d1d7
commit bf05842a8d
5 changed files with 305 additions and 0 deletions


@@ -1880,3 +1880,179 @@ cs_exception_handler_end(struct cs_builder *b,
cs_exception_handler_start(__b, __handler, __ctx); \
__ehandler != NULL; \
cs_exception_handler_end(__b, __handler), __ehandler = NULL)

struct cs_tracing_ctx {
   bool enabled;
   /* Register pair pointing to the context that holds the trace buffer
    * write pointer, and the offset of that pointer within the context. */
   struct cs_index ctx_reg;
   unsigned tracebuf_addr_offset;
   /* Scoreboard slot used for the load/store operations. */
   uint8_t ls_sb_slot;
};

static inline void
cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, unsigned trace_size)
{
   assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
          trace_size < INT16_MAX);
   assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);

   /* We always update the tracebuf position first, so we can easily detect
    * OOB access. Use cs_trace_field_offset() to get an offset taking this
    * pre-increment into account. */
   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

#define cs_trace_field_offset(__type, __field)                                 \
   (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                   \
             sizeof(struct cs_##__type##_trace))
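
/* Because cs_trace_preamble() advances the write pointer by the full record
 * size up front, these offsets are negative: e.g. for the 64-byte
 * cs_run_fragment_trace record below, cs_trace_field_offset(run_fragment, ip)
 * evaluates to -64 and cs_trace_field_offset(run_fragment, sr) to -56. */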

struct cs_run_fragment_trace {
   uint64_t ip;
   /* Registers r40-r46, as consumed by RUN_FRAGMENT. */
   uint32_t sr[7];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                      struct cs_index scratch_regs, bool enable_tem,
                      enum mali_tile_render_order tile_order, bool progress_inc)
{
   if (likely(!ctx->enabled)) {
      cs_run_fragment(b, enable_tem, tile_order, progress_inc);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_fragment_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_fragment(b, enable_tem, tile_order, progress_inc);

   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));
   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
            cs_trace_field_offset(run_fragment, sr));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

struct cs_run_idvs_trace {
   uint64_t ip;
   uint32_t draw_id;
   uint32_t pad;
   /* Registers r0-r60, as consumed by RUN_IDVS. */
   uint32_t sr[61];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, uint32_t flags_override,
                  bool progress_inc, bool malloc_enable,
                  struct cs_shader_res_sel varying_sel,
                  struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   if (likely(!ctx->enabled)) {
      cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
                  frag_sel, draw_id);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_idvs_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
               frag_sel, draw_id);

   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));

   if (draw_id.type != CS_INDEX_UNDEF)
      cs_store32(b, draw_id, tracebuf_addr,
                 cs_trace_field_offset(run_idvs, draw_id));

   /* Store the registers 16 at a time (the store mask is 16 bits wide). */
   for (unsigned i = 0; i < 48; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_idvs, sr[i]));
   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
            cs_trace_field_offset(run_idvs, sr[48]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

struct cs_run_compute_trace {
   uint64_t ip;
   /* Registers r0-r39, as consumed by RUN_COMPUTE{,_INDIRECT}. */
   uint32_t sr[40];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                     struct cs_index scratch_regs, unsigned task_increment,
                     enum mali_task_axis task_axis, bool progress_inc,
                     struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);

   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   /* Store the registers 16 at a time (the store mask is 16 bits wide). */
   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

static inline void
cs_trace_run_compute_indirect(struct cs_builder *b,
                              const struct cs_tracing_ctx *ctx,
                              struct cs_index scratch_regs,
                              unsigned wg_per_task, bool progress_inc,
                              struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);

   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}


@@ -134,6 +134,8 @@ void pandecode_interpret_cs_v10(struct pandecode_context *ctx, mali_ptr queue,
                                uint32_t size, unsigned gpu_id, uint32_t *regs);
void pandecode_cs_binary_v10(struct pandecode_context *ctx, mali_ptr bin,
                             uint32_t bin_size, unsigned gpu_id);
void pandecode_cs_trace_v10(struct pandecode_context *ctx, mali_ptr trace,
                            uint32_t trace_size, unsigned gpu_id);

/* Logging infrastructure */
static void


@@ -450,6 +450,23 @@ pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr bin_gpu_va,
   simple_mtx_unlock(&ctx->lock);
}

void
pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
                   uint32_t size, unsigned gpu_id)
{
   simple_mtx_lock(&ctx->lock);

   switch (pan_arch(gpu_id)) {
   case 10:
      pandecode_cs_trace_v10(ctx, trace_gpu_va, size, gpu_id);
      break;
   default:
      unreachable("Unsupported architecture");
   }

   simple_mtx_unlock(&ctx->lock);
}

void
pandecode_shader_disassemble(struct pandecode_context *ctx, mali_ptr shader_ptr,
                             unsigned gpu_id)


@@ -30,6 +30,9 @@
#include "decode.h"
#if PAN_ARCH >= 10
#include "genxml/cs_builder.h"
/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)
@@ -1498,6 +1501,14 @@ print_cs_binary(struct pandecode_context *ctx, mali_ptr bin,
         break;
      }

      case MALI_CS_OPCODE_RUN_IDVS:
      case MALI_CS_OPCODE_RUN_FRAGMENT:
      case MALI_CS_OPCODE_RUN_COMPUTE:
      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
         fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
                 bin + (i * sizeof(uint64_t)));
         break;

      default:
         break;
      }
@@ -1533,4 +1544,100 @@ GENX(pandecode_cs_binary)(struct pandecode_context *ctx, mali_ptr bin,
   pandecode_map_read_write(ctx);
}

void
GENX(pandecode_cs_trace)(struct pandecode_context *ctx, mali_ptr trace,
                         uint32_t trace_size, unsigned gpu_id)
{
   pandecode_dump_file_open(ctx);

   void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);

   while (trace_size > 0) {
      uint32_t regs[256] = {};
      uint64_t *ip = trace_data;
      uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));

      /* Mali-G610 has 96 registers. Other devices are not supported yet; we
       * can make this configurable later when we encounter new Malis.
       */
      struct queue_ctx qctx = {
         .nr_regs = 96,
         .regs = regs,
         .ip = instr,
         .end = instr + 1,
         .gpu_id = gpu_id,
      };

      pandecode_make_indent(ctx);
      print_cs_instr(ctx->dump_stream, *instr);
      fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);

      pan_unpack(instr, CS_BASE, base);

      switch (base.opcode) {
      case MALI_CS_OPCODE_RUN_IDVS: {
         struct cs_run_idvs_trace *idvs_trace = trace_data;

         assert(trace_size >= sizeof(*idvs_trace));
         pan_unpack(instr, CS_RUN_IDVS, I);
         memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));

         if (I.draw_id_register_enable)
            regs[I.draw_id] = idvs_trace->draw_id;

         pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = idvs_trace + 1;
         trace_size -= sizeof(*idvs_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_FRAGMENT: {
         struct cs_run_fragment_trace *frag_trace = trace_data;

         assert(trace_size >= sizeof(*frag_trace));
         pan_unpack(instr, CS_RUN_FRAGMENT, I);
         memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
         pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = frag_trace + 1;
         trace_size -= sizeof(*frag_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         pan_unpack(instr, CS_RUN_COMPUTE, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         pan_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      default:
         assert(!"Invalid trace packet");
         break;
      }

      pandecode_log(ctx, "\n");
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}

#endif
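
As a usage illustration (a hedged sketch, not part of this commit:
the capture path and variable names are placeholders, and it assumes
the existing pandecode_create_context()/pandecode_inject_mmap()/
pandecode_destroy_context() helpers behave as in the current tree), a
tool holding a captured trace buffer and the traced CS binary could
decode it like this:

   static void
   dump_cs_trace(uint64_t trace_gpu_va, void *trace_cpu, uint32_t trace_size,
                 uint64_t cs_gpu_va, void *cs_cpu, uint32_t cs_size,
                 unsigned gpu_id)
   {
      struct pandecode_context *pctx = pandecode_create_context(false);

      /* The decoder dereferences both the trace records and the traced CS
       * instructions, so both mappings need to be visible to it. */
      pandecode_inject_mmap(pctx, cs_gpu_va, cs_cpu, cs_size, NULL);
      pandecode_inject_mmap(pctx, trace_gpu_va, trace_cpu, trace_size, NULL);

      pandecode_cs_trace(pctx, trace_gpu_va, trace_size, gpu_id);

      pandecode_destroy_context(pctx);
   }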


@@ -68,6 +68,9 @@ void pandecode_interpret_cs(struct pandecode_context *ctx,
void pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr binary_gpu_va,
                         uint32_t size, unsigned gpu_id);

void pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
                        uint32_t size, unsigned gpu_id);

void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va,
                              unsigned gpu_id);