pan/cs: Add an event-based tracing mechanism
Interpreting the command buffer only really works if everything is static, but panvk has started to make extensive use of loops and conditionals that depend on memory values updated by the command stream itself. This makes it impossible to walk back to the original state in order to replay the CS actions.

Move away from this approach in favor of an event-based tracing mechanism that records particular CS commands and their context at execution time. Of course, that means the auxiliary descriptors shouldn't be recycled until the traces are decoded, but that's more tractable: we just need to turn the descriptor ring buffers into linear buffers with a guard page, and crash on OOB access with a message suggesting that the user tweak the maximum trace buffer sizes.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32284>
This commit is contained in:
parent 4e5f75d1d7
commit bf05842a8d
5 changed files with 305 additions and 0 deletions
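A minimal usage sketch (not part of the patch): how a driver could hook up the tracing helpers added below. The register indices, the write-pointer offset and the scoreboard slot are hypothetical values picked for illustration; b is assumed to be an already-initialized cs_builder, and enable_tem, tile_order and progress_inc stand for whatever the caller would have passed to cs_run_fragment().

   /* Illustration only: all concrete values below are made up. */
   struct cs_tracing_ctx tracing = {
      .enabled = true,
      .ctx_reg = cs_reg64(b, 90),   /* register pair holding the tracing context address */
      .tracebuf_addr_offset = 0,    /* offset of the trace-buffer write pointer in that context */
      .ls_sb_slot = 0,              /* scoreboard slot used for the LOAD/STORE waits */
   };

   /* The helpers need at least four scratch registers, starting on an even
    * register index. */
   struct cs_index scratch = cs_reg_tuple(b, 84, 4);

   /* Drop-in replacement for cs_run_fragment(): emits the run as usual, then
    * appends a cs_run_fragment_trace record when tracing is enabled. */
   cs_trace_run_fragment(b, &tracing, scratch, enable_tem, tile_order,
                         progress_inc);

When tracing is disabled, each cs_trace_run_*() helper just emits the plain cs_run_*() call, so the wrappers can be used unconditionally.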
@@ -1880,3 +1880,179 @@ cs_exception_handler_end(struct cs_builder *b,
      cs_exception_handler_start(__b, __handler, __ctx);                     \
      __ehandler != NULL;                                                    \
      cs_exception_handler_end(__b, __handler), __ehandler = NULL)

struct cs_tracing_ctx {
   bool enabled;
   struct cs_index ctx_reg;
   unsigned tracebuf_addr_offset;
   uint8_t ls_sb_slot;
};

static inline void
cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, unsigned trace_size)
{
   assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
          trace_size < INT16_MAX);
   assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);

   /* We always update the tracebuf position first, so we can easily detect OOB
    * access. Use cs_trace_field_offset() to get an offset taking this
    * pre-increment into account. */
   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

#define cs_trace_field_offset(__type, __field)                                 \
   (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                   \
             sizeof(struct cs_##__type##_trace))
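A quick worked example of the pre-increment trick (not part of the patch): with the 64-byte cs_run_fragment_trace defined just below, cs_trace_field_offset(run_fragment, ip) evaluates to 0 - 64 = -64 and cs_trace_field_offset(run_fragment, sr) to 8 - 64 = -56. Since cs_trace_preamble() has already advanced the write pointer by the record size, these negative offsets make every subsequent store land inside the slot that was just reserved.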

struct cs_run_fragment_trace {
   uint64_t ip;
   uint32_t sr[7];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                      struct cs_index scratch_regs, bool enable_tem,
                      enum mali_tile_render_order tile_order, bool progress_inc)
{
   if (likely(!ctx->enabled)) {
      cs_run_fragment(b, enable_tem, tile_order, progress_inc);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_fragment_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_fragment(b, enable_tem, tile_order, progress_inc);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));

   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
            cs_trace_field_offset(run_fragment, sr));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

struct cs_run_idvs_trace {
   uint64_t ip;
   uint32_t draw_id;
   uint32_t pad;
   uint32_t sr[61];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, uint32_t flags_override,
                  bool progress_inc, bool malloc_enable,
                  struct cs_shader_res_sel varying_sel,
                  struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   if (likely(!ctx->enabled)) {
      cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
                  frag_sel, draw_id);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_idvs_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
               frag_sel, draw_id);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));

   if (draw_id.type != CS_INDEX_UNDEF)
      cs_store32(b, draw_id, tracebuf_addr,
                 cs_trace_field_offset(run_idvs, draw_id));

   for (unsigned i = 0; i < 48; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_idvs, sr[i]));
   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
            cs_trace_field_offset(run_idvs, sr[48]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}
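A note on the chunked stores above (editorial, not part of the patch): the sr[] array mirrors the shader registers consumed by the run command (61 of them for IDVS), and the mask argument of cs_store() appears to be limited to 16 bits, so the registers are dumped 16 at a time with a final partial store for the remainder.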

struct cs_run_compute_trace {
   uint64_t ip;
   uint32_t sr[40];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                     struct cs_index scratch_regs, unsigned task_increment,
                     enum mali_task_axis task_axis, bool progress_inc,
                     struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

static inline void
cs_trace_run_compute_indirect(struct cs_builder *b,
                              const struct cs_tracing_ctx *ctx,
                              struct cs_index scratch_regs,
                              unsigned wg_per_task, bool progress_inc,
                              struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

@@ -134,6 +134,8 @@ void pandecode_interpret_cs_v10(struct pandecode_context *ctx, mali_ptr queue,
                                uint32_t size, unsigned gpu_id, uint32_t *regs);
void pandecode_cs_binary_v10(struct pandecode_context *ctx, mali_ptr bin,
                             uint32_t bin_size, unsigned gpu_id);
void pandecode_cs_trace_v10(struct pandecode_context *ctx, mali_ptr trace,
                            uint32_t trace_size, unsigned gpu_id);

/* Logging infrastructure */
static void

@@ -450,6 +450,23 @@ pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr bin_gpu_va,
   simple_mtx_unlock(&ctx->lock);
}

void
pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
                   uint32_t size, unsigned gpu_id)
{
   simple_mtx_lock(&ctx->lock);

   switch (pan_arch(gpu_id)) {
   case 10:
      pandecode_cs_trace_v10(ctx, trace_gpu_va, size, gpu_id);
      break;
   default:
      unreachable("Unsupported architecture");
   }

   simple_mtx_unlock(&ctx->lock);
}

void
pandecode_shader_disassemble(struct pandecode_context *ctx, mali_ptr shader_ptr,
                             unsigned gpu_id)
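As a rough usage sketch (not part of the patch), a tool holding a CPU mapping of the captured trace buffer could feed it to this new entry point as below. This assumes the existing pandecode_create_context()/pandecode_inject_mmap() helpers, and the trace_* variables are hypothetical.

   struct pandecode_context *dec = pandecode_create_context(false /* to_stderr */);

   /* Make the CPU mapping of the trace buffer visible to the decoder (the CS
    * binaries referenced by the trace records must be mapped too). */
   pandecode_inject_mmap(dec, trace_gpu_va, trace_cpu_ptr, trace_buf_size, NULL);

   /* Only decode what was actually written, i.e. the distance between the
    * buffer start and the current write pointer. */
   pandecode_cs_trace(dec, trace_gpu_va, trace_written_bytes, gpu_id);

   pandecode_destroy_context(dec);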

@@ -30,6 +30,9 @@
#include "decode.h"

#if PAN_ARCH >= 10

#include "genxml/cs_builder.h"

/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

@@ -1498,6 +1501,14 @@ print_cs_binary(struct pandecode_context *ctx, mali_ptr bin,
         break;
      }

      case MALI_CS_OPCODE_RUN_IDVS:
      case MALI_CS_OPCODE_RUN_FRAGMENT:
      case MALI_CS_OPCODE_RUN_COMPUTE:
      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
         fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
                 bin + (i * sizeof(uint64_t)));
         break;

      default:
         break;
      }
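This annotation pairs with the trace decoder added below: every trace record starts with the GPU address of the RUN_* instruction that produced it, so the "// tracepoint_<address>" comments in the binary dump can be matched against the "// from tracepoint_<address>" lines printed for each decoded record.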

@@ -1533,4 +1544,100 @@ GENX(pandecode_cs_binary)(struct pandecode_context *ctx, mali_ptr bin,
   pandecode_map_read_write(ctx);
}

void
GENX(pandecode_cs_trace)(struct pandecode_context *ctx, mali_ptr trace,
                         uint32_t trace_size, unsigned gpu_id)
{
   pandecode_dump_file_open(ctx);

   void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);

   while (trace_size > 0) {
      uint32_t regs[256] = {};
      uint64_t *ip = trace_data;

      uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));

      /* Mali-G610 has 96 registers. Other devices not yet supported, we can
       * make this configurable later when we encounter new Malis.
       */
      struct queue_ctx qctx = {
         .nr_regs = 96,
         .regs = regs,
         .ip = instr,
         .end = instr + 1,
         .gpu_id = gpu_id,
      };

      pandecode_make_indent(ctx);
      print_cs_instr(ctx->dump_stream, *instr);
      fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);

      pan_unpack(instr, CS_BASE, base);

      switch (base.opcode) {
      case MALI_CS_OPCODE_RUN_IDVS: {
         struct cs_run_idvs_trace *idvs_trace = trace_data;

         assert(trace_size >= sizeof(*idvs_trace));
         pan_unpack(instr, CS_RUN_IDVS, I);
         memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));

         if (I.draw_id_register_enable)
            regs[I.draw_id] = idvs_trace->draw_id;

         pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = idvs_trace + 1;
         trace_size -= sizeof(*idvs_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_FRAGMENT: {
         struct cs_run_fragment_trace *frag_trace = trace_data;

         assert(trace_size >= sizeof(*frag_trace));
         pan_unpack(instr, CS_RUN_FRAGMENT, I);
         memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
         pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = frag_trace + 1;
         trace_size -= sizeof(*frag_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         pan_unpack(instr, CS_RUN_COMPUTE, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         pan_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      default:
         assert(!"Invalid trace packet");
         break;
      }

      pandecode_log(ctx, "\n");
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif

@@ -68,6 +68,9 @@ void pandecode_interpret_cs(struct pandecode_context *ctx,
void pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr binary_gpu_va,
                         uint32_t size, unsigned gpu_id);

void pandecode_cs_trace(struct pandecode_context *ctx, mali_ptr trace_gpu_va,
                        uint32_t size, unsigned gpu_id);

void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va,
                              unsigned gpu_id);