From 7d0dc3d30ca7cb9589f2590d827dac32f72fe835 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Sun, 24 Nov 2024 19:23:56 +0100
Subject: [PATCH] pan/decode: Add a helper to print CS binaries without
 interpreting them

In panvk, we want to switch from interpretation-based decoding to
event-tracing based decoding, so we no longer depend on the memory
state to get accurate job information. Even if we're not interested
in interpreting the CS, we still want to dump CS binaries so
developers can know what's passed to the GPU.

Signed-off-by: Boris Brezillon
Reviewed-by: Mary Guillemard
Reviewed-by: Lars-Ivar Hesselberg Simonsen
Part-of:
---
 src/panfrost/lib/genxml/decode.h        |   2 +
 src/panfrost/lib/genxml/decode_common.c |  19 +
 src/panfrost/lib/genxml/decode_csf.c    | 502 ++++++++++++++++++++++++
 src/panfrost/lib/wrap.h                 |   3 +
 4 files changed, 526 insertions(+)

diff --git a/src/panfrost/lib/genxml/decode.h b/src/panfrost/lib/genxml/decode.h
index 41bb9c4b8c0..9b2b78307e2 100644
--- a/src/panfrost/lib/genxml/decode.h
+++ b/src/panfrost/lib/genxml/decode.h
@@ -132,6 +132,8 @@ void pandecode_abort_on_fault_v9(struct pandecode_context *ctx,
 
 void pandecode_interpret_cs_v10(struct pandecode_context *ctx, mali_ptr queue,
                                 uint32_t size, unsigned gpu_id, uint32_t *regs);
+void pandecode_cs_binary_v10(struct pandecode_context *ctx, mali_ptr bin,
+                             uint32_t bin_size, unsigned gpu_id);
 
 /* Logging infrastructure */
 static void
diff --git a/src/panfrost/lib/genxml/decode_common.c b/src/panfrost/lib/genxml/decode_common.c
index ad22e6fc41f..1db94b8847d 100644
--- a/src/panfrost/lib/genxml/decode_common.c
+++ b/src/panfrost/lib/genxml/decode_common.c
@@ -433,6 +433,25 @@ pandecode_interpret_cs(struct pandecode_context *ctx, mali_ptr queue_gpu_va,
    simple_mtx_unlock(&ctx->lock);
 }
 
+/* Dump the CS binary at bin_gpu_va without interpreting it. Only v10 GPUs
+ * are handled, the context lock is held for the duration of the dump. */
+void
+pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr bin_gpu_va,
+                    uint32_t size, unsigned gpu_id)
+{
+   simple_mtx_lock(&ctx->lock);
+
+   switch (pan_arch(gpu_id)) {
+   case 10:
+      pandecode_cs_binary_v10(ctx, bin_gpu_va, size, gpu_id);
+      break;
+   default:
+      unreachable("Unsupported architecture");
+   }
+
+   simple_mtx_unlock(&ctx->lock);
+}
+
 void
 pandecode_shader_disassemble(struct pandecode_context *ctx, mali_ptr shader_ptr,
                              unsigned gpu_id)
diff --git a/src/panfrost/lib/genxml/decode_csf.c b/src/panfrost/lib/genxml/decode_csf.c
index 9d0723f9cc1..65994ab3cb2 100644
--- a/src/panfrost/lib/genxml/decode_csf.c
+++ b/src/panfrost/lib/genxml/decode_csf.c
@@ -21,6 +21,11 @@
  * SOFTWARE.
  */
 
+#include "util/bitset.h"
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/ralloc.h"
+
 #include "genxml/gen_macros.h"
 #include "decode.h"
 
@@ -1081,4 +1086,501 @@ GENX(pandecode_interpret_cs)(struct pandecode_context *ctx, mali_ptr queue,
    fflush(ctx->dump_stream);
    pandecode_map_read_write(ctx);
 }
+
+/* A run of consecutive instructions, [start, start + size), of a CS binary. */
+struct cs_code_block {
+   /* Link in the block stack used while walking predecessors. The node is
+    * self-linked (empty) when the block is not on a stack. */
+   struct list_head node;
+   unsigned start;
+   unsigned size;
+   /* Indices (unsigned) of the instructions leading to this block. */
+   struct util_dynarray predecessors;
+   /* Instruction indices of the successors, ~0 when unset. */
+   unsigned successors[2];
+};
+
+/* An address/length pair an indirect branch was resolved to. */
+struct cs_indirect_branch_target {
+   uint64_t address;
+   uint32_t length;
+};
+
+struct cs_indirect_branch {
+   /* Index of the JUMP/CALL instruction. */
+   unsigned instr_idx;
+   /* Set when at least one path leading to the branch can't be resolved. */
+   bool has_unknown_targets;
+   /* Array of struct cs_indirect_branch_target. */
+   struct util_dynarray targets;
+};
+
+struct cs_code_cfg {
+   uint64_t *instrs;
+   unsigned instr_count;
+   /* One entry per instruction, pointing to the block it belongs to. */
+   struct cs_code_block **blk_map;
+   /* Array of struct cs_indirect_branch, in instruction order. */
+   struct util_dynarray indirect_branches;
+};
+
+/* Allocate a block covering [start, start + size) out of alloc_ctx, with no
+ * predecessors and both successors unset (~0). */
+static struct cs_code_block *
+cs_code_block_alloc(void *alloc_ctx, unsigned start, unsigned size)
+{
+   struct cs_code_block *block = rzalloc(alloc_ctx, struct cs_code_block);
+
+   block->start = start;
+   block->size = size;
+   memset(block->successors, ~0, sizeof(block->successors));
+   list_inithead(&block->node);
+   util_dynarray_init(&block->predecessors, alloc_ctx);
+   return block;
+}
+
+/* Replay the register writes found between the point where all tracked
+ * registers got defined (blk_offs in cur_blk) and the indirect branch, going
+ * through the blocks queued in blk_stack, and record the resulting
+ * address/length pair as a target of ibranch. */
+static void
+record_indirect_branch_target(struct cs_code_cfg *cfg,
+                              struct list_head *blk_stack,
+                              struct cs_code_block *cur_blk, unsigned blk_offs,
+                              struct cs_indirect_branch *ibranch)
+{
+   /* A 64-bit register is a pair of consecutive 32-bit registers, so the
+    * 64-bit view is indexed with <register index> / 2. This assumes 64-bit
+    * operands use an even register index, and a little-endian host. */
+   union {
+      uint32_t u32[256];
+      uint64_t u64[128];
+   } reg_file = {0};
+
+   list_add(&cur_blk->node, blk_stack);
+   list_for_each_entry(struct cs_code_block, blk, blk_stack, node) {
+      for (; blk_offs < blk->size &&
+             blk->start + blk_offs != ibranch->instr_idx;
+           blk_offs++) {
+         uint64_t instr = cfg->instrs[blk->start + blk_offs];
+         pan_unpack(&instr, CS_BASE, base);
+         switch (base.opcode) {
+         case MALI_CS_OPCODE_MOVE: {
+            pan_unpack(&instr, CS_MOVE, I);
+            reg_file.u64[I.destination / 2] = I.immediate;
+            break;
+         }
+
+         case MALI_CS_OPCODE_MOVE32: {
+            pan_unpack(&instr, CS_MOVE32, I);
+            reg_file.u32[I.destination] = I.immediate;
+            break;
+         }
+
+         case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
+            pan_unpack(&instr, CS_ADD_IMMEDIATE32, I);
+            reg_file.u32[I.destination] = reg_file.u32[I.source] + I.immediate;
+            break;
+         }
+
+         case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
+            pan_unpack(&instr, CS_ADD_IMMEDIATE64, I);
+            reg_file.u64[I.destination / 2] =
+               reg_file.u64[I.source / 2] + I.immediate;
+            break;
+         }
+
+         case MALI_CS_OPCODE_UMIN32: {
+            pan_unpack(&instr, CS_UMIN32, I);
+            reg_file.u32[I.destination] =
+               MIN2(reg_file.u32[I.source_1], reg_file.u32[I.source_2]);
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+      blk_offs = 0;
+   }
+   list_delinit(&cur_blk->node);
+
+   uint64_t instr = cfg->instrs[ibranch->instr_idx];
+   pan_unpack(&instr, CS_JUMP, I);
+
+   struct cs_indirect_branch_target target = {
+      .address = reg_file.u64[I.address / 2],
+      .length = reg_file.u32[I.length],
+   };
+
+   util_dynarray_append(&ibranch->targets, struct cs_indirect_branch_target,
+                        target);
+}
+
+/* Walk the instructions backward, starting at instr_ptr in cur_blk, keeping
+ * track_map in sync with the registers the indirect branch depends on. If
+ * all of them end up being defined by instructions we can replay, the target
+ * is recorded. Otherwise the walk continues in the predecessors of cur_blk,
+ * and paths that can't be resolved flag the branch as having unknown
+ * targets. */
+static void
+collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
+                                        struct list_head *blk_stack,
+                                        BITSET_WORD *track_map,
+                                        struct cs_code_block *cur_blk,
+                                        int instr_ptr,
+                                        struct cs_indirect_branch *ibranch)
+{
+   for (; instr_ptr >= (int)cur_blk->start; instr_ptr--) {
+      assert(instr_ptr >= 0);
+      uint64_t instr = cfg->instrs[instr_ptr];
+      pan_unpack(&instr, CS_BASE, base);
+      switch (base.opcode) {
+      case MALI_CS_OPCODE_MOVE: {
+         pan_unpack(&instr, CS_MOVE, I);
+         BITSET_CLEAR(track_map, I.destination);
+         BITSET_CLEAR(track_map, I.destination + 1);
+         break;
+      }
+
+      case MALI_CS_OPCODE_MOVE32: {
+         pan_unpack(&instr, CS_MOVE32, I);
+         BITSET_CLEAR(track_map, I.destination);
+         break;
+      }
+
+      /* For read-modify-write instructions, stop tracking the destination
+       * before tracking the sources, so the register stays tracked when it
+       * is both a source and the destination. */
+      case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
+         pan_unpack(&instr, CS_ADD_IMMEDIATE32, I);
+         if (BITSET_TEST(track_map, I.destination)) {
+            BITSET_CLEAR(track_map, I.destination);
+            BITSET_SET(track_map, I.source);
+         }
+         break;
+      }
+
+      case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
+         pan_unpack(&instr, CS_ADD_IMMEDIATE64, I);
+         bool track_lo = BITSET_TEST(track_map, I.destination);
+         bool track_hi = BITSET_TEST(track_map, I.destination + 1);
+
+         BITSET_CLEAR(track_map, I.destination);
+         BITSET_CLEAR(track_map, I.destination + 1);
+         if (track_lo)
+            BITSET_SET(track_map, I.source);
+         if (track_hi)
+            BITSET_SET(track_map, I.source + 1);
+         break;
+      }
+
+      case MALI_CS_OPCODE_UMIN32: {
+         pan_unpack(&instr, CS_UMIN32, I);
+         if (BITSET_TEST(track_map, I.destination)) {
+            BITSET_CLEAR(track_map, I.destination);
+            BITSET_SET(track_map, I.source_1);
+            BITSET_SET(track_map, I.source_2);
+         }
+         break;
+      }
+
+      case MALI_CS_OPCODE_LOAD_MULTIPLE: {
+         pan_unpack(&instr, CS_LOAD_MULTIPLE, I);
+         for (unsigned i = 0; i < 16; i++) {
+            if ((I.mask & BITFIELD_BIT(i)) &&
+                BITSET_TEST(track_map, I.base_register + i)) {
+               ibranch->has_unknown_targets = true;
+               return;
+            }
+         }
+         break;
+      }
+
+      case MALI_CS_OPCODE_PROGRESS_LOAD: {
+         pan_unpack(&instr, CS_PROGRESS_LOAD, I);
+         if (BITSET_TEST(track_map, I.destination) ||
+             BITSET_TEST(track_map, I.destination + 1)) {
+            ibranch->has_unknown_targets = true;
+            return;
+         }
+         break;
+      }
+
+      default:
+         break;
+      }
+
+      if (__bitset_is_empty(track_map, BITSET_WORDS(256))) {
+         record_indirect_branch_target(cfg, blk_stack, cur_blk,
+                                       instr_ptr - cur_blk->start, ibranch);
+         return;
+      }
+   }
+
+   assert(!__bitset_is_empty(track_map, BITSET_WORDS(256)));
+
+   if (util_dynarray_num_elements(&cur_blk->predecessors, unsigned) == 0) {
+      ibranch->has_unknown_targets = true;
+      return;
+   }
+
+   list_add(&cur_blk->node, blk_stack);
+   util_dynarray_foreach(&cur_blk->predecessors, unsigned, pred) {
+      struct cs_code_block *prev_blk = cfg->blk_map[*pred];
+
+      /* If the predecessor is already in the block stack, we skip it
+       * and consider this path leading to an unknown target. */
+      if (!list_is_empty(&prev_blk->node)) {
+         ibranch->has_unknown_targets = true;
+         continue;
+      }
+
+      /* Each predecessor is an independent path: give it its own copy of
+       * the tracked registers so what one path resolves doesn't leak into
+       * the next one. */
+      BITSET_DECLARE(pred_track_map, 256);
+      memcpy(pred_track_map, track_map, sizeof(pred_track_map));
+
+      collect_indirect_branch_targets_recurse(
+         cfg, blk_stack, pred_track_map, prev_blk,
+         prev_blk->start + prev_blk->size - 1, ibranch);
+   }
+   list_delinit(&cur_blk->node);
+}
+
+/* Try to resolve the targets of an indirect branch (JUMP/CALL) by tracking
+ * the registers holding the target address and length. */
+static void
+collect_indirect_branch_targets(struct cs_code_cfg *cfg,
+                                struct cs_indirect_branch *ibranch)
+{
+   uint64_t instr = cfg->instrs[ibranch->instr_idx];
+   struct cs_code_block *cur_blk = cfg->blk_map[ibranch->instr_idx];
+   struct list_head blk_stack;
+   BITSET_DECLARE(track_map, 256) = {0};
+
+   list_inithead(&blk_stack);
+
+   pan_unpack(&instr, CS_JUMP, I);
+   BITSET_SET(track_map, I.address);
+   BITSET_SET(track_map, I.address + 1);
+   BITSET_SET(track_map, I.length);
+
+   collect_indirect_branch_targets_recurse(cfg, &blk_stack, track_map, cur_blk,
+                                           ibranch->instr_idx - 1, ibranch);
+}
+
+/* Return the CFG of the CS binary at address bin, building it (and the CFGs
+ * of the binaries its indirect branches resolve to) if it's not in the
+ * symbols table yet. CFGs are allocated out of symbols. */
+static struct cs_code_cfg *
+get_cs_cfg(struct pandecode_context *ctx, struct hash_table_u64 *symbols,
+           mali_ptr bin, uint32_t bin_size)
+{
+   uint32_t instr_count = bin_size / sizeof(uint64_t);
+   struct cs_code_cfg *cfg = _mesa_hash_table_u64_search(symbols, bin);
+
+   if (cfg) {
+      assert(cfg->instr_count == instr_count);
+      return cfg;
+   }
+
+   uint64_t *instrs = pandecode_fetch_gpu_mem(ctx, bin, bin_size);
+
+   cfg = rzalloc(symbols, struct cs_code_cfg);
+   _mesa_hash_table_u64_insert(symbols, bin, cfg);
+
+   util_dynarray_init(&cfg->indirect_branches, cfg);
+
+   cfg->blk_map =
+      rzalloc_array(cfg, struct cs_code_block *, instr_count);
+   cfg->instrs = instrs;
+   cfg->instr_count = instr_count;
+
+   struct cs_code_block *block = cs_code_block_alloc(cfg, 0, 0);
+
+   for (unsigned i = 0; i < instr_count; i++) {
+      uint64_t instr = instrs[i];
+
+      if (!cfg->blk_map[i]) {
+         cfg->blk_map[i] = block;
+         block->size++;
+      } else {
+         if (block->successors[0] == ~0)
+            block->successors[0] = i;
+
+         block = cfg->blk_map[i];
+         util_dynarray_append(&block->predecessors, unsigned, i - 1);
+      }
+
+      pan_unpack(&instr, CS_BASE, base);
+
+      if (base.opcode == MALI_CS_OPCODE_JUMP ||
+          base.opcode == MALI_CS_OPCODE_CALL) {
+         struct cs_indirect_branch ibranch = {
+            .instr_idx = i,
+         };
+
+         /* Tie the targets array to the CFG so it's released with it. */
+         util_dynarray_init(&ibranch.targets, cfg);
+         util_dynarray_append(&cfg->indirect_branches,
+                              struct cs_indirect_branch, ibranch);
+      }
+
+      if (base.opcode != MALI_CS_OPCODE_BRANCH)
+         continue;
+
+      pan_unpack(&instr, CS_BRANCH, I);
+
+      unsigned target = MIN2(i + 1 + I.offset, instr_count);
+
+      /* If the target of the branch is the next instruction, it's just a NOP,
+       * and we consider it the same block. */
+      if (target == i + 1)
+         continue;
+
+      /* Backward branch landing in the middle of an existing block: split
+       * the block at the target. A target clamped to instr_count (branch
+       * pointing before the start of the binary) has no block to split. */
+      if (I.offset < 0 && target < instr_count &&
+          cfg->blk_map[target]->start != target) {
+         struct cs_code_block *old = cfg->blk_map[target];
+         struct cs_code_block *new =
+            cs_code_block_alloc(cfg, target, old->start + old->size - target);
+
+         util_dynarray_append(&new->predecessors, unsigned, target - 1);
+         memcpy(&new->successors, &old->successors, sizeof(new->successors));
+
+         old->successors[0] = target;
+         old->successors[1] = ~0;
+         old->size = new->start - old->start;
+
+         for (unsigned j = 0; j < new->size; j++)
+            cfg->blk_map[new->start + j] = new;
+      }
+
+      if (I.offset > 0 && target < instr_count && !cfg->blk_map[target]) {
+         struct cs_code_block *new = cs_code_block_alloc(cfg, target, 1);
+
+         cfg->blk_map[target] = new;
+         util_dynarray_append(&new->predecessors, unsigned, i);
+      }
+
+      block->successors[0] = target;
+      if (I.condition != MALI_CS_CONDITION_ALWAYS)
+         block->successors[1] = i + 1;
+
+      block = cs_code_block_alloc(cfg, i + 1, 0);
+
+      if (target == i + 1 || I.condition != MALI_CS_CONDITION_ALWAYS)
+         util_dynarray_append(&block->predecessors, unsigned, i);
+   }
+
+   util_dynarray_foreach(&cfg->indirect_branches, struct cs_indirect_branch,
+                         ibranch)
+   {
+      collect_indirect_branch_targets(cfg, ibranch);
+      util_dynarray_foreach(&ibranch->targets,
+                            struct cs_indirect_branch_target, target) {
+         get_cs_cfg(ctx, symbols, target->address, target->length);
+      }
+   }
+
+   return cfg;
+}
+
+/* Print the disassembly of the CS binary described by cfg, prefixed with
+ * name, emitting a label at each block boundary and annotating branches
+ * with their targets. */
+static void
+print_cs_binary(struct pandecode_context *ctx, mali_ptr bin,
+                struct cs_code_cfg *cfg, const char *name)
+{
+   pandecode_log(ctx, "%s@%" PRIx64 "{\n", name, bin);
+   unsigned ibranch_idx = 0;
+
+   ctx->indent++;
+   for (unsigned i = 0; i < cfg->instr_count; i++) {
+      if (i && cfg->blk_map[i - 1] != cfg->blk_map[i]) {
+         ctx->indent--;
+         pandecode_log(ctx, "label_%" PRIx64 ":\n", bin + i * sizeof(uint64_t));
+         ctx->indent++;
+      }
+
+      pandecode_make_indent(ctx);
+      print_cs_instr(ctx->dump_stream, cfg->instrs[i]);
+      pan_unpack(&cfg->instrs[i], CS_BASE, base);
+      switch (base.opcode) {
+      case MALI_CS_OPCODE_JUMP:
+      case MALI_CS_OPCODE_CALL: {
+         struct cs_indirect_branch *ibranch = util_dynarray_element(
+            &cfg->indirect_branches, struct cs_indirect_branch, ibranch_idx);
+
+         assert(ibranch->instr_idx == i);
+         fprintf(ctx->dump_stream, " // ");
+         util_dynarray_foreach(&ibranch->targets,
+                               struct cs_indirect_branch_target, target) {
+            fprintf(ctx->dump_stream, "%scs@%" PRIx64,
+                    target == ibranch->targets.data ? "" : ",",
+                    target->address);
+         }
+         if (ibranch->has_unknown_targets)
+            fprintf(ctx->dump_stream, "%s??",
+                    ibranch->targets.size ? "," : "");
+         ibranch_idx++;
+         break;
+      }
+
+      case MALI_CS_OPCODE_BRANCH: {
+         pan_unpack(&cfg->instrs[i], CS_BRANCH, I);
+         fprintf(ctx->dump_stream, " // ");
+
+         unsigned target = i + 1 + I.offset;
+
+         if (target < cfg->instr_count)
+            fprintf(ctx->dump_stream, "label_%" PRIx64,
+                    bin + (target * sizeof(uint64_t)));
+         else
+            fprintf(ctx->dump_stream, "end_of_cs");
+         break;
+      }
+
+      default:
+         break;
+      }
+
+      fprintf(ctx->dump_stream, "\n");
+   }
+   ctx->indent--;
+   pandecode_log(ctx, "} // %s@%" PRIx64 "\n\n", name, bin);
+}
+
+/* Dump the CS binary at address bin, followed by every CS binary its
+ * indirect branches could be resolved to, without interpreting them. */
+void
+GENX(pandecode_cs_binary)(struct pandecode_context *ctx, mali_ptr bin,
+                          uint32_t bin_size, unsigned gpu_id)
+{
+   if (!bin_size)
+      return;
+
+   pandecode_dump_file_open(ctx);
+
+   struct hash_table_u64 *symbols = _mesa_hash_table_u64_create(NULL);
+   struct cs_code_cfg *main_cfg = get_cs_cfg(ctx, symbols, bin, bin_size);
+
+   print_cs_binary(ctx, bin, main_cfg, "main_cs");
+   hash_table_u64_foreach(symbols, he) {
+      struct cs_code_cfg *other_cfg = he.data;
+      if (other_cfg == main_cfg)
+         continue;
+
+      print_cs_binary(ctx, he.key, other_cfg, "cs");
+   }
+
+   ralloc_free(symbols);
+
+   pandecode_map_read_write(ctx);
+}
 #endif
diff --git a/src/panfrost/lib/wrap.h b/src/panfrost/lib/wrap.h
index 5d7d89adfd9..753be2da28a 100644
--- a/src/panfrost/lib/wrap.h
+++ b/src/panfrost/lib/wrap.h
@@ -65,6 +65,9 @@ void pandecode_interpret_cs(struct pandecode_context *ctx,
                             mali_ptr queue_gpu_va, uint32_t size,
                             unsigned gpu_id, uint32_t *regs);
 
+void pandecode_cs_binary(struct pandecode_context *ctx, mali_ptr binary_gpu_va,
+                         uint32_t size, unsigned gpu_id);
+
 void pandecode_abort_on_fault(struct pandecode_context *ctx, uint64_t jc_gpu_va,
                               unsigned gpu_id);
 