radeonsi: write shader asm annotated with wave info into GPU hang reports

Note that the disassembly is written twice - first the unmodified compiler
output and then the wave-annotated output only if there are waves executing
the shader.

Sample output from a real GPU hang most likely caused by image_sample:

The number of active waves = 28

Pixel Shader - annotated disassembly:
    s_mov_b64 s[6:7], exec                                        ; BE86017E [PC=0x10f3e3800, off=0, size=4]
    s_wqm_b64 exec, exec                                          ; BEFE077E [PC=0x10f3e3804, off=4, size=4]
...
    image_sample v[7:9], v[0:1], s[12:19], s[20:23] dmask:0x7     ; F0800700 00A30700 [PC=0x10f3e3a94, off=660, size=8]
    s_buffer_load_dword s20, s[0:3], 0x50                         ; C0220500 00000050 [PC=0x10f3e3a9c, off=668, size=8]
    s_load_dwordx4 s[24:27], s[4:5], 0x170                        ; C00A0602 00000170 [PC=0x10f3e3aa4, off=676, size=8]
    s_load_dwordx8 s[12:19], s[4:5], 0x140                        ; C00E0302 00000140 [PC=0x10f3e3aac, off=684, size=8]
    s_buffer_load_dword s11, s[0:3], 0x5c                         ; C02202C0 0000005C [PC=0x10f3e3ab4, off=692, size=8]
    s_buffer_load_dword s21, s[0:3], 0x54                         ; C0220540 00000054 [PC=0x10f3e3abc, off=700, size=8]
    s_buffer_load_dword s22, s[0:3], 0x58                         ; C0220580 00000058 [PC=0x10f3e3ac4, off=708, size=8]
    s_waitcnt vmcnt(0)                                            ; BF8C0F70 [PC=0x10f3e3acc, off=716, size=4]
          ^ SE0 SH0 CU1 SIMD1 WAVE0  EXEC=aaaaaaa555aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD2 WAVE0  EXEC=aaaa85555555552a  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD3 WAVE0  EXEC=000000000000000a  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD1 WAVE0  EXEC=25a5a5aa82aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD3 WAVE0  EXEC=50aaaa8fffa55555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE0  EXEC=5554aaaaaaa1a555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE1  EXEC=aaaa5555ffffffff  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD1 WAVE0  EXEC=555557aaaaaaaaa5  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD3 WAVE0  EXEC=5555aaaaaaaaaa85  INST32=BF8C0F70
          ^ SE1 SH0 CU3 SIMD1 WAVE0  EXEC=aaaaaaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD0 WAVE0  EXEC=aaaaaaaa5a5a5a5a  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD1 WAVE0  EXEC=aaaaaaa5a5a5a4a5  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD2 WAVE0  EXEC=5555555000000000  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD3 WAVE0  EXEC=aa555554155aaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD0 WAVE0  EXEC=55ffff55555555aa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD1 WAVE0  EXEC=555555555aaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD2 WAVE0  EXEC=a0aaaaaaa8555555  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD3 WAVE0  EXEC=8aaaaaaaaaaaa555  INST32=BF8C0F70
          ^ SE1 SH0 CU6 SIMD0 WAVE0  EXEC=000000002aaaaaaa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD0 WAVE0  EXEC=5aaaa5400aaaa15a  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD1 WAVE0  EXEC=00aaaaaaaa5555aa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD2 WAVE0  EXEC=aa00005555554555  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD3 WAVE0  EXEC=aaaaaaa000000000  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD0 WAVE0  EXEC=5555aaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD2 WAVE0  EXEC=ffaaaaaaaaaa5555  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD3 WAVE0  EXEC=aaaa55555555aa00  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD0 WAVE0  EXEC=00aaaaaaaaaaaa5a  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD1 WAVE0  EXEC=5a555555005555ff  INST32=BF8C0F70
    v_mul_f32_e32 v7, s6, v7                                      ; 0A0E0E06 [PC=0x10f3e3ad0, off=720, size=4]
...

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:
Marek Olšák 2016-11-29 15:35:11 +01:00
parent 3de8c5a3c5
commit 28c06b3ceb
3 changed files with 252 additions and 3 deletions

View file

@ -468,6 +468,254 @@ static void si_dump_descriptors(struct si_context *sctx,
num_elements[i], f);
}
struct si_shader_inst {
char text[160]; /* one disasm line */
unsigned offset; /* instruction offset */
unsigned size; /* instruction size = 4 or 8 */
};
/* Split a disassembly string into lines and add them to the array pointed
* to by "instructions". */
static void si_add_split_disasm(const char *disasm,
uint64_t start_addr,
unsigned *num,
struct si_shader_inst *instructions)
{
struct si_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
char *next;
while ((next = strchr(disasm, '\n'))) {
struct si_shader_inst *inst = &instructions[*num];
unsigned len = next - disasm;
assert(len < ARRAY_SIZE(inst->text));
memcpy(inst->text, disasm, len);
inst->text[len] = 0;
inst->offset = last_inst ? last_inst->offset + last_inst->size : 0;
const char *semicolon = strchr(disasm, ';');
assert(semicolon);
/* More than 16 chars after ";" means the instruction is 8 bytes long. */
inst->size = next - semicolon > 16 ? 8 : 4;
snprintf(inst->text + len, ARRAY_SIZE(inst->text) - len,
" [PC=0x%"PRIx64", off=%u, size=%u]",
start_addr + inst->offset, inst->offset, inst->size);
last_inst = inst;
(*num)++;
disasm = next + 1;
}
}
#define MAX_WAVES_PER_CHIP (64 * 40)
struct si_wave_info {
unsigned se; /* shader engine */
unsigned sh; /* shader array */
unsigned cu; /* compute unit */
unsigned simd;
unsigned wave;
uint32_t status;
uint64_t pc; /* program counter */
uint32_t inst_dw0;
uint32_t inst_dw1;
uint64_t exec;
bool matched; /* whether the wave is used by a currently-bound shader */
};
static int compare_wave(const void *p1, const void *p2)
{
struct si_wave_info *w1 = (struct si_wave_info *)p1;
struct si_wave_info *w2 = (struct si_wave_info *)p2;
/* Sort waves according to PC and then SE, SH, CU, etc. */
if (w1->pc < w2->pc)
return -1;
if (w1->pc > w2->pc)
return 1;
if (w1->se < w2->se)
return -1;
if (w1->se > w2->se)
return 1;
if (w1->sh < w2->sh)
return -1;
if (w1->sh > w2->sh)
return 1;
if (w1->cu < w2->cu)
return -1;
if (w1->cu > w2->cu)
return 1;
if (w1->simd < w2->simd)
return -1;
if (w1->simd > w2->simd)
return 1;
if (w1->wave < w2->wave)
return -1;
if (w1->wave > w2->wave)
return 1;
return 0;
}
/* Return wave information. "waves" should be a large enough array. */
static unsigned si_get_wave_info(struct si_wave_info waves[MAX_WAVES_PER_CHIP])
{
char line[2000];
unsigned num_waves = 0;
FILE *p = popen("umr -wa", "r");
if (!p)
return 0;
if (!fgets(line, sizeof(line), p) ||
strncmp(line, "SE", 2) != 0) {
pclose(p);
return 0;
}
while (fgets(line, sizeof(line), p)) {
struct si_wave_info *w;
uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
assert(num_waves < MAX_WAVES_PER_CHIP);
w = &waves[num_waves];
if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
&w->se, &w->sh, &w->cu, &w->simd, &w->wave,
&w->status, &pc_hi, &pc_lo, &w->inst_dw0,
&w->inst_dw1, &exec_hi, &exec_lo) == 12) {
w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
w->matched = false;
num_waves++;
}
}
qsort(waves, num_waves, sizeof(struct si_wave_info), compare_wave);
pclose(p);
return num_waves;
}
/* If the shader is being executed, print its asm instructions, and annotate
* those that are being executed right now with information about waves that
* execute them. This is most useful during a GPU hang.
*/
static void si_print_annotated_shader(struct si_shader *shader,
struct si_wave_info *waves,
unsigned num_waves,
FILE *f)
{
if (!shader || !shader->binary.disasm_string)
return;
uint64_t start_addr = shader->bo->gpu_address;
uint64_t end_addr = start_addr + shader->bo->b.b.width0;
unsigned i;
/* See if any wave executes the shader. */
for (i = 0; i < num_waves; i++) {
if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
break;
}
if (i == num_waves)
return; /* the shader is not being executed */
/* Remember the first found wave. The waves are sorted according to PC. */
waves = &waves[i];
num_waves -= i;
/* Get the list of instructions.
* Buffer size / 4 is the upper bound of the instruction count.
*/
unsigned num_inst = 0;
struct si_shader_inst *instructions =
calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
if (shader->prolog) {
si_add_split_disasm(shader->prolog->binary.disasm_string,
start_addr, &num_inst, instructions);
}
si_add_split_disasm(shader->binary.disasm_string,
start_addr, &num_inst, instructions);
if (shader->epilog) {
si_add_split_disasm(shader->epilog->binary.disasm_string,
start_addr, &num_inst, instructions);
}
fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
si_get_shader_name(shader, shader->selector->type));
/* Print instructions with annotations. */
for (i = 0; i < num_inst; i++) {
struct si_shader_inst *inst = &instructions[i];
fprintf(f, "%s\n", inst->text);
/* Print which waves execute the instruction right now. */
while (num_waves && start_addr + inst->offset == waves->pc) {
fprintf(f,
" " COLOR_GREEN "^ SE%u SH%u CU%u "
"SIMD%u WAVE%u EXEC=%016"PRIx64 " ",
waves->se, waves->sh, waves->cu, waves->simd,
waves->wave, waves->exec);
if (inst->size == 4) {
fprintf(f, "INST32=%08X" COLOR_RESET "\n",
waves->inst_dw0);
} else {
fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
waves->inst_dw0, waves->inst_dw1);
}
waves->matched = true;
waves = &waves[1];
num_waves--;
}
}
fprintf(f, "\n\n");
free(instructions);
}
static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
{
struct si_wave_info waves[MAX_WAVES_PER_CHIP];
unsigned num_waves = si_get_wave_info(waves);
fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
"\n\n", num_waves);
si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
/* Print waves executing shaders that are not currently bound. */
unsigned i;
bool found = false;
for (i = 0; i < num_waves; i++) {
if (waves[i].matched)
continue;
if (!found) {
fprintf(f, COLOR_CYAN
"Waves not executing currently-bound shaders:"
COLOR_RESET "\n");
found = true;
}
fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64
" INST=%08X %08X PC=%"PRIx64"\n",
waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
waves[i].wave, waves[i].exec, waves[i].inst_dw0,
waves[i].inst_dw1, waves[i].pc);
}
if (found)
fprintf(f, "\n\n");
}
static void si_dump_command(const char *title, const char *command, FILE *f)
{
char line[2000];
@ -502,7 +750,8 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
si_dump_shader(sctx->screen, &sctx->ps_shader, f);
if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
si_dump_command("Active waves", "umr -wa | column -t", f);
si_dump_annotated_shaders(sctx, f);
si_dump_command("Active waves (raw data)", "umr -wa | column -t", f);
si_dump_command("Wave information", "umr -O bits -wa", f);
}

View file

@ -5918,8 +5918,7 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
conf->spilled_vgprs, conf->private_mem_vgprs);
}
static const char *si_get_shader_name(struct si_shader *shader,
unsigned processor)
const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
{
switch (processor) {
case PIPE_SHADER_VERTEX:

View file

@ -576,5 +576,6 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
unsigned symbol_offset);
unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
bool writes_samplemask);
const char *si_get_shader_name(struct si_shader *shader, unsigned processor);
#endif