ac/debug,radv: Read UMR wave dumps into memory before parsing

Allows RADV to reuse the wave dump, which leads to more consistency
between pipeline.log and umr_waves.log.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28838>
This commit is contained in:
Konstantin 2024-04-20 11:09:41 +02:00 committed by Marge Bot
parent 9a43987780
commit 575565af58
4 changed files with 79 additions and 42 deletions

View file

@ -199,6 +199,41 @@ bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_time
#endif
}
/**
 * Run umr to halt and dump the waves of a ring, capturing the tool's output
 * (stdout and stderr, merged by the "2>&1" in the command) into memory.
 *
 * \param info  Device info. The PCI address selects the GPU and gfx_level
 *              selects the umr ring name ("gfx_0.0.0" for GFX10+, "gfx" before).
 * \param ring  Ring to dump. Only AMD_IP_GFX is handled.
 * \return A NUL-terminated buffer allocated by open_memstream() that the
 *         caller must release with free(), or NULL if the ring is not
 *         supported, the in-memory stream cannot be created, or the build
 *         targets Windows. If the command cannot be started the buffer is an
 *         empty string.
 */
char *
ac_get_umr_waves(const struct radeon_info *info, enum amd_ip_type ring)
{
   /* TODO: Dump compute ring. */
   if (ring != AMD_IP_GFX)
      return NULL;

#ifndef _WIN32
   char *data = NULL;
   size_t size = 0;
   FILE *f = open_memstream(&data, &size);
   if (!f)
      return NULL;

   /* Bound the formatting to the buffer; a truncated command line must never
    * be handed to the shell.
    */
   char cmd[256];
   int cmd_len = snprintf(cmd, sizeof(cmd),
                          "umr --by-pci %04x:%02x:%02x.%01x -O bits,halt_waves -go 0 -wa %s -go 1 2>&1",
                          info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func,
                          info->gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");

   FILE *p = NULL;
   if (cmd_len > 0 && (size_t)cmd_len < sizeof(cmd))
      p = popen(cmd, "r");

   if (p) {
      char line[2048];

      /* Copy the tool's output verbatim into the memory stream. */
      while (fgets(line, sizeof(line), p))
         fputs(line, f);

      /* Always end the dump with a newline, even if the output did not. */
      fprintf(f, "\n");
      pclose(p);
   }

   /* Closing the stream finalizes "data" (and "size") for the caller. */
   fclose(f);

   return data;
#else
   return NULL;
#endif
}
static int compare_wave(const void *p1, const void *p2)
{
struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
@ -236,9 +271,9 @@ static int compare_wave(const void *p1, const void *p2)
#define AC_UMR_REGISTERS_LINE "Main Registers"
static bool
ac_read_umr_register(char **_scan, const char *name, uint32_t *value)
ac_read_umr_register(const char **_scan, const char *name, uint32_t *value)
{
char *scan = *_scan;
const char *scan = *_scan;
if (strncmp(scan, name, MIN2(strlen(scan), strlen(name))))
return false;
@ -252,37 +287,44 @@ ac_read_umr_register(char **_scan, const char *name, uint32_t *value)
/* Return wave information. "waves" should be a large enough array. */
unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
const char *wave_dump,
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
{
#ifdef _WIN32
return 0;
#else
char line[2000], cmd[256];
char *dump = NULL;
if (!wave_dump) {
dump = ac_get_umr_waves(info, AMD_IP_GFX);
wave_dump = dump;
}
unsigned num_waves = 0;
sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O halt_waves -wa %s",
info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func,
gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
while (true) {
const char *end = strchr(wave_dump, '\n');
if (!end)
break;
FILE *p = popen(cmd, "r");
if (!p)
return 0;
while (fgets(line, sizeof(line), p)) {
if (strncmp(line, AC_UMR_REGISTERS_LINE, strlen(AC_UMR_REGISTERS_LINE)))
if (strncmp(wave_dump, AC_UMR_REGISTERS_LINE, strlen(AC_UMR_REGISTERS_LINE))) {
wave_dump = end + 1;
continue;
}
assert(num_waves < AC_MAX_WAVES_PER_CHIP);
struct ac_wave_info *w = &waves[num_waves];
memset(w, 0, sizeof(struct ac_wave_info));
num_waves++;
while (fgets(line, sizeof(line), p)) {
if (strlen(line) < 2)
while (true) {
const char *end2 = strchr(wave_dump, '\n');
if (!end2)
break;
if (end2 - wave_dump < 2)
break;
char *scan = line;
while (scan < line + strlen(line)) {
const char *scan = wave_dump;
while (scan < end2) {
if (strncmp(scan, "ix", MIN2(strlen(scan), strlen("ix")))) {
scan++;
continue;
@ -323,7 +365,7 @@ unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info
/* Skip registers we do not handle. */
if (!progress) {
while (scan < line + strlen(line)) {
while (scan < end2) {
if (*scan == '|') {
progress = true;
break;
@ -335,12 +377,15 @@ unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info
if (!progress)
break;
}
wave_dump = end2 + 1;
}
}
qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
pclose(p);
free(dump);
return num_waves;
#endif
}

View file

@ -69,7 +69,9 @@ bool ac_register_exists(enum amd_gfx_level gfx_level, enum radeon_family family,
unsigned offset);
bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_timestamp,
uint64_t *out_addr);
char *ac_get_umr_waves(const struct radeon_info *info, enum amd_ip_type ring);
unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
const char *wave_dump,
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
uint32_t status);

View file

@ -457,11 +457,10 @@ radv_get_saved_pipeline(struct radv_device *device, enum amd_ip_type ring)
}
static void
radv_dump_queue_state(struct radv_queue *queue, const char *dump_dir, FILE *f)
radv_dump_queue_state(struct radv_queue *queue, const char *dump_dir, const char *wave_dump, FILE *f)
{
struct radv_device *device = radv_queue_device(queue);
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_instance *instance = radv_physical_device_instance(pdev);
enum amd_ip_type ring = radv_queue_ring(queue);
struct radv_pipeline *pipeline;
@ -500,10 +499,10 @@ radv_dump_queue_state(struct radv_queue *queue, const char *dump_dir, FILE *f)
MESA_SHADER_COMPUTE, dump_dir, f);
}
if (!(instance->debug_flags & RADV_DEBUG_NO_UMR)) {
if (wave_dump) {
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
enum amd_gfx_level gfx_level = pdev->info.gfx_level;
unsigned num_waves = ac_get_wave_info(gfx_level, &pdev->info, waves);
unsigned num_waves = ac_get_wave_info(gfx_level, &pdev->info, wave_dump, waves);
fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves);
@ -681,24 +680,9 @@ radv_dump_umr_ring(const struct radv_queue *queue, FILE *f)
}
static void
radv_dump_umr_waves(struct radv_queue *queue, FILE *f)
radv_dump_umr_waves(struct radv_queue *queue, const char *wave_dump, FILE *f)
{
#ifndef _WIN32
const struct radv_device *device = radv_queue_device(queue);
const struct radv_physical_device *pdev = radv_device_physical(device);
enum amd_ip_type ring = radv_queue_ring(queue);
char cmd[256];
/* TODO: Dump compute ring. */
if (ring != AMD_IP_GFX)
return;
sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O bits,halt_waves -go 0 -wa %s -go 1 2>&1", pdev->bus_info.domain,
pdev->bus_info.bus, pdev->bus_info.dev, pdev->bus_info.func,
pdev->info.gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
fprintf(f, "\nUMR GFX waves:\n\n");
radv_dump_cmd(cmd, f);
#endif
fprintf(f, "\nUMR GFX waves:\n\n%s", wave_dump ? wave_dump : "");
}
static bool
@ -794,6 +778,10 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
{"bo_history"}, {"vm_fault"}, {"app_info"}, {"gpu_info"}, {"dmesg"},
};
char *wave_dump = NULL;
if (!(instance->debug_flags & RADV_DEBUG_NO_UMR))
wave_dump = ac_get_umr_waves(&pdev->info, radv_queue_ring(queue));
for (uint32_t i = 0; i < RADV_DEVICE_FAULT_CHUNK_COUNT; i++) {
if (save_hang_report) {
@ -812,11 +800,11 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
radv_dump_trace(device, submit_info->cs_array[0], f);
break;
case RADV_DEVICE_FAULT_CHUNK_QUEUE_STATE:
radv_dump_queue_state(queue, dump_dir, f);
radv_dump_queue_state(queue, dump_dir, wave_dump, f);
break;
case RADV_DEVICE_FAULT_CHUNK_UMR_WAVES:
if (!(instance->debug_flags & RADV_DEBUG_NO_UMR))
radv_dump_umr_waves(queue, f);
radv_dump_umr_waves(queue, wave_dump, f);
break;
case RADV_DEVICE_FAULT_CHUNK_UMR_RING:
if (!(instance->debug_flags & RADV_DEBUG_NO_UMR))
@ -855,6 +843,8 @@ radv_check_gpu_hangs(struct radv_queue *queue, const struct radv_winsys_submit_i
fclose(f);
}
free(wave_dump);
if (save_hang_report) {
fprintf(stderr, "radv: GPU hang report saved successfully!\n");
abort();

View file

@ -975,7 +975,7 @@ static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_i
static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
{
struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
unsigned num_waves = ac_get_wave_info(sctx->gfx_level, &sctx->screen->info, waves);
unsigned num_waves = ac_get_wave_info(sctx->gfx_level, &sctx->screen->info, NULL, waves);
fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves);