panfrost: Abort on faults in SYNC mode

This allows failing fast (optionally still tracing, if set with
PAN_MESA_DEBUG=trace) when a GPU fault is introduced. This is better
behaviour for both use cases:

1. When debugging a known fault, setting this mode together with trace
   will stop the driver as soon as a buggy command stream is submitted,
   and the offending stream will be the last trace file.

2. When running test suites (particularly in CI), setting this mode
   will detect faults and crash, causing the pipeline to fail fast as
   opposed to incorrectly marking the run green if the test happens to
   pass despite the faults and slowdowns.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10938>
This commit is contained in:
Alyssa Rosenzweig 2021-05-21 17:38:00 -04:00 committed by Marge Bot
parent 7bc3730b3f
commit 2f4b5a4ebe
4 changed files with 33 additions and 4 deletions

View file

@ -892,9 +892,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
drmSyncobjWait(dev->fd, &out_sync, 1,
INT64_MAX, 0, NULL);
/* Trace gets priority over sync */
bool minimal = !(dev->debug & PAN_DBG_TRACE);
pandecode_jc(submit.jc, pan_is_bifrost(dev), dev->gpu_id, minimal);
if (dev->debug & PAN_DBG_TRACE)
pandecode_jc(submit.jc, pan_is_bifrost(dev), dev->gpu_id, false);
if (dev->debug & PAN_DBG_SYNC)
pandecode_abort_on_fault(submit.jc);
}
return 0;

View file

@ -62,7 +62,7 @@ static const struct debug_named_value panfrost_debug_options[] = {
{"trace", PAN_DBG_TRACE, "Trace the command stream"},
{"deqp", PAN_DBG_DEQP, "Hacks for dEQP"},
{"dirty", PAN_DBG_DIRTY, "Always re-emit all state"},
{"sync", PAN_DBG_SYNC, "Wait for each job's completion and check for any GPU fault"},
{"sync", PAN_DBG_SYNC, "Wait for each job's completion and abort on GPU faults"},
{"precompile", PAN_DBG_PRECOMPILE, "Precompile shaders for shader-db"},
{"nofp16", PAN_DBG_NOFP16, "Disable 16-bit support"},
{"gl3", PAN_DBG_GL3, "Enable experimental GL 3.x implementation, up to 3.3"},

View file

@ -29,6 +29,7 @@
#include <memory.h>
#include <stdbool.h>
#include <stdarg.h>
#include <errno.h>
#include <ctype.h>
#include "decode.h"
@ -1138,3 +1139,26 @@ pandecode_jc(mali_ptr jc_gpu_va, bool bifrost, unsigned gpu_id, bool minimal)
pandecode_map_read_write();
}
/*
 * Walk the job chain starting at jc_gpu_va and terminate the process if any
 * job in the chain did not finish successfully.
 *
 * Each job header is unpacked from the mapped GPU memory containing it; the
 * chain is followed through the header's `next` pointer until it is zero.
 * A job whose exception_status is anything other than 0x1 is reported on
 * stderr and the process exits with status EIO.
 *
 * jc_gpu_va: GPU virtual address of the first job header in the chain.
 */
void
pandecode_abort_on_fault(mali_ptr jc_gpu_va)
{
        mali_ptr next_job = 0;

        do {
                struct pandecode_mapped_memory *mem =
                        pandecode_find_mapped_gpu_mem_containing(jc_gpu_va);

                pan_unpack(PANDECODE_PTR(mem, jc_gpu_va, struct mali_job_header_packed),
                           JOB_HEADER, h);
                next_job = h.next;

                /* Ensure the job is marked COMPLETE */
                if (h.exception_status != 0x1) {
                        /* Terminate the line so the message does not run into
                         * whatever is printed next (shell prompt, CI log). */
                        fprintf(stderr, "Incomplete job or timeout\n");
                        exit(EIO);
                }
        } while ((jc_gpu_va = next_job));

        /* NOTE(review): mirrors the call at the end of pandecode_jc();
         * presumably restores mapping permissions after decoding -- confirm. */
        pandecode_map_read_write();
}

View file

@ -55,4 +55,7 @@ void pandecode_inject_free(uint64_t gpu_va, unsigned sz);
void pandecode_jc(uint64_t jc_gpu_va, bool bifrost, unsigned gpu_id, bool minimal);
/*
 * Walks the job chain starting at jc_gpu_va and exits the process with
 * status EIO if any job's exception status is not the "complete" value.
 */
void
pandecode_abort_on_fault(uint64_t jc_gpu_va);
#endif /* __MMAP_TRACE_H__ */