mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-27 01:50:10 +01:00
gitlab-ci: detect a3xx gpu hang recovery failure
But don't bail immediately, instead print out some more lines after the hang, hopefully catching info about the cause of the hang. Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu> Reviewed-by: Emma Anholt <emma@anholt.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14033>
This commit is contained in:
parent
eb0b08ea1a
commit
268fc8e5c1
1 changed files with 18 additions and 0 deletions
|
|
@ -70,7 +70,13 @@ class FastbootRun:
|
|||
if self.logged_system(self.fastboot) != 0:
|
||||
return 1
|
||||
|
||||
print_more_lines = -1
|
||||
for line in self.ser.lines():
|
||||
if print_more_lines == 0:
|
||||
return 2
|
||||
if print_more_lines > 0:
|
||||
print_more_lines -= 1
|
||||
|
||||
if re.search("---. end Kernel panic", line):
|
||||
return 1
|
||||
|
||||
|
|
@ -92,6 +98,18 @@ class FastbootRun:
|
|||
"Detected network device failure, restarting run...")
|
||||
return 2
|
||||
|
||||
# A3xx recovery doesn't quite work. Sometimes the GPU will get
|
||||
# wedged and recovery will fail (because power can't be reset?)
|
||||
# This assumes that the jobs are sufficiently well-tested that GPU
|
||||
# hangs aren't always triggered, so just try again. But print some
|
||||
# more lines first so that we get better information on the cause
|
||||
# of the hang. Once a hang happens, it's pretty chatty.
|
||||
if "[drm:adreno_recover] *ERROR* gpu hw init failed: -22" in line:
|
||||
self.print_error(
|
||||
"Detected GPU hang, restarting run...")
|
||||
if print_more_lines == -1:
|
||||
print_more_lines = 30
|
||||
|
||||
result = re.search("hwci: mesa: (\S*)", line)
|
||||
if result:
|
||||
if result.group(1) == "pass":
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue