ci/lava: Detect a6xx gpu recovery failures
Sporadically the a6xx gpu fails to recover, causing the lava job
a660_vk_full to loop on error messages for three hours before timing
out.
A few sporadic error messages may still be recoverable, but when multiple
errors occur over a short period, successful recovery is unlikely. Parse
the logs to look for repeated error messages within a short time period.
If found, cancel the lava job and rerun it.
Also add unit tests for this behaviour.
cc: mesa-stable
Reported-by: Valentine Burley <valentine.burley@gmail.com>
Acked-by: Daniel Stone <daniel.stone@collabora.com>
Reviewed-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Signed-off-by: Deborah Brouwer <deborah.brouwer@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30032>
(cherry picked from commit 72c182f873)
parent d956bc9ec2
commit dad8f2d4e2
4 changed files with 84 additions and 3 deletions
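
The detection added below is a windowed threshold: count matching error lines and, once the count hits a cap, cancel only if all of them arrived within a fixed window. A minimal standalone sketch of that idea (the names here are illustrative; the real constants and method appear in the hunks that follow):

    from datetime import datetime, timedelta
    from typing import Optional

    WATCH_PERIOD = timedelta(minutes=3)  # mirrors A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
    MAX_COUNT = 30                       # mirrors A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT

    class WindowedThreshold:
        """Trips once MAX_COUNT events land within WATCH_PERIOD of the first one."""

        def __init__(self) -> None:
            self.count = 0
            self.first_seen: Optional[datetime] = None

        def record(self, when: datetime) -> bool:
            self.count += 1
            if self.first_seen is None:
                self.first_seen = when
            if self.count == MAX_COUNT:
                if when - self.first_seen <= WATCH_PERIOD:
                    return True  # failures too dense: recovery is unlikely
                # Failures were spread out; restart with a fresh window.
                self.first_seen = None
                self.count = 0
            return False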
@@ -23,3 +23,10 @@ KNOWN_ISSUE_R8152_PATTERNS: tuple[str, ...] = (
 # This is considered noise, since LAVA produces this log after receiving a package of feedback
 # messages.
 LOG_DEBUG_FEEDBACK_NOISE = "Listened to connection for namespace 'dut' done"
+
+A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN = 3
+A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT = 30
+A6XX_GPU_RECOVERY_FAILURE_MESSAGE = (
+    "cx gdsc didn't collapse",
+    "Timeout waiting for GMU OOB",
+)
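
Read together, these constants say: thirty log lines matching either pattern within three minutes. The detector below tests lines with search_known_issue_patterns, whose definition is not part of this diff; a plausible stand-in, assuming it simply regex-searches each pattern in turn, would be:

    import re
    from typing import Optional

    # Assumed behaviour of lava.utils' search_known_issue_patterns
    # (the real helper is defined outside this diff):
    def search_known_issue_patterns(patterns: tuple[str, ...], msg: str) -> Optional[str]:
        for pattern in patterns:
            if re.search(pattern, msg):
                return pattern
        return None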
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import re
+from datetime import datetime, timedelta
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, Optional, Sequence
 
 if TYPE_CHECKING:
     from lava.utils import LogFollower
@@ -13,6 +14,9 @@ from lava.utils.constants import (
     KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
     LOG_DEBUG_FEEDBACK_NOISE,
     KNOWN_ISSUE_R8152_PATTERNS,
+    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
+    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
+    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
 )
 from lava.utils.log_section import LogSectionType
 
@@ -29,6 +33,8 @@ class LAVALogHints:
     log_follower: LogFollower
     r8152_issue_consecutive_counter: int = field(default=0, init=False)
     reboot_counter: int = field(default=0, init=False)
+    a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False)
+    a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False)
 
     def raise_known_issue(self, message) -> None:
         raise MesaCIKnownIssueException(
@@ -44,6 +50,7 @@ class LAVALogHints:
                 continue
             self.detect_r8152_issue(line)
             self.detect_forced_reboot(line)
+            self.detect_a6xx_gpu_recovery_failure(line)
 
     def detect_r8152_issue(self, line):
         if self.log_follower.phase in (
@@ -77,3 +84,23 @@ class LAVALogHints:
             self.raise_known_issue(
                 "Forced reboot detected during test phase, failing the job..."
             )
+
+    # If the a6xx gpu repeatedly fails to recover over a short period of time,
+    # then successful recovery is unlikely so cancel the job preemptively.
+    def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None:
+        if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]):
+            time_of_failure = datetime.fromisoformat(line["dt"])
+            self.a6xx_gpu_recovery_fail_counter += 1
+
+            if self.a6xx_gpu_first_fail_time is None:
+                self.a6xx_gpu_first_fail_time = time_of_failure
+
+            if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT:
+                time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time
+                if time_since_first_fail <= timedelta(minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN):
+                    self.raise_known_issue(
+                        "Repeated GPU recovery failure detected: cancelling the job"
+                    )
+                else:
+                    self.a6xx_gpu_first_fail_time = None
+                    self.a6xx_gpu_recovery_fail_counter = 0
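
Note the window is only evaluated when the counter reaches its cap: with the defaults, the job is cancelled only if failures arrive at a sustained rate of roughly one every six seconds; otherwise the counter and window reset. A quick check of that rate:

    from datetime import timedelta

    # 30 matches must fit inside a 3-minute window to cancel the job:
    window = timedelta(minutes=3)  # A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
    print(window / 30)             # 0:00:06 -> one matching line every ~6 s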
@@ -16,7 +16,13 @@ from lava.utils import (
     fix_lava_gitlab_section_log,
     hide_sensitive_data,
 )
-from lava.utils.constants import KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER
+from lava.utils.constants import (
+    KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
+    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
+    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
+    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
+)
 from lava.utils.lava_log_hints import LAVALogHints
+
 from ..lava.helpers import (
     create_lava_yaml_msg,
@@ -390,3 +396,44 @@ def test_detect_failure(messages, expectation):
     lf = LogFollower(starting_section=boot_section)
     with expectation:
         lf.feed(messages)
+
+def test_detect_a6xx_gpu_recovery_failure(frozen_time):
+    log_follower = LogFollower()
+    lava_log_hints = LAVALogHints(log_follower=log_follower)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0],
+        "lvl": "feedback",
+    }
+    with pytest.raises(MesaCIKnownIssueException):
+        for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT):
+            lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+            # Simulate the passage of time within the watch period
+            frozen_time.tick(1)
+            failure_message["dt"] = datetime.now().isoformat()
+
+def test_detect_a6xx_gpu_recovery_success(frozen_time):
+    log_follower = LogFollower()
+    lava_log_hints = LAVALogHints(log_follower=log_follower)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0],
+        "lvl": "feedback",
+    }
+    # Simulate sending a tolerable number of failure messages
+    for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT - 1):
+        lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+        frozen_time.tick(1)
+        failure_message["dt"] = datetime.now().isoformat()
+
+    # Simulate the passage of time outside of the watch period
+    frozen_time.tick(60 * A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN + 1)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[1],
+        "lvl": "feedback",
+    }
+    with does_not_raise():
+        lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+    assert lava_log_hints.a6xx_gpu_first_fail_time is None, "a6xx_gpu_first_fail_time is not None"
+    assert lava_log_hints.a6xx_gpu_recovery_fail_counter == 0, "a6xx_gpu_recovery_fail_counter is not 0"
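
These tests depend on a frozen_time fixture that is not part of this diff. Its .tick() calls match freezegun's clock-factory API, so a fixture along these lines would satisfy them (an assumption about the project's conftest, not a quote from it):

    import pytest
    from freezegun import freeze_time

    @pytest.fixture
    def frozen_time():
        # freezegun's factory exposes tick(seconds_or_timedelta),
        # which is how the tests above advance the simulated clock.
        with freeze_time() as frozen:
            yield frozen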
@@ -374,7 +374,7 @@
         "description": "ci/lava: Detect a6xx gpu recovery failures",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
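
The last hunk updates what is evidently mesa's .pick_status.json, the state file used by the bin/pick stable-nomination tool; flipping resolution from 0 to 1 records this nomination as resolved on the stable branch. A sketch of the assumed mapping (illustrative only; check bin/pick for the authoritative enum):

    import enum

    # Assumed resolution values; not quoted from bin/pick.
    class Resolution(enum.IntEnum):
        UNRESOLVED = 0
        MERGED = 1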