ci/lava: Detect a6xx gpu recovery failures
Sporadically the a6xx gpu fails to recover, causing the lava job
a660_vk_full to loop on error messages for three hours before timing
out.
A few sporadic error messages may still be recoverable, but when multiple
errors occur over a short period, successful recovery is unlikely. Parse
the logs to look for repeated error messages within a short time period.
If found, cancel the lava job and rerun it.
Also add unit tests for this behaviour.
cc: mesa-stable
Reported-by: Valentine Burley <valentine.burley@gmail.com>
Acked-by: Daniel Stone <daniel.stone@collabora.com>
Reviewed-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Signed-off-by: Deborah Brouwer <deborah.brouwer@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30032>
(cherry picked from commit 72c182f873)
parent d956bc9ec2
commit dad8f2d4e2
4 changed files with 84 additions and 3 deletions
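
The detection added below is a windowed threshold: count matching error lines and, once the count hits a cap, cancel only if all of them arrived within a fixed window. A minimal standalone sketch of that idea (the names here are illustrative; the real constants and method appear in the hunks that follow):

    from datetime import datetime, timedelta
    from typing import Optional

    WATCH_PERIOD = timedelta(minutes=3)  # mirrors A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
    MAX_COUNT = 30                       # mirrors A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT

    class WindowedThreshold:
        """Trips once MAX_COUNT events land within WATCH_PERIOD of the first one."""

        def __init__(self) -> None:
            self.count = 0
            self.first_seen: Optional[datetime] = None

        def record(self, when: datetime) -> bool:
            self.count += 1
            if self.first_seen is None:
                self.first_seen = when
            if self.count == MAX_COUNT:
                if when - self.first_seen <= WATCH_PERIOD:
                    return True  # failures too dense: recovery is unlikely
                # Failures were spread out; restart with a fresh window.
                self.first_seen = None
                self.count = 0
            return False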
@@ -23,3 +23,10 @@ KNOWN_ISSUE_R8152_PATTERNS: tuple[str, ...] = (
 # This is considered noise, since LAVA produces this log after receiving a package of feedback
 # messages.
 LOG_DEBUG_FEEDBACK_NOISE = "Listened to connection for namespace 'dut' done"
+
+A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN = 3
+A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT = 30
+A6XX_GPU_RECOVERY_FAILURE_MESSAGE = (
+    "cx gdsc didn't collapse",
+    "Timeout waiting for GMU OOB",
+)
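
Read together, these constants say: thirty log lines matching either pattern within three minutes. The detector below tests lines with search_known_issue_patterns, whose definition is not part of this diff; a plausible stand-in, assuming it simply regex-searches each pattern in turn, would be:

    import re
    from typing import Optional

    # Assumed behaviour of lava.utils' search_known_issue_patterns
    # (the real helper is defined outside this diff):
    def search_known_issue_patterns(patterns: tuple[str, ...], msg: str) -> Optional[str]:
        for pattern in patterns:
            if re.search(pattern, msg):
                return pattern
        return None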
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import re
+from datetime import datetime, timedelta
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, Optional, Sequence
 
 if TYPE_CHECKING:
     from lava.utils import LogFollower
@@ -13,6 +14,9 @@ from lava.utils.constants import (
     KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
     LOG_DEBUG_FEEDBACK_NOISE,
     KNOWN_ISSUE_R8152_PATTERNS,
+    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
+    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
+    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
 )
 from lava.utils.log_section import LogSectionType
 
@@ -29,6 +33,8 @@ class LAVALogHints:
     log_follower: LogFollower
     r8152_issue_consecutive_counter: int = field(default=0, init=False)
     reboot_counter: int = field(default=0, init=False)
+    a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False)
+    a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False)
 
     def raise_known_issue(self, message) -> None:
         raise MesaCIKnownIssueException(
@@ -44,6 +50,7 @@ class LAVALogHints:
                 continue
             self.detect_r8152_issue(line)
             self.detect_forced_reboot(line)
+            self.detect_a6xx_gpu_recovery_failure(line)
 
     def detect_r8152_issue(self, line):
         if self.log_follower.phase in (
@@ -77,3 +84,23 @@ class LAVALogHints:
             self.raise_known_issue(
                 "Forced reboot detected during test phase, failing the job..."
             )
+
+    # If the a6xx gpu repeatedly fails to recover over a short period of time,
+    # then successful recovery is unlikely so cancel the job preemptively.
+    def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None:
+        if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]):
+            time_of_failure = datetime.fromisoformat(line["dt"])
+            self.a6xx_gpu_recovery_fail_counter += 1
+
+            if self.a6xx_gpu_first_fail_time is None:
+                self.a6xx_gpu_first_fail_time = time_of_failure
+
+            if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT:
+                time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time
+                if time_since_first_fail <= timedelta(minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN):
+                    self.raise_known_issue(
+                        "Repeated GPU recovery failure detected: cancelling the job"
+                    )
+                else:
+                    self.a6xx_gpu_first_fail_time = None
+                    self.a6xx_gpu_recovery_fail_counter = 0
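
Note the window is only evaluated when the counter reaches its cap: with the defaults, the job is cancelled only if failures arrive at a sustained rate of roughly one every six seconds; otherwise the counter and window reset. A quick check of that rate:

    from datetime import timedelta

    # 30 matches must fit inside a 3-minute window to cancel the job:
    window = timedelta(minutes=3)  # A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
    print(window / 30)             # 0:00:06 -> one matching line every ~6 s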
@@ -16,7 +16,13 @@ from lava.utils import (
     fix_lava_gitlab_section_log,
     hide_sensitive_data,
 )
-from lava.utils.constants import KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER
+from lava.utils.constants import (
+    KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
+    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
+    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
+    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
+)
 from lava.utils.lava_log_hints import LAVALogHints
+
 from ..lava.helpers import (
     create_lava_yaml_msg,
@@ -390,3 +396,44 @@ def test_detect_failure(messages, expectation):
     lf = LogFollower(starting_section=boot_section)
     with expectation:
         lf.feed(messages)
+
+def test_detect_a6xx_gpu_recovery_failure(frozen_time):
+    log_follower = LogFollower()
+    lava_log_hints = LAVALogHints(log_follower=log_follower)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0],
+        "lvl": "feedback",
+    }
+    with pytest.raises(MesaCIKnownIssueException):
+        for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT):
+            lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+            # Simulate the passage of time within the watch period
+            frozen_time.tick(1)
+            failure_message["dt"] = datetime.now().isoformat()
+
+def test_detect_a6xx_gpu_recovery_success(frozen_time):
+    log_follower = LogFollower()
+    lava_log_hints = LAVALogHints(log_follower=log_follower)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0],
+        "lvl": "feedback",
+    }
+    # Simulate sending a tolerable number of failure messages
+    for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT - 1):
+        lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+        frozen_time.tick(1)
+        failure_message["dt"] = datetime.now().isoformat()
+
+    # Simulate the passage of time outside of the watch period
+    frozen_time.tick(60 * A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN + 1)
+    failure_message = {
+        "dt": datetime.now().isoformat(),
+        "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[1],
+        "lvl": "feedback",
+    }
+    with does_not_raise():
+        lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message)
+    assert lava_log_hints.a6xx_gpu_first_fail_time is None, "a6xx_gpu_first_fail_time is not None"
+    assert lava_log_hints.a6xx_gpu_recovery_fail_counter == 0, "a6xx_gpu_recovery_fail_counter is not 0"
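
These tests depend on a frozen_time fixture that is not part of this diff. Its .tick() calls match freezegun's clock-factory API, so a fixture along these lines would satisfy them (an assumption about the project's conftest, not a quote from it):

    import pytest
    from freezegun import freeze_time

    @pytest.fixture
    def frozen_time():
        # freezegun's factory exposes tick(seconds_or_timedelta),
        # which is how the tests above advance the simulated clock.
        with freeze_time() as frozen:
            yield frozen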
@@ -374,7 +374,7 @@
         "description": "ci/lava: Detect a6xx gpu recovery failures",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
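
The last hunk updates what is evidently mesa's .pick_status.json, the state file used by the bin/pick stable-nomination tool; flipping resolution from 0 to 1 records this nomination as resolved on the stable branch. A sketch of the assumed mapping (illustrative only; check bin/pick for the authoritative enum):

    import enum

    # Assumed resolution values; not quoted from bin/pick.
    class Resolution(enum.IntEnum):
        UNRESOLVED = 0
        MERGED = 1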