diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index 4bc0628433c..b120210e727 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -16,7 +16,7 @@ import sys
 import time
 from collections import defaultdict
 from dataclasses import dataclass, fields
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from os import environ, getenv, path
 from typing import Any, Optional
 
@@ -83,6 +83,22 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
     getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2)
 )
 
+CI_JOB_TIMEOUT_SEC = int(getenv("CI_JOB_TIMEOUT", 3600))
+# How many seconds the script will wait to let LAVA run the job and give the final details.
+EXPECTED_JOB_DURATION_SEC = int(getenv("EXPECTED_JOB_DURATION_SEC", 60 * 10))
+# CI_JOB_STARTED_AT is given by GitLab CI/CD as an ISO 8601 timestamp, in UTC
+# by default (e.g. "2022-01-31T16:47:55Z"). datetime.fromisoformat() only
+# accepts the "Z" suffix since Python 3.11, so normalize it to "+00:00" first.
+CI_JOB_STARTED_AT_RAW = getenv("CI_JOB_STARTED_AT", "")
+CI_JOB_STARTED_AT: datetime = (
+    datetime.fromisoformat(CI_JOB_STARTED_AT_RAW.replace("Z", "+00:00"))
+    if CI_JOB_STARTED_AT_RAW
+    else datetime.now(timezone.utc)
+)
+if CI_JOB_STARTED_AT.tzinfo is None:
+    # A naive value cannot be subtracted from an aware "now"; assume UTC.
+    CI_JOB_STARTED_AT = CI_JOB_STARTED_AT.replace(tzinfo=timezone.utc)
+
 
 def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
     """
@@ -221,9 +237,34 @@ def submit_job(job):
         ) from mesa_ci_err
 
 
-def wait_for_job_get_started(job):
+def wait_for_job_get_started(job, attempt_no):
+    """
+    Poll the LAVA job until it reports that it has started.
+
+    On every polling iteration, the time left for the GitLab CI job
+    (CI_JOB_TIMEOUT_SEC minus the time elapsed since CI_JOB_STARTED_AT) is
+    compared with EXPECTED_JOB_DURATION_SEC. When less time is left than the
+    job is expected to need, the LAVA job is cancelled and
+    MesaCIFatalException is raised.
+
+    `attempt_no` is accepted from the caller but is not used yet.
+    """
     print_log(f"Waiting for job {job.job_id} to start.")
     while not job.is_started():
+        current_job_duration_sec: int = int(
+            (datetime.now(timezone.utc) - CI_JOB_STARTED_AT).total_seconds()
+        )
+        remaining_time_sec: int = max(0, CI_JOB_TIMEOUT_SEC - current_job_duration_sec)
+        if remaining_time_sec < EXPECTED_JOB_DURATION_SEC:
+            job.cancel()
+            raise MesaCIFatalException(
+                f"{CONSOLE_LOG['BOLD']}"
+                f"{CONSOLE_LOG['FG_YELLOW']}"
+                f"Job {job.job_id} only has {remaining_time_sec} seconds "
+                "remaining to run, but it is expected to take at least "
+                f"{EXPECTED_JOB_DURATION_SEC} seconds."
+                f"{CONSOLE_LOG['RESET']}",
+            )
         time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC)
         job.refresh_log()
     print_log(f"Job {job.job_id} started.")
@@ -299,7 +340,7 @@ def execute_job_with_retries(
         try:
             job_log["submitter_start_time"] = datetime.now().isoformat()
             submit_job(job)
-            wait_for_job_get_started(job)
+            wait_for_job_get_started(job, attempt_no)
             log_follower: LogFollower = bootstrap_log_follower()
             follow_job_execution(job, log_follower)
             return job
diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py
index 5116ea30545..02569396983 100644
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -15,7 +15,7 @@ from typing import Generator
 from unittest.mock import MagicMock, patch
 
 import pytest
-from lava.exceptions import MesaCIException, MesaCIRetryError
+from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCIFatalException
 from lava.lava_job_submitter import (
     DEVICE_HANGING_TIMEOUT_SEC,
     NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
@@ -24,6 +24,7 @@ from lava.lava_job_submitter import (
     bootstrap_log_follower,
     follow_job_execution,
     retriable_follow_job,
+    wait_for_job_get_started,
 )
 from lava.utils import LogSectionType
 
@@ -257,6 +258,27 @@ def test_simulate_a_long_wait_to_start_a_job(
     assert delta_time.total_seconds() >= wait_time
 
 
+LONG_LAVA_QUEUE_SCENARIOS = {
+    "no_time_to_run": (0, pytest.raises(MesaCIFatalException)),
+    "enough_time_to_run": (9999999999, does_not_raise()),
+}
+
+
+@pytest.mark.parametrize(
+    "job_timeout, expectation",
+    LONG_LAVA_QUEUE_SCENARIOS.values(),
+    ids=LONG_LAVA_QUEUE_SCENARIOS.keys(),
+)
+def test_wait_for_job_get_started_no_time_to_run(monkeypatch, job_timeout, expectation):
+    monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_TIMEOUT_SEC", job_timeout)
+    job = MagicMock()
+    # Make it escape the loop
+    job.is_started.side_effect = (False, False, True)
+    with expectation as e:
+        wait_for_job_get_started(job, 1)
+    if e:
+        job.cancel.assert_called_with()
+
 
 CORRUPTED_LOG_SCENARIOS = {
     "too much subsequent corrupted data": (