diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index af20abdbc64..e5ff2d8917e 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -34,7 +34,7 @@ import urllib.parse
 import xmlrpc
 import yaml
 
-from datetime import datetime
+from datetime import datetime, timedelta
 from lavacli.utils import loader
 
 
@@ -210,24 +210,44 @@ def get_job_results(proxy, job_id, test_suite, test_case):
     return True
 
 
+def wait_until_job_is_started(proxy, job_id):
+    print_log(f"Waiting for job {job_id} to start.")
+    current_state = "Submitted"
+    waiting_states = ["Submitted", "Scheduling", "Scheduled"]
+    while current_state in waiting_states:
+        job_state = _call_proxy(proxy.scheduler.job_state, job_id)
+        current_state = job_state["job_state"]
+
+        time.sleep(30)
+    print_log(f"Job {job_id} started.")
+
+
 def follow_job_execution(proxy, job_id):
     line_count = 0
     finished = False
+    last_time_logs = datetime.now()
     while not finished:
         (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
         logs = yaml.load(str(data), Loader=loader(False))
         if logs:
+            # Reset the timeout
+            last_time_logs = datetime.now()
             for line in logs:
                 print("{} {}".format(line["dt"], line["msg"]))
 
             line_count += len(logs)
 
+        else:
+            time_limit = timedelta(minutes=1)
+            if datetime.now() - last_time_logs > time_limit:
+                print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
+                return False
+
         # `proxy.scheduler.jobs.logs` does not block, even when there is no
         # new log to be fetched. To avoid dosing the LAVA dispatcher
         # machine, let's add a sleep to save them some stamina.
         time.sleep(5)
 
+    return True
+
 
 def show_job_data(proxy, job_id):
     show = _call_proxy(proxy.scheduler.jobs.show, job_id)
@@ -262,13 +282,22 @@ def main(args):
         print("LAVA job definition validated successfully")
         return
 
-    while True:
+    retry_count = 2
+    while retry_count >= 0:
         job_id = submit_job(proxy, yaml_file)
 
         print_log("LAVA job id: {}".format(job_id))
 
-        follow_job_execution(proxy, job_id)
+        wait_until_job_is_started(proxy, job_id)
+
+        if not follow_job_execution(proxy, job_id):
+            print_log(f"Job {job_id} has timed out. Cancelling it.")
+            # Cancel the job as it is considered unreachable by Mesa CI.
+            proxy.scheduler.jobs.cancel(job_id)
+
+            retry_count -= 1
+            continue
 
         show_job_data(proxy, job_id)