From d222502624203286c60000bb645e0b79ab1218cf Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Thu, 8 Jun 2023 12:34:35 -0300 Subject: [PATCH] ci/lava: Increase Docker action failure_retry counter Our LAVA farm is currently experiencing issues with running and pulling docker. LAVA has been detecting (at a low rate) timeouts during these commands, causing some jobs to fail with infrastructure errors. Increasing the failure_retry will make the job retry running the container when LAVA detects the failure, without losing its place in the job queue. We are currently investigating why docker times out. But, when LAVA fails to detect it, we cancel the job on our side and resubmit it to the job queue. For more information, please refer to the following dashboard: https://ci-stats-grafana.freedesktop.org/goto/VjZvaA_4z?orgId=1 Signed-off-by: Guilherme Gallo Part-of: --- .gitlab-ci/lava/utils/ssh_job_definition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci/lava/utils/ssh_job_definition.py b/.gitlab-ci/lava/utils/ssh_job_definition.py index ea8c1a84c38..cc2acc66507 100644 --- a/.gitlab-ci/lava/utils/ssh_job_definition.py +++ b/.gitlab-ci/lava/utils/ssh_job_definition.py @@ -117,7 +117,7 @@ def generate_docker_test(args): init_stages_test = { "namespace": "container", "timeout": {"minutes": args.job_timeout_min}, - "failure_retry": 1, + "failure_retry": 3, "definitions": [ { "name": "docker_ssh_client",