bin/ci: crnm: bug while stressing a single job

When one launches a stress test on a single job, the script behaves like the stress number is not set. After this wrong end, relaunch the command works only if stress is bigger than 2. In case 2, it can confuse the number of executions. When in stress mode, don't exit the monitor_pipeline method as if there were only one job run. One job run, prints in std the job trace, but in stress mode, there are more than one job execution. The stress_status_counter structure lost the information about job IDs, and the bug happens when it counts twice the same job. Reported-by: Martin Roukala (né Peres) <martin.roukala@mupuf.org> Signed-off-by: Sergi Blanch Torne <sergi.blanch.torne@collabora.com> Reviewed-by: Guilherme Gallo <guilherme.gallo@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33750>
2026-05-09 06:48:06 +02:00 · 2025-04-17 14:57:50 +02:00 · 2025-04-17 14:57:50 +02:00 · e6843c1705
commit e6843c1705
parent ce200e6a4a
1 changed files with 20 additions and 15 deletions
--- a/bin/ci/ci_run_n_monitor.py
+++ b/bin/ci/ci_run_n_monitor.py
@ -16,7 +16,7 @@ import argparse
 import re
 import sys
 import time
-from collections import defaultdict
+from collections import defaultdict, Counter
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from itertools import chain
@ -118,7 +118,6 @@ def run_target_job(
    job: gitlab.v4.objects.ProjectPipelineJob,
    enable_job_fn: Callable,
    stress: int,
-    stress_status_counter: dict,
    execution_times: dict,
    target_statuses: dict,
    name_field_pad: int,
@ -126,9 +125,8 @@ def run_target_job(
    if stress and job.status in COMPLETED_STATUSES:
        if (
            stress < 0
-            or sum(stress_status_counter[job.name].values()) < stress
+            or len(execution_times[job.name]) < stress
        ):
-            stress_status_counter[job.name][job.status] += 1
            execution_times[job.name][job.id] = (job_duration(job), job.status, job.web_url)
            enable_job_fn(job=job, action_type="retry")
            # Wait for the next loop to get the updated job object
@ -153,8 +151,7 @@ def monitor_pipeline(
    """Monitors pipeline and delegate canceling jobs"""
    statuses: dict[str, str] = defaultdict(str)
    target_statuses: dict[str, str] = defaultdict(str)
-    stress_status_counter: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
-    execution_times = defaultdict(lambda: defaultdict(tuple))
+    execution_times: dict[str, dict[str, tuple[float, str, str]]] = defaultdict(lambda: defaultdict(tuple))
    target_id: int = -1
    name_field_pad: int = len(max(dependencies, key=len))+2
    # In a running pipeline, we can skip following job traces that are in these statuses.
@ -168,7 +165,6 @@ def monitor_pipeline(
               include_stage_regex.fullmatch(job.stage) and \
               not exclude_stage_regex.fullmatch(job.stage) and \
               job.status in COMPLETED_STATUSES:
-                stress_status_counter[job.name][job.status] += 1
                execution_times[job.name][job.id] = (job_duration(job), job.status, job.web_url)

    # jobs_waiting is a list of job names that are waiting for status update.
@ -197,7 +193,6 @@ def monitor_pipeline(
                    job,
                    enable_job_fn,
                    stress,
-                    stress_status_counter,
                    execution_times,
                    target_statuses,
                    name_field_pad,
@ -223,14 +218,22 @@ def monitor_pipeline(

        if stress:
            enough = True
-            for job_name, status in sorted(stress_status_counter.items()):
+            status_counters = {
+                name: Counter(info[1] for info in runs.values())
+                for name, runs in execution_times.items()
+            }
+            for job_name, counter in sorted(status_counters.items()):
+                n_succeed = counter.get("success", 0)
+                n_failed = counter.get("failed", 0)
+                n_total_completed = n_succeed + n_failed
+                n_total_seen = len(execution_times[job_name])
                print(
-                    f"* {job_name:{name_field_pad}}succ: {status['success']}; "
-                    f"fail: {status['failed']}; "
-                    f"total: {sum(status.values())} of {stress}",
+                    f"* {job_name:{name_field_pad}}succ: {n_succeed}; "
+                    f"fail: {n_failed}; "
+                    f"total: {n_total_seen} of {stress}",
                    flush=False,
                )
-                if stress < 0 or sum(status.values()) < stress:
+                if stress < 0 or n_total_completed < stress:
                    enough = False

            if not enough:
@ -246,8 +249,10 @@ def monitor_pipeline(
            pretty_wait(REFRESH_WAIT_JOBS)
            continue

-        if len(target_statuses) == 1 and RUNNING_STATUSES.intersection(
-            target_statuses.values()
+        if (
+            stress in [0, 1]
+            and len(target_statuses) == 1
+            and RUNNING_STATUSES.intersection(target_statuses.values())
        ):
            return target_id, None, execution_times