bin/ci: crnm: fix bug while stressing a single job
When a stress test is launched on a single job, the script behaves as if the stress number were not set. After this premature exit, relaunching the command works only if the stress count is greater than 2; with a count of exactly 2 it can miscount the number of executions.

In stress mode, do not exit the monitor_pipeline method as if there were only a single job run. A single job run prints the job trace to stdout, but in stress mode there is more than one job execution to follow. On top of that, the stress_status_counter structure lost the information about job IDs, so the same job could be counted twice.

Reported-by: Martin Roukala (né Peres) <martin.roukala@mupuf.org>
Signed-off-by: Sergi Blanch Torne <sergi.blanch.torne@collabora.com>
Reviewed-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33750>
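For illustration only (not part of this commit), the following minimal Python sketch shows the failure mode and the bookkeeping the patch switches to. The job name "vkcts", the job IDs, and the flat (name, id, status) tuples are invented; the real script polls gitlab.v4.objects.ProjectPipelineJob objects and stores (duration, status, web_url) per job ID.

from collections import Counter, defaultdict

# Hypothetical polling results: the monitor loop sees the same completed
# job (id 101) on two consecutive iterations before its retry appears.
polled_jobs = [
    ("vkcts", 101, "success"),
    ("vkcts", 101, "success"),  # same job reported again on the next poll
    ("vkcts", 102, "failed"),
]

# Old bookkeeping: a per-name counter bumped on every sighting, so the
# repeated sighting of job 101 is counted twice.
stress_status_counter = defaultdict(Counter)
for name, _job_id, status in polled_jobs:
    stress_status_counter[name][status] += 1
print(dict(stress_status_counter["vkcts"]))  # {'success': 2, 'failed': 1} -- inflated

# New bookkeeping: results keyed by job id (simplified here to just the
# status string), so re-seeing a job overwrites its entry instead of
# adding a second count.
execution_times = defaultdict(dict)
for name, job_id, status in polled_jobs:
    execution_times[name][job_id] = status
print(Counter(execution_times["vkcts"].values()))  # Counter({'success': 1, 'failed': 1})

The diff below removes stress_status_counter entirely and derives the per-status summary from execution_times, which is keyed by job ID.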
parent ce200e6a4a
commit e6843c1705

1 changed file with 20 additions and 15 deletions
@@ -16,7 +16,7 @@ import argparse
 import re
 import sys
 import time
-from collections import defaultdict
+from collections import defaultdict, Counter
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from itertools import chain
@@ -118,7 +118,6 @@ def run_target_job(
     job: gitlab.v4.objects.ProjectPipelineJob,
     enable_job_fn: Callable,
     stress: int,
-    stress_status_counter: dict,
     execution_times: dict,
     target_statuses: dict,
     name_field_pad: int,
@@ -126,9 +125,8 @@ def run_target_job(
     if stress and job.status in COMPLETED_STATUSES:
         if (
             stress < 0
-            or sum(stress_status_counter[job.name].values()) < stress
+            or len(execution_times[job.name]) < stress
         ):
-            stress_status_counter[job.name][job.status] += 1
             execution_times[job.name][job.id] = (job_duration(job), job.status, job.web_url)
             enable_job_fn(job=job, action_type="retry")
             # Wait for the next loop to get the updated job object
@@ -153,8 +151,7 @@ def monitor_pipeline(
     """Monitors pipeline and delegate canceling jobs"""
     statuses: dict[str, str] = defaultdict(str)
     target_statuses: dict[str, str] = defaultdict(str)
-    stress_status_counter: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
-    execution_times = defaultdict(lambda: defaultdict(tuple))
+    execution_times: dict[str, dict[str, tuple[float, str, str]]] = defaultdict(lambda: defaultdict(tuple))
     target_id: int = -1
     name_field_pad: int = len(max(dependencies, key=len))+2
     # In a running pipeline, we can skip following job traces that are in these statuses.
@@ -168,7 +165,6 @@ def monitor_pipeline(
                 include_stage_regex.fullmatch(job.stage) and \
                 not exclude_stage_regex.fullmatch(job.stage) and \
                 job.status in COMPLETED_STATUSES:
-            stress_status_counter[job.name][job.status] += 1
             execution_times[job.name][job.id] = (job_duration(job), job.status, job.web_url)

     # jobs_waiting is a list of job names that are waiting for status update.
@@ -197,7 +193,6 @@ def monitor_pipeline(
                 job,
                 enable_job_fn,
                 stress,
-                stress_status_counter,
                 execution_times,
                 target_statuses,
                 name_field_pad,
@@ -223,14 +218,22 @@ def monitor_pipeline(

         if stress:
             enough = True
-            for job_name, status in sorted(stress_status_counter.items()):
+            status_counters = {
+                name: Counter(info[1] for info in runs.values())
+                for name, runs in execution_times.items()
+            }
+            for job_name, counter in sorted(status_counters.items()):
+                n_succeed = counter.get("success", 0)
+                n_failed = counter.get("failed", 0)
+                n_total_completed = n_succeed + n_failed
+                n_total_seen = len(execution_times[job_name])
                 print(
-                    f"* {job_name:{name_field_pad}}succ: {status['success']}; "
-                    f"fail: {status['failed']}; "
-                    f"total: {sum(status.values())} of {stress}",
+                    f"* {job_name:{name_field_pad}}succ: {n_succeed}; "
+                    f"fail: {n_failed}; "
+                    f"total: {n_total_seen} of {stress}",
                     flush=False,
                 )
-                if stress < 0 or sum(status.values()) < stress:
+                if stress < 0 or n_total_completed < stress:
                     enough = False

             if not enough:
@@ -246,8 +249,10 @@ def monitor_pipeline(
             pretty_wait(REFRESH_WAIT_JOBS)
             continue

-        if len(target_statuses) == 1 and RUNNING_STATUSES.intersection(
-            target_statuses.values()
+        if (
+            stress in [0, 1]
+            and len(target_statuses) == 1
+            and RUNNING_STATUSES.intersection(target_statuses.values())
         ):
             return target_id, None, execution_times

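To show how the summary derivation added in the last monitor_pipeline hunk behaves on its own, here is a hedged standalone sketch. The job name, job IDs, durations, and URLs are invented, stress is set to an arbitrary 3, and the name_field_pad padding from the script is replaced by a fixed width.

from collections import Counter

# Assumed shape, mirroring the patch:
# execution_times[job_name][job_id] = (duration, status, web_url)
execution_times = {
    "vkcts": {
        101: (325.0, "success", "https://gitlab.example.com/jobs/101"),
        102: (298.4, "failed", "https://gitlab.example.com/jobs/102"),
        103: (310.7, "success", "https://gitlab.example.com/jobs/103"),
    },
}
stress = 3  # requested number of runs per job

# Same derivation as the patch: count the status field (info[1]) per job name.
status_counters = {
    name: Counter(info[1] for info in runs.values())
    for name, runs in execution_times.items()
}

for job_name, counter in sorted(status_counters.items()):
    n_succeed = counter.get("success", 0)
    n_failed = counter.get("failed", 0)
    n_total_seen = len(execution_times[job_name])
    print(
        f"* {job_name:10}succ: {n_succeed}; "
        f"fail: {n_failed}; "
        f"total: {n_total_seen} of {stress}"
    )
# Prints: * vkcts     succ: 2; fail: 1; total: 3 of 3

Because every execution is keyed by its job ID, counting three entries here really means three distinct runs, which is what the stress-count comparison against the requested number relies on.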