From 87e6f19447368d7e65c17f0d7d831d146ade741e Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Feb 2026 13:21:25 -0600 Subject: [PATCH 01/14] check-nightly-success: limit to 1 branch, other improvements --- .gitignore | 1 + .pre-commit-config.yaml | 20 +- check_nightly_success/README.md | 67 ++++ .../check-nightly-success/action.yaml | 32 +- .../check-nightly-success/check.py | 299 ++++++++++++------ check_nightly_success/dispatch/action.yml | 12 +- pyproject.toml | 18 +- telemetry-impls/summarize/bump_time.py | 6 +- telemetry-impls/summarize/send_trace.py | 4 +- 9 files changed, 325 insertions(+), 134 deletions(-) create mode 100644 check_nightly_success/README.md diff --git a/.gitignore b/.gitignore index 1377554e..9c92e4ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.swp +.venv diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66c32140..23d91c8a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,12 +18,24 @@ repos: hooks: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.10 + rev: v0.15.1 hooks: - - id: ruff - args: ["--fix"] + - id: ruff-check + args: ["--fix", "--config=pyproject.toml"] - id: ruff-format + args: ["--config=pyproject.toml"] - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v1.2.1 + rev: v1.4.3 hooks: - id: verify-copyright + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.13.0' + hooks: + - id: mypy + additional_dependencies: + - "requests>=2.32.4" + - "types-requests>=2.32.4" + args: + - "--config-file=pyproject.toml" + - "check_nightly_success/" + pass_filenames: false diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md new file mode 100644 index 00000000..56b1ff6c --- /dev/null +++ b/check_nightly_success/README.md @@ -0,0 +1,67 @@ +# check_nightly_success + +Action that can be used to fail CI if a given GitHub Actions workflow hasn't had at least 1 recent succcessful run. 
+ +Add it to any GitHub Actions workflow configuration like this: + +```yaml + check-nightly-ci: + runs-on: ubuntu-latest + permissions: + actions: read + id-token: write + env: + GH_TOKEN: ${{ github.token }} + steps: + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: ${{ github.repository }} + target_branch: ${{ github.base_ref }} + workflow_id: 'test.yaml' + max_days_without_success: 7 +``` + +## Testing + +The code for the actions is implemented in Python. +Try the following locally to test it. + +```shell +python -m venv .venv/ +source .venv/bin/activate +python -m pip install requests + +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/cudf' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +If this succeeds, you'll see a `0` exit code and output text similar to the following: + +> Found 4 successful runs of workflow 'test.yaml' on branch 'main' in the previous 7 days. +The most recent successful run of workflow 'test.yaml' on branch 'main' was '2026-02-13 13:40:18+00:00', which is within the last 7 days. View logs: + - https://github.com/rapidsai/cudf/actions/runs/21978265026 + + To see it fail, try on a repo that doesn't have that workflow. 
+ +```shell +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/build-planniing' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +That'll return exit code `1` and output similar to this: + +> RuntimeError: Failed to fetch https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs after 5 attempts with the following errors: + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 diff --git a/check_nightly_success/check-nightly-success/action.yaml b/check_nightly_success/check-nightly-success/action.yaml index 2ad52971..803ac5dd 100644 --- a/check_nightly_success/check-nightly-success/action.yaml +++ b/check_nightly_success/check-nightly-success/action.yaml @@ -1,24 +1,26 @@ name: check-nightly-success description: Check if the nightlies have succeeded recently. + +# these inputs should all be 'required: true' without defaults... 
this action should only +# ever be invoked by check_nightly_success/dispatch inputs: repo: - description: "The repository to check" + description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - repo_owner: - description: "The org that owns the repo (default: rapidsai)" - required: false - default: "rapidsai" + target_branch: + description: | + Branch the pull request this is running on targets. + Only statuses of nightly runs on that branch will be considered. + required: true type: string workflow_id: description: "The workflow whose runs to check" - required: false - default: "test.yaml" + required: true type: string max_days_without_success: description: "The number of consecutive days that may go by without a successful CI run" - required: false - default: 7 + required: true type: integer runs: @@ -28,9 +30,15 @@ runs: shell: bash env: REPO: ${{ inputs.repo }} - REPO_OWNER: ${{ inputs.repo_owner }} + TARGET_BRANCH: ${{ inputs.target_branch }} WORKFLOW_ID: ${{ inputs.workflow_id }} MAX_DAYS_WITHOUT_SUCCESS: ${{ inputs.max_days_without_success }} run: | - python -m pip install requests - python shared-actions/check_nightly_success/check-nightly-success/check.py ${REPO} --repo-owner ${REPO_OWNER} --workflow-id ${WORKFLOW_ID} --max-days-without-success ${MAX_DAYS_WITHOUT_SUCCESS} + python -m pip install \ + --prefer-binary \ + 'requests>=2.32.4' + python shared-actions/check_nightly_success/check-nightly-success/check.py \ + --repo ${REPO} \ + --branch ${TARGET_BRANCH} \ + --workflow-id ${WORKFLOW_ID} \ + --max-days-without-success ${MAX_DAYS_WITHOUT_SUCCESS} diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index e36c2298..4d375d80 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -1,148 +1,237 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+# Copyright (c) 2024-2026, NVIDIA CORPORATION. """Check whether a GHA workflow has run successfully in the last N days.""" -# ruff: noqa: INP001 import argparse import os -import re import sys -from collections import defaultdict -from datetime import datetime +import time +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone import requests # Constants -GITHUB_TOKEN = os.environ["RAPIDS_GH_TOKEN"] -GOOD_STATUSES = {"success"} +GITHUB_TOKEN = os.environ["GH_TOKEN"] + + +@dataclass +class _WorkflowRun: + """GitHub workflow run data, filtered to only the fields this action cares about.""" + + html_url: str + run_started_at: datetime + + +@dataclass +class _ResponseData: + data: list[_WorkflowRun] + next_url: str | None + + +# We are producing Unix return codes so success/failure is inverted from the +# expected Python boolean values. +@dataclass +class ExitCode: + FAILURE = 1 + SUCCESS = 0 + + +class GitHubClient: + def __init__( + self, + *, + max_retries: int, + retry_backoff_seconds: float, + request_timeout_seconds: float, + ) -> None: + self.max_retries = max_retries + self.request_timeout_seconds = request_timeout_seconds + self.retry_backoff_seconds = retry_backoff_seconds + + def __get_next_page( + self, + *, + url: str, + headers: dict[str, str], + params: dict[str, int | str], + ) -> _ResponseData: + """Get one page of results""" + exceptions = [] + for _ in range(self.max_retries): + try: + response = requests.get( + url, + headers=headers, + params=params, + timeout=self.request_timeout_seconds, + ) + response.raise_for_status() + break + except requests.RequestException as e: + exceptions.append(str(e)) + # simple backoff, without jitter, exponential backoff, etc., should be fine for this + time.sleep(self.retry_backoff_seconds) + else: + # this needs to be done outside the f-string to avoid: + # "Cannot use an escape sequence (backslash) in f-strings on Python 3.10 (syntax was added in Python 3.12)" + exception_text = 
"\n\t".join(exceptions) + msg = ( + f"Failed to fetch {url} after {self.max_retries} attempts with the following " + f"errors: \n\t{exception_text}" + ) + raise RuntimeError(msg) + + # if we get here, the request succeeded...return its data, in the format we want + return _ResponseData( + data=[ + _WorkflowRun( + html_url=workflow_run["html_url"], + run_started_at=datetime.fromisoformat(workflow_run["run_started_at"]), + ) + for workflow_run in response.json()["workflow_runs"] + ], + next_url=response.links.get("next", None), + ) + + def get_all_runs( + self, + *, + url: str, + headers: dict[str, str], + params: dict[str, int | str], + ) -> list[_WorkflowRun]: + """ + Paginate over requests to api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs + and return all the results. + """ + data = [] + while True: + page = self.__get_next_page( + url=url, + headers=headers, + params=params, + ) + data.extend(page.data) + if page.next_url is None: + break + # just use the pagination URL, not the original query one + url = page.next_url + params = None # type: ignore[assignment] + return data def main( + *, repo: str, - repo_owner: str, + target_branch: str, workflow_id: str, max_days_without_success: int, - num_attempts: int = 5, -) -> bool: + num_attempts: int, + request_timeout_seconds: float, + retry_backoff_seconds: float, +) -> int: """Check whether a GHA workflow has run successfully in the last N days. Returns True if the workflow has not run successfully in the last N days, False otherwise (values are inverted for use as a return code). 
""" - headers = {"Authorization": f"token {GITHUB_TOKEN}"} - url = f"https://api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs" - exceptions = [] - for _ in range(num_attempts): - try: - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - break - except requests.RequestException as e: - exceptions.append(e) - else: - sep = "\n\t" - msg = ( - f"Failed to fetch {url} after {num_attempts} attempts with the following " - f"errors: {sep}{'{sep}'.join(exceptions)}" - ) - raise RuntimeError(msg) - - runs = response.json()["workflow_runs"] - tz = datetime.fromisoformat(runs[0]["run_started_at"]).tzinfo - now = datetime.now(tz=tz) - - latest_success = {} - workflow_active_for_max_days = {} - # Rather frustratingly, the workflow runs returned from the GitHub API can - # have alternating ordering of `head_branch` - # e.g. - # run[0]['head_branch'] == "release/25.02" - # run[1]['head_branch'] == "release/25.04" - # run[2]['head_branch'] == "release/25.02" + # Timezones in GitHub API responses are guaranteed to be in UTC time. # - # In this situation, the behavior of `itertools.groupby` (previously used - # here) is to only group _consecutive_ runs, so the results of the - # subsequent branch match (i.e. the second group of `release/25.02` runs) - # will overwrite the results of the first one, potentially overwriting a - # previous success. The snippet below unifies the groups so it's more like a - # SQL groupby and there is no chance of overwriting. - branch_dict = defaultdict(list) - for run in runs: - branch_dict[run["head_branch"]].append(run) - - for branch, branch_runs in branch_dict.items(): - # Only consider 'main' and RAPIDS release branches, which have versions like - # '25.10' (RAPIDS) or '0.46' (ucxx). 
- if not re.match(r"(main|release/[0-9]{1,2}\.[0-9]{2})", branch): - continue - - latest_success[branch] = None - runs = sorted(branch_runs, key=lambda r: r["run_started_at"], reverse=True) - for run in runs: - days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days - if days_since_run > max_days_without_success: - break - if run["conclusion"] in GOOD_STATUSES: - latest_success[branch] = run - break + # ref: https://docs.github.com/en/rest/using-the-rest-api/timezones-and-the-rest-api?apiVersion=2022-11-28 + # + # This code is a little imprecise (doing the math in 'days' means that moving from 11:59p to 12:01a buys you + # another 23 hours and 58 minutes of time), but that difference shouldn't be important for this action. + # + # Dealing with day-precision date-times makes filtering in the GitHub API simpler, see + # https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates + # + oldest_date_to_pull = datetime.now(timezone.utc) - timedelta(days=max_days_without_success) - workflow_active_for_max_days[branch] = False - if len(runs) > 0: - run = runs[-1] - days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days - if days_since_run > max_days_without_success: - workflow_active_for_max_days[branch] = True - - latest_branch = max(latest_success) - has_latest_success = latest_success[latest_branch] is not None - - # We are producing Unix return codes so success/failure is inverted from the - # expected Python boolean values. - if has_latest_success: - print( # noqa: T201 - f"The most recent successful run of the {workflow_id} workflow on " - f"{latest_branch} was " - f"{datetime.fromisoformat(latest_success[latest_branch]['run_started_at'])}, " - f"which is within the last {max_days_without_success} days. 
View logs:" - f"\n - {latest_success[latest_branch]['html_url']}" - ) - return 0 - elif not workflow_active_for_max_days[latest_branch]: - print( # noqa: T201 - f"The oldest run of the {workflow_id} workflow on {latest_branch} was less " - f"than {max_days_without_success} days ago. This exempts the workflow from " - "check-nightly-success because the workflow has not been running for very long." + # get all the matching runs + client = GitHubClient( + max_retries=num_attempts, + request_timeout_seconds=request_timeout_seconds, + retry_backoff_seconds=retry_backoff_seconds, + ) + all_runs = client.get_all_runs( + url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + params={ + # only care about runs from one branch (usually, the PR target branch) + "branch": target_branch, + # only care about successful runs + "status": "success", + # pull as many results per page as possible + "per_page": 100, + # filter to recent-enough runs + "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", + }, + ) + + # if there were 0 successful runs, immediately exit with 1... by definition that means there + # hasn't been a success in the last `max_days_without_success` days + if not all_runs: + print( + f"There were 0 successful runs of workflow '{workflow_id}' on branch '{target_branch}' in the last " + f"{max_days_without_success} days." ) - return 0 + return ExitCode.FAILURE + + # If we get here, then there must have been at least 1 successful run in the allowed time. We need to find its + # precise time for a log message, but otherwise the code an exit with success. + print( + f"Found {len(all_runs)} successful runs of workflow '{workflow_id}' on branch '{target_branch}' " + f"in the previous {max_days_without_success} days." 
+ ) - print( # noqa: T201 - f"{latest_branch} has no successful runs of {workflow_id} in the last {max_days_without_success} days" + # sort runs by "run_started_at" + most_recent_successful_run = max(all_runs, key=lambda r: r.run_started_at) + print( + f"The most recent successful run of workflow '{workflow_id}' on branch '{target_branch}' " + f"was '{most_recent_successful_run.run_started_at}', which is within the " + f"last {max_days_without_success} days. " + f"View logs:\n - {most_recent_successful_run.html_url}" ) - return 1 + return ExitCode.SUCCESS if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("repo", type=str, help="Repository name") parser.add_argument( - "--repo-owner", - default="rapidsai", - help="Repository organization/owner", + "--repo", + type=str, + required=True, + help="Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')", + ) + parser.add_argument( + "--branch", + type=str, + required=True, + help="Branch to check for recent workflow runs.", + ) + parser.add_argument( + "--workflow-id", + type=str, + required=True, + help="Workflow ID (e.g. 
'test.yaml')", ) - parser.add_argument("--workflow-id", default="test.yaml", help="Workflow ID") parser.add_argument( "--max-days-without-success", type=int, - default=7, + required=True, help="Maximum number of days without a successful run", ) args = parser.parse_args() sys.exit( main( - args.repo, - args.repo_owner, - args.workflow_id, - args.max_days_without_success, + repo=args.repo, + target_branch=args.branch, + workflow_id=args.workflow_id, + max_days_without_success=args.max_days_without_success, + num_attempts=5, + request_timeout_seconds=10, + retry_backoff_seconds=0.5, ), ) diff --git a/check_nightly_success/dispatch/action.yml b/check_nightly_success/dispatch/action.yml index 5a4223ac..4656b345 100644 --- a/check_nightly_success/dispatch/action.yml +++ b/check_nightly_success/dispatch/action.yml @@ -2,13 +2,15 @@ name: dispatch-check-nightly-success description: Clone shared-actions and dispatch to the check-nightly-success action. inputs: repo: - description: "The repository to check" + description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - repo_owner: - description: "The org that owns the repo (default: rapidsai)" + target_branch: + description: | + Branch the pull request this is running on targets. + Only statuses of nightly runs on that branch will be considered. required: false - default: "rapidsai" + default: "main" type: string workflow_id: description: "The workflow whose runs to check" @@ -34,6 +36,6 @@ runs: uses: ./shared-actions/check_nightly_success/check-nightly-success with: repo: ${{ inputs.repo }} - repo_owner: ${{ inputs.repo_owner }} + target_branch: ${{ inputs.target_branch }} workflow_id: ${{ inputs.workflow_id }} max_days_without_success: ${{ inputs.max_days_without_success }} diff --git a/pyproject.toml b/pyproject.toml index 5304ac2f..23c81e9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+# Copyright (c) 2024-2026, NVIDIA CORPORATION. [tool.ruff] line-length = 120 @@ -6,6 +6,10 @@ target-version = "py310" [tool.ruff.lint] select = [ + # flake8-bugbear + "B", + # flake8-datetimez + "DTZ", # pycodestyle "E", # pyflakes @@ -14,10 +18,18 @@ select = [ "I", # numpy "NPY", + # perflint + "PERF", + # flake8-pie + "PIE", + # flake8-return + "RET", + # ruff-exclusive checks + "RUF", # pyupgrade "UP", - # flake8-bugbear - "B" + # flake8-bandit + "S", ] ignore = [ # Incompatible with D211 diff --git a/telemetry-impls/summarize/bump_time.py b/telemetry-impls/summarize/bump_time.py index 906e686f..2d58c371 100644 --- a/telemetry-impls/summarize/bump_time.py +++ b/telemetry-impls/summarize/bump_time.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# Copyright (c) 2024-2026, NVIDIA CORPORATION. # This script is meant to act on an 'all_jobs.json' file that comes from # the summarize job when debug info is enabled. Bumping the time makes @@ -14,12 +14,12 @@ def _parse_time(x: str) -> int: - return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9) + return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9) # noqa: DTZ007 start_time = _parse_time(jobs[0]["created_at"]) needed_time = _parse_time(jobs[-3]["completed_at"]) - _parse_time(jobs[0]["created_at"]) -new_start_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) +new_start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=60) for idx, job in enumerate(jobs): if job["created_at"]: diff --git a/telemetry-impls/summarize/send_trace.py b/telemetry-impls/summarize/send_trace.py index df2fcc61..4d597e7c 100644 --- a/telemetry-impls/summarize/send_trace.py +++ b/telemetry-impls/summarize/send_trace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. +# Copyright (c) 2019-2026, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -247,7 +247,7 @@ def get_sccache_stats(artifact_folder: Path) -> dict[str, str]: return parsed_stats -def process_job_blob( # noqa: PLR0913 +def process_job_blob( trace_id: int, job: Mapping[str, Any], env_vars: Mapping[str, str], From c0a20e29c878339a3bdac7f2ab2b0f1077088df1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Feb 2026 13:59:00 -0600 Subject: [PATCH 02/14] make page size configurable --- check_nightly_success/README.md | 12 ++++++++++++ .../check-nightly-success/check.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index 56b1ff6c..ba992237 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -65,3 +65,15 @@ That'll return exit code `1` and output similar to this: 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 + +Set `--request-page-size` to `1` to test that pagination is working. 
+ +```shell +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/cudf' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 30 \ + --request-page-size 5 +``` diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index 4d375d80..8ddb5fa6 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -91,7 +91,7 @@ def __get_next_page( ) for workflow_run in response.json()["workflow_runs"] ], - next_url=response.links.get("next", None), + next_url=response.links.get("next", dict()).get("url", None), ) def get_all_runs( @@ -106,7 +106,9 @@ def get_all_runs( and return all the results. """ data = [] + page_num = 1 while True: + print(f"requesting page {page_num} of results") page = self.__get_next_page( url=url, headers=headers, @@ -118,6 +120,7 @@ def get_all_runs( # just use the pagination URL, not the original query one url = page.next_url params = None # type: ignore[assignment] + page_num += 1 return data @@ -128,6 +131,7 @@ def main( workflow_id: str, max_days_without_success: int, num_attempts: int, + request_page_size: int, request_timeout_seconds: float, retry_backoff_seconds: float, ) -> int: @@ -163,7 +167,7 @@ def main( # only care about successful runs "status": "success", # pull as many results per page as possible - "per_page": 100, + "per_page": request_page_size, # filter to recent-enough runs "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", }, @@ -222,6 +226,13 @@ def main( required=True, help="Maximum number of days without a successful run", ) + parser.add_argument( + "--request-page-size", + type=int, + default=100, + required=False, + help="Number of responses per page of data. 
Decrease this to reduce memory usage.", + ) args = parser.parse_args() sys.exit( @@ -231,6 +242,7 @@ def main( workflow_id=args.workflow_id, max_days_without_success=args.max_days_without_success, num_attempts=5, + request_page_size=args.request_page_size, request_timeout_seconds=10, retry_backoff_seconds=0.5, ), From e6fb6cd0f3ad4c2d7f9bd08919507bc370d2235b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Feb 2026 16:02:45 -0600 Subject: [PATCH 03/14] use kebob-case everywhere --- check_nightly_success/README.md | 6 +++--- .../check-nightly-success/action.yaml | 12 ++++++------ check_nightly_success/dispatch/action.yml | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index ba992237..fd559965 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -17,9 +17,9 @@ Add it to any GitHub Actions workflow configuration like this: uses: rapidsai/shared-actions/check_nightly_success/dispatch@main with: repo: ${{ github.repository }} - target_branch: ${{ github.base_ref }} - workflow_id: 'test.yaml' - max_days_without_success: 7 + target-branch: ${{ github.base_ref }} + workflow-id: 'test.yaml' + max-days-without-success: 7 ``` ## Testing diff --git a/check_nightly_success/check-nightly-success/action.yaml b/check_nightly_success/check-nightly-success/action.yaml index 803ac5dd..4c149b7a 100644 --- a/check_nightly_success/check-nightly-success/action.yaml +++ b/check_nightly_success/check-nightly-success/action.yaml @@ -8,17 +8,17 @@ inputs: description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - target_branch: + target-branch: description: | Branch the pull request this is running on targets. Only statuses of nightly runs on that branch will be considered. 
required: true type: string - workflow_id: + workflow-id: description: "The workflow whose runs to check" required: true type: string - max_days_without_success: + max-days-without-success: description: "The number of consecutive days that may go by without a successful CI run" required: true type: integer @@ -30,9 +30,9 @@ runs: shell: bash env: REPO: ${{ inputs.repo }} - TARGET_BRANCH: ${{ inputs.target_branch }} - WORKFLOW_ID: ${{ inputs.workflow_id }} - MAX_DAYS_WITHOUT_SUCCESS: ${{ inputs.max_days_without_success }} + TARGET_BRANCH: ${{ inputs.target-branch }} + WORKFLOW_ID: ${{ inputs.workflow-id }} + MAX_DAYS_WITHOUT_SUCCESS: ${{ inputs.max-days-without-success }} run: | python -m pip install \ --prefer-binary \ diff --git a/check_nightly_success/dispatch/action.yml b/check_nightly_success/dispatch/action.yml index 4656b345..872dc96f 100644 --- a/check_nightly_success/dispatch/action.yml +++ b/check_nightly_success/dispatch/action.yml @@ -12,12 +12,12 @@ inputs: required: false default: "main" type: string - workflow_id: + workflow-id: description: "The workflow whose runs to check" required: false default: "test.yaml" type: string - max_days_without_success: + max-days-without-success: description: "The number of consecutive days that may go by without a successful CI run" required: false default: 7 @@ -36,6 +36,6 @@ runs: uses: ./shared-actions/check_nightly_success/check-nightly-success with: repo: ${{ inputs.repo }} - target_branch: ${{ inputs.target_branch }} - workflow_id: ${{ inputs.workflow_id }} - max_days_without_success: ${{ inputs.max_days_without_success }} + target_branch: ${{ inputs.target-branch }} + workflow_id: ${{ inputs.workflow-id }} + max_days_without_success: ${{ inputs.max-days-without-success }} From d2ea94ed872532781633b6f380167e30abe0dda0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Feb 2026 16:12:40 -0600 Subject: [PATCH 04/14] more kebob-case --- check_nightly_success/dispatch/action.yml | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/check_nightly_success/dispatch/action.yml b/check_nightly_success/dispatch/action.yml index 872dc96f..eceea33a 100644 --- a/check_nightly_success/dispatch/action.yml +++ b/check_nightly_success/dispatch/action.yml @@ -5,7 +5,7 @@ inputs: description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - target_branch: + target-branch: description: | Branch the pull request this is running on targets. Only statuses of nightly runs on that branch will be considered. @@ -36,6 +36,6 @@ runs: uses: ./shared-actions/check_nightly_success/check-nightly-success with: repo: ${{ inputs.repo }} - target_branch: ${{ inputs.target-branch }} - workflow_id: ${{ inputs.workflow-id }} - max_days_without_success: ${{ inputs.max-days-without-success }} + target-branch: ${{ inputs.target-branch }} + workflow-id: ${{ inputs.workflow-id }} + max-days-without-success: ${{ inputs.max-days-without-success }} From 8d049331f75409afab70c8e4f671de1618c8c5d4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 10:15:34 -0600 Subject: [PATCH 05/14] start trying to handle exemption --- check_nightly_success/README.md | 3 + .../check-nightly-success/check.py | 81 ++++++++++++++----- check_nightly_success/dispatch/action.yml | 3 +- 3 files changed, 66 insertions(+), 21 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index fd559965..8c0a6abf 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -13,6 +13,9 @@ Add it to any GitHub Actions workflow configuration like this: env: GH_TOKEN: ${{ github.token }} steps: + - name: Get PR Info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main - name: Check if nightly CI is passing uses: rapidsai/shared-actions/check_nightly_success/dispatch@main with: diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index 
8ddb5fa6..194da05a 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -158,7 +158,7 @@ def main( request_timeout_seconds=request_timeout_seconds, retry_backoff_seconds=retry_backoff_seconds, ) - all_runs = client.get_all_runs( + successful_runs = client.get_all_runs( url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", headers={"Authorization": f"token {GITHUB_TOKEN}"}, params={ @@ -173,32 +173,75 @@ def main( }, ) + if successful_runs: + most_recent_successful_run = max(successful_runs, key=lambda r: r.run_started_at) + print( + f"Found {len(successful_runs)} successful runs of workflow '{workflow_id}' on branch '{target_branch}' " + f"in the previous {max_days_without_success} days (most recent: '{most_recent_successful_run.run_started_at}'). " + f"View logs:\n - {most_recent_successful_run.html_url}" + ) + return ExitCode.SUCCESS + + # It's ok for there to be 0 successful runs if the branch is fairly new or the workflow hasn't been running on it + # very long. + # + # When new release branches are cut, we want to give a couple of days of space for teams to get their nightly tests working + all_runs = client.get_all_runs( + url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + params={ + # only care about runs from one branch (usually, the PR target branch) + "branch": target_branch, + # pull as many results per page as possible + "per_page": request_page_size, + # filter to recent-enough runs + "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", + }, + ) + + # + if not all_runs: + print( + f"There were 0 runs (successful or unsuccessful) of workflow '{workflow_id}' on branch " + f"'{target_branch}' in the last {max_days_without_success} days." 
+ ) + return ExitCode.FAILURE + + if len(all_runs) < max_days_without_success: + print( + "There have only been" + + f"The oldest run of the {workflow_id} workflow on {latest_branch} was less " + f"than {max_days_without_success} days ago. This exempts the workflow from " + "check-nightly-success because the workflow has not been running for very long." + else: + + # if there were 0 successful runs, immediately exit with 1... by definition that means there # hasn't been a success in the last `max_days_without_success` days - if not all_runs: + if not successful_runs: + + successful_runs = client.get_all_runs( + url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + params={ + # only care about runs from one branch (usually, the PR target branch) + "branch": target_branch, + # only care about successful runs + "status": "success", + # pull as many results per page as possible + "per_page": request_page_size, + # filter to recent-enough runs + "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", + }, + ) + print( f"There were 0 successful runs of workflow '{workflow_id}' on branch '{target_branch}' in the last " f"{max_days_without_success} days." ) return ExitCode.FAILURE - # If we get here, then there must have been at least 1 successful run in the allowed time. We need to find its - # precise time for a log message, but otherwise the code an exit with success. - print( - f"Found {len(all_runs)} successful runs of workflow '{workflow_id}' on branch '{target_branch}' " - f"in the previous {max_days_without_success} days." - ) - - # sort runs by "run_started_at" - most_recent_successful_run = max(all_runs, key=lambda r: r.run_started_at) - print( - f"The most recent successful run of workflow '{workflow_id}' on branch '{target_branch}' " - f"was '{most_recent_successful_run.run_started_at}', which is within the " - f"last {max_days_without_success} days. 
" - f"View logs:\n - {most_recent_successful_run.html_url}" - ) - return ExitCode.SUCCESS - if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/check_nightly_success/dispatch/action.yml b/check_nightly_success/dispatch/action.yml index eceea33a..02409d59 100644 --- a/check_nightly_success/dispatch/action.yml +++ b/check_nightly_success/dispatch/action.yml @@ -9,8 +9,7 @@ inputs: description: | Branch the pull request this is running on targets. Only statuses of nightly runs on that branch will be considered. - required: false - default: "main" + required: true type: string workflow-id: description: "The workflow whose runs to check" From 5692b2e79e75fed218b4b4dc9506a6e8049493d2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Feb 2026 16:15:46 +0000 Subject: [PATCH 06/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- check_nightly_success/check-nightly-success/check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index 194da05a..00bb0ff2 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -199,7 +199,7 @@ def main( }, ) - # + # if not all_runs: print( f"There were 0 runs (successful or unsuccessful) of workflow '{workflow_id}' on branch " @@ -215,12 +215,12 @@ def main( f"than {max_days_without_success} days ago. This exempts the workflow from " "check-nightly-success because the workflow has not been running for very long." else: - + # if there were 0 successful runs, immediately exit with 1... 
by definition that means there # hasn't been a success in the last `max_days_without_success` days if not successful_runs: - + successful_runs = client.get_all_runs( url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", headers={"Authorization": f"token {GITHUB_TOKEN}"}, From c13b9c4205991fb1865439f19aa2bf03d40ea11d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 11:10:29 -0600 Subject: [PATCH 07/14] handle exempting new workflows --- .../check-nightly-success/check.py | 61 ++++++++----------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index 00bb0ff2..f972e877 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -173,11 +173,12 @@ def main( }, ) + # recent-enough, successful run = exit 0 if successful_runs: most_recent_successful_run = max(successful_runs, key=lambda r: r.run_started_at) print( f"Found {len(successful_runs)} successful runs of workflow '{workflow_id}' on branch '{target_branch}' " - f"in the previous {max_days_without_success} days (most recent: '{most_recent_successful_run.run_started_at}'). " + f"in the previous {max_days_without_success} days (most recent: '{most_recent_successful_run.run_started_at}'). " # noqa: E501 f"View logs:\n - {most_recent_successful_run.html_url}" ) return ExitCode.SUCCESS @@ -185,7 +186,11 @@ def main( # It's ok for there to be 0 successful runs if the branch is fairly new or the workflow hasn't been running on it # very long. 
# - # When new release branches are cut, we want to give a couple of days of space for teams to get their nightly tests working + # Code below looks for runs in the last `max_days_without_success * 2` days, to get an + # approximation of the entire history without having an unbounded "list all runs from all time" type of query + # (which could get expensive for very-active branches). + lookback_days = max_days_without_success * 2 + oldest_date_to_pull = datetime.now(timezone.utc) - timedelta(days=lookback_days) all_runs = client.get_all_runs( url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", headers={"Authorization": f"token {GITHUB_TOKEN}"}, @@ -199,48 +204,34 @@ def main( }, ) - # + # Fail if there have not been any runs at all (to avoid silently skipping this check). if not all_runs: print( f"There were 0 runs (successful or unsuccessful) of workflow '{workflow_id}' on branch " - f"'{target_branch}' in the last {max_days_without_success} days." + f"'{target_branch}' in the last {lookback_days} days. " + "To resolve this, run the workflow at least once or increase 'max-days-without-success'." ) return ExitCode.FAILURE - if len(all_runs) < max_days_without_success: + # If the oldest run on the branch was less than {max_days_without_success} ago, warn but allow the check to pass. + oldest_run = min(all_runs, key=lambda r: r.run_started_at) + days_since_oldest_run = (datetime.now(tz=timezone.utc) - oldest_run.run_started_at).days + if days_since_oldest_run < max_days_without_success: print( - "There have only been" - - f"The oldest run of the {workflow_id} workflow on {latest_branch} was less " - f"than {max_days_without_success} days ago. This exempts the workflow from " - "check-nightly-success because the workflow has not been running for very long." - else: - - - # if there were 0 successful runs, immediately exit with 1... 
by definition that means there - # hasn't been a success in the last `max_days_without_success` days - if not successful_runs: - - successful_runs = client.get_all_runs( - url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", - headers={"Authorization": f"token {GITHUB_TOKEN}"}, - params={ - # only care about runs from one branch (usually, the PR target branch) - "branch": target_branch, - # only care about successful runs - "status": "success", - # pull as many results per page as possible - "per_page": request_page_size, - # filter to recent-enough runs - "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", - }, + f"The oldest run of workflow '{workflow_id}' on branch '{target_branch}' was " + f"{days_since_oldest_run} days ago ({oldest_run.run_started_at}). Because that is less than " + f"'max-days-without-success = {max_days_without_success}' days, this workflow is exempted from " + "check-nightly-success. The check will start failing if there is not a successful run in " + "the next few days." ) + return ExitCode.SUCCESS - print( - f"There were 0 successful runs of workflow '{workflow_id}' on branch '{target_branch}' in the last " - f"{max_days_without_success} days." - ) - return ExitCode.FAILURE + # There isn't a recent-enough success and the branch isn't exempted... fail. + print( + f"There were 0 successful runs of workflow '{workflow_id}' on branch '{target_branch}' in the last " + f"{max_days_without_success} days." 
+ ) + return ExitCode.FAILURE if __name__ == "__main__": From 82b50ba1636a69c124c5c3392f12c2a0db6677b7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 11:15:22 -0600 Subject: [PATCH 08/14] update docs --- check_nightly_success/README.md | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index 8c0a6abf..ba3ee3a8 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -20,7 +20,7 @@ Add it to any GitHub Actions workflow configuration like this: uses: rapidsai/shared-actions/check_nightly_success/dispatch@main with: repo: ${{ github.repository }} - target-branch: ${{ github.base_ref }} + target-branch: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }} workflow-id: 'test.yaml' max-days-without-success: 7 ``` @@ -45,16 +45,15 @@ python ./check-nightly-success/check.py \ If this succeeds, you'll see a `0` exit code and output text similar to the following: -> Found 4 successful runs of workflow 'test.yaml' on branch 'main' in the previous 7 days. -The most recent successful run of workflow 'test.yaml' on branch 'main' was '2026-02-13 13:40:18+00:00', which is within the last 7 days. View logs: - - https://github.com/rapidsai/cudf/actions/runs/21978265026 +> Found 4 successful runs of workflow 'test.yaml' on branch 'main' in the previous 7 days (most recent: '2026-02-16 06:26:04+00:00'). View logs: + - https://github.com/rapidsai/cudf/actions/runs/22052428055 - To see it fail, try on a repo that doesn't have that workflow. +To see it fail, try on a repo that doesn't have that workflow. 
```shell GH_TOKEN=$(gh auth token) \ python ./check-nightly-success/check.py \ - --repo 'rapidsai/build-planniing' \ + --repo 'rapidsai/build-planning' \ --branch 'main' \ --workflow-id 'test.yaml' \ --max-days-without-success 7 @@ -62,12 +61,12 @@ python ./check-nightly-success/check.py \ That'll return exit code `1` and output similar to this: -> RuntimeError: Failed to fetch https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs after 5 attempts with the following errors: - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planniing/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-05 +> RuntimeError: Failed to fetch https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs after 5 attempts with the following errors: + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 + 404 Client Error: Not Found for url: 
https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 + 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 Set `--request-page-size` to `1` to test that pagination is working. From 552b206dd92779ef466a522b666988604fb3cd73 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 11:33:31 -0600 Subject: [PATCH 09/14] more logging --- check_nightly_success/check-nightly-success/check.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index f972e877..e36b4036 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -216,13 +216,15 @@ def main( # If the oldest run on the branch was less than {max_days_without_success} ago, warn but allow the check to pass. oldest_run = min(all_runs, key=lambda r: r.run_started_at) days_since_oldest_run = (datetime.now(tz=timezone.utc) - oldest_run.run_started_at).days + print( + f"The oldest run of workflow '{workflow_id}' on branch '{target_branch}' was " + f"{days_since_oldest_run} days ago ({oldest_run.run_started_at})." 
+    )
     if days_since_oldest_run < max_days_without_success:
         print(
-            f"The oldest run of workflow '{workflow_id}' on branch '{target_branch}' was "
-            f"{days_since_oldest_run} days ago ({oldest_run.run_started_at}). Because that is less than "
-            f"'max-days-without-success = {max_days_without_success}' days, this workflow is exempted from "
-            "check-nightly-success. The check will start failing if there is not a successful run in "
-            "the next few days."
+            f"Because the oldest run was less than 'max-days-without-success = {max_days_without_success}' days ago, "
+            "this workflow is exempted from check-nightly-success. The check will start failing if there is not a "
+            "successful run in the next few days."
         )
         return ExitCode.SUCCESS
 
From 0167e97b8f25e36b6b9da76c6d3957f0949a2a12 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 17 Feb 2026 11:56:56 -0600
Subject: [PATCH 10/14] expand testing docs

---
 .gitignore                      |  1 +
 check_nightly_success/README.md | 46 ++++++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9c92e4ec..cfb60695 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.swp
+ucxx/
 .venv
diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md
index ba3ee3a8..70bea189 100644
--- a/check_nightly_success/README.md
+++ b/check_nightly_success/README.md
@@ -28,6 +28,9 @@ Add it to any GitHub Actions workflow configuration like this:
 ## Testing
 
 The code for the actions is implemented in Python.
+
+### Case 1: Succeed on recent nightly test successes
+
 Try the following locally to test it.
 
 ```shell
@@ -48,7 +51,9 @@ If this succeeds, you'll see a `0` exit code and output text similar to the foll
 > Found 4 successful runs of workflow 'test.yaml' on branch 'main' in the previous 7 days (most recent: '2026-02-16 06:26:04+00:00'). View logs:
  - https://github.com/rapidsai/cudf/actions/runs/22052428055
 
-To see it fail, try on a repo that doesn't have that workflow.
+### Case 2: Fail when branch has 0 runs (of any status) + +The check should fail on a repo without any runs of this workflow: ```shell GH_TOKEN=$(gh auth token) \ @@ -68,6 +73,45 @@ That'll return exit code `1` and output similar to this: 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 +### Case 3: Success on new branches with only very-recent runs + +Branches with only very-recent runs should be exempted from the check. + +```shell +# NOTE: this example requires write access to 'rapidsai/ucxx' +git clone -o upstream https://github.com/rapidsai/ucxx +pushd./ucxx +git checkout -b delete-me +git push upstream delete-me +popd + +gh workflow run \ + --repo rapidsai/ucxx \ + --ref delete-me \ + test.yaml \ + -f branch="delete-me" \ + -f date="$(date +%Y-%m-%d)" \ + -f sha="$(git rev-parse HEAD)" \ + -f build_type=nightly + +# (MANUAL - go to https://github.com/rapidsai/ucxx/actions/runs/22109183034 and manuall cancel that run) + +# run the check +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/ucxx' \ + --branch 'delete-me' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +That'll exit with code `0` and print something like this: + +> The oldest run of workflow 'test.yaml' on branch 'delete-me' was 0 days ago (2026-02-17 17:42:05+00:00). +Because the latest run was less than 'max-days-without-success = 7' days ago, this workflow is exempted from check-nightly-success. The check will start failing if there is not a successful run in the next few days. + +### Other testing: pagination + Set `--request-page-size` to `1` to test that pagination is working. 
```shell From dd8e1c48370615caaa66f86b1a900bcd2ef78213 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 13:50:17 -0600 Subject: [PATCH 11/14] Apply suggestions from code review Co-authored-by: Bradley Dice --- check_nightly_success/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index 70bea189..2c9820db 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -80,7 +80,7 @@ Branches with only very-recent runs should be exempted from the check. ```shell # NOTE: this example requires write access to 'rapidsai/ucxx' git clone -o upstream https://github.com/rapidsai/ucxx -pushd./ucxx +pushd ./ucxx git checkout -b delete-me git push upstream delete-me popd @@ -94,7 +94,7 @@ gh workflow run \ -f sha="$(git rev-parse HEAD)" \ -f build_type=nightly -# (MANUAL - go to https://github.com/rapidsai/ucxx/actions/runs/22109183034 and manuall cancel that run) +# (MANUAL - go to https://github.com/rapidsai/ucxx/actions/runs/22109183034 and manually cancel that run) # run the check GH_TOKEN=$(gh auth token) \ From a018fa67daf01aefe0e3677e98bb1be6bf5aaac6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 14:49:11 -0600 Subject: [PATCH 12/14] switch to urllib3.util.Retry --- .../check-nightly-success/check.py | 48 ++++++++----------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index e36b4036..8558b341 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -5,11 +5,12 @@ import argparse import os import sys -import time from dataclasses import dataclass from datetime import datetime, timedelta, timezone import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry # Constants GITHUB_TOKEN = 
os.environ["GH_TOKEN"] @@ -45,44 +46,33 @@ def __init__( retry_backoff_seconds: float, request_timeout_seconds: float, ) -> None: - self.max_retries = max_retries self.request_timeout_seconds = request_timeout_seconds - self.retry_backoff_seconds = retry_backoff_seconds + retry = Retry( + total=max_retries - 1, # 1 initial attempt + (total) retries = max_retries attempts + backoff_factor=retry_backoff_seconds, + status_forcelist=(429, 500, 502, 503, 504), + ) + adapter = HTTPAdapter(max_retries=retry) + self._session = requests.Session() + self._session.mount("https://", adapter) + self._session.mount("http://", adapter) def __get_next_page( self, *, url: str, headers: dict[str, str], - params: dict[str, int | str], + params: dict[str, int | str] | None, ) -> _ResponseData: """Get one page of results""" - exceptions = [] - for _ in range(self.max_retries): - try: - response = requests.get( - url, - headers=headers, - params=params, - timeout=self.request_timeout_seconds, - ) - response.raise_for_status() - break - except requests.RequestException as e: - exceptions.append(str(e)) - # simple backoff, without jitter, exponential backoff, etc., should be fine for this - time.sleep(self.retry_backoff_seconds) - else: - # this needs to be done outside the f-string to avoid: - # "Cannot use an escape sequence (backslash) in f-strings on Python 3.10 (syntax was added in Python 3.12)" - exception_text = "\n\t".join(exceptions) - msg = ( - f"Failed to fetch {url} after {self.max_retries} attempts with the following " - f"errors: \n\t{exception_text}" - ) - raise RuntimeError(msg) + response = self._session.get( + url, + headers=headers, + params=params, + timeout=self.request_timeout_seconds, + ) + response.raise_for_status() - # if we get here, the request succeeded...return its data, in the format we want return _ResponseData( data=[ _WorkflowRun( From 22a87b1f9b2067dbde801002c1df86790510e1bc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Feb 2026 14:54:24 -0600 
Subject: [PATCH 13/14] do not trust a 403 or 404 from GitHub is not retryable --- check_nightly_success/README.md | 7 +------ check_nightly_success/check-nightly-success/check.py | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index 2c9820db..0a6a3d44 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -66,12 +66,7 @@ python ./check-nightly-success/check.py \ That'll return exit code `1` and output similar to this: -> RuntimeError: Failed to fetch https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs after 5 attempts with the following errors: - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 - 404 Client Error: Not Found for url: https://api.github.com/repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 +> requests.exceptions.RetryError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 (Caused by ResponseError('too many 404 error responses')) ### Case 3: Success on new branches 
with only very-recent runs diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index 8558b341..d41b59b6 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -50,7 +50,7 @@ def __init__( retry = Retry( total=max_retries - 1, # 1 initial attempt + (total) retries = max_retries attempts backoff_factor=retry_backoff_seconds, - status_forcelist=(429, 500, 502, 503, 504), + status_forcelist=(403, 404, 429, 500, 502, 503, 504), ) adapter = HTTPAdapter(max_retries=retry) self._session = requests.Session() From e06b149c430bc840b0bf7c020a05e94cb700a2df Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 19 Feb 2026 09:39:35 -0600 Subject: [PATCH 14/14] update name, clone UCXX to a temporary directory --- .gitignore | 1 - check_nightly_success/README.md | 5 +++-- check_nightly_success/check-nightly-success/check.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index cfb60695..9c92e4ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ *.swp -ucxx/ .venv diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md index 0a6a3d44..d81b61bd 100644 --- a/check_nightly_success/README.md +++ b/check_nightly_success/README.md @@ -74,8 +74,9 @@ Branches with only very-recent runs should be exempted from the check. 
```shell # NOTE: this example requires write access to 'rapidsai/ucxx' -git clone -o upstream https://github.com/rapidsai/ucxx -pushd ./ucxx +TMP_UCXX=$(mktemp -d) +git clone -o upstream https://github.com/rapidsai/ucxx "${TMP_UCXX}" +pushd "${TMP_UCXX}" git checkout -b delete-me git push upstream delete-me popd diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index d41b59b6..62351c5b 100644 --- a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -57,7 +57,7 @@ def __init__( self._session.mount("https://", adapter) self._session.mount("http://", adapter) - def __get_next_page( + def _get_next_page( self, *, url: str, @@ -99,7 +99,7 @@ def get_all_runs( page_num = 1 while True: print(f"requesting page {page_num} of results") - page = self.__get_next_page( + page = self._get_next_page( url=url, headers=headers, params=params,