diff --git a/.gitignore b/.gitignore index 1377554e..9c92e4ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.swp +.venv diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66c32140..23d91c8a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,12 +18,24 @@ repos: hooks: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.10 + rev: v0.15.1 hooks: - - id: ruff - args: ["--fix"] + - id: ruff-check + args: ["--fix", "--config=pyproject.toml"] + - id: ruff-format + args: ["--config=pyproject.toml"] - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v1.2.1 + rev: v1.4.3 hooks: - id: verify-copyright + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.13.0' + hooks: + - id: mypy + additional_dependencies: + - "requests>=2.32.4" + - "types-requests>=2.32.4" + args: + - "--config-file=pyproject.toml" + - "check_nightly_success/" + pass_filenames: false diff --git a/check_nightly_success/README.md b/check_nightly_success/README.md new file mode 100644 index 00000000..d81b61bd --- /dev/null +++ b/check_nightly_success/README.md @@ -0,0 +1,121 @@ +# check_nightly_success + +Action that can be used to fail CI if a given GitHub Actions workflow hasn't had at least 1 recent successful run. + +Add it to any GitHub Actions workflow configuration like this: + +```yaml + check-nightly-ci: + runs-on: ubuntu-latest + permissions: + actions: read + id-token: write + env: + GH_TOKEN: ${{ github.token }} + steps: + - name: Get PR Info + id: get-pr-info + uses: nv-gha-runners/get-pr-info@main + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: ${{ github.repository }} + target-branch: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }} + workflow-id: 'test.yaml' + max-days-without-success: 7 +``` + +## Testing + +The code for the actions is implemented in Python. 
+ +### Case 1: Succeed on recent nightly test successes + +Try the following locally to test it. + +```shell +python -m venv .venv/ +source .venv/bin/activate +python -m pip install requests + +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/cudf' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +If this succeeds, you'll see a `0` exit code and output text similar to the following: + +> Found 4 successful runs of workflow 'test.yaml' on branch 'main' in the previous 7 days (most recent: '2026-02-16 06:26:04+00:00'). View logs: + - https://github.com/rapidsai/cudf/actions/runs/22052428055 + +### Case 2: Fail when branch has 0 runs (of any status) + +The check should fail on a repo without any runs of this workflow: + +```shell +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/build-planning' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +That'll return exit code `1` and output similar to this: + +> requests.exceptions.RetryError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/rapidsai/build-planning/actions/workflows/test.yaml/runs?branch=main&status=success&per_page=100&created=%3E%3D2026-02-10 (Caused by ResponseError('too many 404 error responses')) + +### Case 3: Success on new branches with only very-recent runs + +Branches with only very-recent runs should be exempted from the check. 
+ +```shell +# NOTE: this example requires write access to 'rapidsai/ucxx' +TMP_UCXX=$(mktemp -d) +git clone -o upstream https://github.com/rapidsai/ucxx "${TMP_UCXX}" +pushd "${TMP_UCXX}" +git checkout -b delete-me +git push upstream delete-me +popd + +gh workflow run \ + --repo rapidsai/ucxx \ + --ref delete-me \ + test.yaml \ + -f branch="delete-me" \ + -f date="$(date +%Y-%m-%d)" \ + -f sha="$(git rev-parse HEAD)" \ + -f build_type=nightly + +# (MANUAL - go to https://github.com/rapidsai/ucxx/actions/runs/22109183034 and manually cancel that run) + +# run the check +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/ucxx' \ + --branch 'delete-me' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 7 +``` + +That'll exit with code `0` and print something like this: + +> The oldest run of workflow 'test.yaml' on branch 'delete-me' was 0 days ago (2026-02-17 17:42:05+00:00). +Because the latest run was less than 'max-days-without-success = 7' days ago, this workflow is exempted from check-nightly-success. The check will start failing if there is not a successful run in the next few days. + +### Other testing: pagination + +Set `--request-page-size` to `1` to test that pagination is working. + +```shell +GH_TOKEN=$(gh auth token) \ +python ./check-nightly-success/check.py \ + --repo 'rapidsai/cudf' \ + --branch 'main' \ + --workflow-id 'test.yaml' \ + --max-days-without-success 30 \ + --request-page-size 5 +``` diff --git a/check_nightly_success/check-nightly-success/action.yaml b/check_nightly_success/check-nightly-success/action.yaml index 2ad52971..4c149b7a 100644 --- a/check_nightly_success/check-nightly-success/action.yaml +++ b/check_nightly_success/check-nightly-success/action.yaml @@ -1,24 +1,26 @@ name: check-nightly-success description: Check if the nightlies have succeeded recently. + +# these inputs should all be 'required: true' without defaults... 
this action should only +# ever be invoked by check_nightly_success/dispatch inputs: repo: - description: "The repository to check" + description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - repo_owner: - description: "The org that owns the repo (default: rapidsai)" - required: false - default: "rapidsai" + target-branch: + description: | + Branch the pull request this is running on targets. + Only statuses of nightly runs on that branch will be considered. + required: true type: string - workflow_id: + workflow-id: description: "The workflow whose runs to check" - required: false - default: "test.yaml" + required: true type: string - max_days_without_success: + max-days-without-success: description: "The number of consecutive days that may go by without a successful CI run" - required: false - default: 7 + required: true type: integer runs: @@ -28,9 +30,15 @@ runs: shell: bash env: REPO: ${{ inputs.repo }} - REPO_OWNER: ${{ inputs.repo_owner }} - WORKFLOW_ID: ${{ inputs.workflow_id }} - MAX_DAYS_WITHOUT_SUCCESS: ${{ inputs.max_days_without_success }} + TARGET_BRANCH: ${{ inputs.target-branch }} + WORKFLOW_ID: ${{ inputs.workflow-id }} + MAX_DAYS_WITHOUT_SUCCESS: ${{ inputs.max-days-without-success }} run: | - python -m pip install requests - python shared-actions/check_nightly_success/check-nightly-success/check.py ${REPO} --repo-owner ${REPO_OWNER} --workflow-id ${WORKFLOW_ID} --max-days-without-success ${MAX_DAYS_WITHOUT_SUCCESS} + python -m pip install \ + --prefer-binary \ + 'requests>=2.32.4' + python shared-actions/check_nightly_success/check-nightly-success/check.py \ + --repo ${REPO} \ + --branch ${TARGET_BRANCH} \ + --workflow-id ${WORKFLOW_ID} \ + --max-days-without-success ${MAX_DAYS_WITHOUT_SUCCESS} diff --git a/check_nightly_success/check-nightly-success/check.py b/check_nightly_success/check-nightly-success/check.py index e36c2298..62351c5b 100644 --- 
a/check_nightly_success/check-nightly-success/check.py +++ b/check_nightly_success/check-nightly-success/check.py @@ -1,148 +1,275 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# Copyright (c) 2024-2026, NVIDIA CORPORATION. """Check whether a GHA workflow has run successfully in the last N days.""" -# ruff: noqa: INP001 import argparse import os -import re import sys -from collections import defaultdict -from datetime import datetime +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry # Constants -GITHUB_TOKEN = os.environ["RAPIDS_GH_TOKEN"] -GOOD_STATUSES = {"success"} +GITHUB_TOKEN = os.environ["GH_TOKEN"] + + +@dataclass +class _WorkflowRun: + """GitHub workflow run data, filtered to only the fields this action cares about.""" + + html_url: str + run_started_at: datetime + + +@dataclass +class _ResponseData: + data: list[_WorkflowRun] + next_url: str | None + + +# We are producing Unix return codes so success/failure is inverted from the +# expected Python boolean values. 
+@dataclass +class ExitCode: + FAILURE = 1 + SUCCESS = 0 + + +class GitHubClient: + def __init__( + self, + *, + max_retries: int, + retry_backoff_seconds: float, + request_timeout_seconds: float, + ) -> None: + self.request_timeout_seconds = request_timeout_seconds + retry = Retry( + total=max_retries - 1, # 1 initial attempt + (total) retries = max_retries attempts + backoff_factor=retry_backoff_seconds, + status_forcelist=(403, 404, 429, 500, 502, 503, 504), + ) + adapter = HTTPAdapter(max_retries=retry) + self._session = requests.Session() + self._session.mount("https://", adapter) + self._session.mount("http://", adapter) + + def _get_next_page( + self, + *, + url: str, + headers: dict[str, str], + params: dict[str, int | str] | None, + ) -> _ResponseData: + """Get one page of results""" + response = self._session.get( + url, + headers=headers, + params=params, + timeout=self.request_timeout_seconds, + ) + response.raise_for_status() + + return _ResponseData( + data=[ + _WorkflowRun( + html_url=workflow_run["html_url"], + run_started_at=datetime.fromisoformat(workflow_run["run_started_at"]), + ) + for workflow_run in response.json()["workflow_runs"] + ], + next_url=response.links.get("next", dict()).get("url", None), + ) + + def get_all_runs( + self, + *, + url: str, + headers: dict[str, str], + params: dict[str, int | str], + ) -> list[_WorkflowRun]: + """ + Paginate over requests to api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs + and return all the results. 
+ """ + data = [] + page_num = 1 + while True: + print(f"requesting page {page_num} of results") + page = self._get_next_page( + url=url, + headers=headers, + params=params, + ) + data.extend(page.data) + if page.next_url is None: + break + # just use the pagination URL, not the original query one + url = page.next_url + params = None # type: ignore[assignment] + page_num += 1 + return data def main( + *, repo: str, - repo_owner: str, + target_branch: str, workflow_id: str, max_days_without_success: int, - num_attempts: int = 5, -) -> bool: + num_attempts: int, + request_page_size: int, + request_timeout_seconds: float, + retry_backoff_seconds: float, +) -> int: """Check whether a GHA workflow has run successfully in the last N days. Returns True if the workflow has not run successfully in the last N days, False otherwise (values are inverted for use as a return code). """ - headers = {"Authorization": f"token {GITHUB_TOKEN}"} - url = f"https://api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs" - exceptions = [] - for _ in range(num_attempts): - try: - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - break - except requests.RequestException as e: - exceptions.append(e) - else: - sep = "\n\t" - msg = ( - f"Failed to fetch {url} after {num_attempts} attempts with the following " - f"errors: {sep}{'{sep}'.join(exceptions)}" + # Timezones in GitHub API responses are guaranteed to be in UTC time. + # + # ref: https://docs.github.com/en/rest/using-the-rest-api/timezones-and-the-rest-api?apiVersion=2022-11-28 + # + # This code is a little imprecise (doing the math in 'days' means that moving from 11:59p to 12:01a buys you + # another 23 hours and 58 minutes of time), but that difference shouldn't be important for this action. 
+ # + # Dealing with day-precision date-times makes filtering in the GitHub API simpler, see + # https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates + # + oldest_date_to_pull = datetime.now(timezone.utc) - timedelta(days=max_days_without_success) + + # get all the matching runs + client = GitHubClient( + max_retries=num_attempts, + request_timeout_seconds=request_timeout_seconds, + retry_backoff_seconds=retry_backoff_seconds, + ) + successful_runs = client.get_all_runs( + url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + params={ + # only care about runs from one branch (usually, the PR target branch) + "branch": target_branch, + # only care about successful runs + "status": "success", + # pull as many results per page as possible + "per_page": request_page_size, + # filter to recent-enough runs + "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", + }, + ) + + # recent-enough, successful run = exit 0 + if successful_runs: + most_recent_successful_run = max(successful_runs, key=lambda r: r.run_started_at) + print( + f"Found {len(successful_runs)} successful runs of workflow '{workflow_id}' on branch '{target_branch}' " + f"in the previous {max_days_without_success} days (most recent: '{most_recent_successful_run.run_started_at}'). " # noqa: E501 + f"View logs:\n - {most_recent_successful_run.html_url}" ) - raise RuntimeError(msg) - - runs = response.json()["workflow_runs"] - tz = datetime.fromisoformat(runs[0]["run_started_at"]).tzinfo - now = datetime.now(tz=tz) - - latest_success = {} - workflow_active_for_max_days = {} - # Rather frustratingly, the workflow runs returned from the GitHub API can - # have alternating ordering of `head_branch` - # e.g. 
- # run[0]['head_branch'] == "release/25.02" - # run[1]['head_branch'] == "release/25.04" - # run[2]['head_branch'] == "release/25.02" + return ExitCode.SUCCESS + + # It's ok for there to be 0 successful runs if the branch is fairly new or the workflow hasn't been running on it + # very long. # - # In this situation, the behavior of `itertools.groupby` (previously used - # here) is to only group _consecutive_ runs, so the results of the - # subsequent branch match (i.e. the second group of `release/25.02` runs) - # will overwrite the results of the first one, potentially overwriting a - # previous success. The snippet below unifies the groups so it's more like a - # SQL groupby and there is no chance of overwriting. - branch_dict = defaultdict(list) - for run in runs: - branch_dict[run["head_branch"]].append(run) - - for branch, branch_runs in branch_dict.items(): - # Only consider 'main' and RAPIDS release branches, which have versions like - # '25.10' (RAPIDS) or '0.46' (ucxx). - if not re.match(r"(main|release/[0-9]{1,2}\.[0-9]{2})", branch): - continue - - latest_success[branch] = None - runs = sorted(branch_runs, key=lambda r: r["run_started_at"], reverse=True) - for run in runs: - days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days - if days_since_run > max_days_without_success: - break - if run["conclusion"] in GOOD_STATUSES: - latest_success[branch] = run - break + # Code below looks for runs in the last `max_days_without_success * 2` days, to get an + # approximation of the entire history without having an unbounded "list all runs from all time" type of query + # (which could get expensive for very-active branches). 
+ lookback_days = max_days_without_success * 2 + oldest_date_to_pull = datetime.now(timezone.utc) - timedelta(days=lookback_days) + all_runs = client.get_all_runs( + url=f"https://api.github.com/repos/{repo}/actions/workflows/{workflow_id}/runs", + headers={"Authorization": f"token {GITHUB_TOKEN}"}, + params={ + # only care about runs from one branch (usually, the PR target branch) + "branch": target_branch, + # pull as many results per page as possible + "per_page": request_page_size, + # filter to recent-enough runs + "created": f">={oldest_date_to_pull.strftime('%Y-%m-%d')}", + }, + ) - workflow_active_for_max_days[branch] = False - if len(runs) > 0: - run = runs[-1] - days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days - if days_since_run > max_days_without_success: - workflow_active_for_max_days[branch] = True - - latest_branch = max(latest_success) - has_latest_success = latest_success[latest_branch] is not None - - # We are producing Unix return codes so success/failure is inverted from the - # expected Python boolean values. - if has_latest_success: - print( # noqa: T201 - f"The most recent successful run of the {workflow_id} workflow on " - f"{latest_branch} was " - f"{datetime.fromisoformat(latest_success[latest_branch]['run_started_at'])}, " - f"which is within the last {max_days_without_success} days. View logs:" - f"\n - {latest_success[latest_branch]['html_url']}" + # Fail if there have not been any runs at all (to avoid silently skipping this check). + if not all_runs: + print( + f"There were 0 runs (successful or unsuccessful) of workflow '{workflow_id}' on branch " + f"'{target_branch}' in the last {lookback_days} days. " + "To resolve this, run the workflow at least once or increase 'max-days-without-success'." ) - return 0 - elif not workflow_active_for_max_days[latest_branch]: - print( # noqa: T201 - f"The oldest run of the {workflow_id} workflow on {latest_branch} was less " - f"than {max_days_without_success} days ago. 
This exempts the workflow from " - "check-nightly-success because the workflow has not been running for very long." + return ExitCode.FAILURE + + # If the oldest run on the branch was less than {max_days_without_success} ago, warn but allow the check to pass. + oldest_run = min(all_runs, key=lambda r: r.run_started_at) + days_since_oldest_run = (datetime.now(tz=timezone.utc) - oldest_run.run_started_at).days + print( + f"The oldest run of workflow '{workflow_id}' on branch '{target_branch}' was " + f"{days_since_oldest_run} days ago ({oldest_run.run_started_at})." + ) + if days_since_oldest_run < max_days_without_success: + print( + f"Because the latest run was less than 'max-days-without-success = {max_days_without_success}' days ago, " + "this workflow is exempted from check-nightly-success. The check will start failing if there is not a " + "successful run in the next few days." ) - return 0 + return ExitCode.SUCCESS - print( # noqa: T201 - f"{latest_branch} has no successful runs of {workflow_id} in the last {max_days_without_success} days" + # There isn't a recent-enough success and the branch isn't exempted... fail. + print( + f"There were 0 successful runs of workflow '{workflow_id}' on branch '{target_branch}' in the last " + f"{max_days_without_success} days." ) - return 1 + return ExitCode.FAILURE if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("repo", type=str, help="Repository name") parser.add_argument( - "--repo-owner", - default="rapidsai", - help="Repository organization/owner", + "--repo", + type=str, + required=True, + help="Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')", + ) + parser.add_argument( + "--branch", + type=str, + required=True, + help="Branch to check for recent workflow runs.", + ) + parser.add_argument( + "--workflow-id", + type=str, + required=True, + help="Workflow ID (e.g. 
'test.yaml')", ) - parser.add_argument("--workflow-id", default="test.yaml", help="Workflow ID") parser.add_argument( "--max-days-without-success", type=int, - default=7, + required=True, help="Maximum number of days without a successful run", ) + parser.add_argument( + "--request-page-size", + type=int, + default=100, + required=False, + help="Number of responses per page of data. Decrease this to reduce memory usage.", + ) args = parser.parse_args() sys.exit( main( - args.repo, - args.repo_owner, - args.workflow_id, - args.max_days_without_success, + repo=args.repo, + target_branch=args.branch, + workflow_id=args.workflow_id, + max_days_without_success=args.max_days_without_success, + num_attempts=5, + request_page_size=args.request_page_size, + request_timeout_seconds=10, + retry_backoff_seconds=0.5, ), ) diff --git a/check_nightly_success/dispatch/action.yml b/check_nightly_success/dispatch/action.yml index 5a4223ac..02409d59 100644 --- a/check_nightly_success/dispatch/action.yml +++ b/check_nightly_success/dispatch/action.yml @@ -2,20 +2,21 @@ name: dispatch-check-nightly-success description: Clone shared-actions and dispatch to the check-nightly-success action. inputs: repo: - description: "The repository to check" + description: "Repository name with owner (e.g. 'rapidsai/cudf' not 'cudf')" required: true type: string - repo_owner: - description: "The org that owns the repo (default: rapidsai)" - required: false - default: "rapidsai" + target-branch: + description: | + Branch the pull request this is running on targets. + Only statuses of nightly runs on that branch will be considered. 
+ required: true type: string - workflow_id: + workflow-id: description: "The workflow whose runs to check" required: false default: "test.yaml" type: string - max_days_without_success: + max-days-without-success: description: "The number of consecutive days that may go by without a successful CI run" required: false default: 7 @@ -34,6 +35,6 @@ runs: uses: ./shared-actions/check_nightly_success/check-nightly-success with: repo: ${{ inputs.repo }} - repo_owner: ${{ inputs.repo_owner }} - workflow_id: ${{ inputs.workflow_id }} - max_days_without_success: ${{ inputs.max_days_without_success }} + target-branch: ${{ inputs.target-branch }} + workflow-id: ${{ inputs.workflow-id }} + max-days-without-success: ${{ inputs.max-days-without-success }} diff --git a/pyproject.toml b/pyproject.toml index 5304ac2f..23c81e9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# Copyright (c) 2024-2026, NVIDIA CORPORATION. [tool.ruff] line-length = 120 @@ -6,6 +6,10 @@ target-version = "py310" [tool.ruff.lint] select = [ + # flake8-bugbear + "B", + # flake8-datetimez + "DTZ", # pycodestyle "E", # pyflakes @@ -14,10 +18,18 @@ select = [ "I", # numpy "NPY", + # perflint + "PERF", + # flake8-pie + "PIE", + # flake8-return + "RET", + # ruff-exclusive checks + "RUF", # pyupgrade "UP", - # flake8-bugbear - "B" + # flake8-bandit + "S", ] ignore = [ # Incompatible with D211 diff --git a/telemetry-impls/summarize/bump_time.py b/telemetry-impls/summarize/bump_time.py index 906e686f..2d58c371 100644 --- a/telemetry-impls/summarize/bump_time.py +++ b/telemetry-impls/summarize/bump_time.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# Copyright (c) 2024-2026, NVIDIA CORPORATION. # This script is meant to act on an 'all_jobs.json' file that comes from # the summarize job when debug info is enabled. 
Bumping the time makes @@ -14,12 +14,12 @@ def _parse_time(x: str) -> int: - return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9) + return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9) # noqa: DTZ007 start_time = _parse_time(jobs[0]["created_at"]) needed_time = _parse_time(jobs[-3]["completed_at"]) - _parse_time(jobs[0]["created_at"]) -new_start_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) +new_start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=60) for idx, job in enumerate(jobs): if job["created_at"]: diff --git a/telemetry-impls/summarize/send_trace.py b/telemetry-impls/summarize/send_trace.py index df2fcc61..4d597e7c 100644 --- a/telemetry-impls/summarize/send_trace.py +++ b/telemetry-impls/summarize/send_trace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. +# Copyright (c) 2019-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -247,7 +247,7 @@ def get_sccache_stats(artifact_folder: Path) -> dict[str, str]: return parsed_stats -def process_job_blob( # noqa: PLR0913 +def process_job_blob( trace_id: int, job: Mapping[str, Any], env_vars: Mapping[str, str],