diff --git a/.github/workflows/install-dispatch-guard.yml b/.github/workflows/install-dispatch-guard.yml
index 25a4a30d..a8342543 100644
--- a/.github/workflows/install-dispatch-guard.yml
+++ b/.github/workflows/install-dispatch-guard.yml
@@ -64,6 +64,8 @@ jobs:
 
           ENV_FILE="${BINANCE_PLATFORM_DISPATCH_ENV:-$HOME/.config/binance-platform/dispatch.env}"
           LOG_FILE="${BINANCE_PLATFORM_DISPATCH_LOG:-$HOME/binance-quant/ops/dispatch-runtime.log}"
+          DISPATCH_MAX_ATTEMPTS="${DISPATCH_MAX_ATTEMPTS:-4}"
+          DISPATCH_RETRY_BASE_SECONDS="${DISPATCH_RETRY_BASE_SECONDS:-15}"
 
           notify_telegram() {
             local message="$1"
@@ -96,6 +98,14 @@ jobs:
             exit 1
           }
 
+          # Returns success (0) for HTTP statuses treated as transient: 000 and 500/502/503/504.
+          is_retryable_http_status() {
+            case "$1" in
+              000|500|502|503|504) return 0 ;;
+              *) return 1 ;;
+            esac
+          }
+
           mkdir -p "$(dirname "$LOG_FILE")"
           [ -f "$ENV_FILE" ] || fail_dispatch "missing env file" "$ENV_FILE"
 
@@ -108,32 +118,66 @@ jobs:
             fi
           done
 
+          if ! [[ "$DISPATCH_MAX_ATTEMPTS" =~ ^[1-9][0-9]*$ ]]; then
+            fail_dispatch "invalid DISPATCH_MAX_ATTEMPTS" "$DISPATCH_MAX_ATTEMPTS"
+          fi
+          if ! [[ "$DISPATCH_RETRY_BASE_SECONDS" =~ ^[0-9]+$ ]]; then
+            fail_dispatch "invalid DISPATCH_RETRY_BASE_SECONDS" "$DISPATCH_RETRY_BASE_SECONDS"
+          fi
+
           response_file=$(mktemp)
           stderr_file=$(mktemp)
           trap 'rm -f "$response_file" "$stderr_file"' EXIT
 
-          set +e
-          http_status=$(curl --silent --show-error \
-            --output "$response_file" \
-            --write-out "%{http_code}" \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "https://api.github.com/repos/${REPO}/actions/workflows/${WORKFLOW}/dispatches" \
-            -d "{\"ref\":\"${REF}\"}" 2>"$stderr_file")
-          curl_exit=$?
-          set -e
-
-          if [ "$curl_exit" -ne 0 ]; then
-            fail_dispatch "curl exited ${curl_exit}" "$(head -c 800 "$stderr_file")"
-          fi
+          # Try the dispatch up to DISPATCH_MAX_ATTEMPTS times; only curl failures and retryable HTTP statuses are retried.
+          attempt=1
+          while [ "$attempt" -le "$DISPATCH_MAX_ATTEMPTS" ]; do
+            : > "$response_file"
+            : > "$stderr_file"
+
+            set +e
+            http_status=$(curl --silent --show-error \
+              --connect-timeout 20 \
+              --max-time 60 \
+              --output "$response_file" \
+              --write-out "%{http_code}" \
+              -X POST \
+              -H "Accept: application/vnd.github+json" \
+              -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              "https://api.github.com/repos/${REPO}/actions/workflows/${WORKFLOW}/dispatches" \
+              -d "{\"ref\":\"${REF}\"}" 2>"$stderr_file")
+            curl_exit=$?
+            set -e
+
+            if [ "$curl_exit" -eq 0 ] && [ "$http_status" = "204" ]; then
+              echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) dispatched ${REPO}/${WORKFLOW} ref=${REF} attempt=${attempt}/${DISPATCH_MAX_ATTEMPTS}" >> "$LOG_FILE"
+              exit 0
+            fi
 
-          if [ "$http_status" != "204" ]; then
-            fail_dispatch "GitHub dispatch returned HTTP ${http_status}" "$(head -c 800 "$response_file")"
-          fi
+            retryable=false
+            if [ "$curl_exit" -ne 0 ]; then
+              reason="curl exited ${curl_exit}"
+              details="$(head -c 800 "$stderr_file")"
+              retryable=true
+            else
+              reason="GitHub dispatch returned HTTP ${http_status}"
+              details="$(head -c 800 "$response_file")"
+              if is_retryable_http_status "$http_status"; then
+                retryable=true
+              fi
+            fi
 
-          echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) dispatched ${REPO}/${WORKFLOW} ref=${REF}" >> "$LOG_FILE"
+            if [ "$attempt" -ge "$DISPATCH_MAX_ATTEMPTS" ] || [ "$retryable" != "true" ]; then
+              fail_dispatch "$reason" "$details"
+            fi
+
+            # Linear back-off. 10# forces base 10 so a zero-padded value such as 08 (accepted by the validation above) is not rejected as invalid octal.
+            delay=$((10#$DISPATCH_RETRY_BASE_SECONDS * attempt))
+            echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) dispatch retry ${attempt}/${DISPATCH_MAX_ATTEMPTS}: ${reason} ${details}" >> "$LOG_FILE"
+            sleep "$delay"
+            attempt=$((attempt + 1))
+          done
           EOF
 
           chmod 700 "$SCRIPT_PATH"
diff --git a/docs/operator_runbook.md b/docs/operator_runbook.md
index e50fee88..cd8ca765 100644
--- 
a/docs/operator_runbook.md
+++ b/docs/operator_runbook.md
@@ -71,6 +71,7 @@ The monthly execution pool is locked to the accepted upstream `version` / `as_of
 - `main.yml` is `workflow_dispatch` only.
 - GitHub Actions no longer owns the hourly cadence for runtime execution in this repo.
 - Production cadence should come from one external scheduler, for example VPS cron calling the GitHub Actions dispatch API.
+- The VPS dispatch guard retries bounded transient failures such as network errors and GitHub `500`/`502`/`503`/`504`, but still alerts immediately for configuration and permission failures.
 - Avoid overlapping dispatches from multiple schedulers or from a second manual run while the current runtime job is still in progress.
 
 ## Degraded Mode Ladder
diff --git a/tests/test_dispatch_guard_workflow.py b/tests/test_dispatch_guard_workflow.py
new file mode 100644
index 00000000..7a27bf22
--- /dev/null
+++ b/tests/test_dispatch_guard_workflow.py
@@ -0,0 +1,52 @@
+"""Static text checks on the install-dispatch-guard workflow file."""
+
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+WORKFLOW = ROOT / ".github" / "workflows" / "install-dispatch-guard.yml"
+
+
+class DispatchGuardWorkflowTests(unittest.TestCase):
+    """Assert that expected snippets appear verbatim in the workflow text."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        # Read the workflow once; every test only inspects its text.
+        cls.workflow_text = WORKFLOW.read_text(encoding="utf-8")
+
+    def _assert_snippets_present(self, *snippets: str) -> None:
+        """Fail on the first snippet that is not a substring of the workflow."""
+        for snippet in snippets:
+            self.assertIn(snippet, self.workflow_text)
+
+    def test_dispatch_guard_retries_transient_failures(self) -> None:
+        self._assert_snippets_present(
+            'DISPATCH_MAX_ATTEMPTS="${DISPATCH_MAX_ATTEMPTS:-4}"',
+            'DISPATCH_RETRY_BASE_SECONDS="${DISPATCH_RETRY_BASE_SECONDS:-15}"',
+            "is_retryable_http_status()",
+            "000|500|502|503|504) return 0 ;;",
+            'retryable=true',
+            'sleep "$delay"',
+            'dispatch retry ${attempt}/${DISPATCH_MAX_ATTEMPTS}',
+        )
+
+    def test_dispatch_guard_keeps_non_retryable_failures_immediate(self) -> None:
+        self._assert_snippets_present(
+            'if [ "$attempt" -ge "$DISPATCH_MAX_ATTEMPTS" ] || [ "$retryable" != "true" ]; then',
+            'fail_dispatch "$reason" "$details"',
+            'GitHub dispatch returned HTTP ${http_status}',
+        )
+
+    def test_dispatch_guard_bounds_curl_runtime(self) -> None:
+        self._assert_snippets_present(
+            "--connect-timeout 20",
+            "--max-time 60",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()