diff --git a/.github/scripts/aiter_prebuild_upload.sh b/.github/scripts/aiter_prebuild_upload.sh index 473ef1c75..4f039264e 100755 --- a/.github/scripts/aiter_prebuild_upload.sh +++ b/.github/scripts/aiter_prebuild_upload.sh @@ -7,8 +7,86 @@ set -euo pipefail # Inputs for upload (optional): # NVTE_AITER_PREBUILT_BASE_URL - base URL for prebuilts # NVTE_AITER_PREBUILT_UPLOAD_TOKEN - bearer token for Artifactory -# Optional flag: -# --build : build aiter libs before packaging/uploading; default is package-only. +# Optional flags: +# --preflight --upload +# Validate upload path: Artifactory ping, then HEAD on the probe URL with the bearer token. +# Use in CI before uploading prebuilts; HTTP 200 or 404 on the probe counts as success. +# --preflight --download +# Validate download path: same ping, then HEAD on the probe URL without credentials. +# Matches what CMake file(DOWNLOAD) sees when fetching prebuilts (no token). +# --build : build AITER libs before packaging/uploading; default is package-only. + +_aiter_set_artifactory_check_urls() { + if [[ -z "${NVTE_AITER_PREBUILT_BASE_URL:-}" ]]; then + echo "Missing vars.NVTE_AITER_PREBUILT_BASE_URL" >&2 + exit 1 + fi + local BASE="${NVTE_AITER_PREBUILT_BASE_URL%/}" + local ROOT_PREFIX="${BASE%%/artifactory/*}" + _AITER_ARTIFACTORY_SYSTEM_PING_URL="${ROOT_PREFIX}/artifactory/api/system/ping" + _AITER_PREBUILT_BASE_ACCESS_PROBE_URL="${BASE}/__aiter_repo_access_probe_not_a_real_artifact" +} +# Logs, then GETs the system ping URL set by _aiter_set_artifactory_check_urls; curl -f turns an HTTP error status into a non-zero exit. +_aiter_curl_artifactory_system_ping() { + echo "[AITER-PREBUILT] Preflight: GET ${_AITER_ARTIFACTORY_SYSTEM_PING_URL} ..." 
+ curl -fsS --connect-timeout 25 --max-time 60 "${_AITER_ARTIFACTORY_SYSTEM_PING_URL}" >/dev/null +} +# Args: mode label for the log text, HTTP status code. 200 or 404 is success (404 presumably expected because the probe path is not a real artifact - confirm); any other code logs to stderr and exits 1. +_aiter_preflight_head_ok() { + local mode=$1 + local code=$2 + case "${code}" in + 404|200) + echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (success)" + ;; + *) + echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (failed)" >&2 + exit 1 + ;; + esac +} +# Upload preflight: exits 1 if NVTE_AITER_PREBUILT_UPLOAD_TOKEN is empty, pings, then HEADs the probe URL with the bearer token. curl errors are swallowed by "|| true" so the captured status code decides the result. +_aiter_check_artifactory_upload() { + _aiter_set_artifactory_check_urls + if [[ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN:-}" ]]; then + echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2 + exit 1 + fi + _aiter_curl_artifactory_system_ping + echo "[AITER-PREBUILT] Preflight (upload): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (authenticated) ..." + local code + code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \ + -H "Authorization: Bearer ${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" \ + -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" || true)" + _aiter_preflight_head_ok upload "${code}" +} +# Download preflight: pings, then HEADs the probe URL without credentials. curl errors are swallowed by "|| true" so the captured status code decides the result. +_aiter_check_artifactory_download() { + _aiter_set_artifactory_check_urls + _aiter_curl_artifactory_system_ping + echo "[AITER-PREBUILT] Preflight (download): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (anonymous) ..." + local code + code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \ + -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" || true)" + _aiter_preflight_head_ok download "${code}" +} +# Preflight mode: dispatch on the second argument, then exit 0 without running the rest of the script; an unknown second argument prints usage and exits 1. +if [[ "${1:-}" == "--preflight" ]]; then + shift + case "${1:-}" in + --upload) + _aiter_check_artifactory_upload + ;; + --download) + _aiter_check_artifactory_download + ;; + *) + echo "Usage: $(basename "$0") --preflight --upload | --preflight --download" >&2 + exit 1 + ;; + esac + exit 0 +fi # Derive ROCm version and aiter commit -> cache key ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" diff --git a/.github/workflows/aiter-prebuilt-upload.yml b/.github/workflows/aiter-prebuilt-upload.yml index a45350a79..7ee13a919 100644 --- a/.github/workflows/aiter-prebuilt-upload.yml +++ b/.github/workflows/aiter-prebuilt-upload.yml @@ -10,10 +10,19 @@ on: description: "Docker image" required: false default: "" + workflow_call: + inputs: + docker_image: + description: "Docker image URI from rocm-ci select_image.outputs.image-tag" + required: true + type: string jobs: upload: runs-on: build-only-te + env: + NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }} + NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }} steps: - name: Checkout source uses: actions/checkout@v6 @@ -22,9 +31,32 @@ jobs: submodules: recursive fetch-depth: 0 + # Fail fast: verify this runner can reach Artifactory and that the upload token is accepted, before the image is resolved and pulled + - name: "Preflight: Artifactory upload reachability" + run: | + set -euo pipefail + if bash .github/scripts/aiter_prebuild_upload.sh --preflight --upload; then + echo "::notice::Preflight upload reachability succeeded" + exit 0 + fi + echo "::error::Preflight upload reachability failed" + exit 1 + # NOTE(review): in a called workflow github.event_name carries the caller's event, so the workflow_call branch below may never be taken and the fallback path handles the input - confirm. - name: Resolve docker image id: cfg run: | + set -euo pipefail + EVENT="${{ github.event_name }}" + if [ "$EVENT" = "workflow_call" ]; then + IMAGE="${{ inputs.docker_image }}" + if [ -z "$IMAGE" ]; then + echo "workflow_call requires non-empty docker_image." >&2 + exit 1 + fi + echo "Using docker_image from caller." + echo "image=${IMAGE}" >> "$GITHUB_OUTPUT" + exit 0 + fi IMAGE="${{ inputs.docker_image }}" if [ -z "$IMAGE" ]; then IMAGE="${{ vars.DEV_DOCKER_IMAGE }}" @@ -33,7 +65,7 @@ jobs: echo "No docker image provided and vars.DEV_DOCKER_IMAGE is empty." 
>&2 exit 1 fi - echo "image=${IMAGE}" >> $GITHUB_OUTPUT + echo "image=${IMAGE}" >> "$GITHUB_OUTPUT" - name: Pull docker image run: docker pull ${{ steps.cfg.outputs.image }} @@ -50,19 +82,12 @@ jobs: ${{ steps.cfg.outputs.image }} - name: Build and upload aiter prebuilt - env: - NVTE_AITER_PREBUILT_BASE_URL: https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts - NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }} run: | docker exec \ -e NVTE_AITER_PREBUILT_BASE_URL=${NVTE_AITER_PREBUILT_BASE_URL} \ -e NVTE_AITER_PREBUILT_UPLOAD_TOKEN=${NVTE_AITER_PREBUILT_UPLOAD_TOKEN} \ te-aiter-upload bash -c "$(cat <<'EOF' set -ex - if [ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" ]; then - echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2 - exit 1 - fi export HIP_PATH="" git config --global --add safe.directory '*' bash .github/scripts/aiter_prebuild_upload.sh --build diff --git a/.github/workflows/rocm-ci-dispatch.yml b/.github/workflows/rocm-ci-dispatch.yml index e679ece46..b2a6c1735 100644 --- a/.github/workflows/rocm-ci-dispatch.yml +++ b/.github/workflows/rocm-ci-dispatch.yml @@ -6,12 +6,74 @@ name: PR Automatic CI on: pull_request: - types: [ labeled, synchronize, reopened ] + # Include opened: without it, PR creation triggers nothing; synchronize fires only on a later push. + types: [ opened, labeled, synchronize, reopened ] permissions: contents: read + pull-requests: read + jobs: + # Decides whether this PR event should build and upload the AITER prebuilt to Artifactory (job output: trigger_aiter_upload) + aiter_prebuilt_upload_trigger: + runs-on: ubuntu-latest + outputs: + trigger_aiter_upload: ${{ steps.set.outputs.trigger_aiter_upload }} + steps: + - name: Detect PR changes under 3rdparty/aiter + uses: dorny/paths-filter@v4 + id: paths + if: github.event.action == 'synchronize' + with: + # Include gitlink path: submodule bumps appear as `3rdparty/aiter`, not under **. 
+ filters: | + aiter: + - '3rdparty/aiter/**' + - '3rdparty/aiter' + + - name: Detect skip_aiter_upload label + id: skip_label + uses: actions/github-script@v8 + with: + script: | + const labels = context.payload.pull_request?.labels || []; + const skip = labels.some((l) => l.name === 'skip_aiter_upload'); + core.info(`skip_aiter_upload label : ${skip}`); + core.setOutput('skip', skip ? 'true' : 'false'); + + - name: Set trigger_aiter_upload from paths and labels + id: set + run: | + set -euo pipefail + ACTION='${{ github.event.action }}' + echo "PR action=${ACTION}" + + if [ "$ACTION" != "synchronize" ]; then + echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT" + echo "Not synchronize - trigger_aiter_upload = false" + exit 0 + fi + + SKIP='${{ steps.skip_label.outputs.skip }}' + echo "skip_aiter_upload label : ${SKIP}" + + if [ "$SKIP" = 'true' ]; then + echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT" + echo "skip_aiter_upload label set - trigger_aiter_upload = false" + exit 0 + fi + + AITER_PATHS='${{ steps.paths.outputs.aiter }}' + # paths-filter sets the aiter output to the string 'true' only when a changed file matched the filter; trigger the upload in that case alone. + if [ "$AITER_PATHS" = "true" ]; then + echo "trigger_aiter_upload=true" >> "$GITHUB_OUTPUT" + echo "3rdparty/aiter changed on PR - trigger_aiter_upload = true" + else + echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT" + echo "No 3rdparty/aiter changes on PR - trigger_aiter_upload = false" + fi + determine_level: runs-on: ubuntu-latest outputs: @@ -53,9 +115,12 @@ jobs: # - A commit was pushed with existing ci-level label(s) # - The PR was reopened or opened with existing ci-level label(s) if: ${{ needs.determine_level.outputs.test_level != '' }} - needs: determine_level + needs: [determine_level, aiter_prebuilt_upload_trigger] name: CI Level ${{ needs.determine_level.outputs.test_level }} uses: ./.github/workflows/rocm-ci.yml secrets: inherit with: test_level: ${{ needs.determine_level.outputs.test_level }} + trigger_aiter_upload: ${{ needs.aiter_prebuilt_upload_trigger.outputs.trigger_aiter_upload == 'true' }} + # 
true = select_image + optional AITER upload only (skip wheels/GPU). Keep false on default branch. NOTE(review): set to true below, so PR-triggered runs skip the wheel build and GPU tests; revert before merging. + aiter_flow_test_only: true diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml index 5e0ae242c..3c964d32f 100644 --- a/.github/workflows/rocm-ci.yml +++ b/.github/workflows/rocm-ci.yml @@ -26,6 +26,16 @@ on: required: false default: false type: boolean + trigger_aiter_upload: + description: 'True when 3rdparty/aiter changed on the PR (set by rocm-ci-dispatch)' + required: false + default: false + type: boolean + aiter_flow_test_only: + description: 'Skip wheel build + GPU tests (AITER path validation only). Cannot use env in job if; use this input.' + required: false + default: false + type: boolean workflow_dispatch: inputs: test_level: @@ -40,9 +50,14 @@ on: description: 'DEBUG: Use config.json from current source branch instead of dev' type: boolean default: false + aiter_flow_test_only: + description: 'Skip wheel build + GPU tests (AITER path validation only)' + type: boolean + default: false +# Single concurrency anchor for this PR/branch pipeline. Nested reusable workflows (e.g. rocm-wheels-build.yml) declare no concurrency group of their own. concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: te-rocm-${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || github.ref }} cancel-in-progress: true env: @@ -99,13 +114,28 @@ jobs: echo "Selected image: $IMAGE_TO_USE" echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT + upload_aiter_prebuilt: + name: Build and upload AITER prebuilt + needs: select_image + # Gate on inputs only: reusable runs from PR may report github.event_name as pull_request, not workflow_call. + # push-triggered runs leave inputs.trigger_aiter_upload unset/false (default). 
+ if: ${{ inputs.trigger_aiter_upload == true || inputs.trigger_aiter_upload == 'true' }} + uses: ./.github/workflows/aiter-prebuilt-upload.yml + with: + docker_image: ${{ needs.select_image.outputs.image-tag }} + secrets: inherit + build: # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`. + needs: [select_image, upload_aiter_prebuilt] + # job.if cannot use env.* — use inputs.aiter_flow_test_only; push runs skip that input (full CI). always() keeps build eligible when upload_aiter_prebuilt was skipped; the test jobs check needs.*.result explicitly for the same reason. + if: ${{ (github.event_name == 'push' || inputs.aiter_flow_test_only != true) && always() && needs.select_image.result == 'success' && (needs.upload_aiter_prebuilt.result == 'skipped' || needs.upload_aiter_prebuilt.result == 'success') }} uses: ./.github/workflows/rocm-wheels-build.yml secrets: inherit sgpu_tests: name: sGPU Tests (${{ matrix.arch_label }}) + if: ${{ !cancelled() && needs.select_image.result == 'success' && needs.build.result == 'success' && (github.event_name == 'push' || inputs.aiter_flow_test_only != true) }} needs: [select_image, build] timeout-minutes: 360 runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }} @@ -307,6 +337,7 @@ jobs: mgpu_tests: name: mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }}) + if: ${{ !cancelled() && needs.select_image.result == 'success' && needs.build.result == 'success' && (github.event_name == 'push' || inputs.aiter_flow_test_only != true) }} needs: [select_image, build] timeout-minutes: 360 runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }} diff --git a/.github/workflows/rocm-wheels-build.yml b/.github/workflows/rocm-wheels-build.yml index c1a8ea087..98f76f039 100644 --- a/.github/workflows/rocm-wheels-build.yml +++ b/.github/workflows/rocm-wheels-build.yml @@ -70,12 +70,17 @@ env: DOCKER_IMAGE_NAME: te-rocm-manylinux-x86 MANYLINUX_PLATFORM: manylinux_2_28_x86_64 +# No workflow-level concurrency: rocm-ci.yml already gates PRs; sharing the same group +# with a parent reusable workflow causes GitHub deadlock detection. 
+ # ───────────────────────────────────────────────────────────────────────────── jobs: build-rocm-wheels: name: Build ROCm Docker image and TransformerEngine wheels runs-on: build-only-te + env: + NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }} steps: - name: Checkout repository @@ -89,6 +94,19 @@ jobs: 3rdparty/QoLA \ 3rdparty/hipify_torch + # Advisory check that this runner can reach Artifactory for anonymous prebuilt downloads; continue-on-error means a failure only warns + - name: "Preflight: Artifactory download reachability" + if: ${{ inputs.use_prebuilt_aiter }} + continue-on-error: true + run: | + set -euo pipefail + if bash .github/scripts/aiter_prebuild_upload.sh --preflight --download; then + echo "::notice::Preflight download reachability succeeded" + exit 0 + fi + echo "::warning::Preflight download reachability failed" + exit 1 + - name: Derive Docker image tag id: set-tag run: | @@ -187,7 +205,7 @@ jobs: # The container writes all wheels and logs under /wheelhouse. - name: Build TransformerEngine wheels run: | - NVTE_AITER_PREBUILT_BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts" + set -euo pipefail docker run --rm \ --env LOCAL_TREE_BUILD=1 \ --env NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 \