diff --git a/.github/scripts/aiter_prebuild_upload.sh b/.github/scripts/aiter_prebuild_upload.sh
index 473ef1c75..4f039264e 100755
--- a/.github/scripts/aiter_prebuild_upload.sh
+++ b/.github/scripts/aiter_prebuild_upload.sh
@@ -7,8 +7,91 @@ set -euo pipefail
 # Inputs for upload (optional):
 #   NVTE_AITER_PREBUILT_BASE_URL - base URL for prebuilts
 #   NVTE_AITER_PREBUILT_UPLOAD_TOKEN - bearer token for Artifactory
-# Optional flag:
-#   --build : build aiter libs before packaging/uploading; default is package-only.
+# Optional flags:
+#   --preflight --upload
+#       Validate upload path: Artifactory ping, then HEAD on the probe URL with the bearer token.
+#       Use in CI before uploading prebuilts.
+#   --preflight --download
+#       Validate download path: same ping, then HEAD on the probe URL without credentials.
+#       Matches what CMake file(DOWNLOAD) sees when fetching prebuilts (no token).
+#   --build : build AITER libs before packaging/uploading; default is package-only.
+
+# Derive the ping and probe URLs from NVTE_AITER_PREBUILT_BASE_URL into the _AITER_* globals; exits 1 if it is unset.
+_aiter_set_artifactory_check_urls() {
+  if [[ -z "${NVTE_AITER_PREBUILT_BASE_URL:-}" ]]; then
+    echo "Missing vars.NVTE_AITER_PREBUILT_BASE_URL" >&2
+    exit 1
+  fi
+  local BASE="${NVTE_AITER_PREBUILT_BASE_URL%/}"
+  local ROOT_PREFIX="${BASE%%/artifactory/*}"
+  _AITER_ARTIFACTORY_SYSTEM_PING_URL="${ROOT_PREFIX}/artifactory/api/system/ping"
+  _AITER_PREBUILT_BASE_ACCESS_PROBE_URL="${BASE}/__aiter_repo_access_probe_not_a_real_artifact"
+}
+
+# GET the system ping URL, discarding the body; curl -f returns non-zero on HTTP errors.
+_aiter_curl_artifactory_system_ping() {
+  echo "[AITER-PREBUILT] Preflight: GET ${_AITER_ARTIFACTORY_SYSTEM_PING_URL} ..."
+  curl -fsS --connect-timeout 25 --max-time 60 "${_AITER_ARTIFACTORY_SYSTEM_PING_URL}" >/dev/null
+}
+
+# Report a probe result ($1 = mode label, $2 = HTTP code): 200 or 404 is success, any other code exits 1.
+_aiter_preflight_head_ok() {
+  local mode=$1
+  local code=$2
+  case "${code}" in
+    404|200)
+      echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (success)"
+      ;;
+    *)
+      echo "[AITER-PREBUILT] Preflight ${mode}: HTTP ${code} (failed)" >&2
+      exit 1
+      ;;
+  esac
+}
+
+# Upload preflight: ping, then HEAD the probe URL with the bearer token; exits 1 if the token is missing.
+_aiter_check_artifactory_upload() {
+  _aiter_set_artifactory_check_urls
+  if [[ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN:-}" ]]; then
+    echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
+    exit 1
+  fi
+  _aiter_curl_artifactory_system_ping
+  echo "[AITER-PREBUILT] Preflight (upload): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (authenticated) ..."
+  local code
+  code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \
+    -H "Authorization: Bearer ${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" \
+    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" || true)"
+  _aiter_preflight_head_ok upload "${code}"
+}
+
+# Download preflight: ping, then HEAD the probe URL with no Authorization header.
+_aiter_check_artifactory_download() {
+  _aiter_set_artifactory_check_urls
+  _aiter_curl_artifactory_system_ping
+  echo "[AITER-PREBUILT] Preflight (download): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (anonymous) ..."
+  local code
+  code="$(curl -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 90 \
+    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}" || true)"
+  _aiter_preflight_head_ok download "${code}"
+}
+
+if [[ "${1:-}" == "--preflight" ]]; then
+  shift
+  case "${1:-}" in
+    --upload)
+      _aiter_check_artifactory_upload
+      ;;
+    --download)
+      _aiter_check_artifactory_download
+      ;;
+    *)
+      echo "Usage: $(basename "$0") --preflight --upload | --preflight --download" >&2
+      exit 1
+      ;;
+  esac
+  exit 0
+fi
 
 # Derive ROCm version and aiter commit -> cache key
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
diff --git a/.github/workflows/aiter-prebuilt-upload.yml b/.github/workflows/aiter-prebuilt-upload.yml
index a45350a79..70447e8eb 100644
--- a/.github/workflows/aiter-prebuilt-upload.yml
+++ b/.github/workflows/aiter-prebuilt-upload.yml
@@ -6,14 +6,40 @@ name: AITER Prebuilt Upload
 on:
   workflow_dispatch:
     inputs:
-      docker_image:
-        description: "Docker image"
+      docker_image_override:
+        description: "Manual Docker Image (Leave empty to use config file value)"
         required: false
-        default: ""
+        type: string
+  workflow_call:
+    inputs:
+      docker_image_override:
+        description: "Manual Docker Image (Leave empty to use config file value)"
+        required: true
+        type: string
+      aiter_upload_cache_key:
+        description: "If non-empty, save Actions cache after success (same key as rocm-ci-dispatch restore)."
+        required: false
+        default: ''
+        type: string
+
+permissions:
+  actions: write
+  contents: read
 
 jobs:
+  # Same resolver as rocm-ci / dispatch; override comes from inputs for both workflow_dispatch and workflow_call.
+  select_docker_image:
+    uses: ./.github/workflows/select-docker-image.yml
+    with:
+      docker_image_override: ${{ inputs.docker_image_override }}
+      test_config_from_source: true
+
   upload:
+    needs: [select_docker_image]
     runs-on: build-only-te
+    env:
+      NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}
+      NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
     steps:
       - name: Checkout source
         uses: actions/checkout@v6
@@ -22,21 +48,19 @@ jobs:
           submodules: recursive
           fetch-depth: 0
 
-      - name: Resolve docker image
-        id: cfg
+      # Verify this runner can reach Artifactory for uploads
+      - name: "Preflight: Artifactory upload reachability"
         run: |
-          IMAGE="${{ inputs.docker_image }}"
-          if [ -z "$IMAGE" ]; then
-            IMAGE="${{ vars.DEV_DOCKER_IMAGE }}"
-          fi
-          if [ -z "$IMAGE" ]; then
-            echo "No docker image provided and vars.DEV_DOCKER_IMAGE is empty." >&2
-            exit 1
+          set -euo pipefail
+          if bash .github/scripts/aiter_prebuild_upload.sh --preflight --upload; then
+            echo "::notice::Preflight upload reachability succeeded"
+            exit 0
           fi
-          echo "image=${IMAGE}" >> $GITHUB_OUTPUT
+          echo "::error::Preflight upload reachability failed"
+          exit 1
 
       - name: Pull docker image
-        run: docker pull ${{ steps.cfg.outputs.image }}
+        run: docker pull ${{ needs.select_docker_image.outputs.image_tag }}
 
       - name: Run container
         run: |
@@ -47,28 +71,32 @@ jobs:
             --pid=host \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
-            ${{ steps.cfg.outputs.image }}
+            ${{ needs.select_docker_image.outputs.image_tag }}
 
       - name: Build and upload aiter prebuilt
-        env:
-          NVTE_AITER_PREBUILT_BASE_URL: https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
-          NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
         run: |
           docker exec \
             -e NVTE_AITER_PREBUILT_BASE_URL=${NVTE_AITER_PREBUILT_BASE_URL} \
             -e NVTE_AITER_PREBUILT_UPLOAD_TOKEN=${NVTE_AITER_PREBUILT_UPLOAD_TOKEN} \
             te-aiter-upload bash -c "$(cat <<'EOF'
           set -ex
-          if [ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" ]; then
-            echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
-            exit 1
-          fi
           export HIP_PATH=""
           git config --global --add safe.directory '*'
           bash .github/scripts/aiter_prebuild_upload.sh --build
           EOF
           )"
 
+      - name: Record successful AITER prebuilt upload (cache marker)
+        if: success() && inputs.aiter_upload_cache_key != ''
+        run: echo ok > .aiter-upload-success
+
+      - name: Save AITER upload success cache
+        if: success() && inputs.aiter_upload_cache_key != ''
+        uses: actions/cache/save@v4
+        with:
+          path: .aiter-upload-success
+          key: ${{ inputs.aiter_upload_cache_key }}
+
       - name: Cleanup container
         if: always()
         run: docker rm -f te-aiter-upload || true
diff --git a/.github/workflows/rocm-ci-dispatch.yml b/.github/workflows/rocm-ci-dispatch.yml
index e679ece46..422ff2078 100644
--- a/.github/workflows/rocm-ci-dispatch.yml
+++ b/.github/workflows/rocm-ci-dispatch.yml
@@ -10,8 +10,70 @@ on:
 
 permissions:
   contents: read
+  pull-requests: read
+  actions: write
 
 jobs:
+  # Resolve Docker image tag
+  select_ci_image:
+    uses: ./.github/workflows/select-docker-image.yml
+    with:
+      docker_image_override: ''
+      test_config_from_source: true
+
+  # Whether the PR touches 3rdparty/aiter
+  aiter_gate:
+    runs-on: ubuntu-latest
+    outputs:
+      aiter_paths: ${{ steps.paths.outputs.aiter }}
+    steps:
+      - name: Detect PR changes under 3rdparty/aiter
+        uses: dorny/paths-filter@v4
+        id: paths
+        if: github.event.action == 'synchronize'
+        with:
+          filters: |
+            aiter:
+              - '3rdparty/aiter/**'
+
+  # Whether to upload AITER prebuilt to Artifactory
+  aiter_prebuilt_upload_trigger:
+    needs: [aiter_gate, select_ci_image]
+    runs-on: ubuntu-latest
+    outputs:
+      # true only on synchronize + aiter paths + cache miss (default false via expression)
+      trigger_aiter_upload: ${{ github.event.action == 'synchronize' && needs.aiter_gate.outputs.aiter_paths == 'true' && steps.aiter_upload_cache.outputs.cache-hit != 'true' }}
+      aiter_upload_cache_key: ${{ steps.aiter_key.outputs.cache_key }}
+    steps:
+      - name: Checkout PR head
+        if: ${{ github.event.action == 'synchronize' && needs.aiter_gate.outputs.aiter_paths == 'true' }}
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 1
+
+      - name: Compute AITER upload cache key
+        id: aiter_key
+        if: ${{ github.event.action == 'synchronize' && needs.aiter_gate.outputs.aiter_paths == 'true' }}
+        env:
+          IMAGE_TO_USE: ${{ needs.select_ci_image.outputs.image_tag }}
+        run: |
+          set -euo pipefail
+          AITER_SHA=$(git rev-parse HEAD:3rdparty/aiter)
+          IMAGE_SLUG=$(printf '%s' "$IMAGE_TO_USE" | sha256sum | awk '{print $1}')
+          echo "aiter_sha=$AITER_SHA" >> "$GITHUB_OUTPUT"
+          echo "image_slug=$IMAGE_SLUG" >> "$GITHUB_OUTPUT"
+          echo "cache_key=aiter-prebuilt-upload-ok-${IMAGE_SLUG}-${AITER_SHA}" >> "$GITHUB_OUTPUT"
+          echo "Resolved Docker image for cache key (select-docker-image.yml): $IMAGE_TO_USE"
+
+      - name: AITER upload cache validation
+        id: aiter_upload_cache
+        if: ${{ github.event.action == 'synchronize' && needs.aiter_gate.outputs.aiter_paths == 'true' }}
+        uses: actions/cache/restore@v4
+        with:
+          path: .aiter-upload-success
+          key: ${{ steps.aiter_key.outputs.cache_key }}
+
   determine_level:
     runs-on: ubuntu-latest
     outputs:
@@ -52,10 +114,14 @@ jobs:
     # - A ci-level label higher than any existing ci-level label(s) was added
     # - A commit was pushed with existing ci-level label(s)
     # - The PR was reopened or opened with existing ci-level label(s)
-    if: ${{ needs.determine_level.outputs.test_level != '' }}
-    needs: determine_level
+    if: ${{ always() && needs.select_ci_image.result == 'success' && needs.determine_level.outputs.test_level != '' }}
+    needs: [determine_level, aiter_prebuilt_upload_trigger, select_ci_image]
     name: CI Level ${{ needs.determine_level.outputs.test_level }}
     uses: ./.github/workflows/rocm-ci.yml
     secrets: inherit
     with:
       test_level: ${{ needs.determine_level.outputs.test_level }}
+      trigger_aiter_upload: ${{ needs.aiter_prebuilt_upload_trigger.outputs.trigger_aiter_upload == 'true' }}
+      aiter_upload_cache_key: ${{ needs.aiter_prebuilt_upload_trigger.outputs.aiter_upload_cache_key }}
+      docker_image_override: ${{ needs.select_ci_image.outputs.image_tag }}
+      test_config_from_source: true
diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml
index 5e0ae242c..e28008d77 100644
--- a/.github/workflows/rocm-ci.yml
+++ b/.github/workflows/rocm-ci.yml
@@ -19,13 +19,23 @@ on:
         type: string
       docker_image_override:
         description: 'Manual Docker Image (Leave empty to use config file value)'
-        required: false
+        required: true
         type: string
       test_config_from_source:
         description: 'DEBUG: Use config.json from current source branch instead of dev'
         required: false
         default: false
         type: boolean
+      trigger_aiter_upload:
+        description: 'True when 3rdparty/aiter changed on the PR (set by rocm-ci-dispatch)'
+        required: false
+        default: false
+        type: boolean
+      aiter_upload_cache_key:
+        description: 'Actions cache key for upload success (computed in rocm-ci-dispatch; empty otherwise)'
+        required: false
+        default: ''
+        type: string
   workflow_dispatch:
     inputs:
       test_level:
@@ -51,56 +61,25 @@ env:
 jobs:
   select_image:
     name: Select Docker Image
-    runs-on: ubuntu-latest
-    timeout-minutes: 10
-    outputs:
-      image-tag: ${{ steps.select-image.outputs.image-tag }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-        with:
-          ref: ${{ inputs.test_config_from_source && github.ref_name || github.event.repository.default_branch || 'dev' }}
-          sparse-checkout: ci/ci_config.json
-          sparse-checkout-cone-mode: false
-
-      - name: Select Docker Image Tag
-        id: select-image
-        run: |
-          if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then
-            echo "::notice::Debugging mode: Using ci/ci_config.json from ${{ github.ref_name }}"
-          else
-            echo "::notice::Using ci/ci_config.json from ${{ github.event.repository.default_branch || 'dev' }}"
-          fi
-
-          if [[ ! -f "ci/ci_config.json" ]]; then
-            echo "::error::Config file not found in checkout."
-            exit 1
-          fi
-
-          BRANCH_NAME="${{ github.base_ref || github.ref_name }}"
-          echo "Determining image for branch: $BRANCH_NAME"
-          VERSION_KEY="$BRANCH_NAME"
-
-          if jq -e --arg key "$VERSION_KEY" '.docker_images[$key]' ci/ci_config.json > /dev/null; then
-            JSON_KEY="$VERSION_KEY"
-          else
-            JSON_KEY="default"
-          fi
-
-          echo "Selected config key: $JSON_KEY"
-          IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json)
-
-          MANUAL_OVERRIDE="${{ inputs.docker_image_override }}"
-          if [[ -n "$MANUAL_OVERRIDE" ]]; then
-            echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
-            IMAGE_TO_USE="$MANUAL_OVERRIDE"
-          fi
-
-          echo "Selected image: $IMAGE_TO_USE"
-          echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT
+    uses: ./.github/workflows/select-docker-image.yml
+    with:
+      docker_image_override: ${{ inputs.docker_image_override }}
+      test_config_from_source: ${{ inputs.test_config_from_source }}
+
+  upload_aiter_prebuilt:
+    name: Build and upload AITER prebuilt
+    needs: select_image
+    if: ${{ github.event_name == 'pull_request' && inputs.trigger_aiter_upload }}  # boolean input; comparing it to the string 'true' is always false
+    uses: ./.github/workflows/aiter-prebuilt-upload.yml
+    with:
+      docker_image_override: ${{ needs.select_image.outputs.image_tag }}
+      aiter_upload_cache_key: ${{ inputs.aiter_upload_cache_key }}
+    secrets: inherit
 
   build:
     # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`.
+    needs: [select_image, upload_aiter_prebuilt]
+    if: always() && needs.select_image.result == 'success' && (needs.upload_aiter_prebuilt.result == 'skipped' || needs.upload_aiter_prebuilt.result == 'success')
     uses: ./.github/workflows/rocm-wheels-build.yml
     secrets: inherit
 
@@ -140,7 +119,7 @@ jobs:
 
       - name: Pull Docker Image
         run: |
-          docker pull ${{ needs.select_image.outputs.image-tag }}
+          docker pull ${{ needs.select_image.outputs.image_tag }}
 
       - name: Run Container
         run: |
@@ -155,7 +134,7 @@ jobs:
             --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
-            ${{ needs.select_image.outputs.image-tag }}
+            ${{ needs.select_image.outputs.image_tag }}
 
       - name: Install packages
         run: |
@@ -336,7 +315,7 @@ jobs:
 
       - name: Pull Docker Image
         run: |
-          docker pull ${{ needs.select_image.outputs.image-tag }}
+          docker pull ${{ needs.select_image.outputs.image_tag }}
 
       - name: Run Container
         run: |
@@ -351,7 +330,7 @@ jobs:
             --group-add $(getent group video | cut -d: -f3) \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
-            ${{ needs.select_image.outputs.image-tag }}
+            ${{ needs.select_image.outputs.image_tag }}
 
       - name: Install packages
         env:
diff --git a/.github/workflows/rocm-wheels-build.yml b/.github/workflows/rocm-wheels-build.yml
index c1a8ea087..afaf5024f 100644
--- a/.github/workflows/rocm-wheels-build.yml
+++ b/.github/workflows/rocm-wheels-build.yml
@@ -76,6 +76,8 @@ jobs:
   build-rocm-wheels:
     name: Build ROCm Docker image and TransformerEngine wheels
     runs-on: build-only-te
+    env:
+      NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}
 
     steps:
       - name: Checkout repository
@@ -89,6 +91,19 @@ jobs:
             3rdparty/QoLA \
             3rdparty/hipify_torch
 
+      # Verify this runner can reach Artifactory for anonymous prebuilt downloads
+      - name: "Preflight: Artifactory download reachability"
+        if: ${{ inputs.use_prebuilt_aiter }}
+        continue-on-error: true
+        run: |
+          set -euo pipefail
+          if bash .github/scripts/aiter_prebuild_upload.sh --preflight --download; then
+            echo "::notice::Preflight download reachability succeeded"
+            exit 0
+          fi
+          echo "::warning::Preflight download reachability failed"
+          exit 1
+
       - name: Derive Docker image tag
         id: set-tag
         run: |
@@ -187,7 +202,7 @@ jobs:
       # The container writes all wheels and logs under /wheelhouse.
       - name: Build TransformerEngine wheels
         run: |
-          NVTE_AITER_PREBUILT_BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts"
+          set -euo pipefail
           docker run --rm \
             --env LOCAL_TREE_BUILD=1 \
             --env NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 \
diff --git a/.github/workflows/select-docker-image.yml b/.github/workflows/select-docker-image.yml
new file mode 100644
index 000000000..d1a12c2cf
--- /dev/null
+++ b/.github/workflows/select-docker-image.yml
@@ -0,0 +1,74 @@
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+
+name: Select Docker Image
+
+on:
+  workflow_call:
+    inputs:
+      docker_image_override:
+        description: 'Manual Docker Image (Leave empty to use config file value)'
+        required: false
+        default: ''
+        type: string
+      test_config_from_source:
+        description: 'Use ci/ci_config.json from the caller ref instead of default branch'
+        required: false
+        default: false
+        type: boolean
+    outputs:
+      image_tag:
+        description: 'Docker image URI from ci/ci_config.json (or override)'
+        value: ${{ jobs.select.outputs.image_tag }}
+
+jobs:
+  select:
+    name: Select Docker Image
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    outputs:
+      image_tag: ${{ steps.select.outputs.image_tag }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ inputs.test_config_from_source && github.ref || github.event.repository.default_branch || 'dev' }}
+          sparse-checkout: ci/ci_config.json
+          sparse-checkout-cone-mode: false
+
+      - name: Select Docker Image Tag
+        id: select
+        run: |
+          if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then
+            echo "::notice::Debugging mode: Using ci/ci_config.json from ${{ github.ref }} (${{ github.ref_name }})"
+          else
+            echo "::notice::Using ci/ci_config.json from ${{ github.event.repository.default_branch || 'dev' }}"
+          fi
+
+          if [[ ! -f "ci/ci_config.json" ]]; then
+            echo "::error::Config file not found in checkout."
+            exit 1
+          fi
+
+          BRANCH_NAME="${{ github.base_ref || github.ref_name }}"
+          echo "Determining image for branch: $BRANCH_NAME"
+          VERSION_KEY="$BRANCH_NAME"
+
+          if jq -e --arg key "$VERSION_KEY" '.docker_images[$key]' ci/ci_config.json > /dev/null; then
+            JSON_KEY="$VERSION_KEY"
+          else
+            JSON_KEY="default"
+          fi
+
+          echo "Selected config key: $JSON_KEY"
+          IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json)
+
+          MANUAL_OVERRIDE="${{ inputs.docker_image_override }}"
+          if [[ -n "$MANUAL_OVERRIDE" ]]; then
+            echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
+            IMAGE_TO_USE="$MANUAL_OVERRIDE"
+          fi
+
+          echo "Selected image: $IMAGE_TO_USE"
+          echo "image_tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"