Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 80 additions & 2 deletions .github/scripts/aiter_prebuild_upload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,86 @@ set -euo pipefail
# Inputs for upload (optional):
# NVTE_AITER_PREBUILT_BASE_URL - base URL for prebuilts
# NVTE_AITER_PREBUILT_UPLOAD_TOKEN - bearer token for Artifactory
# Optional flag:
# --build : build aiter libs before packaging/uploading; default is package-only.
# Optional flags:
# --preflight --upload
# Validate upload path: Artifactory ping, then HEAD on the probe URL with the bearer token.
# Use in CI before uploading prebuilts.
# --preflight --download
# Validate download path: same ping, then HEAD on the probe URL without credentials.
# Matches what CMake file(DOWNLOAD) sees when fetching prebuilts (no token).
# --build : build AITER libs before packaging/uploading; default is package-only.

#######################################
# Derive the two URLs used by the Artifactory preflight checks.
# Globals:
#   NVTE_AITER_PREBUILT_BASE_URL          (read)    base URL for prebuilts
#   _AITER_ARTIFACTORY_SYSTEM_PING_URL    (written) <root>/artifactory/api/system/ping
#   _AITER_PREBUILT_BASE_ACCESS_PROBE_URL (written) <base>/<probe artifact name>
# Arguments: none
# Outputs:   an error message on stderr when the base URL is missing
# Exits:     1 when NVTE_AITER_PREBUILT_BASE_URL is unset or empty
#######################################
_aiter_set_artifactory_check_urls() {
  if [[ -z "${NVTE_AITER_PREBUILT_BASE_URL:-}" ]]; then
    echo "Missing vars.NVTE_AITER_PREBUILT_BASE_URL" >&2
    exit 1
  fi

  # Drop a single trailing slash so the joins below never produce "//".
  local base="${NVTE_AITER_PREBUILT_BASE_URL%/}"

  # Everything before the first "/artifactory/" segment is the server root.
  # Matching against "<base>/" also covers a base URL that ends exactly in
  # "/artifactory"; without the temporary slash that case would not match and
  # the ping URL would contain "/artifactory/artifactory/".
  local root="${base}/"
  root="${root%%/artifactory/*}"
  # No "/artifactory/" segment at all: remove the temporary slash again so the
  # result equals the plain base URL.
  root="${root%/}"

  _AITER_ARTIFACTORY_SYSTEM_PING_URL="${root}/artifactory/api/system/ping"
  _AITER_PREBUILT_BASE_ACCESS_PROBE_URL="${base}/__aiter_repo_access_probe_not_a_real_artifact"
}

#######################################
# Issue a GET against the Artifactory system ping endpoint.
# Globals:   _AITER_ARTIFACTORY_SYSTEM_PING_URL (read)
# Arguments: none
# Outputs:   one progress line on stdout; the response body is discarded
# Returns:   curl's exit status (non-zero on connection failure or HTTP error)
#######################################
_aiter_curl_artifactory_system_ping() {
  local ping_url="${_AITER_ARTIFACTORY_SYSTEM_PING_URL}"
  echo "[AITER-PREBUILT] Preflight: GET ${ping_url} ..."
  # --fail turns HTTP errors into a non-zero exit; --show-error keeps the
  # diagnostic visible even though --silent hides the progress meter.
  curl --fail --silent --show-error \
    --connect-timeout 25 \
    --max-time 60 \
    "${ping_url}" >/dev/null
}

#######################################
# Judge the HTTP status returned by a preflight HEAD request.
# Both 200 and 404 are accepted as success; any other status is a failure.
# Arguments:
#   $1 - label of the check, used only in the log line
#   $2 - HTTP status code reported by curl
# Outputs: success line on stdout, failure line on stderr
# Exits:   1 on any status other than 200 or 404
#######################################
_aiter_preflight_head_ok() {
  local check_name=$1
  local http_status=$2

  if [[ "${http_status}" != "200" && "${http_status}" != "404" ]]; then
    echo "[AITER-PREBUILT] Preflight ${check_name}: HTTP ${http_status} (failed)" >&2
    exit 1
  fi

  echo "[AITER-PREBUILT] Preflight ${check_name}: HTTP ${http_status} (success)"
}

#######################################
# Preflight for the upload path: ping Artifactory, then send an authenticated
# HEAD request to the probe URL and judge the returned status code.
# Globals:
#   NVTE_AITER_PREBUILT_UPLOAD_TOKEN      (read) bearer token
#   _AITER_PREBUILT_BASE_ACCESS_PROBE_URL (read) set by the URL helper
# Arguments: none
# Outputs:   progress lines on stdout, errors on stderr
# Exits:     1 when the token is missing (other failures come from the helpers)
#######################################
_aiter_check_artifactory_upload() {
  _aiter_set_artifactory_check_urls
  [[ -n "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN:-}" ]] || {
    echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
    exit 1
  }
  _aiter_curl_artifactory_system_ping

  echo "[AITER-PREBUILT] Preflight (upload): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (authenticated) ..."
  local -a head_request=(
    curl -sS -o /dev/null -w "%{http_code}"
    --connect-timeout 25 --max-time 90
    -H "Authorization: Bearer ${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}"
    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}"
  )
  local http_status
  # "|| true" keeps a curl failure from aborting here; whatever status text
  # curl printed is handed to the status check below instead.
  http_status="$("${head_request[@]}" || true)"
  _aiter_preflight_head_ok upload "${http_status}"
}

#######################################
# Preflight for the download path: ping Artifactory, then send a HEAD request
# to the probe URL without any credentials and judge the returned status code.
# Globals:   _AITER_PREBUILT_BASE_ACCESS_PROBE_URL (read) set by the URL helper
# Arguments: none
# Outputs:   progress lines on stdout, errors on stderr
#######################################
_aiter_check_artifactory_download() {
  _aiter_set_artifactory_check_urls
  _aiter_curl_artifactory_system_ping

  echo "[AITER-PREBUILT] Preflight (download): HEAD ${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL} (anonymous) ..."
  local -a head_request=(
    curl -sS -o /dev/null -w "%{http_code}"
    --connect-timeout 25 --max-time 90
    -I "${_AITER_PREBUILT_BASE_ACCESS_PROBE_URL}"
  )
  local http_status
  # "|| true" keeps a curl failure from aborting here; whatever status text
  # curl printed is handed to the status check below instead.
  http_status="$("${head_request[@]}" || true)"
  _aiter_preflight_head_ok download "${http_status}"
}

# Preflight mode: "--preflight --upload" or "--preflight --download" runs only
# the matching reachability check and exits before any packaging logic below.
if [[ "${1:-}" == "--preflight" ]]; then
  shift
  if [[ "${1:-}" == "--upload" ]]; then
    _aiter_check_artifactory_upload
  elif [[ "${1:-}" == "--download" ]]; then
    _aiter_check_artifactory_download
  else
    # Missing or unknown sub-flag: print usage and fail.
    echo "Usage: $(basename "$0") --preflight --upload | --preflight --download" >&2
    exit 1
  fi
  exit 0
fi

# Derive ROCm version and aiter commit -> cache key
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
Expand Down
41 changes: 33 additions & 8 deletions .github/workflows/aiter-prebuilt-upload.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@ on:
description: "Docker image"
required: false
default: ""
workflow_call:
inputs:
docker_image:
description: "Docker image URI from rocm-ci select_image.outputs.image-tag"
required: true
type: string

jobs:
upload:
runs-on: build-only-te
env:
NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}
NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
steps:
- name: Checkout source
uses: actions/checkout@v6
Expand All @@ -22,9 +31,32 @@ jobs:
submodules: recursive
fetch-depth: 0

# Verify this runner can reach Artifactory for uploads
- name: "Preflight: Artifactory upload reachability"
run: |
set -euo pipefail
# Run the upload preflight and surface the result as a workflow annotation.
if ! bash .github/scripts/aiter_prebuild_upload.sh --preflight --upload; then
  echo "::error::Preflight upload reachability failed"
  exit 1
fi
echo "::notice::Preflight upload reachability succeeded"
exit 0

- name: Resolve docker image
id: cfg
run: |
set -euo pipefail
EVENT="${{ github.event_name }}"
if [ "$EVENT" = "workflow_call" ]; then
IMAGE="${{ inputs.docker_image }}"
if [ -z "$IMAGE" ]; then
echo "workflow_call requires non-empty docker_image." >&2
exit 1
fi
echo "Using docker_image from caller."
echo "image=${IMAGE}" >> "$GITHUB_OUTPUT"
exit 0
fi
IMAGE="${{ inputs.docker_image }}"
if [ -z "$IMAGE" ]; then
IMAGE="${{ vars.DEV_DOCKER_IMAGE }}"
Expand All @@ -33,7 +65,7 @@ jobs:
echo "No docker image provided and vars.DEV_DOCKER_IMAGE is empty." >&2
exit 1
fi
echo "image=${IMAGE}" >> $GITHUB_OUTPUT
echo "image=${IMAGE}" >> "$GITHUB_OUTPUT"

- name: Pull docker image
run: docker pull ${{ steps.cfg.outputs.image }}
Expand All @@ -50,19 +82,12 @@ jobs:
${{ steps.cfg.outputs.image }}

- name: Build and upload aiter prebuilt
env:
NVTE_AITER_PREBUILT_BASE_URL: https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
NVTE_AITER_PREBUILT_UPLOAD_TOKEN: ${{ secrets.AITER_ARTIFACTORY_TOKEN }}
run: |
docker exec \
-e NVTE_AITER_PREBUILT_BASE_URL=${NVTE_AITER_PREBUILT_BASE_URL} \
-e NVTE_AITER_PREBUILT_UPLOAD_TOKEN=${NVTE_AITER_PREBUILT_UPLOAD_TOKEN} \
te-aiter-upload bash -c "$(cat <<'EOF'
set -ex
if [ -z "${NVTE_AITER_PREBUILT_UPLOAD_TOKEN}" ]; then
echo "Missing secrets.AITER_ARTIFACTORY_TOKEN" >&2
exit 1
fi
export HIP_PATH=""
git config --global --add safe.directory '*'
bash .github/scripts/aiter_prebuild_upload.sh --build
Expand Down
69 changes: 67 additions & 2 deletions .github/workflows/rocm-ci-dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,74 @@ name: PR Automatic CI

on:
pull_request:
types: [ labeled, synchronize, reopened ]
# Include opened: push-only types miss PR creation (first commit uses synchronize only after push).
types: [ opened, labeled, synchronize, reopened ]

permissions:
contents: read
pull-requests: read


jobs:
# To determine whether to upload AITER prebuilt to Artifactory
aiter_prebuilt_upload_trigger:
runs-on: ubuntu-latest
outputs:
trigger_aiter_upload: ${{ steps.set.outputs.trigger_aiter_upload }}
steps:
- name: Detect PR changes under 3rdparty/aiter
uses: dorny/paths-filter@v4
id: paths
if: github.event.action == 'synchronize'
with:
# Include gitlink path: submodule bumps appear as `3rdparty/aiter`, not under **.
filters: |
aiter:
- '3rdparty/aiter/**'
- '3rdparty/aiter'

- name: Detect skip_aiter_upload label
id: skip_label
uses: actions/github-script@v8
with:
script: |
const labels = context.payload.pull_request?.labels || [];
const skip = labels.some((l) => l.name === 'skip_aiter_upload');
core.info(`skip_aiter_upload label : ${skip}`);
core.setOutput('skip', skip ? 'true' : 'false');

- name: Set trigger_aiter_upload from paths and labels
id: set
run: |
set -euo pipefail
# Decide whether the AITER prebuilt upload should run for this PR event and
# publish the decision as the step output "trigger_aiter_upload".
# The upload is requested only when all of the following hold:
#   1. the PR action is "synchronize",
#   2. the PR does not carry the skip_aiter_upload label,
#   3. the paths filter reported changes under 3rdparty/aiter.
ACTION='${{ github.event.action }}'
echo "PR action=${ACTION}"

if [ "$ACTION" != "synchronize" ]; then
  echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
  echo "Not synchronize - trigger_aiter_upload = false"
  exit 0
fi

SKIP='${{ steps.skip_label.outputs.skip }}'
echo "skip_aiter_upload label : ${SKIP}"

if [ "$SKIP" = 'true' ]; then
  echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
  echo "skip_aiter_upload label set - trigger_aiter_upload = false"
  exit 0
fi

AITER_PATHS='${{ steps.paths.outputs.aiter }}'

# The paths filter output is the string "true" when a changed file matched.
# The comparison was previously made against "false", which inverted the
# decision: uploads ran for PRs that did not touch 3rdparty/aiter and were
# skipped for PRs that did. An empty output is treated as "no change".
if [ "$AITER_PATHS" = "true" ]; then
  echo "trigger_aiter_upload=true" >> "$GITHUB_OUTPUT"
  echo "3rdparty/aiter changed on PR - trigger_aiter_upload = true"
else
  echo "trigger_aiter_upload=false" >> "$GITHUB_OUTPUT"
  echo "No 3rdparty/aiter changes on PR - trigger_aiter_upload = false"
fi

determine_level:
runs-on: ubuntu-latest
outputs:
Expand Down Expand Up @@ -53,9 +115,12 @@ jobs:
# - A commit was pushed with existing ci-level label(s)
# - The PR was reopened or opened with existing ci-level label(s)
if: ${{ needs.determine_level.outputs.test_level != '' }}
needs: determine_level
needs: [determine_level, aiter_prebuilt_upload_trigger]
name: CI Level ${{ needs.determine_level.outputs.test_level }}
uses: ./.github/workflows/rocm-ci.yml
secrets: inherit
with:
test_level: ${{ needs.determine_level.outputs.test_level }}
trigger_aiter_upload: ${{ needs.aiter_prebuilt_upload_trigger.outputs.trigger_aiter_upload == 'true' }}
# true = select_image + optional AITER upload only (skip wheels/GPU). Keep false on default branch.
aiter_flow_test_only: true
33 changes: 32 additions & 1 deletion .github/workflows/rocm-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ on:
required: false
default: false
type: boolean
trigger_aiter_upload:
description: 'True when 3rdparty/aiter changed on the PR (set by rocm-ci-dispatch)'
required: false
default: false
type: boolean
aiter_flow_test_only:
description: 'Skip wheel build + GPU tests (AITER path validation only). Cannot use env in job if; use this input.'
required: false
default: false
type: boolean
workflow_dispatch:
inputs:
test_level:
Expand All @@ -40,9 +50,14 @@ on:
description: 'DEBUG: Use config.json from current source branch instead of dev'
type: boolean
default: false
aiter_flow_test_only:
description: 'Skip wheel build + GPU tests (AITER path validation only)'
type: boolean
default: false

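# Single concurrency anchor for this PR/branch pipeline. Nested reusable workflows
# should not declare a concurrency group of their own (sharing one with the parent
# can trigger GitHub's deadlock detection).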
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: te-rocm-${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || github.ref }}
cancel-in-progress: true

env:
Expand Down Expand Up @@ -99,13 +114,28 @@ jobs:
echo "Selected image: $IMAGE_TO_USE"
echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT

upload_aiter_prebuilt:
name: Build and upload AITER prebuilt
needs: select_image
# Gate on inputs only: reusable runs from PR may report github.event_name as pull_request, not workflow_call.
# push-triggered runs leave inputs.trigger_aiter_upload unset/false (default).
if: ${{ inputs.trigger_aiter_upload == true || inputs.trigger_aiter_upload == 'true' }}
uses: ./.github/workflows/aiter-prebuilt-upload.yml
with:
docker_image: ${{ needs.select_image.outputs.image-tag }}
secrets: inherit

build:
# Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`.
needs: [select_image, upload_aiter_prebuilt]
# job.if cannot use env.* — use inputs.aiter_flow_test_only; push runs skip that input (full CI).
if: ${{ (github.event_name == 'push' || inputs.aiter_flow_test_only != true) && always() && needs.select_image.result == 'success' && (needs.upload_aiter_prebuilt.result == 'skipped' || needs.upload_aiter_prebuilt.result == 'success') }}
uses: ./.github/workflows/rocm-wheels-build.yml
secrets: inherit

sgpu_tests:
name: sGPU Tests (${{ matrix.arch_label }})
if: ${{ github.event_name == 'push' || inputs.aiter_flow_test_only != true }}
needs: [select_image, build]
timeout-minutes: 360
runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }}
Expand Down Expand Up @@ -307,6 +337,7 @@ jobs:

mgpu_tests:
name: mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})
if: ${{ github.event_name == 'push' || inputs.aiter_flow_test_only != true }}
needs: [select_image, build]
timeout-minutes: 360
runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }}
Expand Down
20 changes: 19 additions & 1 deletion .github/workflows/rocm-wheels-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,17 @@ env:
DOCKER_IMAGE_NAME: te-rocm-manylinux-x86
MANYLINUX_PLATFORM: manylinux_2_28_x86_64

# No workflow-level concurrency: rocm-ci.yml already gates PRs; sharing the same group
# with a parent reusable workflow causes GitHub deadlock detection.

# ─────────────────────────────────────────────────────────────────────────────
jobs:

build-rocm-wheels:
name: Build ROCm Docker image and TransformerEngine wheels
runs-on: build-only-te
env:
NVTE_AITER_PREBUILT_BASE_URL: ${{ vars.NVTE_AITER_PREBUILT_BASE_URL }}

steps:
- name: Checkout repository
Expand All @@ -89,6 +94,19 @@ jobs:
3rdparty/QoLA \
3rdparty/hipify_torch

# Verify this runner can reach Artifactory for anonymous prebuilt downloads
- name: "Preflight: Artifactory download reachability"
if: ${{ inputs.use_prebuilt_aiter }}
continue-on-error: true
run: |
set -euo pipefail
# Run the anonymous download preflight and surface the result as a workflow
# annotation. A failure is reported as a warning and a non-zero exit status.
if ! bash .github/scripts/aiter_prebuild_upload.sh --preflight --download; then
  echo "::warning::Preflight download reachability failed"
  exit 1
fi
echo "::notice::Preflight download reachability succeeded"
exit 0

- name: Derive Docker image tag
id: set-tag
run: |
Expand Down Expand Up @@ -187,7 +205,7 @@ jobs:
# The container writes all wheels and logs under /wheelhouse.
- name: Build TransformerEngine wheels
run: |
NVTE_AITER_PREBUILT_BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts"
set -euo pipefail
docker run --rm \
--env LOCAL_TREE_BUILD=1 \
--env NVTE_SKIP_SUBMODULE_CHECKS_DURING_BUILD=1 \
Expand Down
Loading