Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
321 changes: 321 additions & 0 deletions .github/actions/ecr-build-push-pull/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

name: 'ECR Build-Push-Pull'
description: >
Builds a Docker image and pushes it to ECR, using ECR as the layer cache.
If the image already exists in ECR (same tag), pulls it instead of building.
Drop-in replacement for docker-build/action.yml with ECR-backed caching.

# Inputs of the composite action. Every entry lists description, required and
# (where present) default in the same order, and every description is quoted.
inputs:
  # Local tag for the built/pulled image. The ECR tag is derived from it by
  # replacing ':' and '/' with '-'.
  image-tag:
    description: 'Tag for the Docker image (e.g. my-image:latest).'
    required: true
  # Forwarded to the Dockerfile as the ISAACSIM_BASE_IMAGE_ARG build-arg.
  isaacsim-base-image:
    description: 'IsaacSim base image (passed as ISAACSIM_BASE_IMAGE_ARG build-arg).'
    required: true
  # Forwarded to the Dockerfile as the ISAACSIM_VERSION_ARG build-arg.
  isaacsim-version:
    description: 'IsaacSim version (passed as ISAACSIM_VERSION_ARG build-arg).'
    required: true
  dockerfile-path:
    description: 'Path to Dockerfile, relative to the repository root.'
    required: false
    default: 'docker/Dockerfile.base'
  # The list items are indented deeper than the surrounding text so the folded
  # scalar keeps each resolution step on its own line.
  ecr-url:
    description: >
      ECR repository URL (e.g. "123456789.dkr.ecr.us-west-2.amazonaws.com/my-repo").
      Resolved in the following order:
        1. ecr-url input, if provided.
        2. ECR_CACHE_URL environment variable on the runner.
        3. SSM parameter /github-runner/<instance-id>/ecr-cache-url.
        4. If still empty, ECR cache is skipped and the image is built locally.
    required: false
    default: ''
  # Tag of the registry-side BuildKit layer cache image inside the ECR repository.
  cache-tag:
    description: 'Tag used for the ECR layer cache image (e.g. "cache-base", "cache-curobo").'
    required: false
    default: 'cache'
runs:
using: composite
steps:

##### 1: Setup docker config + Login to nvcr.io #####

# Create a temp docker config with credsStore disabled before any login.
# The runner's credential store backend is broken ("not implemented") and
# causes all docker login calls to fail unless we bypass it upfront.
# The temp config is exported as DOCKER_CONFIG so all subsequent steps
# (including ECR login in step 3) inherit it automatically.

- name: Setup docker config and login to nvcr.io
  shell: bash
  env:
    # Handed to the script through the environment rather than being expanded
    # into the script text, so the key never appears in the generated shell
    # source or on a process command line.
    NGC_API_KEY: ${{ env.NGC_API_KEY }}
  run: |
    # Build a throwaway docker config directory with the credential store disabled.
    DOCKER_CONFIG_DIR=$(mktemp -d)
    USER_DOCKER_CONFIG="${HOME}/.docker/config.json"
    if [ -f "${USER_DOCKER_CONFIG}" ]; then
      # Copy the existing config, blanking credsStore and dropping credHelpers.
      # Paths are passed as argv so they are never parsed as Python source.
      python3 -c 'import json, sys; cfg = json.load(open(sys.argv[1])); cfg["credsStore"] = ""; cfg.pop("credHelpers", None); json.dump(cfg, open(sys.argv[2], "w"))' \
        "${USER_DOCKER_CONFIG}" "${DOCKER_CONFIG_DIR}/config.json"
    else
      echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json"
    fi

    # Persist for later steps (GITHUB_ENV) and apply to the rest of this step (export).
    echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "$GITHUB_ENV"
    export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}"

    if [ -n "${NGC_API_KEY:-}" ]; then
      echo "🔵 Logging into nvcr.io..."
      # '$oauthtoken' is the literal username; the key is fed on stdin so it is
      # not visible in the process list.
      printf '%s' "${NGC_API_KEY}" | docker login --username '$oauthtoken' --password-stdin nvcr.io
    else
      echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)"
    fi

##### 2: Resolve ECR URL #####

# Tries: explicit input >> ECR_CACHE_URL env var >> SSM parameter on EC2.
# Exports ECR_URL to GITHUB_ENV and sets output `available`.

- name: Resolve ECR URL
  id: resolve-ecr
  shell: bash
  env:
    INPUT_ECR_URL: ${{ inputs.ecr-url }}
  run: |
    # Resolution order: ecr-url input, then ECR_CACHE_URL env var, then the
    # per-instance SSM parameter. On success ECR_URL is exported to GITHUB_ENV
    # and the step output `available` is set to true.

    # EC2 instance metadata endpoint. Requests are bounded by short timeouts so
    # a runner without a reachable metadata service cannot stall this step.
    IMDS_BASE="http://169.254.169.254/latest"
    imds_curl() {
      curl -sf --connect-timeout 2 --max-time 5 "$@"
    }

    ECR_URL="${INPUT_ECR_URL:-}"

    if [ -z "${ECR_URL}" ]; then
      echo "🔵 ecr-url input not set, trying ECR_CACHE_URL env var..."
      ECR_URL="${ECR_CACHE_URL:-}"
      if [ -n "${ECR_URL}" ]; then
        echo "🟢 Using ECR_CACHE_URL env var: ${ECR_URL}"
      fi
    fi

    if [ -z "${ECR_URL}" ]; then
      echo "🔵 ECR_CACHE_URL env var not set, trying SSM..."
      # Each lookup is best-effort: a failure leaves the variable empty.
      IMDS_TOKEN=$(imds_curl -X PUT "${IMDS_BASE}/api/token" \
        -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") || true
      INSTANCE_ID=$(imds_curl -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
        "${IMDS_BASE}/meta-data/instance-id") || true
      INSTANCE_REGION=$(imds_curl -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
        "${IMDS_BASE}/meta-data/placement/region") || true

      if [ -n "${INSTANCE_ID}" ]; then
        # Any aws failure (missing parameter, no permissions, empty region)
        # is treated as "no ECR URL configured".
        ECR_URL=$(aws ssm get-parameter \
          --name "/github-runner/${INSTANCE_ID}/ecr-cache-url" \
          --region "${INSTANCE_REGION}" \
          --query 'Parameter.Value' --output text 2>/dev/null) || ECR_URL=""
        if [ -n "${ECR_URL}" ]; then
          echo "🟢 Resolved ECR URL from SSM (/github-runner/${INSTANCE_ID}/ecr-cache-url): ${ECR_URL}"
        else
          echo "🔵 SSM parameter not found for instance ${INSTANCE_ID}"
        fi
      else
        echo "🔵 Not running on EC2 or IMDS unavailable, skipping SSM lookup"
      fi
    fi

    if [ -n "${ECR_URL}" ]; then
      echo "ECR_URL=${ECR_URL}" >> "$GITHUB_ENV"
      echo "available=true" >> "$GITHUB_OUTPUT"
    else
      # Written explicitly so the output is always defined; consumers only
      # compare against 'true', so this does not change their behaviour.
      echo "available=false" >> "$GITHUB_OUTPUT"
      echo "🟠 ECR URL cannot be resolved. Building locally without ECR cache."
    fi

##### 3: Setup ECR authentication #####

# Validates the ECR URL, derives ECR image tags, and logs into ECR.
# DOCKER_CONFIG (with credsStore disabled) is already set by step 1.

- name: Setup ECR authentication
  if: steps.resolve-ecr.outputs.available == 'true'
  shell: bash
  env:
    # Inputs reach the script as environment variables instead of being
    # expanded into the script text, so their content cannot be run as shell.
    INPUT_IMAGE_TAG: ${{ inputs.image-tag }}
    INPUT_CACHE_TAG: ${{ inputs.cache-tag }}
  run: |
    # Registry host is everything before the first '/' of the repository URL.
    REGISTRY="${ECR_URL%%/*}"

    # Anchored match of <account-id>.dkr.ecr.<region>.amazonaws.com, with an
    # optional .cn suffix; the region is the first capture group.
    ECR_HOST_RE='^[^.]+\.dkr\.ecr\.([^.]+)\.amazonaws\.com(\.cn)?$'
    if [[ "${REGISTRY}" =~ ${ECR_HOST_RE} ]]; then
      AWS_REGION="${BASH_REMATCH[1]}"
    else
      echo "🔴 Invalid ECR URL - cannot extract AWS region: ${ECR_URL}"
      echo "🔴 Expected format: <account-id>.dkr.ecr.<region>.amazonaws.com/<repo>"
      exit 1
    fi

    # The ECR tag is the local image tag with ':' and '/' replaced by '-'.
    ECR_TAG=$(printf '%s' "${INPUT_IMAGE_TAG}" | tr ':/' '--')
    ECR_IMAGE="${ECR_URL}:${ECR_TAG}"
    CACHE_IMAGE="${ECR_URL}:${INPUT_CACHE_TAG}"

    # Make both image references available to the following steps.
    echo "ECR_IMAGE=${ECR_IMAGE}" >> "$GITHUB_ENV"
    echo "CACHE_IMAGE=${CACHE_IMAGE}" >> "$GITHUB_ENV"

    echo "🔵 Logging into ECR registry..."
    aws ecr get-login-password --region "${AWS_REGION}" | \
      docker login --username AWS --password-stdin "${REGISTRY}"

##### 4: Check if exact image exists in ECR #####

# Lightweight manifest check - fetches only the image manifest (~KB),
# not the actual layers. If the exact per-commit image already exists
# in ECR, sets output `hit: true` to skip all subsequent build/push steps.

- name: Check exact image in ECR
  id: pull-exact
  if: steps.resolve-ecr.outputs.available == 'true'
  shell: bash
  run: |
    # Probe the registry for the manifest of the per-commit tag. A miss leaves
    # the `hit` output unset; a hit records hit=true.
    echo "🔵 Checking if commit-tagged image exists in ECR >> ${ECR_IMAGE}"

    if ! docker manifest inspect "${ECR_IMAGE}" &>/dev/null; then
      echo "🟠 Image ${ECR_IMAGE} not found in ECR, will try deps-cache strategy..."
      exit 0
    fi

    echo "🟢 Commit-tagged image found in ECR, skipping build!"
    echo "hit=true" >> "$GITHUB_OUTPUT"

# Pull the image when the manifest check succeeded but the image is not
# available locally (test jobs need it for `docker run`). Build jobs
# that only push to ECR will already have the image or don't need it.
- name: Pull exact image from ECR
  if: steps.pull-exact.outputs.hit == 'true'
  shell: bash
  env:
    # Passed via the environment so the tag is never expanded into the script text.
    INPUT_IMAGE_TAG: ${{ inputs.image-tag }}
  run: |
    # Nothing to do when the local tag already exists; otherwise pull the
    # per-commit ECR image and re-tag it under the local tag.
    if docker image inspect "${INPUT_IMAGE_TAG}" >/dev/null 2>&1; then
      echo "🟢 Image already available locally, skipping pull"
    else
      echo "🔵 Pulling ${ECR_IMAGE} from ECR..."
      docker pull "${ECR_IMAGE}"
      docker tag "${ECR_IMAGE}" "${INPUT_IMAGE_TAG}"
      echo "🟢 Image pulled and tagged as ${INPUT_IMAGE_TAG}"
    fi

##### 5: Check deps cache #####

# Hashes installation-relevant files + the base image digest to produce a stable
# deps-<hash> ECR tag. If the image exists in ECR, the build job succeeds
# immediately and test jobs pull the deps image with a source volume mount.

# Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install
# inputs change (new packages, new manifests, etc.).

- name: Check deps cache
  id: deps-cache
  if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true'
  shell: bash
  env:
    # Inputs are passed through the environment instead of being expanded into
    # the script text.
    INPUT_DOCKERFILE_PATH: ${{ inputs.dockerfile-path }}
    BASE_IMAGE_REF: "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}"
  run: |
    ##### Deps-hash configuration #####
    # Exact files/dirs whose full content is hashed. The Dockerfile is first.
    DEPS_FILES=(
      "${INPUT_DOCKERFILE_PATH}"
      isaaclab.sh
      environment.yml
      source/isaaclab/isaaclab/cli
    )
    # Manifest files matched repo-wide via git ls-files.
    DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$'

    # Resolve the actual base image digest so a new push of a mutable tag
    # (e.g. latest-develop) invalidates the deps cache automatically.
    BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect "${BASE_IMAGE_REF}" \
      --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true)
    if [ -n "${BASE_IMAGE_DIGEST}" ]; then
      BASE_IMAGE_UNIQ_ID="${BASE_IMAGE_REF}:${BASE_IMAGE_DIGEST}"
    else
      echo "🟠 Could not resolve base image digest, falling back to tag string"
      BASE_IMAGE_UNIQ_ID="${BASE_IMAGE_REF}"
    fi

    echo "🔵 Base image ID: ${BASE_IMAGE_UNIQ_ID}"

    # Collect manifest paths one per line into an array so that paths are
    # passed to git verbatim (no word splitting or glob expansion).
    mapfile -t MANIFEST_FILES < <(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true)

    # `git ls-files -s` prints mode, blob id and path for each tracked file
    # under the given pathspecs; hashing that listing fingerprints their
    # content. stderr is left visible so git failures show up in the log.
    FILE_HASH=$(git ls-files -s -- "${DEPS_FILES[@]}" "${MANIFEST_FILES[@]}" \
      | sha256sum | cut -c1-16)
    DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16)
    DEPS_ECR_IMAGE="${ECR_URL}:deps-${DEPS_HASH}"
    echo "🔵 Deps hash: ${DEPS_HASH}"
    echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..."

    # Lightweight manifest check - fetches only the image manifest (~KB),
    # not the actual layers, so this completes in seconds.
    if docker manifest inspect "${DEPS_ECR_IMAGE}" >/dev/null 2>&1; then
      echo "🟢 Deps cache HIT!!! Image exists in ECR: ${DEPS_ECR_IMAGE}"
      # Create a commit-tagged alias pointing to the same manifest (registry-side,
      # no layer download). Test jobs will pull this tag normally.
      echo "🔵 Tagging as commit image ${ECR_IMAGE}..."
      docker buildx imagetools create -t "${ECR_IMAGE}" "${DEPS_ECR_IMAGE}"
      echo "🟢 Tagged ${ECR_IMAGE} >> ${DEPS_ECR_IMAGE}"
      echo "deps-cache-hit=true" >> "$GITHUB_OUTPUT"
    else
      echo "🟠 Deps cache MISS 😿😿😿 (${DEPS_HASH}). Will build now. 🐢🐢🐢"
      # Hand the target tag to the "Push deps tag" step via GITHUB_ENV.
      echo "DEPS_ECR_IMAGE=${DEPS_ECR_IMAGE}" >> "$GITHUB_ENV"
      echo "PUSH_DEPS_IMAGE=true" >> "$GITHUB_ENV"
    fi

##### 6: Full build #####

# Runs when neither the exact image nor the deps cache was available.
# Uses ECR layer cache (--cache-from/--cache-to) when ECR is available.

- name: Full build
  if: steps.pull-exact.outputs.hit != 'true' && steps.deps-cache.outputs.deps-cache-hit != 'true'
  shell: bash
  env:
    # Inputs are passed through the environment instead of being expanded into
    # the script text.
    INPUT_IMAGE_TAG: ${{ inputs.image-tag }}
    INPUT_DOCKERFILE_PATH: ${{ inputs.dockerfile-path }}
    INPUT_ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }}
    INPUT_ISAACSIM_VERSION: ${{ inputs.isaacsim-version }}
  run: |
    # Arguments shared by every build, with or without ECR.
    BUILD_ARGS=(
      --progress=plain
      --platform linux/amd64
      -f "${INPUT_DOCKERFILE_PATH}"
      --build-arg "ISAACSIM_BASE_IMAGE_ARG=${INPUT_ISAACSIM_BASE_IMAGE}"
      --build-arg "ISAACSIM_VERSION_ARG=${INPUT_ISAACSIM_VERSION}"
      --build-arg "ISAACSIM_ROOT_PATH_ARG=/isaac-sim"
      --build-arg "ISAACLAB_PATH_ARG=/workspace/isaaclab"
      --build-arg "DOCKER_USER_HOME_ARG=/root"
      -t "${INPUT_IMAGE_TAG}"
    )
    # ECR_URL is only present when it was resolved earlier; in that case use
    # the registry-side layer cache and also tag the result for the ECR push.
    if [ -n "${ECR_URL:-}" ]; then
      BUILD_ARGS+=(
        --cache-from "type=registry,ref=${CACHE_IMAGE}"
        --cache-to "type=registry,ref=${CACHE_IMAGE},mode=max"
        -t "${ECR_IMAGE}"
      )
    fi

    # Dedicated docker-container builder for this run/job; fall back to
    # selecting it if creation fails, and remove it when the script exits.
    BUILDER_NAME="ci-builder-${{ github.run_id }}-${{ github.job }}"
    docker buildx create --use --driver docker-container --name "${BUILDER_NAME}" \
      || docker buildx use "${BUILDER_NAME}"
    trap 'docker buildx rm "${BUILDER_NAME}" || true' EXIT

    echo "🔵 Building ${INPUT_IMAGE_TAG}..."
    # --load imports the build result into the local docker image store.
    docker buildx build --load "${BUILD_ARGS[@]}" .

##### 7: Push to ECR #####

# Pushes the per-commit ECR image after a successful full build.
# Skipped if the image already existed in ECR (4) or was aliased from the deps cache (5).

- name: Push to ECR
  shell: bash
  if: >
    steps.resolve-ecr.outputs.available == 'true' &&
    steps.pull-exact.outputs.hit != 'true' &&
    steps.deps-cache.outputs.deps-cache-hit != 'true'
  run: |
    # Upload the per-commit image reference exported earlier as ECR_IMAGE.
    target_image="${ECR_IMAGE}"
    printf '%s\n' "🔵 Pushing ${target_image} to ECR..."
    docker push "${target_image}"
    printf '%s\n' "🟢 Pushed ${target_image}"

##### 8: Push deps tag #####

# Tags the freshly built image as deps-<hash> so future runs with identical
# install inputs hit the fast path (step 5) instead of doing a full build.

- name: Push deps tag
  if: env.PUSH_DEPS_IMAGE == 'true'
  shell: bash
  env:
    # Passed via the environment so the tag is never expanded into the script text.
    INPUT_IMAGE_TAG: ${{ inputs.image-tag }}
  run: |
    # PUSH_DEPS_IMAGE and DEPS_ECR_IMAGE are written to GITHUB_ENV by the
    # "Check deps cache" step on a cache miss. Re-tag the local image under
    # the deps-<hash> tag and push it.
    echo "🔵 Pushing deps image for future cache hits: ${DEPS_ECR_IMAGE}"
    docker tag "${INPUT_IMAGE_TAG}" "${DEPS_ECR_IMAGE}"
    docker push "${DEPS_ECR_IMAGE}"

##### 9: Cleanup docker config #####

- name: Cleanup docker config
  if: always()
  shell: bash
  run: |
    # Delete the docker config directory referenced by DOCKER_CONFIG.
    # Nothing is removed when the variable is empty or is not a directory.
    config_dir="${DOCKER_CONFIG}"
    if [[ -z "${config_dir}" || ! -d "${config_dir}" ]]; then
      exit 0
    fi
    rm -rf "${config_dir}"
38 changes: 34 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,32 @@ env:
DOCKER_IMAGE_TAG: isaac-lab-dev:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }}

jobs:
# Build the test image once and push it to ECR so the test jobs below pull it
# instead of each rebuilding it. The action falls back to a local build when
# ECR is unavailable (e.g. a runner without the SSM ecr-cache-url parameter).
# Job that invokes the ecr-build-push-pull composite action once; the test
# jobs in this workflow declare `needs: [build]` and therefore run after it.
build:
name: Build Docker Image
# Requires a runner carrying both the self-hosted and gpu labels.
runs-on: [self-hosted, gpu]
timeout-minutes: 180

steps:
# fetch-depth: 0 requests the full history; lfs: true also downloads Git LFS files.
- name: Checkout Code
uses: actions/checkout@v4
with:
fetch-depth: 0
lfs: true

# dockerfile-path and cache-tag override the action defaults
# ('docker/Dockerfile.base' and 'cache').
# NOTE(review): ISAACSIM_BASE_IMAGE / ISAACSIM_BASE_VERSION are presumably
# defined in the workflow-level env block outside this view - confirm.
- name: Build and push image to ECR
uses: ./.github/actions/ecr-build-push-pull
with:
image-tag: ${{ env.DOCKER_IMAGE_TAG }}
isaacsim-base-image: ${{ env.ISAACSIM_BASE_IMAGE }}
isaacsim-version: ${{ env.ISAACSIM_BASE_VERSION }}
dockerfile-path: docker/Dockerfile.curobo
cache-tag: cache-main-curobo

test-isaaclab-tasks:
needs: [build]
runs-on: [self-hosted, gpu]
timeout-minutes: 180
continue-on-error: true
Expand All @@ -42,12 +67,14 @@ jobs:
fetch-depth: 0
lfs: true

- name: Build Docker Image
uses: ./.github/actions/docker-build
- name: Pull image from ECR
uses: ./.github/actions/ecr-build-push-pull
with:
image-tag: ${{ env.DOCKER_IMAGE_TAG }}
isaacsim-base-image: ${{ env.ISAACSIM_BASE_IMAGE }}
isaacsim-version: ${{ env.ISAACSIM_BASE_VERSION }}
dockerfile-path: docker/Dockerfile.curobo
cache-tag: cache-main-curobo

- name: Run IsaacLab Tasks Tests
uses: ./.github/actions/run-tests
Expand Down Expand Up @@ -91,6 +118,7 @@ jobs:
fi

test-general:
needs: [build]
runs-on: [self-hosted, gpu]
timeout-minutes: 180

Expand All @@ -101,12 +129,14 @@ jobs:
fetch-depth: 0
lfs: true

- name: Build Docker Image
uses: ./.github/actions/docker-build
- name: Pull image from ECR
uses: ./.github/actions/ecr-build-push-pull
with:
image-tag: ${{ env.DOCKER_IMAGE_TAG }}
isaacsim-base-image: ${{ env.ISAACSIM_BASE_IMAGE }}
isaacsim-version: ${{ env.ISAACSIM_BASE_VERSION }}
dockerfile-path: docker/Dockerfile.curobo
cache-tag: cache-main-curobo

- name: Run General Tests
id: run-general-tests
Expand Down
Loading