Add support for axlearn #259
Workflow file for this run

name: NCCL on Kubernetes
on:
  schedule:
    - cron: '30 8 * * *'
  pull_request:
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    paths-ignore:
      - '**.md'
  workflow_dispatch:
    inputs:
      # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda
      # images do not; when JAX-Toolbox moves to using cuda-dl-base, this workflow ought
      # to be modified to test one of the JAX-Toolbox containers.
      CONTAINER:
        type: string
        description: Container to test; assumed to already contain the NCCL tests (e.g. cuda-dl-base or an image derived from it)
        default: ''
        required: false
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
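# Illustrative evaluation of the expressions above (standard GitHub Actions
# semantics): on a pull request, github.head_ref is the PR branch, so the
# group is per-branch and cancel-in-progress is true (the ref is not
# refs/heads/main), letting a new push cancel the previous run; on scheduled
# or manually dispatched runs, github.head_ref is empty, so github.run_id
# makes every group unique and nothing is cancelled.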
permissions:
  actions: write # to cancel previous workflows
  contents: read # to fetch code
  packages: write # to upload container
jobs:
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      BUILD_DATE: 0000-00-00 # Not important; this image is never published
      BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit
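  # In the BASE_IMAGE expression above, inputs.CONTAINER is the empty string
  # on schedule/pull_request events (and when the dispatch input is left at
  # its default), and '' is falsy in GitHub Actions expressions, so `||`
  # falls back to the pinned cuda-dl-base image in those cases.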
  nccl-tests:
    needs: build-mpi-operator-compatible-base
    runs-on: eks
    strategy:
      matrix:
        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Modify variables
        id: var
        shell: bash
        run: |
          export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}"
          echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_OUTPUT
          echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> $GITHUB_OUTPUT
          echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> $GITHUB_OUTPUT
      - name: GHCR login and store K8s secret
        uses: ./.github/actions/ghcr-login
        with:
          docker-username: ${{ github.repository_owner }}
          docker-password: ${{ secrets.GITHUB_TOKEN }}
          token-name: ${{ steps.var.outputs.TOKEN_NAME }}
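      # ghcr-login is a local composite action; given these inputs it is
      # assumed to perform a registry login and create an image-pull secret
      # named after token-name, roughly equivalent to:
      #   echo "${docker-password}" | docker login ghcr.io -u "${docker-username}" --password-stdin
      #   kubectl create secret docker-registry "${token-name}" \
      #     --docker-server=ghcr.io \
      #     --docker-username="${docker-username}" \
      #     --docker-password="${docker-password}"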
      - name: Configure Kubernetes job
        shell: bash
        run: |
          export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}"
          export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}"
          export TEST_NAME="${{ env.TEST_NAME }}"
          export WORKER_NAME="${JOB_NAME}-worker"
          # Use yq to set our fields in-place
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          # (Optional) Show diff for debugging
          git diff .github/eks-workflow-files/mpi-nccl-test.yml
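      # A sketch of the manifest shape the yq expressions above assume; the
      # checked-in .github/eks-workflow-files/mpi-nccl-test.yml is the source
      # of truth, and the apiVersion and command prefix here are illustrative
      # of the Kubeflow MPI Operator schema rather than taken from this repo:
      #   apiVersion: kubeflow.org/v2beta1
      #   kind: MPIJob
      #   metadata:
      #     name: <JOB_NAME>
      #   spec:
      #     mpiReplicaSpecs:
      #       Launcher:
      #         template:
      #           spec:
      #             imagePullSecrets:
      #               - name: <TOKEN_NAME>
      #             containers:
      #               - name: <LAUNCHER_NAME>
      #                 image: <BASE_IMAGE>
      #                 command: [mpirun, <arg>, <arg>, <TEST_NAME>, ...]  # command[3] is the test binary
      #       Worker:
      #         template:
      #           spec:
      #             imagePullSecrets:
      #               - name: <TOKEN_NAME>
      #             containers:
      #               - name: <WORKER_NAME>
      #                 image: <BASE_IMAGE>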
      - name: Submit & stream K8s job
        uses: ./.github/actions/submit-k8s-job
        with:
          job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml
          job-name: ${{ steps.var.outputs.LAUNCHER_NAME }}
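      # submit-k8s-job is a local composite action; it is assumed to apply
      # the manifest and stream logs, roughly:
      #   kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
      #   kubectl logs --follow job/<LAUNCHER_NAME>
      # job-name is the launcher rather than the MPIJob itself because the
      # MPI Operator creates a batch Job named <JOB_NAME>-launcher for the
      # launcher pod, and that is the resource whose logs and status matter.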
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod
              exit 255
            fi
          done
          exit ${failure}
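      # How the polling above works: the jsonpath template prints
      # "<failed>:<succeeded>", and readarray -d : splits on the colon. While
      # the launcher Job is still running, neither field is set and the output
      # is just ":", so both counts default to 0 and the loop sleeps; "1:"
      # means one failed pod (break, then exit 1 via ${failure}), and ":1"
      # means one succeeded pod (break, then exit 0).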
      - name: Debug failed Kubernetes job
        if: ${{ failure() }}
        shell: bash
        run: |
          LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            kubectl describe ${pods}
          fi
      - name: Delete Kubernetes job
        if: ${{ always() }}
        uses: ./.github/actions/delete-k8s-job
        with:
          job-name: ${{ steps.var.outputs.LAUNCHER_NAME }}
      - name: Delete GitHub Container Registry token
        uses: ./.github/actions/delete-ghcr-token
        if: ${{ always() }}
        with:
          token-name: ${{ steps.var.outputs.TOKEN_NAME }}
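      # The matrix entries are the standard nccl-tests binaries. Purely as an
      # illustration (the flags and sizes are not taken from this workflow),
      # the launcher typically invokes something like:
      #   mpirun -np <ranks> all_reduce_perf_mpi -b 8 -e 16G -f 2 -g 1
      # where -b/-e bound the message sizes, -f is the size multiplication
      # factor between steps, and -g is the number of GPUs per thread
      # (typically 1 per MPI rank).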