# Add support for axlearn
# #257
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: NCCL on Kubernetes

# Triggers: a daily cron run at 08:30 (GitHub evaluates cron schedules in UTC),
# pull-request activity that touches anything other than Markdown files, and
# manual dispatch with an optional container override.
on:
  schedule:
    - cron: "30 8 * * *"
  pull_request:
    types: [opened, reopened, ready_for_review, synchronize]
    paths-ignore: ['**.md']
  workflow_dispatch:
    inputs:
      # cuda-dl-base ships the NCCL tests, whereas the vanilla nvidia/cuda
      # images do not. Once JAX-Toolbox moves to cuda-dl-base, this workflow
      # ought to be changed to test one of the JAX-Toolbox containers instead.
      CONTAINER:
        description: Container to test, this is assumed to already contain the NCCL tests e.g. cuda-dl-base or derived
        type: string
        required: false
        default: ''
# Runs are grouped by workflow name plus the PR head ref (or the run id when
# there is no head ref). An in-progress run in the same group is cancelled
# unless the ref is refs/heads/main.
concurrency:
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  group: "${{ github.workflow }}-${{ github.head_ref || github.run_id }}"
# Token scopes granted to every job in this workflow.
permissions:
  # to cancel previous workflows
  actions: write
  # to fetch code
  contents: read
  # to upload container
  packages: write
jobs:
  # Calls the reusable _build.yaml workflow with the
  # mpi-operator-compatible-base Dockerfile. BASE_IMAGE is the
  # workflow_dispatch CONTAINER input when one is given, otherwise the pinned
  # cuda-dl-base image.
  build-mpi-operator-compatible-base:
    uses: ./.github/workflows/_build.yaml
    with:
      ARCHITECTURE: amd64
      ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
      BADGE_FILENAME: badge-mpi-operator-compatible-base-build
      # Not important; this image is never published. Quoted so that no YAML
      # parser tries to read the placeholder as a timestamp.
      BUILD_DATE: '0000-00-00'
      BASE_IMAGE: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:24.12-cuda12.6-devel-ubuntu24.04' }}
      CONTAINER_NAME: mpi-operator-compatible-base
      DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
      RUNNER_SIZE: small
    secrets: inherit

  # Runs one Kubernetes job per NCCL test binary listed in the matrix, using
  # the image tag reported by the build job above.
  nccl-tests:
    needs: build-mpi-operator-compatible-base
    runs-on: eks
    strategy:
      matrix:
        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
    env:
      BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
      TEST_NAME: ${{ matrix.test }}
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      # Derive per-run, per-test resource names and publish them as step
      # outputs. Underscores in the test name are replaced with dashes.
      - name: Modify variables
        id: var
        shell: bash
        run: |
          export JOB_NAME="nccl-test-${{ github.run_id }}-${TEST_NAME//_/-}"
          echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_OUTPUT"
          echo "LAUNCHER_NAME=${JOB_NAME}-launcher" >> "$GITHUB_OUTPUT"
          echo "TOKEN_NAME=nccl-test-${JOB_NAME}-token" >> "$GITHUB_OUTPUT"

      - name: GHCR login and store K8s secret
        uses: ./.github/actions/ghcr-login
        with:
          docker-username: ${{ github.repository_owner }}
          docker-password: ${{ secrets.GITHUB_TOKEN }}
          token-name: ${{ steps.var.outputs.TOKEN_NAME }}

      # Patch the checked-in MPIJob template in place with this run's names,
      # image, pull secret and test binary (written to command[3]).
      - name: Configure Kubernetes job
        shell: bash
        run: |
          export JOB_NAME="${{ steps.var.outputs.JOB_NAME }}"
          export LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          export TOKEN_NAME="${{ steps.var.outputs.TOKEN_NAME }}"
          export TEST_NAME="${{ env.TEST_NAME }}"
          export WORKER_NAME="${JOB_NAME}-worker"
          # Use yq to set our fields in-place
          yq -i '.metadata.name = strenv(JOB_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
            | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
            | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
            .github/eks-workflow-files/mpi-nccl-test.yml
          # (Optional) Show diff for debugging
          git diff .github/eks-workflow-files/mpi-nccl-test.yml

      - name: Submit & stream K8s job
        uses: ./.github/actions/submit-k8s-job
        with:
          job-config-file: .github/eks-workflow-files/mpi-nccl-test.yml
          job-name: ${{ steps.var.outputs.LAUNCHER_NAME }}

      # Poll the launcher job until it reports exactly one failed or succeeded
      # pod, then exit with the failed count (0 on success, 1 on failure).
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            # Either field may be empty while the job is still running
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # Use the integer operators: inside [[ ]], `<` and `==` compare
            # strings (lexicographically / by pattern), not numbers.
            if [[ ${total} -lt 1 ]]; then
              sleep 1
            elif [[ ${total} -eq 1 ]]; then
              break
            else
              # If total > 1, that suggests a mismatch that can occur if there's more than one launcher pod
              exit 255
            fi
          done
          exit ${failure}

      # On failure, describe whichever pods belong to the launcher job.
      - name: Debug failed Kubernetes job
        if: ${{ failure() }}
        shell: bash
        run: |
          LAUNCHER_NAME="${{ steps.var.outputs.LAUNCHER_NAME }}"
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            # Left unquoted on purpose so each pod name is a separate argument
            kubectl describe ${pods}
          fi

      # Cleanup steps run regardless of the outcome of earlier steps.
      - name: Delete Kubernetes job
        if: ${{ always() }}
        uses: ./.github/actions/delete-k8s-job
        with:
          job-name: ${{ steps.var.outputs.LAUNCHER_NAME }}

      - name: Delete GitHub Container Registry token
        uses: ./.github/actions/delete-ghcr-token
        if: ${{ always() }}
        with:
          token-name: ${{ steps.var.outputs.TOKEN_NAME }}