# buildspec-ray.yml

version: 0.2

env:
  variables:
    # --- versions that are combined into base-image and output-image tags ---
    RAY_TOOLKIT_VERSION: '0.8.5'
    RAY_TF_FRAMEWORK_VERSION: '2.1.0'
    RAY_TORCH_FRAMEWORK_VERSION: '1.5.0'
    PY_VERSION: '36'

    # --- instance types passed to the integration tests ---
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'

    # --- ECR repositories ---
    # repo holding previously published images, pulled for layer cache
    # (same name as the prod image repo)
    BASE_ECR_REPO: 'sagemaker-rl-ray-container'
    PREPROD_ECR_REPO: 'sagemaker-test'
    PROD_ECR_REPO: 'sagemaker-rl-ray-container'
    # account hosting the framework base images (tf/pytorch) that the rl
    # container images are built from
    FRAMEWORK_BASE_IMAGE_ACCOUNT: '763104351884'

    # --- remote gpu test setup ---
    GITHUB_REPO: 'sagemaker-rl-container'
    SETUP_FILE: 'setup_cmds.sh'
    # single-quoted on purpose: "\n" stays a literal backslash-n here and is
    # turned into real newlines by printf when the setup file is written
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .'


phases:
  pre_build:
    commands:
      # helper provided by the build image (not defined in this file);
      # presumably starts the docker daemon used by the build phase
      - start-dockerd
      # resolve the caller's account id and derive the ECR image URIs
      # (layer-cache source, pre-production, production) used by later commands
      - |
        ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
        BASE_IMAGE="${ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/${BASE_ECR_REPO}"
        PREPROD_IMAGE="${ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/${PREPROD_ECR_REPO}"
        PROD_IMAGE="${ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/${PROD_ECR_REPO}"
        # PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # keep the ssh connection to the remote ec2 server alive during integ tests;
      # longest idle time tolerated: 10 seconds * 300 attempts = 50 minutes
      - |
        printf '%s\n' '  ServerAliveInterval 10' '  ServerAliveCountMax 300' >> ~/.ssh/config
  build:
    commands:
      # install this package in editable mode, upgrading it if already present
      - echo "install"
      - pip3 install --upgrade --editable .
      # Refresh awscli so it stays compatible with the latest botocore release,
      # which otherwise breaks it: https://github.com/boto/boto3/issues/2596
      - pip3 install -U awscli

      # A remote gpu instance is launched only when building in us-west-2.
      # create-key-pair / launch-ec2-instance are helpers from the build image.
      - |
        if [ "$AWS_DEFAULT_REGION" != "us-west-2" ]; then
          echo "skipping launch remote gpu instance"
        else
          echo "launch remote gpu instance"
          create-key-pair
          # strip the leading "ml." from the instance type (ml.p2.xlarge -> p2.xlarge)
          launch-ec2-instance --instance-type "${GPU_INSTANCE_TYPE#ml.}" --ami-name dlami-ubuntu
        fi

      # log docker in to the registry of the account hosting the framework base images
      - $(aws ecr get-login --region $AWS_DEFAULT_REGION --registry-ids $FRAMEWORK_BASE_IMAGE_ACCOUNT --no-include-email)
      # base image repositories, plus a build id made tag-safe by turning ':' into '-'
      - |
        TF_IMAGE="${FRAMEWORK_BASE_IMAGE_ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/tensorflow-training"
        TORCH_IMAGE="${FRAMEWORK_BASE_IMAGE_ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com/pytorch-training"
        BUILD_ID="$(echo "$CODEBUILD_BUILD_ID" | tr ':' '-')"

      # Each step below composes a base-image tag as
      # <framework version>-<processor>-py<PY_VERSION>[-cu101]-<ubuntu release>
      # and pulls it from the framework image repository.

      # tensorflow, cpu
      - echo "pull tf cpu base images"
      - |
        RAY_TF_CPU_BASE_TAG="${RAY_TF_FRAMEWORK_VERSION}-cpu-py${PY_VERSION}-ubuntu18.04"
        docker pull "${TF_IMAGE}:${RAY_TF_CPU_BASE_TAG}"

      # pytorch, cpu
      - echo "pull torch cpu base images"
      - |
        RAY_TORCH_CPU_BASE_TAG="${RAY_TORCH_FRAMEWORK_VERSION}-cpu-py${PY_VERSION}-ubuntu16.04"
        docker pull "${TORCH_IMAGE}:${RAY_TORCH_CPU_BASE_TAG}"

      # tensorflow, gpu
      - echo "pull tf gpu base images"
      - |
        RAY_TF_GPU_BASE_TAG="${RAY_TF_FRAMEWORK_VERSION}-gpu-py${PY_VERSION}-cu101-ubuntu18.04"
        docker pull "${TF_IMAGE}:${RAY_TF_GPU_BASE_TAG}"

      # pytorch, gpu
      - echo "pull torch gpu base images"
      - |
        RAY_TORCH_GPU_BASE_TAG="${RAY_TORCH_FRAMEWORK_VERSION}-gpu-py${PY_VERSION}-cu101-ubuntu16.04"
        docker pull "${TORCH_IMAGE}:${RAY_TORCH_GPU_BASE_TAG}"

      # ray + tensorflow, cpu: build the pre-production image
      - echo "build ray tf preprod cpu images"
      - |
        # stable tag (also the layer-cache source) and a per-build unique tag
        RAY_TF_CPU_TAG="ray-${RAY_TOOLKIT_VERSION}-tf-cpu-py${PY_VERSION}"
        RAY_TF_CPU_TAG_BUILD_ID="${RAY_TF_CPU_TAG}-${BUILD_ID}"

        echo "pulling previous_image $BASE_IMAGE:$RAY_TF_CPU_TAG for layer cache..."
        $(aws ecr get-login --region $AWS_DEFAULT_REGION --registry-ids $ACCOUNT --no-include-email)
        docker pull "${BASE_IMAGE}:${RAY_TF_CPU_TAG}"
        docker build \
          --file "ray/docker/${RAY_TOOLKIT_VERSION}/Dockerfile.tf" \
          --tag "${PREPROD_IMAGE}:${RAY_TF_CPU_TAG_BUILD_ID}" \
          --cache-from "${BASE_IMAGE}:${RAY_TF_CPU_TAG}" \
          --build-arg processor=cpu \
          --build-arg suffix=ubuntu18.04 \
          --build-arg "region=${AWS_DEFAULT_REGION}" \
          .

      # ray + tensorflow, cpu: upload the freshly built image to the preprod repo
      - echo "push ray tf preprod cpu images to ecr"
      - |
        $(aws ecr get-login --region $AWS_DEFAULT_REGION --registry-ids $ACCOUNT --no-include-email)
        docker push "${PREPROD_IMAGE}:${RAY_TF_CPU_TAG_BUILD_ID}"

      # ray + tensorflow, cpu: local integration tests, run only when
      # has-matching-changes (a helper not defined in this file) succeeds
      - echo "run local cpu integration tests for ray tf preprod cpu images"
      - |
        if ! has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          echo "skipping local cpu integration tests"
        else
          pytest test/integration/local \
            --toolkit ray \
            --framework tensorflow \
            --processor cpu \
            --region "${AWS_DEFAULT_REGION}" \
            --docker-base-name "${PREPROD_IMAGE}" \
            --tag "${RAY_TF_CPU_TAG_BUILD_ID}" \
            -k "test_ray"
        fi

      # ray + pytorch, cpu: build the pre-production image
      - echo "build ray torch preprod cpu images"
      - |
        # stable tag (also the layer-cache source) and a per-build unique tag
        RAY_TORCH_CPU_TAG="ray-${RAY_TOOLKIT_VERSION}-torch-cpu-py${PY_VERSION}"
        RAY_TORCH_CPU_TAG_BUILD_ID="${RAY_TORCH_CPU_TAG}-${BUILD_ID}"

        echo "pulling previous_image $BASE_IMAGE:$RAY_TORCH_CPU_TAG for layer cache..."
        $(aws ecr get-login --region $AWS_DEFAULT_REGION --registry-ids $ACCOUNT --no-include-email)
        docker pull "${BASE_IMAGE}:${RAY_TORCH_CPU_TAG}"
        docker build \
          --file "ray/docker/${RAY_TOOLKIT_VERSION}/Dockerfile.torch" \
          --tag "${PREPROD_IMAGE}:${RAY_TORCH_CPU_TAG_BUILD_ID}" \
          --cache-from "${BASE_IMAGE}:${RAY_TORCH_CPU_TAG}" \
          --build-arg processor=cpu \
          --build-arg suffix=ubuntu16.04 \
          --build-arg "region=${AWS_DEFAULT_REGION}" \
          .

      # ray + pytorch, cpu: upload the freshly built image to the preprod repo
      - echo "push ray torch preprod cpu images to ecr"
      - |
        $(aws ecr get-login --region $AWS_DEFAULT_REGION --registry-ids $ACCOUNT --no-include-email)
        docker push "${PREPROD_IMAGE}:${RAY_TORCH_CPU_TAG_BUILD_ID}"

      # ray + pytorch, cpu: local integration tests, run only when
      # has-matching-changes (a helper not defined in this file) succeeds
      - echo "run local cpu integration tests for ray torch preprod cpu images"
      - |
        if ! has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          echo "skipping local cpu integration tests"
        else
          pytest test/integration/local \
            --toolkit ray \
            --framework torch \
            --processor cpu \
            --region "${AWS_DEFAULT_REGION}" \
            --docker-base-name "${PREPROD_IMAGE}" \
            --tag "${RAY_TORCH_CPU_TAG_BUILD_ID}" \
            -k "test_ray"
        fi

      # build ray tf preprod gpu images
      - echo "build ray tf preprod gpu images"
      - |
        # stable tag (also used as layer-cache source) and a per-build unique tag
        RAY_TF_GPU_TAG="ray-$RAY_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION"
        RAY_TF_GPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"

        echo "pulling previous_image $BASE_IMAGE:$RAY_TF_GPU_TAG for layer cache..."
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker pull $BASE_IMAGE:$RAY_TF_GPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TF_GPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.tf \
                     --build-arg processor=gpu \
                     --build-arg suffix=cu101-ubuntu18.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .

      # push ray tf preprod gpu images to ecr
      - echo "push ray tf preprod gpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID

      # run gpu integration tests for ray tf preprod gpu images only in us-west-2
      # (the tests are handed to the remote-test helper, which is not defined in this file)
      - echo "run local gpu integration tests for ray tf preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            # SETUP_CMDS carries literal "\n" sequences; '%b' expands them while
            # keeping the variable out of the format string, so a '%' in the
            # setup commands can never be misread as a conversion specifier
            printf '%b' "$SETUP_CMDS" > "$SETUP_FILE"
            cmd="pytest test/integration/local -k 'test_ray' --region $AWS_DEFAULT_REGION --toolkit ray --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG_BUILD_ID --processor gpu"
            remote-test --github-repo $GITHUB_REPO --branch master --test-cmd "$cmd" --setup-file $SETUP_FILE
          else
            echo "skipping local gpu integration tests outside us-west-2"
          fi
        else
          echo "skipping local gpu integration tests"
        fi

      # build ray torch preprod gpu images
      - echo "build ray torch preprod gpu images"
      - |
        # stable tag (also used as layer-cache source) and a per-build unique tag
        RAY_TORCH_GPU_TAG="ray-$RAY_TOOLKIT_VERSION-torch-gpu-py$PY_VERSION"
        RAY_TORCH_GPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-torch-gpu-py$PY_VERSION-$BUILD_ID"

        echo "pulling previous_image $BASE_IMAGE:$RAY_TORCH_GPU_TAG for layer cache..."
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker pull $BASE_IMAGE:$RAY_TORCH_GPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TORCH_GPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.torch \
                     --build-arg processor=gpu \
                     --build-arg suffix=cu101-ubuntu16.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .

      # push ray torch preprod gpu images to ecr
      - echo "push ray torch preprod gpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID

      # run gpu integration tests for ray torch preprod gpu images only in us-west-2
      # (the tests are handed to the remote-test helper, which is not defined in this file)
      - echo "run local gpu integration tests for ray torch preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            # SETUP_CMDS carries literal "\n" sequences; '%b' expands them while
            # keeping the variable out of the format string, so a '%' in the
            # setup commands can never be misread as a conversion specifier
            printf '%b' "$SETUP_CMDS" > "$SETUP_FILE"
            cmd="pytest test/integration/local -k 'test_ray' --region $AWS_DEFAULT_REGION --toolkit ray --framework torch --docker-base-name $PREPROD_IMAGE --tag $RAY_TORCH_GPU_TAG_BUILD_ID --processor gpu"
            # NOTE(review): --skip-setup presumably relies on an earlier remote-test
            # run having prepared the remote host already; confirm
            remote-test --github-repo $GITHUB_REPO --branch master --test-cmd "$cmd" --setup-file $SETUP_FILE --skip-setup
          else
            echo "skipping local gpu integration tests outside us-west-2"
          fi
        else
          echo "skipping local gpu integration tests"
        fi

      # SageMaker integration tests for the four preprod images.
      #
      # The change filter lists "ray/*" in addition to "docker/*": the images
      # are built from ray/docker/<version>/Dockerfile.*, so a change limited to
      # those files must also trigger these tests (the local-test filters in
      # this file already watch "ray/*"). "docker/*" is kept so that nothing
      # that triggered the tests before stops doing so.
      # NOTE(review): has-matching-changes is not defined in this file; this
      # assumes it matches patterns against repo-relative paths -- confirm.

      # run cpu sagemaker tests for ray tf preprod cpu images
      - echo "run cpu sagemaker tests for ray tf preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/sagemaker \
                  -k "test_ray" \
                  --region $AWS_DEFAULT_REGION \
                  --docker-base-name $PREPROD_ECR_REPO \
                  --aws-id $ACCOUNT \
                  --tag $RAY_TF_CPU_TAG_BUILD_ID \
                  --framework tensorflow \
                  --toolkit ray \
                  --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run cpu sagemaker tests for ray torch preprod cpu images
      - echo "run cpu sagemaker tests for ray torch preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/sagemaker \
                  -k "test_ray" \
                  --region $AWS_DEFAULT_REGION \
                  --docker-base-name $PREPROD_ECR_REPO \
                  --aws-id $ACCOUNT \
                  --tag $RAY_TORCH_CPU_TAG_BUILD_ID \
                  --framework torch \
                  --toolkit ray \
                  --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run gpu sagemaker tests for ray tf preprod gpu images (us-west-2 only)
      - echo "run gpu sagemaker tests for ray tf preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            pytest test/integration/sagemaker \
                    -k "test_ray" \
                    --region $AWS_DEFAULT_REGION \
                    --docker-base-name $PREPROD_ECR_REPO \
                    --aws-id $ACCOUNT \
                    --tag $RAY_TF_GPU_TAG_BUILD_ID \
                    --framework tensorflow \
                    --toolkit ray \
                    --instance-type $GPU_INSTANCE_TYPE
          else
            echo "skipping gpu sagemaker tests outside us-west-2"
          fi
        else
          echo "skipping gpu sagemaker tests"
        fi

      # run gpu sagemaker tests for ray torch preprod gpu images (us-west-2 only)
      - echo "run gpu sagemaker tests for ray torch preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            pytest test/integration/sagemaker \
                    -k "test_ray" \
                    --region $AWS_DEFAULT_REGION \
                    --docker-base-name $PREPROD_ECR_REPO \
                    --aws-id $ACCOUNT \
                    --tag $RAY_TORCH_GPU_TAG_BUILD_ID \
                    --framework torch \
                    --toolkit ray \
                    --instance-type $GPU_INSTANCE_TYPE
          else
            echo "skipping gpu sagemaker tests outside us-west-2"
          fi
        else
          echo "skipping gpu sagemaker tests"
        fi

      # publish cpu and gpu image to prod ecr repo if this is release build
      #
      # Every step is chained with '&&' so the first failing login/tag/push
      # stops the sequence and becomes the exit status of this command.
      # Without the chain only the status of the final push is reported, and a
      # release could silently publish just a subset of the four images.
      # NOTE(review): assumes the block is not already run under `set -e`.
      - |
        if is-release-build; then
          $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) &&
          docker tag $PREPROD_IMAGE:$RAY_TF_CPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TF_CPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TF_CPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TORCH_CPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TORCH_CPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TORCH_CPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TF_GPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TF_GPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TORCH_GPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TORCH_GPU_TAG
        else
          echo "skipping publishing new image to production repo"
        fi

    finally:
      # only shut down remote gpu instance if in us-west-2
      # (cleanup-gpu-instances / cleanup-key-pairs are helpers from the build image)
      - |
        if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
          echo "cleanup remote gpu instance"
          cleanup-gpu-instances
          cleanup-key-pairs
        else
          echo "No remote gpu instance to cleanup"
        fi

      # remove the per-build images from the preprod ecr repo
      #
      # A tag variable is still empty when the build failed before the
      # corresponding image was tagged; such entries are skipped so that no
      # delete request is issued with an empty imageTag.
      - |
        for tag in "$RAY_TF_CPU_TAG_BUILD_ID" "$RAY_TORCH_CPU_TAG_BUILD_ID" "$RAY_TF_GPU_TAG_BUILD_ID" "$RAY_TORCH_GPU_TAG_BUILD_ID"; do
          if [ -n "$tag" ]; then
            aws ecr batch-delete-image --repository-name $PREPROD_ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$tag
          fi
        done