-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathbuildspec-ray.yml
339 lines (304 loc) · 15.6 KB
/
buildspec-ray.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
version: 0.2

# Build-wide settings. Every entry under env.variables is exported to the
# shell commands in the phases section.
env:
  variables:
    # Ray toolkit version: part of every image tag and of the
    # ray/docker/<version>/ Dockerfile path.
    RAY_TOOLKIT_VERSION: '0.8.5'
    # Framework versions: prefix of the base-image tags that get pulled.
    RAY_TF_FRAMEWORK_VERSION: '2.1.0'
    RAY_TORCH_FRAMEWORK_VERSION: '1.5.0'
    # Instance types passed to the SageMaker integration tests. The GPU type
    # (minus its 'ml.' prefix) is also used to launch the remote EC2 test host.
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    # Python version suffix used in base-image and output tags (py36).
    PY_VERSION: '36'
    # previous images repo for layer cache, same name as prod image repo
    BASE_ECR_REPO: 'sagemaker-rl-ray-container'
    # Repo that receives the per-build images under test.
    PREPROD_ECR_REPO: 'sagemaker-test'
    # Repo that receives the released images.
    PROD_ECR_REPO: 'sagemaker-rl-ray-container'
    GITHUB_REPO: 'sagemaker-rl-container'
    # base image account (tf/mxnet images) required for building rl container images
    FRAMEWORK_BASE_IMAGE_ACCOUNT: '763104351884'
    # SETUP_CMDS is written to SETUP_FILE with printf before the remote GPU
    # tests. It is single-quoted so that '\n' stays literal here and is only
    # expanded into newlines by printf.
    SETUP_FILE: 'setup_cmds.sh'
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .'
# Build pipeline: pull framework base images, build the four Ray images
# (tf/torch x cpu/gpu), push them to the preprod repo, run local and SageMaker
# integration tests, publish to the prod repo on release builds, and finally
# clean up the remote GPU host and the preprod images.
#
# Shell variables assigned in one command (ACCOUNT, *_TAG, ...) are reused by
# later commands, so the order of the commands matters.
phases:
  pre_build:
    commands:
      - start-dockerd
      # Resolve the account id and derive the registry host and image URIs.
      - |
        ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
        ECR_REGISTRY="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
        BASE_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$BASE_ECR_REPO"
        PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$PREPROD_ECR_REPO"
        PROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$PROD_ECR_REPO"
        # PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # keep ssh connection alive when communicating with remote ec2 server during integ test
      # largest connection idle time allowed: 10 seconds * 300 attempts = 50 minutes
      # NOTE(review): the options are appended with a leading space, presumably
      # under a Host stanza already present in ~/.ssh/config - confirm.
      - |
        echo ' ServerAliveInterval 10' >> ~/.ssh/config
        echo ' ServerAliveCountMax 300' >> ~/.ssh/config

  build:
    commands:
      # install
      - echo "install"
      - pip3 install -U -e .
      # Update awscli for compatibility with the latest botocore version that breaks it
      # https://github.com/boto/boto3/issues/2596
      - pip3 install --upgrade awscli
      # launch remote gpu instance only in region us-west-2
      - |
        if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
          echo "launch remote gpu instance"
          prefix='ml.'
          # strip the SageMaker 'ml.' prefix to get the EC2 instance type
          instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
          create-key-pair
          launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu
        else
          echo "skipping launch remote gpu instance"
        fi
      # Log in to the framework base-image registry. The password is piped via
      # stdin (get-login-password) instead of evaluating the deprecated
      # `aws ecr get-login` output, which passes it on the command line.
      - |
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$FRAMEWORK_BASE_IMAGE_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - |
        TF_IMAGE="$FRAMEWORK_BASE_IMAGE_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/tensorflow-training"
        TORCH_IMAGE="$FRAMEWORK_BASE_IMAGE_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/pytorch-training"
        # ':' is not allowed in a docker tag, so replace it in the build id
        BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      # pull tf cpu base images
      - echo "pull tf cpu base images"
      - |
        RAY_TF_CPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION-ubuntu18.04"
        docker pull $TF_IMAGE:$RAY_TF_CPU_BASE_TAG
      # pull torch cpu base images
      - echo "pull torch cpu base images"
      - |
        RAY_TORCH_CPU_BASE_TAG="$RAY_TORCH_FRAMEWORK_VERSION-cpu-py$PY_VERSION-ubuntu16.04"
        docker pull $TORCH_IMAGE:$RAY_TORCH_CPU_BASE_TAG
      # pull tf gpu base images
      - echo "pull tf gpu base images"
      - |
        RAY_TF_GPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION-cu101-ubuntu18.04"
        docker pull $TF_IMAGE:$RAY_TF_GPU_BASE_TAG
      # pull torch gpu base images
      - echo "pull torch gpu base images"
      - |
        RAY_TORCH_GPU_BASE_TAG="$RAY_TORCH_FRAMEWORK_VERSION-gpu-py$PY_VERSION-cu101-ubuntu16.04"
        docker pull $TORCH_IMAGE:$RAY_TORCH_GPU_BASE_TAG
      # build ray tf preprod cpu images
      - echo "build ray tf preprod cpu images"
      - |
        RAY_TF_CPU_TAG="ray-$RAY_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION"
        RAY_TF_CPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$RAY_TF_CPU_TAG for layer cache..."
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker pull $BASE_IMAGE:$RAY_TF_CPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TF_CPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.tf \
                     --build-arg processor=cpu \
                     --build-arg suffix=ubuntu18.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .
      # push ray tf preprod cpu images to ecr
      - echo "push ray tf preprod cpu images to ecr"
      - |
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG_BUILD_ID
      # run cpu integration tests for ray tf preprod cpu images
      - echo "run local cpu integration tests for ray tf preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/local \
            -k "test_ray" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_IMAGE \
            --tag $RAY_TF_CPU_TAG_BUILD_ID \
            --framework tensorflow \
            --toolkit ray \
            --processor cpu
        else
          echo "skipping local cpu integration tests"
        fi
      # build ray torch preprod cpu images
      - echo "build ray torch preprod cpu images"
      - |
        RAY_TORCH_CPU_TAG="ray-$RAY_TOOLKIT_VERSION-torch-cpu-py$PY_VERSION"
        RAY_TORCH_CPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-torch-cpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$RAY_TORCH_CPU_TAG for layer cache..."
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker pull $BASE_IMAGE:$RAY_TORCH_CPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TORCH_CPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TORCH_CPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.torch \
                     --build-arg processor=cpu \
                     --build-arg suffix=ubuntu16.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .
      # push ray torch preprod cpu images to ecr
      - echo "push ray torch preprod cpu images to ecr"
      - |
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker push $PREPROD_IMAGE:$RAY_TORCH_CPU_TAG_BUILD_ID
      # run cpu integration tests for ray torch preprod cpu images
      - echo "run local cpu integration tests for ray torch preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/local \
            -k "test_ray" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_IMAGE \
            --tag $RAY_TORCH_CPU_TAG_BUILD_ID \
            --framework torch \
            --toolkit ray \
            --processor cpu
        else
          echo "skipping local cpu integration tests"
        fi
      # build ray tf preprod gpu images
      - echo "build ray tf preprod gpu images"
      - |
        RAY_TF_GPU_TAG="ray-$RAY_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION"
        RAY_TF_GPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$RAY_TF_GPU_TAG for layer cache..."
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker pull $BASE_IMAGE:$RAY_TF_GPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TF_GPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.tf \
                     --build-arg processor=gpu \
                     --build-arg suffix=cu101-ubuntu18.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .
      # push ray tf preprod gpu images to ecr
      - echo "push ray tf preprod gpu images to ecr"
      - |
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID
      # run gpu integration tests for ray tf preprod gpu images only in us-west-2
      - echo "run local gpu integration tests for ray tf preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            # printf expands the literal '\n' sequences in SETUP_CMDS
            printf "$SETUP_CMDS" > $SETUP_FILE
            cmd="pytest test/integration/local -k 'test_ray' --region $AWS_DEFAULT_REGION --toolkit ray --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG_BUILD_ID --processor gpu"
            remote-test --github-repo $GITHUB_REPO --branch master --test-cmd "$cmd" --setup-file $SETUP_FILE
          else
            echo "skipping local gpu integration tests outside us-west-2"
          fi
        else
          echo "skipping local gpu integration tests"
        fi
      # build ray torch preprod gpu images
      - echo "build ray torch preprod gpu images"
      - |
        RAY_TORCH_GPU_TAG="ray-$RAY_TOOLKIT_VERSION-torch-gpu-py$PY_VERSION"
        RAY_TORCH_GPU_TAG_BUILD_ID="ray-$RAY_TOOLKIT_VERSION-torch-gpu-py$PY_VERSION-$BUILD_ID"
        echo "pulling previous_image $BASE_IMAGE:$RAY_TORCH_GPU_TAG for layer cache..."
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker pull $BASE_IMAGE:$RAY_TORCH_GPU_TAG
        docker build --cache-from $BASE_IMAGE:$RAY_TORCH_GPU_TAG \
                     -t $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID \
                     -f ray/docker/$RAY_TOOLKIT_VERSION/Dockerfile.torch \
                     --build-arg processor=gpu \
                     --build-arg suffix=cu101-ubuntu16.04 \
                     --build-arg region=$AWS_DEFAULT_REGION .
      # push ray torch preprod gpu images to ecr
      - echo "push ray torch preprod gpu images to ecr"
      - |
        aws ecr get-login-password --region $AWS_DEFAULT_REGION \
          | docker login --username AWS --password-stdin "$ECR_REGISTRY"
        docker push $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID
      # run gpu integration tests for ray torch preprod gpu images only in us-west-2
      - echo "run local gpu integration tests for ray torch preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            printf "$SETUP_CMDS" > $SETUP_FILE
            cmd="pytest test/integration/local -k 'test_ray' --region $AWS_DEFAULT_REGION --toolkit ray --framework torch --docker-base-name $PREPROD_IMAGE --tag $RAY_TORCH_GPU_TAG_BUILD_ID --processor gpu"
            # NOTE(review): --skip-setup presumably relies on the tf gpu run
            # above having already set up the remote host - confirm.
            remote-test --github-repo $GITHUB_REPO --branch master --test-cmd "$cmd" --setup-file $SETUP_FILE --skip-setup
          else
            echo "skipping local gpu integration tests outside us-west-2"
          fi
        else
          echo "skipping local gpu integration tests"
        fi
      # run cpu sagemaker tests for ray tf preprod cpu images
      # "ray/*" is included in the sagemaker gates so that changes to the
      # Dockerfiles under ray/docker/ trigger these tests, matching the local
      # test gates above.
      - echo "run cpu sagemaker tests for ray tf preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/sagemaker \
            -k "test_ray" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_ECR_REPO \
            --aws-id $ACCOUNT \
            --tag $RAY_TF_CPU_TAG_BUILD_ID \
            --framework tensorflow \
            --toolkit ray \
            --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi
      # run cpu sagemaker tests for ray torch preprod cpu images
      - echo "run cpu sagemaker tests for ray torch preprod cpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          pytest test/integration/sagemaker \
            -k "test_ray" \
            --region $AWS_DEFAULT_REGION \
            --docker-base-name $PREPROD_ECR_REPO \
            --aws-id $ACCOUNT \
            --tag $RAY_TORCH_CPU_TAG_BUILD_ID \
            --framework torch \
            --toolkit ray \
            --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi
      # run gpu sagemaker tests for ray tf preprod gpu images
      - echo "run gpu sagemaker tests for ray tf preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            pytest test/integration/sagemaker \
              -k "test_ray" \
              --region $AWS_DEFAULT_REGION \
              --docker-base-name $PREPROD_ECR_REPO \
              --aws-id $ACCOUNT \
              --tag $RAY_TF_GPU_TAG_BUILD_ID \
              --framework tensorflow \
              --toolkit ray \
              --instance-type $GPU_INSTANCE_TYPE
          else
            echo "skipping gpu sagemaker tests outside us-west-2"
          fi
        else
          echo "skipping gpu sagemaker tests"
        fi
      # run gpu sagemaker tests for ray torch preprod gpu images
      - echo "run gpu sagemaker tests for ray torch preprod gpu images"
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "ray/*" "buildspec-ray.yml"; then
          if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
            pytest test/integration/sagemaker \
              -k "test_ray" \
              --region $AWS_DEFAULT_REGION \
              --docker-base-name $PREPROD_ECR_REPO \
              --aws-id $ACCOUNT \
              --tag $RAY_TORCH_GPU_TAG_BUILD_ID \
              --framework torch \
              --toolkit ray \
              --instance-type $GPU_INSTANCE_TYPE
          else
            echo "skipping gpu sagemaker tests outside us-west-2"
          fi
        else
          echo "skipping gpu sagemaker tests"
        fi
      # publish cpu and gpu image to prod ecr repo if this is release build
      # The steps are chained with && so that a failed login, tag or push makes
      # the whole command fail instead of being masked by a later success.
      - |
        if is-release-build; then
          aws ecr get-login-password --region $AWS_DEFAULT_REGION \
            | docker login --username AWS --password-stdin "$ECR_REGISTRY" &&
          docker tag $PREPROD_IMAGE:$RAY_TF_CPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TF_CPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TF_CPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TORCH_CPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TORCH_CPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TORCH_CPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TF_GPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TF_GPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TF_GPU_TAG &&
          docker tag $PREPROD_IMAGE:$RAY_TORCH_GPU_TAG_BUILD_ID $PROD_IMAGE:$RAY_TORCH_GPU_TAG &&
          docker push $PROD_IMAGE:$RAY_TORCH_GPU_TAG
        else
          echo "skipping publishing new image to production repo"
        fi
    finally:
      # only shut down remote gpu instance if in us-west-2
      - |
        if [ "$AWS_DEFAULT_REGION" = "us-west-2" ]; then
          echo "cleanup remote gpu instance"
          cleanup-gpu-instances
          cleanup-key-pairs
        else
          echo "No remote gpu instance to cleanup"
        fi
      # remove ecr image
      # A tag variable is empty when the build stopped before that image was
      # built; skip those instead of calling ECR with an empty imageTag.
      - |
        for image_tag in "$RAY_TF_CPU_TAG_BUILD_ID" "$RAY_TORCH_CPU_TAG_BUILD_ID" "$RAY_TF_GPU_TAG_BUILD_ID" "$RAY_TORCH_GPU_TAG_BUILD_ID"; do
          if [ -n "$image_tag" ]; then
            aws ecr batch-delete-image --repository-name $PREPROD_ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$image_tag
          fi
        done