-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinfer.sh
38 lines (30 loc) · 1.03 KB
/
infer.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
# Slurm batch script: unpack the tutorial Python environment onto node-local
# storage, activate it, and launch infer.py through srun.
#
# Usage:
#   sbatch infer.sh
#
# Environment:
#   CONFIG_FILE  Path handed to infer.py as --config-file
#                (default: configs/inference_axonn.json).
#
# Resources requested below: 1 node (-N 1), 5 minute time limit (-t),
# partition "gpu" (-p), 4 A100 GPUs (--gres), account "sc24-class" (-A),
# exclusive node access, 500G of memory, reservation "sc24tut".
#
# Keep every #SBATCH directive above the first executable command: sbatch
# stops reading directives at the first non-comment, non-blank line.
#SBATCH -N 1
#SBATCH -t 00:05:00
#SBATCH -p gpu
#SBATCH --gres=gpu:a100:4
#SBATCH -A sc24-class
#SBATCH --exclusive
#SBATCH --mem=500G
#SBATCH --reservation=sc24tut

# Print a message on stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Shared project storage and Hugging Face cache locations.
export SCRATCH="/scratch/zt1/project/sc24/shared/"
export HF_HOME="${SCRATCH}/.cache/huggingface"
# NOTE(review): HF_TRANSFORMERS_CACHE does not appear among the cache
# variables documented by Hugging Face (HF_HOME, HF_HUB_CACHE,
# TRANSFORMERS_CACHE) — confirm whether anything actually reads it.
export HF_TRANSFORMERS_CACHE="${HF_HOME}"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

# variables needed for torch.distributed
# Assign first, export second, so a failing hostname is not masked by export.
MASTER_ADDR=$(hostname) || die "hostname failed; cannot set MASTER_ADDR"
export MASTER_ADDR
export MASTER_PORT=29500

readonly env_dir="/tmp/tutorial_env"
readonly env_archive="${SCRATCH}/miniconda3.tar.gz"

echo "Copying python environment to fast node local storage"
start=$(date +%s)
mkdir -p "${env_dir}" || die "cannot create ${env_dir}"
# Stop here on failure: continuing would report a successful copy and then
# run infer.py with whatever python happens to be on PATH.
tar -xzf "${env_archive}" -C "${env_dir}" \
  || die "failed to extract ${env_archive} into ${env_dir}"
end=$(date +%s)
runtime=$((end - start))
echo "Copy completed. Time taken = ${runtime} s"

# activate environment
# shellcheck disable=SC1091 — the activate script only exists after extraction
source "${env_dir}/bin/activate" \
  || die "failed to activate environment in ${env_dir}"

CONFIG_FILE="${CONFIG_FILE:-configs/inference_axonn.json}"

# Launch inference through srun; -u on both srun and python keeps output
# unbuffered. srun is the last command, so its status is the job's status.
srun -u python -u infer.py --config-file "${CONFIG_FILE}"
# Alternative launcher:
#torchrun --nproc_per_node 1 infer.py --config-file $CONFIG_FILE