Spaces:
Sleeping
Sleeping
File size: 1,975 Bytes
7245cc5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/bin/bash
# Slurm batch script that starts train_controlnet.py on several nodes through
# `accelerate launch`.
# Usage: sbatch <this script> <config_file>
#   <config_file> is passed to train_controlnet.py as --config; the script
#   quits with status 1 when it is missing.
#SBATCH --job-name=XYZ
#SBATCH --nodes=4
#SBATCH --mem=256gb
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=28
#SBATCH --gpus-per-node=4
#SBATCH --exclusive
# NOTE(review): the log paths below are relative; presumably the output/ and
# error/ directories must already exist where the job is submitted - confirm.
#SBATCH --output=output/slurm-%j-%N.out
#SBATCH --error=error/slurm-%j-%N.err
#SBATCH --qos=scavenger
# Request SIGUSR1 for the batch shell only ("B:"), 300 seconds ahead of the
# job's end time. The signal is caught by the `handler` trap further down,
# which resubmits this script.
#SBATCH --signal=B:USR1@300
#SBATCH --nodelist=lse-hpcnode[1,3,4,5,10-12]
# Why some hosts are missing from the nodelist above (the numbers presumably
# refer to lse-hpcnode<N>):
#   6 and 9 are messed up
#   7 is sketchy as well
# NOTE(review): the nodelist names 7 hosts while --nodes asks for 4 - confirm
# that the scheduler resolves this the way it is intended.
# Trace every command and stop at the first one that fails.
set -x -e

# The single positional argument is the training config file; without it there
# is nothing to run, so quit right away.
if [ -n "$1" ]
then
    config_file=$1
else
    echo "No config file passed, quitting"
    exit 1
fi

# Load the user's shell configuration (presumably what makes `conda` usable in
# the batch shell), enter the project environment and the training directory.
source ~/.bashrc
conda activate gencam
cd /datasets/sai/gencam/cogvideox/training

echo "START TIME: $(date)"

# needed until we fix IB issues
export NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=ens
# Training setup
# GPUs used on each node. Keep in sync with `#SBATCH --gpus-per-node` and with
# the `--gpu_ids` list given to the launcher below.
GPUS_PER_NODE=4

# so processes know who to talk to: the first host of the allocation is used as
# the main process address. The node list is quoted because its compressed form
# (for example "node[1,3-5]") contains glob characters; unquoted, it could be
# replaced by matching file names from the current directory.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
# NOTE(review): NODE_RANK is not referenced by LAUNCHER or anywhere else in
# this script, and it is evaluated once here in the batch shell rather than
# once per node - confirm whether it can be dropped.
NODE_RANK=$SLURM_PROCID
# Total number of training processes over all nodes.
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# Program and arguments started by the launcher. CMD and LAUNCHER are plain
# strings that get word-split where they are used, so a config path containing
# whitespace is not supported.
#CMD="accelerate_test.py"
CMD="train_controlnet.py --config $config_file"
LAUNCHER="accelerate launch \
    --multi_gpu \
    --gpu_ids 0,1,2,3 \
    --num_processes $WORLD_SIZE \
    --num_machines $NNODES \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --rdzv_backend=c10d \
    --max_restarts 0 \
    --tee 3 \
    "

# # NOT SURE THE FOLLOWING ENV VARS ARE STRICTLY NEEDED (PROBABLY NOT)
# export CUDA_HOME=/usr/local/cuda-11.6
# export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
# export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# NOTE(review): SRUN_ARGS is defined here but the srun invocation at the bottom
# of this script does not pass it - confirm whether that is intentional.
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "
#######################################
# SIGUSR1 handler: logs the event, waits so the running training can save,
# then submits this script again with the same config file.
# Globals:
#   config_file      (read) forwarded unchanged to the resubmitted job
#   SLURM_SUBMIT_DIR (read) directory the current job was submitted from;
#                    the current directory is used when it is unset
# Arguments: none
# Outputs:   a timestamped log line, followed by whatever sbatch prints
# Returns:   status of the resubmission (cd or sbatch failure is non-zero)
#######################################
handler()
{
    echo "Signal handler triggered at $(date)"
    sleep 120 # Let training save
    # By now the script has changed into the training directory, so submit
    # from the original submission directory instead: the relative --output
    # and --error paths of the new job then resolve to the same place as for
    # this one. The subshell keeps the cd from leaking into the caller.
    # Both arguments are quoted so paths with spaces stay single arguments.
    (
        cd "${SLURM_SUBMIT_DIR:-.}" &&
            sbatch "${BASH_SOURCE[0]}" "$config_file"
    )
}
# register signal handler
trap handler SIGUSR1

# Clearing the screen is cosmetic only; it must not abort the job under
# `set -e` when no usable terminal type is available in the batch environment.
clear || true

# Start the job step in the background: bash runs a trap only once the current
# foreground command has finished, whereas a signal interrupts the `wait`
# builtin right away, so SIGUSR1 can be handled while training is running.
# LAUNCHER and CMD are deliberately left unquoted: they are flat command
# strings that have to be split into words here.
# shellcheck disable=SC2086
srun --cpu-bind=none --jobid "$SLURM_JOB_ID" $LAUNCHER $CMD &
srun_pid=$!

# Wait for that specific step and keep its status. A bare `wait` returns 0
# regardless of how the step ended, which would report a crashed training as a
# successful job. When SIGUSR1 interrupts the wait, the status is above 128.
srun_status=0
wait "$srun_pid" || srun_status=$?

echo "END TIME: $(date)"
exit "$srun_status"
|