Spaces:
Sleeping
Sleeping
#SBATCH --job-name=train_deblur
#SBATCH --nodes=1
#SBATCH --gpus-per-node=4
#SBATCH --qos=scavenger
#SBATCH --signal=B:USR1@600
#SBATCH --cpus-per-task=24
#SBATCH --output=output/slurm-%j.out
#SBATCH --error=error/slurm-%j.err
#SBATCH --exclude=lse-hpcnode9

# Usage: sbatch <this script> <config_file>
#
# <config_file> is handed to train_controlnet.py via --config. The --signal
# directive above asks Slurm to deliver SIGUSR1 to the batch shell 600 seconds
# before the time limit, so the script can react before the job is killed.

# prepare your environment here
source ~/.bashrc
conda activate gencam

# Later paths (accelerator_configs/..., train_controlnet.py) are relative to
# this directory, so stop immediately if it cannot be entered.
cd /datasets/sai/gencam/cogvideox/training || {
  echo "Cannot cd to /datasets/sai/gencam/cogvideox/training, quitting" >&2
  exit 1
}

export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

if [ -z "$1" ]; then
  # quit if no config file is passed
  echo "No config file passed, quitting"
  exit 1
else
  config_file=$1
fi
#######################################
# SIGUSR1 handler: forward SIGUSR1 to the child processes of the background
# accelerate launcher, give them five minutes to react, then resubmit this
# script with the same config file.
# Globals:
#   accelerate_pid (read) - PID of the backgrounded `accelerate launch`
#   config_file    (read) - config path passed on to the resubmitted job
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout; a warning on stderr when the launcher has
#   no child processes to signal.
#######################################
handler() {
  echo "function handler called at $(date)"

  if [ -n "$accelerate_pid" ]; then
    echo "Sending SIGUSR1 to accelerate PID: $accelerate_pid"

    # The signal goes to the launcher's direct children, not the launcher
    # itself. Several children may be listed, so collect them in an array
    # rather than relying on word splitting.
    local -a child_pids=()
    local pid
    while read -r pid; do
      if [ -n "$pid" ]; then
        child_pids+=("$pid")
      fi
    done < <(ps --ppid "$accelerate_pid" -o pid=)

    if (( ${#child_pids[@]} > 0 )); then
      kill -USR1 "${child_pids[@]}" # Send SIGUSR1 to the launcher's children
      sleep 300                     # Wait for 5 minutes
    else
      # Without this guard `kill -USR1` would be invoked with no PID operand
      # (an error), followed by a pointless 5-minute sleep.
      echo "No child processes found for accelerate PID: $accelerate_pid" >&2
    fi
  else
    echo "No accelerate PID found"
  fi

  # Quoted so that paths containing whitespace reach sbatch as single arguments.
  sbatch "${BASH_SOURCE[0]}" "$config_file"
}
# register signal handler
trap handler SIGUSR1

echo "Starting job at $(date)"
#python train_controlnet.py #--config $config_file #& wait

# Run the launcher in the background: bash only runs a trap once the current
# foreground command returns, whereas the `wait` builtin is interrupted as
# soon as a trapped signal arrives, so the handler can run promptly.
accelerate launch \
  --config_file accelerator_configs/accelerator_val_config.yaml \
  --multi_gpu train_controlnet.py \
  --config "$config_file" &
accelerate_pid=$!

# Waiting on the specific PID makes the script's exit status reflect the
# training run's exit status when it finishes on its own.
wait "$accelerate_pid"