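#!/usr/bin/env bash
#
# Launch one `accelerate launch` process per host listed in AIDAS_TRAIN_HOSTS
# (locally or over ssh), write a timestamped log per rank, and stop the
# remaining ranks when one of them fails or a signal is received.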
# Example manual launches retained for reference:
# accelerate launch --config_file /home/work/AIDAS/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml --machine_rank 0 --main_process_port=8888 /home/work/AIDAS/MMaDA/training/train_omada_inst.py config=/home/work/AIDAS/MMaDA/configs/omada_instruction_tuning.yaml
# accelerate launch --config_file /home/work/AIDAS/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml --machine_rank 1 --main_process_port=8888 /home/work/AIDAS/MMaDA/training/train_omada_inst.py config=/home/work/AIDAS/MMaDA/configs/omada_instruction_tuning.yaml
# accelerate launch --config_file /home/work/AIDAS/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml --machine_rank 2 --main_process_port=8888 /home/work/AIDAS/MMaDA/training/train_omada_inst.py config=/home/work/AIDAS/MMaDA/configs/omada_instruction_tuning.yaml
# accelerate launch --config_file /home/work/AIDAS/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml --machine_rank 3 --main_process_port=8888 /home/work/AIDAS/MMaDA/training/train_omada_inst.py config=/home/work/AIDAS/MMaDA/configs/omada_instruction_tuning.yaml
# Strict mode goes first so every statement below is covered by it:
# -e aborts on unhandled errors, -u rejects unset variables, and pipefail makes
# a pipeline fail when any of its stages fails.
set -euo pipefail

# Space-separated host list; array index N becomes machine rank N.
# A value already present in the environment takes precedence. The cluster
# default applies only when the variable is unset, so an explicitly empty
# value still reaches the validation below instead of being silently replaced.
export AIDAS_TRAIN_HOSTS="${AIDAS_TRAIN_HOSTS-main1 sub1 sub2 sub3}"

# Fixed project layout.
PROJECT_ROOT="/home/work/AIDAS"
CONFIG_FILE="${PROJECT_ROOT}/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml"
TRAIN_SCRIPT="${PROJECT_ROOT}/MMaDA/training/train_omada_inst.py"
EXPERIMENT_CFG="${PROJECT_ROOT}/MMaDA/configs/omada_instruction_tuning.yaml"
LOG_DIR="${PROJECT_ROOT}/logs"

# Settings the caller may override through the environment.
MAIN_PORT="${MAIN_PORT:-8888}"
# Shell snippet run before the training command on every host. Uses `-` rather
# than `:-` so that REMOTE_SETUP="" disables the setup step entirely.
REMOTE_SETUP="${REMOTE_SETUP-source ~/.bashrc && conda activate mmada}"
NCCL_DEBUG_LEVEL="${NCCL_DEBUG_LEVEL:-INFO}"

if [[ -z "${AIDAS_TRAIN_HOSTS:-}" ]]; then
  echo "Set AIDAS_TRAIN_HOSTS=\"host0 host1 host2 host3\" before running this script." >&2
  exit 1
fi

# Split the list on whitespace; a whitespace-only value yields an empty array.
read -r -a HOSTS <<< "${AIDAS_TRAIN_HOSTS}"
NUM_MACHINES=${#HOSTS[@]}
if (( NUM_MACHINES == 0 )); then
  echo "AIDAS_TRAIN_HOSTS is empty." >&2
  exit 1
fi
# Make sure the log directory exists; `--` guards against a path that starts
# with a dash being parsed as an option.
mkdir -p -- "$LOG_DIR"

# One timestamp shared by every rank's log file name for this run.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Indexed by machine rank: background PID and display label of each launch.
declare -a PIDS=()
declare -a HOST_LABELS=()
#######################################
# Prefix every line read from stdin with the current local time.
# Arguments: none
# Outputs:   "<YYYY-MM-DD HH:MM:SS> <line>" on stdout, one per input line
# Returns:   0
#######################################
timestamp_lines() {
  local line
  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline,
  # which plain `read` would report as EOF and drop.
  while IFS= read -r line || [[ -n "$line" ]]; do
    printf '%s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$line"
  done
}
#######################################
# Send SIGTERM to every recorded launch that is still alive.
# Globals:   PIDS (read)
# Arguments: none
# Outputs:   a status line on stdout when there is anything to stop
# Returns:   0
#######################################
stop_all() {
  if (( ${#PIDS[@]} == 0 )); then
    return
  fi
  echo "Stopping launched processes..."
  local pid
  for pid in "${PIDS[@]}"; do
    # `kill -0` only probes for existence, so already-finished ranks are skipped.
    if [[ -n "${pid:-}" ]] && kill -0 "$pid" 2>/dev/null; then
      # Best effort: the process may exit between the probe and the signal.
      kill "$pid" >/dev/null 2>&1 || true
    fi
  done
}
#######################################
# Signal handler: stop every launched rank, then exit with a failure status.
# Arguments: none
# Outputs:   a status line on stdout
# Returns:   does not return; exits with status 1
#######################################
on_signal() {
  echo "Signal received, terminating all ranks."
  stop_all
  exit 1
}

# Ctrl-C and a plain `kill` both tear the whole run down.
trap on_signal INT TERM
#######################################
# Start the training command for one machine rank in the background, either
# on this machine or over ssh, with its combined output timestamped into a log.
# Globals:
#   PROJECT_ROOT, CONFIG_FILE, TRAIN_SCRIPT, EXPERIMENT_CFG, NUM_MACHINES,
#   MAIN_PORT, NCCL_DEBUG_LEVEL, REMOTE_SETUP, SSH_USER, USER (read)
#   PIDS, HOST_LABELS (written at index <rank>)
# Arguments:
#   $1 - host name to run on
#   $2 - machine rank (also the index used in PIDS/HOST_LABELS)
#   $3 - log file path
#   $4 - label for the host used in status messages
# Outputs:   one status line on stdout
#######################################
launch_rank() {
  local host="$1"
  local rank="$2"
  local log_file="$3"
  local host_label="$4"

  # Build the command once. %q shell-quotes each interpolated value so a path
  # containing spaces or metacharacters survives the `bash -lc` re-parse.
  local base_cmd
  printf -v base_cmd \
    'cd %q && env NCCL_DEBUG=%q NCCL_SHM_DISABLE=1 NCCL_ASYNC_ERROR_HANDLING=1 accelerate launch --config_file %q --num_machines %q --machine_rank %q --main_process_port %q %q config=%q' \
    "$PROJECT_ROOT" "$NCCL_DEBUG_LEVEL" "$CONFIG_FILE" "$NUM_MACHINES" \
    "$rank" "$MAIN_PORT" "$TRAIN_SCRIPT" "$EXPERIMENT_CFG"
  # REMOTE_SETUP is a shell snippet, so it is prepended verbatim (not quoted).
  if [[ -n "${REMOTE_SETUP}" ]]; then
    base_cmd="${REMOTE_SETUP} && ${base_cmd}"
  fi

  if [[ "$host" == "localhost" || "$host" == "$(hostname)" || "$host" == "$(hostname -f)" ]]; then
    echo "[rank ${rank}] running locally (${host_label}), logging to ${log_file}"
    # stdbuf forces line buffering so log lines are stamped as they are produced.
    stdbuf -oL -eL bash -lc "$base_cmd" 2>&1 | timestamp_lines >"$log_file" &
  else
    local dest="${SSH_USER:-$USER}@${host}"
    # The remote shell parses the ssh command string once more, so the whole
    # command is quoted into a single word for `bash -lc`.
    local escaped_cmd
    escaped_cmd=$(printf '%q' "$base_cmd")
    echo "[rank ${rank}] ssh ${dest}, logging to ${log_file}"
    ssh "$dest" "bash -lc $escaped_cmd" 2>&1 | timestamp_lines >"$log_file" &
  fi
  # NOTE(review): for a background pipeline `$!` is the PID of its last stage
  # (the timestamp filter), so that is the process a later kill targets —
  # confirm this is the intended way to stop a rank.
  PIDS[$rank]=$!
  HOST_LABELS[$rank]="$host_label"
}
# Launch one rank per configured host; the array index doubles as machine rank.
for idx in "${!HOSTS[@]}"; do
  host="${HOSTS[$idx]}"
  # Replace every character outside [A-Za-z0-9_.-] so the host name is safe to
  # embed in a file name.
  safe_host=${host//[^A-Za-z0-9_.-]/_}
  log_file="${LOG_DIR}/train_inst_${TIMESTAMP}_rank${idx}_${safe_host}.log"
  launch_rank "$host" "$idx" "$log_file" "$safe_host"
done
echo "All nodes launched. Tail logs under ${LOG_DIR}."
#######################################
# Wait for every launched rank in rank order. On the first failure, report it,
# stop the remaining ranks and exit with that rank's status.
# Globals:   PIDS, HOST_LABELS (read)
# Arguments: none
# Outputs:   a status line on stdout when a rank fails
# Returns:   0 when every rank succeeded; otherwise exits the script
#######################################
wait_for_ranks() {
  local rank pid status
  for rank in "${!PIDS[@]}"; do
    pid="${PIDS[$rank]}"
    [[ -n "${pid:-}" ]] || continue
    # Capture the status directly: inside `if ! wait …; then`, `$?` would be
    # the result of the negation (always 0), hiding the real failure code.
    status=0
    wait "$pid" || status=$?
    if (( status != 0 )); then
      echo "[rank ${rank}] (${HOST_LABELS[$rank]}) exited with status ${status}"
      stop_all
      exit "$status"
    fi
  done
}

wait_for_ranks