#!/usr/bin/env bash
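# Launch one accelerate rank per host in AIDAS_TRAIN_HOSTS (locally or over
# SSH), stream each rank's output into a timestamped log file, and fail fast
# if any rank exits non-zero.
#
# Illustrative invocation (the script filename here is an assumption):
#   AIDAS_TRAIN_HOSTS="main1 sub1 sub2 sub3" MAIN_PORT=8888 \
#     bash launch_train_omada_inst.sh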

# Manual per-node launch retained for reference (run on each node with
# --machine_rank set to that node's index, 0..3):
# accelerate launch --config_file /home/work/AIDAS/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml --machine_rank 0 --main_process_port=8888 /home/work/AIDAS/MMaDA/training/train_omada_inst.py config=/home/work/AIDAS/MMaDA/configs/omada_instruction_tuning.yaml

# Default host list in machine-rank order (rank 0 first). The plain "-"
# expansion applies only when the variable is unset, so exporting an empty
# AIDAS_TRAIN_HOSTS still trips the guard below.
export AIDAS_TRAIN_HOSTS="${AIDAS_TRAIN_HOSTS-main1 sub1 sub2 sub3}"

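# Abort on errors (-e) and unset variables (-u); propagate failures through
# pipelines (-o pipefail) so the per-rank subshells below report the training
# command's exit status rather than the log filter's.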
set -euo pipefail

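# Paths and tunables; MAIN_PORT, REMOTE_SETUP, NCCL_DEBUG_LEVEL, and SSH_USER
# may all be overridden from the environment.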
PROJECT_ROOT="/home/work/AIDAS"
CONFIG_FILE="${PROJECT_ROOT}/MMaDA/accelerate_configs/4_node_8_gpus_deepspeed_zero2_aidas.yaml"
TRAIN_SCRIPT="${PROJECT_ROOT}/MMaDA/training/train_omada_inst.py"
EXPERIMENT_CFG="${PROJECT_ROOT}/MMaDA/configs/omada_instruction_tuning.yaml"
LOG_DIR="${PROJECT_ROOT}/logs"
MAIN_PORT="${MAIN_PORT:-8888}"
REMOTE_SETUP="${REMOTE_SETUP:-source ~/.bashrc && conda activate mmada}"
NCCL_DEBUG_LEVEL="${NCCL_DEBUG_LEVEL:-INFO}"

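# Refuse to run without a usable host list (e.g. AIDAS_TRAIN_HOSTS exported
# empty).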
if [[ -z "${AIDAS_TRAIN_HOSTS:-}" ]]; then
  echo "Set AIDAS_TRAIN_HOSTS=\"host0 host1 host2 host3\" before running this script." >&2
  exit 1
fi

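# Split the whitespace-separated host list into an array; each host's index
# becomes its machine rank.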
read -r -a HOSTS <<< "${AIDAS_TRAIN_HOSTS}"
NUM_MACHINES=${#HOSTS[@]}
if (( NUM_MACHINES == 0 )); then
  echo "AIDAS_TRAIN_HOSTS is empty." >&2
  exit 1
fi

mkdir -p "$LOG_DIR"

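# One timestamp shared by all of this run's log file names.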
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
declare -a PIDS=()
declare -a HOST_LABELS=()

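# Prefix each log line with a wall-clock timestamp. This forks one `date`
# per line, which is acceptable at typical training-log volume.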
timestamp_lines() {
  while IFS= read -r line; do
    printf '%s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$line"
  done
}

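# Best-effort teardown: signal every launcher pipeline that is still running.
# Killing the local ssh client may leave remote training processes behind;
# clean those up on the remote hosts if needed.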
stop_all() {
  if (( ${#PIDS[@]} == 0 )); then
    return
  fi
  echo "Stopping launched processes..."
  for pid in "${PIDS[@]}"; do
    if [[ -n "${pid:-}" ]] && kill -0 "$pid" 2>/dev/null; then
      kill "$pid" >/dev/null 2>&1 || true
    fi
  done
}

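# Forward Ctrl-C / SIGTERM to all launched ranks before exiting.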
on_signal() {
  echo "Signal received, terminating all ranks."
  stop_all
  exit 1
}
trap on_signal INT TERM

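# launch_rank HOST RANK LOG_FILE HOST_LABEL
# Start one accelerate rank: locally when HOST names this machine, over ssh
# otherwise. Records the background pipeline's PID for monitoring.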
launch_rank() {
  local host="$1"
  local rank="$2"
  local log_file="$3"
  local host_label="$4"

  # Build the launch command once; prepend the optional environment setup.
  local launch_cmd="cd ${PROJECT_ROOT} && env NCCL_DEBUG=${NCCL_DEBUG_LEVEL} NCCL_SHM_DISABLE=1 NCCL_ASYNC_ERROR_HANDLING=1 accelerate launch --config_file ${CONFIG_FILE} --num_machines ${NUM_MACHINES} --machine_rank ${rank} --main_process_port ${MAIN_PORT} ${TRAIN_SCRIPT} config=${EXPERIMENT_CFG}"
  local base_cmd="$launch_cmd"
  if [[ -n "${REMOTE_SETUP}" ]]; then
    base_cmd="${REMOTE_SETUP} && ${launch_cmd}"
  fi

  # printf '%q' escapes the command so it survives the extra round of word
  # splitting done by the login shell that ssh starts on the remote host.
  local escaped_cmd
  escaped_cmd=$(printf '%q' "$base_cmd")

  if [[ "$host" == "localhost" || "$host" == "$(hostname)" || "$host" == "$(hostname -f)" ]]; then
    echo "[rank ${rank}] running locally (${host_label}), logging to ${log_file}"
    # Run the pipeline in a subshell so $! names the whole pipeline; with
    # pipefail, its exit status is the training command's, not the log
    # filter's.
    ( stdbuf -oL -eL bash -lc "$base_cmd" 2>&1 | timestamp_lines >"$log_file" ) &
  else
    local dest="${SSH_USER:-$USER}@${host}"
    echo "[rank ${rank}] ssh ${dest}, logging to ${log_file}"
    ( ssh "$dest" "bash -lc $escaped_cmd" 2>&1 | timestamp_lines >"$log_file" ) &
  fi
  PIDS[$rank]=$!
  HOST_LABELS[$rank]="$host_label"
}

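# Launch one rank per host, logging each to its own file.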
for idx in "${!HOSTS[@]}"; do
  host="${HOSTS[$idx]}"
  safe_host=${host//[^A-Za-z0-9_.-]/_}
  log_file="${LOG_DIR}/train_inst_${TIMESTAMP}_rank${idx}_${safe_host}.log"
  launch_rank "$host" "$idx" "$log_file" "$safe_host"
done

echo "All nodes launched. Tail logs under ${LOG_DIR}."

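# Fail fast: wait on each rank and tear everything down on the first
# non-zero exit.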
for rank in "${!PIDS[@]}"; do
  pid="${PIDS[$rank]}"
  [[ -n "${pid:-}" ]] || continue
  status=0
  wait "$pid" || status=$?   # capture the exit code without tripping set -e
  if (( status != 0 )); then
    echo "[rank ${rank}] (${HOST_LABELS[$rank]}) exited with status ${status}" >&2
    stop_all
    exit "$status"
  fi
done