#!/bin/sh
# Author: Chaoyu Chen
# Last Modified: 2024/5/20
# Description: Launch script for multiple nodes

# Run this script on every node.

# Set the number of nodes and the number of GPUs per node first.
N_NODE=4
N_GPU_PER_NODE=8

# $RANK, $MASTER_ADDR, and $MASTER_PORT must be exported on each node before launch; a cluster scheduler usually sets them automatically.
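# A minimal sketch of setting them by hand on a SLURM cluster (assumes
# SLURM's standard environment variables and an arbitrary free port;
# adjust for your scheduler):
# export RANK=$SLURM_NODEID
# export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# export MASTER_PORT=29500
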
# config path
CONFIG="configs/xxx_train_config.json"

# Envs used inside training.
# OMP_NUM_THREADS caps the OpenMP/BLAS threads spawned per process;
# TOKENIZERS_PARALLELISM=False disables HF tokenizers' internal parallelism
# (and its fork warning) when workers are forked for data loading.
export OMP_NUM_THREADS=4
export TOKENIZERS_PARALLELISM=False

TODAY=$(date +%Y-%m-%d-%H%M)
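
# TODAY can stamp per-run artifacts; a hypothetical use (LOG_FILE is not
# consumed by the training script, it is only an illustration):
# LOG_FILE="logs/train-${TODAY}.log"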

# One process per GPU: num_processes = N_NODE * N_GPU_PER_NODE (4 * 8 = 32 here).
# accelerate launch --config_file accelerate_ds_config.yaml \
accelerate launch \
        --num_machines $N_NODE \
        --num_processes $(( $N_NODE * $N_GPU_PER_NODE )) \
        --use_deepspeed \
        --deepspeed_multinode_launcher 'standard' \
        --zero_stage 2 \
        --offload_optimizer_device 'cpu' \
        --offload_param_device 'none' \
        --gradient_accumulation_steps 1 \
        --gradient_clipping 1.0 \
        --zero3_init_flag false \
        --zero3_save_16bit_model false \
        --main_training_function 'main' \
        --mixed_precision 'bf16' \
        --dynamo_backend 'no' \
        --same_network \
        --machine_rank "$RANK" \
        --main_process_ip "$MASTER_ADDR" \
        --main_process_port "$MASTER_PORT" \
        --rdzv_backend 'static' \
        pefts/mft_accelerate.py --train_config "$CONFIG" --distributed_type "deepspeed"
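
# Example (hypothetical hostname and script filename): running this script by
# hand on a 4-node cluster whose rank-0 node is 10.0.0.1:
#   on node 0:  RANK=0 MASTER_ADDR=10.0.0.1 MASTER_PORT=29500 sh launch_multi_node.sh
#   on node 1:  RANK=1 MASTER_ADDR=10.0.0.1 MASTER_PORT=29500 sh launch_multi_node.sh
#   ... and likewise with RANK=2 and RANK=3 on the remaining nodes.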