大模型训练

根据实际情况修改 ${MASTER_PORT}、${NUM_WORKERS}、${NUM_GPUS_PER_WORKER}、${HOST_FILE_PATH} 以及 --data_path(数据集路径)。

# Build the DeepSpeed launch command for 13B training as a single-line string.
#
# Reads:  MASTER_PORT, NUM_WORKERS, NUM_GPUS_PER_WORKER, HOST_FILE_PATH
#         (expanded now, at assignment time).
# Writes: run_cmd
#
# The string contains an environment-variable prefix, an output redirection to
# train_13B.log and a trailing '&', so it only works when handed to a shell
# parser (e.g. eval "${run_cmd}"); plain ${run_cmd} expansion would pass those
# tokens to the program as literal arguments.
#
# Adjust --data_path below to point at your own dataset.
#
# NOTE: do not put comments inside the quoted string. A '\' that is not the
# very last character of a line is not a line continuation, so the comment
# text and a newline would become part of the command and cut it short.
# Values such as no/steps/cosine are written bare because nested double quotes
# would merely close and reopen the surrounding string.
run_cmd="HCCL_CONNECT_TIMEOUT=1200 deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} fastchat/train/train_mem.py \
    --model_name_or_path ./13B-vicuna \
    --data_path ./playground/data/alpaca-data-conversation.json \
    --fp16 True \
    --output_dir ./ckpt_16p \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 500 \
    --save_total_limit 2 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --tf32 False \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --deepspeed ./deepspeed_config_13B.json > train_13B.log 2>&1 &"