Large Model Training

  1. Create the pre-training launch script examples/intern/pretrain_internlm_7b_zero.sh with the following contents:

    #!/bin/bash
    # Runtime environment: library search paths and a longer HCCL connection
    # timeout so that collective-communication setup does not time out.
    export LD_LIBRARY_PATH=/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
    export HCCL_CONNECT_TIMEOUT=1200

    # Single-node distributed setup: 8 devices on one machine.
    GPUS_PER_NODE=8
    MASTER_ADDR=localhost
    MASTER_PORT=6000
    NNODES=1
    NODE_RANK=0
    WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

    # Preprocessed dataset prefix and checkpoint output directory.
    DATA=./dataset/internlm_text_document
    CHECKPOINT=./ckpt/

    # DeepSpeed config file, ZeRO stage, and batch geometry.
    DS_CONFIG=ds_config.json
    ZERO_STAGE=2
    GLOBAL_BATCH=64
    MICRO_BATCH=8
    cat <<EOT > $DS_CONFIG
    {
        "fp16": {
            "enabled": true,
            "loss_scale": 0,
            "loss_scale_window": 1000,
            "initial_scale_power": 8,
            "hysteresis": 2,
            "min_loss_scale": 1
        },
        "optimizer": {
            "type": "Adam"
        },
        "zero_optimization": {
            "stage": $ZERO_STAGE,
            "allgather_partitions": true,
            "allgather_bucket_size": 5e8,
            "overlap_comm": true,
            "reduce_scatter": true,
            "reduce_bucket_size": 5e8,
            "contiguous_gradients": true
        },
        "gradient_accumulation_steps": 1,
        "train_batch_size": $GLOBAL_BATCH,
        "train_micro_batch_size_per_gpu": $MICRO_BATCH,
        "zero_allow_untested_optimizer": true
    }
    EOT
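    # Note: "loss_scale": 0 above selects dynamic loss scaling. The scale
    # starts at 2^initial_scale_power = 2^8 = 256, is raised after
    # loss_scale_window (1000) consecutive overflow-free steps, lowered on
    # overflow, and never falls below min_loss_scale.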
    # Build up the DeepSpeed-related command-line flags (each prepended in turn).
    ds_args=""
    ds_args=" --deepspeed ${ds_args}"
    ds_args=" --no-pipeline-parallel ${ds_args}"
    ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
    ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
    ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
    # Create the log directory (tee does not create it) and launch training;
    # checkpoints are written to $CHECKPOINT every --save-interval iterations.
    mkdir -p logs

    deepspeed pretrain_intern.py \
           --DDP-impl torch \
           --use-contiguous-buffers-in-ddp \
           --tensor-model-parallel-size 1 \
           --pipeline-model-parallel-size 1 \
           --num-layers 32 \
           --hidden-size 4096 \
           --ffn-hidden-size 11008 \
           --num-attention-heads 32 \
           --micro-batch-size $MICRO_BATCH \
           --global-batch-size $GLOBAL_BATCH \
           --seq-length 2048 \
           --max-position-embeddings 2048 \
           --train-iters 500000 \
           --lr-decay-iters 320000 \
           --save $CHECKPOINT \
           --data-path $DATA \
           --tokenizer-name-or-path ./dataset/internlm \
           --tokenizer-not-use-fast \
           --data-impl mmap \
           --split 949,50,1 \
           --distributed-backend nccl \
           --lr 0.00015 \
           --lr-decay-style cosine \
           --min-lr 1.0e-5 \
           --weight-decay 1e-2 \
           --clip-grad 1.0 \
           --lr-warmup-fraction .01 \
           --checkpoint-activations \
           --log-interval 1 \
           --save-interval 10000 \
           --eval-interval 1000 \
           --eval-iters 10 \
           --use-cpu-initialization \
           $ds_args \
           --fp16 | tee logs/train.log
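
  A quick consistency note: DeepSpeed requires that train_batch_size equal
  train_micro_batch_size_per_gpu × gradient_accumulation_steps × the
  data-parallel world size, which holds here as 64 = 8 × 1 × 8. A minimal
  guard you could place after the variable definitions (the GRAD_ACCUM
  variable is our addition, mirroring "gradient_accumulation_steps" in the
  JSON; it is not part of the original script):

    GRAD_ACCUM=1
    if [ $(($MICRO_BATCH * $GRAD_ACCUM * $WORLD_SIZE)) -ne $GLOBAL_BATCH ]; then
        echo "Batch mismatch: $GLOBAL_BATCH != $MICRO_BATCH x $GRAD_ACCUM x $WORLD_SIZE" >&2
        exit 1
    fi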

  2. Launch the training run:

    bash examples/intern/pretrain_internlm_7b_zero.sh
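
  The script above targets a single node (NNODES=1). To scale out, the stock
  deepspeed launcher can instead read a hostfile; below is a rough two-node
  sketch, assuming the hostnames node0/node1 are reachable over passwordless
  SSH (the hostnames and hostfile path are placeholders, and NNODES and
  WORLD_SIZE in the script would need to be updated to match):

    cat <<EOT > hostfile
    node0 slots=8
    node1 slots=8
    EOT
    # Then replace the launch line in the script with:
    #   deepspeed --hostfile hostfile pretrain_intern.py <same arguments as above>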