运行仓上模型

  1. 下载原始代码仓:

    git clone https://gitee.com/ascend/AscendSpeed
    cd AscendSpeed 
    mkdir logs
    mkdir ckpt

  2. 准备环境:

    conda create -n (conda环境名称) python=3.7
    conda activate (conda环境名称)
    下载并安装torch及torch_npu:
    # 安装torch:
    pip install torch==1.11 -i https://pypi.tuna.tsinghua.edu.cn/simple
    # 安装torch_npu:
    wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_aarch64.whl (ARM)
    wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_x86_64.whl (X86)
    pip install torch_npu-1.11.0.post1-cp37-cp37m-linux_XXXXXX.whl
    安装deepspeed与deepspeed_npu:
    pip install deepspeed==0.9.2
    git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
    cd deepspeed_npu
    pip install -e .
    安装其他所需要的依赖:
    pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

  3. 获取数据集:

    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
    # 执行preprocess_data.py
    python tools/preprocess_data.py \
                --input ./oscar-1GB.jsonl \
                --output-prefix my-gpt2 --vocab ./gpt2-vocab.json \
                --dataset-impl mmap \
                --tokenizer-type GPT2BPETokenizer \
                --merge-file ./gpt2-merges.txt \
                --append-eod --workers 9
    # 移动到文件夹
    mkdir -p dataset/oscar_data_1g
    mv gpt2-vocab.json gpt2-merges.txt my-gpt2_text_* dataset/oscar_data_1g/
    mkdir -p dataset/bloom_vocab/vocab_file
    # 下载词表文件到vocab_file目录
    # 注意: tree/main 是网页目录链接, 无法直接下载文件; 需使用 resolve/main 逐个下载仓库中的具体文件(如 tokenizer.json)
    wget https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles/resolve/main/tokenizer.json -P dataset/bloom_vocab/vocab_file/

  4. 启动任务:

    sh examples/bloom/pretrain_bloom_7b1.sh