```shell
git clone https://gitee.com/ascend/AscendSpeed
cd AscendSpeed
mkdir logs
mkdir ckpt
```
```shell
conda create -n <conda-env-name> python=3.7
conda activate <conda-env-name>
```
```shell
# Install torch:
pip install torch==1.11 -i https://pypi.tuna.tsinghua.edu.cn/simple

# Install torch_npu (download the wheel matching your CPU architecture):
wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_aarch64.whl   # ARM
wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_x86_64.whl    # x86

# Replace XXXXXX with aarch64 or x86_64 to match the wheel you downloaded
pip install torch_npu-1.11.0.post1-cp37-cp37m-linux_XXXXXX.whl
```
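A quick sanity check before going further (a sketch, assuming torch_npu's import hook exposes the `torch.npu` namespace, which is its standard behavior; this prints `True` only on a machine with a visible NPU):

```shell
python -c "import torch; import torch_npu; print(torch.npu.is_available())"
```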
```shell
pip install deepspeed==0.9.2
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
cd deepspeed_npu
pip3 install -e .
```
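To confirm the Ascend plugin loads alongside DeepSpeed (the training scripts import `deepspeed_npu` to apply its NPU patches; a minimal check, with the expected version matching the pin above):

```shell
python -c "import deepspeed; import deepspeed_npu; print(deepspeed.__version__)"   # expect 0.9.2
```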
```shell
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
```shell
# Download the GPT-2 vocab and merge files
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json

# Run preprocess_data.py
python tools/preprocess_data.py \
    --input ./oscar-1GB.jsonl \
    --output-prefix my-gpt2 \
    --vocab ./gpt2-vocab.json \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --merge-file ./gpt2-merges.txt \
    --append-eod \
    --workers 9

# Move the results into the dataset folder
mkdir -p dataset/oscar_data_1g
mv gpt2-vocab.json gpt2-merges.txt my-gpt2_text_* dataset/oscar_data_1g/

# Download the BLOOM tokenizer files into vocab_file
mkdir -p dataset/bloom_vocab/vocab_file
wget https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles/tree/main
```
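For reference, `preprocess_data.py` consumes one JSON object per line and tokenizes the `text` field by default (Megatron-style `--json-keys`); the sample below is hypothetical, and the run should emit `my-gpt2_text_document.bin`/`.idx`, which is what the `my-gpt2_text_*` glob above moves:

```shell
head -n 2 ./oscar-1GB.jsonl
# {"id": 0, "text": "First document ..."}
# {"id": 1, "text": "Second document ..."}
```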
```shell
sh examples/bloom/pretrain_bloom_7b1.sh
```
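To keep a record of the run, stdout can be redirected into the `logs/` directory created in the first step (a sketch; the exact log layout depends on how you launch the script):

```shell
sh examples/bloom/pretrain_bloom_7b1.sh > logs/pretrain_bloom_7b1.log 2>&1 &
tail -f logs/pretrain_bloom_7b1.log
```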