Model开发

Model侧的开发包括两个部分,首先组合前置开发的Layer层,再使得PyTorch等框架能够直接调用Model层。

Model组图调用。将多层layer进行拼接,能够减少Python侧调用时间,具体实现目录为“models/xx_model/model/*model.cpp”,整体实现上类似Layer,核心在于BuildGraph组图。

       void BuildGraph() {
           // 核心实现: 组图过程类似Layer构造
           // 先初始化各类Tensors和Nodes
           const int weightTensorSize = WORDEMBEDDINGNODE_WEIGHT_COUNT + WEIGHT_COUNT_PER_LAYER * param_.layerNum + FINALNORMNODE_WEIGHT_COUNT;
           graph_.weightTensors.resize(weightTensorSize);
           graph_.inTensors.resize(IN_TENSOR_NUM);
           graph_.outTensors.resize(OUT_TENSOR_NUM);
           graph_.nodes.resize(NODE_SIZE);
           graph_.internalTensors.resize(INTERNAL_TENSORS_SIZE);

           // 定义Node,传入实际的Tensor对象,Node可以对应Operation/Layer对象
           // Operation构造
           int nodeId = 0;
           auto &TestNode1 = graph_.nodes.at(nodeId++);
           atb::infer::TestParam testParam1;
           atb::Operation *op = nullptr;
           atb::CreateOp(testParam1, &op);
           testNode1.operation.reset(op);
           testNode1.inTensors = {&graph_.weightTensors.at(0), &graph_.inTensors.at(0)};
           testNode1.outTensors = {&graph_.internalTensors.at(0)};

           // 其他Node定义同上...

           // Layer构造 
           // 此处构造 layerNum * SingleAttentionLayer 来成图
           AsdOps::Tensor *firstInTensor = &graph_.inTensors.at(0);  // Layer之间传递的Tensor
           for (int layerId = 0; layerId < param_.layerNum; ++layerId) {
               auto &layerNode = graph_.nodes.at(nodeId++);
               TestLayerParam opParam;
               assign_param(opParam, param_); //完成opParam的初始化赋值
               // Layer构造
               TestLayerOperation(opParam, &op);
               layerNode.operation.reset(op);
               layerNode.inTensors.resize(layerNode.operation->GetInputNum());
               size_t inTensorId = 0;
               layerNode.inTensors.at(inTensorId++) = firstInTensor;
               for (size_t weightTensorId = 0; weightTensorId < WEIGHT_COUNT_PER_LAYER; ++weightTensorId) {
                   layerNode.inTensors.at(inTensorId++) = &graph_.weightTensors.at(
                       layerId * WEIGHT_COUNT_PER_LAYER + weightTensorId + WORDEMBEDDINGNODE_WEIGHT_COUNT);
	       }
	       layerNode.inTensors.at(inTensorId++) = &graph_.inTensors.at(IN_TENSOR_POSITIONID);    // positionIdTensor
               ...
	       // 其他输入输出tensor定义同上
	       layerNode.outTensors = {&graph_.internalTensors.at(OPERATION_COUNT_BEFORE_LAYER + layerId)};
	       firstInTensor = layerNode.outTensors.at(0);
           }
           // 剩余图构造流程同上
       };

对于一些特定Operation,需要利用param透传机制:部分Operation在每次执行时param都会改变,需要通过tensor.hostData传递这些数据,在layer和model中,需要将这些数据透传给Operation。

主要场景是SelfAttentionKvCache算子,随tokens变化Cache的取值会变化,其他场景较少不涉及。model级替换时,通过实现ParseParam和BindParamHostTensor两个函数完成这一功能。
       atb::Status TestModel::ParseParam(const std::string &param)
       {
           // Refresh tokenOffset_/seqLen_ from the JSON-encoded runtime params.
           // These values change on every forward step and are later passed
           // through to the operations via BindParamHostTensor.
           // NOTE(review): assumes `param` always contains "tokenOffset" and
           // "seqLen" arrays — nlohmann::json will throw otherwise; confirm callers.
           nlohmann::json paramJson = nlohmann::json::parse(param);
           tokenOffset_.clear();
           // const& avoids copying each JSON element per iteration
           for (const auto &item : paramJson["tokenOffset"]) {
               tokenOffset_.push_back(item.get<int>());
           }
           seqLen_.clear();
           for (const auto &item : paramJson["seqLen"]) {
               seqLen_.push_back(item.get<int>());
           }
           return atb::NO_ERROR;
       }
       atb::Status TestModel::BindParamHostTensor(uint32_t nodeId)
       {
           // Pass the three per-step values (layerId / tokenOffset / seqLen)
           // through to the operation via tensor.hostData, since they change
           // on every execution and cannot be baked into the static param.
           layerId_ = nodeId - OPERATION_COUNT_BEFORE_LAYER;
           auto &node = graph_.nodes.at(nodeId);
           // Positions of the host tensors inside this node's variant pack.
           constexpr uint32_t kTokenOffsetTensorId = 19;
           constexpr uint32_t kSeqLenTensorId = 20;
           constexpr uint32_t kLayerIdTensorId = 21;
           node.variantPack.inTensors.at(kTokenOffsetTensorId).hostData = tokenOffset_.data();
           node.variantPack.inTensors.at(kSeqLenTensorId).hostData = seqLen_.data();
           node.variantPack.inTensors.at(kLayerIdTensorId).hostData = &layerId_;
           return atb::NO_ERROR;
       }