fa_encoder_inference_demo.cpp

前置条件和编译命令请参见算子调用示例。本用例仅支持Atlas 推理系列产品。

与示例1相比,本示例主要有以下修改点:

场景:FA Encoder场景。

#include <iostream>
#include <vector>
#include <numeric>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"

#include "demo_util.h"

const uint32_t BATCH_SIZE = 1;                    // batch size
std::vector<int32_t> seqLenHost(BATCH_SIZE, 16);  // host-side tensor values: sequence length of each batch entry
std::vector<int32_t> tokenOffsetHost(BATCH_SIZE, 16);  // host-side tensor values: token offset after computation
std::vector<int32_t> layerId(1, 0);                    // device-side: which kvCache layer to compute with
// Total number of tokens = sum(seqLenHost). Qualified std::accumulate is used:
// the unqualified call only compiled via ADL through implementation-defined
// iterator types, which is not portable.
const uint32_t NTOKENS = std::accumulate(seqLenHost.begin(), seqLenHost.end(), 0);
const uint32_t MAX_SEQ_LEN = 256;  // maximum sequence length
const uint32_t HEAD_NUM = 16;      // number of attention heads
const uint32_t KV_HEAD_NUM = 16;   // number of kv heads
const uint32_t HEAD_SIZE = 16;     // size of each head
const uint32_t LAYER_NUM = 1;      // number of layers

/**
 * @brief Prepare all input tensors for atb::VariantPack.
 * @param contextPtr context pointer
 * @param stream stream to issue copies on
 * @param seqLenHost host-side tensor: sequence-length vector; a value of 1 means
 *        incremental or full inference, >1 means full inference
 * @param tokenOffsetHost host-side tensor: token offset after computation completes
 * @param layerId which kv entry of the kvCache to use for computation
 * @return atb::SVector<atb::Tensor> the input tensors of atb::VariantPack
 * @note all host-side tensors must be passed in
 */
atb::SVector<atb::Tensor> PrepareInTensor(atb::Context *contextPtr, aclrtStream stream,
    std::vector<int32_t> &seqLenHost, std::vector<int32_t> &tokenOffsetHost, std::vector<int32_t> &layerId)
{
    uint32_t qHiddenSize = HEAD_NUM * HEAD_SIZE;
    uint32_t kvHiddenSize = KV_HEAD_NUM * HEAD_SIZE;

    // Create the query tensor.
    std::vector<float> qData(NTOKENS * qHiddenSize, 1.0);
    atb::Tensor tensorQ = CreateTensorFromVector(
        contextPtr, stream, qData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, qHiddenSize});
    // Create key and value tensors. Their hidden size is kvHiddenSize (derived
    // from KV_HEAD_NUM), not qHiddenSize — the two only coincide here because
    // HEAD_NUM == KV_HEAD_NUM in this demo.
    std::vector<float> kvData(NTOKENS * kvHiddenSize, 1.0);
    atb::Tensor tensorK = CreateTensorFromVector(
        contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, kvHiddenSize});
    atb::Tensor tensorV = CreateTensorFromVector(
        contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, kvHiddenSize});
    // kvCache initialized to fp16 1.0 (0x3C00). `unsigned int16_t` is ill-formed
    // C++; the fixed-width type uint16_t is the correct element type.
    std::vector<uint16_t> kvCacheData(LAYER_NUM * BATCH_SIZE * MAX_SEQ_LEN * kvHiddenSize, 0x3C00);
    std::vector<int64_t> kvCacheShape = {LAYER_NUM, BATCH_SIZE, kvHiddenSize / 16, MAX_SEQ_LEN, 16};
    atb::Tensor tensorCacheK = CreateTensorFromVector(
        contextPtr, stream, kvCacheData, ACL_FLOAT16, aclFormat::ACL_FORMAT_FRACTAL_NZ, kvCacheShape, ACL_FLOAT16);
    atb::Tensor tensorCacheV = CreateTensorFromVector(
        contextPtr, stream, kvCacheData, ACL_FLOAT16, aclFormat::ACL_FORMAT_FRACTAL_NZ, kvCacheShape, ACL_FLOAT16);
    // Create tokenOffset (host-side tensor).
    atb::Tensor tensorTokenOffset = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE});
    tensorTokenOffset.hostData = tokenOffsetHost.data();  // host-side tensor: reference the values
    // Create seqLen (host-side tensor).
    atb::Tensor tensorSeqLen = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE});
    tensorSeqLen.hostData = seqLenHost.data();  // host-side tensor: reference the values
    // Create layerId and copy it to the device. The copy size must match the
    // element type (int32_t, 4 bytes); sizeof(short) would copy only 2 bytes
    // per element and leave the tensor partially initialized.
    atb::Tensor tensorLayerId = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {1});
    CHECK_STATUS(aclrtMemcpy(tensorLayerId.deviceData,
        tensorLayerId.dataSize,
        layerId.data(),
        sizeof(int32_t) * layerId.size(),
        ACL_MEMCPY_HOST_TO_DEVICE));
    // Place all input tensors into the SVector in the order the op expects.
    atb::SVector<atb::Tensor> inTensors = {
        tensorQ, tensorK, tensorV, tensorCacheK, tensorCacheV, tensorTokenOffset, tensorSeqLen, tensorLayerId};
    return inTensors;
}

/**
 * @brief Create an FA encoder Operation and configure its parameters.
 * @return atb::Operation * pointer to the created operation
 */
atb::Operation *PrepareOperation()
{
    atb::Operation *op = nullptr;
    atb::infer::SelfAttentionParam param;
    param.headNum = HEAD_NUM;
    param.kvHeadNum = KV_HEAD_NUM;
    param.calcType = atb::infer::SelfAttentionParam::CalcType::ENCODER;
    CHECK_STATUS(atb::CreateOperation(param, &op));
    return op;
}

int main(int argc, char **argv)
{
    // kv hidden size, used for the output tensor shape.
    uint32_t kvHiddenSize = KV_HEAD_NUM * HEAD_SIZE;

    // Select the device, create the context and set the stream.
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);

    // FA Encoder example.
    atb::Operation *encoderOp = PrepareOperation();
    // Prepare the input tensors.
    atb::VariantPack variantPack;
    variantPack.inTensors = PrepareInTensor(context, stream, seqLenHost, tokenOffsetHost, layerId);  // inputs
    atb::Tensor tensorOut = CreateTensor(ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, kvHiddenSize});
    variantPack.outTensors.push_back(tensorOut);  // output
    uint64_t workspaceSize = 0;
    // Query the required workspace size.
    CHECK_STATUS(encoderOp->Setup(variantPack, workspaceSize, context));
    uint8_t *workspacePtr = nullptr;
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&workspacePtr), workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute the FA encoder. The status must be checked like every other
    // call here; otherwise a failed launch would be silently ignored.
    CHECK_STATUS(encoderOp->Execute(variantPack, workspacePtr, workspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // stream sync: wait for the device-side work to finish
    CHECK_STATUS(aclrtFree(tensorOut.deviceData));
    for (atb::Tensor &inTensor : variantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtFree(workspacePtr));
    }
    // Release resources.
    CHECK_STATUS(atb::DestroyOperation(encoderOp));  // the operation (object-level) is released first
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(atb::DestroyContext(context));  // the context (global resource) is released last
    CHECK_STATUS((aclFinalize()));
    std::cout << "FA Encoder demo success!" << std::endl;
    return 0;
}