fa_pa_encoder_demo.cpp

前置条件和编译命令请参见算子调用示例。当前仅支持Atlas 800I A2 推理产品/Atlas A2 训练系列产品Atlas A3 推理系列产品/Atlas A3 训练系列产品

场景:FA PA_Encoder场景。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <iostream>
#include <vector>
#include <numeric>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"

#include "demo_util.h"

const uint32_t BATCH_SIZE = 1;                    // 批处理大小
std::vector<int32_t> seqLenHost(BATCH_SIZE, 16);  // host侧tensor值,用于存储每个批处理中的序列长度
const uint32_t NTOKENS = accumulate(seqLenHost.begin(), seqLenHost.end(), 0);  // sum(seqLenHost)
const uint32_t MAX_SEQ_LEN = 1024;                                             // 最大序列长度
const uint32_t HEAD_NUM = 32;                                                  // 头数
const uint32_t KV_HEAD_NUM = 32;                                               // kv头数
const uint32_t HEAD_SIZE = 64;                                                 // 头大小

/**
 * @brief 准备atb::VariantPack中的所有输入tensor
 * @param contextPtr context指针
 * @param stream stream
 * @param seqLenHost host侧tensor。序列长度向量,等于1时,为增量或全量;大于1时,为全量
 * @return atb::SVector<atb::Tensor> atb::VariantPack中的输入tensor
 * @note 需要传入所有host侧tensor
 */
atb::SVector<atb::Tensor> PrepareInTensor(
    atb::Context *contextPtr, aclrtStream stream, std::vector<int32_t> &seqLenHost)
{
    // 创建query tensor
    atb::Tensor tensorQ = CreateTensorFromVector(contextPtr,
        stream,
        std::vector<float>(NTOKENS * HEAD_NUM * HEAD_SIZE, 1.0),
        ACL_FLOAT16,
        aclFormat::ACL_FORMAT_ND,
        {NTOKENS, HEAD_NUM, HEAD_SIZE});
    // 创建key,value tensor
    std::vector<float> kvData(NTOKENS * KV_HEAD_NUM * HEAD_SIZE, 1.0);
    atb::Tensor tensorK = CreateTensorFromVector(
        contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, KV_HEAD_NUM, HEAD_SIZE});
    atb::Tensor tensorV = CreateTensorFromVector(
        contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, KV_HEAD_NUM, HEAD_SIZE});
    std::vector<float> maskData(BATCH_SIZE * MAX_SEQ_LEN * MAX_SEQ_LEN, 0);
    // 创建norm mask,值为-inf的上三角mask
    for (int i = 0; i < BATCH_SIZE; ++i) {
        for (int j = 0; j < MAX_SEQ_LEN; ++j) {
            for (int k = j + 1; k < MAX_SEQ_LEN; ++k) {
                maskData[i * MAX_SEQ_LEN * MAX_SEQ_LEN + j * MAX_SEQ_LEN + k] = -32768;
            }
        }
    }
    atb::Tensor tensorMask = CreateTensorFromVector(
        contextPtr, stream, maskData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, MAX_SEQ_LEN, MAX_SEQ_LEN});
    // 创建seqLen,host侧tensor
    atb::Tensor tensorSeqLen = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE});
    tensorSeqLen.hostData = seqLenHost.data();  // seqLenHost中的值为seqLen
    // 根据顺序将所有输入tensor放入SVector
    atb::SVector<atb::Tensor> inTensors = {tensorQ, tensorK, tensorV, tensorMask, tensorSeqLen};
    return inTensors;
}

/**
 * @brief 创建一个FA encoder的Operation,并设置参数
 * @return atb::Operation * 返回一个Operation指针
 */
atb::Operation *PrepareOperation()
{
    atb::infer::SelfAttentionParam paOpParam;
    paOpParam.maskType = atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_NORM;
    paOpParam.headNum = HEAD_NUM;
    paOpParam.kvHeadNum = KV_HEAD_NUM;
    paOpParam.calcType = atb::infer::SelfAttentionParam::CalcType::PA_ENCODER;
    atb::Operation *paEncoderOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(paOpParam, &paEncoderOp));
    return paEncoderOp;
}

int main(int argc, char **argv)
{
    // 设置卡号、创建context、设置stream
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);

    // FA PAEncoder示例
    atb::Operation *paEncoderOp = PrepareOperation();
    // 准备输入张量
    atb::VariantPack paVariantPack;
    paVariantPack.inTensors = PrepareInTensor(context, stream, seqLenHost);  // 放入输入tensor
    atb::Tensor tensorOut = CreateTensor(ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, KV_HEAD_NUM, HEAD_SIZE});
    paVariantPack.outTensors.push_back(tensorOut);  // 放入输出tensor

    uint64_t workspaceSize = 0;
    // 计算workspaceSize大小
    CHECK_STATUS(paEncoderOp->Setup(paVariantPack, workspaceSize, context));
    uint8_t *workspacePtr = nullptr;
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&workspacePtr), workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // FA Encoder执行
    paEncoderOp->Execute(paVariantPack, workspacePtr, workspaceSize, context);
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // 流同步,等待device侧任务计算完成
    CHECK_STATUS(aclrtFree(tensorOut.deviceData));
    for (atb::Tensor &inTensor : paVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtFree(workspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(paEncoderOp));  // operation,对象概念,先释放
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(DestroyContext(context));  // context,全局资源,后释放
    CHECK_STATUS((aclFinalize()));
    std::cout << "FA PA Encoder demo success!" << std::endl;
    return 0;
}