fa_prefix_encoder_demo.cpp

前置条件和编译命令请参见算子调用示例。当前仅支持Atlas 800I A2 推理产品/Atlas A2 训练系列产品/Atlas A3 推理系列产品/Atlas A3 训练系列产品

场景:FA Prefix_Encoder场景。

#include <iostream>
#include <vector>
#include <numeric>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"

#include "demo_util.h"

const uint32_t BATCH_SIZE = 4;  // number of sequences in the batch
// Host-side tensor values: query sequence length of each batch entry.
std::vector<int32_t> seqLenHost = {16, 16, 32, 32};
// Total number of query tokens, i.e. sum(seqLenHost).
// NOTE: `accumulate` must be qualified with std:: — the unqualified call only
// compiled (if at all) by accident of ADL on the vector iterator type.
const uint32_t NTOKENS = std::accumulate(seqLenHost.begin(), seqLenHost.end(), 0);
// Host-side tensor values: key/value sequence length of each batch entry.
std::vector<int32_t> kvSeqLenHost = {16, 144, 32, 288};
const uint32_t NUM_BLOCKS = std::accumulate(kvSeqLenHost.begin(), kvSeqLenHost.end(), 0);  // sum(kvSeqLenHost)
const uint32_t HEAD_NUM = 32;     // number of query heads
const uint32_t KV_HEAD_NUM = 32;  // number of key/value heads
const uint32_t HEAD_SIZE = 64;    // per-head dimension
const uint32_t BLOCK_SIZE = 128;  // size of each KV cache block

/**
 * @brief Prepare all input tensors of the atb::VariantPack.
 * @param contextPtr ATB context pointer
 * @param stream stream used for the host-to-device copies
 * @param seqLenHost host-side tensor values: query sequence length per batch entry
 * @param kvSeqLenHost host-side tensor values: key/value sequence length per batch entry
 * @return atb::SVector<atb::Tensor> input tensors of the atb::VariantPack
 * @note all host-side tensors must be passed in
 */
atb::SVector<atb::Tensor> PrepareInTensor(
    atb::Context *contextPtr, aclrtStream stream, std::vector<int32_t> &seqLenHost, std::vector<int32_t> &kvSeqLenHost)
{
    // Create the query tensor, shape {ntokens, headNum, headSize}, filled with 1.0.
    atb::Tensor tensorQ = CreateTensorFromVector(contextPtr,
        stream,
        std::vector<float>(NTOKENS * HEAD_NUM * HEAD_SIZE, 1.0),
        ACL_FLOAT16,
        aclFormat::ACL_FORMAT_ND,
        {NTOKENS, HEAD_NUM, HEAD_SIZE});
    // Create the key and value tensors, laid out as paged KV cache blocks.
    std::vector<float> kvData(NUM_BLOCKS * BLOCK_SIZE * KV_HEAD_NUM * HEAD_SIZE, 1.0);
    std::vector<int64_t> kvShape = {NUM_BLOCKS, BLOCK_SIZE, KV_HEAD_NUM, HEAD_SIZE};
    atb::Tensor tensorK =
        CreateTensorFromVector(contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, kvShape);
    atb::Tensor tensorV =
        CreateTensorFromVector(contextPtr, stream, kvData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, kvShape);
    // Create blockTables: one row of block ids per batch entry.
    constexpr uint32_t BLOCKS_PER_BATCH = 4;  // replaces the former magic number 16 (= BATCH_SIZE * 4)
    atb::Tensor tensorBlockTables =
        CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, BLOCKS_PER_BATCH});
    std::vector<int32_t> blockTablesData(BATCH_SIZE * BLOCKS_PER_BATCH);
    std::iota(blockTablesData.begin(), blockTablesData.end(), 0);  // block ids 0..15
    CHECK_STATUS(aclrtMemcpy(tensorBlockTables.deviceData,
        tensorBlockTables.dataSize,
        blockTablesData.data(),
        sizeof(int32_t) * blockTablesData.size(),
        ACL_MEMCPY_HOST_TO_DEVICE));
    // Create the alibi-128 mask: strict upper triangle set to 1.
    // Unsigned loop indices avoid the previous signed/unsigned comparison mix.
    std::vector<float> maskData = std::vector<float>(HEAD_NUM * NTOKENS * 128, 0);  // alibi128 mask
    for (uint32_t i = 0; i < HEAD_NUM; ++i) {
        for (uint32_t j = 0; j < NTOKENS; ++j) {
            for (uint32_t k = j + 1; k < 128; ++k) {
                maskData[i * NTOKENS * 128 + j * 128 + k] = 1;
            }
        }
    }
    atb::Tensor tensorMask = CreateTensorFromVector(
        contextPtr, stream, maskData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {HEAD_NUM, NTOKENS, 128});
    // Create seqLen, a host-side tensor.
    atb::Tensor tensorSeqLen = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE});
    tensorSeqLen.hostData = seqLenHost.data();
    // Create kvSeqLen, a host-side tensor.
    atb::Tensor tensorKvSeqLen = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE});
    tensorKvSeqLen.hostData = kvSeqLenHost.data();
    // NOTE(review): alibi slopes are conventionally one value per attention head
    // (HEAD_NUM); this demo sizes the tensor with HEAD_SIZE — confirm against
    // the SelfAttention operator specification.
    atb::Tensor tensorSlopes = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {HEAD_SIZE});
    std::vector<float> slData(HEAD_SIZE, 1.0);
    CHECK_STATUS(aclrtMemcpy(tensorSlopes.deviceData,
        tensorSlopes.dataSize,
        slData.data(),
        sizeof(float) * slData.size(),
        ACL_MEMCPY_HOST_TO_DEVICE));
    atb::SVector<atb::Tensor> inTensors = {
        tensorQ, tensorK, tensorV, tensorBlockTables, tensorMask, tensorSeqLen, tensorKvSeqLen, tensorSlopes};
    return inTensors;
}

/**
 * @brief Create an FA prefix-encoder Operation with its parameters configured.
 * @return atb::Operation * pointer to the created Operation
 */
atb::Operation *PrepareOperation()
{
    atb::Operation *op = nullptr;
    atb::infer::SelfAttentionParam param;
    // Prefix-encoder calculation with the high-precision kernel.
    param.calcType = atb::infer::SelfAttentionParam::CalcType::PREFIX_ENCODER;
    param.kernelType = atb::infer::SelfAttentionParam::KernelType::KERNELTYPE_HIGH_PRECISION;
    // Compressed alibi mask, upper-triangular.
    param.maskType = atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS;
    param.isTriuMask = 1;
    // Head configuration shared with the input tensors.
    param.headNum = HEAD_NUM;
    param.kvHeadNum = KV_HEAD_NUM;
    CHECK_STATUS(atb::CreateOperation(param, &op));
    return op;
}

int main(int argc, char **argv)
{
    // Select the device, create the ATB context and bind an execution stream.
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);

    // FA Prefix Encoder example.
    atb::Operation *prefixEncoderOp = PrepareOperation();
    // Prepare the input tensors.
    atb::VariantPack prefixVariantPack;
    prefixVariantPack.inTensors = PrepareInTensor(context, stream, seqLenHost, kvSeqLenHost);
    // Output carries one vector per *query* head, so HEAD_NUM is the correct
    // dimension (it equals KV_HEAD_NUM in this demo, so the shape is unchanged).
    atb::Tensor tensorOut = CreateTensor(ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NTOKENS, HEAD_NUM, HEAD_SIZE});
    prefixVariantPack.outTensors = {tensorOut};  // register the output tensor

    uint64_t workspaceSize = 0;
    // Query the required workspace size.
    CHECK_STATUS(prefixEncoderOp->Setup(prefixVariantPack, workspaceSize, context));
    uint8_t *workspacePtr = nullptr;
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&workspacePtr), workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute the FA Prefix Encoder; the return status was previously ignored.
    CHECK_STATUS(prefixEncoderOp->Execute(prefixVariantPack, workspacePtr, workspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // wait for device-side computation to finish

    // Release device buffers, then the operation, stream and context.
    CHECK_STATUS(aclrtFree(tensorOut.deviceData));
    for (atb::Tensor &inTensor : prefixVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtFree(workspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(prefixEncoderOp));  // operation object: release first
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(DestroyContext(context));  // context is a global resource: release last
    CHECK_STATUS((aclFinalize()));
    std::cout << "FA Prefix Encoder demo success!" << std::endl;
    return 0;
}