reshape_and_cache_demo.cpp

For the prerequisites and build commands, see the operator invocation example. This demo is currently supported only on the Atlas 800I A2 inference products / Atlas A2 training series products and the Atlas A3 inference series products / Atlas A3 training series products.

Scenario: basic scenario.

#include <iostream>
#include <vector>
#include <numeric>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"
#include <random>
#include <algorithm>
#include <stdexcept>
#include <cstdint>

#include "demo_util.h"

uint32_t NUM_TOKENS = 2;
uint32_t NUM_HEAD = 32;
uint32_t K_HEAD_SIZE = 128;
uint32_t V_HEAD_SIZE = K_HEAD_SIZE;
uint32_t NUM_BLOCKS = 512;
uint32_t BLOCK_SIZE = 128;
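// Cache layout: kCache/vCache are [NUM_BLOCKS, BLOCK_SIZE, NUM_HEAD, HEAD_SIZE] tensors,
// i.e. NUM_BLOCKS * BLOCK_SIZE = 65536 addressable slots in total; each input token is
// scattered into exactly one of these slots according to slotMapping.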

/**
 * @brief Generate random contents for the input tensor key or the input tensor value
 * @param kvflag false: key; true: value
 * @return Data for the input tensor key or the input tensor value
 */
std::vector<float> KvGeneration(bool kvflag)
{
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    // Define the distribution range of the random values
    std::uniform_real_distribution<> dis(-100.0, 100.0);
    // Number of random values to generate
    size_t num_elements = kvflag ? NUM_TOKENS * NUM_HEAD * V_HEAD_SIZE : NUM_TOKENS * NUM_HEAD * K_HEAD_SIZE;
    // Create a vector and fill it with random values
    std::vector<float> intensorKV;
    intensorKV.reserve(num_elements);
    for (size_t i = 0; i < num_elements; ++i) {
        intensorKV.push_back(dis(gen));
    }
    return intensorKV;
}

/**
 * @brief Generate random contents for the input tensor slotMapping
 * @param slotRange Total number of slots in the cache (NUM_BLOCKS * BLOCK_SIZE); generated slot indices fall in [0, slotRange)
 * @param num_tokens Length of slotMapping
 * @return Data for the input tensor slotMapping
 */
std::vector<int32_t> SlotmappingGeneration(int slotRange, size_t num_tokens)
{
    // Build a vector containing every valid slot index in [0, slotRange)
    std::vector<int32_t> all_numbers;
    all_numbers.reserve(static_cast<size_t>(slotRange));
    for (int i = 0; i < slotRange; ++i) {
        all_numbers.push_back(i);
    }
    // Check that num_tokens does not exceed the number of available unique slots
    if (num_tokens > all_numbers.size()) {
        throw std::invalid_argument("num_tokens exceeds the range of unique numbers available");
    }
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    // Shuffle all slot indices
    std::shuffle(all_numbers.begin(), all_numbers.end(), gen);
    // Take the first num_tokens elements
    std::vector<int32_t> slotmapping(all_numbers.begin(), all_numbers.begin() + num_tokens);
    return slotmapping;
}

/**
 * @brief Prepare the atb::VariantPack
 * @param contextPtr Pointer to the context
 * @param stream Stream used when copying host data to the device
 * @return atb::VariantPack
 */
atb::VariantPack PrepareVariantPack(atb::Context *contextPtr, aclrtStream stream)
{
    // Create the key and value tensors
    std::vector<float> keyData = KvGeneration(false);
    atb::Tensor tensorKey = CreateTensorFromVector(
        contextPtr, stream, keyData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS, NUM_HEAD, K_HEAD_SIZE});
    std::vector<float> valueData = KvGeneration(true);
    atb::Tensor tensorValue = CreateTensorFromVector(
        contextPtr, stream, valueData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS, NUM_HEAD, V_HEAD_SIZE});
    // Create the kCache and vCache tensors (zero-initialized)
    std::vector<float> kCacheData(NUM_BLOCKS * BLOCK_SIZE * NUM_HEAD * K_HEAD_SIZE, 0);
    atb::Tensor tensorKCache = CreateTensorFromVector(
        contextPtr, stream, kCacheData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NUM_BLOCKS, BLOCK_SIZE, NUM_HEAD, K_HEAD_SIZE});
    std::vector<float> vCacheData(NUM_BLOCKS * BLOCK_SIZE * NUM_HEAD * V_HEAD_SIZE, 0);
    atb::Tensor tensorVCache = CreateTensorFromVector(
        contextPtr, stream, vCacheData, ACL_FLOAT16, aclFormat::ACL_FORMAT_ND, {NUM_BLOCKS, BLOCK_SIZE, NUM_HEAD, V_HEAD_SIZE});
    // Create the slotMapping tensor
    std::vector<int32_t> slotMappingData = SlotmappingGeneration(NUM_BLOCKS * BLOCK_SIZE, NUM_TOKENS);
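    // Each slot index s is assumed to address block s / BLOCK_SIZE, in-block offset s % BLOCK_SIZE
    // of the caches above (standard paged-KV-cache addressing; see the operator documentation for
    // the exact semantics).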
    atb::Tensor tensorSlotMapping = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS});
    CHECK_STATUS(aclrtMemcpy(tensorSlotMapping.deviceData,
        tensorSlotMapping.dataSize,
        slotMappingData.data(),
        sizeof(int32_t) * slotMappingData.size(),
        ACL_MEMCPY_HOST_TO_DEVICE));
    // Place all input tensors into the SVector in the order required by the operator
    atb::SVector<atb::Tensor> inTensors = {tensorKey,
        tensorValue,
        tensorKCache,
        tensorVCache,
        tensorSlotMapping};
    // Assemble the VariantPack
    atb::VariantPack variantPack;
    variantPack.inTensors = inTensors;  // input tensors
    variantPack.outTensors = {tensorKCache, tensorVCache};  // output tensors (the caches are updated in place)
    return variantPack;
}

/**
 * @brief Create a ReshapeAndCache Operation and set its parameters
 * @return atb::Operation * Pointer to the created Operation
 */
atb::Operation *PrepareOperation()
{
    atb::infer::ReshapeAndCacheParam opParam;
    opParam.compressType = atb::infer::ReshapeAndCacheParam::CompressType::COMPRESS_TYPE_UNDEFINED;
    opParam.kvCacheCfg = atb::infer::ReshapeAndCacheParam::KvCacheCfg::K_CACHE_V_CACHE;
    atb::Operation *reshapeAndCacheOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &reshapeAndCacheOp));
    return reshapeAndCacheOp;
}

int main(int argc, char **argv)
{
    // Set the device ID, create the context, and create and bind the execution stream
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);

    // ReshapeAndCache example
    atb::Operation *reshapeAndCacheOp = PrepareOperation();
    // Prepare the VariantPack
    atb::VariantPack variantPack = PrepareVariantPack(context, stream);
    uint64_t workspaceSize = 0;
    // Validate the input/output tensors and query the required workspace size
    CHECK_STATUS(reshapeAndCacheOp->Setup(variantPack, workspaceSize, context));
    uint8_t *workspacePtr = nullptr;
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&workspacePtr), workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute ReshapeAndCache
    CHECK_STATUS(reshapeAndCacheOp->Execute(variantPack, workspacePtr, workspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // Synchronize the stream; wait for the device-side computation to finish
    for (atb::Tensor &inTensor : variantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtFree(workspacePtr));
    }
    // Release resources
    CHECK_STATUS(atb::DestroyOperation(reshapeAndCacheOp));  // Destroy the operation first (object-scoped resource)
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(atb::DestroyContext(context));  // Destroy the context afterwards (global resource)
    CHECK_STATUS(aclFinalize());
    std::cout << "Reshape and Cache demo success!" << std::endl;
    return 0;
}
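
To spot-check the result, a cached key row can be copied back to the host before the device buffers are freed and compared with the original input. The helper below is a minimal sketch and is not part of the original demo: it assumes that slot s occupies the contiguous element range [s * NUM_HEAD * K_HEAD_SIZE, (s + 1) * NUM_HEAD * K_HEAD_SIZE) of the flattened float16 kCache, and that the aclFloat16 type and the aclFloat16ToFloat conversion from acl_base.h are available. Using it also requires keeping keyData and slotMappingData reachable after Execute (for example by returning them from PrepareVariantPack) and calling the helper after aclrtSynchronizeStream but before the aclrtFree loop.

/**
 * @brief Hypothetical helper (not part of the original demo): read one cached key row back to
 *        the host and compare it against the source key data.
 * @param tensorKCache Device-side kCache tensor written by the operator
 * @param keyData Host-side key data that was fed into the operator
 * @param slotMapping Host-side slotMapping data
 * @param tokenIdx Index of the token to check
 * @return true if the cached row matches the source data within float16 tolerance
 */
bool CheckCachedKeyRow(const atb::Tensor &tensorKCache, const std::vector<float> &keyData,
                       const std::vector<int32_t> &slotMapping, uint32_t tokenIdx)
{
    const size_t rowElems = NUM_HEAD * K_HEAD_SIZE;
    const size_t slot = static_cast<size_t>(slotMapping[tokenIdx]);
    const size_t rowBytes = rowElems * sizeof(aclFloat16);
    std::vector<aclFloat16> hostRow(rowElems);
    // Slot `slot` is assumed to start at element slot * rowElems of the flattened cache
    const uint8_t *src = static_cast<const uint8_t *>(tensorKCache.deviceData) + slot * rowBytes;
    if (aclrtMemcpy(hostRow.data(), rowBytes, src, rowBytes, ACL_MEMCPY_DEVICE_TO_HOST) != ACL_SUCCESS) {
        return false;
    }
    for (size_t i = 0; i < rowElems; ++i) {
        float expected = keyData[tokenIdx * rowElems + i];
        float actual = aclFloat16ToFloat(hostRow[i]);
        float diff = actual > expected ? actual - expected : expected - actual;
        if (diff > 0.1f) {  // tolerate float16 rounding of values generated in [-100, 100]
            return false;
        }
    }
    return true;
}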