For the prerequisites and build commands, see the operator invocation example. Currently, only … is supported.
Compared with Example 1, the main changes in this example are the following:
Scenario: basic scenario.
#include <iostream>
#include <vector>
#include <random>
#include <algorithm>
#include <stdexcept>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"
#include "demo_util.h"

uint32_t NUM_TOKENS = 3;
uint32_t NUM_HEAD = 4;
uint32_t K_HEAD_SIZE = 128;
uint32_t V_HEAD_SIZE = K_HEAD_SIZE;
uint32_t NUM_BLOCKS = 512;
uint32_t BLOCK_SIZE = 128;

/**
 * @brief Prepare random contents for the input tensor key or the input tensor value.
 * @param kvflag false: key; true: value
 * @return Host-side data for the key or value tensor
 */
std::vector<int8_t> KvGeneration(bool kvflag)
{
    // Create the random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    // Define the value range of the random numbers
    std::uniform_real_distribution<> dis(-100.0, 100.0);
    // Number of random elements to generate
    size_t num_elements = kvflag ? NUM_TOKENS * NUM_HEAD * V_HEAD_SIZE : NUM_TOKENS * NUM_HEAD * K_HEAD_SIZE;
    // Create a vector and fill it with random values
    std::vector<int8_t> intensorKV;
    intensorKV.reserve(num_elements);
    for (size_t i = 0; i < num_elements; ++i) {
        intensorKV.push_back(static_cast<int8_t>(dis(gen)));
    }
    return intensorKV;
}

/**
 * @brief Prepare random contents for the input tensor slotMapping.
 * @param slotRange Number of available cache slots (NUM_BLOCKS * BLOCK_SIZE)
 * @param num_tokens Length of the slot mapping
 * @return Host-side data for the slotMapping tensor
 */
std::vector<int32_t> SlotmappingGeneration(int slotRange, size_t num_tokens)
{
    // Collect all valid slot indices in the range [0, slotRange)
    std::vector<int32_t> all_numbers;
    for (int i = 0; i < slotRange; ++i) {
        all_numbers.push_back(i);
    }
    // Check that num_tokens does not exceed the number of available slots
    if (num_tokens > all_numbers.size()) {
        throw std::invalid_argument("num_tokens exceeds the range of unique numbers available");
    }
    // Create the random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    // Shuffle all slot indices
    std::shuffle(all_numbers.begin(), all_numbers.end(), gen);
    // Take the first num_tokens elements so that each token is written to a unique slot
    std::vector<int32_t> slotmapping(all_numbers.begin(), all_numbers.begin() + num_tokens);
    return slotmapping;
}

/**
 * @brief Prepare the atb::VariantPack.
 * @param contextPtr Pointer to the ATB context
 * @param stream Stream used for the host-to-device copies
 * @return atb::VariantPack
 */
atb::VariantPack PrepareVariantPack(atb::Context *contextPtr, aclrtStream stream)
{
    // Create the key and value tensors
    std::vector<int8_t> keyData = KvGeneration(false);
    atb::Tensor tensorKey = CreateTensorFromVector(
        contextPtr, stream, keyData, ACL_INT8, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS, NUM_HEAD, K_HEAD_SIZE});
    std::vector<int8_t> valueData = KvGeneration(true);
    atb::Tensor tensorValue = CreateTensorFromVector(
        contextPtr, stream, valueData, ACL_INT8, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS, NUM_HEAD, V_HEAD_SIZE});
    // Create the kCache and vCache tensors (zero-initialized, FRACTAL_NZ format)
    std::vector<int8_t> kCacheData(NUM_BLOCKS * BLOCK_SIZE * NUM_HEAD * K_HEAD_SIZE, 0);
    atb::Tensor tensorKCache = CreateTensorFromVector(contextPtr, stream, kCacheData, ACL_INT8,
        aclFormat::ACL_FORMAT_FRACTAL_NZ, {NUM_BLOCKS, NUM_HEAD * K_HEAD_SIZE / 16, BLOCK_SIZE, 16});
    std::vector<int8_t> vCacheData(NUM_BLOCKS * BLOCK_SIZE * NUM_HEAD * V_HEAD_SIZE, 0);
    atb::Tensor tensorVCache = CreateTensorFromVector(contextPtr, stream, vCacheData, ACL_INT8,
        aclFormat::ACL_FORMAT_FRACTAL_NZ, {NUM_BLOCKS, NUM_HEAD * V_HEAD_SIZE / 16, BLOCK_SIZE, 16});
    // Create the slotMapping tensor and copy its data to the device
    std::vector<int32_t> slotMappingData = SlotmappingGeneration(NUM_BLOCKS * BLOCK_SIZE, NUM_TOKENS);
    atb::Tensor tensorSlotMapping = CreateTensor(ACL_INT32, aclFormat::ACL_FORMAT_ND, {NUM_TOKENS});
    CHECK_STATUS(aclrtMemcpy(tensorSlotMapping.deviceData, tensorSlotMapping.dataSize, slotMappingData.data(),
        sizeof(int32_t) * slotMappingData.size(), ACL_MEMCPY_HOST_TO_DEVICE));
    // Put all input tensors into the SVector in the order expected by the operator
    atb::SVector<atb::Tensor> inTensors = {tensorKey, tensorValue, tensorKCache, tensorVCache, tensorSlotMapping};
    // Assemble the variant pack
    atb::VariantPack variantPack;
    variantPack.inTensors = inTensors;                      // input tensors
    variantPack.outTensors = {tensorKCache, tensorVCache};  // output tensors (the caches are updated in place)
    return variantPack;
}

/**
 * @brief Create a ReshapeAndCache operation and set its parameters.
 * @return atb::Operation* Pointer to the created operation
 */
atb::Operation *PrepareOperation()
{
    atb::infer::ReshapeAndCacheParam opParam;
    opParam.compressType = atb::infer::ReshapeAndCacheParam::CompressType::COMPRESS_TYPE_UNDEFINED;
    opParam.kvCacheCfg = atb::infer::ReshapeAndCacheParam::KvCacheCfg::K_CACHE_V_CACHE;
    atb::Operation *reshapeAndCacheOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &reshapeAndCacheOp));
    return reshapeAndCacheOp;
}

int main(int argc, char **argv)
{
    // Select the device, create the context and set the execute stream
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);
    // ReshapeAndCache example
    atb::Operation *reshapeAndCacheOp = PrepareOperation();
    // Prepare the variantPack
    atb::VariantPack variantPack = PrepareVariantPack(context, stream);
    uint64_t workspaceSize = 0;
    // Validate the input and output tensors and query the workspace size
    CHECK_STATUS(reshapeAndCacheOp->Setup(variantPack, workspaceSize, context));
    uint8_t *workspacePtr = nullptr;
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&workspacePtr), workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute ReshapeAndCache
    CHECK_STATUS(reshapeAndCacheOp->Execute(variantPack, workspacePtr, workspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream)); // synchronize the stream and wait for the device-side task to finish
    // Free the device memory of the input tensors
    for (atb::Tensor &inTensor : variantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    if (workspaceSize > 0) {
        CHECK_STATUS(aclrtFree(workspacePtr));
    }
    // Release resources
    CHECK_STATUS(atb::DestroyOperation(reshapeAndCacheOp)); // destroy the operation (object-level resource) first
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(atb::DestroyContext(context)); // destroy the context (global resource) last
    CHECK_STATUS(aclFinalize());
    std::cout << "Reshape and Cache demo success!" << std::endl;
    return 0;
}
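The helpers CHECK_STATUS, CreateTensor and CreateTensorFromVector come from demo_util.h, which ships with the operator invocation example and is not reproduced here. The following is only a minimal sketch of what such helpers might look like, assuming the atb::Tensor field layout (desc, deviceData, dataSize) and a synchronous host-to-device copy; it is not the actual contents of demo_util.h.

// Hypothetical sketch of the demo_util.h helpers used above (assumption, for illustration only).
#include <vector>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include "acl/acl.h"
#include "atb/types.h"
#include "atb/operation.h"

// Hypothetical error-checking macro: print the failing expression and abort on a non-zero status.
#define CHECK_STATUS(expr)                                                    \
    do {                                                                      \
        auto status_ = (expr);                                                \
        if (status_ != 0) {                                                   \
            std::cerr << #expr << " failed, status " << status_ << std::endl; \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)

// Hypothetical helper: fill in the tensor description and allocate its device memory.
inline atb::Tensor CreateTensor(aclDataType dtype, aclFormat format, const std::vector<int64_t> &dims)
{
    atb::Tensor tensor;
    tensor.desc.dtype = dtype;
    tensor.desc.format = format;
    tensor.desc.shape.dimNum = dims.size();
    uint64_t elementCount = 1;
    for (size_t i = 0; i < dims.size(); ++i) {
        tensor.desc.shape.dims[i] = dims[i];
        elementCount *= static_cast<uint64_t>(dims[i]);
    }
    tensor.dataSize = elementCount * aclDataTypeSize(dtype); // total size in bytes
    CHECK_STATUS(aclrtMalloc(&tensor.deviceData, tensor.dataSize, ACL_MEM_MALLOC_HUGE_FIRST));
    return tensor;
}

// Hypothetical helper: create a tensor and copy host data into its device memory.
template <typename T>
inline atb::Tensor CreateTensorFromVector(atb::Context *contextPtr, aclrtStream stream,
                                          const std::vector<T> &hostData, aclDataType dtype,
                                          aclFormat format, const std::vector<int64_t> &dims)
{
    (void)contextPtr; // this sketch copies synchronously, so context and stream are unused here
    (void)stream;
    atb::Tensor tensor = CreateTensor(dtype, format, dims);
    CHECK_STATUS(aclrtMemcpy(tensor.deviceData, tensor.dataSize, hostData.data(),
                             sizeof(T) * hostData.size(), ACL_MEMCPY_HOST_TO_DEVICE));
    return tensor;
}

The real helpers may instead issue the copy asynchronously on the given stream, which is why the demo passes contextPtr and stream through to CreateTensorFromVector.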