For prerequisites and build commands, see the operator invocation example.
Scenario: basic scenario.
#include <iostream>
#include <vector>
#include <numeric>
#include <random>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"
#include "demo_util.h"

const uint32_t BATCH_SIZE = 16;    // batch size
const uint32_t SEQ_LEN = 1024;     // sequence length
const uint32_t HIDDEN_SIZE = 4096; // hidden dimension

/**
 * @brief Prepare the input tensors for atb::VariantPack
 * @return atb::SVector<atb::Tensor> input tensors of shape [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]
 */
atb::SVector<atb::Tensor> PrepareInTensor()
{
    // Create a vector of BATCH_SIZE * SEQ_LEN * HIDDEN_SIZE random values in the range [-100, 100)
    std::vector<float> inTensorData(BATCH_SIZE * SEQ_LEN * HIDDEN_SIZE);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(-100.0f, 100.0f);
    for (float &val : inTensorData) {
        val = dis(gen);
    }
    // Create the input tensor and copy the host data to the device
    atb::Tensor inTensor = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE});
    CHECK_STATUS(aclrtMemcpy(inTensor.deviceData, inTensor.dataSize, inTensorData.data(),
                             sizeof(float) * inTensorData.size(), ACL_MEMCPY_HOST_TO_DEVICE));
    atb::SVector<atb::Tensor> inTensors = {inTensor};
    return inTensors;
}

/**
 * @brief Create a Gelu Activation operation and set its parameters
 * @return atb::Operation * pointer to the created operation
 */
atb::Operation *GeluOperation()
{
    atb::infer::ActivationParam opParam;
    opParam.activationType = atb::infer::ActivationType::ACTIVATION_FASTER_GELU_FORWARD;
    atb::Operation *geluOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &geluOp));
    return geluOp;
}

/**
 * @brief Create a Swiglu_forward Activation operation and set its parameters
 * @return atb::Operation * pointer to the created operation
 */
atb::Operation *SwigluOperation()
{
    atb::infer::ActivationParam opParam;
    opParam.activationType = atb::infer::ActivationType::ACTIVATION_SWIGLU_FORWARD;
    opParam.dim = -1;
    atb::Operation *swigluOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &swigluOp));
    return swigluOp;
}

int main(int argc, char **argv)
{
    // Set the device, create the context, create the stream and bind it to the context
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);

    // Gelu Activation example
    atb::Operation *geluOp = GeluOperation();
    // Prepare the VariantPack
    atb::VariantPack geluVariantPack;
    geluVariantPack.inTensors = PrepareInTensor(); // add the input tensor
    atb::Tensor tensorOut = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE});
    geluVariantPack.outTensors.push_back(tensorOut); // add the output tensor
    uint64_t geluWorkspaceSize = 0;
    // Gelu Activation setup
    CHECK_STATUS(geluOp->Setup(geluVariantPack, geluWorkspaceSize, context));
    uint8_t *geluWorkspacePtr = nullptr;
    if (geluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&geluWorkspacePtr), geluWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Gelu Activation execution
    CHECK_STATUS(geluOp->Execute(geluVariantPack, geluWorkspacePtr, geluWorkspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream)); // synchronize the stream: wait for the device-side tasks to finish
    for (atb::Tensor &inTensor : geluVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    for (atb::Tensor &outTensor : geluVariantPack.outTensors) {
        CHECK_STATUS(aclrtFree(outTensor.deviceData));
    }
    if (geluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtFree(geluWorkspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(geluOp)); // the operation is a per-object resource; release it first
    std::cout << "Gelu Activation demo success!" << std::endl;

    // Swiglu_forward Activation example
    atb::Operation *swigluOp = SwigluOperation();
    // Prepare the VariantPack
    atb::VariantPack swigluVariantPack;
    swigluVariantPack.inTensors = PrepareInTensor(); // add the input tensor
    // SWIGLU_FORWARD splits the last dimension in half, so the output last dimension is HIDDEN_SIZE / 2
    tensorOut = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE / 2});
    swigluVariantPack.outTensors.push_back(tensorOut); // add the output tensor
    uint64_t swigluWorkspaceSize = 0;
    // Swiglu Activation setup
    CHECK_STATUS(swigluOp->Setup(swigluVariantPack, swigluWorkspaceSize, context));
    uint8_t *swigluWorkspacePtr = nullptr;
    if (swigluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&swigluWorkspacePtr), swigluWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Swiglu Activation execution
    CHECK_STATUS(swigluOp->Execute(swigluVariantPack, swigluWorkspacePtr, swigluWorkspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream)); // synchronize the stream: wait for the device-side tasks to finish
    for (atb::Tensor &inTensor : swigluVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    for (atb::Tensor &outTensor : swigluVariantPack.outTensors) {
        CHECK_STATUS(aclrtFree(outTensor.deviceData));
    }
    if (swigluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtFree(swigluWorkspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(swigluOp)); // the operation is a per-object resource; release it first
    std::cout << "Swiglu_forward Activation demo success!" << std::endl;

    // Release global resources
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(atb::DestroyContext(context)); // the context is a global resource; release it last
    CHECK_STATUS(aclFinalize());
    return 0;
}
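The demo only prints a success message and never inspects the computed values. If you want to sanity-check a result, one option is to copy the output tensor back to the host after the stream synchronization and before the aclrtFree calls. The helper below is a minimal sketch, not part of the original sample: PrintFirstValues is a hypothetical name, and the code assumes the output tensor holds ACL_FLOAT data and that deviceData/dataSize are populated by CreateTensor and checked with CHECK_STATUS from demo_util.h, as in the demo above.

#include <iostream>
#include <vector>
#include "acl/acl.h"
#include "atb/types.h"
#include "demo_util.h"

// Hypothetical helper: copy an output tensor back to the host and print its first few values.
// Assumes the tensor holds float (ACL_FLOAT) data and that deviceData/dataSize were filled in at creation time.
void PrintFirstValues(const atb::Tensor &outTensor, size_t count)
{
    std::vector<float> hostData(outTensor.dataSize / sizeof(float));
    CHECK_STATUS(aclrtMemcpy(hostData.data(), outTensor.dataSize, outTensor.deviceData,
                             outTensor.dataSize, ACL_MEMCPY_DEVICE_TO_HOST));
    for (size_t i = 0; i < count && i < hostData.size(); ++i) {
        std::cout << hostData[i] << " ";
    }
    std::cout << std::endl;
}

It could be called, for example, as PrintFirstValues(geluVariantPack.outTensors.at(0), 8) right after CHECK_STATUS(aclrtSynchronizeStream(stream)) in the Gelu section, while the device memory is still allocated.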