Operator Invocation Example (C++)

For prerequisites and build commands, see Operator Invocation Example.

Scenario: basic scenario.
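
The listing below walks through the full call flow twice: it creates a Gelu activation operation and then a Swiglu_forward activation operation, feeds each one a random float input of shape [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE], synchronizes the stream, and releases every device resource it allocated.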

#include <iostream>
#include <vector>
#include <numeric>
#include <random>
#include "acl/acl.h"
#include "atb/operation.h"
#include "atb/types.h"
#include "atb/atb_infer.h"
#include "demo_util.h"
const uint32_t BATCH_SIZE = 16;     // batch size
const uint32_t SEQ_LEN = 1024;      // sequence length
const uint32_t HIDDEN_SIZE = 4096;  // hidden dimension
/**
 * @brief Prepare the input tensors for atb::VariantPack
 * @return atb::SVector<atb::Tensor> one input tensor of shape [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]
 */
atb::SVector<atb::Tensor> PrepareInTensor()
{
    // Fill a vector of BATCH_SIZE * SEQ_LEN * HIDDEN_SIZE floats with random values in the range [-100, 100)
    std::vector<float> inTensorData(BATCH_SIZE * SEQ_LEN * HIDDEN_SIZE);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(-100.0f, 100.0f);
    for (float &val : inTensorData) {
        val = dis(gen);
    }
    // Create the input tensor and copy the host data to the device
    atb::Tensor inTensor = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE});
    CHECK_STATUS(aclrtMemcpy(inTensor.deviceData,
        inTensor.dataSize,
        inTensorData.data(),
        sizeof(float) * inTensorData.size(),
        ACL_MEMCPY_HOST_TO_DEVICE));
    atb::SVector<atb::Tensor> inTensors = {inTensor};
    return inTensors;
}
/**
 * @brief Create a Gelu activation Operation and set its parameters
 * @return atb::Operation * a pointer to the created Operation
 */
atb::Operation *GeluOperation()
{
    atb::infer::ActivationParam opParam;
    opParam.activationType = atb::infer::ActivationType::ACTIVATION_FASTER_GELU_FORWARD;
    atb::Operation *geluOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &geluOp));
    return geluOp;
}
/**
 * @brief Create a Swiglu_forward activation Operation and set its parameters
 * @return atb::Operation * a pointer to the created Operation
 */
atb::Operation *SwigluOperation()
{
    atb::infer::ActivationParam opParam;
    opParam.activationType = atb::infer::ActivationType::ACTIVATION_SWIGLU_FORWARD;
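    // dim selects the axis along which Swiglu splits the input into its two halves; -1 means the last axis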
    opParam.dim = -1;
    atb::Operation *swigluOp = nullptr;
    CHECK_STATUS(atb::CreateOperation(opParam, &swigluOp));
    return swigluOp;
}
int main(int argc, char **argv)
{
    // Set the device ID, create the context, and set the stream
    CHECK_STATUS(aclInit(nullptr));
    int32_t deviceId = 0;
    CHECK_STATUS(aclrtSetDevice(deviceId));
    atb::Context *context = nullptr;
    CHECK_STATUS(atb::CreateContext(&context));
    void *stream = nullptr;
    CHECK_STATUS(aclrtCreateStream(&stream));
    context->SetExecuteStream(stream);
    // Gelu activation example
    atb::Operation *geluOp = GeluOperation();
    // Prepare the VariantPack
    atb::VariantPack geluVariantPack;
    geluVariantPack.inTensors = PrepareInTensor();  // add the input tensor
    atb::Tensor tensorOut = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE});
    geluVariantPack.outTensors.push_back(tensorOut);  // add the output tensor
    uint64_t geluWorkspaceSize = 0;
    // Gelu activation setup
    CHECK_STATUS(geluOp->Setup(geluVariantPack, geluWorkspaceSize, context));
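    // Setup reports the workspace size Execute will need; allocate device memory for it when non-zero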
    uint8_t *geluWorkspacePtr = nullptr;
    if (geluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&geluWorkspacePtr), geluWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute the Gelu activation
    CHECK_STATUS(geluOp->Execute(geluVariantPack, geluWorkspacePtr, geluWorkspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // synchronize the stream; wait for device-side tasks to finish
    for (atb::Tensor &inTensor : geluVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    for (atb::Tensor &outTensor : geluVariantPack.outTensors) {
        CHECK_STATUS(aclrtFree(outTensor.deviceData));
    }
    if (geluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtFree(geluWorkspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(geluOp));  // destroy the operation first; it is an object-scoped resource
    std::cout << "Gelu Activation demo success!" << std::endl;
    // Swiglu_forward activation example
    atb::Operation *swigluOp = SwigluOperation();
    // Prepare the VariantPack
    atb::VariantPack swigluVariantPack;
    swigluVariantPack.inTensors = PrepareInTensor();  // add the input tensor
    // Swiglu_forward splits the input in two along `dim`, so the output's last dimension is HIDDEN_SIZE / 2
    tensorOut = CreateTensor(ACL_FLOAT, aclFormat::ACL_FORMAT_ND, {BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE / 2});
    swigluVariantPack.outTensors.push_back(tensorOut);  // add the output tensor
    uint64_t swigluWorkspaceSize = 0;
    // Swiglu_forward activation setup
    CHECK_STATUS(swigluOp->Setup(swigluVariantPack, swigluWorkspaceSize, context));
    uint8_t *swigluWorkspacePtr = nullptr;
    if (swigluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtMalloc((void **)(&swigluWorkspacePtr), swigluWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    }
    // Execute the Swiglu_forward activation
    CHECK_STATUS(swigluOp->Execute(swigluVariantPack, swigluWorkspacePtr, swigluWorkspaceSize, context));
    CHECK_STATUS(aclrtSynchronizeStream(stream));  // synchronize the stream; wait for device-side tasks to finish
    for (atb::Tensor &inTensor : swigluVariantPack.inTensors) {
        CHECK_STATUS(aclrtFree(inTensor.deviceData));
    }
    for (atb::Tensor &outTensor : swigluVariantPack.outTensors) {
        CHECK_STATUS(aclrtFree(outTensor.deviceData));
    }
    if (swigluWorkspaceSize > 0) {
        CHECK_STATUS(aclrtFree(swigluWorkspacePtr));
    }
    CHECK_STATUS(atb::DestroyOperation(swigluOp));  // destroy the operation first; it is an object-scoped resource
    std::cout << "Swiglu_forward Activation demo success!" << std::endl;
    // Release global resources
    CHECK_STATUS(aclrtDestroyStream(stream));
    CHECK_STATUS(atb::DestroyContext(context));  // destroy the context last; it is a global resource
    CHECK_STATUS(aclFinalize());
    return 0;
}
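
The sample depends on two helpers from demo_util.h that this page does not show: CHECK_STATUS and CreateTensor. The sketch below illustrates what minimal versions might look like, inferred from how the sample uses them; the atb::Tensor field layout and the atb::Utils::GetTensorSize call follow the public ATB headers, but the shipped demo_util.h may differ, so treat this as an illustration rather than the official implementation.

#include <cstdlib>
#include <iostream>
#include <vector>
#include "acl/acl.h"
#include "atb/types.h"
#include "atb/utils.h"

// Abort with a message if an ACL/ATB call returns a non-zero status code.
#define CHECK_STATUS(expr)                                                     \
    do {                                                                       \
        auto _status = (expr);                                                 \
        if (_status != 0) {                                                    \
            std::cerr << #expr << " failed, status: " << _status << std::endl; \
            std::exit(1);                                                      \
        }                                                                      \
    } while (0)

// Fill in a tensor descriptor, derive its byte size, and allocate device memory for it.
atb::Tensor CreateTensor(aclDataType dtype, aclFormat format, const std::vector<int64_t> &shape)
{
    atb::Tensor tensor;
    tensor.desc.dtype = dtype;
    tensor.desc.format = format;
    tensor.desc.shape.dimNum = shape.size();
    for (size_t i = 0; i < shape.size(); ++i) {
        tensor.desc.shape.dims[i] = shape[i];
    }
    tensor.dataSize = atb::Utils::GetTensorSize(tensor);  // byte size implied by dtype and shape
    CHECK_STATUS(aclrtMalloc(&tensor.deviceData, tensor.dataSize, ACL_MEM_MALLOC_HUGE_FIRST));
    return tensor;
}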