More Samples

Template Samples

This section provides a template sample for triple-operand scalar instructions so that you can quickly run the reference snippet given for a specific instruction.

Use the following template sample as the code framework: copy the sample snippet from the specific instruction's reference and use it to replace the content in bold (in the samples below, this appears to be the AscendC::Axpy call inside Compute()).

Complete example 1: half type
#include "kernel_operator.h"
// Sample kernel wrapping the Axpy vector instruction on half data.
// Flow: copy 512 half elements from global memory into a local tensor,
// run Axpy with scalar 2.0, then copy the 512 half results back out.
class KernelAxpy {
public:
    __aicore__ inline KernelAxpy() {}

    // Binds the global-memory input/output addresses as half buffers and
    // reserves one 512-element half buffer for each queue.
    __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
        dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(half));
    }

    // Runs the three stages once: copy in, compute, copy out.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    // Stage 1: load 512 elements from srcGlobal and hand them to Compute()
    // through the input queue.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
        AscendC::DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }

    // Stage 2: apply Axpy to the queued input and enqueue the result.
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
        AscendC::LocalTensor<half> dstLocal = outQueueDst.AllocTensor<half>();

        // Per the Ascend C API reference, Axpy accumulates into the
        // destination (dst = src * scalar + dst). A freshly allocated tensor
        // holds unspecified data, so clear it first; otherwise the result
        // depends on whatever was left in the buffer.
        AscendC::Duplicate<half>(dstLocal, (half)0.0, 512);

        AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 512);

        outQueueDst.EnQue<half>(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }

    // Stage 3: write the 512 computed elements to dstGlobal and release the
    // output buffer.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<half> dstLocal = outQueueDst.DeQue<half>();
        AscendC::DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }
private:
    AscendC::TPipe pipe;                                          // owns the queue buffers set up in Init()
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueSrc;     // CopyIn -> Compute
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueDst;   // Compute -> CopyOut
    AscendC::GlobalTensor<half> srcGlobal, dstGlobal;             // global-memory views of input/output
};
// Kernel entry point for the half -> half Axpy sample.
// srcGm / dstGm are the global-memory addresses of the input and output
// buffers; they are forwarded unchanged to KernelAxpy.
extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_half(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
{
    KernelAxpy axpyKernel;
    // Set up buffers first, then run the copy-in / compute / copy-out flow.
    axpyKernel.Init(srcGm, dstGm);
    axpyKernel.Process();
}
Input (srcGm):
[1. 1. 1. 1. 1. 1. ... 1.]
Output (dstGm):
[2. 2. 2. 2. 2. 2. ... 2.]

Complete example 2: mixed type (half source, float destination)
#include "kernel_operator.h"
// Sample kernel wrapping the mixed-type Axpy vector instruction:
// half source, float destination.
// Flow: copy 512 half elements from global memory into a local tensor,
// run Axpy with scalar 2.0 into a float tensor, then copy the 512 float
// results back out.
class KernelAxpy {
public:
    __aicore__ inline KernelAxpy() {}

    // Binds the global-memory input (half) and output (float) addresses and
    // reserves one 512-element buffer for each queue.
    __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
        dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm);
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(float));
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
    }

    // Runs the three stages once: copy in, compute, copy out.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    // Stage 1: load 512 half elements from srcGlobal and hand them to
    // Compute() through the input queue.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
        AscendC::DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }

    // Stage 2: apply Axpy to the queued half input, producing float output,
    // and enqueue the result.
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
        AscendC::LocalTensor<float> dstLocal = outQueueDst.AllocTensor<float>();

        // Per the Ascend C API reference, Axpy accumulates into the
        // destination (dst = src * scalar + dst). A freshly allocated tensor
        // holds unspecified data, so clear it first; otherwise the result
        // depends on whatever was left in the buffer.
        AscendC::Duplicate<float>(dstLocal, 0.0f, 512);

        // mask = 64 elements per repeat, 8 repeats -> 64 * 8 = 512 elements.
        // Trailing braces are the repeat parameters, presumably
        // { dstBlkStride, srcBlkStride, dstRepStride, srcRepStride } --
        // confirm against the Axpy API reference. The differing repeat
        // strides (8 vs 4) match 64 floats and 64 halfs occupying 8 and 4
        // 32-byte blocks respectively.
        AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 64, 8,{ 1, 1, 8, 4 });

        outQueueDst.EnQue<float>(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }

    // Stage 3: write the 512 computed float elements to dstGlobal and release
    // the output buffer.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<float> dstLocal = outQueueDst.DeQue<float>();
        AscendC::DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }
private:
    AscendC::TPipe pipe;                                          // owns the queue buffers set up in Init()
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueSrc;     // CopyIn -> Compute
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueDst;   // Compute -> CopyOut
    AscendC::GlobalTensor<half> srcGlobal;                        // global-memory view of the half input
    AscendC::GlobalTensor<float> dstGlobal;                       // global-memory view of the float output
};
// Kernel entry point for the half -> float (mixed-type) Axpy sample.
// srcGm / dstGm are the global-memory addresses of the input and output
// buffers; they are forwarded unchanged to KernelAxpy.
extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_float(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
{
    KernelAxpy axpyKernel;
    // Set up buffers first, then run the copy-in / compute / copy-out flow.
    axpyKernel.Init(srcGm, dstGm);
    axpyKernel.Process();
}
Input (srcGm):
[1. 1. 1. 1. 1. 1. ... 1.]
Output (dstGm):
[2. 2. 2. 2. 2. 2. ... 2.]