更多样例

矢量计算基础API使用TensorTrait样例

#include "kernel_operator.h"
class KernelBinaryScalarTrait {
public:
    __aicore__ inline KernelBinaryScalarTrait() {}
    __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src);
        dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm);
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>();
        AscendC::DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>();
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>();

        uint64_t mask = 128;
        int16_t scalar = 2;
        // repeatTime = 4, 128 elements one repeat, 512 elements total
       // dstBlkStride, srcBlkStride = 1, no gap between blocks in one repeat
       // dstRepStride, srcRepStride =8, no gap between repeats
        AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8});
        
        outQueueDst.EnQue(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>();
        AscendC::DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
    AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal;
};
extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
{
    KernelBinaryScalarTrait op;
    op.Init(src, dstGm);
    op.Process();
}

矩阵计算基础API使用TensorTrait样例

#include "kernel_operator.h"
template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T> class KernelMatmul {
public:
    __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In)
    {
        m = mIn;
        k = kIn;
        n = nIn;
        aSize = m * k;
        bSize = k * n;
        cSize = m * n;
        initl0 = initl0In;
        initl1 = initl1In;
    }
    __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
    {
        aGM.SetGlobalBuffer((__gm__ fmap_T *)a);
        bGM.SetGlobalBuffer((__gm__ weight_T *)b);
        cGM.SetGlobalBuffer((__gm__ dstCO1_T *)c);
        pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T));
        pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T));
        pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T));
        pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T));
        pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        SplitA();
        SplitB();
        Compute();
        CopyOut();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>();
        if(initl1 == true) {
            AscendC::InitConstValue(a1Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1});
            AscendC::InitConstValue(b1Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1});
        } else {
            AscendC::DataCopy(a1Local, aGM, aSize);
            AscendC::DataCopy(b1Local, bGM, bSize);
        }
        inQueueA1.EnQue(a1Local);
        inQueueB1.EnQue(b1Local);
    }
    __aicore__ inline void SplitA()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>();
        // 1、load2d L1->L0A
        AscendC::LoadData2dParams loadL0AParams;
        loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512;
        loadL0AParams.srcStride = 1;
        loadL0AParams.dstGap = 0;
        if (initl0 == true) {
            InitConstValue(a2Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1});
        } else{
            LoadData(a2Local, a1Local, loadL0AParams);
        }
        inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local);
        inQueueA1.FreeTensor(a1Local);
    }
    __aicore__ inline void SplitB()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>();
        // 2、load2d L1->L0B
        AscendC::LoadData2dParams loadL0BParams;
        loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512;
        loadL0BParams.srcStride = 1;
        loadL0BParams.dstGap = 0;
        if (initl0 == true) {
            AscendC::InitConstValue(b2Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1});
        } else{
            AscendC::LoadData(b2Local, b1Local, loadL0BParams);
        }
        inQueueB1.FreeTensor(b1Local);
        inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>();
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>();
        mmadParams.isBias = false;
        mmadParams.m = m;
        mmadParams.n = n;
        mmadParams.k = k;
        AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n
        outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local);
        inQueueA2.FreeTensor(a2Local);
        inQueueB2.FreeTensor(b2Local);
    }
#if __NPU_ARCH__ <= 2002
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();
        uint16_t M_ = Ceil(m, 16) * 16;
        AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal;
        AscendC::TBuffAddr tbufublocal;
        tbufublocal.logicPos = (uint8_t)AscendC::TPosition::C1;
        ublocal.SetAddr(tbufublocal);
        ublocal.InitBuffer(0, M_ * n);
        DataCopyParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024);
        DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX;
        AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams);
        PipeBarrier<PIPE_ALL>();
        outQueueCO1.FreeTensor(c1Local);
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = m * n *sizeof(dstCO1_T) / ONE_BLK_SIZE;
        dataCopyParams.srcStride = 0;
        dataCopyParams.dstStride = 0;
        AscendC::DataCopy(cGM, ublocal, dataCopyParams);
    }
#else
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();
        AscendC::FixpipeParamsV220 fixpipeParams;
        fixpipeParams.nSize = n;
        fixpipeParams.mSize = m;
        fixpipeParams.srcStride = m;
        fixpipeParams.dstStride = n;
        fixpipeParams.ndNum = 1;
        fixpipeParams.srcNdStride = 0;
        fixpipeParams.dstNdStride = 0;
        AscendC::Fixpipe(cGM, c1Local, fixpipeParams);
        outQueueCO1.FreeTensor(c1Local);
    }
#endif
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::A1, 1> inQueueA1;
    AscendC::TQue<AscendC::TPosition::A2, 1> inQueueA2;
    AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1;
    AscendC::TQue<AscendC::TPosition::B2, 1> inQueueB2;
    // dst queue
    AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1;
    AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM;
    AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM;
    AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM;
    uint16_t m, k, n;
    bool initl0, initl1;
    uint16_t aSize, bSize, cSize, b2Size;
    AscendC::MmadParams mmadParams;
};
extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false(
    __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
{
    if ASCEND_IS_AIV {
        return;
    }
    KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false);
    op.Init(a, b, c);
    op.Process();
}

父主题： TensorTrait