更多样例
- 矢量计算基础API使用TensorTrait样例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
#include "kernel_operator.h" class KernelBinaryScalarTrait { public: __aicore__ inline KernelBinaryScalarTrait() {} __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src); dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm); pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t)); pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(srcLocal, srcGlobal, 512); inQueueSrc.EnQue(srcLocal); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>(); uint64_t mask = 128; int16_t scalar = 2; // repeatTime = 4, 128 elements one repeat, 512 elements total // dstBlkStride, srcBlkStride = 1, no gap between blocks in one repeat // dstRepStride, srcRepStride =8, no gap between repeats AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8}); outQueueDst.EnQue(dstLocal); inQueueSrc.FreeTensor(srcLocal); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(dstGlobal, dstLocal, 512); outQueueDst.FreeTensor(dstLocal); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc; AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst; AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal; }; extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { KernelBinaryScalarTrait op; op.Init(src, dstGm); op.Process(); }
- 矩阵计算基础API使用TensorTrait样例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
#include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T> class KernelMatmul { public: __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In) { m = mIn; k = kIn; n = nIn; aSize = m * k; bSize = k * n; cSize = m * n; initl0 = initl0In; initl1 = initl1In; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { aGM.SetGlobalBuffer((__gm__ fmap_T *)a); bGM.SetGlobalBuffer((__gm__ weight_T *)b); cGM.SetGlobalBuffer((__gm__ dstCO1_T *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>(); if(initl1 == true) { AscendC::InitConstValue(a1Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1}); AscendC::InitConstValue(b1Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1}); } else { AscendC::DataCopy(a1Local, aGM, aSize); AscendC::DataCopy(b1Local, bGM, bSize); } inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); } __aicore__ inline void SplitA() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>(); // 1、load2d L1->L0A AscendC::LoadData2dParams loadL0AParams; loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512; loadL0AParams.srcStride = 1; loadL0AParams.dstGap = 0; if (initl0 == true) { InitConstValue(a2Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1}); } else{ LoadData(a2Local, a1Local, loadL0AParams); } inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local); inQueueA1.FreeTensor(a1Local); } __aicore__ inline void SplitB() { AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>(); // 2、load2d L1->L0B AscendC::LoadData2dParams loadL0BParams; loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512; loadL0BParams.srcStride = 1; loadL0BParams.dstGap = 0; if (initl0 == true) { AscendC::InitConstValue(b2Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1}); } else{ AscendC::LoadData(b2Local, b1Local, loadL0BParams); } inQueueB1.FreeTensor(b1Local); inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>(); mmadParams.isBias = false; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } #if __NPU_ARCH__ <= 2002 __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); uint16_t M_ = Ceil(m, 16) * 16; AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal; AscendC::TBuffAddr tbufublocal; tbufublocal.logicPos = (uint8_t)AscendC::TPosition::C1; ublocal.SetAddr(tbufublocal); ublocal.InitBuffer(0, M_ * n); DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024); DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX; AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams); PipeBarrier<PIPE_ALL>(); outQueueCO1.FreeTensor(c1Local); dataCopyParams.blockCount = 1; dataCopyParams.blockLen = m * n *sizeof(dstCO1_T) / ONE_BLK_SIZE; dataCopyParams.srcStride = 0; dataCopyParams.dstStride = 0; AscendC::DataCopy(cGM, ublocal, dataCopyParams); } #else __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; AscendC::Fixpipe(cGM, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } #endif private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::A1, 1> inQueueA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueA2; AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueB2; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM; AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM; AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM; uint16_t m, k, n; bool initl0, initl1; uint16_t aSize, bSize, cSize, b2Size; AscendC::MmadParams mmadParams; }; extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false( __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { if ASCEND_IS_AIV { return; } KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false); op.Init(a, b, c); op.Process(); }
父主题: TensorTrait