InitConstValue
Supported Products
Product |
Supported/Unsupported |
|---|---|
√ |
|
√ |
|
√ |
|
√ |
|
x |
|
√ |
Function Usage
Initializes the LocalTensor of a specific TPosition to a specific value.
Prototype
1 2 | template <typename T, typename U = PrimT<T>, typename Std::enable_if<Std::is_same<PrimT<T>, U>::value, bool>::type = true> __aicore__ inline void InitConstValue(const LocalTensor<T>& dst, const InitConstValueParams<U>& initConstValueParams) |
Parameters
Parameter |
Description |
|---|---|
T |
Data type of the destination. For the For the For the For the For the |
U |
Data type of the initial value.
The last template parameter is used only for checking the preceding data types. |
Parameter |
Input/Output |
Meaning |
|---|---|---|
dst |
Output |
Destination operand for the result matrix, which is of the LocalTensor type. For the For the For the For the For the If TPosition is A1/B1, the start address must be 32-byte aligned. If TPosition is A2/B2, the start address must be 512-byte aligned. |
InitConstValueParams |
Input |
Initialization parameter, of the InitConstValueParams type. For details, see ${INSTALL_DIR}/include/ascendc/basic_api/interface/kernel_struct_mm.h. Replace ${INSTALL_DIR} with the actual CANN component directory. For details about the parameter description, see Table 3. For the Ascend 610's AI Core, only repeatTimes and initValue can be configured. For the For the
|
Parameter |
Meaning |
|---|---|
repeatTimes |
Number of iterations. The default value is 0.
|
blockNum |
Number of data blocks initialized in each iteration. blockNum ∈ [0, 32767]. The default value is 0.
|
dstGap |
Gap between the end address of the previous iteration and the start address of the next iteration of the destination operand.
Value range: dstGap ∈ [0, 32767]. The default value is 0. |
initValue |
Initial value. The supported data type is the same as that of dst. |
Restrictions
- For details about the operand address alignment requirements, see General Address Alignment Restrictions.
Example
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | #include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T> class KernelCubeMmad { public: __aicore__ inline KernelCubeMmad() { C0 = 32 / sizeof(fmap_T); C1 = channelSize / C0; coutBlocks = (Cout + 16 - 1) / 16; ho = H - dilationH * (Kh - 1); wo = W - dilationW * (Kw - 1); howo = ho * wo; howoRound = ((howo + 16 - 1) / 16) * 16; featureMapA1Size = C1 * H * W * C0; // shape: [C1, H, W, C0] weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0] featureMapA2Size = howoRound * (C1 * Kh * Kw * C0); weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16; m = howo; k = C1 * Kh * Kw * C0; n = Cout; biasSize = Cout; // shape: [Cout] dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16] dstCO1Size = coutBlocks * howoRound * 16; fmRepeat = featureMapA2Size / (16 * C0); weRepeat = weightB2Size / (16 * C0); } __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* biasGm, __gm__ uint8_t* dstGm) { fmGlobal.SetGlobalBuffer((__gm__ fmap_T*)fmGm); weGlobal.SetGlobalBuffer((__gm__ weight_T*)weGm); biasGlobal.SetGlobalBuffer((__gm__ dstCO1_T*)biasGm); dstGlobal.SetGlobalBuffer((__gm__ dst_T*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(weight_T)); pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(weight_T)); pipe.InitBuffer(inQueueBiasA1, 1, biasSize * sizeof(dstCO1_T)); pipe.InitBuffer(outQueueCO1, 1, dstCO1Size * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); Split(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.AllocTensor<weight_T>(); AscendC::LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.AllocTensor<dstCO1_T>(); AscendC::InitConstValue(featureMapA1, {1, static_cast<uint16_t>(featureMapA1Size * sizeof(fmap_T) / 32), 0, 1}); AscendC::InitConstValue(weightB1, {1, static_cast<uint16_t>(weightA1Size * sizeof(weight_T) / 32), 0, 2}); AscendC::DataCopy(biasA1, biasGlobal, { 1, static_cast<uint16_t>(biasSize * sizeof(dstCO1_T) / 32), 0, 0 }); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); inQueueBiasA1.EnQue(biasA1); } __aicore__ inline void Split() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.DeQue<weight_T>(); AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.AllocTensor<weight_T>(); AscendC::InitConstValue(featureMapA2, {1, static_cast<uint16_t>(featureMapA2Size * sizeof(fmap_T) / 512), 0, 1}); AscendC::InitConstValue(weightB2, { 1, static_cast<uint16_t>(weightB2Size * sizeof(weight_T) / 512), 0, 2}); inQueueFmA2.EnQue<fmap_T>(featureMapA2); inQueueWeB2.EnQue<weight_T>(weightB2); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); } __aicore__ inline void Compute() { AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.DeQue<weight_T>(); AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.AllocTensor<dstCO1_T>(); AscendC::LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.DeQue<dstCO1_T>(); AscendC::Mmad(dstCO1, featureMapA2, weightB2, biasA1, { m, n, k, true, 0, false, false, false }); outQueueCO1.EnQue<dstCO1_T>(dstCO1); inQueueFmA2.FreeTensor(featureMapA2); inQueueWeB2.FreeTensor(weightB2); inQueueBiasA1.FreeTensor(biasA1); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.DeQue<dstCO1_T>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = coutBlocks * 16; fixpipeParams.mSize = howo; fixpipeParams.srcStride = howo; fixpipeParams.dstStride = howo * AscendC::BLOCK_CUBE * sizeof(dst_T) / AscendC::ONE_BLK_SIZE; fixpipeParams.quantPre = deqMode; AscendC::Fixpipe<dst_T, dstCO1_T, AscendC::CFG_NZ>(dstGlobal, dstCO1, fixpipeParams); outQueueCO1.FreeTensor(dstCO1); } private: AscendC::TPipe pipe; // feature map queue AscendC::TQue<AscendC::TPosition::A1, 1> inQueueFmA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueFmA2; // weight queue AscendC::TQue<AscendC::TPosition::B1, 1> inQueueWeB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueWeB2; // bias queue AscendC::TQue<AscendC::TPosition::A1, 1> inQueueBiasA1; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<fmap_T> fmGlobal; AscendC::GlobalTensor<weight_T> weGlobal; AscendC::GlobalTensor<dst_T> dstGlobal; AscendC::GlobalTensor<dstCO1_T> biasGlobal; uint16_t channelSize = 32; uint16_t H = 4, W = 4; uint8_t Kh = 2, Kw = 2; uint16_t Cout = 16; uint16_t C0, C1; uint8_t dilationH = 2, dilationW = 2; uint16_t coutBlocks, ho, wo, howo, howoRound; uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, biasSize, dstSize, dstCO1Size; uint16_t m, k, n; uint8_t fmRepeat, weRepeat; AscendC::QuantMode_t deqMode = AscendC::QuantMode_t::F322F16; }; extern "C" __global__ __aicore__ void cube_mmad_simple_kernel(__gm__ uint8_t *fmGm, __gm__ uint8_t *weGm, __gm__ uint8_t *biasGm, __gm__ uint8_t *dstGm) { KernelCubeMmad<half, half, half, half> op; op.Init(fmGm, weGm, biasGm, dstGm); op.Process(); } |