Load2D
Supported Products
Product |
Supported/Unsupported |
|---|---|
√ |
|
√ |
|
√ |
|
√ |
|
x |
|
√ |
Functions
Load2D supports the following data transfer paths:
GM->A1; GM->B1; GM->A2; GM->B2;
A1 -> A2; B1 -> B2.
Prototype
- Load2D API
1 2 3 4
template <typename T> __aicore__ inline void LoadData(const LocalTensor<T>& dst, const LocalTensor<T>& src, const LoadData2DParams& loadDataParams) template <typename T> __aicore__ inline void LoadData(const LocalTensor<T>& dst, const GlobalTensor<T>& src, const LoadData2DParams& loadDataParams)
Parameters
Field |
Meaning |
|---|---|
T |
Data type of the source operand and destination operand.
|
Parameter |
Input/Output |
Meaning |
|---|---|---|
dst |
Output |
Destination operand of the LocalTensor type. The sequence of consecutive data is determined by the TPosition of the destination operand. The restrictions are as follows:
|
src |
Input |
Source operand of the LocalTensor or GlobalTensor type. The data type must be the same as that of dst. |
loadDataParams |
Input |
LoadData parameter structure.
For details about the preceding structure parameters, see ${INSTALL_DIR}/include/ascendc/basic_api/interface/kernel_struct_mm.h. Replace ${INSTALL_DIR} with the actual CANN component directory. |
Field |
Meaning |
|---|---|
startIndex |
Fractal matrix ID, indicating the fractal number of the source operand from which the load starts (0 indicates the first fractal matrix of the source operand). Value range: startIndex ∈ [0, 65535]. Unit: 512 bytes. The default value is 0. |
repeatTimes |
Number of iterations. 512-byte data can be processed in each iteration. Value range: repeatTimes ∈ [1, 255]. |
srcStride |
In adjacent iterations, the interval between the start addresses of fractals of the source operand (unit: 512 bytes). Value range: srcStride ∈ [0, 65535]. The default value is 0. |
sid |
Reserved parameter. Set it to 0. |
dstGap |
In adjacent iterations, the interval between the end address of a fractal and the start address of its next fractal of the destination operand (unit: 512 bytes). Value range: dstGap ∈ [0, 65535]. The default value is 0. Note: This parameter is disabled for the |
ifTranspose |
Transposition enable for each fractal matrix. The default value is false.
Note: The transpose function can be enabled only for the A1->A2 and B1->B2 data channels. When the transpose function is enabled, the source and destination operands support only the uint16_t, int16_t, and half types. |
addrMode |
This parameter is reserved. Set it to 0. |
Restrictions
- For details about the operand address alignment requirements, see General Address Alignment Restrictions.
Returns
None
Examples
The calling example supports the
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | #include "kernel_operator.h" class KernelLoadData { public: __aicore__ inline KernelLoadData() { coutBlocks = (Cout + 16 - 1) / 16; ho = (H + padTop + padBottom - dilationH * (Kh - 1) - 1) / strideH + 1; wo = (W + padLeft + padRight - dilationW * (Kw - 1) - 1) / strideW + 1; howo = ho * wo; howoRound = ((howo + 16 - 1) / 16) * 16; featureMapA1Size = C1 * H * W * C0; // shape: [C1, H, W, C0] weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0] featureMapA2Size = howoRound * (C1 * Kh * Kw * C0); weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16; m = howo; k = C1 * Kh * Kw * C0; n = Cout; dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16] dstCO1Size = coutBlocks * howoRound * 16; fmRepeat = featureMapA2Size / (16 * C0); weRepeat = weightB2Size / (16 * C0); } __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* dstGm) { fmGlobal.SetGlobalBuffer((__gm__ half*)fmGm); weGlobal.SetGlobalBuffer((__gm__ half*)weGm); dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(half)); pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(half)); pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(half)); pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(half)); pipe.InitBuffer(outQueue , 1, dstCO1Size * sizeof(float)); pipe.InitBuffer(outQueueUB, 1, dstSize * sizeof(half)); } __aicore__ inline void Process() { CopyIn(); Split(); Compute(); CopyUB(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<half> featureMapA1 = inQueueFmA1.AllocTensor<half>(); AscendC::LocalTensor<half> weightB1 = inQueueWeB1.AllocTensor<half>(); AscendC::DataCopy(featureMapA1, fmGlobal, { 1, static_cast<uint16_t>(featureMapA1Size * sizeof(half) / 32), 0, 0 }); AscendC::DataCopy(weightB1, weGlobal, { 1, static_cast<uint16_t>(weightA1Size * sizeof(half) / 32), 0, 0 }); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); } __aicore__ inline void Split() { AscendC::LocalTensor<half> featureMapA1 = inQueueFmA1.DeQue<half>(); AscendC::LocalTensor<half> weightB1 = inQueueWeB1.DeQue<half>(); AscendC::LocalTensor<half> featureMapA2 = inQueueFmA2.AllocTensor<half>(); AscendC::LocalTensor<half> weightB2 = inQueueWeB2.AllocTensor<half>(); uint8_t padList[4] = {padLeft, padRight, padTop, padBottom}; AscendC::LoadData(featureMapA2, featureMapA1, { padList, H, W, 0, 0, 0, -1, -1, strideW, strideH, Kw, Kh, dilationW, dilationH, 1, 0, fmRepeat, 0, (half)(0)}); AscendC::LoadData(weightB2, weightB1, { 0, weRepeat, 1, 0, 0, false, 0 }); inQueueFmA2.EnQue<half>(featureMapA2); inQueueWeB2.EnQue<half>(weightB2); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); } __aicore__ inline void Compute() { AscendC::LocalTensor<half> featureMapA2 = inQueueFmA2.DeQue<half>(); AscendC::LocalTensor<half> weightB2 = inQueueWeB2.DeQue<half>(); AscendC::LocalTensor<float> dstCO1 = outQueueCO1.AllocTensor<float>(); AscendC::Mmad(dstCO1, featureMapA2, weightB2, { m, n, k, 0, false, true }); outQueueCO1.EnQue<float>(dstCO1); inQueueFmA2.FreeTensor(featureMapA2); inQueueWeB2.FreeTensor(weightB2); } __aicore__ inline void CopyUB() { AscendC::LocalTensor<float> dstCO1 = outQueueCO1.DeQue<float>(); AscendC::LocalTensor<half> dstUB = outQueueUB.AllocTensor<half>(); AscendC::DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = m * n * sizeof(float) / 1024; AscendC::DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX; AscendC::DataCopy(dstUB, dstCO1, dataCopyParams, enhancedParams); outQueueUB.EnQue<half>(dstUB); outQueueCO1.FreeTensor(dstCO1); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<half> dstUB = outQueueUB.DeQue<half>(); AscendC::DataCopy(dstGlobal, dstUB, m * n); outQueueUB.FreeTensor(dstUB); } private: AscendC::TPipe pipe; // feature map queue AscendC::TQue<AscendC::TPosition::A1, 1> inQueueFmA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueFmA2; // weight queue AscendC::TQue<AscendC::TPosition::B1, 1> inQueueWeB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueWeB2; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::TQue<AscendC::TPosition::CO2, 1> outQueueUB; AscendC::GlobalTensor<half> fmGlobal, weGlobal, dstGlobal; uint16_t C1 = 2; uint16_t H = 4, W = 4; uint8_t Kh = 2, Kw = 2; uint16_t Cout = 16; uint16_t C0 = 16; uint8_t dilationH = 2, dilationW = 2; uint8_t padTop = 1, padBottom = 1, padLeft = 1, padRight = 1; uint8_t strideH = 1, strideW = 1; uint16_t coutBlocks, ho, wo, howo, howoRound; uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, dstSize, dstCO1Size; uint16_t m, k, n; uint8_t fmRepeat, weRepeat; }; extern "C" __global__ __aicore__ void load_data_simple_kernel(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* dstGm) { KernelLoadData op; op.Init(fmGm, weGm, dstGm); op.Process(); } |