SetFmatrix
Supported Products
|
Product |
Supported/Unsupported |
|---|---|
|
|
x |
|
|
x |
|
|
√ |
|
|
√ |
|
|
x |
|
|
x |
Function Usage
Sets the attribute description of the feature map when Load3Dv1/Load3Dv2 is called. If the template parameter isSetFMatrix of Load3Dv1 or Load3Dv2 is set to false, the description of the feature map attributes (including l1H, l1W, and padList. For details, see Table 4 Parameters in the LoadData3DParamsV1 structure and Table 5 Parameters in the LoadData3DParamsV2 structure) transferred by Load3Dv1 or Load3Dv2 does not take effect. Developers need to use this API to set the parameters.
Prototype
1
|
__aicore__ inline void SetFmatrix(uint16_t l1H, uint16_t l1W, const uint8_t padList[4], const FmatrixMode& fmatrixMode) |
Parameters
|
Parameter |
Input/Output |
Meaning |
||
|---|---|---|---|---|
|
l1H |
Input |
Height of the source operand. Value range: l1H ∈ [1, 32767]. |
||
|
l1W |
Input |
Width of the source operand. Value range: l1W ∈ [1, 32767]. |
||
|
padList |
Input |
Padding list [padding_left, padding_right, padding_top, padding_bottom]. The value range of each element is [0,255]. The default value is {0, 0, 0, 0}. |
||
|
fmatrixMode |
Input |
Whether the LoadData instruction obtains information from the left or right register. The FmatrixMode type is defined as follows. Currently, only FMATRIX_LEFT is supported. Both the left and right matrices use this configuration.
|
Restrictions
- This API must be used together with Load3Dv1 or Load3Dv2 and must be called before the Load3Dv1 or Load3Dv2.
- For details about the operand address alignment requirements, see General Address Alignment Restrictions.
Example
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
#include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T> class KernelLoad3d { public: __aicore__ inline KernelLoad3d() { // ceiling of 16 C0 = 32 / sizeof(fmap_T); C1 = channelSize / C0; coutBlocks = (Cout + 16 - 1) / 16; ho = H - dilationH * (Kh - 1); wo = W - dilationW * (Kw - 1); howo = ho * wo; howoRound = ((howo + 16 - 1) / 16) * 16; featureMapA1Size = C1 * H * W * C0; // shape: [C1, H, W, C0] weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0] featureMapA2Size = howoRound * (C1 * Kh * Kw * C0); weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16; m = howo; k = C1 * Kh * Kw * C0; n = Cout; biasSize = Cout; // shape: [Cout] dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16] dstCO1Size = coutBlocks * howoRound * 16; fmRepeat = featureMapA2Size / (16 * C0); weRepeat = weightB2Size / (16 * C0); } __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* biasGm, __gm__ uint8_t* dstGm) { fmGlobal.SetGlobalBuffer((__gm__ fmap_T*)fmGm); weGlobal.SetGlobalBuffer((__gm__ weight_T*)weGm); biasGlobal.SetGlobalBuffer((__gm__ dstCO1_T*)biasGm); dstGlobal.SetGlobalBuffer((__gm__ dst_T*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(weight_T)); pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(weight_T)); pipe.InitBuffer(outQueueCO1, 1, dstCO1Size * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); Split(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.AllocTensor<weight_T>(); AscendC::DataCopy(featureMapA1, fmGlobal, { 1, static_cast<uint16_t>(featureMapA1Size * sizeof(fmap_T) / 32), 0, 0 }); AscendC::DataCopy(weightB1, weGlobal, { 1, static_cast<uint16_t>(weightA1Size * sizeof(weight_T) / 32), 0, 0 }); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); } __aicore__ inline void Split() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.DeQue<weight_T>(); AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.AllocTensor<weight_T>(); uint8_t padList[PAD_SIZE] = {0, 0, 0, 0}; AscendC::SetFmatrix(H, W, padList, FmatrixMode::FMATRIX_LEFT); AscendC::SetLoadDataPaddingValue(0); AscendC::SetLoadDataRepeat({0, 1, 0}); AscendC::SetLoadDataBoundary((uint32_t)0); static constexpr AscendC::IsResetLoad3dConfig LOAD3D_CONFIG = {false,false}; AscendC::LoadData<fmap_T, LOAD3D_CONFIG>(featureMapA2, featureMapA1, { padList, H, W, channelSize, k, howoRound, 0, 0, 1, 1, Kw, Kh, dilationW, dilationH, false, false, 0 }); AscendC::LoadData(weightB2, weightB1, { 0, weRepeat, 1, 0, 0, false, 0 }); inQueueFmA2.EnQue<fmap_T>(featureMapA2); inQueueWeB2.EnQue<weight_T>(weightB2); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); } __aicore__ inline void Compute() { AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.DeQue<weight_T>(); AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.AllocTensor<dstCO1_T>(); AscendC::Mmad(dstCO1, featureMapA2, weightB2, { m, n, k, true, 0, false, false, false }); outQueueCO1.EnQue<dstCO1_T>(dstCO1); inQueueFmA2.FreeTensor(featureMapA2); inQueueWeB2.FreeTensor(weightB2); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.DeQue<dstCO1_T>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = coutBlocks * 16; fixpipeParams.mSize = howo; fixpipeParams.srcStride = howo; fixpipeParams.dstStride = howo * AscendC::BLOCK_CUBE * sizeof(dst_T) / AscendC::ONE_BLK_SIZE; fixpipeParams.quantPre = deqMode; AscendC::Fixpipe<dst_T, dstCO1_T, AscendC::CFG_NZ>(dstGlobal, dstCO1, fixpipeParams); outQueueCO1.FreeTensor(dstCO1); } private: AscendC::TPipe pipe; // feature map queue AscendC::TQue<AscendC::TPosition::A1, 1> inQueueFmA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueFmA2; // weight queue AscendC::TQue<AscendC::TPosition::B1, 1> inQueueWeB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueWeB2; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<fmap_T> fmGlobal; AscendC::GlobalTensor<weight_T> weGlobal; AscendC::GlobalTensor<dst_T> dstGlobal; AscendC::GlobalTensor<dstCO1_T> biasGlobal; uint16_t channelSize = 32; uint16_t H = 4, W = 4; uint8_t Kh = 2, Kw = 2; uint16_t Cout = 16; uint16_t C0, C1; uint8_t dilationH = 2, dilationW = 2; uint16_t coutBlocks, ho, wo, howo, howoRound; uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, biasSize, dstSize, dstCO1Size; uint16_t m, k, n; uint8_t fmRepeat, weRepeat; AscendC::QuantMode_t deqMode = AscendC::QuantMode_t::F322F16; }; extern "C" __global__ __aicore__ void load3d_simple_kernel(__gm__ uint8_t *fmGm, __gm__ uint8_t *weGm, __gm__ uint8_t *biasGm, __gm__ uint8_t *dstGm) { KernelLoad3d<dst_type, fmap_type, weight_type, dstCO1_type> op; op.Init(fmGm, weGm, biasGm, dstGm); op.Process(); } |