Efficient Matrix Multiplication Accumulation by Using L0C Buffer
[Priority] High
[Description] When an operator performs matrix multiplication accumulation (for example, accumulation of matrix A1 x B1 + A2 x B2...), the previous matrix multiplication result can be temporarily stored on CO1 (L0C) and the Mmad API can be called to implement matrix multiplication accumulation. This reduces the number of data transfers and improves memory utilization, compared with the process in which each matrix multiplication result is transferred from CO1 to the GM and then to the UB for accumulative computation.
[Negative Example]
Before optimization, the operator performs matrix multiplication accumulation twice through the following process:
- Transfer the previous matrix multiplication result from CO1 to the workspace, and then transfer the result from the workspace to the UB.
- Repeat the preceding step to transfer the result to the UB in the next matrix multiplication computation.
- Add the results of the two matrix multiplications on the UB.
When n matrix multiplications need to be accumulated, CO1-to-workspace and workspace-to-UB transfers and Add operations are increased by n times respectively.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
... // This following is only an example. It is not complete code, with some synchronization control code omitted. public: __aicore__ inline KernelSample() { aSize = m * k; bSize = k * n; cSize = m * n; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { aGM.SetGlobalBuffer((__gm__ half *)a); bGM.SetGlobalBuffer((__gm__ half *)b); cGM.SetGlobalBuffer((__gm__ float *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(float)); pipe.InitBuffer(inQueueSrc0, 1, cSize * sizeof(float)); pipe.InitBuffer(inQueueSrc1, 1, cSize * sizeof(float)); pipe.InitBuffer(outQueueDst, 1, cSize * sizeof(float)); } __aicore__ inline void Process() { // Perform the first matrix multiplication. CopyIn(); SplitA(); SplitB(); Compute(); // Transfer out the result of the first matrix multiplication. CopyOut(); // Transfer the result of the first matrix multiplication to the UB. CopyIn1(); // Perform the second matrix multiplication. Compute1(); // Transfer out the result of the second matrix multiplication. CopyOut1(); // Transfer the result of the second matrix multiplication to the UB. CopyIn1(); // Add the results of two matrix multiplications. Compute2(); CopyOut2(); } private: __aicore__ inline void CopyIn() { LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>(); LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>(); Nd2NzParams dataCopyA1Params; dataCopyA1Params.ndNum = 1; dataCopyA1Params.nValue = m; dataCopyA1Params.dValue = k; dataCopyA1Params.srcNdMatrixStride = 0; dataCopyA1Params.srcDValue = k; dataCopyA1Params.dstNzC0Stride = m; dataCopyA1Params.dstNzNStride = 1; dataCopyA1Params.dstNzMatrixStride = 0; DataCopy(a1Local, aGM, dataCopyA1Params); Nd2NzParams dataCopyB1Params; dataCopyB1Params.ndNum = 1; dataCopyB1Params.nValue = k; dataCopyB1Params.dValue = n; dataCopyB1Params.srcNdMatrixStride = 0; dataCopyB1Params.srcDValue = n; dataCopyB1Params.dstNzC0Stride = k; dataCopyB1Params.dstNzNStride = 1; dataCopyB1Params.dstNzMatrixStride = 0; DataCopy(b1Local, bGM, dataCopyB1Params); inQueueA1.EnQue<half>(a1Local); inQueueB1.EnQue<half>(b1Local); } __aicore__ inline void SplitA() { ... } __aicore__ inline void SplitB() { ... } __aicore__ inline void Compute() { LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>(); MmadParams mmadParams; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; // Perform matrix multiplication. Mmad(c1Local, a2Local, b2Local, mmadParams); outQueueCO1.EnQue<float>(c1Local); inQueueA2.EnQue<half>(a2Local); inQueueB2.EnQue<half>(b2Local); } __aicore__ inline void CopyOut() { LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace); xGm.SetGlobalBuffer((__gm__ float *)(usrWorkspace)); FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; // Transfer the matrix multiplication result from CO1 to the workspace. Fixpipe(xGm, c1Local, fixpipeParams); outQueueCO1.EnQue<float>(c1Local); } __aicore__ inline void CopyIn1() { LocalTensor<float> src0Local = inQueueSrc0.AllocTensor<float>(); // Transfer the matrix multiplication result from the workspace to the UB. DataCopy(src0Local, xGm, cSize); inQueueSrc0.EnQue<float>(src0Local); } __aicore__ inline void Compute1() { LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); MmadParams mmadParams; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; // Perform matrix multiplication. Mmad(c1Local, a2Local, b2Local, mmadParams); outQueueCO1.EnQue<float>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } __aicore__ inline void CopyOut1() { LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; // Transfer the matrix multiplication result from CO1 to the workspace. Fixpipe(xGm, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } __aicore__ inline void CopyIn2() { PipeBarrier<PIPE_ALL>(); LocalTensor<float> src1Local = inQueueSrc1.AllocTensor<float>(); // Transfer the matrix multiplication result from the workspace to the UB. DataCopy(src1Local, xGm, cSize); inQueueSrc1.EnQue<float>(src1Local); } __aicore__ inline void Compute2() { LocalTensor<float> src0Local = inQueueSrc0.DeQue<float>(); LocalTensor<float> src1Local = inQueueSrc1.DeQue<float>(); LocalTensor<float> dstLocal = outQueueDst.AllocTensor<float>(); // Add the results of two matrix multiplications. Add(dstLocal, src0Local, src1Local, cSize); outQueueDst.EnQue<float>(dstLocal); inQueueSrc0.FreeTensor(src0Local); inQueueSrc1.FreeTensor(src1Local); } __aicore__ inline void CopyOut2() { ... } private: TPipe pipe; TQue<TPosition::A1, 1> inQueueA1; TQue<TPosition::A2, 1> inQueueA2; TQue<TPosition::B1, 1> inQueueB1; TQue<TPosition::B2, 1> inQueueB2; TQue<TPosition::CO1, 1> outQueueCO1; TQue<TPosition::VECIN, 1> inQueueSrc0; TQue<TPosition::VECIN, 1> inQueueSrc1; TQue<TPosition::VECOUT, 1> outQueueDst; GlobalTensor<half> aGM; GlobalTensor<half> bGM; GlobalTensor<float> cGM; uint16_t m = 32, k = 32, n = 32; uint16_t aSize, bSize, cSize; ... |
[Positive Example]
After optimization, when the operator performs matrix computation accumulation, the previous matrix multiplication result can be temporarily stored on L0C. The Mmad API parameters cmatrixInitVal and cmatrixSource are used to configure the initialization value of matrix C, and the Mmad API is called twice to perform matrix multiplication accumulation twice.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
... // This following is only an example. It is not complete code, with some synchronization control code omitted. public: __aicore__ inline KernelSample() { aSize = m * k; bSize = k * n; cSize = m * n; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { aGM.SetGlobalBuffer((__gm__ half *)a); bGM.SetGlobalBuffer((__gm__ half *)b); cGM.SetGlobalBuffer((__gm__ float *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(float)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>(); LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>(); Nd2NzParams dataCopyA1Params; dataCopyA1Params.ndNum = 1; dataCopyA1Params.nValue = m; dataCopyA1Params.dValue = k; dataCopyA1Params.srcNdMatrixStride = 0; dataCopyA1Params.srcDValue = k; dataCopyA1Params.dstNzC0Stride = m; dataCopyA1Params.dstNzNStride = 1; dataCopyA1Params.dstNzMatrixStride = 0; DataCopy(a1Local, aGM, dataCopyA1Params); Nd2NzParams dataCopyB1Params; dataCopyB1Params.ndNum = 1; dataCopyB1Params.nValue = k; dataCopyB1Params.dValue = n; dataCopyB1Params.srcNdMatrixStride = 0; dataCopyB1Params.srcDValue = n; dataCopyB1Params.dstNzC0Stride = k; dataCopyB1Params.dstNzNStride = 1; dataCopyB1Params.dstNzMatrixStride = 0; DataCopy(b1Local, bGM, dataCopyB1Params); inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); } __aicore__ inline void SplitA() { ... } __aicore__ inline void SplitB() { ... } __aicore__ inline void Compute() { LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>(); MmadParams mmadParams; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; // Perform the first matrix multiplication. Mmad(c1Local, a2Local, b2Local, mmadParams); PipeBarrier<PIPE_M>(); // Add the results of the first and second matrix multiplications. mmadParams.cmatrixInitVal = false; Mmad(c1Local, a2Local, b2Local, c1Local, mmadParams); outQueueCO1.EnQue<float>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } __aicore__ inline void CopyOut() { LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; Fixpipe(cGM, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } private: TPipe pipe; TQue<TPosition::A1, 1> inQueueA1; TQue<TPosition::A2, 1> inQueueA2; TQue<TPosition::B1, 1> inQueueB1; TQue<TPosition::B2, 1> inQueueB2; TQue<TPosition::CO1, 1> outQueueCO1; GlobalTensor<half> aGM; GlobalTensor<half> bGM; GlobalTensor<dst_T> cGM; uint16_t m = 32, k = 32, n = 32; uint16_t aSize, bSize, cSize; |