Efficient Quantization by Storing Quantization Parameters in the FP Buffer
[Priority] High
[Description] When an operator performs quantization computation on the matrix multiplication result, the quantization parameters can be transferred to the C2PIPE2GM (Fixpipe Buffer) and the Fixpipe API can be called once to implement quantization computation of the matrix multiplication result. This reduces the number of data transfers and improves memory utilization, compared with the process in which the matrix multiplication result is transferred from CO1 (L0C) to the GM and then from the GM to the UB for quantization computation.
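This performance optimization method takes effect only for specific product models. (NOTE: the list of applicable products is missing from this copy of the document; restore it from the official source.)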


[Negative Example]
The quantization computation on the matrix multiplication result is performed as follows:
- Transfer the matrix multiplication result from CO1 to the workspace.
- Transfer the data from the workspace to the UB.
- Transfer the quantization parameters to the UB, and perform a series of quantization computations on the UB together with the matrix multiplication result.
- Transfer the final quantization result from the UB to the GM.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | ... // This following is only an example. It is not complete code, with some synchronization control code omitted. public: __aicore__ inline KernelSample() { aSize = m * k; bSize = k * n; cSize = m * n; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c, __gm__ uint8_t *deqTensor) { aGM.SetGlobalBuffer((__gm__ half *)a); bGM.SetGlobalBuffer((__gm__ half *)b); cGM.SetGlobalBuffer((__gm__ float *)c); deqGM.SetGlobalBuffer((__gm__ half *)deqTensor); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(float)); pipe.InitBuffer(inQueueSrc0, 1, cSize * sizeof(float)); pipe.InitBuffer(inQueueTmp, 1, cSize * sizeof(half)); pipe.InitBuffer(inQueueDeq, 1, cSize * sizeof(half)); pipe.InitBuffer(outQueueDst, 1, cSize * sizeof(int8_t)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); Compute(); CopyOut(); CopyIn1(); Compute1(); CopyOut1(); } private: __aicore__ inline void CopyIn() { LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>(); LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>(); LocalTensor<half> deqLocal = inQueueDeq.AllocTensor<half>(); Nd2NzParams dataCopyA1Params; dataCopyA1Params.ndNum = 1; dataCopyA1Params.nValue = m; dataCopyA1Params.dValue = k; dataCopyA1Params.srcNdMatrixStride = 
0; dataCopyA1Params.srcDValue = k; dataCopyA1Params.dstNzC0Stride = m; dataCopyA1Params.dstNzNStride = 1; dataCopyA1Params.dstNzMatrixStride = 0; DataCopy(a1Local, aGM, dataCopyA1Params); Nd2NzParams dataCopyB1Params; dataCopyB1Params.ndNum = 1; dataCopyB1Params.nValue = k; dataCopyB1Params.dValue = n; dataCopyB1Params.srcNdMatrixStride = 0; dataCopyB1Params.srcDValue = n; dataCopyB1Params.dstNzC0Stride = k; dataCopyB1Params.dstNzNStride = 1; dataCopyB1Params.dstNzMatrixStride = 0; DataCopy(b1Local, bGM, dataCopyB1Params); // Transfer quantization parameters to the UB. DataCopy(deqLocal, deqGM, cSize); inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); inQueueDeq.EnQue(deqLocal); } __aicore__ inline void SplitA() { ... } __aicore__ inline void SplitB() { ... } __aicore__ inline void Compute() { LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>(); MmadParams mmadParams; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; // Perform matrix multiplication. Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n outQueueCO1.EnQue<float>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } __aicore__ inline void CopyOut() { LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace); xGm.SetGlobalBuffer((__gm__ float *)(usrWorkspace)); FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; // Transfer the matrix multiplication result from CO1 to the workspace. Fixpipe(xGm, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } __aicore__ inline void CopyIn1() { // Transfer the matrix multiplication result from the workspace to the UB. 
LocalTensor<float> src0Local = inQueueSrc0.AllocTensor<float>(); DataCopy(src0Local, xGm, cSize); inQueueSrc0.EnQue(src0Local); } __aicore__ inline void Compute1() { LocalTensor<float> src0Local = inQueueSrc0.DeQue<float>(); LocalTensor<half> tmpLocal = inQueueTmp.AllocTensor<half>(); LocalTensor<half> deqLocal = inQueueDeq.DeQue<half>(); LocalTensor<int8_t> dstLocal = outQueueDst.AllocTensor<int8_t>(); // Perform quantization computation. Cast(tmpLocal, src0Local, RoundMode::CAST_NONE, cSize); LocalTensor<half> tmpHalfBuffer = src0Local.ReinterpretCast<half>(); Mul(tmpHalfBuffer, tmpLocal, deqLocal, cSize); Cast(dstLocal, tmpHalfBuffer, RoundMode::CAST_NONE, cSize); outQueueDst.EnQue<int8_t>(dstLocal); inQueueSrc0.FreeTensor(src0Local); inQueueTmp.FreeTensor(tmpLocal); inQueueDeq.FreeTensor(deqLocal); } __aicore__ inline void CopyOut1() { ... } private: TPipe pipe; TQue<TPosition::A1, 1> inQueueA1; TQue<TPosition::A2, 1> inQueueA2; TQue<TPosition::B1, 1> inQueueB1; TQue<TPosition::B2, 1> inQueueB2; TQue<TPosition::CO1, 1> outQueueCO1; TQue<TPosition::VECIN, 1> inQueueDeq; TQue<TPosition::VECIN, 1> inQueueSrc0; TQue<TPosition::VECCALC, 1> inQueueTmp; TQue<TPosition::VECOUT, 1> outQueueDst; GlobalTensor<half> aGM; GlobalTensor<half> bGM; GlobalTensor<float> cGM; GlobalTensor<float> biasGM; uint16_t m = 32, k = 32, n = 32; uint16_t aSize, bSize, cSize; ... |
[Positive Example]
When this operator performs quantization computation on the matrix multiplication result, the quantization parameters can be transferred to the Fixpipe Buffer (FB) and the Fixpipe API can be called once to implement quantization computation on the matrix multiplication result.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | ... public: __aicore__ inline KernelSample() { aSize = m * k; bSize = k * n; cSize = m * n; QuantSize = n; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c, __gm__ uint8_t *deqTensor) { aGM.SetGlobalBuffer((__gm__ half *)a); bGM.SetGlobalBuffer((__gm__ half *)b); cGM.SetGlobalBuffer((__gm__ float *)c); deqGM.SetGlobalBuffer((__gm__ uint64_t *)deqTensor); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(float)); pipe.InitBuffer(inQueueDeq1, 1, QuantSize * sizeof(uint64_t)); pipe.InitBuffer(inQueueDeq, 1, QuantSize * sizeof(uint64_t)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); SplitDeq(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>(); LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>(); LocalTensor<uint64_t> deq1Local = inQueueDeq1.AllocTensor<uint64_t>(); Nd2NzParams dataCopyA1Params; dataCopyA1Params.ndNum = 1; dataCopyA1Params.nValue = m; dataCopyA1Params.dValue = k; dataCopyA1Params.srcNdMatrixStride = 0; dataCopyA1Params.srcDValue = k; dataCopyA1Params.dstNzC0Stride = m; dataCopyA1Params.dstNzNStride = 1; dataCopyA1Params.dstNzMatrixStride = 0; DataCopy(a1Local, aGM, dataCopyA1Params); Nd2NzParams dataCopyB1Params; dataCopyB1Params.ndNum = 1; dataCopyB1Params.nValue = k; dataCopyB1Params.dValue = 
n; dataCopyB1Params.srcNdMatrixStride = 0; dataCopyB1Params.srcDValue = n; dataCopyB1Params.dstNzC0Stride = k; dataCopyB1Params.dstNzNStride = 1; dataCopyB1Params.dstNzMatrixStride = 0; DataCopy(b1Local, bGM, dataCopyB1Params); // Transfer the quantization parameters to L1. DataCopy(deq1Local, deqGM, QuantSize); inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); inQueueDeq.EnQue(deq1Local); } __aicore__ inline void SplitA() { ... } __aicore__ inline void SplitB() { ... } __aicore__ inline void SplitDeq() { LocalTensor<uint64_t> deq1Local = inQueueDeq1.DeQue<uint64_t>(); LocalTensor<uint64_t> deqLocal = inQueueDeq.AllocTensor<uint64_t>(); // Transfer the quantization parameters from L1 to the FB. DataCopy(deqLocal, deq1Local, { 1, (uint16_t)(QuantSize * sizeof(uint64_t) / 128), 0, 0 }); inQueueDeq.EnQue<uint64_t>(deqLocal); inQueueDeq1.FreeTensor(deq1Local); } __aicore__ inline void Compute() { LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>(); MmadParams mmadParams; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; // Perform matrix multiplication. Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n outQueueCO1.EnQue<float>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } __aicore__ inline void CopyOut() { LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); LocalTensor<uint64_t> deqLocal = inQueueDeq.DeQue<uint64_t>(); SetFixpipeNz2ndFlag(1, 0, 0); DataCopyCO12DstParams dataCopyParams; dataCopyParams.nSize = n; dataCopyParams.mSize = m; dataCopyParams.srcStride = m; dataCopyParams.dstStride = n; dataCopyParams.quantPre = QuantMode_t::VQF322B8_PRE; dataCopyParams.nz2ndEn = true; // Transfer out the computation result obtained after matrix multiplication is quantized. 
DataCopy(cGM, c1Local, DataCopyCO12DstParams); outQueueCO1.FreeTensor(c1Local); } private: TPipe pipe; TQue<QuePosition::A1, 1> inQueueA1; TQue<QuePosition::A2, 1> inQueueA2; TQue<QuePosition::B1, 1> inQueueB1; TQue<QuePosition::B2, 1> inQueueB2; TQue<QuePosition::C1, 1> inQueueDeq1; TQue<QuePosition::C2PIPE2GM, 1> inQueueDeq; TQue<QuePosition::CO1, 1> outQueueCO1; GlobalTensor<half> aGM; GlobalTensor<half> bGM; GlobalTensor<float> cGM; GlobalTensor<uint64_t> deqTensorGM; uint16_t m = 32, k = 32, n = 32; uint16_t aSize, bSize, cSize, QuantSize; ... |