实现样例
Ascend C Matmul算子实现文件:matmul_custom.cpp
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
*
* Function : c = a * b (matrix multiplication)
* This sample is a very basic sample that implements Matmul on Ascend plaform.
* In this sample:
* Shape of matrix a is [m, k]: [32, 32]
* Shape of matrix b is [k, n]: [32, 32]
* Shape of matrix c is [m, n]: [32, 32]
*/
#include "kernel_operator.h"
using namespace AscendC;
class KernelMatmul {
public:
__aicore__ inline KernelMatmul()
{
aSize = m * k;
bSize = k * n;
cSize = m * m;
mBlocks = m / 16;
nBlocks = n / 16;
kBlocks = k / 16;
}
__aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR c)
{
aGM.SetGlobalBuffer((__gm__ half*)a);
bGM.SetGlobalBuffer((__gm__ half*)b);
cGM.SetGlobalBuffer((__gm__ float*)c);
pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half));
pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half));
pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half));
pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half) / 2);
pipe.InitBuffer(outQueueCO1, 2, cSize * sizeof(float) / 2);
pipe.InitBuffer(outQueueCO2, 1, cSize * sizeof(float));
}
__aicore__ inline void Process()
{
CopyIn();
SplitA();
LocalTensor<half> b1Local = inQueueB1.DeQue<half>();
LocalTensor<half> a2Local = inQueueA2.DeQue<half>();
LocalTensor<float> c2Local = outQueueCO2.AllocTensor<float>();
// split matrix b into 2 parts, [32, 16] and [32, 16]
for (int i = 0; i < 2; ++i) {
SplitB(b1Local, i);
Compute(a2Local);
Aggregate(c2Local, i);
}
inQueueB1.FreeTensor(b1Local);
inQueueA2.FreeTensor(a2Local);
outQueueCO2.EnQue<float>(c2Local);
CopyOut();
}
private:
__aicore__ inline void CopyND2NZ(const LocalTensor<half>& dst, const GlobalTensor<half>& src, const uint16_t height,
const uint16_t width)
{
for (int i = 0; i < width / 16; ++i) {
int srcOffset = i * 16;
int dstOffset = i * 16 * height;
DataCopy(dst[dstOffset], src[srcOffset], { height, 1, uint16_t(width / 16 - 1), 0 });
}
}
__aicore__ inline void CopyIn()
{
LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>();
LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>();
CopyND2NZ(a1Local, aGM, m, k);
CopyND2NZ(b1Local, bGM, k, n);
inQueueA1.EnQue(a1Local);
inQueueB1.EnQue(b1Local);
}
__aicore__ inline void SplitA()
{
int srcOffset = 0;
int dstOffset = 0;
LocalTensor<half> a1Local = inQueueA1.DeQue<half>();
LocalTensor<half> a2Local = inQueueA2.AllocTensor<half>();
// transform nz to zz
for (int i = 0; i < mBlocks; ++i) {
LoadData2dParams loadDataParams;
loadDataParams.repeatTimes = kBlocks;
loadDataParams.srcStride = mBlocks;
loadDataParams.ifTranspose = false;
LoadData(a2Local[dstOffset], a1Local[srcOffset], loadDataParams);
srcOffset += 16 * 16;
dstOffset += k * 16;
}
inQueueA2.EnQue<half>(a2Local);
inQueueA1.FreeTensor(a1Local);
}
__aicore__ inline void SplitB(const LocalTensor<half>& b1Local, const int bSplitIdx)
{
LocalTensor<half> b2Local = inQueueB2.AllocTensor<half>();
// transform nz to zn
LoadData2dParams loadDataParams;
loadDataParams.repeatTimes = kBlocks;
loadDataParams.srcStride = 1;
loadDataParams.ifTranspose = true;
LoadData(b2Local, b1Local[bSplitIdx * bSize / 2], loadDataParams);
inQueueB2.EnQue<half>(b2Local);
}
__aicore__ inline void Compute(const LocalTensor<half>& a2Local)
{
LocalTensor<half> b2Local = inQueueB2.DeQue<half>();
LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>();
Mmad(c1Local, a2Local, b2Local, { m, uint16_t(n / 2), k, false, 0, false, false, false });
outQueueCO1.EnQue<float>(c1Local);
inQueueB2.FreeTensor(b2Local);
}
__aicore__ inline void Aggregate(const LocalTensor<float>& c2Local, const int bSplitIdx)
{
LocalTensor<float> c1Local = outQueueCO1.DeQue<float>();
DataCopyParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = 2;
DataCopyEnhancedParams enhancedParams;
enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX;
DataCopy(c2Local[bSplitIdx * cSize / 2], c1Local, dataCopyParams, enhancedParams);
outQueueCO1.FreeTensor(c1Local);
}
__aicore__ inline void CopyOut()
{
LocalTensor<float> c2Local = outQueueCO2.DeQue<float>();
// transform nz to nd
for (int i = 0; i < nBlocks; ++i) {
DataCopy(cGM[i * 16], c2Local[i * m * 16], { m, 2, 0, uint16_t((nBlocks - 1) * 2) });
}
outQueueCO2.FreeTensor(c2Local);
}
private:
TPipe pipe;
TQue<QuePosition::A1, 1> inQueueA1;
TQue<QuePosition::A2, 1> inQueueA2;
TQue<QuePosition::B1, 1> inQueueB1;
TQue<QuePosition::B2, 2> inQueueB2;
// dst queue
TQue<QuePosition::CO1, 2> outQueueCO1;
TQue<QuePosition::CO2, 1> outQueueCO2;
GlobalTensor<half> aGM, bGM;
GlobalTensor<float> cGM;
uint16_t m = 32;
uint16_t n = 32;
uint16_t k = 32;
uint16_t aSize, bSize, cSize, mBlocks, nBlocks, kBlocks;
};
extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c)
{
KernelMatmul op;
op.Init(a, b, c);
op.Process();
}
#ifndef __CCE_KT_TEST__
// call of kernel function
void matmul_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* a, uint8_t* b, uint8_t* c)
{
matmul_custom<<<blockDim, l2ctrl, stream>>>(a, b, c);
}
#endif
父主题: 矩阵编程