Computes the 2-D convolution of a given input tensor and weight tensor and produces the result tensor. Conv2d convolution layers are commonly used in image recognition, where filters are applied to extract features from the image.
```cpp
template <typename dst_T, typename src_T>
__aicore__ inline void Conv2D(const LocalTensor<dst_T>& dstLocal, const LocalTensor<dst_T>& bias,
                              const LocalTensor<src_T>& featureMap, const LocalTensor<src_T>& weight,
                              Conv2dParams& conv2dParams, Conv2dTilling& tilling);

template <typename dst_T, typename src_T>
__aicore__ inline void Conv2D(const LocalTensor<dst_T>& dstLocal, const LocalTensor<src_T>& featureMap,
                              const LocalTensor<src_T>& weight, Conv2dParams& conv2dParams,
                              Conv2dTilling& tilling);

template <typename T>
__aicore__ inline Conv2dTilling GetConv2dTiling(Conv2dParams& conv2dParams);
```
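A minimal call-order sketch for these prototypes (the tensor names are placeholders; it assumes the feature map and weight have already been staged in A1/B1, the bias in A1, and a CO1 destination tensor has been allocated):

```cpp
// Sketch only: conv2dParams is assumed to be fully populated, and featureMapA1,
// weightB1, biasA1 and dstCO1 are assumed to be LocalTensors in the required QuePositions.
Conv2dTilling tilling = GetConv2dTiling<half>(conv2dParams); // derive tiling from the shape parameters
tilling.loopMode = LoopMode::ModeNm;                         // optionally override the traversal mode
Conv2D(dstCO1, biasA1, featureMapA1, weightB1, conv2dParams, tilling); // overload with bias
```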
| Parameter | Input/Output | Description |
| --- | --- | --- |
| dstLocal | Output | Destination operand. Atlas training series products: supported QuePosition is CO1 or CO2. Atlas inference series products AI Core: supported QuePosition is CO1 or CO2. The valid result tensor has the format [Cout/16, Ho, Wo, 16] and size Cout * Ho * Wo; Ho and Wo can be computed from the other parameters: Ho = floor((H + pad_top + pad_bottom - dilation_h * (Kh - 1) - 1) / stride_h + 1), Wo = floor((W + pad_left + pad_right - dilation_w * (Kw - 1) - 1) / stride_w + 1). Because the hardware requires Ho * Wo to be a multiple of 16, the shape must be rounded up to a multiple of 16 when the dst tensor is allocated; the actual allocated size is Cout * round_howo, where round_howo = ceil(Ho * Wo / 16) * 16 (a shape-computation sketch follows the struct definitions below). |
| bias | Input | Convolution bias. When bias is enabled, bias is the starting element of the bias operand. Supported data types: Tensor(int32_t, float); the data type must match that of dstLocal. The shape is [Cout,], where Cout is the number of convolution kernels. The Tensor's QuePosition is A1 or B1. |
| featureMap | Input | Input tensor; the Tensor's QuePosition is A1. The shape of the input tensor feature_map has the format [C1, H, W, C0], where C1 * C0 is the number of input channels. H is the height, range: [1, 4096]. W is the width, range: [1, 4096]. |
| weight | Input | Convolution kernel (weight) tensor; the Tensor's QuePosition is B1. The shape of the weight tensor has the format [C1, Kh, Kw, Cout, C0], where C1 * C0 is the number of input channels. Cout is the number of convolution kernels, range: [16, 4096]; Cout must be a multiple of 16. Kh is the kernel height, range: [1, 255]. Kw is the kernel width, range: [1, 255]. |
| conv2dParams | Input | Shape and state parameters of the input matrices, of type Conv2dParams. See the struct definition below. |
| tilling | Input | Tiling control parameters, of type Conv2dTilling. See the struct definition below. |

The Conv2dParams structure is defined as:

```cpp
struct Conv2dParams {
    uint32_t imgShape[kConv2dImgSize];       // [H, W]
    uint32_t kernelShape[kConv2dkernelSize]; // [Kh, Kw]
    uint32_t stride[kConv2dStride];          // [stride_h, stride_w]
    uint32_t cin;                            // cin = C0 * C1;
    uint32_t cout;
    uint32_t padList[kConv2dPad];            // [pad_left, pad_right, pad_top, pad_bottom]
    uint32_t dilation[kConv2dDilation];      // [dilation_h, dilation_w]
    uint32_t initY;
    uint32_t partialSum;
};
```

The Conv2dTilling structure is defined as:

```cpp
struct Conv2dTilling {
    const uint32_t blockSize = 16; // M block size is always 16
    LoopMode loopMode = LoopMode::ModeNm;
    uint32_t c0Size = 32;
    uint32_t dTypeSize = 1;
    uint32_t strideH = 0;
    uint32_t strideW = 0;
    uint32_t dilationH = 0;
    uint32_t dilationW = 0;
    uint32_t hi = 0;
    uint32_t wi = 0;
    uint32_t ho = 0;
    uint32_t wo = 0;
    uint32_t height = 0;
    uint32_t width = 0;
    uint32_t howo = 0;
    uint32_t mNum = 0;
    uint32_t nNum = 0;
    uint32_t kNum = 0;
    uint32_t mBlockNum = 0;
    uint32_t kBlockNum = 0;
    uint32_t nBlockNum = 0;
    uint32_t roundM = 0;
    uint32_t roundN = 0;
    uint32_t roundK = 0;
    uint32_t mTileBlock = 0;
    uint32_t nTileBlock = 0;
    uint32_t kTileBlock = 0;
    uint32_t mIterNum = 0;
    uint32_t nIterNum = 0;
    uint32_t kIterNum = 0;
    uint32_t mTileNums = 0;
    bool mHasTail = false;
    bool nHasTail = false;
    bool kHasTail = false;
    uint32_t kTailBlock = 0;
    uint32_t mTailBlock = 0;
    uint32_t nTailBlock = 0;
    uint32_t mTailNums = 0;
};
```
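A minimal host-side sketch of the output-shape arithmetic described for dstLocal above (the helper ComputeConv2dOutShape is hypothetical and not part of the API):

```cpp
#include <cstdint>

// Hypothetical helper: computes Ho, Wo and the 16-aligned round_howo from the
// formulas given for dstLocal above. Not part of the Ascend C API.
struct Conv2dOutShape { uint32_t ho, wo, roundHoWo; };

inline Conv2dOutShape ComputeConv2dOutShape(uint32_t h, uint32_t w, uint32_t kh, uint32_t kw,
                                            uint32_t strideH, uint32_t strideW,
                                            uint32_t padTop, uint32_t padBottom,
                                            uint32_t padLeft, uint32_t padRight,
                                            uint32_t dilationH, uint32_t dilationW)
{
    // integer division on non-negative values is the floor() in the formulas above
    uint32_t ho = (h + padTop + padBottom - dilationH * (kh - 1) - 1) / strideH + 1;
    uint32_t wo = (w + padLeft + padRight - dilationW * (kw - 1) - 1) / strideW + 1;
    uint32_t roundHoWo = (ho * wo + 15) / 16 * 16; // Ho * Wo rounded up to a multiple of 16
    return {ho, wo, roundHoWo};                    // dst allocation size is Cout * roundHoWo
}
```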
The members of the Conv2dParams structure are described below.

| Member | Type | Description |
| --- | --- | --- |
| imgShape | vector<int> | Shape of the input tensor feature_map, in the format [H, W]. |
| kernelShape | vector<int> | Shape of the weight tensor, in the format [Kh, Kw]. |
| stride | vector<int> | Convolution stride, in the format [stride_h, stride_w]. |
| cin | int | Fractal layout parameter. Cin = C1 * C0, where Cin is the number of input channels; C1 range: [1, 256]. |
| cout | int | Number of convolution kernels, range: [16, 4096]; Cout must be a multiple of 16. |
| padList | vector<int> | Number of padding rows/columns, in the format [pad_left, pad_right, pad_top, pad_bottom]. |
| dilation | vector<int> | Dilation parameters, in the format [dilation_h, dilation_w]. The dilated kernel width is dilation_w * (Kw - 1) + 1 and the dilated kernel height is dilation_h * (Kh - 1) + 1. |
| initY | uint32_t | Whether dstLocal needs to be initialized. |
| partialSum | uint32_t | When the QuePosition of dstLocal is CO2, controls whether the computation result is moved out. |
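As a concrete illustration, the following sketch populates a Conv2dParams instance with the sample values quoted in the comment of the kernel example at the end of this page (the values themselves are illustrative only):

```cpp
// Illustrative values only, taken from the sample kernel's comment below:
// 4 x 4 image, 2 x 2 kernel, stride 1, no padding, dilation 2, cin = 32, cout = 16.
Conv2dParams conv2dParams;
conv2dParams.imgShape[0] = 4;    conv2dParams.imgShape[1] = 4;     // [H, W]
conv2dParams.kernelShape[0] = 2; conv2dParams.kernelShape[1] = 2;  // [Kh, Kw]
conv2dParams.stride[0] = 1;      conv2dParams.stride[1] = 1;       // [stride_h, stride_w]
conv2dParams.cin = 32;                                             // cin = C1 * C0
conv2dParams.cout = 16;                                            // must be a multiple of 16
conv2dParams.padList[0] = 0; conv2dParams.padList[1] = 0;          // [pad_left, pad_right,
conv2dParams.padList[2] = 0; conv2dParams.padList[3] = 0;          //  pad_top, pad_bottom]
conv2dParams.dilation[0] = 2;    conv2dParams.dilation[1] = 2;     // [dilation_h, dilation_w]
conv2dParams.initY = 0;
conv2dParams.partialSum = 0;                                       // false
Conv2dTilling tilling = GetConv2dTiling<half>(conv2dParams);       // derive the tiling for half inputs
```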
The members of the Conv2dTilling structure are described below.

| Member | Type | Description |
| --- | --- | --- |
| blockSize | uint32_t | Fixed value, always 16; the number of elements held in one dimension of a block. |
| loopMode | LoopMode | Traversal mode. The enum is defined as: `enum class LoopMode { ModeNm = 0, ModeMn = 1, ModeKm = 2, ModeKn = 3 };` |
| c0Size | uint32_t | Byte length of one block, either 16 or 32. |
| dTypeSize | uint32_t | Byte length of the input data type, range: [1, 2]. |
| strideH | uint32_t | Convolution stride along the height, range: [1, 63]. |
| strideW | uint32_t | Convolution stride along the width, range: [1, 63]. |
| dilationH | uint32_t | Dilation parameter along the height, range: [1, 255]. |
| dilationW | uint32_t | Dilation parameter along the width, range: [1, 255]. |
| hi | uint32_t | feature_map height, range: [1, 4096]. |
| wi | uint32_t | feature_map width, range: [1, 4096]. |
| ho | uint32_t | Output height, range: [1, 4096]. |
| wo | uint32_t | Output width, range: [1, 4096]. |
| height | uint32_t | weight (kernel) height, range: [1, 255]. |
| width | uint32_t | weight (kernel) width, range: [1, 255]. |
| howo | uint32_t | Output spatial size, equal to ho * wo. |
| mNum | uint32_t | Equivalent data length along the M axis, range: [1, 4096]. |
| nNum | uint32_t | Equivalent data length along the N axis, range: [1, 4096]. |
| kNum | uint32_t | Equivalent data length along the K axis, range: [1, 4096]. |
| roundM | uint32_t | Equivalent M-axis data length rounded up to a multiple of blockSize, range: [1, 4096]. |
| roundN | uint32_t | Equivalent N-axis data length rounded up to a multiple of blockSize, range: [1, 4096]. |
| roundK | uint32_t | Equivalent K-axis data length rounded up to a multiple of c0Size, range: [1, 4096]. |
| mBlockNum | uint32_t | Number of blocks along the M axis, mBlockNum = mNum / blockSize, range: [1, 4096]. |
| nBlockNum | uint32_t | Number of blocks along the N axis, nBlockNum = nNum / blockSize, range: [1, 4096]. |
| kBlockNum | uint32_t | Number of blocks along the K axis, kBlockNum = kNum / blockSize, range: [1, 4096]. |
| mIterNum | uint32_t | Number of iterations over the M axis, range: [1, 4096]. |
| nIterNum | uint32_t | Number of iterations over the N axis, range: [1, 4096]. |
| kIterNum | uint32_t | Number of iterations over the K axis, range: [1, 4096]. |
| mTileBlock | uint32_t | Number of tile blocks along the M axis, range: [1, 4096]. |
| nTileBlock | uint32_t | Number of tile blocks along the N axis, range: [1, 4096]. |
| kTileBlock | uint32_t | Number of tile blocks along the K axis, range: [1, 4096]. |
| kTailBlock | uint32_t | Number of tail blocks along the K axis, range: [1, 4096]. |
| mTailBlock | uint32_t | Number of tail blocks along the M axis, range: [1, 4096]. |
| nTailBlock | uint32_t | Number of tail blocks along the N axis, range: [1, 4096]. |
| kHasTail | bool | Whether the K axis has a tail block. |
| mHasTail | bool | Whether the M axis has a tail block. |
| nHasTail | bool | Whether the N axis has a tail block. |
| mTileNums | uint32_t | Length (count) of M-axis tile blocks, range: [1, 4096]. |
| mTailNums | uint32_t | Length (count) of M-axis tail blocks, range: [1, 4096]. |
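The relationships among the rounded lengths, block counts, and tail flags can be illustrated with a small sketch. The M/N/K mapping used here (M = Ho * Wo, N = Cout, K = Cin * Kh * Kw) is an assumption based on the usual im2col formulation of convolution and is not stated by this API; treat the whole sketch as illustrative only.

```cpp
#include <cstdint>

// Illustrative sketch only: shows how the rounded lengths, block counts and a tail flag
// could relate to each other. The M/N/K mapping is an assumption, not an API guarantee.
struct TilingSketch {
    uint32_t blockSize = 16;   // fixed block size from the table above
    uint32_t c0Size = 16;      // 16 or 32 per the table above; 16 assumed here
    uint32_t mNum = 0, nNum = 0, kNum = 0;
    uint32_t roundM = 0, roundN = 0, roundK = 0;
    uint32_t mBlockNum = 0;
    bool mHasTail = false;
};

inline TilingSketch SketchTiling(uint32_t howo, uint32_t cout, uint32_t cin,
                                 uint32_t kh, uint32_t kw, uint32_t mTileBlock)
{
    TilingSketch t;
    t.mNum = howo;              // assumption: M axis corresponds to Ho * Wo
    t.nNum = cout;              // assumption: N axis corresponds to Cout
    t.kNum = cin * kh * kw;     // assumption: K axis corresponds to Cin * Kh * Kw
    t.roundM = (t.mNum + t.blockSize - 1) / t.blockSize * t.blockSize; // round up to blockSize
    t.roundN = (t.nNum + t.blockSize - 1) / t.blockSize * t.blockSize;
    t.roundK = (t.kNum + t.c0Size - 1) / t.c0Size * t.c0Size;          // round up to c0Size
    t.mBlockNum = t.mNum / t.blockSize;           // as stated in the table: mBlockNum = mNum / blockSize
    t.mHasTail = (t.mBlockNum % mTileBlock) != 0; // assumption: tail exists when tiles do not divide evenly
    return t;
}
```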
| feature_map.dtype | weight.dtype | dst.dtype |
| --- | --- | --- |
| int8_t | int8_t | int32_t |
| half | half | float |
| half | half | half |
This interface is supported on the following products:

- Atlas training series products
- Atlas inference series products AI Core
#include "kernel_operator.h" namespace AscendC { class KernelCubeConv2D { public: __aicore__ inline KernelCubeConv2D() {} __aicore__ inline void Init(__gm__ half* fmGm, __gm__ half* weGm, __gm__ float* biasGm, __gm__ float* dstGm, Conv2dParams params, LoopMode mode) { conv2dParams.imgShape[0] = params.imgShape[0]; conv2dParams.imgShape[1] = params.imgShape[1]; conv2dParams.kernelShape[0] = params.kernelShape[0]; conv2dParams.kernelShape[1] = params.kernelShape[1]; conv2dParams.stride[0] = params.stride[0]; conv2dParams.stride[1] = params.stride[1]; conv2dParams.cin = params.cin; conv2dParams.cout = params.cout; conv2dParams.padList[0] = params.padList[0]; conv2dParams.padList[1] = params.padList[1]; conv2dParams.padList[2] = params.padList[2]; conv2dParams.padList[3] = params.padList[3]; conv2dParams.dilation[0] = params.dilation[0]; conv2dParams.dilation[1] = params.dilation[1]; conv2dParams.initY = params.initY; conv2dParams.partialSum = params.partialSum; loopMode = mode; Conv2dTilling tilling = GetConv2dTiling<half>(conv2dParams); roundm = tilling.roundM; roundn = tilling.roundN; roundk = tilling.roundK; m = tilling.mNum; n = tilling.nNum; featureMapA1Size = conv2dParams.cin * conv2dParams.imgShape[0] * conv2dParams.imgShape[1]; weightB1Size = conv2dParams.cin * conv2dParams.cout * conv2dParams.kernelShape[0] * conv2dParams.kernelShape[1]; biasSize = conv2dParams.cout; dstCO1Size = roundm * roundn; dstUBSize = m * n; fmGlobal.SetGlobalBuffer((__gm__ half*)fmGm); weGlobal.SetGlobalBuffer((__gm__ half*)weGm); biasGlobal.SetGlobalBuffer((__gm__ float*)biasGm); dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(half)); pipe.InitBuffer(inQueueWeB1, 1, weightB1Size * sizeof(half)); pipe.InitBuffer(inQueueBiasA1, 1, biasSize * sizeof(float)); pipe.InitBuffer(outQueueCO1, 1, dstCO1Size * sizeof(float)); pipe.InitBuffer(outQueueUB, 1, dstUBSize * sizeof(float)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyUB(); CopyOut(); } private: __aicore__ inline void CopyIn() { LocalTensor<half> featureMapA1 = inQueueFmA1.AllocTensor<half>(); LocalTensor<half> weightB1 = inQueueWeB1.AllocTensor<half>(); LocalTensor<float> biasA1 = inQueueBiasA1.AllocTensor<float>(); DataCopy(featureMapA1, fmGlobal, featureMapA1Size); DataCopy(weightB1, weGlobal, weightB1Size); DataCopy(biasA1, biasGlobal, biasSize); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); inQueueBiasA1.EnQue(biasA1); } __aicore__ inline void Compute() { LocalTensor<half> featureMapA1 = inQueueFmA1.DeQue<half>(); LocalTensor<half> weightB1 = inQueueWeB1.DeQue<half>(); LocalTensor<float> biasA1 = inQueueBiasA1.DeQue<float>(); LocalTensor<float> dstCO1 = outQueueCO1.AllocTensor<float>(); Conv2dTilling tilling = GetConv2dTiling<half>(conv2dParams); tilling.loopMode = loopMode; // 输入矩阵形状为[C1, H, W, C0],特征矩阵的形状为[C1, Kh, Kw, Cout, C0], 计算结果搬出至GM,目的矩阵无需初始化 Conv2D(dstCO1, biasA1, featureMapA1, weightB1, conv2dParams, tilling); outQueueCO1.EnQue<float>(dstCO1); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); inQueueBiasA1.FreeTensor(biasA1); } __aicore__ inline void CopyUB() { LocalTensor<float> dstCO1 = outQueueCO1.DeQue<float>(); LocalTensor<float> dstUB = outQueueUB.AllocTensor<float>(); DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = dstCO1Size * sizeof(float) / 1024; DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; DataCopy(dstUB, dstCO1, dataCopyParams, 
enhancedParams); outQueueUB.EnQue<float>(dstUB); outQueueCO1.FreeTensor(dstCO1); } __aicore__ inline void CopyOut() { LocalTensor<float> dstUB = outQueueUB.DeQue<float>(); DataCopy(dstGlobal, dstUB, dstUBSize); outQueueUB.FreeTensor(dstUB); } private: TPipe pipe; // feature map queue TQue<QuePosition::A1, 1> inQueueFmA1; // weight queue TQue<QuePosition::B1, 1> inQueueWeB1; // bias queue TQue<QuePosition::A1, 1> inQueueBiasA1; // dst queue TQue<QuePosition::CO1, 1> outQueueCO1; TQue<QuePosition::VECOUT, 1> outQueueUB; GlobalTensor<half> fmGlobal, weGlobal; GlobalTensor<float> biasGlobal, dstGlobal; LoopMode loopMode = LoopMode::ModeNm; Conv2dParams conv2dParams; uint32_t m, n; uint32_t roundm, roundn, roundk; uint32_t featureMapA1Size, weightB1Size, biasSize, dstCO1Size, dstUBSize; }; } // namespace AscendC extern "C" __global__ __aicore__ void kernel_conv2d_operator(__gm__ half* fmGm, __gm__ half* weGm, __gm__ float* biasGm, __gm__ float* dstGm, Conv2dParams params, LoopMode mode) { // 上方示例参数为: // imgShape[0] = 4; imgShape[1] = 4; kernelShape[0] = 2; kernelShape[1] = 2; stride[0] = 1; stride[1] = 1; // cin = 32; cout = 16; padList[0] = 0; padList[1] = 0; padList[2] = 0; padList[3] = 0; // dilation[0] = 2; dilation[1] = 2; initY = 0; partialSum = false; AscendC::KernelCubeConv2D op; op.Init(fmGm, weGm, biasGm, dstGm, params, mode); op.Process(); }