函数功能

提供根据MaskTensor对SrcTensor（源操作数，输入Tensor）进行过滤的功能，得到DstTensor（目的操作数，输出Tensor）。被选取的数值会先除以权重系数probValue，再存入DstTensor；为便于说明，下文示例省略了该缩放，仅展示过滤结果。仅支持输入shape为ND格式。

该过滤功能包括两种模式：字节模式和比特模式。

字节模式
MaskTensor中存储的数值为布尔类型，每个布尔数值代表是否取用SrcTensor对应位置的数值：如果是，则选取SrcTensor中的数值存入DstTensor；否则，对DstTensor中的对应位置赋值为零。DstTensor，SrcTensor和MaskTensor的shape相同。示例如下：

SrcTensor=[1，2，3，4，5，6，7，8，9，10]

MaskTensor=[1，0，1，0，1，0，0，1，1，0]（每个数的数据类型为uint8_t）

DstTensor=[1，0，3，0，5，0，0，8，9，0]
比特模式
MaskTensor的每个bit数值，代表是否取用SrcTensor对应位置的数值：如果是，则选取SrcTensor中的数值存入DstTensor；否则，对DstTensor中的对应位置赋值为零。SrcTensor和DstTensor的shape相同，假设均为[height ， width]，MaskTensor的shape为[height ， (width / 8)]。示例如下：

SrcTensor=[1，2，3，4，5，6，7，8]

MaskTensor=[169]（转换为二进制表示为1010 1001）

DstTensor=[1，0，3，0，5，0，0，8]
- 特殊情况1：当MaskTensor有效数据非连续存放时，为了满足32B对齐，MaskTensor的width轴需要填充无效数值；同时，SrcTensor的width轴需满足256B对齐。示例如下：
  SrcTensor=[1，2，3，4，5，6，7，8，11，12，13，14，15，16，17，18]
  
  MaskTensor=[1，0，1，0，1，0，0，1，X，X，1，0，1，0，1，0，0，1，X，X]（X为无效数值，假设数据已满足对齐要求，示例数值为二进制形式表示）
  
  DstTensor=[1，0，3，0，5，0，0，8，11，0， 13， 0， 15， 0， 0，18]
- 特殊情况2：当MaskTensor有效数据连续存放、但maskTensor_size不满足32B对齐时，需要在MaskTensor的尾部填充无效数值以补齐至32B对齐；对应地，SrcTensor的尾部也需要补充无效数据，使得srcTensor_size满足256B对齐。示例如下：
  SrcTensor=[1，2，3，4，5，6，7，8，11，12，13，14，15，16，17，18]
  
  MaskTensor=[1，0，1，0，1，0，0，1， 1， 0， 1， 0， 1， 0， 0， 1，X，X，X，X]（X为无效数值，假设数据已满足对齐要求，示例数值为二进制形式表示）
  
  DstTensor= [1，0，3，0，5，0，0，8， 11， 0， 13， 0， 15， 0， 0， 18]

函数原型

template <typename T> __aicore__ inline void DropOut(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, const LocalTensor<uint8_t>& maskLocal, const float probValue, const DropOutShapeInfo& info)

参数说明

参数名称	输入/输出	含义
dstLocal	输出	目的操作数，类型为LocalTensor，LocalTensor数据结构的定义请参考LocalTensor。 Atlas A2训练系列产品，支持的数据类型为：half/float
srcLocal	输入	源操作数，类型为LocalTensor。srcLocal的数据类型需要与目的操作数保持一致。 Atlas A2训练系列产品，支持的数据类型为：half/float
maskLocal	输入	存放mask的Tensor，类型为LocalTensor。 Atlas A2训练系列产品，支持的数据类型为：uint8_t
probValue	输入	权重系数，表示srcLocal中数据被保留的概率。过滤后的结果会除以该权重系数，再存放至dstLocal中。取值范围：probValue ∈（0，1）。 Atlas A2训练系列产品，支持的数据类型为：float
info	输入	DropOutShapeInfo类型，DropOutShapeInfo结构定义如下： struct DropOutShapeInfo { __aicore__ DropOutShapeInfo(){}; uint32_t firstAxis = 0; /* srcLocal/maskLocal的height轴元素个数 */ uint32_t srcLastAxis = 0; /* srcLocal的width轴元素个数 */ uint32_t maskLastAxis = 0; /* maskLocal的width轴元素个数 */ };

返回值

无

支持的型号

Atlas A2训练系列产品

约束说明

srcLocal和dstLocal地址对齐要求请见：通用约束。
比特模式下，受限于接口内部调用指令的约束，half类型输入时，srcLocal.Size ∈[1，128*255]，float类型输入时，srcLocal.Size ∈[1，64*255]。

调用示例

#include "kernel_operator.h"

namespace AscendC {
/**
 * Example operator class that demonstrates a single-shot call to DropOut.
 *
 * The whole source tensor and the whole mask tensor are copied from global
 * memory into local queues, DropOut is applied once, and the result is copied
 * back to global memory. No tiling is performed.
 *
 * @tparam srcType Element type of the source and destination tensors.
 */
template <typename srcType> class KernelDropout {
public:
    /**
     * Keep-probability used when the caller of Init() does not supply one.
     * DropOut divides the filtered result by probValue and requires it to lie
     * in the open interval (0, 1), so the default must never be 0.
     */
    static constexpr float DEFAULT_PROB_VALUE = 0.8f;

    __aicore__ inline KernelDropout() {}

    /**
     * Binds the global-memory buffers and reserves the local queues.
     *
     * @param src_gm       Global-memory address of the source data.
     * @param mask_gm      Global-memory address of the mask data (uint8_t).
     * @param dst_gm       Global-memory address that receives the result.
     * @param firstAxis    Height-axis length shared by source and mask.
     * @param srcLastAxis  Width-axis element count of the source tensor.
     * @param maskLastAxis Width-axis element count of the mask tensor.
     * @param probValueIn  Keep-probability forwarded to DropOut; the caller
     *                     must pass a value in (0, 1). Defaults to
     *                     DEFAULT_PROB_VALUE so existing six-argument calls
     *                     keep working.
     */
    __aicore__ inline void Init(GM_ADDR src_gm, GM_ADDR mask_gm, GM_ADDR dst_gm, uint32_t firstAxis,
        uint32_t srcLastAxis, uint32_t maskLastAxis, float probValueIn = DEFAULT_PROB_VALUE)
    {
        srcSize = firstAxis * srcLastAxis;
        maskSize = firstAxis * maskLastAxis;

        // Previously probValue stayed at its initial 0, which made DropOut
        // divide by zero; it is now always set from the (defaulted) argument.
        probValue = probValueIn;

        info.firstAxis = firstAxis;
        info.srcLastAxis = srcLastAxis;
        info.maskLastAxis = maskLastAxis;

        src_global.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(src_gm), srcSize);
        mask_global.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(mask_gm), maskSize);
        dst_global.SetGlobalBuffer(reinterpret_cast<__gm__ srcType*>(dst_gm), srcSize);

        // One buffer per queue, each large enough to hold the full tensor.
        pipe.InitBuffer(inQueueX, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(inQueueY, 1, maskSize * sizeof(uint8_t));
        pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType));
    }

    /** Runs the three stages in order: copy in, compute, copy out. */
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    /** Copies source and mask from global memory into the input queues. */
    __aicore__ inline void CopyIn()
    {
        LocalTensor<srcType> srcLocal = inQueueX.AllocTensor<srcType>();
        LocalTensor<uint8_t> maskLocal = inQueueY.AllocTensor<uint8_t>();

        DataCopy(srcLocal, src_global, srcSize);
        DataCopy(maskLocal, mask_global, maskSize);

        inQueueX.EnQue(srcLocal);
        inQueueY.EnQue(maskLocal);
    }

    /** Applies DropOut to the queued inputs and enqueues the result. */
    __aicore__ inline void Compute()
    {
        LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();

        LocalTensor<srcType> srcLocal = inQueueX.DeQue<srcType>();
        LocalTensor<uint8_t> maskLocal = inQueueY.DeQue<uint8_t>();

        DropOut(dstLocal, srcLocal, maskLocal, probValue, info);

        outQueue.EnQue<srcType>(dstLocal);

        // The inputs are no longer needed once the result has been enqueued.
        inQueueX.FreeTensor(srcLocal);
        inQueueY.FreeTensor(maskLocal);
    }

    /** Copies the result from the output queue back to global memory. */
    __aicore__ inline void CopyOut()
    {
        LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
        DataCopy(dst_global, dstLocal, srcSize);
        outQueue.FreeTensor(dstLocal);
    }

private:
    GlobalTensor<srcType> src_global;
    GlobalTensor<uint8_t> mask_global;
    GlobalTensor<srcType> dst_global;

    TPipe pipe;
    TQue<QuePosition::VECIN, 1> inQueueX;   // source data
    TQue<QuePosition::VECIN, 1> inQueueY;   // mask data
    TQue<QuePosition::VECOUT, 1> outQueue;  // DropOut result

    uint32_t srcSize = 0;   // total element count of source/destination
    uint32_t maskSize = 0;  // total element count of the mask
    float probValue = DEFAULT_PROB_VALUE;  // keep-probability, must be in (0, 1)
    DropOutShapeInfo info;
};
}

template <typename dataType>
extern "C" __global__ __aicore__ void kernel_dropout_operator(GM_ADDR src_gm, GM_ADDR mask_gm, GM_ADDR dst_gm, uint32_t firstAxis,
    uint32_t srcLastAxis, uint32_t maskLastAxis)
{
    KernelDropout<dataType> op;
    op.Init(src_gm, mask_gm, dst_gm, firstAxis, srcLastAxis, maskLastAxis);
    op.Process();
}