功能说明

对输入tensor做softmax计算，当前仅支持传入shape为ND格式，内部的reduce过程都是按last轴进行。

公式如下：

函数原型

template <typename T, bool isReuseSource = false>

void SoftMax(const LocalTensor<T>& dst, const LocalTensor<T>& expSumTensor, const LocalTensor<T>& dstMax, const LocalTensor<T>& src, SoftMaxTiling& tiling)

参数说明

表1 接口参数说明
参数名	输入/输出	描述
dst	输出	目的操作数，类型为LocalTensor，last轴长度需要32B对齐。
expSumTensor	输出	目的操作数，类型为LocalTensor，会保存softmax计算过程中的exp结果再按last轴求reducesum的结果，last轴长度固定32B。
dstMax	输出	目的操作数，类型为LocalTensor，会保存softmax计算过程中的last轴求reducemax的结果，last轴长度固定32B。
src	输入	源操作数，类型为LocalTensor，last轴长度需要32B对齐。
tiling	输入	softmax计算所需tiling信息，Tiling信息的获取请参考SoftMax Tiling API。
isReuseSource	输入	dst是否复用src的空间

返回值

无

支持的型号

Atlas A2训练系列产品

注意事项

src和dst的Tensor空间可以复用。
expSumTensor和dstMax为输出，并且last轴长度必须固定32B。

操作数地址偏移对齐要求请参见通用约束。

调用示例

本样例输入src的Shape大小为[320,64]，输出Shape大小dst=[320,64]，其中中间计算结果expSumTensor=[320,16],dstMax=[320,16]，数据类型均为half。

#include "kernel_operator.h"
namespace AscendC {

template <typename T> class KernelSoftmax {
public:
    __aicore__ inline KernelSoftmax() {}
    __aicore__ inline void Init(__gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
    {
        elementNumPerBlk = 32 / sizeof(T);
        src1Global.SetGlobalBuffer((__gm__ T*)src1Gm);
        dstGlobal.SetGlobalBuffer((__gm__ T*)dstGm);
        pipe.InitBuffer(inQueueSrc1, 1, height*width * sizeof(T));
        pipe.InitBuffer(maxQueue, 1, height*elementNumPerBlk * sizeof(T));
        pipe.InitBuffer(sumQueue, 1, height*elementNumPerBlk * sizeof(T));
        pipe.InitBuffer(outQueueDst, 1, height*width * sizeof(T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:

    __aicore__ inline void CopyIn()
    {
        LocalTensor<T> srcLocal1 = inQueueSrc1.AllocTensor<T>();
        DataCopy(srcLocal1, src1Global, height*width);
        inQueueSrc1.EnQue(srcLocal1);
    }
    __aicore__ inline void Compute()
    {
        LocalTensor<T> srcLocal1 = inQueueSrc1.DeQue<T>();
        LocalTensor<T> sumTempLocal = sumQueue.AllocTensor<T>();
        LocalTensor<T> maxTempLocal = maxQueue.AllocTensor<T>();
        LocalTensor<T> dstLocal = outQueueDst.AllocTensor<T>();

        const uint32_t shapeDim = 2;
        uint32_t array[2] = {height, width};
        srcLocal1.SetShapeInfo(ShapeInfo(shapeDim, array));
        dstLocal.SetShapeInfo(ShapeInfo(shapeDim, array));

        array[0] = height;
        array[1] = elementNumPerBlk;
        sumTempLocal.SetShapeInfo(ShapeInfo(shapeDim, array));
        maxTempLocal.SetShapeInfo(ShapeInfo(shapeDim, array));

        SoftMaxTiling tiling; // 本示例tiling为演示用 实际内容需要通过Tiling Api获取
        SoftMax<T,false>(srcLocal1, sumTempLocal, maxTempLocal, srcLocal1, tiling);
        DataCopy(dstLocal, srcLocal1, height*width);

        outQueueDst.EnQue<T>(dstLocal);
        maxQueue.FreeTensor(maxTempLocal);
        sumQueue.FreeTensor(sumTempLocal);
        inQueueSrc1.FreeTensor(srcLocal1);
    }
    __aicore__ inline void CopyOut()
    {
        LocalTensor<T> dstLocal = outQueueDst.DeQue<T>();
        DataCopy(dstGlobal, dstLocal, height*width);
        outQueueDst.FreeTensor(dstLocal);
    }

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, 1> inQueueSrc1;
    TQue<QuePosition::VECIN, 1> maxQueue;
    TQue<QuePosition::VECIN, 1> sumQueue;
    TQue<QuePosition::VECOUT, 1> outQueueDst;
    GlobalTensor<T> src1Global, dstGlobal;
    uint32_t elementNumPerBlk = 0;
    uint32_t width = 64;
    uint32_t height = 320;
};

}  // namespace AscendC

extern "C" __global__ __aicore__ void softmax_kernel_half(__gm__ uint8_t *src1Gm, __gm__ uint8_t *dstGm)
{
    AscendC::KernelSoftmax<half> op;
    op.Init(src1Gm, dstGm);
    op.Process();
}