功能说明

使用计算好的expsum和max数据对输入tensor做softmax计算，当前仅支持传入shape为ND格式，内部的reduce过程都是按last轴进行。

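公式如下（对每一行 i 沿last轴计算，其中 max_i、expsum_i 分别为该行预先计算好的最大值和指数和）：

$$\mathrm{dst}_{i,j} = \frac{\exp\left(\mathrm{src}_{i,j} - \mathrm{max}_{i}\right)}{\mathrm{expsum}_{i}}$$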

函数原型

template <typename T, bool isReuseSource = false>

void SimpleSoftMax(const LocalTensor<T>& dst, const LocalTensor<T>& inExpSumTensor, const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& src, SoftMaxTiling& tiling)

参数说明

表1 接口参数说明
参数名	输入/输出	描述
dst	输出	目的操作数，类型为LocalTensor，last轴长度需要32B对齐。
inExpSumTensor	输入	源操作数，类型为LocalTensor，softmax计算需要的expSum值，last轴长度固定32B。
inMaxTensor	输入	源操作数，类型为LocalTensor，softmax计算需要的max值，last轴长度固定32B。
src	输入	源操作数，类型为LocalTensor，last轴长度需要32B对齐。
tiling	输入	SimpleSoftMax计算所需的tiling信息。
isReuseSource	输入	模板参数，是否复用src的空间。

返回值

无

支持的型号

Atlas A2训练系列产品

注意事项

src和dst的Tensor空间可以复用。
inExpSumTensor和inMaxTensor为输入，并且last轴长度必须固定为32B。

操作数地址偏移对齐要求请参见通用约束。

调用示例

本样例输入src的shape为[320,64]，输出dst的shape为[320,64]，输入inExpSumTensor和inMaxTensor的shape均为[320,16]，数据类型均为half。

#include "kernel_operator.h"

namespace AscendC {

/**
 * Sample kernel that demonstrates SimpleSoftMax on a [height, width] tensor of type T.
 *
 * Pipeline: CopyIn (global -> local) -> Compute (SoftMax to obtain the sum/max
 * tensors, then SimpleSoftMax using them) -> CopyOut (local -> global).
 */
template <typename T> class KernelSoftmax {
public:
    __aicore__ inline KernelSoftmax() {}

    /**
     * Binds the global buffers and reserves one buffer for each queue.
     *
     * @param src1Gm global-memory address of the input, viewed as elements of T
     * @param dstGm  global-memory address of the output, viewed as elements of T
     */
    __aicore__ inline void Init(__gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
    {
        // Number of T elements that fit in 32 bytes; used as the last-axis
        // length of the max/sum tensors.
        elementNumPerBlk = 32 / sizeof(T);
        src1Global.SetGlobalBuffer((__gm__ T*)src1Gm);
        dstGlobal.SetGlobalBuffer((__gm__ T*)dstGm);
        pipe.InitBuffer(inQueueSrc1, 1, height*width * sizeof(T));
        pipe.InitBuffer(maxQueue, 1, height*elementNumPerBlk * sizeof(T));
        pipe.InitBuffer(sumQueue, 1, height*elementNumPerBlk * sizeof(T));
        pipe.InitBuffer(outQueueDst, 1, height*width * sizeof(T));
    }

    /** Runs the three stages once, in order. */
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:

    /** Copies height*width input elements from src1Global into a local tensor and enqueues it. */
    __aicore__ inline void CopyIn()
    {
        LocalTensor<T> srcLocal1 = inQueueSrc1.AllocTensor<T>();
        DataCopy(srcLocal1, src1Global, height*width);
        inQueueSrc1.EnQue(srcLocal1);
    }

    /**
     * Computes the sum/max tensors with SoftMax, then calls SimpleSoftMax with
     * them and enqueues the result for CopyOut.
     */
    __aicore__ inline void Compute()
    {
        LocalTensor<T> srcLocal1 = inQueueSrc1.DeQue<T>();
        LocalTensor<T> sumTempLocal = sumQueue.AllocTensor<T>();
        LocalTensor<T> maxTempLocal = maxQueue.AllocTensor<T>();
        LocalTensor<T> dstLocal = outQueueDst.AllocTensor<T>();

        // Source and destination share the shape [height, width].
        const uint32_t shapeDim = 2;
        uint32_t array[2] = {height, width};
        srcLocal1.SetShapeInfo(ShapeInfo(shapeDim, array));
        dstLocal.SetShapeInfo(ShapeInfo(shapeDim, array));

        // Sum and max tensors have shape [height, elementNumPerBlk].
        array[0] = height;
        array[1] = elementNumPerBlk;
        sumTempLocal.SetShapeInfo(ShapeInfo(shapeDim, array));
        maxTempLocal.SetShapeInfo(ShapeInfo(shapeDim, array));

        // The tiling objects here are for demonstration only; real tiling
        // contents must be obtained through the Tiling API.
        SoftMaxTiling tiling1;
        // This call only serves to produce the sumTempLocal and maxTempLocal
        // data that SimpleSoftMax needs as inputs.
        SoftMax<T,false>(dstLocal, sumTempLocal, maxTempLocal, srcLocal1, tiling1);
        SoftMaxTiling tiling2;
        SimpleSoftMax<T,false>(dstLocal, sumTempLocal, maxTempLocal, srcLocal1, tiling2);

        // dstLocal holds the SimpleSoftMax output and is enqueued as-is; copying
        // srcLocal1 into it at this point would replace the computed result.
        outQueueDst.EnQue<T>(dstLocal);
        maxQueue.FreeTensor(maxTempLocal);
        sumQueue.FreeTensor(sumTempLocal);
        inQueueSrc1.FreeTensor(srcLocal1);
    }

    /** Dequeues the result and copies height*width elements to dstGlobal. */
    __aicore__ inline void CopyOut()
    {
        LocalTensor<T> dstLocal = outQueueDst.DeQue<T>();
        DataCopy(dstGlobal, dstLocal, height*width);
        outQueueDst.FreeTensor(dstLocal);
    }

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, 1> inQueueSrc1;   // input tensor
    TQue<QuePosition::VECIN, 1> maxQueue;      // buffer for the max tensor
    TQue<QuePosition::VECIN, 1> sumQueue;      // buffer for the sum tensor
    TQue<QuePosition::VECOUT, 1> outQueueDst;  // output tensor
    GlobalTensor<T> src1Global, dstGlobal;
    uint32_t elementNumPerBlk = 0;  // set in Init to 32 / sizeof(T)
    uint32_t width = 64;            // last-axis length of src/dst
    uint32_t height = 320;          // number of rows
};

}  // namespace AscendC

/**
 * Kernel entry point for the half-precision sample.
 *
 * Instantiates KernelSoftmax<half>, initializes it with the given global
 * buffers, and runs it once.
 *
 * @param src1Gm global-memory address of the input data
 * @param dstGm  global-memory address of the output data
 */
extern "C" __global__ __aicore__ void softmax_kernel_half(__gm__ uint8_t *src1Gm, __gm__ uint8_t *dstGm)
{
    AscendC::KernelSoftmax<half> softmaxKernel;
    softmaxKernel.Init(src1Gm, dstGm);
    softmaxKernel.Process();
}