Silu

Function Usage

Computes Silu element-wise using the following formula, where PAR indicates the number of elements that can be processed by the Vector Unit in one iteration.

$dst_i = \mathrm{Silu}(src_i) = \dfrac{src_i}{1 + e^{-src_i}}, \quad i \in [0, \mathrm{PAR})$

Prototype

template <typename T, bool isReuseSource = false>
__aicore__ inline void Silu(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, uint32_t dataSize)

Parameters

**Table 1** Parameters in the template
Parameter	Description
T	Data type of the operand.
isReuseSource	Whether the source operand can be modified. This parameter is reserved. Pass the default value false.

**Table 2** API parameters
Parameter	Input/Output	Description
dstTensor	Output	Destination operand (named dstLocal in the prototype). The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.
srcTensor	Input	Source operand (named srcLocal in the prototype). The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT. The source operand must have the same data type as the destination operand.
dataSize	Input	Number of actually computed data elements. Value range: dataSize ∈ [0, min(srcTensor.GetSize(), dstTensor.GetSize())].

Returns

None

Availability

Constraints

For details about the alignment requirements of the operand address offset, see General Restrictions.
The source operand address must not overlap the destination operand address.
Currently, only the ND format is supported.
Ensure that dataSize is less than or equal to the element range stored in srcTensor and dstTensor.

Example

#include "kernel_operator.h"

// Sample kernel wrapper around AscendC::Silu.
//
// Init() binds the global buffers and sets up the queues; Process() then runs
// three stages in order: copy the input into a local tensor, apply Silu, and
// copy the result back out.
template <typename srcType>
class KernelSilu
{
public:
    __aicore__ inline KernelSilu() {}

    // Binds the global input/output buffers and initializes one buffer per queue.
    //   srcGm     - address of the input data
    //   dstGm     - address of the output data
    //   inputSize - element count used for both copies and for the Silu call
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize)
    {
        elemCount = inputSize;

        inputGm.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), elemCount);
        outputGm.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), elemCount);

        // Each queue gets a single buffer large enough for elemCount elements.
        tpipe.InitBuffer(inputQueue, 1, elemCount * sizeof(srcType));
        tpipe.InitBuffer(outputQueue, 1, elemCount * sizeof(srcType));
    }

    // Runs the copy-in, compute and copy-out stages once, in that order.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    using Tensor = AscendC::LocalTensor<srcType>;

    // Stage 1: copy elemCount elements from the global input into a tensor
    // allocated from the input queue, then enqueue it for Compute().
    __aicore__ inline void CopyIn()
    {
        Tensor staged = inputQueue.AllocTensor<srcType>();
        AscendC::DataCopy(staged, inputGm, elemCount);
        inputQueue.EnQue(staged);
    }

    // Stage 2: apply Silu to the dequeued input and enqueue the result.
    __aicore__ inline void Compute()
    {
        Tensor result = outputQueue.AllocTensor<srcType>();
        Tensor operand = inputQueue.DeQue<srcType>();

        AscendC::Silu(result, operand, elemCount);

        outputQueue.EnQue<srcType>(result);
        // The input tensor is no longer needed once Silu has been issued.
        inputQueue.FreeTensor(operand);
    }

    // Stage 3: copy the computed elements to the global output and release
    // the local tensor.
    __aicore__ inline void CopyOut()
    {
        Tensor result = outputQueue.DeQue<srcType>();
        AscendC::DataCopy(outputGm, result, elemCount);
        outputQueue.FreeTensor(result);
    }

private:
    AscendC::GlobalTensor<srcType> inputGm;   // view over the srcGm address
    AscendC::GlobalTensor<srcType> outputGm;  // view over the dstGm address
    AscendC::TPipe tpipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inputQueue;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outputQueue;
    uint32_t elemCount = 0;                   // number of elements to process
};

// Entry helper for the sample: builds a KernelSilu<dataType>, initializes it
// with the given source/destination addresses and element count, and runs it.
//   srcGm    - address of the input data
//   dstGm    - address of the output data
//   dataSize - number of elements to process
template <typename dataType>
__aicore__ void kernel_Silu_operator(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t dataSize)
{
    KernelSilu<dataType> siluKernel;

    siluKernel.Init(srcGm, dstGm, dataSize);
    siluKernel.Process();
}

Result example:

Input data (srcLocal): [3.304723 1.04788 ... -1.0512]
Output data (dstLocal): [3.185546875 0.77587890625 ... -0.272216796875]

Parent topic: Silu