Silu

Function Usage

Computes Silu element-wise using the following formula, where PAR indicates the number of elements that can be processed by the Vector Unit in one iteration.

Prototype

1
2
template <typename T, bool isReuseSource = false>
__aicore__ inline void Silu(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, uint32_t dataSize)

Parameters

Table 1 Parameters in the template

Parameter

Description

T

Data type of the operand.

isReuseSource

Whether the source operand can be modified. This parameter is reserved. Pass the default value false.

Table 2 API parameters

Parameter

Input/Output

Description

dstTensor

Output

Destination operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

srcTensor

Input

Source operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

The source operand must have the same data type as the destination operand.

dataSize

Input

Number of actually computed data elements. Value range: dataSize ∈ [0, min(srcTensor.GetSize(), dstTensor.GetSize())].

Returns

None

Availability

Constraints

  • For details about the alignment requirements of the operand address offset, see General Restrictions.
  • The source operand address must not overlap the destination operand address.
  • Currently, only the ND format is supported.
  • Ensure that dataSize is less than or equal to the element range stored in srcTensor and dstTensor.

Example

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include "kernel_operator.h"

template <typename srcType>
class KernelSilu
{
public:
    __aicore__ inline KernelSilu() {}
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t inputSize)
    {
        dataSize = inputSize;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), dataSize);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), dataSize);

        pipe.InitBuffer(inQueueX, 1, dataSize * sizeof(srcType));
        pipe.InitBuffer(outQueue, 1, dataSize * sizeof(srcType));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<srcType> srcLocal = inQueueX.AllocTensor<srcType>();
        AscendC::DataCopy(srcLocal, srcGlobal, dataSize);
        inQueueX.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();
        AscendC::LocalTensor<srcType> srcLocal = inQueueX.DeQue<srcType>();
        AscendC::Silu(dstLocal, srcLocal, dataSize);
        outQueue.EnQue<srcType>(dstLocal);
        inQueueX.FreeTensor(srcLocal);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
        AscendC::DataCopy(dstGlobal, dstLocal, dataSize);
        outQueue.FreeTensor(dstLocal);
    }

private:
    AscendC::GlobalTensor<srcType> srcGlobal;
    AscendC::GlobalTensor<srcType> dstGlobal;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    uint32_t dataSize = 0;
};

template <typename dataType>
__aicore__ void kernel_Silu_operator(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t dataSize)
{
    KernelSilu<dataType> op;
    op.Init(srcGm, dstGm, dataSize);
    op.Process();
}

Result example:

1
2
Input data (srcLocal):[3.304723 1.04788 ... -1.0512]
Output data (dstLocal): [3.185546875 0.77587890625 ... -0.272216796875]