More Examples

Example 1

The following example shows how to use the kernel API and tiling API GetXxxMaxMinTmpSize in the math library. The process is as follows:

The host calls the tiling API to obtain the size of the required temporary space and writes the size to the tiling data. The kernel reads the tiling data to obtain the size of the temporary space and then allocates the temporary space.

Usage example of the tiling API on the host:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#include <vector>

#include "register/op_def_registry.h"
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {

BEGIN_TILING_DATA_DEF(AsinCustomTilingData)
  TILING_DATA_FIELD_DEF(uint32_t, srcSize);
  TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize);
END_TILING_DATA_DEF;

// Host-side tiling function: queries the temporary-buffer requirement of the
// Asin API via GetAsinMaxMinTmpSize, clamps it to the UB space that is
// available, and records both the element count and the temporary-buffer size
// in the tiling data for the kernel to read.
static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    // Shape of the input tensor used by this example.
    std::vector<int64_t> inputDims = {16, 128};
    uint32_t elementCount = 1;
    for (auto d : inputDims) {
        elementCount *= d;
    }
    uint32_t dataTypeSize = 2; // half occupies 2 bytes
    ge::Shape inputShape(inputDims);

    // Query the minimum/maximum temporary-space sizes for Asin
    // (isReuseSource = false).
    uint32_t tmpMin = 0;
    uint32_t tmpMax = 0;
    AscendC::GetAsinMaxMinTmpSize(inputShape, dataTypeSize, false, tmpMax, tmpMin);

    auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
    uint64_t ubFree = 0; // remaining space of UB
    // In this example, full UB space is used. In the actual situation, the
    // already used UB space must be subtracted from ubFree.
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubFree);
    uint64_t tmpBytes = tmpMax;
    if (ubFree < tmpMax) {
        tmpBytes = ubFree;
    }

    AsinCustomTilingData tiling;
    tiling.set_srcSize(elementCount);
    tiling.set_tmpBufferSize(tmpBytes);
    context->SetBlockDim(1);
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);

    return ge::GRAPH_SUCCESS;
}
} // namespace optiling
The kernel reads the tiling data, obtains the size of the temporary space, and then allocates the temporary space.
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include "kernel_operator.h"

template <typename srcType>
class KernelAsin
{
public:
    __aicore__ inline KernelAsin() {}

    // Binds the GM addresses and reserves UB queues for the input, the output
    // and the shared temporary buffer whose size was computed by host tiling.
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSize, uint32_t tmpBufferSize)
    {
        inGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize);
        outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize);

        pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(tmpQueue, 1, tmpBufferSize);
        elemCount = srcSize;
    }

    // Standard three-stage pipeline: GM -> UB copy, compute, UB -> GM copy.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<srcType> input = inQueue.AllocTensor<srcType>();
        AscendC::DataCopy(input, inGlobal, elemCount);
        inQueue.EnQue(input);
    }

    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<srcType> result = outQueue.AllocTensor<srcType>();
        AscendC::LocalTensor<srcType> input = inQueue.DeQue<srcType>();
        AscendC::LocalTensor<uint8_t> scratch = tmpQueue.AllocTensor<uint8_t>();
        // isReuseSource = false: the source tensor keeps its original values.
        AscendC::Asin<srcType, false>(result, input, scratch, elemCount);

        outQueue.EnQue<srcType>(result);
        inQueue.FreeTensor(input);
        tmpQueue.FreeTensor(scratch);
    }

    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<srcType> result = outQueue.DeQue<srcType>();
        AscendC::DataCopy(outGlobal, result, elemCount);
        outQueue.FreeTensor(result);
    }

private:
    AscendC::GlobalTensor<srcType> inGlobal;
    AscendC::GlobalTensor<srcType> outGlobal;

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    uint32_t elemCount = 0;
};

// Kernel entry point: unpacks the tiling data produced by the host and runs
// the Asin pipeline for the tiling key selected by the host.
extern "C" __global__ __aicore__ void asin_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelAsin<half> op;
    op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.tmpBufferSize);
    if (TILING_KEY_IS(1)) {
        op.Process();
    }
}

Example 2

The following example shows how to use the kernel API and tiling API GetXxxTmpBufferFactorSize in the math library. The process is as follows:

The host calls the tiling API to obtain maxLiveNodeCnt and extraBuf, calculates the maximum number of elements that can be calculated by the operator at a time, and writes the number to the tiling data. The kernel reads the tiling data, obtains the value, and allocates temporary space based on the value.

Usage example of the tiling API on the host:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#include <vector>
#include <cassert>
#include "register/op_def_registry.h"
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {
BEGIN_TILING_DATA_DEF(AsinCustomTilingData)
TILING_DATA_FIELD_DEF(uint32_t, srcSize);
TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize);
END_TILING_DATA_DEF;

static ge::graphStatus TilingFunc(gert::TilingContext *context)
{
    // Input source shapes.
    std::vector<int64_t> srcDims = { 16, 128 };
    uint32_t srcSize = 1;
    uint32_t srcCurSize = 1;
    for (auto dim : srcDims) {
        srcSize *= dim;
    }
    uint32_t typeSize = 2;

    auto platformInfo = context->GetPlatformInfo();
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
    uint64_t tailSize = 0; // remaining space of UB
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, tailSize);

    uint32_t asinMaxLiveNodeCnt = 0;
    uint32_t asinExtraBuf = 0;

    uint32_t acosMaxLiveNodeCnt = 0;
    uint32_t acosExtraBuf = 0;

    AscendC::GetAsinTmpBufferFactorSize(typeSize, asinMaxLiveNodeCnt, asinExtraBuf);
    AscendC::GetAcosTmpBufferFactorSize(typeSize, acosMaxLiveNodeCnt, acosExtraBuf);
    // The size of tmp must be subtracted from the size of the input and output occupied by the API called on UB.
    // This example includes the input and output of the Asin and Acos APIs. The output of Asin is used as the input of Acos. Therefore, three src spaces are required.
    auto tmpSize = tailSize - srcSize * typeSize * 3;
    assert(tmpSize >= asinExtraBuf);
    assert(tmpSize >= acosExtraBuf);
// Calculate the maximum number of elements that can be calculated by the Asin operator at a time.
    if (asinMaliveNodeCnt != 0) {
        srcAsinCurSize = (tmpSize - asinExtraBuf) / asinMaxLiveNodeCnt / typeSize;
    } else {
        srcAsinCurSize = srcSize;
    }
// Calculate the maximum number of elements that can be calculated by the Acos operator at a time.
    if (acosMaxLiveNodeCnt != 0) {
        srcAcosCurSize = (tmpSize - acosExtraBuf) / acosMaxLiveNodeCnt / typeSize; 
    } else {
        srcAcosCurSize = srcSize;
    }
    srcCurSize = min(srcAsinCurSize, srcAcosCurSize);

    AsinCustomTilingData tiling;
    tiling.set_srcSize(srcSize);
    tiling.set_srcCurSize(srcCurSize);
    tiling.set_tmpBufferSize(tmpSize);
    context->SetBlockDim(1);
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);

    return ge::GRAPH_SUCCESS;
}
} // namespace optiling
Usage example of the kernel API on the kernel side:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#include "kernel_operator.h"
template <typename srcType>
class KernelAsin {
public:
    __aicore__ inline KernelAsin(){}

    // srcSizeIn: total number of elements; srcCurSizeIn: elements processed per
    // loop pass (computed by host tiling from maxLiveNodeCnt/extraBuf);
    // tmpBufferSize: bytes of shared temporary space for Asin/Acos.
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSizeIn, uint32_t srcCurSizeIn, uint32_t tmpBufferSize)
    {
        srcSize = srcSizeIn;
        srcCurSize = srcCurSizeIn;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize);

        pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(tmpQueue1, 1, srcCurSize * sizeof(srcType)); // intermediate result of Asin
        pipe.InitBuffer(tmpQueue, 1, tmpBufferSize);                 // scratch shared by Asin and Acos
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<srcType> srcLocal = inQueue.AllocTensor<srcType>();
        AscendC::DataCopy(srcLocal, srcGlobal, srcSize);
        inQueue.EnQue(srcLocal);
    }

    // Computes dst = acos(asin(src)) in chunks of srcCurSize elements, reusing
    // one shared temporary buffer for both APIs.
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();
        AscendC::LocalTensor<srcType> srcLocal = inQueue.DeQue<srcType>();
        AscendC::LocalTensor<uint8_t> sharedTmpBuffer = tmpQueue.AllocTensor<uint8_t>();
        AscendC::LocalTensor<srcType> tmpresBuffer = tmpQueue1.AllocTensor<srcType>();

        // fixed: loop index is unsigned to match srcSize (no signed/unsigned mix).
        for (uint32_t offset = 0; offset < srcSize; offset += srcCurSize) {
            // fixed: the last pass may cover fewer than srcCurSize elements;
            // the original always passed srcCurSize and could read/write past srcSize.
            uint32_t calcCount = (srcSize - offset < srcCurSize) ? (srcSize - offset) : srcCurSize;
            AscendC::Asin<srcType, false>(tmpresBuffer, srcLocal[offset], sharedTmpBuffer, calcCount);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Acos<srcType, false>(dstLocal[offset], tmpresBuffer, sharedTmpBuffer, calcCount);
            AscendC::PipeBarrier<PIPE_V>();
        }
        outQueue.EnQue<srcType>(dstLocal);
        inQueue.FreeTensor(srcLocal);
        tmpQueue.FreeTensor(sharedTmpBuffer);
        tmpQueue1.FreeTensor(tmpresBuffer); // fixed: tmpresBuffer was never freed
    }

    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
        AscendC::DataCopy(dstGlobal, dstLocal, srcSize);
        outQueue.FreeTensor(dstLocal);
    }

private:
    AscendC::GlobalTensor<srcType> srcGlobal;
    AscendC::GlobalTensor<srcType> dstGlobal;

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue1;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    uint32_t srcSize, srcCurSize;
};

// Kernel entry point: unpacks the tiling data (total size, per-pass size, and
// temporary-buffer size) and runs the chunked Asin+Acos pipeline.
extern "C" __global__ __aicore__ void kernel_asin_operator(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelAsin<half> op;
    op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.srcCurSize, tilingData.tmpBufferSize);
    if (TILING_KEY_IS(1))
    {
        op.Process();
    }
}

Example 3

The following example uses key code that calls the Exp API, together with the data printed before and after the call, to describe the usage and effect of the template parameter isReuseSource.

The template parameter isReuseSource is of the bool type. If isReuseSource is set to false, the memory space of the source operand is not reused during internal computation of the API. If isReuseSource is set to true, the memory space of the source operand is reused during internal computation of the API to store some intermediate results, saving memory space. Note that the memory space of the source operand is not the original value after the API is executed.

  • isReuseSource = false
    1
    2
    3
    4
    5
    6
    7
    8
    9
    // Values of dstLocal and srcLocal before Exp is called
    // dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]
    
    AscendC::Exp<float, 15, false>(dstLocal, srcLocal, 16);
    
    // Values of dstLocal and srcLocal after Exp is called
    // dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04]
    // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]
    
  • isReuseSource = true
    1
    2
    3
    4
    5
    6
    7
    8
    9
    // Values of dstLocal and srcLocal before Exp is called
    // dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]
    
    AscendC::Exp<float, 15, true>(dstLocal, srcLocal, 16);
    
    // Values of dstLocal and srcLocal after Exp is called
    // dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04]
    // srcLocal: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]