Gatherb (ISASI)

Function Usage

Gathers a given input tensor to the result tensor based on the offset address tensor provided.

Prototype

1
2
template <typename T>
__aicore__ inline void Gatherb(const LocalTensor<T>& dstLocal, const LocalTensor<T>& src0Local, const LocalTensor<uint32_t>& offsetLocal, const uint8_t repeatTimes, const GatherRepeatParams& repeatParams)

Parameters

Table 1 Parameters

Parameter

Input/Output

Meaning

dstLocal

Output

Destination operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

The start address of the LocalTensor must be 32-byte aligned.

For the Atlas A2 training products / Atlas A2 inference products , the supported data types are uint16_t and uint32_t.

For the Atlas A3 training products / Atlas A3 inference products , the supported data types are uint16_t and uint32_t.

For the Atlas 200I/500 A2 inference products , the supported data types are int8_t, uint8_t, int16_t, uint16_t, half, float, int32_t, uint32_t, bfloat16_t, and int64_t.

src0Local

Input

Source operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

The start address of the LocalTensor must be 32-byte aligned.

The source operand must have the same data type as the destination operand.

For the Atlas A2 training products / Atlas A2 inference products , the supported data types are uint16_t and uint32_t.

For the Atlas A3 training products / Atlas A3 inference products , the supported data types are uint16_t and uint32_t.

For the Atlas 200I/500 A2 inference products , the supported data types are int8_t, uint8_t, int16_t, uint16_t, half, float, int32_t, uint32_t, bfloat16_t, and int64_t.

offsetLocal

Input

Address offset of each data block in the source operand.

The type is LocalTensor, and the supported TPosition is VECIN, VECCALC, or VECOUT.

The start address of the LocalTensor must be 32-byte aligned.

The offset is relative to the base address of srcLocal. The size of each element is at least 0, in bytes. Ensure that the offset address is 32-byte aligned.

repeatTimes

Input

Number of iteration repeats. Data of eight data blocks is collected in each repeat. Data range: repeatTimes ∈ [0,255]; Data type: uint8_t.

repeatParams

Input

Instruction repeat parameter of the GatherRepeatParams type. The following parameters can be configured:

  • dstBlkStride: Address stride of the vector destination operand between different data blocks in a single repeat
  • dstRepStride: Address stride of the vector destination operand for the same data block between adjacent repeats

GatherRepeatParams is defined as follows:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
struct GatherRepeatParams {
    __aicore__ GatherRepeatParams()
    {
        blockNumber = DEFAULT_BLK_NUM;
        dstBlkStride = DEFAULT_BLK_STRIDE;
        src0BlkStride = DEFAULT_BLK_STRIDE;
        src1BlkStride = DEFAULT_BLK_STRIDE;
        dstRepStride = DEFAULT_REPEAT_STRIDE;
        src0RepStride = DEFAULT_REPEAT_STRIDE;
        src1RepStride = DEFAULT_REPEAT_STRIDE;
    }
    __aicore__ GatherRepeatParams(const uint8_t dstBlkStrideIn, const uint8_t dstRepStrideIn)
    {
        dstBlkStride = dstBlkStrideIn;
        dstRepStride = dstRepStrideIn;
    }
    uint32_t blockNumber = DEFAULT_BLK_NUM;
    uint16_t dstRepStride = DEFAULT_REPEAT_STRIDE;
    uint8_t dstBlkStride = DEFAULT_BLK_STRIDE;
    uint8_t src0BlkStride = DEFAULT_BLK_STRIDE;
    uint8_t src1BlkStride = DEFAULT_BLK_STRIDE;
    uint8_t src0RepStride = DEFAULT_REPEAT_STRIDE;
    uint8_t src1RepStride = DEFAULT_REPEAT_STRIDE;
    bool repeatStrideMode = false;
    bool strideSizeMode = false;
};

Availability

Atlas A2 training products / Atlas A2 inference products

Atlas A3 training products / Atlas A3 inference products

Atlas 200I/500 A2 inference products

Precautions

None

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#include "kernel_operator.h"

class VgatherbCase {
public:
    __aicore__ inline VgatherbCase() {}

    __aicore__ inline void Init(__gm__ uint8_t *x, __gm__ uint8_t *y, __gm__ uint8_t *offset)
    {
        x_gm.SetGlobalBuffer(reinterpret_cast<__gm__ uint16_t *>(x));
        y_gm.SetGlobalBuffer(reinterpret_cast<__gm__ uint16_t *>(y));
        offset_gm.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(offset));

        uint32_t len = 128;
        bufferLen = len;
        tpipe.InitBuffer(vecIn, 2, bufferLen * sizeof(uint16_t));
        tpipe.InitBuffer(vecOffset, 2, 8 * sizeof(uint32_t));
        tpipe.InitBuffer(vecOut, 2, bufferLen * sizeof(uint16_t));
    }

    __aicore__ inline void CopyIn(uint32_t index)
    {
        auto x_buf = vecIn.AllocTensor<uint16_t>();
        auto offset_buf = vecOffset.AllocTensor<uint32_t>();
        AscendC::DataCopy(x_buf, x_gm[index * bufferLen], bufferLen);
        AscendC::DataCopy(offset_buf, offset_gm[0], 8);
        vecIn.EnQue(x_buf);
        vecOffset.EnQue(offset_buf);
    }

    __aicore__ inline void CopyOut(uint32_t index)
    {
        auto y_buf = vecOut.DeQue<uint16_t>();
        AscendC::DataCopy(y_gm[index * bufferLen], y_buf, bufferLen);
        vecOut.FreeTensor(y_buf);
    }

    __aicore__ inline void Compute()
    {
        auto x_buf = vecIn.DeQue<uint16_t>();
        auto offset_buf = vecOffset.DeQue<uint32_t>();
        auto y_buf = vecOut.AllocTensor<uint16_t>();
        AscendC::GatherRepeatParams params{1, 8};
        uint8_t repeatTime = bufferLen * sizeof(uint16_t) / 256;
        AscendC::Gatherb<uint16_t>(y_buf, x_buf, offset_buf, repeatTime, params);
        vecIn.FreeTensor(x_buf);
        vecOffset.FreeTensor(offset_buf);
        vecOut.EnQue(y_buf);
    }

    __aicore__ inline void Process()
    {
        for (int i = 0; i < 1; i++) {
            CopyIn(i);
            Compute();
            CopyOut(i);
        }
    }

private:
    AscendC::GlobalTensor<uint16_t> x_gm;
    AscendC::GlobalTensor<uint16_t> y_gm;
    AscendC::GlobalTensor<uint32_t> offset_gm;

    AscendC::TPipe tpipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 2> vecIn;
    AscendC::TQue<AscendC::QuePosition::VECIN, 2> vecOffset;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 2> vecOut;

    uint32_t bufferLen = 0;
};

extern "C" __global__ __aicore__ void vgatherb_core(__gm__ uint8_t *x, __gm__ uint8_t *y, __gm__ uint8_t *offset)
{
    VgatherbCase op;
    op.Init(x, y, offset);
    op.Process();
}
Result example:
Input (offsetLocal): [224 192 160 128 96 64 32 0]
Input (srcLocal): [0 1 2 3 4 5 6 7 ... 120 121 122 123 124 125 126 127]
Output (dstGlobal): [
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
... 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
]