SetAtomicType

Supported Products

Product

Supported/Unsupported

Atlas A3 training products/Atlas A3 inference products

Atlas A2 training products/Atlas A2 inference products

Atlas 200I/500 A2 inference products

Atlas inference product's AI Core

Atlas inference product's Vector Core

x

Atlas training products

x

Function Usage

Sets different atomic operation data types using template parameters.

Prototype

1
2
template <typename T>
__aicore__ inline void SetAtomicType()

Parameters

Table 1 Parameters in the template

Parameter

Description

T

Sets different data types.

Atlas A2 training products/Atlas A2 inference products: int8_t, int16_t, half, bfloat16_t, int32_t, or float

Atlas A3 training products/Atlas A3 inference products: int8_t, int16_t, half, bfloat16_t, int32_t, or float

Atlas inference product's AI Core: int16_t, half, or float

Atlas 200I/500 A2 inference products: int16_t, half, int32_t, or float

Returns

None

Constraints

It must be used together with SetAtomicAdd, SetAtomicMax, and SetAtomicMin.

After using the API, you are advised to clear the atomic operating status (for details, see SetAtomicNone) to avoid affecting subsequent command functions.

Example

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// In this example, atomic minimization is performed when DataCopy is used to move data from VECOUT to external dstGlobal, and SetAtomicType is used to modify the data type of atomic minimization.
#include "kernel_operator.h"

static const int data_size = 256;
template <typename T>
class KernelDataCopyAtomicMin {
public:
    __aicore__ inline KernelDataCopyAtomicMin() {}
    __aicore__ inline void Init(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm, uint32_t size)
    {
        this->size = size;
        src0Global.SetGlobalBuffer((__gm__ T *)src0_gm);
        src1Global.SetGlobalBuffer((__gm__ T *)src1_gm);
        dstGlobal.SetGlobalBuffer((__gm__ T *)dst_gm);
        pipe.InitBuffer(queueSrc0, 1, size * sizeof(T));
        pipe.InitBuffer(queueSrc1, 1, size * sizeof(T));
        pipe.InitBuffer(queueDst0, 1, size * sizeof(T));
        pipe.InitBuffer(queueDst1, 1, size * sizeof(T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<T> src0local = queueSrc0.AllocTensor<T>();
        AscendC::LocalTensor<T> src1local = queueSrc1.AllocTensor<T>();
        AscendC::DataCopy(src0local, src0Global, size);
        AscendC::DataCopy(src1local, src1Global, size);
        queueSrc0.EnQue(src0local);
        queueSrc1.EnQue(src1local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<T> src0local = queueSrc0.DeQue<T>();
        AscendC::LocalTensor<T> src1local = queueSrc1.DeQue<T>();
        AscendC::LocalTensor<T> dst0Local = queueDst0.AllocTensor<T>();
        AscendC::LocalTensor<T> dst1Local = queueDst1.AllocTensor<T>();
        AscendC::Abs(dst0Local, src0local, size);
        AscendC::Abs(dst1Local, src1local, size);
        queueDst0.EnQue(dst0Local);
        queueDst1.EnQue(dst1Local);
        queueSrc0.FreeTensor(src0local);
        queueSrc1.FreeTensor(src1local);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<T> dst0Local = queueDst0.DeQue<T>();
        AscendC::LocalTensor<T> dst1Local = queueDst1.DeQue<T>();
        AscendC::DataCopy(dstGlobal, dst1Local, size);
        AscendC::PipeBarrier<PIPE_MTE3>();
        AscendC::SetAtomicMin<int8_t>();  // The value can be set to any type. In this example, int8_t is used.
        AscendC::SetAtomicType<T>(); // Set this parameter to the actual data type.
        AscendC::DataCopy(dstGlobal, dst0Local, size);
        queueDst0.FreeTensor(dst0Local);
        queueDst1.FreeTensor(dst1Local);
        AscendC::SetAtomicNone();
    }
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc0;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc1;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst0;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst1;
    AscendC::GlobalTensor<T> src0Global, src1Global, dstGlobal;
    uint32_t size;
};
extern "C" __global__ __aicore__ void data_copy_atomic_min_kernel(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm)
{
    KernelDataCopyAtomicMin<half> op;
    op.Init(src0_gm, src1_gm, dst_gm, data_size);
    op.Process();
}
The input data of each core is as follows:
Src0: [1,1,1,1,1, ...,1] // 1 × 256
Src1: [2,2,2,2,2, ...,2] // 2 × 256
Final output data: [1,1,1,1,1,...,1] // 1 × 256