SetAtomicType

Product Support

Product	Supported
Atlas A3 training products/Atlas A3 inference products	√
Atlas A2 training products/Atlas A2 inference products	√
Atlas 200I/500 A2 inference products	√
Atlas inference product's AI Core	√
Atlas inference product's Vector Core	x
Atlas training products	x

Function

Sets the data type used by atomic operations. The data type is specified through the template parameter.

Prototype

template <typename T>
__aicore__ inline void SetAtomicType()

Parameters

**Table 1** Template parameters
Parameter	Description
T	Data type used by the atomic operation. Supported data types by product: For Atlas A2 training products/Atlas A2 inference products, the supported data types are int8_t, int16_t, half, bfloat16_t, int32_t, and float. For Atlas A3 training products/Atlas A3 inference products, the supported data types are int8_t, int16_t, half, bfloat16_t, int32_t, and float. For the Atlas inference product's AI Core, the supported data types are int16_t, half, and float. For Atlas 200I/500 A2 inference products, the supported data types are int16_t, half, int32_t, and float.

Returns

None

Restrictions

This API must be used together with SetAtomicAdd, SetAtomicMax, or SetAtomicMin.

After using this API, you are advised to clear the atomic operation state (for details, see SetAtomicNone) so that subsequent instructions are not affected.

Example

// In this example, an atomic min is applied when DataCopy moves data from VECOUT to the global tensor dstGlobal, and SetAtomicType is used to change the data type of the atomic min.
#include "kernel_operator.h"

// Number of elements handled by the example kernel; passed to Init() as the element count.
static const int data_size{256};
// Example kernel class for SetAtomicType.
// It takes the absolute value of two inputs, writes the second result to the destination
// global tensor, and then copies the first result to the same destination with atomic min
// enabled, using SetAtomicType<T>() to select the data type of the atomic operation.
template <typename T>
class KernelDataCopyAtomicMin {
public:
    __aicore__ inline KernelDataCopyAtomicMin() {}

    // Binds the three global-memory addresses to the global tensors and initializes each of
    // the four queues with one buffer large enough for `size` elements of type T.
    __aicore__ inline void Init(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm, uint32_t size)
    {
        elemCount = size;

        src0Global.SetGlobalBuffer((__gm__ T *)src0_gm);
        src1Global.SetGlobalBuffer((__gm__ T *)src1_gm);
        dstGlobal.SetGlobalBuffer((__gm__ T *)dst_gm);

        // Byte length of one buffer; every queue uses the same length.
        const auto bufferBytes = size * sizeof(T);
        pipe.InitBuffer(queueSrc0, 1, bufferBytes);
        pipe.InitBuffer(queueSrc1, 1, bufferBytes);
        pipe.InitBuffer(queueDst0, 1, bufferBytes);
        pipe.InitBuffer(queueDst1, 1, bufferBytes);
    }

    // Runs the three stages in order: copy in, compute, copy out.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    // Copies both inputs from their global tensors into local tensors and enqueues them.
    __aicore__ inline void CopyIn()
    {
        auto in0 = queueSrc0.AllocTensor<T>();
        auto in1 = queueSrc1.AllocTensor<T>();
        AscendC::DataCopy(in0, src0Global, elemCount);
        AscendC::DataCopy(in1, src1Global, elemCount);
        queueSrc0.EnQue(in0);
        queueSrc1.EnQue(in1);
    }

    // Computes Abs of each input into its own output tensor, enqueues the outputs,
    // and releases the input tensors.
    __aicore__ inline void Compute()
    {
        auto in0 = queueSrc0.DeQue<T>();
        auto in1 = queueSrc1.DeQue<T>();
        auto out0 = queueDst0.AllocTensor<T>();
        auto out1 = queueDst1.AllocTensor<T>();
        AscendC::Abs(out0, in0, elemCount);
        AscendC::Abs(out1, in1, elemCount);
        queueDst0.EnQue(out0);
        queueDst1.EnQue(out1);
        queueSrc0.FreeTensor(in0);
        queueSrc1.FreeTensor(in1);
    }

    // Writes the second result to dstGlobal with a plain copy, then copies the first result
    // to the same destination with atomic min enabled.
    __aicore__ inline void CopyOut()
    {
        auto out0 = queueDst0.DeQue<T>();
        auto out1 = queueDst1.DeQue<T>();

        // Plain (non-atomic) copy of the second result.
        AscendC::DataCopy(dstGlobal, out1, elemCount);
        // Barrier on the MTE3 pipe between the two copies to dstGlobal
        // (presumably so the plain copy completes before the atomic one starts).
        AscendC::PipeBarrier<PIPE_MTE3>();

        // The type given to SetAtomicMin is arbitrary here (int8_t is just an example);
        // SetAtomicType<T>() then sets the actual data type of the atomic operation.
        AscendC::SetAtomicMin<int8_t>();
        AscendC::SetAtomicType<T>();
        AscendC::DataCopy(dstGlobal, out0, elemCount);

        queueDst0.FreeTensor(out0);
        queueDst1.FreeTensor(out1);
        // Clear the atomic operation state so that later operations are not affected.
        AscendC::SetAtomicNone();
    }

private:
    AscendC::TPipe pipe;
    // Input queues (VECIN) and output queues (VECOUT).
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc0;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc1;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst0;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst1;
    AscendC::GlobalTensor<T> src0Global, src1Global, dstGlobal;
    // Number of elements of type T processed per stage, as passed to Init().
    uint32_t elemCount;
};
// Kernel entry point: instantiates the example for `half` data, initializes it with the three
// global-memory addresses and data_size elements, and runs it.
extern "C" __global__ __aicore__ void data_copy_atomic_min_kernel(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm)
{
    KernelDataCopyAtomicMin<half> atomicMinKernel;
    atomicMinKernel.Init(src0_gm, src1_gm, dst_gm, data_size);
    atomicMinKernel.Process();
}

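The input data of each core is as follows:
Src0: [1,1,1,1,1, ...,1] // 256 elements, each equal to 1
Src1: [2,2,2,2,2, ...,2] // 256 elements, each equal to 2
Final output data: [1,1,1,1,1,...,1] // 256 elements, each equal to 1 (the element-wise minimum of the two inputs)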

Parent topic: Atomic Operations