SetAtomicMax (ISASI)

Product Support

Product	Supported
Atlas A3 training products/Atlas A3 inference products	√
Atlas A2 training products/Atlas A2 inference products	√
Atlas 200I/500 A2 inference products	x
Atlas inference product's AI Core	x
Atlas inference product's Vector Core	x
Atlas training products	x

Function

Sets whether to perform atomic comparison for subsequent data transferred from VECOUT to GM, which compares the content to be copied with the existing content in GM and writes the maximum value to GM.

You can set different data types using template parameters.

Prototype

template <typename T>
__aicore__ inline void SetAtomicMax() 

Parameters

**Table 1** Template parameters
Parameter	Description
T	Sets different data types. For Atlas A2 training products/Atlas A2 inference products, the supported data types are int8_t, int16_t, half, bfloat16_t, int32_t, and float. For Atlas A3 training products/Atlas A3 inference products, the supported data types are int8_t, int16_t, half, bfloat16_t, int32_t, and float.

Returns

None

Restrictions

You are advised to disable atomic maximization by using SetAtomicNone to avoid affecting subsequent functions.
For the Atlas A2 training products/Atlas A2 inference products, the inf/nan mode cannot be set for the bfloat16_t type.

Example

// In this example, atomic maximization is performed when DataCopy is used to move data from VECOUT to external dstGlobal.
#include "kernel_operator.h"
static const int data_size = 256;

template <typename T>
class KernelDataCopyAtomicMax {
public:
    __aicore__ inline KernelDataCopyAtomicMax() {}
    __aicore__ inline void Init(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm, uint32_t size)
    {
        this->size = size;
        src0Global.SetGlobalBuffer((__gm__ T *)src0_gm);
        src1Global.SetGlobalBuffer((__gm__ T *)src1_gm);
        dstGlobal.SetGlobalBuffer((__gm__ T *)dst_gm);
        pipe.InitBuffer(queueSrc0, 1, size * sizeof(T));
        pipe.InitBuffer(queueSrc1, 1, size * sizeof(T));
        pipe.InitBuffer(queueDst0, 1, size * sizeof(T));
        pipe.InitBuffer(queueDst1, 1, size * sizeof(T));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<T> src0local = queueSrc0.AllocTensor<T>();
        AscendC::LocalTensor<T> src1local = queueSrc1.AllocTensor<T>();
        AscendC::DataCopy(src0local, src0Global, size);
        AscendC::DataCopy(src1local, src1Global, size);
        queueSrc0.EnQue(src0local);
        queueSrc1.EnQue(src1local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<T> src0local = queueSrc0.DeQue<T>();
        AscendC::LocalTensor<T> src1local = queueSrc1.DeQue<T>();
        AscendC::LocalTensor<T> dst0Local = queueDst0.AllocTensor<T>();
        AscendC::LocalTensor<T> dst1Local = queueDst1.AllocTensor<T>();
        AscendC::Abs(dst0Local, src0local, size);
        AscendC::Abs(dst1Local, src1local, size);
        queueDst0.EnQue(dst0Local);
        queueDst1.EnQue(dst1Local);
        queueSrc0.FreeTensor(src0local);
        queueSrc1.FreeTensor(src1local);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<T> dst0Local = queueDst0.DeQue<T>();
        AscendC::LocalTensor<T> dst1Local = queueDst1.DeQue<T>();
        AscendC::DataCopy(dstGlobal, dst1Local, size);
        AscendC::PipeBarrier<PIPE_MTE3>();
        AscendC::SetAtomicMax<T>();
        AscendC::DataCopy(dstGlobal, dst0Local, size);
        queueDst0.FreeTensor(dst0Local);
        queueDst1.FreeTensor(dst1Local);
        AscendC::SetAtomicNone();
    }
private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc0;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc1;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst0;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst1;
    AscendC::GlobalTensor<T> src0Global, src1Global, dstGlobal;
    uint32_t size;
};
extern "C" __global__ __aicore__ void data_copy_atomic_max_kernel(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm)
{
    KernelDataCopyAtomicMax<half> op;
    op.Init(src0_gm, src1_gm, dst_gm, data_size);
    op.Process();
}
The input data of each core is as follows:
Src0: [1,1,1,1,1, ...,1] // 1 × 256
Src1: [2,2,2,2,2, ...,2] // 2 × 256
Final output data: [2,2,2,2,2, ...,2] // 2 × 256

Parent topic: Atomic Operations