SetAtomicMax (ISASI)
Supported Products
Product |
Supported/Unsupported |
|---|---|
√ |
|
√ |
|
x |
|
x |
|
x |
|
x |
Function Usage
Sets whether to perform atomic comparison for subsequent data transferred from VECOUT to GM, which compares the content to be copied with the existing content in GM and writes the maximum value to GM.
You can set different data types using template parameters.
Prototype
1 2 | template <typename T> __aicore__ inline void SetAtomicMax() |
Parameters
Parameter |
Description |
|---|---|
T |
Sets different data types. |
Returns
None
Constraints
- You are advised to disable atomic maximization by using SetAtomicNone to avoid affecting subsequent functions.
- For the
Atlas A2 training products /Atlas A2 inference products , the inf/nan mode cannot be set for the bfloat16_t type.
Example
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | // This example performs the atomic maximum operation when DataCopy is used to transfer data from VECOUT to the external dstGlobal. #include "kernel_operator.h" static const int data_size = 256; template <typename T> class KernelDataCopyAtomicMax { public: __aicore__ inline KernelDataCopyAtomicMax() {} __aicore__ inline void Init(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm, uint32_t size) { this->size = size; src0Global.SetGlobalBuffer((__gm__ T *)src0_gm); src1Global.SetGlobalBuffer((__gm__ T *)src1_gm); dstGlobal.SetGlobalBuffer((__gm__ T *)dst_gm); pipe.InitBuffer(queueSrc0, 1, size * sizeof(T)); pipe.InitBuffer(queueSrc1, 1, size * sizeof(T)); pipe.InitBuffer(queueDst0, 1, size * sizeof(T)); pipe.InitBuffer(queueDst1, 1, size * sizeof(T)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<T> src0local = queueSrc0.AllocTensor<T>(); AscendC::LocalTensor<T> src1local = queueSrc1.AllocTensor<T>(); AscendC::DataCopy(src0local, src0Global, size); AscendC::DataCopy(src1local, src1Global, size); queueSrc0.EnQue(src0local); queueSrc1.EnQue(src1local); } __aicore__ inline void Compute() { AscendC::LocalTensor<T> src0local = queueSrc0.DeQue<T>(); AscendC::LocalTensor<T> src1local = queueSrc1.DeQue<T>(); AscendC::LocalTensor<T> dst0Local = queueDst0.AllocTensor<T>(); AscendC::LocalTensor<T> dst1Local = queueDst1.AllocTensor<T>(); AscendC::Abs(dst0Local, src0local, size); AscendC::Abs(dst1Local, src1local, size); queueDst0.EnQue(dst0Local); queueDst1.EnQue(dst1Local); queueSrc0.FreeTensor(src0local); queueSrc1.FreeTensor(src1local); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<T> dst0Local = queueDst0.DeQue<T>(); AscendC::LocalTensor<T> dst1Local = queueDst1.DeQue<T>(); AscendC::DataCopy(dstGlobal, dst1Local, size); AscendC::PipeBarrier<PIPE_MTE3>(); AscendC::SetAtomicMax<T>(); AscendC::DataCopy(dstGlobal, dst0Local, size); queueDst0.FreeTensor(dst0Local); queueDst1.FreeTensor(dst1Local); AscendC::SetAtomicNone(); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc0; AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc1; AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst0; AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst1; AscendC::GlobalTensor<T> src0Global, src1Global, dstGlobal; uint32_t size; }; extern "C" __global__ __aicore__ void data_copy_atomic_max_kernel(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm) { KernelDataCopyAtomicMax<half> op; op.Init(src0_gm, src1_gm, dst_gm, data_size); op.Process(); } The input data of each core is as follows: Src0: [1,1,1,1,1, ...,1] // 1 × 256 Src1: [2,2,2,2,2, ...,2] // 2 × 256 Final output data: [2,2,2,2,2, ...,2] // 2 × 256 |