SetAtomicMin (ISASI)
Supported Products
Product |
Supported/Unsupported |
|---|---|
√ |
|
√ |
|
x |
|
x |
|
x |
|
x |
Function Usage
Sets whether to perform atomic comparison for subsequent data transferred from VECOUT to GM, which compares the content to be copied with the existing content in GM and writes the minimum value to GM.
You can set different data types using template parameters.
Prototype
1 2 | template <typename T> __aicore__ inline void SetAtomicMin() |
Parameters
Parameter |
Description |
|---|---|
T |
Sets different data types. |
Returns
None
Constraints
You are advised to disable atomic minimization by using SetAtomicNone to avoid affecting subsequent instruction functions.
Example
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | // In this example, atomic minimization is performed when DataCopy is used to move data from VECOUT to external dstGlobal. #include "kernel_operator.h" static const int data_size = 256; template <typename T> class KernelDataCopyAtomicMin { public: __aicore__ inline KernelDataCopyAtomicMin() {} __aicore__ inline void Init(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm, uint32_t size) { this->size = size; src0Global.SetGlobalBuffer((__gm__ T *)src0_gm); src1Global.SetGlobalBuffer((__gm__ T *)src1_gm); dstGlobal.SetGlobalBuffer((__gm__ T *)dst_gm); pipe.InitBuffer(queueSrc0, 1, size * sizeof(T)); pipe.InitBuffer(queueSrc1, 1, size * sizeof(T)); pipe.InitBuffer(queueDst0, 1, size * sizeof(T)); pipe.InitBuffer(queueDst1, 1, size * sizeof(T)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<T> src0local = queueSrc0.AllocTensor<T>(); AscendC::LocalTensor<T> src1local = queueSrc1.AllocTensor<T>(); AscendC::DataCopy(src0local, src0Global, size); AscendC::DataCopy(src1local, src1Global, size); queueSrc0.EnQue(src0local); queueSrc1.EnQue(src1local); } __aicore__ inline void Compute() { AscendC::LocalTensor<T> src0local = queueSrc0.DeQue<T>(); AscendC::LocalTensor<T> src1local = queueSrc1.DeQue<T>(); AscendC::LocalTensor<T> dst0Local = queueDst0.AllocTensor<T>(); AscendC::LocalTensor<T> dst1Local = queueDst1.AllocTensor<T>(); AscendC::Abs(dst0Local, src0local, size); AscendC::Abs(dst1Local, src1local, size); queueDst0.EnQue(dst0Local); queueDst1.EnQue(dst1Local); queueSrc0.FreeTensor(src0local); queueSrc1.FreeTensor(src1local); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<T> dst0Local = queueDst0.DeQue<T>(); AscendC::LocalTensor<T> dst1Local = queueDst1.DeQue<T>(); AscendC::DataCopy(dstGlobal, dst1Local, size); AscendC::PipeBarrier<PIPE_MTE3>(); AscendC::SetAtomicMin<T>(); AscendC::DataCopy(dstGlobal, dst0Local, size); queueDst0.FreeTensor(dst0Local); queueDst1.FreeTensor(dst1Local); AscendC::SetAtomicNone(); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc0; AscendC::TQue<AscendC::TPosition::VECIN, 1> queueSrc1; AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst0; AscendC::TQue<AscendC::TPosition::VECOUT, 1> queueDst1; AscendC::GlobalTensor<T> src0Global, src1Global, dstGlobal; uint32_t size; }; extern "C" __global__ __aicore__ void data_copy_atomic_min_kernel(GM_ADDR src0_gm, GM_ADDR src1_gm, GM_ADDR dst_gm) { KernelDataCopyAtomicMin<half> op; op.Init(src0_gm, src1_gm, dst_gm, data_size); op.Process(); } The input data of each core is as follows: Src0: [1,1,1,1,1, ...,1] // 1 × 256 Src1: [2,2,2,2,2, ...,2] // 2 × 256 Final output data: [1,1,1,1,1,...,1] // 1 × 256 |