InitBufPool

Applicability

Product	Supported
Atlas A3 training products/Atlas A3 inference products	√
Atlas A2 training products/Atlas A2 inference products	√
Atlas 200I/500 A2 inference products	x
Atlas inference product's AI Core	√
Atlas inference product's Vector Core	x
Atlas training products	√

Function

TPipe::InitBufPool obtains a TbufPool resource block, which can be further divided into small resource blocks by TBufPool::InitBufPool.

Prototype

Non-sharing mode

template <class T>
__aicore__ inline bool InitBufPool(T& bufPool, uint32_t len)

Sharing mode

template <class T, class U>
__aicore__ inline bool InitBufPool(T& bufPool, uint32_t len, U& shareBuf)

Parameters

**Table 1** Template parameters
Parameter	Description
T	Parameter of the bufPool type.
U	Parameter of the shareBuf type.

**Table 2** Parameters in InitBufPool(T& bufPool, uint32_t len) prototype definition
Parameter	Input/Output	Description
bufPool	Input	New resource pool. The type is TBufPool.
len	Input	Length of the new resource pool, in bytes. If the length is not 32-byte aligned, it is automatically padded to 32-byte aligned.

**Table 3** Parameters in InitBufPool(T& bufPool, uint32_t len, U& shareBuf) prototype definition
Parameter	Input/Output	Description
bufPool	Input	New resource pool. The type is TBufPool.
len	Input	Length of the new resource pool, in bytes. If the length is not 32-byte aligned, it is automatically padded to 32-byte aligned.
shareBuf	Input	Reused resource pool of the TBufPool type. The new resource pool shares the start address and length with the reused resource pool.

Restrictions

The physical memory of the new resource pool must be the same as those of the reused resource pool. The two resource pools share the start address and length.
The entered length must be less than or equal to the length of the reused resource pool.
For details about other restrictions, see TBufPool.

Returns

None

Example

When the data volume is large and the memory is limited, data movement cannot be completed at a time. In this case, the data movement needs to be split into multiple phases. Each phase uses a part of the data. The TBufPool resource pool can be used for memory address reuse. In this example, the resource pool TBufPool0 is allocated from the TPipe. After TBufPool0 allocates space to src0Gm, TBufPool1 is allocated. TBufPool1 and TBufPool2 are reused and used for the first and second rounds of computation, respectively. In this case, TBufPool1 and TBufPool2 share the start address and length.

class ResetApi {
public:
    __aicore__ inline ResetApi() {}
    __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
    {
        src0Global.SetGlobalBuffer((__gm__ half*)src0Gm);
        src1Global.SetGlobalBuffer((__gm__ half*)src1Gm);
        dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
        pipe.InitBufPool(tbufPool0, 131072);
        tbufPool0.InitBuffer(srcQue0, 1, 65536); // Total src0
        tbufPool0.InitBufPool(tbufPool1, 65536);
        tbufPool0.InitBufPool(tbufPool2, 65536, tbufPool1);
    }
    __aicore__ inline void Process()
    {
        tbufPool1.InitBuffer(srcQue1, 1, 32768);
        tbufPool1.InitBuffer(dstQue0, 1, 32768);
        CopyIn();
        Compute();
        CopyOut();
        tbufPool1.Reset();
        tbufPool2.InitBuffer(srcQue2, 1, 32768);
        tbufPool2.InitBuffer(dstQue1, 1, 32768);
        CopyIn1();
        Compute1();
        CopyOut1();
        tbufPool2.Reset();
        tbufPool0.Reset();
        pipe.Reset();
    }
private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.AllocTensor<half>();
        AscendC::LocalTensor<half> src1Local = srcQue1.AllocTensor<half>();
        AscendC::DataCopy(src0Local, src0Global, 16384);
        AscendC::DataCopy(src1Local, src1Global, 16384);
        srcQue0.EnQue(src0Local);
        srcQue1.EnQue(src1Local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.DeQue<half>();
        AscendC::LocalTensor<half> src1Local = srcQue1.DeQue<half>();
        AscendC::LocalTensor<half> dstLocal = dstQue0.AllocTensor<half>();
        AscendC::Add(dstLocal, src0Local, src1Local, 16384);
        dstQue0.EnQue<half>(dstLocal);
        srcQue0.FreeTensor(src0Local);
        srcQue1.FreeTensor(src1Local);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<half> dstLocal = dstQue0.DeQue<half>();
        AscendC::DataCopy(dstGlobal, dstLocal, 16384);
        dstQue0.FreeTensor(dstLocal);
    }
    __aicore__ inline void CopyIn1()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.AllocTensor<half>();
        AscendC::LocalTensor<half> src1Local = srcQue2.AllocTensor<half>();
        AscendC::DataCopy(src0Local, src0Global[16384], 16384);
        AscendC::DataCopy(src1Local, src1Global[16384], 16384);
        srcQue0.EnQue(src0Local);
        srcQue2.EnQue(src1Local);
    }
    __aicore__ inline void Compute1()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.DeQue<half>();
        AscendC::LocalTensor<half> src1Local = srcQue2.DeQue<half>();
        AscendC::LocalTensor<half> dstLocal = dstQue1.AllocTensor<half>();
        AscendC::Add(dstLocal, src0Local, src1Local, 16384);
        dstQue1.EnQue<half>(dstLocal);
        srcQue0.FreeTensor(src0Local);
        srcQue2.FreeTensor(src1Local);
    }
    __aicore__ inline void CopyOut1()
    {
        AscendC::LocalTensor<half> dstLocal = dstQue1.DeQue<half>();
        AscendC::DataCopy(dstGlobal[16384], dstLocal, 16384);
        dstQue1.FreeTensor(dstLocal);
    }
private:
    AscendC::TPipe pipe;
    AscendC::TBufPool<AscendC::TPosition::VECCALC> tbufPool0, tbufPool1, tbufPool2;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> srcQue0, srcQue1, srcQue2;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> dstQue0, dstQue1;
    AscendC::GlobalTensor<half> src0Global, src1Global, dstGlobal;
};
extern "C" __global__ __aicore__ void tbufpool_kernel(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
{
    ResetApi op;
    op.Init(src0Gm, src1Gm, dstGm);
    op.Process();
}

Parent topic: TBufPool