InitBufPool
Function Usage
TPipe::InitBufPool obtains a TBufPool resource block from the TPipe, which can be further divided into smaller resource blocks by TBufPool::InitBufPool.
Prototype
- Non-sharing mode
template <class T> __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len)
- Sharing mode
template <class T, class U> __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len, U& shareBuf)
Parameters
- Non-sharing mode

| Parameter | Input/Output | Meaning |
|---|---|---|
| bufPool | Input | New resource pool. The type is TBufPool. |
| len | Input | Length of the new resource pool, in bytes. If the length is not 32-byte aligned, it is automatically padded up to the next 32-byte boundary. |
- Sharing mode

| Parameter | Input/Output | Meaning |
|---|---|---|
| bufPool | Input | New resource pool. The type is TBufPool. |
| len | Input | Length of the new resource pool, in bytes. If the length is not 32-byte aligned, it is automatically padded up to the next 32-byte boundary. |
| shareBuf | Input | Reused resource pool of the TBufPool type. The newly divided resource pool shares its start address and length with the reused resource pool. |
Availability
Precautions
- The physical memory of the new resource pool must be the same as that of the reused resource pool; the two pools share the start address and length.
- The specified length must be less than or equal to the length of the reused resource pool.
- For details about other restrictions, see TBufPool.
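The sharing-mode constraints above can be modeled with a small host-side sketch (plain C++, not the Ascend C API; `BufPoolModel` and `InitSharedPool` are hypothetical names introduced only to illustrate the rules):

```cpp
#include <cstdint>

// Hypothetical model of a resource pool: tracks only the start offset
// and length needed to illustrate the sharing-mode rules.
struct BufPoolModel {
    uint32_t start = 0;
    uint32_t len = 0;
};

// Sharing-mode division: the requested length must not exceed the reused
// pool's length, and the new pool shares its start address and length.
bool InitSharedPool(BufPoolModel& newPool, uint32_t len, const BufPoolModel& shareBuf) {
    if (len > shareBuf.len) {
        return false;  // violates the length precaution
    }
    newPool.start = shareBuf.start;  // same physical memory, same start address
    newPool.len = shareBuf.len;      // shares the length of the reused pool
    return true;
}
```

In the example below, `tbufPool2` reuses `tbufPool1` with equal lengths (65536 bytes each), so this check trivially passes.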
Returns
None
Example
When the data volume is large and memory is limited, the data cannot be moved in a single pass, so the movement must be split into multiple phases, each operating on part of the data. The TBufPool resource pool enables memory address reuse in this scenario. In this example, the resource pool tbufPool0 is allocated from the TPipe. After tbufPool0 allocates buffer space for the src0 data, tbufPool1 is allocated, and tbufPool2 is created by reusing tbufPool1. tbufPool1 and tbufPool2 serve the first and second rounds of computation, respectively, and share the same start address and length.
```cpp
class ResetApi {
public:
    __aicore__ inline ResetApi() {}
    __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
    {
        src0Global.SetGlobalBuffer((__gm__ half*)src0Gm);
        src1Global.SetGlobalBuffer((__gm__ half*)src1Gm);
        dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
        pipe.InitBufPool(tbufPool0, 131072);
        tbufPool0.InitBuffer(srcQue0, 1, 65536); // Total src0
        tbufPool0.InitBufPool(tbufPool1, 65536);
        tbufPool0.InitBufPool(tbufPool2, 65536, tbufPool1);
    }
    __aicore__ inline void Process()
    {
        tbufPool1.InitBuffer(srcQue1, 1, 32768);
        tbufPool1.InitBuffer(dstQue0, 1, 32768);
        CopyIn();
        Compute();
        CopyOut();
        tbufPool1.Reset();
        tbufPool2.InitBuffer(srcQue2, 1, 32768);
        tbufPool2.InitBuffer(dstQue1, 1, 32768);
        CopyIn1();
        Compute1();
        CopyOut1();
        tbufPool2.Reset();
        tbufPool0.Reset();
        pipe.Reset();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.AllocTensor<half>();
        AscendC::LocalTensor<half> src1Local = srcQue1.AllocTensor<half>();
        AscendC::DataCopy(src0Local, src0Global, 16384);
        AscendC::DataCopy(src1Local, src1Global, 16384);
        srcQue0.EnQue(src0Local);
        srcQue1.EnQue(src1Local);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.DeQue<half>();
        AscendC::LocalTensor<half> src1Local = srcQue1.DeQue<half>();
        AscendC::LocalTensor<half> dstLocal = dstQue0.AllocTensor<half>();
        AscendC::Add(dstLocal, src0Local, src1Local, 16384);
        dstQue0.EnQue<half>(dstLocal);
        srcQue0.FreeTensor(src0Local);
        srcQue1.FreeTensor(src1Local);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<half> dstLocal = dstQue0.DeQue<half>();
        AscendC::DataCopy(dstGlobal, dstLocal, 16384);
        dstQue0.FreeTensor(dstLocal);
    }
    __aicore__ inline void CopyIn1()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.AllocTensor<half>();
        AscendC::LocalTensor<half> src1Local = srcQue2.AllocTensor<half>();
        AscendC::DataCopy(src0Local, src0Global[16384], 16384);
        AscendC::DataCopy(src1Local, src1Global[16384], 16384);
        srcQue0.EnQue(src0Local);
        srcQue2.EnQue(src1Local);
    }
    __aicore__ inline void Compute1()
    {
        AscendC::LocalTensor<half> src0Local = srcQue0.DeQue<half>();
        AscendC::LocalTensor<half> src1Local = srcQue2.DeQue<half>();
        AscendC::LocalTensor<half> dstLocal = dstQue1.AllocTensor<half>();
        AscendC::Add(dstLocal, src0Local, src1Local, 16384);
        dstQue1.EnQue<half>(dstLocal);
        srcQue0.FreeTensor(src0Local);
        srcQue2.FreeTensor(src1Local);
    }
    __aicore__ inline void CopyOut1()
    {
        AscendC::LocalTensor<half> dstLocal = dstQue1.DeQue<half>();
        AscendC::DataCopy(dstGlobal[16384], dstLocal, 16384);
        dstQue1.FreeTensor(dstLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TBufPool<AscendC::TPosition::VECCALC> tbufPool0, tbufPool1, tbufPool2;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> srcQue0, srcQue1, srcQue2;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> dstQue0, dstQue1;
    AscendC::GlobalTensor<half> src0Global, src1Global, dstGlobal;
};

extern "C" __global__ __aicore__ void tbufpool_kernel(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
{
    ResetApi op;
    op.Init(src0Gm, src1Gm, dstGm);
    op.Process();
}
```