LoadDataUnzip

Product Support

Product	Supported
Atlas A3 training products/Atlas A3 inference products	x
Atlas A2 training products/Atlas A2 inference products	x
Atlas 200I/500 A2 inference products	x
Atlas inference product's AI Core	√
Atlas inference product's Vector Core	x
Atlas training products	x

Function

Decompresses data stored in global memory (GM) and loads it into A1, B1, or B2. Before calling this API, call LoadUnzipIndex to load the compression index table.

Prototype

// Decompresses the data in src (global memory) and loads it into dst (A1/B1/B2).
// T is the element type shared by dst and src; the supported type is int8_t.
template <typename T>
__aicore__ inline void LoadDataUnzip(const LocalTensor<T>& dst, const GlobalTensor<T>& src)

Parameters

**Table 1** Parameters
Parameter	Input/Output	Meaning
dst	Output	Destination operand. Type: LocalTensor. Supported TPosition: A1/B1/B2. When TPosition is A1/B1, the start address of LocalTensor must be 32-byte aligned. When TPosition is B2, the start address of LocalTensor must be 512-byte aligned. The supported data type is int8_t.
src	Input	Source operand. Type: GlobalTensor. The data type must be the same as that of dst.

Restrictions

For details about the operand address alignment requirements, see General Address Alignment Restrictions.

Returns

None

Example

The following example applies to the AI Core of Atlas inference products.

#include "kernel_operator.h"

// Example kernel: decompresses int8_t data from global memory into a B1 buffer
// with LoadDataUnzip, then copies the result through a VECOUT buffer back to
// global memory.
class KernelLoadUnzip {
public:
    __aicore__ inline KernelLoadUnzip() {}

    // Binds the global-memory pointers to their GlobalTensor views and reserves
    // one buffer of dstLen int8_t elements for each queue.
    //   weGm    - compressed data, used as the source of LoadDataUnzip
    //   indexGm - compression index table, used as the source of LoadUnzipIndex
    //   dstGm   - destination of the final copy
    __aicore__ inline void Init(__gm__ int8_t *weGm, __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
    {
        // The arguments already have type __gm__ int8_t *, so they are passed as-is.
        weGlobal.SetGlobalBuffer(weGm);
        indexGlobal.SetGlobalBuffer(indexGm);
        dstGlobal.SetGlobalBuffer(dstGm);

        const auto bufferBytes = dstLen * sizeof(int8_t);
        pipe.InitBuffer(inQueueB1, 1, bufferBytes);
        pipe.InitBuffer(outQueueUB, 1, bufferBytes);
    }

    // Runs the three stages once, in order: GM -> B1 (unzip), B1 -> UB, UB -> GM.
    __aicore__ inline void Process()
    {
        CopyIn();
        CopyToUB();
        CopyOut();
    }

private:
    // Stage 1: load the index table, then decompress the data into a B1 tensor.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<int8_t> unzipped = inQueueB1.AllocTensor<int8_t>();
        // The index table has to be loaded before LoadDataUnzip is called.
        AscendC::LoadUnzipIndex(indexGlobal, numOfIndexTabEntry);
        AscendC::LoadDataUnzip(unzipped, weGlobal);
        inQueueB1.EnQue(unzipped);
    }

    // Stage 2: copy dstLen elements from the B1 tensor into a VECOUT tensor.
    __aicore__ inline void CopyToUB()
    {
        AscendC::LocalTensor<int8_t> unzipped = inQueueB1.DeQue<int8_t>();
        AscendC::LocalTensor<int8_t> staged = outQueueUB.AllocTensor<int8_t>();
        AscendC::DataCopy(staged, unzipped, dstLen);
        outQueueUB.EnQue(staged);
        inQueueB1.FreeTensor(unzipped);
    }

    // Stage 3: copy dstLen elements from the VECOUT tensor to global memory.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<int8_t> staged = outQueueUB.DeQue<int8_t>();

        // Explicit MTE1 -> MTE3 set/wait pair ahead of the write to global memory.
        // NOTE(review): presumably this orders the B1 -> UB copy before the
        // UB -> GM copy; confirm against the pipeline documentation.
        const auto fetchedId = GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE1_MTE3);
        event_t mte1ToMte3 = static_cast<event_t>(fetchedId);
        AscendC::SetFlag<AscendC::HardEvent::MTE1_MTE3>(mte1ToMte3);
        AscendC::WaitFlag<AscendC::HardEvent::MTE1_MTE3>(mte1ToMte3);

        AscendC::DataCopy(dstGlobal, staged, dstLen);
        outQueueUB.FreeTensor(staged);
    }

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1;        // receives the LoadDataUnzip output
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueUB;   // staging buffer for the copy to GM
    AscendC::GlobalTensor<int8_t> weGlobal;                    // source of LoadDataUnzip
    AscendC::GlobalTensor<int8_t> dstGlobal;                   // destination of the final copy
    AscendC::GlobalTensor<int8_t> indexGlobal;                 // source of LoadUnzipIndex
    // Not referenced by any method of this class.
    // NOTE(review): presumably the length of the compressed input; confirm.
    uint32_t srcLen = 896;
    // Element count used for both queue buffers and both DataCopy calls.
    uint32_t dstLen = 1024;
    // Entry count passed to LoadUnzipIndex.
    uint32_t numOfIndexTabEntry = 1;
};
// Kernel entry point: creates a KernelLoadUnzip, binds the three global-memory
// buffers to it, and runs its Process() once.
extern "C" __global__ __aicore__ void cube_load_unzip_simple_kernel(__gm__ int8_t *weightGm,
    __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
{
    KernelLoadUnzip kernel;
    kernel.Init(weightGm, indexGm, dstGm);
    kernel.Process();
}

Parent topic: Data Movement