LoadDataUnzip

Supported Products

Product

Supported (√/x)

Atlas A3 training products/Atlas A3 inference products

x

Atlas A2 training products/Atlas A2 inference products

x

Atlas 200I/500 A2 inference products

x

Atlas inference product's AI Core

√

Atlas inference product's Vector Core

x

Atlas training products

x

Function Usage

Decompresses data stored in GM and loads the decompressed data to A1, B1, or B2. Before calling this API, call LoadUnzipIndex to load the compression index table.

Prototype

1
2
template <typename T>
__aicore__ inline void LoadDataUnzip(const LocalTensor<T>& dst, const GlobalTensor<T>& src)

Parameters

Table 1 Parameters

Parameter

Input/Output

Meaning

dst

Output

Destination operand. Type: LocalTensor. Supported TPosition: A1/B1/B2.

When TPosition is A1/B1, the start address of LocalTensor must be 32-byte aligned. When TPosition is B2, the start address of LocalTensor must be 512-byte aligned.

The supported data type is int8_t.

src

Input

Source operand of the GlobalTensor type. The data type must be the same as that of dst.

Restrictions

Returns

None

Example

The following calling example applies to the AI Core of the Atlas inference product.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include "kernel_operator.h"

/**
 * Example kernel for LoadDataUnzip.
 *
 * Pipeline: load the compression index table, decompress the weight data from
 * GM into a B1 buffer, copy that buffer to a VECOUT buffer, then copy the
 * VECOUT buffer back to GM.
 */
class KernelLoadUnzip {
public:
    __aicore__ inline KernelLoadUnzip() {}

    /**
     * Binds the GM addresses to the global tensors and reserves the on-chip buffers.
     *
     * @param weGm    GM address of the source (compressed) weight data.
     * @param indexGm GM address of the compression index table.
     * @param dstGm   GM address that receives the result.
     */
    __aicore__ inline void Init(__gm__ int8_t *weGm, __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
    {
        // The parameters already have type __gm__ int8_t *, so they are passed straight through.
        weGlobal.SetGlobalBuffer(weGm);
        indexGlobal.SetGlobalBuffer(indexGm);
        dstGlobal.SetGlobalBuffer(dstGm);

        // Each queue gets a single buffer of dstLen int8_t elements.
        const uint32_t bufferBytes = dstLen * sizeof(int8_t);
        pipe.InitBuffer(inQueueB1, 1, bufferBytes);
        pipe.InitBuffer(outQueueUB, 1, bufferBytes);
    }

    /** Runs the three stages once, in order: GM -> B1, B1 -> VECOUT, VECOUT -> GM. */
    __aicore__ inline void Process()
    {
        CopyIn();
        CopyToUB();
        CopyOut();
    }

private:
    /** Loads the index table, then decompresses the weight data from GM into a B1 tensor. */
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<int8_t> unzipped = inQueueB1.AllocTensor<int8_t>();
        // The compression index table must be loaded before LoadDataUnzip is called.
        AscendC::LoadUnzipIndex(indexGlobal, numOfIndexTabEntry);
        AscendC::LoadDataUnzip(unzipped, weGlobal);
        inQueueB1.EnQue(unzipped);
    }

    /** Copies dstLen elements from the B1 tensor to a VECOUT tensor. */
    __aicore__ inline void CopyToUB()
    {
        AscendC::LocalTensor<int8_t> b1Data = inQueueB1.DeQue<int8_t>();
        AscendC::LocalTensor<int8_t> ubData = outQueueUB.AllocTensor<int8_t>();
        AscendC::DataCopy(ubData, b1Data, dstLen);
        outQueueUB.EnQue<int8_t>(ubData);
        inQueueB1.FreeTensor(b1Data);
    }

    /** Copies dstLen elements from the VECOUT tensor to the destination in GM. */
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<int8_t> ubData = outQueueUB.DeQue<int8_t>();
        // Explicit MTE1 -> MTE3 set/wait before the copy to GM.
        // NOTE(review): presumably this makes the copy to GM wait for the preceding
        // B1 -> VECOUT copy to finish - confirm against the HardEvent documentation.
        event_t mte1ToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE1_MTE3));
        AscendC::SetFlag<AscendC::HardEvent::MTE1_MTE3>(mte1ToMte3);
        AscendC::WaitFlag<AscendC::HardEvent::MTE1_MTE3>(mte1ToMte3);
        AscendC::DataCopy(dstGlobal, ubData, dstLen);
        outQueueUB.FreeTensor(ubData);
    }

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1;      // holds the LoadDataUnzip output
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueUB; // staging buffer for the copy to GM
    AscendC::GlobalTensor<int8_t> weGlobal;                  // source weight data in GM
    AscendC::GlobalTensor<int8_t> dstGlobal;                 // result in GM
    AscendC::GlobalTensor<int8_t> indexGlobal;               // compression index table in GM
    // Not read by any code in this example.
    // NOTE(review): presumably the compressed size of the weight data - confirm.
    uint32_t srcLen = 896;
    // Element count used for both buffers and both DataCopy calls.
    uint32_t dstLen = 1024;
    // Entry count passed to LoadUnzipIndex.
    uint32_t numOfIndexTabEntry = 1;
};
/**
 * Kernel entry point: builds a KernelLoadUnzip, initializes it with the three
 * GM addresses, and runs it once.
 *
 * @param weightGm GM address of the source weight data.
 * @param indexGm  GM address of the compression index table.
 * @param dstGm    GM address that receives the result.
 */
extern "C" __global__ __aicore__ void cube_load_unzip_simple_kernel(__gm__ int8_t *weightGm,
    __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
{
    KernelLoadUnzip unzipKernel;
    unzipKernel.Init(weightGm, indexGm, dstGm);
    unzipKernel.Process();
}