LoadDataUnzip

功能说明

将GM上的数据解压并搬运到A1/B1/B2上。

函数原型

/**
 * @brief Decompresses data located in GM and moves it to A1/B1/B2.
 *
 * @tparam T Element type of both tensors; the supported data type is int8_t.
 * @param dstLocal [out] Destination operand, a LocalTensor whose TPosition is
 *                 A1, B1 or B2.
 * @param srcLocal [in]  Source operand, a GlobalTensor whose data type must
 *                 match that of dstLocal.
 */
template <typename T>
__aicore__ inline void LoadDataUnzip(const LocalTensor<T>& dstLocal, const GlobalTensor<T>& srcLocal);

参数说明

表1 参数说明
参数名称	输入/输出	含义
dstLocal	输出	目的操作数，类型为LocalTensor，支持的TPosition为A1/B1/B2。支持的数据类型为：int8_t。
srcLocal	输入	源操作数，类型为GlobalTensor。数据类型需要与dstLocal保持一致。

支持的型号

Atlas推理系列产品AI Core

注意事项

操作数地址偏移对齐要求请参见通用约束。

返回值

无

调用示例

该调用示例支持的运行平台为Atlas推理系列产品AI Core。

#include "kernel_operator.h"
namespace AscendC {
/**
 * Example kernel for LoadDataUnzip.
 *
 * Data flow shown by the code: GM -> (LoadDataUnzip) -> B1 queue tensor
 * -> (DataCopy) -> VECOUT queue tensor -> (DataCopy) -> GM.
 */
class KernelLoadUnzip {
public:
    __aicore__ inline KernelLoadUnzip() {}

    /**
     * Binds the three GM addresses to the global tensors and reserves one
     * buffer of dstLen int8_t elements for each of the two queues.
     *
     * @param weGm    GM address bound to weGlobal (source of LoadDataUnzip).
     * @param indexGm GM address bound to indexGlobal (passed to LoadUnzipIndex).
     * @param dstGm   GM address bound to dstGlobal (final destination).
     */
    __aicore__ inline void Init(__gm__ int8_t *weGm, __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
    {
        // The arguments already have type `__gm__ int8_t *`, so no cast is needed.
        weGlobal.SetGlobalBuffer(weGm);
        indexGlobal.SetGlobalBuffer(indexGm);
        dstGlobal.SetGlobalBuffer(dstGm);

        // Buffer allocation order is kept as B1 first, then the output queue.
        pipe.InitBuffer(inQueueB1, 1, dstLen * sizeof(int8_t));
        pipe.InitBuffer(outQueueUB, 1, dstLen * sizeof(int8_t));
    }

    /** Runs the three stages once, in order: CopyIn, CopyToUB, CopyOut. */
    __aicore__ inline void Process()
    {
        CopyIn();
        CopyToUB();
        CopyOut();
    }

private:
    /** Stage 1: decompress from GM into a B1 tensor and enqueue it. */
    __aicore__ inline void CopyIn()
    {
        LocalTensor<int8_t> unzippedB1 = inQueueB1.AllocTensor<int8_t>();

        // LoadUnzipIndex is issued before LoadDataUnzip.
        // NOTE(review): presumably it loads the index table that the
        // decompression relies on -- confirm against the LoadUnzipIndex reference.
        LoadUnzipIndex(indexGlobal, numOfIndexTabEntry);
        LoadDataUnzip(unzippedB1, weGlobal);

        inQueueB1.EnQue(unzippedB1);
    }

    /** Stage 2: copy dstLen elements from the B1 tensor into a VECOUT tensor. */
    __aicore__ inline void CopyToUB()
    {
        LocalTensor<int8_t> srcB1 = inQueueB1.DeQue<int8_t>();
        LocalTensor<int8_t> stagingUB = outQueueUB.AllocTensor<int8_t>();

        DataCopy(stagingUB, srcB1, dstLen);

        outQueueUB.EnQue<int8_t>(stagingUB);
        inQueueB1.FreeTensor(srcB1);
    }

    /** Stage 3: copy dstLen elements from the VECOUT tensor to dstGlobal. */
    __aicore__ inline void CopyOut()
    {
        LocalTensor<int8_t> stagingUB = outQueueUB.DeQue<int8_t>();

        // Set and immediately wait on an MTE1_MTE3 hardware event before the
        // copy to GM.
        // NOTE(review): presumably this orders the preceding B1 -> UB transfer
        // ahead of the UB -> GM copy -- confirm against the SetFlag/WaitFlag reference.
        event_t syncEventId = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE3));
        SetFlag<HardEvent::MTE1_MTE3>(syncEventId);
        WaitFlag<HardEvent::MTE1_MTE3>(syncEventId);

        DataCopy(dstGlobal, stagingUB, dstLen);
        outQueueUB.FreeTensor(stagingUB);
    }

private:
    TPipe pipe;
    TQue<QuePosition::B1, 1> inQueueB1;        // holds the decompressed tensor
    TQue<QuePosition::VECOUT, 1> outQueueUB;   // staging tensor copied out to GM
    GlobalTensor<int8_t> weGlobal;
    GlobalTensor<int8_t> dstGlobal;
    GlobalTensor<int8_t> indexGlobal;
    // Not referenced in this example.
    // NOTE(review): presumably the compressed source length -- confirm.
    uint32_t srcLen = 896;
    // Element count used for both queue buffers and both DataCopy calls.
    uint32_t dstLen = 1024;
    // Second argument passed to LoadUnzipIndex.
    uint32_t numOfIndexTabEntry = 1;
};
} // namespace AscendC

/**
 * Kernel entry point: builds a KernelLoadUnzip, initialises it with the three
 * GM addresses and runs it once.
 *
 * @param weightGm GM address forwarded to Init as the LoadDataUnzip source.
 * @param indexGm  GM address forwarded to Init for LoadUnzipIndex.
 * @param dstGm    GM address forwarded to Init as the output destination.
 */
extern "C" __global__ __aicore__ void cube_load_unzip_simple_kernel(__gm__ int8_t *weightGm,
    __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
{
    AscendC::KernelLoadUnzip kernel;
    kernel.Init(weightGm, indexGm, dstGm);
    kernel.Process();
}

父主题： 矩阵计算