无DataCopyPad的处理方式
Atlas 推理系列产品未提供DataCopyPad的接口,需要对搬进和搬出非对齐场景进行处理,如下为不同场景及其处理方式:
- Global数据逐行搬运到Local中,Local中每行都存在冗余数据
- 冗余数据参与计算
- 使用Mask掩掉脏数据,一般用于轴归约计算等
- 搬入Local中后,使用Pad高阶API将冗余数据填充为确定的值
- Global搬运非对齐数据到Local, 整块搬入后进行Pad成0值
- Local非对齐拷贝出Global,拷贝长度 > 32B
- Local中存在冗余数据,如果有效数据长度可被32B整除,使用UnPad接口去除冗余数据后完整搬出
- 使用GatherMask处理
- Local非对齐拷贝出Global, 拷贝长度 < 32B
调用示例
- 如下代码展示数据大于32B的场景,采用图1冗余数据参与计算的搬入处理方式,以及图6使用GatherMask借位搬运的搬出数据方式。
// Example 1: each row's payload is > 32B (36B = 18 halfs). Copy-in lets the
// redundant tail data participate in compute (Fig. 1); copy-out uses a
// GatherMask "borrow" copy (Fig. 6) so the unaligned row end can be written
// without overrunning GM.
#include "kernel_operator.h"
using namespace AscendC;
constexpr int32_t blockLen = 36; // > 32B per row; all counts below are element counts
constexpr int32_t TOTAL_LENGTH = (8 * 16)* 18; // 8 cores, 16 tiles, each tile handles 18 halfs
constexpr int32_t USE_CORE_NUM = 8;
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 16; // split data into 16 tiles for each core
constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 18 halfs

class KernelDataCopyPad {
public:
    __aicore__ inline KernelDataCopyPad() {}

    // Bind this core's slice of the input/output GM buffers and size the queues.
    __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm)
    {
        // single input
        src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        // NOTE(review): queue depth is 2 here while BUFFER_NUM is 1 — confirm intended.
        pipe.InitBuffer(inQueueX1, 2, 32 * sizeof(half));
        pipe.InitBuffer(outQueue, 2, 32 * sizeof(half));
        pipe.InitBuffer(outQueue_tail, 2, 16 * sizeof(half));
        pipe.InitBuffer(tmp_Pattern, 16 * sizeof(uint16_t));
    }

    // Run TILE_NUM * BUFFER_NUM iterations of the CopyIn -> Compute -> CopyOut pipeline.
    __aicore__ inline void Process()
    {
        int32_t loopCount = TILE_NUM * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    // GM -> UB: copy row by row; each row holds 18 valid halfs but a full
    // 32-half aligned chunk is moved, so the local tensor carries redundant data.
    __aicore__ inline void CopyIn(int32_t progress){
        LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
        for (int i = 0; i < 1; i++){ // one row per tile in this example
            uint32_t srcGM_idx = progress * 18 + 18 * i;
            DataCopy(input_local[32 * i], src0_global[srcGM_idx], 32); // move 32 halfs to UB each time
        }
        inQueueX1.EnQue(input_local);
    }

    // Main compute, then build the tail tensor used for the unaligned copy-out.
    __aicore__ inline void Compute(int32_t progress){
        LocalTensor<half> output_local = outQueue.AllocTensor<half>();
        LocalTensor<half> input_local = inQueueX1.DeQue<half>();
        Abs(output_local, input_local, 18); // main computation on the 18 valid halfs
        event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        LocalTensor<uint16_t> Buf_Pattern = tmp_Pattern.Get<uint16_t>();
        Buf_Pattern.SetValue(0, 0b1111111111111100); // of the first 16 halfs, select the last 14
        Buf_Pattern.SetValue(1, 0b0000000000000011); // of the next 16 halfs, select the first 2
        // Ensure Buf_Pattern is fully written before it is consumed below.
        SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        uint32_t mask = 128;
        uint64_t rsvdCnt = 0;
        LocalTensor<half> tail_local = outQueue_tail.AllocTensor<half>();
        // tail_local gathers the last 16 of the 18 valid halfs (indices 2..17).
        GatherMask(tail_local, output_local, Buf_Pattern, true, mask, { 1, 1, 8, 8 }, rsvdCnt);
        outQueue.EnQue<half>(output_local);
        outQueue_tail.EnQue<half>(tail_local);
        inQueueX1.FreeTensor(input_local);
    }

    // UB -> GM: the leading 16 halfs go out aligned; the tail tensor then
    // rewrites the final 32B of the row (overlapping elements carry identical
    // values), covering the unaligned end without overrun.
    __aicore__ inline void CopyOut(int32_t progress){
        LocalTensor<half> output_local = outQueue.DeQue<half>();
        LocalTensor<half> tail_local = outQueue_tail.DeQue<half>();
        DataCopy(out_global[progress * 18], output_local, 16); // leading aligned part
        DataCopy(out_global[progress * 18 + (18-16)], tail_local, 16); // trailing 32B to GM
        outQueue.FreeTensor(output_local);
        outQueue_tail.FreeTensor(tail_local);
    }

private:
    GlobalTensor<half> src0_global;
    GlobalTensor<half> out_global;
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue, outQueue_tail;
    TBuf<QuePosition::VECCALC> tmp_Pattern;
};

// Kernel entry point: one KernelDataCopyPad instance per AI core.
extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
    KernelDataCopyPad op;
    op.Init(inputX1_gm, output_gm);
    op.Process();
}
- 如下代码展示,数据小于32B,参考图3 使用Pad高阶API填充0值的方法逐行将Local上脏数据清零,参考图7 使用atomic累加的方式处理拷贝长度小于32B的场景的搬出数据方式。
// Example 2: each row's payload is < 32B (22B = 11 halfs). Dirty tail data in
// srcLocal is zeroed with a masked Duplicate before compute (cf. Fig. 3), and
// the result is written out with atomic-add after zeroing dstGM (cf. Fig. 7).
#include "kernel_operator.h"
using namespace AscendC;
constexpr int32_t blockLen = 22; // < 32B per row
constexpr int32_t TOTAL_LENGTH = 4 * 15 * 11; // 4 cores, each handles 15 * 11 halfs
constexpr int32_t USE_CORE_NUM = 4; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 1;
constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 15 * 11 halfs

class KernelDataCopyPad {
public:
    __aicore__ inline KernelDataCopyPad() {}

    // Bind this core's slice of GM (15 rows of 11 halfs) and size the queues;
    // each local row is padded to 16 halfs (32B alignment).
    __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm){
        src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + 11*15 * GetBlockIdx(), 11*15);
        out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + 11*15 * GetBlockIdx(), 11*15);
        pipe.InitBuffer(inQueueX1, BUFFER_NUM, 16*15*sizeof(half));
        pipe.InitBuffer(outQueue, BUFFER_NUM, 16*15*sizeof(half));
        pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
    }

    __aicore__ inline void Process()
    {
        // loop count would need to be doubled when double buffering is used
        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
        // tiling strategy, pipeline parallel
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyIn(int32_t progress)//GM->UB
    {
        LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
        for (int i = 0; i < 15; i++){ // row by row: 11 valid halfs per row
            DataCopy(input_local[i*16], src0_global[i*11], 16); // move 16 halfs to UB each time
        }
        inQueueX1.EnQue(input_local);
    }

    // Zero dstGM, pad the dirty row tails in srcLocal, then run the main compute.
    __aicore__ inline void Compute(int32_t progress){
        LocalTensor<half> output_local = outQueue.AllocTensor<half>();
        LocalTensor<half> input_local = inQueueX1.DeQue<half>();
        LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
        event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        // NOTE(review): eventIDVToMTE3 is fetched but never set/waited in this
        // example — verify whether a V_MTE3 flag is needed between Duplicate
        // and the GM-zeroing DataCopy below.
        event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        // Zero all of dstGM by copying a local all-zero tensor over it, so the
        // later atomic-add accumulates onto clean memory.
        Duplicate<half> (zero_tensor, 0, 16); // build an all-zero tensor
        uint32_t copy_len = 11*15*sizeof(half) /32*32/sizeof(half); // 165 rounded down to a 32B multiple -> 160
        zero_Queue.EnQue<half>(zero_tensor);
        zero_tensor = zero_Queue.DeQue<half>();
        for(int i=0; i<copy_len/16; i++){
            DataCopy<half> (out_global[i*16], zero_tensor, 16);
        }
        DataCopy<half> (out_global[11*15-16], zero_tensor, 16); // clear the tail without clobbering memory past dstGM
        SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV); // GM must be zeroed before the atomic accumulation
        WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        uint64_t mask0 = ((uint64_t)1 <<16) - ((uint64_t)1 <<11); // deselect the first 11 elements: (2^16-1)-(2^11-1)
        uint64_t mask[2] = {mask0, 0};
        for (int i=0; i<15; i++){
            Duplicate<half> (input_local[i*16], 0, mask, 1, 1, 1); // zero the dirty tail of each srcLocal row before compute
        }
        Abs(output_local, input_local, 16*15); // 15 rows
        outQueue.EnQue<half>(output_local);
        inQueueX1.FreeTensor(input_local);
        zero_Queue.FreeTensor(zero_tensor);
    }

    // UB -> GM: rows overlap in GM (write 16 at stride 11), so accumulate
    // atomically — the padded zeros contribute nothing to neighbouring rows.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        LocalTensor<half> output_local = outQueue.DeQue<half>();
        SetAtomicAdd<half>();
        for (int i=0; i<15; i++){
            DataCopy<half> (out_global[i * 11], output_local[i*16], 16);
        }
        SetAtomicNone();
        outQueue.FreeTensor(output_local);
    }

private:
    GlobalTensor<half> src0_global;
    GlobalTensor<half> out_global;
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
};

// Kernel entry point: one KernelDataCopyPad instance per AI core.
extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
    KernelDataCopyPad op;
    op.Init(inputX1_gm, output_gm);
    op.Process();
}
- 如下代码展示,数据小于32B,参考图2,使用mask掩掉脏数据,参考图7的搬出数据方式。
// Example 3: each row's payload is < 32B (8B = 4 halfs). A mask restricts the
// reduction to valid elements only (cf. Fig. 2), and the per-row results are
// written out with atomic-add after zeroing dstGM (cf. Fig. 7).
#include "kernel_operator.h"
using namespace AscendC;
constexpr int32_t blockLen = 8;
constexpr int32_t TOTAL_LENGTH = 4 * 4 * 4; // 4 cores, each handles 4*4 halfs
constexpr int32_t USE_CORE_NUM = 4; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 1;
constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 16 halfs

class KernelDataCopyPad {
public:
    __aicore__ inline KernelDataCopyPad() {}

    // Bind this core's GM slice and size the queues; rows are padded to
    // 16 halfs locally for 32B alignment.
    __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm){
        src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        pipe.InitBuffer(inQueueX1, BUFFER_NUM, 32*4);
        pipe.InitBuffer(outQueue, BUFFER_NUM, 32*4);
        pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
        pipe.InitBuffer(workQueue, 1, 16 * sizeof(half)); // scratch for ReduceMin
    }

    __aicore__ inline void Process()
    {
        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    // GM -> UB: copy row by row; only the first 4 halfs of each row are valid.
    __aicore__ inline void CopyIn(int32_t progress){
        LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
        LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
        for (int i = 0; i < 4; i++){
            DataCopy(input_local[i*16], src0_global[i*4], 16); // move 16 halfs to UB each time
        }
        inQueueX1.EnQue(input_local);
        zero_Queue.EnQue(zero_tensor); // contents are written later in Compute
    }

    // Zero dstGM and dstLocal, then reduce each row with a mask so the dirty
    // tail elements never participate.
    __aicore__ inline void Compute(int32_t progress){
        LocalTensor<half> output_local = outQueue.AllocTensor<half>();
        LocalTensor<half> work_local = workQueue.AllocTensor<half>();
        LocalTensor<half> input_local = inQueueX1.DeQue<half>();
        LocalTensor<half> zero_tensor = zero_Queue.DeQue<half>();
        Duplicate<half> (zero_tensor, 0, 16); // build an all-zero tensor
        // Vector compute must only run after dstGM has been zeroed.
        // NOTE(review): no V_MTE3 flag between Duplicate and the DataCopy
        // below — verify the ordering is guaranteed here.
        event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        DataCopy<half> (out_global, zero_tensor, 16); // zero all of dstGM
        SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        outQueue.EnQue<half>(output_local);
        output_local = outQueue.DeQue<half>();
        Duplicate<half> (output_local, 0, 16*4); // zero dstLocal
        uint64_t Mask0 = ((uint64_t)1 <<4) - 1; // mask: only the first 4 elements join the ReduceMin
        uint64_t Mask[2] = {Mask0, 0};
        for (int i=0; i<4; i++){
            ReduceMin<half>(output_local[i*16], input_local[i*16], work_local, Mask, 1, 8, false);
        }
        outQueue.EnQue<half>(output_local);
        inQueueX1.FreeTensor(input_local);
        workQueue.FreeTensor(work_local);
        zero_Queue.FreeTensor(zero_tensor);
    }

    // UB -> GM: rows overlap in GM (write 16 at stride 4), so accumulate
    // atomically onto the pre-zeroed destination.
    __aicore__ inline void CopyOut(int32_t progress){
        LocalTensor<half> output_local = outQueue.DeQue<half>();
        SetAtomicAdd<half>();
        for (int i=0; i<4; i++){ // row by row, 4 rows in total
            DataCopy<half> (out_global[i*4], output_local[i*16], 16);
        }
        SetAtomicNone();
        outQueue.FreeTensor(output_local);
    }

private:
    GlobalTensor<half> src0_global;
    GlobalTensor<half> out_global;
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> workQueue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
};

// Kernel entry point: one KernelDataCopyPad instance per AI core.
extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
    KernelDataCopyPad op;
    op.Init(inputX1_gm, output_gm);
    op.Process();
}
- 如下代码展示,数据小于32B,图1和图5的搬出数据方式。
// Example 4: each row's payload is < 32B (28B = 14 halfs). The UnPad
// high-level API strips the 2 redundant halfs from each 16-half local row so
// the result becomes contiguous and can be copied out fully aligned.
#include "kernel_operator.h"
#include "datacopypad_tiling.h"
using namespace AscendC;
constexpr int32_t blockLen = 28; // 14 halfs per block
constexpr int32_t TOTAL_LENGTH = 8 * 8 * 2 * (16 * 14); // 8 cores, 16 tiles, each tile handles 16 * 14 halfs
constexpr int32_t USE_CORE_NUM = 8; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core
constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile: 16 groups of 14 halfs

class KernelDataCopyPad {
public:
    __aicore__ inline KernelDataCopyPad() {}

    // Bind this core's GM slice and size the queues (16 rows padded to 16 halfs).
    // NOTE(review): unPadParamsIn is accepted but never used — the member
    // unPadParams (configured in Compute) is what UnPad consumes; confirm.
    __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm, UnPadParams& unPadParamsIn, DataCopyPadCustomTilingData tiling){
        // single input
        src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        pipe.InitBuffer(inQueueX1, BUFFER_NUM, 16*16*sizeof(half));
        pipe.InitBuffer(outQueue, BUFFER_NUM, 16*16*sizeof(half));
    }

    __aicore__ inline void Process(DataCopyPadCustomTilingData tiling)
    {
        int32_t loopCount = TILE_NUM * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i, tiling);
            CopyOut(i);
        }
    }

private:
    // GM -> UB: copy row by row; each row has 14 valid halfs but a full
    // aligned 16-half chunk is moved.
    __aicore__ inline void CopyIn(int32_t progress){
        LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
        for (int i=0; i<16; i++){
            uint32_t srcGM_idx = progress * 14*16 + 14*i;
            DataCopy(input_local[16*i], src0_global[srcGM_idx], 16);
        }
        inQueueX1.EnQue(input_local);
    }

    // Main compute, then UnPad to drop the redundant tail of every row so the
    // output tensor is densely packed (16 rows * 14 halfs).
    __aicore__ inline void Compute(int32_t progress, DataCopyPadCustomTilingData tiling){
        LocalTensor<half> input_local = inQueueX1.DeQue<half>();
        LocalTensor<half> output_local = outQueue.AllocTensor<half>();
        Abs(input_local, input_local, 16*16); // main computation
        unPadParams.rightPad = 2; // strip 2 dirty halfs from the end of each row
        UnPad<half> (output_local, input_local, unPadParams, tiling.unPadTiling);
        outQueue.EnQue<half>(output_local);
        inQueueX1.FreeTensor(input_local);
    }

    // UB -> GM: 14*16 halfs = 448B, a 32B multiple, so one aligned copy suffices.
    __aicore__ inline void CopyOut(int32_t progress){
        LocalTensor<half> output_local = outQueue.DeQue<half>();
        DataCopy(out_global[progress * 14*16], output_local, 14*16);
        outQueue.FreeTensor(output_local);
    }

private:
    GlobalTensor<half> src0_global;
    GlobalTensor<half> out_global;
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
    UnPadParams unPadParams;
};

// Kernel entry point: one KernelDataCopyPad instance per AI core.
extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm, DataCopyPadCustomTilingData tiling)
{
    KernelDataCopyPad op;
    UnPadParams unPadParams{0, 2};
    op.Init(inputX1_gm, output_gm, unPadParams,tiling);
    op.Process(tiling);
}
- 如下代码展示,数据小于32B,图4和图7的搬出数据方式。
// Example 5: each row's payload is < 32B (12B = 6 halfs). The Pad high-level
// API replaces the dirty tail of every local row with zeros (cf. Fig. 4), and
// the result is written out with atomic-add after zeroing dstGM (cf. Fig. 7).
#include "kernel_operator.h"
#include "datacopypad_tiling.h"
using namespace AscendC;
constexpr int32_t blockLen = 12; // 6 halfs per block
constexpr int32_t TOTAL_LENGTH = 16 * 8 * (16 * 6); // 8 cores, each tile handles 16 * 6 halfs
constexpr int32_t USE_CORE_NUM = 8; // num of cores used
constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed by each core
constexpr int32_t TILE_NUM = 16; // split data into 16 tiles for each core
constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue
constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile: 16 groups of 6 halfs

class KernelDataCopyPad {
public:
    __aicore__ inline KernelDataCopyPad() {}

    // Bind this core's GM slice and size the queues.
    // NOTE(review): unPadParamsIn is accepted but never used — the member
    // padParams (configured in Compute) is what Pad consumes; confirm.
    __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm, PadParams& unPadParamsIn, Copyin_TilingData tiling_pad){
        // single input
        src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
        pipe.InitBuffer(inQueueX1, 1, 16*16*sizeof(half));
        pipe.InitBuffer(outQueue, 1, 16*16*sizeof(half));
        pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
    }

    __aicore__ inline void Process(Copyin_TilingData tiling_pad)
    {
        int32_t loopCount = TILE_NUM * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i, tiling_pad);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyIn(int32_t progress)//GM->UB
    {
        LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
        for (int i=0; i<16; i++){ // row by row: 6 valid halfs per row
            uint32_t srcGM_idx = progress * 6*16 + 6*i;
            DataCopy(input_local[16*i], src0_global[srcGM_idx], 16);
        }
        inQueueX1.EnQue(input_local);
    }

    // Pad the dirty tail of every local row with zeros via the Pad API.
    __aicore__ inline void Compute(int32_t progress, Copyin_TilingData tiling_pad){
        LocalTensor<half> input_local = inQueueX1.DeQue<half>();
        LocalTensor<half> output_local = outQueue.AllocTensor<half>();
        padParams.leftPad = 0;
        padParams.rightPad = 10; // replace the 10 dirty halfs at each row's end with 0
        padParams.padValue = 0;
        Pad<half> (output_local, input_local, padParams, tiling_pad.padtiling);
        LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
        outQueue.EnQue<half>(output_local);
        zero_Queue.EnQue<half>(zero_tensor); // contents are written later in CopyOut
        inQueueX1.FreeTensor(input_local);
    }

    // Zero this tile's dstGM region, run the main compute, then atomic-add the
    // overlapping rows (write 16 at stride 6) onto the zeroed destination.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        LocalTensor<half> zero_tensor = zero_Queue.DeQue<half>();
        // zero_tensor must be fully zeroed before it is copied to GM
        event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        Duplicate<half> (zero_tensor, 0, 16); // build an all-zero tensor
        SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
        // vector compute may only start after dstGM has been zeroed
        event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        for(int i=0; i<6; i++){
            DataCopy<half> (out_global[progress * 6*16 + i*16], zero_tensor, 16);
        }
        SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
        LocalTensor<half> output_local = outQueue.DeQue<half>();
        Abs(output_local, output_local, 16*16); // main computation
        outQueue.EnQue<half>(output_local);
        output_local = outQueue.DeQue<half>();
        SetAtomicAdd<half>();
        for (int i=0; i<16; i++){
            uint32_t dstGM_idx = progress * 6*16 + i*6;
            DataCopy<half> (out_global[dstGM_idx], output_local[i*16], 16);
        }
        SetAtomicNone();
        outQueue.FreeTensor(output_local);
        zero_Queue.FreeTensor(zero_tensor);
    }

private:
    GlobalTensor<half> src0_global;
    GlobalTensor<half> out_global;
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
    PadParams padParams;
    TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
};

// Kernel entry point: one KernelDataCopyPad instance per AI core.
extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm, Copyin_TilingData tiling_pad)
{
    KernelDataCopyPad op;
    PadParams padParams{0, 10, 0};
    op.Init(inputX1_gm, output_gm, padParams,tiling_pad);
    op.Process(tiling_pad);
}
父主题: 非对齐场景及处理方式