昇腾社区首页
中文
注册

无DataCopyPad的处理方式

Atlas 推理系列产品未提供DataCopyPad的接口,需要对搬进和搬出非对齐场景进行处理,如下为不同场景及其处理方式:

  • Global数据逐行搬运到Local中,Local中每行都存在冗余数据
    • 冗余数据参与计算
      图1 冗余数据参与计算
    • 使用Mask掩掉脏数据,一般用于轴归约计算等
      图2 冗余数据不参与计算
    • 搬入Local中,使用Pad高阶API填充确定的值
      图3 使用Pad高阶API填充0值
  • Global搬运非对齐数据到Local, 整块搬入后进行Pad成0值
    图4 使用Pad接口补齐
  • Local非对齐拷贝出Global,拷贝长度 > 32B
    • Local中存在冗余数据,如果有效数据为32B整除,使用UnPad接口去除冗余数据完整搬出
      图5 使用UnPad去除冗余值
    • 使用GatherMask处理
      图6 使用GatherMask借位搬运
  • Local非对齐拷贝出Global, 拷贝长度 < 32B
    图7 使用atomic累加的方式处理拷贝长度小于32B的场景

调用示例

  • 如下代码展示,数据大于32B,参考图1 冗余数据参与计算的搬入处理方式,以及图6 使用GatherMask借位搬运的搬出数据方式。
    #include "kernel_operator.h"
    using namespace AscendC;
    constexpr int32_t blockLen = 36; // valid bytes per row (> 32B)
    // All values below are element counts.
    constexpr int32_t TOTAL_LENGTH = (8 * 16)* 18; // 8 cores, 16 tiles, each tile handles 18 half
    constexpr int32_t USE_CORE_NUM = 8; // num of cores used
    constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core 
    constexpr int32_t TILE_NUM = 16;  // split data into 16 tiles for each core
    constexpr int32_t BUFFER_NUM = 1;  // tensor num for each queue
    constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 18 half per tile
    // Example 1: each row holds 18 valid half values (36B > 32B). Rows are
    // copied in as 32-element (64B) chunks so the redundant tail simply joins
    // the Abs compute; the non-32B-aligned tail is written back by gathering
    // elements 3..18 with GatherMask and storing them shifted by 2 elements,
    // so the overlapping region is rewritten with identical values.
    class KernelDataCopyPad {
    public:
        __aicore__ inline KernelDataCopyPad() {}
        // Bind this core's GM input/output slices and allocate the UB queues.
        // NOTE(review): the queues are declared with depth BUFFER_NUM (1) but
        // initialized with 2 buffers here - confirm this is intentional.
        __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm) {
            src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);  // single input
            out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
            pipe.InitBuffer(inQueueX1, 2, 32 * sizeof(half));
            pipe.InitBuffer(outQueue, 2, 32 * sizeof(half));
            pipe.InitBuffer(outQueue_tail, 2, 16 * sizeof(half));
            pipe.InitBuffer(tmp_Pattern, 16 * sizeof(uint16_t));
        }
        // Standard CopyIn -> Compute -> CopyOut pipeline over all tiles.
        __aicore__ inline void Process()
        {
            int32_t loopCount = TILE_NUM * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress){
            LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
                // Single-iteration loop kept in row-by-row form as a template.
                for (int i = 0; i < 1; i++){
                    // copy in row by row; valid data per row is 18 half
                    uint32_t srcGM_idx = progress * 18 + 18 * i;
                    DataCopy(input_local[32 * i], src0_global[srcGM_idx], 32);  // copy 32 half to UB each time (dirty tail included)
                }
            inQueueX1.EnQue(input_local);
        }
        // Run the main compute over the whole chunk (redundant data included),
        // then gather the misaligned tail for the shifted copy-out.
        __aicore__ inline void Compute(int32_t progress){
            LocalTensor<half> output_local = outQueue.AllocTensor<half>();
            LocalTensor<half> input_local = inQueueX1.DeQue<half>();
            Abs(output_local, input_local, 18); // main compute
    
            event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
            LocalTensor<uint16_t> Buf_Pattern = tmp_Pattern.Get<uint16_t>();
            Buf_Pattern.SetValue(0, 0b1111111111111100); // of the first 16 half, select the last 14
            Buf_Pattern.SetValue(1, 0b0000000000000011); // of the next 16 half, select the first 2
            SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV); // ensure Buf_Pattern is set before GatherMask consumes it
            WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV); // NOTE(review): original comment mentioned atomic add, which this example does not use - confirm the intended event pair
    
            uint32_t mask = 128;
            uint64_t rsvdCnt = 0;
            LocalTensor<half> tail_local = outQueue_tail.AllocTensor<half>();
            // tail_local stores elements 3 through 18 (1-based) of the 18 half values
            GatherMask(tail_local, output_local, Buf_Pattern, true, mask, { 1, 1, 8, 8 }, rsvdCnt);
            outQueue.EnQue<half>(output_local);
            outQueue_tail.EnQue<half>(tail_local);
            inQueueX1.FreeTensor(input_local);
        }
        // Two overlapping aligned stores cover the 18-element row: [0,16) and
        // [2,18). The overlap [2,16) is written twice with identical values.
        __aicore__ inline void CopyOut(int32_t progress){
            LocalTensor<half> output_local = outQueue.DeQue<half>();
            LocalTensor<half> tail_local = outQueue_tail.DeQue<half>();
            DataCopy(out_global[progress * 18], output_local, 16); // copy out the leading 32B-aligned part
            DataCopy(out_global[progress * 18 + (18-16)], tail_local, 16); // copy the trailing 32B to GM
            outQueue.FreeTensor(output_local);
            outQueue_tail.FreeTensor(tail_local);
        }
    private:
        GlobalTensor<half> src0_global;
        GlobalTensor<half> out_global;
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue, outQueue_tail;
        TBuf<QuePosition::VECCALC> tmp_Pattern;
    };
    // Kernel entry point: bind the GM addresses and run the full pipeline.
    extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
        KernelDataCopyPad kernel;
        kernel.Init(inputX1_gm, output_gm);
        kernel.Process();
    }
  • 如下代码展示,数据小于32B,参考图3 使用Pad高阶API填充0值的方法逐行将Local上脏数据清零,参考图7 使用atomic累加的方式处理拷贝长度小于32B的场景的搬出数据方式。
    #include "kernel_operator.h"
    using namespace AscendC;
    constexpr int32_t blockLen = 22; // valid bytes per row (< 32B)
    constexpr int32_t TOTAL_LENGTH = 4 * 15 * 11;  // 4 cores, each core handles 15 * 11 half
    constexpr int32_t USE_CORE_NUM = 4; // num of core used
    constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core
    constexpr int32_t TILE_NUM = 1;
    constexpr int32_t BUFFER_NUM = 1;  // tensor num for each queue
    constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 15 * 11 half per tile
    // Example 2: each row holds 11 valid half values (22B < 32B). Rows are
    // copied in as 16-element chunks; dirty tail elements are zeroed with a
    // masked Duplicate before the compute, and results are written back via
    // atomic add onto a destination GM region that was cleared to zero first.
    class KernelDataCopyPad {
    public:
        __aicore__ inline KernelDataCopyPad() {}
        // Bind this core's GM slices (15 rows x 11 half) and allocate UB
        // buffers (15 rows x 16 half, each row 32B-aligned in UB).
        __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm){
            src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + 11*15 * GetBlockIdx(), 11*15);
            out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + 11*15 * GetBlockIdx(), 11*15);
            pipe.InitBuffer(inQueueX1, BUFFER_NUM, 16*15*sizeof(half));
            pipe.InitBuffer(outQueue, BUFFER_NUM, 16*15*sizeof(half));
            pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
        }
        __aicore__ inline void Process()
        {
            // loop count need to be doubled, due to double buffer
            constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
            // tiling strategy, pipeline parallel
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress)//GM->UB
        {
            LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
                for (int i = 0; i < 15; i++){
                    // copy in row by row; valid data per row is 11 half
                    DataCopy(input_local[i*16], src0_global[i*11], 16);  // copy 16 half (32B) to UB each time
                }
            inQueueX1.EnQue(input_local);
        }
        // Zero the destination GM, pad the dirty UB tail to 0, then run Abs.
        __aicore__ inline void Compute(int32_t progress){
            LocalTensor<half> output_local = outQueue.AllocTensor<half>();
            LocalTensor<half> input_local = inQueueX1.DeQue<half>();
            LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
            event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
            // Clear all of dstGM by overwriting it with a local all-zero
            // tensor, so the atomic add in CopyOut accumulates onto zeros.
            Duplicate<half> (zero_tensor, 0, 16); // build the all-zero tensor
            uint32_t copy_len = 11*15*sizeof(half) /32*32/sizeof(half); // 165 rounded down to a 32B multiple -> 160
            zero_Queue.EnQue<half>(zero_tensor);
            zero_tensor = zero_Queue.DeQue<half>();
            for(int i=0; i<copy_len/16; i++){
                DataCopy<half> (out_global[i*16], zero_tensor, 16);
            }
            DataCopy<half> (out_global[11*15-16], zero_tensor, 16); // zero the last 5 elements without writing past the buffer end
            SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV); // make sure GM is zeroed before the atomic accumulation
            WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
            uint64_t mask0 = ((uint64_t)1 <<16) - ((uint64_t)1 <<11); // select only elements 11..15 (the dirty tail); binary value is (2^16-1)-(2^11-1)
            uint64_t mask[2] = {mask0, 0};
            for (int i=0; i<15; i++){
                Duplicate<half> (input_local[i*16], 0, mask, 1, 1, 1); // pad away dirty data on srcLocal row by row before computing
            }
            Abs(output_local, input_local, 16*15); // 15 rows
            outQueue.EnQue<half>(output_local);
            inQueueX1.FreeTensor(input_local);
            zero_Queue.FreeTensor(zero_tensor);
        }
        // Write back row by row with atomic add. Each 16-element store
        // overlaps the next rows, but the overlapping lanes hold padded zeros
        // (Abs of 0), so the accumulated results stay correct.
        // NOTE(review): the last row writes 5 half past this core's
        // 165-element slice (zeros, atomically added) - confirm the GM output
        // allocation has padding after the final core's slice.
        __aicore__ inline void CopyOut(int32_t progress)
        {
            LocalTensor<half> output_local = outQueue.DeQue<half>();
            SetAtomicAdd<half>();
            for (int i=0; i<15; i++){
                DataCopy<half> (out_global[i * 11], output_local[i*16], 16);
            }
            SetAtomicNone();
            outQueue.FreeTensor(output_local);
        }
    private:
        GlobalTensor<half> src0_global;
        GlobalTensor<half> out_global;
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
        TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
    };
    // Kernel entry point: bind the GM addresses and run the full pipeline.
    extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
        KernelDataCopyPad kernel;
        kernel.Init(inputX1_gm, output_gm);
        kernel.Process();
    }
  • 如下代码展示,数据小于32B,参考图2,使用mask掩掉脏数据,参考图7的搬出数据方式。
    #include "kernel_operator.h"
    using namespace AscendC;
    constexpr int32_t blockLen = 8; // valid bytes per row (< 32B)
    constexpr int32_t TOTAL_LENGTH = 4 * 4 * 4;  // 4 cores, each core handles 4*4 half
    constexpr int32_t USE_CORE_NUM = 4;                                   // num of core used
    constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed of each core
    constexpr int32_t TILE_NUM = 1;
    constexpr int32_t BUFFER_NUM = 1;                                     // tensor num for each queue
    constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // 16 half per tile
    // Example 3: each row holds 4 valid half values (8B < 32B). Instead of
    // clearing dirty data, a bit mask limits the ReduceMin compute to the 4
    // valid elements per row (figure 2); results are written back with atomic
    // add onto a zeroed dstGM (figure 7).
    class KernelDataCopyPad {
    public:
        __aicore__ inline KernelDataCopyPad() {}
        // Bind this core's GM slices (16 half each) and allocate UB queues.
        __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm){
            src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
            out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
            pipe.InitBuffer(inQueueX1, BUFFER_NUM, 32*4);
            pipe.InitBuffer(outQueue, BUFFER_NUM, 32*4);
            pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
            pipe.InitBuffer(workQueue, 1, 16 * sizeof(half));
        }
        // Standard CopyIn -> Compute -> CopyOut pipeline over all tiles.
        __aicore__ inline void Process()
        {
            constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress){
            LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
            LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
                for (int i = 0; i < 4; i++){
                    // copy in row by row; the first 4 half of each row are valid
                    DataCopy(input_local[i*16], src0_global[i*4], 16);  // copy 16 half (32B) to UB each time
                }
            inQueueX1.EnQue(input_local);
            // zero_tensor is enqueued uninitialized; it is filled in Compute().
            zero_Queue.EnQue(zero_tensor);
        }
        // Zero dstGM, then ReduceMin each row under a 4-element mask.
        __aicore__ inline void Compute(int32_t progress){
            LocalTensor<half> output_local = outQueue.AllocTensor<half>();
            LocalTensor<half> work_local = workQueue.AllocTensor<half>();
            LocalTensor<half> input_local = inQueueX1.DeQue<half>();
            LocalTensor<half> zero_tensor = zero_Queue.DeQue<half>();
            Duplicate<half> (zero_tensor, 0, 16); // build the all-zero tensor
            // dstGM must be fully zeroed before the Vector compute runs
            event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
            DataCopy<half> (out_global, zero_tensor, 16); // clear this core's entire 16-element dstGM slice
            SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
            WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
            // NOTE(review): queue round-trip of the still-empty output_local,
            // presumably as an extra pipeline sync - confirm necessity.
            outQueue.EnQue<half>(output_local);
            output_local = outQueue.DeQue<half>();
            Duplicate<half> (output_local, 0, 16*4); // clear dstLocal so untouched lanes add 0 during the atomic copy-out
            uint64_t Mask0 = ((uint64_t)1 <<4) - 1; // mask so that only the first 4 elements join the ReduceMin
            uint64_t Mask[2] = {Mask0, 0};
            for (int i=0; i<4; i++){
                // per-row minimum lands in output_local[i*16]; the rest stays 0
                ReduceMin<half>(output_local[i*16], input_local[i*16], work_local, Mask, 1, 8, false);
            }
            
            outQueue.EnQue<half>(output_local);
            inQueueX1.FreeTensor(input_local);
            workQueue.FreeTensor(work_local);
            zero_Queue.FreeTensor(zero_tensor);
        }
        // Atomic-add copy-out: each 16-element store overlaps later rows, but
        // only lane 0 of each row block is non-zero, so overlaps add 0.
        // NOTE(review): the last row writes 12 half past this core's
        // 16-element slice (zeros, atomically added) - confirm the GM output
        // allocation has padding after the final core's slice.
        __aicore__ inline void CopyOut(int32_t progress){
            LocalTensor<half> output_local = outQueue.DeQue<half>();
            SetAtomicAdd<half>();
            for (int i=0; i<4; i++){ // copy out row by row, 4 rows in total
                DataCopy<half> (out_global[i*4], output_local[i*16], 16);
            }
            SetAtomicNone();
            outQueue.FreeTensor(output_local);
        }
    private:
        GlobalTensor<half> src0_global;
        GlobalTensor<half> out_global;
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
        TQue<QuePosition::VECOUT, BUFFER_NUM> workQueue;
        TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
    };
    // Kernel entry point: bind the GM addresses and run the full pipeline.
    extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm){
        KernelDataCopyPad kernel;
        kernel.Init(inputX1_gm, output_gm);
        kernel.Process();
    }
  • 如下代码展示,数据小于32B,参考图1 冗余数据参与计算的搬入处理方式,以及图5 使用UnPad去除冗余值的搬出数据方式。
    #include "kernel_operator.h"
    #include "datacopypad_tiling.h"
    using namespace AscendC;
    constexpr int32_t blockLen = 28; // 14 half per block
    constexpr int32_t TOTAL_LENGTH = 8 * 8 * 2 * (16 * 14);  // 8 cores, 16 tiles, each tile handles 16 * 14 half
    constexpr int32_t USE_CORE_NUM = 8;  // num of core used
    constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;  // length computed of each core
    constexpr int32_t TILE_NUM = 8;  // split data into 8 tiles for each core
    constexpr int32_t BUFFER_NUM = 2;  // tensor num for each queue
    constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile handles 16 groups of 14 half
    // Example 4: each row holds 14 valid half values (28B). Rows are copied in
    // as 16-half chunks and the redundant tail joins the Abs compute
    // (figure 1); UnPad then strips the 2 dirty tail elements per row so the
    // 16 rows become 224 contiguous half (448B, a 32B multiple) suitable for a
    // plain aligned copy-out (figure 5).
    class KernelDataCopyPad {
    public:
        __aicore__ inline KernelDataCopyPad() {}
        // Bind this core's GM slices and allocate UB queues (16 rows x 16 half).
        // NOTE(review): unPadParamsIn and tiling are unused here; the UnPad
        // params member is configured in Compute() - confirm this is intended.
        __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm, UnPadParams& unPadParamsIn, DataCopyPadCustomTilingData tiling){
            src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); // single input
            out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
            pipe.InitBuffer(inQueueX1, BUFFER_NUM, 16*16*sizeof(half));
            pipe.InitBuffer(outQueue, BUFFER_NUM, 16*16*sizeof(half));
        }
        // Standard CopyIn -> Compute -> CopyOut pipeline over all tiles.
        __aicore__ inline void Process(DataCopyPadCustomTilingData tiling)
        {
            int32_t loopCount = TILE_NUM * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i, tiling);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress){
            LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
            for (int i=0; i<16; i++){   // copy in row by row
                uint32_t srcGM_idx = progress * 14*16 + 14*i;
                DataCopy(input_local[16*i], src0_global[srcGM_idx], 16);
            }
            inQueueX1.EnQue(input_local);
        }
        // Abs over the full 16x16 chunk, then UnPad drops the per-row tail.
        __aicore__ inline void Compute(int32_t progress, DataCopyPadCustomTilingData tiling){
            LocalTensor<half> input_local = inQueueX1.DeQue<half>();
            LocalTensor<half> output_local = outQueue.AllocTensor<half>();
            Abs(input_local, input_local, 16*16);  // main compute
            unPadParams.rightPad = 2; // remove 2 dirty elements per row
            UnPad<half> (output_local, input_local, unPadParams, tiling.unPadTiling);
            outQueue.EnQue<half>(output_local);
            inQueueX1.FreeTensor(input_local);
        }
        // After UnPad the tile is contiguous, so one aligned DataCopy suffices.
        __aicore__ inline void CopyOut(int32_t progress){
            LocalTensor<half> output_local = outQueue.DeQue<half>();
            DataCopy(out_global[progress * 14*16], output_local, 14*16);
            outQueue.FreeTensor(output_local);
        }
    private:
        GlobalTensor<half> src0_global;
        GlobalTensor<half> out_global;
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
        UnPadParams unPadParams;
    };
    // Kernel entry point: build the UnPad params, then run the tiled pipeline.
    extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm, DataCopyPadCustomTilingData tiling)
    {
        UnPadParams hostUnPadParams{0, 2};
        KernelDataCopyPad kernel;
        kernel.Init(inputX1_gm, output_gm, hostUnPadParams, tiling);
        kernel.Process(tiling);
    }
  • 如下代码展示,数据小于32B,参考图4 使用Pad接口补齐的搬入处理方式,以及图7 使用atomic累加的搬出数据方式。
    #include "kernel_operator.h"
    #include "datacopypad_tiling.h"
    using namespace AscendC;
    constexpr int32_t blockLen = 12; // 6 half per row
    constexpr int32_t TOTAL_LENGTH = 16 * 8 * (16 * 6);   // 8 cores, 16 tiles, each tile handles 16 * 6 half
    constexpr int32_t USE_CORE_NUM = 8;   // num of core used
    constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;   // length computed of each core
    constexpr int32_t TILE_NUM = 16;  // split data into 16 tiles for each core
    constexpr int32_t BUFFER_NUM = 1;  // tensor num for each queue
    constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // each tile handles 16 groups of 6 half
    // Example 5: each row holds 6 valid half values (12B < 32B). Rows are
    // copied in as 16-half chunks, the Pad high-level API overwrites the 10
    // dirty tail elements per row with 0 (figure 4), and results are written
    // back with atomic add onto a zeroed dstGM region (figure 7).
    class KernelDataCopyPad {
    public:
        __aicore__ inline KernelDataCopyPad() {}
        // Bind this core's GM slices and allocate UB queues.
        // NOTE(review): the parameter is named unPadParamsIn but typed
        // PadParams, and it is unused here (padParams is set in Compute()) -
        // confirm naming and intent.
        __aicore__ inline void Init(GM_ADDR inputX1_gm, GM_ADDR output_gm, PadParams& unPadParamsIn, Copyin_TilingData tiling_pad){
            src0_global.SetGlobalBuffer((__gm__ half*)(inputX1_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); // single input
            out_global.SetGlobalBuffer((__gm__ half*)(output_gm) + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
            pipe.InitBuffer(inQueueX1, 1, 16*16*sizeof(half));
            pipe.InitBuffer(outQueue, 1, 16*16*sizeof(half));
            pipe.InitBuffer(zero_Queue, BUFFER_NUM, 16*sizeof(half));
        }
        // Standard CopyIn -> Compute -> CopyOut pipeline over all tiles.
        __aicore__ inline void Process(Copyin_TilingData tiling_pad)
        {
            int32_t loopCount = TILE_NUM * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i, tiling_pad);
                CopyOut(i);
            }
        }
    private:
        __aicore__ inline void CopyIn(int32_t progress)//GM->UB
        {
            LocalTensor<half> input_local = inQueueX1.AllocTensor<half>();
            for (int i=0; i<16; i++){  // copy in row by row
                uint32_t srcGM_idx = progress * 6*16 + 6*i;
                DataCopy(input_local[16*i], src0_global[srcGM_idx], 16);
            }
            inQueueX1.EnQue(input_local);
        }
        // Replace the 10 dirty tail elements of each row with 0 via Pad.
        __aicore__ inline void Compute(int32_t progress, Copyin_TilingData tiling_pad){
            LocalTensor<half> input_local = inQueueX1.DeQue<half>();
            LocalTensor<half> output_local = outQueue.AllocTensor<half>();
            padParams.leftPad = 0;
            padParams.rightPad = 10; // replace 10 dirty elements per row with 0
            padParams.padValue = 0;
            Pad<half> (output_local, input_local, padParams, tiling_pad.padtiling);
            LocalTensor<half> zero_tensor = zero_Queue.AllocTensor<half>();
            outQueue.EnQue<half>(output_local);
            // zero_tensor is enqueued uninitialized; it is filled in CopyOut().
            zero_Queue.EnQue<half>(zero_tensor);
            inQueueX1.FreeTensor(input_local);
        }
        // Zero this tile's dstGM region, run Abs, then write back row by row
        // with atomic add; overlapping lanes only add padded zeros.
        __aicore__ inline void CopyOut(int32_t progress)
        {
            LocalTensor<half> zero_tensor = zero_Queue.DeQue<half>();
    
            // make sure zero_tensor is set to all zeros before it goes to GM
            event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            Duplicate<half> (zero_tensor, 0, 16); // build the all-zero tensor
            SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
            WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    
            // dstGM must be zeroed before the Vector compute and atomic add
            event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
            for(int i=0; i<6; i++){
                // 6 x 16 half = 96 half: exactly this tile's output region
                DataCopy<half> (out_global[progress * 6*16 + i*16], zero_tensor, 16);
            }
            SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
            WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
    
            LocalTensor<half> output_local = outQueue.DeQue<half>();
            Abs(output_local, output_local, 16*16); // main compute
            outQueue.EnQue<half>(output_local);
            output_local = outQueue.DeQue<half>();
    
            // NOTE(review): the last row writes 10 half past this tile's
            // 96-element region (padded zeros, atomically added) - confirm the
            // GM buffer has padding after the final core's last tile.
            SetAtomicAdd<half>();
            for (int i=0; i<16; i++){
                uint32_t dstGM_idx = progress * 6*16 + i*6;
                DataCopy<half> (out_global[dstGM_idx], output_local[i*16], 16);
            }
            SetAtomicNone();
    
            outQueue.FreeTensor(output_local);
            zero_Queue.FreeTensor(zero_tensor);
        }
    private:
        GlobalTensor<half> src0_global;
        GlobalTensor<half> out_global;
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX1;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
        PadParams padParams;
        TQue<QuePosition::VECOUT, BUFFER_NUM> zero_Queue;
    };
    // Kernel entry point: build the Pad params (leftPad=0, rightPad=10,
    // padValue=0), then run the tiled pipeline.
    extern "C" __global__ __aicore__ void datacopypad_custom(GM_ADDR inputX1_gm, GM_ADDR output_gm, Copyin_TilingData tiling_pad)
    {
        PadParams hostPadParams{0, 10, 0};
        KernelDataCopyPad kernel;
        kernel.Init(inputX1_gm, output_gm, hostPadParams, tiling_pad);
        kernel.Process(tiling_pad);
    }