
Operator Selector

With a custom operator selector, the user selects the appropriate Tiling strategy according to the input shape. For the complete code, refer to Sample Usage.

.h Implementation

The .h file defines the structure that stores the Tiling parameters (for example, BatchNormParam) and declares the operator selector implementation function (for example, SelectAclopBatchNorm).

The operator selector implementation function uses the input shape information to decide which Tiling strategy to use, stores the computed Tiling parameters in the BatchNormParam structure, and outputs the result through aclopKernelDesc.
#ifndef TVM_TOPK_FLOWTABLE_H
#define TVM_TOPK_FLOWTABLE_H
#include "acl/acl.h"

//align with 64
struct BatchNormParam {
   int32_t input_n;  // input format = nchw
   int32_t input_c;
   int32_t input_h;
   int32_t input_w;
   int32_t in_datatype; //input data type = fp16
   int32_t output_n;  // output format = nchw
   int32_t output_c;
   int32_t output_h;
   int32_t output_w;
   int32_t out_datatype; //output_data_type = fp16
   int32_t gamma_c;  //mean tensor channel = input channel
   int32_t gamma_datatype;
   int32_t beta_c;   //var tensor channel = input channel
   int32_t beta_datatype;
   int32_t param1;   //tiling parameters; different tiling modes have different tiling parameters
   int32_t param2;   //10 parameters are provided
   int32_t param3;
   int32_t param4;
   int32_t param5;
   int32_t param6;
   int32_t param7;
   int32_t param8;
   int32_t param9;
   int32_t param10;
};

extern "C" aclError SelectAclopBatchNorm(int numInputs, const aclTensorDesc *const inputDesc[],
                                         int numOutputs, const aclTensorDesc *const outputDesc[],
                                         const aclopAttr *opAttr, aclopKernelDesc *aclopKernelDesc);

#endif
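
The .cpp fragments in the next section also reference a TilingMode enum, ceiling-division helpers (CeilDiv, CeilDivMul), and several buffer-capacity constants (UB_SIZE, MAX_C_SIZE, TILING_0_UB_SIZE, TILING_1_UB_SIZE, TILING_2_UB_SIZE) that are not shown in this excerpt. The sketch below shows what such declarations might look like; the behavior of the helpers is inferred from how they are used, and the numeric values are placeholders only, to be replaced with the sample's actual Unified Buffer budgets.
#include <cstdint>

// Assumed declaration of the selectable tiling strategies.
enum TilingMode {
    TILING_MODE_1 = 1,
    TILING_MODE_2 = 2,
    TILING_MODE_3 = 3
};

// Ceiling division: smallest integer not less than len / factor.
static inline int64_t CeilDiv(int64_t len, int64_t factor)
{
    return (len + factor - 1) / factor;
}

// Round len up to the nearest multiple of align (used for the 16-element alignment checks).
static inline int64_t CeilDivMul(int64_t len, int64_t align)
{
    return CeilDiv(len, align) * align;
}

// Unified Buffer budgets and the channel limit of tiling mode 1.
// Placeholder values for illustration only; the real values depend on the chip's
// Unified Buffer size and the buffer layout of each TIK schedule.
const int64_t UB_SIZE          = 240 * 1024;
const int64_t TILING_0_UB_SIZE = 120 * 1024;
const int64_t TILING_1_UB_SIZE = 120 * 1024;
const int64_t TILING_2_UB_SIZE = 120 * 1024;
const int64_t MAX_C_SIZE       = 1024;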

.cpp Implementation

The key implementation logic of the operator selector is as follows:

  1. Determine which Tiling strategy to use based on the input shape information. Sample code:
    /**
     * batchnorm tiling selection
     * @limit: 1. n, c, h, w follow the NCHW format
     *         2. the input/output data sizes are multiples of 16 and the data type is float16
     *         3. mini chip, core num is 2
     *         4. only one batchnorm schedule can be selected
     *         5. the kernel workspace is not set
     */
    void BatchNormTiling(uint64_t n, uint64_t c, uint64_t h, uint64_t w, aclDataType inDtype, TilingMode &mode)
    {
        uint64_t lenwh = w*h;
        uint64_t bytesize = 0;
        if(inDtype == ACL_FLOAT16){
            bytesize = 2;
        }
        else{
            cout << "[ERROR] not fp16 datatype not support yet." << endl;
            return;
        }
    
        if (c > lenwh && lenwh == CeilDivMul(lenwh, 16) && (CeilDivMul(c, 16) * lenwh * bytesize) <= TILING_0_UB_SIZE) {
            mode = TILING_MODE_1;
        }
        else{
            if(lenwh*bytesize > TILING_2_UB_SIZE){
                mode = TILING_MODE_2;
            }
            else if((lenwh*bytesize > TILING_2_UB_SIZE/2) && (lenwh*bytesize < TILING_2_UB_SIZE)){
                mode = TILING_MODE_2;
            }
            else{
                mode = TILING_MODE_3;
            }
        }
    
        //cout << "[INFO] select tiling mode is:" << mode << endl;
        return;
    }
  2. Compute the specific Tiling parameters for each Tiling strategy.

    Compared with the fixed-shape scenario, in the dynamic-shape scenario the Tiling parameters are no longer computed inside the TIK operator implementation function. The repetitive Scalar computation is moved entirely to the host CPU, which reduces the Scalar workload on the AI Core and benefits pipeline parallelism.

        if(tilingmode == TILING_MODE_1){
            scheduleFlag = "tiling_mode_1__kernel0"; //tiling mode 1 生成的二进制名字
            int32_t chn_num = MAX_C_SIZE;
            int32_t align_16 = CeilDivMul(in_w*in_h, 16);
            int32_t total_use_ub = chn_num*align_16*2*2 + chn_num*2;
    
            if(total_use_ub <= UB_SIZE){ //data can be moved into the Unified Buffer in one pass
                bnparam.param1 = in_h*in_w;  //input_wh
                bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
                bnparam.param3 = CeilDivMul(in_c, 16); //align_c
            }
            else{//data cannot be moved into the Unified Buffer in one pass
                bnparam.param1 = in_h*in_w;  //input_wh
                bnparam.param2 = CeilDiv(bnparam.param1, 16); //iterwh_align16
                bnparam.param3 = CeilDiv(in_c, 16); //repeat_alignc
                bnparam.param4 = CeilDivMul(bnparam.param1, 16); //align_wh
                bnparam.param5 = CeilDivMul(in_c, 16); //align_c
                bnparam.param6 = CeilDiv(bnparam.param4, 16) - 1;
            }
        }
        else if(tilingmode == TILING_MODE_2){
            scheduleFlag = "tiling_mode_2__kernel0"; //tiling mode 2 生成的二进制名字
            bnparam.param1 = in_h*in_w;  //input_wh
            bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
            int32_t tiling_num = (TILING_1_UB_SIZE / 4); 
            bnparam.param3 = CeilDiv(bnparam.param1, tiling_num); //iter_ceil
            if(bnparam.param3 < 2){ //W*H fits into the given Unified Buffer space in one pass
                bnparam.param4 = tiling_num / 128;//iter_mask128
                bnparam.param5 = bnparam.param4/255; //repeat_mask128
                bnparam.param6 = bnparam.param4 - bnparam.param5*255; //repeat_res_mask128
                bnparam.param7 = tiling_num -  bnparam.param4*128; //res_mask128
            }
            else{//W*H must be moved into the given Unified Buffer space in multiple passes
                bnparam.param4 = CeilDiv(in_h, bnparam.param3);//iter_h
                bnparam.param5 = CeilDivMul(bnparam.param4*in_w, 16);//iter_align16
                bnparam.param6 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w) , 16); //iter_res_align16
                bnparam.param7 = CeilDivMul(bnparam.param4*in_w, 128);//repeat_mask128
                bnparam.param8 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w), 128);//repeat_res_mask128
                bnparam.param9 = in_h - bnparam.param4*bnparam.param3;
            }
        }
        else if(tilingmode == TILING_MODE_3){
            bnparam.param1 = in_h*in_w;  //input_wh
            bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
            int32_t tiling_num = (TILING_2_UB_SIZE / 4); 
            bnparam.param3 = tiling_num / bnparam.param2; //single_chnum
            if(bnparam.param3 > in_c){ 
                bnparam.param3 = in_c;
            }
            bnparam.param4 = (int32_t)(in_c / bnparam.param3);//iter_cnum
            if(bnparam.param4 <= 0){
                bnparam.param4 = 1;
            }
            if(bnparam.param1 == bnparam.param2){//move multiple feature maps per pass; W*H is 32-byte aligned
                bnparam.param5 = bnparam.param1*bnparam.param3; //repeat_length
                bnparam.param6 = bnparam.param1 / 128; //repeat_mask
                bnparam.param7 = bnparam.param1 -  bnparam.param6*128; //repeat_res_mask
                bnparam.param8 = 128*bnparam.param6; //res_mask
                bnparam.param9 = in_c - bnparam.param3*(bnparam.param4); //res_ch_num
                bnparam.param10 = bnparam.param9*bnparam.param1; //res_repeat_length
            }
            else{//W*H is not 32-byte aligned; move only one channel of data per pass
                bnparam.param5 = bnparam.param1 / 128; //repeat_mask128
                bnparam.param6 = bnparam.param1 - bnparam.param5*128; //repeat_res_mask
            }
            scheduleFlag = "tiling_mode_3__kernel0";
        }
  3. Set the aclopKernelDesc parameters used for operator execution through the following two interfaces (an illustrative skeleton that ties the three steps together is sketched after this list).
    • aclopSetKernelArgs: called after the operator selector has chosen a suitable kernel, to set its arguments. The computed parameters, that is, the kernel ID, the block dim, and the Tiling parameters, are written into aclopKernelDesc.
          aclopSetKernelArgs(aclopKernelDesc, scheduleFlag.c_str(),  1, &bnparam, sizeof(BatchNormParam));

      It is recommended that the block dim (number of AI Cores) set here be consistent with the number of AI Cores used in the TIK operator implementation.

    • aclopSetKernelWorkspaceSizes: called after the operator selector has chosen a suitable kernel, to set the workspace sizes. This is optional and depends on the operator.

    For descriptions of the aclopSetKernelArgs and aclopSetKernelWorkspaceSizes interfaces, see the AscendCL Application Software Development Guide (C&C++).
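
Putting the three steps together, the selector function declared in the .h file might be structured as sketched below. This is only an illustrative outline, not the sample's actual implementation: it assumes the shape and data type are read with the AscendCL descriptor queries aclGetTensorDescDim and aclGetTensorDescType, that the input is NCHW float16, and that the block dim is fixed to 1 as in the aclopSetKernelArgs call above; the per-mode parameter computation shown in step 2 is elided.
#include <string>
#include "acl/acl.h"
// Also assumes the header above (BatchNormParam, the selector declaration) and the
// TilingMode/helper declarations sketched earlier, plus the BatchNormTiling function.

extern "C" aclError SelectAclopBatchNorm(int numInputs, const aclTensorDesc *const inputDesc[],
                                         int numOutputs, const aclTensorDesc *const outputDesc[],
                                         const aclopAttr *opAttr, aclopKernelDesc *aclopKernelDesc)
{
    // Query the shape (assumed NCHW) and data type of the first input.
    int64_t in_n = 0, in_c = 0, in_h = 0, in_w = 0;
    aclGetTensorDescDim(inputDesc[0], 0, &in_n);
    aclGetTensorDescDim(inputDesc[0], 1, &in_c);
    aclGetTensorDescDim(inputDesc[0], 2, &in_h);
    aclGetTensorDescDim(inputDesc[0], 3, &in_w);
    aclDataType inDtype = aclGetTensorDescType(inputDesc[0]);

    // Step 1: select a Tiling strategy from the shape.
    TilingMode tilingmode = TILING_MODE_3;
    BatchNormTiling(in_n, in_c, in_h, in_w, inDtype, tilingmode);

    // Step 2: fill the shape fields of BatchNormParam; the mode-specific
    // param1..param10 and scheduleFlag are computed exactly as shown in step 2 above.
    BatchNormParam bnparam = {};
    bnparam.input_n = static_cast<int32_t>(in_n);
    bnparam.input_c = static_cast<int32_t>(in_c);
    bnparam.input_h = static_cast<int32_t>(in_h);
    bnparam.input_w = static_cast<int32_t>(in_w);
    bnparam.in_datatype = static_cast<int32_t>(inDtype);
    std::string scheduleFlag;
    // ... per-mode computation of scheduleFlag and bnparam.param1..param10 ...

    // Step 3: attach the kernel ID, block dim (1 here) and Tiling arguments to the
    // kernel descriptor. If the kernel needed extra device memory, the optional
    // aclopSetKernelWorkspaceSizes call could also be made here.
    return aclopSetKernelArgs(aclopKernelDesc, scheduleFlag.c_str(), 1, &bnparam, sizeof(BatchNormParam));
}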