算子选择器
用户通过自定义算子选择器,根据shape的不同选择对应的Tiling策略。完整代码请参考样例使用获取。
.h实现
.h文件中定义了存储Tiling参数的结构体(例如BatchNormParam)和算子选择器实现函数(例如:SelectAclopBatchNorm)。
算子选择器实现函数的功能是:通过输入的shape信息,选择使用哪种Tiling策略,并将计算得到的Tiling参数存储到BatchNormParam结构体中,并输出aclopKernelDesc。
#ifndef TVM_TOPK_FLOWTABLE_H #define TVM_TOPK_FLOWTABLE_H #include "acl/acl.h" //align with 64 structBatchNormParam { int32_t input_n; // input format = nchw int32_t input_c; int32_t input_h; int32_t input_w; int32_t in_datatype; //input data type = fp16 int32_t output_n; // output format = nchw int32_t output_c; int32_t output_h; int32_t output_w; int32_t out_datatype; //output_data_type = fp16 int32_t gamma_c; //mean tensor channel = input channel int32_t gamma_datatype; int32_t beta_c; //var tensor channel = input channel int32_t beta_datatype; int32_t param1; //tiling parameters, difference tiling mode has difference tiling parameters int32_t param2; //gived 10 parameters int32_t param3; int32_t param4; int32_t param5; int32_t param6; int32_t param7; int32_t param8; int32_t param9; int32_t param10; }; extern "C" aclError SelectAclopBatchNorm(int numInputs, const aclTensorDesc *const inputDesc[], int numOutputs, const aclTensorDesc *const outputDesc[], const aclopAttr *opAttr, aclopKernelDesc *aclopKernelDesc); #endif
.cpp实现
算子选择器关键实现逻辑如下:
- 根据输入的shape信息,判断使用哪种Tiling策略。示例代码:
/** * batchnorm tiling to * @limit: 1. n,c,h,w is the format of NCHW * 2. inDtype, outDtype is the multiple of 16, and the dtype is float16 * 3. mini, core num is 2 * 4. only support to select one batchnorm schedule * 5. not set kernel workspace */ void BatchNormTiling(uint64_t n, uint64_t c, uint64_t h, uint64_t w, aclDataType inDtype, TilingMode &mode) { uint64_t lenwh = w*h; uint64_t bytesize = 0; if(inDtype == ACL_FLOAT16){ bytesize = 2; } else{ cout << "[ERROR] not fp16 datatype not support yet." << endl; return; } if (c > lenwh && lenwh == CeilDivMul(lenwh, 16) && (CeilDivMul(c, 16) * lenwh * bytesize) <= TILING_0_UB_SIZE) { mode = TILING_MODE_1; } else{ if(lenwh*bytesize > TILING_2_UB_SIZE){ mode = TILING_MODE_2; } else if((lenwh*bytesize > TILING_2_UB_SIZE/2) && (lenwh*bytesize < TILING_2_UB_SIZE)){ mode = TILING_MODE_2; } else{ mode = TILING_MODE_3; } } //cout << "[INFO] select tiling mode is:" << mode << endl; return; }
- 针对不同的Tiling策略,计算具体Tiling参数。
对比固定shape场景下Tiling参数的计算,动态shape场景下Tiling参数不再在TIK算子实现函数中计算,将重复的Scalar运算全部转移到Host CPU上,减少AI Core上Scalar运算量,有利于流水线并行。
// Fill bnparam according to the tiling mode chosen by BatchNormTiling, and
// record the name of the pre-built kernel binary in scheduleFlag. The paramN
// slot meanings differ per mode (see the trailing comments on each line);
// the TIK kernel for that mode reads them back positionally -- TODO confirm
// the packing against the kernel implementation before changing any line.
if(tilingmode == TILING_MODE_1){
    scheduleFlag = "tiling_mode_1__kernel0";  // binary name generated for tiling mode 1
    int32_t chn_num = MAX_C_SIZE;
    int32_t align_16 = CeilDivMul(in_w*in_h, 16);
    // UB budget estimate: chn_num rows of 16-aligned W*H in fp16 (x2 bytes),
    // double-buffered (x2), plus 2 bytes per channel of per-channel data.
    int32_t total_use_ub = chn_num*align_16*2*2 + chn_num*2;
    if(total_use_ub <= UB_SIZE){  // data fits into the Unified Buffer in one move
        bnparam.param1 = in_h*in_w;  //input_wh
        bnparam.param2 = CeilDivMul(bnparam.param1, 16);  //align_wh
        bnparam.param3 = CeilDivMul(in_c, 16);  //align_c
    }
    else{  // data does not fit into the Unified Buffer in one move
        bnparam.param1 = in_h*in_w;  //input_wh
        bnparam.param2 = CeilDiv(bnparam.param1, 16);  //iterwh_align16
        bnparam.param3 = CeilDiv(in_c, 16);  //repeat_alignc
        bnparam.param4 = CeilDivMul(bnparam.param1, 16);  //align_wh
        bnparam.param5 = CeilDivMul(in_c, 16);  //align_c
        bnparam.param6 = CeilDiv(bnparam.param4, 16) - 1;
    }
}
else if(tilingmode == TILING_MODE_2){
    scheduleFlag = "tiling_mode_2__kernel0";  // binary name generated for tiling mode 2
    bnparam.param1 = in_h*in_w;  //input_wh
    bnparam.param2 = CeilDivMul(bnparam.param1, 16);  //align_wh
    int32_t tiling_num = (TILING_1_UB_SIZE / 4);  // elements that fit in one UB chunk
    bnparam.param3 = CeilDiv(bnparam.param1, tiling_num);  //iter_ceil
    if(bnparam.param3 < 2){  // W*H fits into the given Unified Buffer space in one move
        bnparam.param4 = tiling_num / 128;  //iter_mask128
        bnparam.param5 = bnparam.param4/255;  //repeat_mask128 (255 = max repeat per vector instr)
        bnparam.param6 = bnparam.param4 - bnparam.param5*255;  //repeat_res_mask128
        bnparam.param7 = tiling_num - bnparam.param4*128;  //res_mask128
    }
    else{  // W*H must be moved into the given Unified Buffer space in several passes
        bnparam.param4 = CeilDiv(in_h, bnparam.param3);  //iter_h (rows per pass)
        bnparam.param5 = CeilDivMul(bnparam.param4*in_w, 16);  //iter_align16
        bnparam.param6 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w) , 16);  //iter_res_align16 (last, possibly shorter pass)
        bnparam.param7 = CeilDivMul(bnparam.param4*in_w, 128);  //repeat_mask128
        bnparam.param8 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w), 128);  //repeat_res_mask128
        bnparam.param9 = in_h - bnparam.param4*bnparam.param3;
    }
}
else if(tilingmode == TILING_MODE_3){
    bnparam.param1 = in_h*in_w;  //input_wh
    bnparam.param2 = CeilDivMul(bnparam.param1, 16);  //align_wh
    int32_t tiling_num = (TILING_2_UB_SIZE / 4);  // elements that fit in one UB chunk
    bnparam.param3 = tiling_num / bnparam.param2;  //single_chnum (channels per move)
    if(bnparam.param3 > in_c){
        bnparam.param3 = in_c;  // never move more channels than exist
    }
    bnparam.param4 = (int32_t)(in_c / bnparam.param3);  //iter_cnum
    if(bnparam.param4 <= 0){
        bnparam.param4 = 1;  // at least one iteration
    }
    if(bnparam.param1 == bnparam.param2){  // several feature maps per move; W*H is 32-byte aligned
        bnparam.param5 = bnparam.param1*bnparam.param3;  //repeat_length
        bnparam.param6 = bnparam.param1 / 128;  //repeat_mask
        bnparam.param7 = bnparam.param1 - bnparam.param6*128;  //repeat_res_mask
        bnparam.param8 = 128*bnparam.param6;  //res_mask
        bnparam.param9 = in_c - bnparam.param3*(bnparam.param4);  //res_ch_num (leftover channels)
        bnparam.param10 = bnparam.param9*bnparam.param1;  //res_repeat_length
    }
    else{  // W*H not 32-byte aligned; move only one channel's data per transfer
        bnparam.param5 = bnparam.param1 / 128;  //repeat_mask128
        bnparam.param6 = bnparam.param1 - bnparam.param5*128;  //repeat_res_mask
    }
    scheduleFlag = "tiling_mode_3__kernel0";  // binary name generated for tiling mode 3
}
- 通过如下两个接口设置aclopKernelDesc参数,用于算子执行。
- aclopSetKernelArgs:供算子选择器选择到合适Kernel后,进行参数设置。将计算得到的参数,即kernel id、block dim, tiling参数,设置到aclopKernelDesc中。
// Bind the selected kernel to the op descriptor: kernel id = scheduleFlag
// (the tiling-mode binary name chosen above), block dim = 1 -- presumably the
// AI Core count, which per the note below should match the TIK implementation
// (TODO confirm) -- and the tiling args = bnparam by value.
aclopSetKernelArgs(aclopKernelDesc, scheduleFlag.c_str(), 1, &bnparam, sizeof(BatchNormParam));
建议此处设置的block dim(AI Core核数)和TIK算子实现时的使用的AI Core核数保持一致。
- aclopSetKernelWorkspaceSizes:供算子选择器选择到合适kernel后,进行workspace设置,非必须,根据算子情况可选。
父主题: 参考实现