Operator Selector

You can select a tiling policy based on the shape by using the custom operator selector. For the complete code, see Sample Usage.

.h Implementation

The .h file declares the structures (for example, BatchNormParam) that define the tiling parameters and the implementation functions (for example, SelectAclopBatchNorm) of the operator selector.

The operator selector is used to select a tiling policy based on the input shape, store the computed tiling arguments in the BatchNormParam structure, and output aclopKernelDesc.

#ifndef TVM_TOPK_FLOWTABLE_H
#define TVM_TOPK_FLOWTABLE_H
#include "acl/acl.h"

// Tiling arguments that the operator selector fills in and passes to the
// selected kernel. Every field is an int32_t (24 fields in total).
// Original note: "align with 64" -- NOTE(review): the unit (bits/bytes) is not
// stated in this file; confirm before adding or removing fields.
struct BatchNormParam {
    int32_t input_n;         // input format = NCHW
    int32_t input_c;
    int32_t input_h;
    int32_t input_w;
    int32_t in_datatype;     // input data type = fp16
    int32_t output_n;        // output format = NCHW
    int32_t output_c;
    int32_t output_h;
    int32_t output_w;
    int32_t out_datatype;    // output data type = fp16
    int32_t gamma_c;         // channel count = input channel (original note: "mean tensor channel")
    int32_t gamma_datatype;
    int32_t beta_c;          // channel count = input channel (original note: "var tensor channel")
    int32_t beta_datatype;
    // Tiling parameters: each tiling mode assigns its own meaning to these
    // slots. Ten slots are provided.
    int32_t param1;
    int32_t param2;
    int32_t param3;
    int32_t param4;
    int32_t param5;
    int32_t param6;
    int32_t param7;
    int32_t param8;
    int32_t param9;
    int32_t param10;
};

/**
 * Operator selector for BatchNorm: selects a tiling policy based on the input
 * shape, stores the computed tiling arguments in a BatchNormParam structure,
 * and outputs the result through aclopKernelDesc.
 *
 * @param numInputs        number of entries in inputDesc
 * @param inputDesc        input tensor descriptors
 * @param numOutputs       number of entries in outputDesc
 * @param outputDesc       output tensor descriptors
 * @param opAttr           operator attributes
 * @param aclopKernelDesc  [out] kernel description filled in by the selector
 * @return aclError status code
 */
extern "C" aclError SelectAclopBatchNorm(int numInputs, const aclTensorDesc *const inputDesc[],
                                         int numOutputs, const aclTensorDesc *const outputDesc[],
                                         const aclopAttr *opAttr, aclopKernelDesc *aclopKernelDesc);

#endif

.cpp Implementation

The key implementation logic of an operator selector is as follows:

Determine the tiling policy to be used based on the input shape. The sample code is as follows.

/**
 * Select the batchnorm tiling mode from the input shape.
 *
 * @param n        batch dimension (NCHW); not used by the selection logic
 * @param c        channel dimension (NCHW)
 * @param h        height dimension (NCHW)
 * @param w        width dimension (NCHW)
 * @param inDtype  input data type; only ACL_FLOAT16 is handled
 * @param mode     [out] receives the selected tiling mode; left unmodified
 *                 when inDtype is not supported
 *
 * @limit: 1. n, c, h, w follow the NCHW format
 *         2. the dtype is float16 (original note also says inDtype/outDtype
 *            "is the multiple of 16" -- NOTE(review): meaning unclear, confirm)
 *         3. mini, core num is 2
 *         4. only one batchnorm schedule is selected
 *         5. kernel workspace is not set
 */
void BatchNormTiling(uint64_t n, uint64_t c, uint64_t h, uint64_t w, aclDataType inDtype, TilingMode &mode)
{
    (void)n;

    if (inDtype != ACL_FLOAT16) {
        std::cout << "[ERROR] not fp16 datatype not support yet." << std::endl;
        return;
    }

    const uint64_t bytesize = 2;  // size of one fp16 element in bytes
    const uint64_t lenwh = w * h;

    if (c > lenwh && lenwh == CeilDivMul(lenwh, 16) && (CeilDivMul(c, 16) * lenwh * bytesize) <= TILING_0_UB_SIZE) {
        mode = TILING_MODE_1;
    } else if (lenwh * bytesize > TILING_2_UB_SIZE / 2) {
        // The original split this into "> TILING_2_UB_SIZE" and
        // "(TILING_2_UB_SIZE/2, TILING_2_UB_SIZE)", both selecting mode 2, and
        // let the exact-equality case fall through to mode 3. A single
        // threshold covers the same range and closes that gap.
        mode = TILING_MODE_2;
    } else {
        mode = TILING_MODE_3;
    }
}

Compute the specific tiling parameters based on the tiling policy.

In the dynamic-shape scenario, the tiling parameters are not computed in the TIK operator implementation. Instead, the iterative scalar computation is moved to the host CPU, which offloads scalar work from the AI Cores and takes advantage of pipeline parallelism.

    // Fill bnparam.param1..param10 and choose the kernel binary (scheduleFlag)
    // for the selected tiling mode. Each mode gives the param slots its own
    // meaning; the trailing comment on each assignment names the value stored.
    if(tilingmode == TILING_MODE_1){
        scheduleFlag = "tiling_mode_1__kernel0"; //Binary file name of tiling mode 1
        int32_t chn_num = MAX_C_SIZE;
        int32_t align_16 = CeilDivMul(in_w*in_h, 16);
        // Unified Buffer usage estimate that decides between the two layouts below.
        int32_t total_use_ub = chn_num*align_16*2*2 + chn_num*2;

        if(total_use_ub <= UB_SIZE){// Data can be moved to the Unified Buffer at a time.
            bnparam.param1 = in_h*in_w;  //input_wh
            bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
            bnparam.param3 = CeilDivMul(in_c, 16); //align_c
        }
        else{//Data cannot be moved to the Unified Buffer at a time.
            bnparam.param1 = in_h*in_w;  //input_wh
            bnparam.param2 = CeilDiv(bnparam.param1, 16); //iterwh_align16
            bnparam.param3 = CeilDiv(in_c, 16); //repeat_alignc
            bnparam.param4 = CeilDivMul(bnparam.param1, 16); //align_wh
            bnparam.param5 = CeilDivMul(in_c, 16); //align_c
            bnparam.param6 = CeilDiv(bnparam.param4, 16) - 1;
        }
    }
    else if(tilingmode == TILING_MODE_2){
        scheduleFlag = "tiling_mode_2__kernel0"; //Binary file name of tiling mode 2
        bnparam.param1 = in_h*in_w;  //input_wh
        bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
        // NOTE(review): this uses TILING_1_UB_SIZE, while the BatchNormTiling
        // sample compares against TILING_2_UB_SIZE when selecting mode 2 -- confirm.
        int32_t tiling_num = (TILING_1_UB_SIZE / 4); 
        bnparam.param3 = CeilDiv(bnparam.param1, tiling_num); //iter_ceil
        if(bnparam.param3 < 2){//W * H can be moved into the Unified Buffer at a time.
            bnparam.param4 = tiling_num / 128;//iter_mask128
            bnparam.param5 = bnparam.param4/255; //repeat_mask128
            bnparam.param6 = bnparam.param4 - bnparam.param5*255; //repeat_res_mask128
            bnparam.param7 = tiling_num -  bnparam.param4*128; //res_mask128
        }
        else{//W * H needs to be moved into the Unified Buffer for multiple times.
            bnparam.param4 = CeilDiv(in_h, bnparam.param3);//iter_h
            bnparam.param5 = CeilDivMul(bnparam.param4*in_w, 16);//iter_align16
            bnparam.param6 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w) , 16); //iter_res_align16
            bnparam.param7 = CeilDivMul(bnparam.param4*in_w, 128);//repeat_mask128
            bnparam.param8 = CeilDivMul(((in_h - bnparam.param4*(bnparam.param3 - 1))*in_w), 128);//repeat_res_mask128
            // in_h minus iter_h * iter_ceil (not named in the original).
            bnparam.param9 = in_h - bnparam.param4*bnparam.param3;
        }
    }
    else if(tilingmode == TILING_MODE_3){
        bnparam.param1 = in_h*in_w;  //input_wh
        bnparam.param2 = CeilDivMul(bnparam.param1, 16); //align_wh
        int32_t tiling_num = (TILING_2_UB_SIZE / 4); 
        // NOTE(review): divides by param2, which is 0 if in_h*in_w is 0 -- confirm callers exclude that.
        bnparam.param3 = tiling_num / bnparam.param2; //single_chnum
        if(bnparam.param3 > in_c){ 
            bnparam.param3 = in_c;
        }
        // NOTE(review): param3 is 0 when param2 > tiling_num or in_c == 0, which
        // makes the division below a divide-by-zero -- confirm this cannot be reached.
        bnparam.param4 = (int32_t)(in_c / bnparam.param3);//iter_cnum
        if(bnparam.param4 <= 0){
            bnparam.param4 = 1;
        }
        if(bnparam.param1 == bnparam.param2){//W * H is a multiple of 32 bytes: Multiple feature maps can be moved at a time.
            bnparam.param5 = bnparam.param1*bnparam.param3; //repeat_length
            bnparam.param6 = bnparam.param1 / 128; //repeat_mask
            bnparam.param7 = bnparam.param1 -  bnparam.param6*128; //repeat_res_mask
            bnparam.param8 = 128*bnparam.param6; //res_mask
            bnparam.param9 = in_c - bnparam.param3*(bnparam.param4); //res_ch_num
            bnparam.param10 = bnparam.param9*bnparam.param1; //res_repeat_length
        }
        else{// W * H is not a multiple of 32 bytes: Data of only one channel can be moved at a time.
            bnparam.param5 = bnparam.param1 / 128; //repeat_mask128
            bnparam.param6 = bnparam.param1 - bnparam.param5*128; //repeat_res_mask
        }
        scheduleFlag = "tiling_mode_3__kernel0"; //Binary file name of tiling mode 3
    }

Set the aclopKernelDesc parameters for operator execution using the following APIs:
- aclopSetKernelArgs: allows the operator selector to set parameters after selecting a proper kernel. Passes the computed kernel id, block dim, and tiling arguments to aclopKernelDesc.
```
    // Arguments after the kernel description: kernel id (scheduleFlag), block dim (1),
    // then the tiling arguments (&bnparam) and their size in bytes.
    // NOTE(review): any status returned by aclopSetKernelArgs is ignored here -- confirm
    // whether the selector should check and propagate it.
    aclopSetKernelArgs(aclopKernelDesc, scheduleFlag.c_str(),  1, &bnparam, sizeof(BatchNormParam));
```
  Keep block dim the same as the number of AI Cores used for parallel operator implementation in TIK mode.
- aclopSetKernelWorkspaceSizes: (optional) allows the operator selector to set the workspace after selecting a proper kernel.
For details about the aclopSetKernelArgs and aclopSetKernelWorkspaceSizes APIs, see Application Software Development Guide (C&C++).

Parent topic: Sample Implementation