This section describes how to implement the operator under each tiling strategy. For the complete implementation of the BatchNorm operator, see the sample usage section.
The code structure is similar across the different tiling strategies, as shown below:
# Import the required Python modules
from tbe import tik
import numpy as np

# Constant definitions. Note that the values of MAX_WIDTH and MAX_HEIGHT differ across scenarios.
# Size of the Ascend 310 AI Core UB buffer
UB_SIZE = 240 * 1024
# batch for N
MAX_BATCH = 1
# channel for C
MAX_CHANNEL = 1024
# width for W
MAX_WIDTH = 32
# height for H
MAX_HEIGHT = 32

class BatchNorm():
    # Initialization function
    def __init__(self, input0, gamma0, beta0, output0, kernel_name="BatchNorm"):
        """
        Initialization logic
        """

    # Operator compute logic and build
    def batchnorm_compute(self):
        # The compute logic differs across tiling strategies
        self.batchnorm_compute_tiling()
        # Build the operator
        self.tik_instance.BuildCCE(kernel_name=self.kernel_name,
                                   inputs=[self.input_gm, self.gamma_gm, self.beta_gm],
                                   outputs=[self.output_gm],
                                   flowtable=[self.input_n, self.input_c, self.input_h, self.input_w,
                                              self.inputtype, self.output_n, self.output_c,
                                              self.output_h, self.output_w, self.outputtype,
                                              self.gamma_c, self.gammatype, self.beta_c, self.betatype,
                                              self.param1, self.param2, self.param3, self.param4,
                                              self.param5, self.param6, self.param7, self.param8,
                                              self.param9, self.param10],
                                   enable_l2=True)
        return self.tik_instance

# Operator definition function
def batch_norm(input0, gamma0, beta0, output0, kernel_name="BatchNorm"):
    obj = BatchNorm(input0, gamma0, beta0, output0, kernel_name)
    obj.batchnorm_compute()
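For orientation, the following is a minimal, hypothetical sketch of how the tiling strategy could be selected from the input shape. The function name select_tiling and the THRESHOLD constant are illustrative assumptions only; the actual sample makes this choice inside batchnorm_compute_tiling based on the flowtable parameters.

FP16_BYTES = 2
THRESHOLD = 112 * 1024  # usable UB per data block, per the comments in this section

def select_tiling(c, h, w):
    # Hypothetical helper, not part of the sample: pick a strategy from the shape
    size_hw = h * w * FP16_BYTES
    size_chw = c * size_hw
    if size_chw <= THRESHOLD:
        return "tiling_c"            # the whole feature map fits in UB at once
    if c == 1 or size_hw * 2 > THRESHOLD:
        return "tiling_wh_single_c"  # at most one channel fits per move: process channel by channel
    return "tiling_wh_multi_c"       # several channels fit per data move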
Pay attention to the following points in each tiling implementation:
# Channel-parallel computation along C
def batchnorm_compute_tiling_c(self):
    with self.tik_instance.if_scope(total_use_ub <= UB_SIZE):
        # Size(N*C*H*W) < 112KB: a single data move is enough
        # Prepare the data in the Unified Buffer
        ......
        # CHW->HWC->CHW: transpose the data from CHW to HWC so that the computation runs on
        # contiguous data along the C axis; after the computation, transpose back HWC->CHW
        # and move the result to Global Memory.
        with self.tik_instance.for_range(0, align_wh // 16, thread_num=2) as k:
            # transfer (chn_num, 16) to (16, chn_num)
            # [align_c, 16]
            src_list = [data_ub[0*align_wh + k*16], ..., data_ub[15*align_wh + k*16]]
            # [align_wh, chn_num]
            dst_list = [trans_ub[(k*16 + 0)*align_c], ..., trans_ub[(k*16 + 15)*align_c]]
            # CHW->HWC
            self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, align_c // 16, 1, align_wh)
        # calc batchnorm (x - mean)/var
        with self.tik_instance.for_range(0, align_wh, thread_num=2) as m:
            self.tik_instance.vec_add(16, tmp_ub, trans_ub[m*align_c], gamma_ub, align_c // 16, 1, 1, 1)
            self.tik_instance.vec_mul(16, trans_ub[m*align_c], tmp_ub, beta_ub, align_c // 16, 1, 1, 1)
        with self.tik_instance.for_range(0, align_wh // 16, thread_num=2) as k:
            # [align_wh, chn_num]
            src_list = [trans_ub[(k*16 + 0)*align_c], ..., trans_ub[(k*16 + 15)*align_c]]
            # [chn_num, align_wh]
            dst_list = [data_ub[0*align_wh + k*16], ..., data_ub[15*align_wh + k*16]]
            # HWC->CHW
            self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, align_c // 16, align_wh, 1)
        # move ub -> gm
        self.tik_instance.data_move(output_gm[0], data_ub[0], 0, 1, align_wh*self.input_c // 16, 0, 0)
    with self.tik_instance.else_scope():
        # Size(N*C*H*W) > 112KB: multiple data moves are needed
        # Prepare the data in the Unified Buffer
        ......
        # Cx16->16xC->Cx16: transpose the data from CHW to HWC so that the computation runs on
        # contiguous data along the C axis, moving only 16 FP16 elements along H*W per pass;
        # after the computation, transpose back HWC->CHW and move the result to Global Memory.
        # Note that H*W must be 32-byte aligned.
        src_list = [ping_ub[0, 0], ping_ub[1, 0], ping_ub[2, 0], ping_ub[3, 0],
                    ping_ub[4, 0], ping_ub[5, 0], ping_ub[6, 0], ping_ub[7, 0],
                    ping_ub[8, 0], ping_ub[9, 0], ping_ub[10, 0], ping_ub[11, 0],
                    ping_ub[12, 0], ping_ub[13, 0], ping_ub[14, 0], ping_ub[15, 0]]
        # [16, align_c]
        dst_list = [trans_ub[0, 0], trans_ub[1, 0], trans_ub[2, 0], trans_ub[3, 0],
                    trans_ub[4, 0], trans_ub[5, 0], trans_ub[6, 0], trans_ub[7, 0],
                    trans_ub[8, 0], trans_ub[9, 0], trans_ub[10, 0], trans_ub[11, 0],
                    trans_ub[12, 0], trans_ub[13, 0], trans_ub[14, 0], trans_ub[15, 0]]
        # transpose Cx16 to 16xC
        self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, repeat_alignc, 1, 16)
        # Compute (x - mean)/var
        with self.tik_instance.for_range(0, 16) as m:
            self.tik_instance.vec_add(16, tmp_ub, trans_ub[m, 0], gamma_ub, repeat_alignc, 1, 1, 1)
            self.tik_instance.vec_mul(16, trans_ub[m, 0], tmp_ub, beta_ub, repeat_alignc, 1, 1, 1)
        # [16, align_c]
        src_list = [trans_ub[0, 0], trans_ub[1, 0], trans_ub[2, 0], trans_ub[3, 0],
                    trans_ub[4, 0], trans_ub[5, 0], trans_ub[6, 0], trans_ub[7, 0],
                    trans_ub[8, 0], trans_ub[9, 0], trans_ub[10, 0], trans_ub[11, 0],
                    trans_ub[12, 0], trans_ub[13, 0], trans_ub[14, 0], trans_ub[15, 0]]
        # [align_c, 16]
        dst_list = [vconv_ub[0, 0], vconv_ub[1, 0], vconv_ub[2, 0], vconv_ub[3, 0],
                    vconv_ub[4, 0], vconv_ub[5, 0], vconv_ub[6, 0], vconv_ub[7, 0],
                    vconv_ub[8, 0], vconv_ub[9, 0], vconv_ub[10, 0], vconv_ub[11, 0],
                    vconv_ub[12, 0], vconv_ub[13, 0], vconv_ub[14, 0], vconv_ub[15, 0]]
        # transpose 16xC -> Cx16
        self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, repeat_alignc, 16, 1)
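The transpose-compute-transpose pattern above can be checked on the host with plain NumPy. The sketch below is illustrative only (batchnorm_reference is not part of the sample): it assumes gamma and beta have been pre-folded into -mean and 1/var, which is what makes the vec_add/vec_mul pair on device equivalent to (x - mean)/var.

import numpy as np

def batchnorm_reference(x_chw, gamma, beta):
    # Mirror the device flow: CHW -> HWC (C contiguous), add/mul along C, back to CHW
    c, h, w = x_chw.shape
    x_hwc = x_chw.reshape(c, h * w).T   # shape (H*W, C)
    y_hwc = (x_hwc + gamma) * beta      # broadcasts gamma/beta over the C axis
    return y_hwc.T.reshape(c, h, w)

x = np.random.rand(16, 8, 8).astype(np.float16)
mean = x.mean(axis=(1, 2))
var = x.var(axis=(1, 2))
y = batchnorm_reference(x, -mean, 1.0 / var)  # (x - mean) / var, as in the comments above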
# HW-parallel computation, one channel at a time
def batchnorm_compute_tiling_wh_single_c(self):
    with self.tik_instance.if_scope(iter_ceil < 2):
        # When Size(H*W) < 112KB and Size(H*W*2) > 112KB, a whole channel can be moved in at once
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:  # double buffer
            ......
            # Compute (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub[0], ping_ub[0], self.sclar_gamma, maxiternum, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub[0], temp_ub[0], self.sclar_beta, maxiternum, 8, 8)
            self.tik_instance.data_move(output_gm[i*input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
        with self.tik_instance.for_range(iter_num_double, self.input_c) as i:
            # Handle the tail block
            ......
            # Compute (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub[0], ping_ub[0], self.sclar_gamma, maxiternum, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub[0], temp_ub[0], self.sclar_beta, maxiternum, 8, 8)
            self.tik_instance.data_move(output_gm[i*input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
    with self.tik_instance.else_scope():
        # When Size(H*W) > 112KB, one channel cannot be moved in at once; it must be tiled and moved
        # in several passes, and the cases with and without a tail block are handled separately
        with self.tik_instance.if_scope(res_h == 0):
            # Tile along H, case 1: H divides evenly, no tail block
            with self.tik_instance.for_range(0, self.input_c) as i:  # iterate over the C channels
                self.sclar_gamma.set_as(gamma_ub[i])
                self.sclar_beta.set_as(beta_ub[i])
                idx = i*input_wh
                iter_num_double = floor_div_mul(iter_ceil, 2)
                with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as j:
                    ...
                    # Compute (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                with self.tik_instance.for_range(iter_num_double, iter_ceil) as j:
                    # Handle the remaining odd block
                    ......
                    # Compute (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                ......
        with self.tik_instance.else_scope():
            # Tile along H, case 2: H does not divide evenly; the tail block must be handled
            with self.tik_instance.for_range(0, self.input_c) as i:  # iterate over the C channels
                ......
                with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as j:  # double buffer
                    ......
                    # Compute (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                with self.tik_instance.for_range(iter_num_double, iter_ceil - 1) as j:
                    # Handle the remaining odd block
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                # Handle the tail block left over from tiling along H
                with self.tik_instance.if_scope(iter_res_align16 > 0):
                    ...
                    self.tik_instance.vec_adds(128, temp_ub, pong_ub, self.sclar_gamma, repeat_res_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_res_mask128 // 128, 8, 8)
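The loop bounds above (iter_ceil, iter_num_double, res_h) come from tiling H so that each block fits in UB. The sketch below is a hypothetical host-side derivation (the sample computes these values in its tiling code and passes them in through the flowtable); iter_ceil // 2 * 2 reproduces the floor_div_mul(iter_ceil, 2) helper used in the code.

FP16_BYTES = 2
THRESHOLD = 112 * 1024

def tile_h(h, w):
    # Hypothetical helper, not part of the sample
    rows_per_block = max(1, THRESHOLD // (w * FP16_BYTES))  # H rows per UB-sized block
    iter_ceil = (h + rows_per_block - 1) // rows_per_block  # number of H blocks (ceiling)
    iter_num_double = iter_ceil // 2 * 2                    # even part, runs with double buffering
    res_h = h % rows_per_block                              # tail rows; 0 means H divides evenly
    return rows_per_block, iter_ceil, iter_num_double, res_h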
# HW-parallel computation, multiple channels at a time
def batchnorm_compute_tiling_wh_multi_c(self):
    with self.tik_instance.if_scope(align_wh == input_wh):
        # When Size(H*W*C) (C > 1) < 112KB and the data is 32-byte aligned,
        # move multiple channels in at once
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:
            # thread_num=2 enables double-buffering acceleration
            ......
            # Multiple channels are moved in at once; apply BN channel by channel
            with self.tik_instance.for_range(0, single_chnum) as j:
                # Compute (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[idx1], ping_ub[idx1], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[idx1], temp_ub[idx1], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index], temp_ub, 0, 1, repeat_length // 16, 0, 0)
        with self.tik_instance.for_range(iter_num_double, iter_cnum) as i:
            # Handle the remaining odd tail block that cannot be double-buffered
            ...
            # Apply BN channel by channel
            with self.tik_instance.for_range(0, single_chnum) as j:
                # Compute (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[idx1], ping_ub[idx1], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[idx1], temp_ub[idx1], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index], temp_ub, 0, 1, repeat_length // 16, 0, 0)
        with self.tik_instance.if_scope(res_ch_num > 0):
            # Handle the remaining channels
            ......
            with self.tik_instance.for_range(0, res_ch_num) as j:
                ......
                # Compute (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[j*input_wh], ping_ub[j*input_wh], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[j*input_wh], temp_ub[j*input_wh], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index2], temp_ub, 0, 1, res_repeat_length // 16, 0, 0)
    with self.tik_instance.else_scope():
        # When Size(H*W*C) (C > 1) < 112KB but the feature map is not 32-byte aligned, each channel
        # must be moved in one at a time; otherwise a tail block would have to be handled on every
        # move, hurting efficiency.
        # Process one channel at a time; with double buffering, two channels run in parallel
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:
            ......
            # Compute (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 + 1, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 + 1, 8, 8)
        with self.tik_instance.for_range(iter_num_double, self.input_c) as i:
            # Handle the remaining channel that cannot be double-buffered (one channel remains)
            ......
            self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 + 1, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 + 1, 8, 8)
            self.tik_instance.data_move(output_gm[i * input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
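The multi-channel branch hinges on two quantities: how many whole channels fit in one UB load (single_chnum) and whether H*W is 32-byte aligned for FP16 data. The arithmetic below is a hypothetical sketch; the variable names mirror the code above, but the sample derives these values in its own tiling logic.

FP16_BYTES = 2
THRESHOLD = 112 * 1024

def pack_channels(c, h, w):
    # Hypothetical helper, not part of the sample
    input_wh = h * w
    aligned = (input_wh * FP16_BYTES) % 32 == 0                  # 16 FP16 elements per 32 bytes
    single_chnum = max(1, THRESHOLD // (input_wh * FP16_BYTES))  # whole channels per UB load
    iter_cnum = c // single_chnum                                # full multi-channel moves
    res_ch_num = c % single_chnum                                # leftover channels in the last move
    return aligned, single_chnum, iter_cnum, res_ch_num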