Operator Implementation
This section describes how to implement the operator under each tiling strategy. For the complete implementation code of the BatchNorm operator, see Sample Usage.
Code Structure
The code structure is similar across tiling strategies, as shown below:
# Import the required Python modules
from tbe import tik
import numpy as np
# Constant definitions. Note that the values of MAX_WIDTH and MAX_HEIGHT differ across scenarios.
# Size of the Unified Buffer (UB) on the Ascend 310 AI Core
UB_SIZE = 240 * 1024
# batch for N
MAX_BATCH = 1
# channel for C
MAX_CHANNEL = 1024
# width for W
MAX_WIDTH = 32
# height for H
MAX_HEIGHT = 32
class BatchNorm():
    # initialization
    def __init__(self, input0, gamma0, beta0, output0, kernel_name="BatchNorm"):
        """
        Implementation of the initialization
        """
    # operator compute logic and build
    def batchnorm_compute(self):
        # the compute logic differs across tiling strategies
        self.batchnorm_compute_tiling()
        # build the operator
        self.tik_instance.BuildCCE(kernel_name=self.kernel_name,
                                   inputs=[self.input_gm,
                                           self.gamma_gm,
                                           self.beta_gm],
                                   outputs=[self.output_gm],
                                   flowtable=[self.input_n, self.input_c,
                                              self.input_h, self.input_w,
                                              self.inputtype, self.output_n,
                                              self.output_c, self.output_h,
                                              self.output_w, self.outputtype,
                                              self.gamma_c, self.gammatype,
                                              self.beta_c, self.betatype,
                                              self.param1, self.param2,
                                              self.param3, self.param4,
                                              self.param5, self.param6,
                                              self.param7, self.param8,
                                              self.param9, self.param10],
                                   enable_l2=True)
        return self.tik_instance
# operator definition function
def batch_norm(input0, gamma0, beta0, output0, kernel_name="BatchNorm"):
    obj = BatchNorm(input0, gamma0, beta0, output0, kernel_name)
    obj.batchnorm_compute()
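For illustration, the following is a minimal invocation sketch. The dict-based tensor descriptions follow the usual TBE custom-operator convention and are an assumption here; check the sample code for the exact format.
# Hypothetical invocation; the shapes reuse the constants defined above.
input0 = {"shape": (MAX_BATCH, MAX_CHANNEL, MAX_HEIGHT, MAX_WIDTH), "dtype": "float16"}
gamma0 = {"shape": (MAX_CHANNEL,), "dtype": "float16"}
beta0 = {"shape": (MAX_CHANNEL,), "dtype": "float16"}
output0 = {"shape": (MAX_BATCH, MAX_CHANNEL, MAX_HEIGHT, MAX_WIDTH), "dtype": "float16"}
batch_norm(input0, gamma0, beta0, output0, kernel_name="BatchNorm")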
Note the following points:
- The code structure is similar across tiling strategies; the main difference lies in the compute logic. For details, see C-Channel Parallelism, Single-Channel HW Parallelism, and Multi-Channel HW Parallelism.
- When building a TIK operator with the BuildCCE interface, the flowtable parameter can be passed in. It carries the Tiling parameters computed by the operator selector, which is equivalent to appending to the output tensors an address space where the user caches the Tiling parameters. The number of Tiling parameters in the flowtable list plus the number of inputs must not exceed 64, and each Tiling parameter is of the TIK InputScalar type. Because the Tiling parameters are not computed in TIK, all repeated scalar computation is offloaded to the operator selector on the Host CPU, which reduces the scalar workload on the AI Core and benefits pipeline parallelism (see the sketch after this list).
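A minimal sketch of how such Tiling parameters might be declared in the initialization function; the declarations below are illustrative, and the full list and exact dtypes should be taken from the sample code.
# Hedged sketch: declare each Tiling parameter as an InputScalar in __init__,
# then pass the whole set to BuildCCE via the flowtable argument shown above.
self.input_n = self.tik_instance.InputScalar(dtype="int32", name="input_n")
self.input_c = self.tik_instance.InputScalar(dtype="int32", name="input_c")
self.input_h = self.tik_instance.InputScalar(dtype="int32", name="input_h")
self.input_w = self.tik_instance.InputScalar(dtype="int32", name="input_w")
# ... the remaining flowtable entries (types, output shape, param1..param10)
# are declared in the same way.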
C-Channel Parallelism
# C-channel parallel computation
def batchnorm_compute_tiling_c(self):
    with self.tik_instance.if_scope(total_use_ub <= UB_SIZE):  # Size(N*C*H*W) < 112 KB: only one data move is needed
        # prepare the data in the Unified Buffer
        ......
        # CHW->HWC->CHW: convert the data from CHW to HWC so the computation works on
        # contiguous data along the C channel; after the computation, convert HWC back
        # to CHW and move the result to Global Memory.
        with self.tik_instance.for_range(0, align_wh // 16, thread_num=2) as k:
            # transfer (chn_num, 16) to (16, chn_num)
            # [align_c, 16]
            src_list = [data_ub[0*align_wh + k*16], ..., data_ub[15*align_wh + k*16]]
            # [align_wh, chn_num]
            dst_list = [trans_ub[(k*16 + 0)*align_c], ..., trans_ub[(k*16 + 15)*align_c]]
            # CHW->HWC
            self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, align_c // 16, 1,
                                                align_wh)
        # calc batchnorm (x - mean)/var
        with self.tik_instance.for_range(0, align_wh, thread_num=2) as m:
            self.tik_instance.vec_add(16, tmp_ub, trans_ub[m*align_c],
                                      gamma_ub, align_c // 16, 1, 1, 1)
            self.tik_instance.vec_mul(16, trans_ub[m*align_c], tmp_ub,
                                      beta_ub, align_c // 16, 1, 1, 1)
        with self.tik_instance.for_range(0, align_wh // 16, thread_num=2) as k:
            # [align_wh, chn_num]
            src_list = [trans_ub[(k*16 + 0)*align_c], ..., trans_ub[(k*16 + 15)*align_c]]
            # [chn_num, align_wh]
            dst_list = [data_ub[0*align_wh + k*16], ..., data_ub[15*align_wh + k*16]]
            # HWC->CHW
            self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, align_c // 16,
                                                align_wh, 1)
        # move UB -> GM
        self.tik_instance.data_move(output_gm[0], data_ub[0], 0, 1, align_wh*self.input_c // 16, 0, 0)
    with self.tik_instance.else_scope():  # Size(N*C*H*W) > 112 KB: multiple data moves are needed
        # prepare the data in the Unified Buffer
        ......
        # Cx16->16xC->Cx16: convert the data from CHW to HWC so the computation works on
        # contiguous data along the C channel. Only 16 FP16 elements along H*W are moved
        # per iteration; after the computation, convert HWC back to CHW and move the
        # result to Global Memory. Note that H*W must be 32-byte aligned.
        src_list = [ping_ub[0, 0], ping_ub[1, 0], ping_ub[2, 0], ping_ub[3, 0],
                    ping_ub[4, 0], ping_ub[5, 0], ping_ub[6, 0], ping_ub[7, 0],
                    ping_ub[8, 0], ping_ub[9, 0], ping_ub[10, 0], ping_ub[11, 0],
                    ping_ub[12, 0], ping_ub[13, 0], ping_ub[14, 0], ping_ub[15, 0]]
        # [16, align_c]
        dst_list = [trans_ub[0, 0], trans_ub[1, 0], trans_ub[2, 0], trans_ub[3, 0],
                    trans_ub[4, 0], trans_ub[5, 0], trans_ub[6, 0], trans_ub[7, 0],
                    trans_ub[8, 0], trans_ub[9, 0], trans_ub[10, 0], trans_ub[11, 0],
                    trans_ub[12, 0], trans_ub[13, 0], trans_ub[14, 0], trans_ub[15, 0]]
        # transpose Cx16 to 16xC
        self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, repeat_alignc, 1, 16)
        # apply the normalization: (x - mean)/var
        with self.tik_instance.for_range(0, 16) as m:
            self.tik_instance.vec_add(16, tmp_ub, trans_ub[m, 0], gamma_ub, repeat_alignc, 1, 1, 1)
            self.tik_instance.vec_mul(16, trans_ub[m, 0], tmp_ub, beta_ub, repeat_alignc, 1, 1, 1)
        # [16, align_c]
        src_list = [trans_ub[0, 0], trans_ub[1, 0], trans_ub[2, 0], trans_ub[3, 0],
                    trans_ub[4, 0], trans_ub[5, 0], trans_ub[6, 0], trans_ub[7, 0],
                    trans_ub[8, 0], trans_ub[9, 0], trans_ub[10, 0], trans_ub[11, 0],
                    trans_ub[12, 0], trans_ub[13, 0], trans_ub[14, 0], trans_ub[15, 0]]
        # [align_c, 16]
        dst_list = [vconv_ub[0, 0], vconv_ub[1, 0], vconv_ub[2, 0], vconv_ub[3, 0],
                    vconv_ub[4, 0], vconv_ub[5, 0], vconv_ub[6, 0], vconv_ub[7, 0],
                    vconv_ub[8, 0], vconv_ub[9, 0], vconv_ub[10, 0], vconv_ub[11, 0],
                    vconv_ub[12, 0], vconv_ub[13, 0], vconv_ub[14, 0], vconv_ub[15, 0]]
        # transpose 16xC -> Cx16
        self.tik_instance.vec_trans_scatter(True, True, dst_list, src_list, repeat_alignc, 16, 1)
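For reference, the computation this kernel performs can be written in NumPy as the following functional sketch. It assumes that gamma and beta hold the precomputed -mean and 1/var values (matching the vec_add/vec_mul pattern above) and ignores tiling, alignment, and data movement.
import numpy as np

def batchnorm_c_parallel_reference(x, gamma, beta):
    # x: (C, H, W); gamma, beta: (C,) holding precomputed -mean and 1/var
    hwc = x.transpose(1, 2, 0)         # CHW -> HWC: make the C channel contiguous
    out = (hwc + gamma) * beta         # (x - mean) / var, broadcast along C
    return out.transpose(2, 0, 1)      # HWC -> CHW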
Single-Channel HW Parallelism
# single-channel HW parallel computation
def batchnorm_compute_tiling_wh_single_c(self):
    with self.tik_instance.if_scope(iter_ceil < 2):  # Size(H*W) < 112 KB and Size(H*W*2) > 112 KB: a whole channel can be moved in at once
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:  # double buffer
            ......
            # apply the normalization: (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub[0], ping_ub[0], self.sclar_gamma, maxiternum, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub[0], temp_ub[0], self.sclar_beta, maxiternum, 8, 8)
            self.tik_instance.data_move(output_gm[i*input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
        with self.tik_instance.for_range(iter_num_double, self.input_c) as i:  # handle the tail block
            ......
            # apply the normalization: (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub[0], ping_ub[0], self.sclar_gamma, maxiternum, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub[0], temp_ub[0], self.sclar_beta, maxiternum, 8, 8)
            self.tik_instance.data_move(output_gm[i*input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
    with self.tik_instance.else_scope():  # Size(H*W) > 112 KB: one move cannot cover a channel, so it is split into blocks and moved several times; the cases with and without a tail block are handled separately
        with self.tik_instance.if_scope(res_h == 0):  # split H into blocks, case 1: H divides evenly, no tail block
            with self.tik_instance.for_range(0, self.input_c) as i:  # iterate over the C channels
                self.sclar_gamma.set_as(gamma_ub[i])
                self.sclar_beta.set_as(beta_ub[i])
                idx = i*input_wh
                iter_num_double = floor_div_mul(iter_ceil, 2)
                with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as j:
                    ......
                    # apply the normalization: (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                with self.tik_instance.for_range(iter_num_double, iter_ceil) as j:  # handle the remaining odd block
                    ......
                    # apply the normalization: (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                ......
        with self.tik_instance.else_scope():  # split H into blocks, case 2: H does not divide evenly, so the tail block must be handled
            with self.tik_instance.for_range(0, self.input_c) as i:  # iterate over the C channels
                ......
                with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as j:  # double buffer
                    ......
                    # apply the normalization: (x - mean)/var
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                with self.tik_instance.for_range(iter_num_double, iter_ceil - 1) as j:  # handle the odd tail block
                    self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 // 128, 8, 8)
                # handle the remaining tail block from the H split
                with self.tik_instance.if_scope(iter_res_align16 > 0):
                    ......
                    self.tik_instance.vec_adds(128, temp_ub, pong_ub, self.sclar_gamma, repeat_res_mask128 // 128, 8, 8)
                    self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_res_mask128 // 128, 8, 8)
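The block counts used above (iter_ceil, iter_num_double) are computed on the host by the operator selector and passed in via the flowtable. A minimal sketch of that arithmetic, assuming FP16 data and the 112 KB per-move budget quoted in the comments; the shape values are hypothetical.
def floor_div_mul(x, y):
    # round x down to the nearest multiple of y
    return x // y * y

BLOCK_BYTES = 112 * 1024                       # assumed per-move UB budget
block_elems = BLOCK_BYTES // 2                 # FP16: 2 bytes per element
input_h, input_w = 512, 512                    # hypothetical shape; H*W exceeds one move
input_wh = input_h * input_w
iter_ceil = -(-input_wh // block_elems)        # ceiling division: moves per channel
iter_num_double = floor_div_mul(iter_ceil, 2)  # even part runs with thread_num=2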
Multi-Channel HW Parallelism
# multi-channel HW parallel computation
def batchnorm_compute_tiling_wh_multi_c(self):
    with self.tik_instance.if_scope(align_wh == input_wh):  # Size(H*W*C) (C > 1) < 112 KB and 32-byte aligned: move several channels at once
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:  # thread_num=2: double-buffer acceleration
            ......
            # move several channels at once and perform BN channel by channel
            with self.tik_instance.for_range(0, single_chnum) as j:
                # apply the normalization: (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[idx1], ping_ub[idx1], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[idx1], temp_ub[idx1], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index], temp_ub, 0, 1, repeat_length // 16, 0, 0)
        with self.tik_instance.for_range(iter_num_double, iter_cnum) as i:  # handle the remaining odd tail block that cannot be double-buffered
            ......
            # perform BN channel by channel
            with self.tik_instance.for_range(0, single_chnum) as j:
                # apply the normalization: (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[idx1], ping_ub[idx1], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[idx1], temp_ub[idx1], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index], temp_ub, 0, 1, repeat_length // 16, 0, 0)
        with self.tik_instance.if_scope(res_ch_num > 0):  # handle the remaining channels
            ......
            with self.tik_instance.for_range(0, res_ch_num) as j:
                ......
                # apply the normalization: (x - mean)/var
                self.tik_instance.vec_adds(128, temp_ub[j*input_wh], ping_ub[j*input_wh], self.sclar_gamma, repeat_mask + 1, 8, 8)
                self.tik_instance.vec_muls(128, temp_ub[j*input_wh], temp_ub[j*input_wh], self.sclar_beta, repeat_mask + 1, 8, 8)
            self.tik_instance.data_move(output_gm[index2], temp_ub, 0, 1, res_repeat_length // 16, 0, 0)
    with self.tik_instance.else_scope():  # Size(H*W*C) (C > 1) < 112 KB but the feature map is not 32-byte aligned: move the channels one by one; otherwise a tail block would have to be handled on every move, hurting efficiency
        # process one channel at a time; with double buffering, two channels run in parallel
        with self.tik_instance.for_range(0, iter_num_double, thread_num=2) as i:
            ......
            # apply the normalization: (x - mean)/var
            self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 + 1, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 + 1, 8, 8)
        with self.tik_instance.for_range(iter_num_double, self.input_c) as i:  # handle the single remaining channel that cannot be double-buffered
            ......
            self.tik_instance.vec_adds(128, temp_ub, ping_ub, self.sclar_gamma, repeat_mask128 + 1, 8, 8)
            self.tik_instance.vec_muls(128, temp_ub, temp_ub, self.sclar_beta, repeat_mask128 + 1, 8, 8)
            self.tik_instance.data_move(output_gm[i * input_wh], temp_ub, 0, 1, align_wh // 16, 0, 0)
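Likewise, the number of channels moved per iteration (single_chnum) and the derived counts used above are host-side Tiling decisions. A minimal sketch under the same assumed 112 KB per-move budget; the shape values are hypothetical.
BLOCK_BYTES = 112 * 1024                             # assumed per-move UB budget
input_c, input_h, input_w = 64, 48, 48               # hypothetical shape
input_wh = input_h * input_w                         # 32-byte aligned in this branch
single_chnum = min(input_c, (BLOCK_BYTES // 2) // input_wh)  # channels per move (FP16)
iter_cnum = input_c // single_chnum                  # number of multi-channel moves
res_ch_num = input_c - iter_cnum * single_chnum      # channels left for the tail pass
iter_num_double = iter_cnum // 2 * 2                 # even part runs with thread_num=2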