This section presents a tensor-addition sample that applies the TIK optimization techniques introduced throughout this document, deepening the reader's understanding of TIK programming and optimization.
To keep the tensor-addition sample well structured and readable, this chapter organizes and implements it as a class, whose definition is outlined below.
class Vadd():
    # Receives the input data and performs the related initialization
    def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):

    # Performs the operator computation and compilation
    def vadd_compute(self):

    # Defines the computation on each AI Core
    def vadd_compute_each_core(self, move_offset, move_num):

    # Defines the tiled computation within an AI Core
    def vadd_compute_each_loop(self, move_offset, move_num):

# Used for functional and performance testing
def vadd_sample(input_x, input_y, output_z, kernel_name):
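Before the full implementation, it helps to work out by hand the tiling constants that __init__ derives for the float16 case exercised at the end of the sample. The sketch below is illustrative only: the core count and Unified Buffer size are queried at runtime via get_soc_spec, and the figures here assume Ascend310 values (2 AI Cores, 248 KB of Unified Buffer); other SoCs yield different numbers.

# Worked example of the tiling constants from Vadd.__init__ for dtype "float16"
# and shape (32, 16384); aicore_num and ub_size_bytes are assumed Ascend310
# values normally returned by get_soc_spec("CORE_NUM") / get_soc_spec("UB_SIZE")
aicore_num = 2                    # assumed CORE_NUM on Ascend310
ub_size_bytes = 248 * 1024        # assumed UB_SIZE on Ascend310, in bytes
block_byte_size = 32              # Unified Buffer access granularity in bytes
dtype_bytes_size = 16 // 8        # get_bit_len("float16") // 8 -> 2 bytes
data_each_block = block_byte_size // dtype_bytes_size        # 16 elements/block
ub_tensor_size = (ub_size_bytes // dtype_bytes_size // 2
                  // data_each_block * data_each_block)      # 63488 elements
input_num = 32 * 16384                                       # 524288 elements
data_num_each_core = input_num // aicore_num                 # 262144 elements
vector_mask_max = 8 * data_each_block                        # 128 elements
print(data_num_each_core // ub_tensor_size)                  # 4 full UB tiles
print(data_num_each_core % ub_tensor_size)                   # 8192-element tail

Under these assumptions, each AI Core calls vadd_compute_each_loop four times on full 63488-element tiles and once more on an 8192-element tail.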
The complete sample is shown below.
import math
from functools import reduce as functools_reduce
import numpy as np
from tbe import tik
import tbe.common.platform as tbe_platform
from tbe.common.utils import para_check


# Compute the number of bits occupied by a given data type, e.g. "float16" -> 16
def get_bit_len(dtype):
    index = 0
    for i in dtype:
        if i.isdigit():
            break
        index += 1
    return int(dtype[index:])


class Vadd():
    def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.shape_z = output_z.get("shape")
        self.dtype_z = output_z.get("dtype")
        self.kernel_name = kernel_name

        # Construct the TIK container with the debug feature enabled
        self.tik_instance = tik.Tik(disable_debug=False)
        # Set the Ascend AI Processor version; "Ascend310" is used by default if unset
        soc_version = "Ascend310"
        tbe_platform.set_current_compile_soc_info(soc_version, core_type="AiCore")
        # Obtain the number of AI Cores
        self.aicore_num = tbe_platform.get_soc_spec("CORE_NUM")
        # Reads and writes on the Unified Buffer must be 32-byte aligned; this value
        # is used to compute the tensor tiling and the data-move instruction parameters
        block_byte_size = 32
        # Obtain the Unified Buffer size, in bytes
        ub_size_bytes = tbe_platform.get_soc_spec("UB_SIZE")
        # Compute how many elements of the input data type fit in one block
        dtype_bytes_size = get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_byte_size // dtype_bytes_size
        # Compute how much Unified Buffer space to allocate to each of the two inputs
        # and the result (the result overlaps input_x's address, so the UB is split
        # in two), rounded down to 32-byte alignment
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 // self.data_each_block *
            self.data_each_block)
        # Compute the number of input elements
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)
        # Compute the amount of data each AI Core processes; only the even-split case
        # is considered here, with each split 32-byte aligned
        self.data_num_each_core = self.input_num // self.aicore_num
        # Each repeat of a vector instruction computes at most 8 blocks;
        # this value is the maximum mask
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_y, self.shape_y, name="input_y_gm", scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_z, self.shape_z, name="output_z_gm", scope=tik.scope_gm)

    def vadd_compute(self):
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # Create the Unified Buffer tensors for the two inputs
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub", scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub", scope=tik.scope_ubuf)
            # Data is moved from GM to the Unified Buffer; each core's offset is
            # the number of elements handled by the cores before it
            move_offset = index * self.data_num_each_core
            # Each AI Core computes the data slice it is responsible for
            self.vadd_compute_each_core(move_offset, self.data_num_each_core)
        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])
        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        loop_time = move_num // self.ub_tensor_size
        move_offset_init = move_offset
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                move_offset += loop_index * self.ub_tensor_size
                self.vadd_compute_each_loop(move_offset, self.ub_tensor_size)
            move_offset = move_offset_init + loop_time * self.ub_tensor_size
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        # Compute the burst_len of each data move
        burst_len = math.ceil(move_num / self.data_each_block)
        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
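        # vec_add processes up to vector_mask_max elements per repeat and at
        # most 255 repeats per call, so the move_num elements are computed in
        # three steps: whole 255-repeat calls, one call covering the remaining
        # full repeats, and a final call whose mask covers the tail elements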
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                          self.input_x_ub[add_offset],
                                          self.input_x_ub[add_offset],
                                          self.input_y_ub[add_offset],
                                          255, 8, 8, 8)
            add_offset = vadd_loop * self.vector_mask_max * 255
        repeat_time = (
            move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      1, 8, 8, 8)
        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub, 0, 1, burst_len, 0, 0)


@para_check.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name="vadd_sample"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    tik_instance
    """
    vadd_instance = Vadd(input_x, input_y, output_z, kernel_name)
    tik_instance = vadd_instance.vadd_compute()
    return tik_instance


if __name__ == "__main__":
    tik_instance = vadd_sample({"shape": (32, 16384), "dtype": "float16"},
                               {"shape": (32, 16384), "dtype": "float16"},
                               {"shape": (32, 16384), "dtype": "float16"},
                               "vadd_32_16384_float16")
    input_x = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
    input_y = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
    feed_dict = {'input_x_gm': input_x, 'input_y_gm': input_y}
    result = tik_instance.tikdb.start_debug(feed_dict, interactive=True)
    print(result)
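Once start_debug returns, the functional result can be checked against a NumPy reference. The lines below are a minimal sketch meant to be appended inside the __main__ block; they assume start_debug returns the data of the single output tensor output_z_gm as a NumPy array (the print(result) above makes the same assumption), and the float16 tolerances are illustrative choices, not values mandated by TIK.

    # Hypothetical correctness check appended after print(result); `result` is
    # assumed to hold the output_z_gm data as a NumPy array, and the tolerances
    # are illustrative values for float16
    expected = input_x + input_y
    if np.allclose(result, expected, rtol=1e-3, atol=1e-3):
        print("vadd_sample matches the NumPy reference")
    else:
        print("vadd_sample deviates from the NumPy reference")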