For Advanced
The sample in this section implements tensor addition. It applies the TIK optimization techniques described in the preceding sections, including multi-core scheduling, data tiling, and handling of the vector instruction repeat limits, to deepen your understanding of TIK programming and optimization.
To keep the sample structure clear and readable, the sample is organized as a class. The class is outlined as follows.
class Vadd():
# Receive data and complete initialization.
def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
# Compute and build the operator.
def vadd_compute(self):
# Define the operations on each AI Core.
def vadd_compute_each_core(self, move_offset, move_num):
# Define the tiled compute process on each AI Core.
def vadd_compute_each_loop(self, move_offset, move_num):
# For function and performance testing.
def vadd_sample(input_x, input_y, output_z, kernel_name):
The complete sample code is as follows.
import math
from functools import reduce as functools_reduce
import numpy as np
from tbe import tik
import tbe.common.platform as tbe_platform
from tbe.common.utils import para_check
# Calculate the number of bits occupied by each data type.
def get_bit_len(dtype):
index = 0
for i in dtype:
if i.isdigit():
break
index += 1
return int(dtype[index:])
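# For example, get_bit_len("float16") returns 16 and get_bit_len("int32") returns 32.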
class Vadd():
def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
self.shape_x = input_x.get("shape")
self.dtype_x = input_x.get("dtype")
self.shape_y = input_y.get("shape")
self.dtype_y = input_y.get("dtype")
self.shape_z = output_z.get("shape")
self.dtype_z = output_z.get("dtype")
self.kernel_name = kernel_name
# Construct a TIK container and enable the debugging function.
self.tik_instance = tik.Tik(disable_debug=False)
# Set this parameter based on the Ascend AI Processor version in use.
soc_version = "xxx"
tbe_platform.set_current_compile_soc_info(soc_version, core_type="AiCore")
# Obtain the number of AI Cores.
self.aicore_num = tbe_platform.get_soc_spec("CORE_NUM")
# Data read from and written to the Unified Buffer (UB) must be 32-byte aligned. This block size is used to compute tensor tiling and data movement parameters.
block_byte_size = 32
# Obtain the UB size in bytes.
ub_size_bytes = tbe_platform.get_soc_spec("UB_SIZE")
# Compute the number of elements per block based on the input data type.
dtype_bytes_size = get_bit_len(self.dtype_x) // 8
self.data_each_block = block_byte_size // dtype_bytes_size
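# For example, float16 occupies 2 bytes, so data_each_block = 32 // 2 = 16 elements per block.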
# Split the UB between the two input tensors. The result reuses the input_x buffer (address overlapping), so only two UB tensors are needed; the size is rounded down to a 32-byte-aligned element count.
self.ub_tensor_size = (
ub_size_bytes // dtype_bytes_size // 2 // self.data_each_block *
self.data_each_block)
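# For example, with a 256 KB UB and float16 input: 262144 // 2 // 2 // 16 * 16 = 65536 elements per UB tensor.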
# Compute the number of input elements.
self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)
# Compute the number of elements scheduled to each AI Core. This sample assumes the total element count divides evenly among the cores and that each core's share remains 32-byte aligned.
self.data_num_each_core = self.input_num // self.aicore_num
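# For example, the (32, 16384) float16 input used below has 524288 elements; assuming 32 AI Cores, each core processes 16384 elements.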
# A vector instruction processes at most eight blocks per repeat, so this is the maximum mask value, that is, the maximum number of elements handled in one repeat.
self.vector_mask_max = 8 * self.data_each_block
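# For float16 (16 elements per block), vector_mask_max = 8 * 16 = 128 elements per repeat.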
self.input_x_gm = self.tik_instance.Tensor(
self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
self.input_y_gm = self.tik_instance.Tensor(
self.dtype_y, self.shape_y, name="input_y_gm", scope=tik.scope_gm)
self.output_z_gm = self.tik_instance.Tensor(
self.dtype_z, self.shape_z, name="output_z_gm", scope=tik.scope_gm)
def vadd_compute(self):
with self.tik_instance.for_range(
0, self.aicore_num, block_num=self.aicore_num) as index:
# Create two input tensors in the UB.
self.input_x_ub = self.tik_instance.Tensor(
self.dtype_x, (self.ub_tensor_size,),
name="input_x_ub",
scope=tik.scope_ubuf)
self.input_y_ub = self.tik_instance.Tensor(
self.dtype_y, (self.ub_tensor_size,),
name="input_y_ub",
scope=tik.scope_ubuf)
# Compute the GM offset for this core; each core processes its own contiguous slice of data_num_each_core elements.
move_offset = index * self.data_num_each_core
# Process the data tiles scheduled to this AI Core.
self.vadd_compute_each_core(move_offset, self.data_num_each_core)
self.tik_instance.BuildCCE(
kernel_name=self.kernel_name,
inputs=[self.input_x_gm, self.input_y_gm],
outputs=[self.output_z_gm])
return self.tik_instance
def vadd_compute_each_core(self, move_offset, move_num):
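# Tile this core's data into chunks that fit into one UB tensor (ub_tensor_size elements each), then process the remainder.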
loop_time = move_num // self.ub_tensor_size
move_offset_init = move_offset
if loop_time > 0:
with self.tik_instance.for_range(0, loop_time) as loop_index:
move_offset += loop_index * self.ub_tensor_size
self.vadd_compute_each_loop(move_offset, self.ub_tensor_size)
move_offset = move_offset_init + loop_time * self.ub_tensor_size
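# Process the tail data that does not fill a whole UB tensor.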
last_num = move_num % self.ub_tensor_size
if last_num > 0:
self.vadd_compute_each_loop(move_offset, last_num)
def vadd_compute_each_loop(self, move_offset, move_num):
# Compute burst_len (in units of 32-byte blocks) for each data movement.
burst_len = math.ceil(move_num / self.data_each_block)
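# For example, moving 16384 float16 elements takes ceil(16384 / 16) = 1024 blocks per burst.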
self.tik_instance.data_move(self.input_x_ub,
self.input_x_gm[move_offset], 0, 1,
burst_len, 0, 0)
self.tik_instance.data_move(self.input_y_ub,
self.input_y_gm[move_offset], 0, 1,
burst_len, 0, 0)
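# A single vec_add call supports at most 255 repeats, so the data is processed in three stages:
# full 255-repeat loops, the remaining whole repeats (repeat_time), and a masked tail of last_num elements.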
vadd_loop = move_num // (self.vector_mask_max * 255)
add_offset = 0
if vadd_loop > 0:
with self.tik_instance.for_range(0, vadd_loop) as add_index:
add_offset = add_index * self.vector_mask_max * 255
self.tik_instance.vec_add(self.vector_mask_max,
self.input_x_ub[add_offset],
self.input_x_ub[add_offset],
self.input_y_ub[add_offset],
255, 8, 8, 8)
add_offset = vadd_loop * self.vector_mask_max * 255
repeat_time = (
move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
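# For example, 16384 elements with a 128-element mask give repeat_time = 128 remaining repeats (vadd_loop is 0 in that case).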
if repeat_time > 0:
self.tik_instance.vec_add(self.vector_mask_max,
self.input_x_ub[add_offset],
self.input_x_ub[add_offset],
self.input_y_ub[add_offset],
repeat_time, 8, 8, 8)
add_offset += repeat_time * self.vector_mask_max
last_num = move_num % self.vector_mask_max
if last_num > 0:
self.tik_instance.vec_add(last_num,
self.input_x_ub[add_offset],
self.input_x_ub[add_offset],
self.input_y_ub[add_offset],
1, 8, 8, 8)
self.tik_instance.data_move(self.output_z_gm[move_offset],
self.input_x_ub, 0, 1, burst_len, 0, 0)
@para_check.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
"""
Compute the element-wise sum of input_x and input_y.
Parameters
----------
input_x : dict
shape and dtype of input
input_y : dict
shape and dtype of input
output_z : dict
shape and dtype of the output; must have the same shape and dtype as the inputs
kernel_name : str
kernel name, default value is "vadd_sample"
Returns
-------
None
"""
vadd_instance = Vadd(input_x, input_y, output_z, kernel_name)
tik_instance = vadd_instance.vadd_compute()
return tik_instance
if __name__ == "__main__":
tik_instance = vadd_sample({"shape":(32, 16384), "dtype":"float16"},
{"shape":(32, 16384), "dtype":"float16"},
{"shape":(32, 16384), "dtype":"float16"},
"vadd_32_16384_float16")
input_x = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
input_y = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
feed_dict = {
'input_x_gm': input_x,
'input_y_gm': input_y
}
result = tik_instance.tikdb.start_debug(feed_dict, interactive=True)
print(result)
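For a quick functional check, the debug output can be compared against a NumPy reference. The snippet below is a minimal sketch that assumes the result returned by start_debug holds the values of output_z_gm as a NumPy array; depending on the TIK version, it may first need to be unwrapped from a dict or list, and the reshape uses the sample's input shape.
# Minimal verification sketch (assumption: "result" holds the output_z_gm values).
expected = input_x.astype(np.float32) + input_y.astype(np.float32)
actual = np.asarray(result).astype(np.float32).reshape(expected.shape)
# The kernel output should match the NumPy reference within float16 rounding error.
print("max abs error:", np.max(np.abs(actual - expected)))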