For Advanced Users

The sample in this section implements tensor addition. The preceding TIK optimization mechanisms are applied in this sample to enhance your understanding of TIK programming and optimization.

To make the sample structure clear and readable, this sample is organized and implemented in the format of a class. The class is defined as follows.

class Vadd():
       # Receive data and complete initialization.
       def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
       
       # Compute and build the operator.
       def vadd_compute(self):
       
       # Define the operations on each AI Core.
       def vadd_compute_each_core(self, move_offset, move_num):
       
       # Define the tiled compute process on each AI Core.
       def vadd_compute_each_loop(self, move_offset, move_num):
       
# For function and performance testing (a module-level entry function,
# not a method of the class).
def vadd_sample(input_x, input_y, output_z, kernel_name):

The complete sample code is as follows.

import math
from functools import reduce as functools_reduce
import numpy as np
from tbe import tik
import tbe.common.platform as tbe_platform
from tbe.common.utils import para_check

# Calculate the number of bits occupied by each data type.
def get_bit_len(dtype):
    """Return the bit width encoded in a dtype name, e.g. "float16" -> 16.

    Scans past the alphabetic prefix and parses the trailing digits.
    Raises ValueError if the string contains no digits.
    """
    prefix_len = 0
    while prefix_len < len(dtype) and not dtype[prefix_len].isdigit():
        prefix_len += 1
    return int(dtype[prefix_len:])

class Vadd():
    """Element-wise tensor addition (z = x + y) implemented with TIK.

    The flattened input is split evenly across the available AI Cores;
    each core processes its slice in UB-sized tiles: move GM -> UB,
    vector add in UB (the result overwrites the x buffer), move UB -> GM.
    """

    def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
        """Capture the tensor descriptions and derive tiling parameters.

        Args:
            input_x: dict with keys "shape" and "dtype" describing input x.
            input_y: dict with keys "shape" and "dtype" describing input y.
            output_z: dict with keys "shape" and "dtype" describing the
                output; expected to match the inputs (not validated here).
            kernel_name: kernel name passed to BuildCCE.
        """
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.shape_z = output_z.get("shape")
        self.dtype_z = output_z.get("dtype")
        self.kernel_name = kernel_name
        # Construct a TIK container and enable the debugging function.
        self.tik_instance = tik.Tik(disable_debug=False)

        # Placeholder: set this to the Ascend AI Processor version in use
        # before building the kernel.
        soc_version="xxx"
        tbe_platform.set_current_compile_soc_info(soc_version,core_type="AiCore")

        # Obtain the number of AI Cores.
        self.aicore_num = tbe_platform.get_soc_spec("CORE_NUM")
        # Data read from / written to the UB must be 32-byte aligned; this
        # value drives the tensor tiling and data-move burst computation.
        block_byte_size = 32

        # Obtain the UB size in bytes.
        ub_size_bytes = tbe_platform.get_soc_spec("UB_SIZE")

        # Number of elements per 32-byte block for this dtype.
        dtype_bytes_size = get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_byte_size // dtype_bytes_size

        # Split the UB between the two input buffers (the result reuses the
        # x buffer, so only two tensors live in UB at once), rounded down to
        # a multiple of data_each_block for 32-byte alignment.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 // self.data_each_block *
            self.data_each_block)

        # Total number of input elements (product of the shape dims).
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)

        # Elements scheduled to each AI Core.
        # NOTE(review): assumes input_num is a multiple of aicore_num (and
        # of data_each_block); any remainder would be left unprocessed —
        # confirm against the supported input shapes.
        self.data_num_each_core = self.input_num // self.aicore_num

        # A vector instruction computes at most 8 blocks per iteration
        # repeat, so this is the maximum value of mask.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_y, self.shape_y, name="input_y_gm", scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_z, self.shape_z, name="output_z_gm", scope=tik.scope_gm)

    def vadd_compute(self):
        """Emit the multi-core compute graph and build the kernel.

        Returns:
            The TIK container after BuildCCE, ready to debug or run.
        """
        # block_num=self.aicore_num makes this loop the block-level (core)
        # parallel dimension.
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # Create the two input tensors in the UB.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub",
                scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub",
                scope=tik.scope_ubuf)

            # Each core starts at its own offset into the flattened input.
            move_offset = index * self.data_num_each_core

            # Process the data tile scheduled to this AI Core.
            self.vadd_compute_each_core(move_offset, self.data_num_each_core)

        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])

        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        """Tile one core's slice into UB-sized chunks and process each.

        Args:
            move_offset: element offset of this core's slice in GM.
            move_num: number of elements this core must process.
        """
        loop_time = move_num // self.ub_tensor_size
        move_offset_init = move_offset
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                move_offset += loop_index * self.ub_tensor_size
                self.vadd_compute_each_loop(move_offset, self.ub_tensor_size)
            # Rebase the offset past the full tiles for the tail below.
            move_offset = move_offset_init + loop_time * self.ub_tensor_size

        # Tail chunk smaller than one full UB tile.
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        """Process one UB tile: move in, vector-add, move the result out.

        Args:
            move_offset: element offset of this tile in GM.
            move_num: number of elements in this tile (<= ub_tensor_size).
        """
        # Burst length of each data movement, in 32-byte blocks (round up).
        burst_len = math.ceil(move_num / self.data_each_block)

        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        # vec_add accepts at most 255 repeats per call; first loop over
        # full 255-repeat batches.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                       self.input_x_ub[add_offset],
                                       self.input_x_ub[add_offset],
                                       self.input_y_ub[add_offset], 
                                       255, 8, 8, 8)
            add_offset = vadd_loop * self.vector_mask_max * 255
        # Remaining full-mask repeats (fewer than 255 of them).
        repeat_time = (
            move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset], 
                                   repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max
        # Final partial repeat with a reduced mask. Note the destination is
        # input_x_ub: the sum overwrites the x buffer (address reuse).
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num, 
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset], 
                                   1, 8, 8, 8)

        # Move the result (held in input_x_ub) back to GM.
        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub, 0, 1, burst_len, 0, 0)


@para_check.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """Entry point: build the vadd kernel for the given tensor descriptions.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    The TIK container with the built kernel.
    """
    return Vadd(input_x, input_y, output_z, kernel_name).vadd_compute()


if __name__ == "__main__":
    # Build the kernel for a fixed (32, 16384) float16 case.
    shape = (32, 16384)
    desc = {"shape": shape, "dtype": "float16"}
    tik_instance = vadd_sample(dict(desc), dict(desc), dict(desc),
                               "vadd_32_16384_float16")

    # Random host-side inputs for the functional debug run.
    input_x = np.random.uniform(-10, 5, shape).astype(np.float16)
    input_y = np.random.uniform(-10, 5, shape).astype(np.float16)
    feed_dict = {
        'input_x_gm': input_x,
        'input_y_gm': input_y
    }
    result = tik_instance.tikdb.start_debug(feed_dict, interactive=True)
    print(result)