Enabling Iteration Offload in sess.run Mode

Automated porting

In sess.run mode, set the number of iterations per loop by calling set_iteration_per_loop, and reduce the number of sess.run() calls according to the following formula: sess.run() call count = original sess.run() call count / iterations_per_loop.
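
For example, a minimal sketch of the arithmetic with hypothetical call counts (the values are illustrative, not taken from the scripts below):

original_run_count = 1000  # sess.run() calls before enabling iteration offload
iterations_per_loop = 10   # Training iterations executed on the device per sess.run() call
new_run_count = original_run_count // iterations_per_loop  # 1000 / 10 = 100 calls afterwards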

Find npu_config_proto in the script, configure iterations_per_loop in the session configuration, and set the number of iterations per loop using set_iteration_per_loop, as shown in the following code:
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
 
mnist = input_data.read_data_sets("/test/", one_hot=True)
 
import tensorflow as tf
 
# Set the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training iterations.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set the number of iterations to display the loss.
display_step = 1
 
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
 
# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
 
# Compile the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)
 
# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
 
# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
 
# Initialize all variables.
init = tf.global_variables_initializer()
 
config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["mix_compile_mode"].b = False  # Disable mixed computing (disabled by default).
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the GetNext operator exists on the network, offload it. GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["iterations_per_loop"].i = 10  # Controls whether training iterations are offloaded. Must be equal to the value passed to set_iteration_per_loop.
config = npu_config_proto(config_proto=config)
 
# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10)  # sess indicates the created TensorFlow session, optimizer indicates the defined gradient update operation, and 10 indicates the number of training iterations on the device.
 
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
 
        for i in range(int(total_batch / 10)):  # Divide the sess.run() call count by iterations_per_loop (10).
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
 
            avg_cost += c / total_batch

The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, because it is frozen or the session is created using tf.train.Supervisor), you cannot use set_iteration_per_loop to set the number of iterations per loop. Instead, use create_iteration_per_loop_var and load_iteration_per_loop_var, as shown in the following code:

from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
 
mnist = input_data.read_data_sets("/test/", one_hot=True)
 
import tensorflow as tf
 
# Set the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training iterations.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set the number of iterations to display the loss.
display_step = 1
 
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
 
# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
 
# Compile the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)
 
# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
 
# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
 
# Initialize all variables.
init = tf.global_variables_initializer()
 
config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["mix_compile_mode"].b = False  # Disable mixed computing (disabled by default).
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the GetNext operator exists on the network, offload it. GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["iterations_per_loop"].i = 10  # Used for verification. Must be equal to the value passed to load_iteration_per_loop_var.
config = npu_config_proto(config_proto=config)
 
# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    iteration = util.IterationPerLoop()  # Instantiate the IterationPerLoop class.
    train_op = iteration.create_iteration_per_loop_var(optimizer)  # optimizer is the gradient update operation defined above.
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)  # Creating a Supervisor finalizes (freezes) the graph.
    iteration.load_iteration_per_loop_var(sess, 10)  # Set the number of iterations per loop. sess is the TensorFlow session created above, and 10 is the number of training iterations executed on the device per sess.run() call.

    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
 
        for i in range(int(total_batch / 10)):  # Divide the sess.run() call count by iterations_per_loop (10).
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
 
            avg_cost += c / total_batch

Manual porting

In sess.run mode, set the number of iterations per loop by calling set_iteration_per_loop, and reduce the number of sess.run() calls according to the same formula: sess.run() call count = original sess.run() call count / iterations_per_loop. See the following code:

from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
 
mnist = input_data.read_data_sets("/test/", one_hot=True)
 
import tensorflow as tf
 
# Set the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training iterations.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set the number of iterations to display the loss.
display_step = 1
 
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
 
# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
 
# Compile the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)
 
# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
 
# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
 
# Initialize all variables.
init = tf.global_variables_initializer()
 
config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True  # Perform training on the Ascend AI Processor.
custom_op.parameter_map["mix_compile_mode"].b = False  # Disable mixed computing (disabled by default).
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the GetNext operator exists on the network, offload it. GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["iterations_per_loop"].i = 10  # Controls whether training iterations are offloaded. Must be equal to the value passed to set_iteration_per_loop.
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
 
# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10) # sess indicates the created TensorFlow session, optimizer indicates the defined gradient update operation, and 10 indicates the number of training iterations on the device.
 
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
 
        for i in range(int(total_batch / 10)):  # Divide the sess.run() call count by iterations_per_loop (10).
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
 
            avg_cost += c / total_batch

The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, because it is frozen or the session is created using tf.train.Supervisor), you cannot use set_iteration_per_loop to set the number of iterations per loop. Instead, use create_iteration_per_loop_var and load_iteration_per_loop_var, as shown in the following code:

from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
 
mnist = input_data.read_data_sets("/test/", one_hot=True)
 
import tensorflow as tf
 
# Set the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training iterations.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set the number of iterations to display the loss.
display_step = 1
 
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
 
# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
 
# Compile the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)
 
# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
 
# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
 
# Initialize all variables.
init = tf.global_variables_initializer()
 
config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True  # Perform training on the Ascend AI Processor.
custom_op.parameter_map["mix_compile_mode"].b = False  # Disable mixed computing (disabled by default).
custom_op.parameter_map["enable_data_pre_proc"].b = True  # If the GetNext operator exists on the network, offload it. GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["iterations_per_loop"].i = 10  # Controls whether training iterations are offloaded. Must be equal to the value passed to load_iteration_per_loop_var.
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    iteration = util.IterationPerLoop()  # Instantiate the IterationPerLoop class.
    train_op = iteration.create_iteration_per_loop_var(optimizer)  # optimizer is the gradient update operation defined above.
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)  # Creating a Supervisor finalizes (freezes) the graph.
    iteration.load_iteration_per_loop_var(sess, 10)  # Set the number of iterations per loop. sess is the TensorFlow session created above, and 10 is the number of training iterations executed on the device per sess.run() call.

    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
 
        for i in range(int(total_batch / 10)):  # Divide the sess.run() call count by iterations_per_loop (10).
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
 
            avg_cost += c / total_batch

After changing the number of iterations per loop, you are advised to adjust other loop-related settings to match, for example, how single-step time is measured and how iteration counters are updated, because each sess.run() call now executes multiple training iterations on the device.
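
For example, a minimal sketch of such an adjustment (display_step is the loss-display interval from the scripts above; the rescaling policy is illustrative):

iterations_per_loop = 10
display_step = 1  # Original interval, counted in training iterations.
# After offload, each sess.run() call covers iterations_per_loop iterations,
# so rescale the interval to keep roughly the same display frequency.
display_step = max(1, display_step // iterations_per_loop)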

Checking Whether iterations_per_loop Takes Effect

After iteration offload is enabled, check whether the keyword "Insert op success" appears in the INFO log on the host to determine whether iterations_per_loop has taken effect.

Run the following command to set the host log level to INFO. By default, INFO logs are written to $HOME/ascend/log/run/plog/.

export ASCEND_GLOBAL_LOG_LEVEL=1
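
To locate the keyword, the following is a minimal Python sketch that scans the default plog directory given above (adjust the path if your logs are stored elsewhere):

import glob
import os

pattern = os.path.expanduser("~/ascend/log/run/plog/*")
for path in glob.glob(pattern):
    if not os.path.isfile(path):
        continue
    with open(path, errors="ignore") as f:
        if "Insert op success" in f.read():
            print("iterations_per_loop took effect, see:", path)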