Enabling Iteration Offload in sess.run Mode
Automated porting
In sess.run mode, set the number of iterations per loop by calling set_iteration_per_loop, and reduce the number of sess.run() calls according to the following formula: sess.run() call count = original sess.run() call count / iterations_per_loop.
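For example, if the original script called sess.run() 1,000 times and iterations_per_loop is set to 10, the ported script should call it 100 times. A minimal sketch of the arithmetic (the figures are illustrative, not taken from the sample below):

iterations_per_loop = 10
original_call_count = 1000  # sess.run() calls in the original script (assumed for illustration)
ported_call_count = original_call_count // iterations_per_loop  # 100 calls after porting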
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)
import tensorflow as tf

# Set up the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training epochs.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set how often to display the loss.
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables.
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# Disable mixed computing (disabled by default).
custom_op.parameter_map["mix_compile_mode"].b = False
# If the GetNext operator exists on the network, offload it.
# GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["enable_data_pre_proc"].b = True
# Determines whether training iterations are offloaded.
# Must be equal to the value passed to set_iteration_per_loop.
custom_op.parameter_map["iterations_per_loop"].i = 10
config = npu_config_proto(config_proto=config)

# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    # sess is the created TensorFlow session, optimizer is the defined gradient update
    # operation, and 10 is the number of training iterations run on the device per loop.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
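With this configuration, each sess.run([train_op, cost]) call drives 10 training iterations on the device before control returns to the host; this is the basis for the call-count formula above.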
The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, because it is frozen or the session is created using tf.train.Supervisor), you cannot use set_iteration_per_loop to set the number of iterations per loop. Instead, use create_iteration_per_loop_var and load_iteration_per_loop_var, as shown in the following example:
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)
import tensorflow as tf

# Set up the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training epochs.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set how often to display the loss.
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables.
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# Disable mixed computing (disabled by default).
custom_op.parameter_map["mix_compile_mode"].b = False
# If the GetNext operator exists on the network, offload it.
# GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["enable_data_pre_proc"].b = True
# Used for functional validation.
# Must be equal to the value passed to load_iteration_per_loop_var.
custom_op.parameter_map["iterations_per_loop"].i = 10
config = npu_config_proto(config_proto=config)

# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    # Define the IterationPerLoop object.
    iteration = util.IterationPerLoop()
    # optimizer is the defined gradient update operation.
    train_op = iteration.create_iteration_per_loop_var(optimizer)
    # Freeze the graph.
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)
    # Set the number of iterations per loop. sess is the created TensorFlow session,
    # and 10 is the number of training iterations run on the device per loop.
    iteration.load_iteration_per_loop_var(sess, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
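This variant works on a frozen graph because create_iteration_per_loop_var adds the required per-loop variable to the graph before the graph is frozen, while load_iteration_per_loop_var only assigns a value to that variable at run time and therefore needs no further graph modification.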
Manual porting
In sess.run mode, set the number of iterations per loop by calling set_iteration_per_loop, and reduce the number of sess.run() calls according to the following formula: sess.run() call count = original sess.run() call count / iterations_per_loop. See the following example:
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)
import tensorflow as tf

# Set up the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training epochs.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set how often to display the loss.
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables.
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# Perform training on the Ascend AI Processor.
custom_op.parameter_map["use_off_line"].b = True
# Disable mixed computing (disabled by default).
custom_op.parameter_map["mix_compile_mode"].b = False
# If the GetNext operator exists on the network, offload it.
# GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["enable_data_pre_proc"].b = True
# Determines whether training iterations are offloaded.
# Must be equal to the value passed to set_iteration_per_loop.
custom_op.parameter_map["iterations_per_loop"].i = 10
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    # sess is the created TensorFlow session, optimizer is the defined gradient update
    # operation, and 10 is the number of training iterations run on the device per loop.
    train_op = util.set_iteration_per_loop(sess, optimizer, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
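Compared with the automated script, this manually ported script additionally sets use_off_line to True so that training runs on the Ascend AI Processor, and it disables the remapping and memory_optimization rewriters explicitly instead of wrapping the config with npu_config_proto.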
The set_iteration_per_loop API modifies the graph. If the graph cannot be modified (for example, because it is frozen or the session is created using tf.train.Supervisor), you cannot use set_iteration_per_loop to set the number of iterations per loop. Instead, use create_iteration_per_loop_var and load_iteration_per_loop_var, as shown in the following example:
from __future__ import print_function
import input_data
from npu_bridge.npu_init import *
mnist = input_data.read_data_sets("/test/", one_hot=True)
import tensorflow as tf

# Set up the model.
# Set the learning rate.
learning_rate = 0.01
# Set the number of training epochs.
training_epochs = 10
# Set the batch size.
batch_size = 100
# Set how often to display the loss.
display_step = 1

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

# Set the model parameters.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct the model.
pred = tf.nn.softmax(tf.matmul(x, W) + b)

# Define the loss function: cross entropy.
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))

# Update gradients.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initialize all variables.
init = tf.global_variables_initializer()

config = tf.ConfigProto(allow_soft_placement=True)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
# Perform training on the Ascend AI Processor.
custom_op.parameter_map["use_off_line"].b = True
# Disable mixed computing (disabled by default).
custom_op.parameter_map["mix_compile_mode"].b = False
# If the GetNext operator exists on the network, offload it.
# GetNext operator offload is a prerequisite for iteration offload.
custom_op.parameter_map["enable_data_pre_proc"].b = True
# Determines whether training iterations are offloaded.
# Must be equal to the value passed to load_iteration_per_loop_var.
custom_op.parameter_map["iterations_per_loop"].i = 10
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

# Train the model.
with tf.Session(config=config) as sess:
    sess.run(init)
    # Set the number of iterations per loop to 10 in sess.run mode.
    # Define the IterationPerLoop object.
    iteration = util.IterationPerLoop()
    # optimizer is the defined gradient update operation.
    train_op = iteration.create_iteration_per_loop_var(optimizer)
    # Freeze the graph.
    tf.train.Supervisor(logdir="/home/xxxx", init_op=init)
    # Set the number of iterations per loop. sess is the created TensorFlow session,
    # and 10 is the number of training iterations run on the device per loop.
    iteration.load_iteration_per_loop_var(sess, 10)
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
            avg_cost += c / total_batch
After changing the number of iterations per loop, you are advised to adjust other loop-related parameters accordingly, such as settings derived from the time of a single step, and to update the iteration counts in your training loop.
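A minimal sketch of such an adjustment, assuming iterations_per_loop = 10 and the MNIST loop from the examples above: each sess.run() call now performs 10 training iterations on the device, so the inner loop should make one tenth as many calls (in practice the input data should also flow through the offloaded GetNext operator rather than feed_dict, since GetNext offload is a prerequisite for iteration offload):

iterations_per_loop = 10
total_batch = int(mnist.train.num_examples / batch_size)
# One sess.run() call now covers iterations_per_loop training iterations.
for i in range(total_batch // iterations_per_loop):
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    _, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})
    avg_cost += c / total_batch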
Checking Whether iterations_per_loop Takes Effect
After iteration offload is enabled, check whether the keyword "Insert op success" appears in the host INFO log to confirm that iterations_per_loop has taken effect.
You can run the following command to set the log level on the host to INFO. The default output path of INFO logs is $HOME/ascend/log/run/plog/.
export ASCEND_GLOBAL_LOG_LEVEL=1    # 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR
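After a training run, you can search the log directory for the keyword, for example (assuming logs were written to the default path):

grep -r "Insert op success" $HOME/ascend/log/run/plog/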