PyTorch-specific Performance Data Collection
For PyTorch, use the torch_npu.profiler API to collect performance data. After collection completes, the performance data is parsed automatically. For details, see "msprof Collection Commands" in the CANN Performance Tuning Tool User Guide.
The following is a complete example:
import torch
import torch_npu
# Target device for the example; the tensors and the model below are moved to it.
device = torch.device("npu")
class DemoModel(torch.nn.Module):
    """Small parameter-free module used as the workload to profile.

    It chains a few element-wise operations with slicing and a
    concatenation so that the collected profile contains several
    distinct operators.
    """

    def forward(self, in0, in1):
        """Combine two tensors and return a column-wise mix of the results.

        Args:
            in0: First input tensor; indexed as ``[:, :2]`` below, so it
                must have at least two dimensions.
            in1: Second input tensor, multiplied element-wise with ``in0``.

        Returns:
            A tensor whose first two columns come from
            ``(in0 * in1) ** 2 - in0`` and whose remaining columns come
            from ``(in0 * in1) ** 2 - in1``, concatenated along dim 1.
        """
        mul0 = in0 * in1
        # The square is intentionally computed twice, exactly as in the
        # original example, so the profiled operator sequence is unchanged.
        sub0 = mul0 ** 2 - in0
        sub1 = mul0 ** 2 - in1
        slice0 = sub0[:, :2]
        slice1 = sub1[:, 2:]
        cat0 = torch.cat([slice0, slice1], dim=1)
        return cat0
def main():
    """Run the demo model once, print its output, then profile ten more runs.

    The first forward pass is executed outside the profiler and its
    shape, device and values are printed. Afterwards the same forward
    pass is repeated inside a ``torch_npu.profiler.profile`` context,
    calling ``prof.step()`` after every iteration so the schedule can
    advance. The trace handler is configured with the ``./result``
    directory.
    """
    torch.manual_seed(2025)
    in0 = torch.randn(3, 5).to(device)
    in1 = torch.randn(3, 5).to(device)
    model = DemoModel().to(device)

    output = model(in0, in1)
    print(output.shape)
    print(output.device)
    print(output)

    # perf code
    experimental_config = torch_npu.profiler._ExperimentalConfig(
        export_type=[
            torch_npu.profiler.ExportType.Text,
            torch_npu.profiler.ExportType.Db,
        ],
        profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
        msprof_tx=False,
        aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone,
        l2_cache=False,
        op_attr=False,
        data_simplification=False,
        record_op_args=False,
        gc_detect_threshold=None,
    )

    steps = 10
    with torch_npu.profiler.profile(
        activities=[
            torch_npu.profiler.ProfilerActivity.CPU,
            torch_npu.profiler.ProfilerActivity.NPU,
        ],
        # NOTE(review): assuming torch.profiler.schedule semantics, this skips
        # 1 step, waits 3 steps, then records 1 step, once - confirm against
        # the torch_npu documentation.
        schedule=torch_npu.profiler.schedule(
            wait=3, warmup=0, active=1, repeat=1, skip_first=1
        ),
        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result"),
        record_shapes=False,
        profile_memory=False,
        with_stack=False,
        with_modules=False,
        with_flops=False,
        experimental_config=experimental_config,
    ) as prof:
        print("profiling start...")
        for _ in range(steps):
            output = model(in0, in1)
            # Tell the profiler one iteration finished so the schedule advances.
            prof.step()
# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    main()
Parent topic: msprof