PyTorch专用的性能采集
PyTorch场景推荐使用torch_npu.profiler接口进行性能采集,采集完成后会自动解析性能数据,详细请参考《CANN 性能调优工具用户指南》中的“msprof采集通用命令”。
完整示例如下:
"""Minimal end-to-end example of profiling a PyTorch model with torch_npu.profiler.

The script runs a tiny model once as a sanity check, then runs it for a fixed
number of steps under ``torch_npu.profiler.profile`` and hands the collected
trace to a TensorBoard trace handler that writes to ``./result``.
"""
import torch
import torch_npu

# All tensors and the model are moved to the NPU device.
device = torch.device("npu")


class DemoModel(torch.nn.Module):
    """Small parameter-free model made only of element-wise ops, slices and a cat."""

    def forward(self, in0, in1):
        """Combine two tensors of the same shape and return the result.

        Args:
            in0: First input tensor (2-D in this example).
            in1: Second input tensor, multiplied element-wise with ``in0``.

        Returns:
            The first two columns of ``(in0 * in1) ** 2 - in0`` concatenated
            along dim 1 with the remaining columns of ``(in0 * in1) ** 2 - in1``.
            For the (3, 5) inputs used in ``main`` the result is (3, 5).
        """
        mul0 = in0 * in1
        # The square is deliberately computed twice: this is a profiling demo,
        # and folding the two expressions would change the operator trace.
        sub0 = mul0 ** 2 - in0
        sub1 = mul0 ** 2 - in1
        slice0 = sub0[:, :2]
        slice1 = sub1[:, 2:]
        cat0 = torch.cat([slice0, slice1], dim=1)
        return cat0


def main():
    """Run the demo model once, then profile it for ``steps`` iterations."""
    torch.manual_seed(2025)
    in0 = torch.randn(3, 5).to(device)
    in1 = torch.randn(3, 5).to(device)
    model = DemoModel().to(device)

    # Plain (unprofiled) run to show the model output on the device.
    output = model(in0, in1)
    print(output.shape)
    print(output.device)
    print(output)

    # perf code
    # Extended profiler options; every optional collector is switched off and
    # results are exported in both text and db form.
    experimental_config = torch_npu.profiler._ExperimentalConfig(
        export_type=[
            torch_npu.profiler.ExportType.Text,
            torch_npu.profiler.ExportType.Db,
        ],
        profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
        msprof_tx=False,
        aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone,
        l2_cache=False,
        op_attr=False,
        data_simplification=False,
        record_op_args=False,
        gc_detect_threshold=None,
    )

    steps = 10
    with torch_npu.profiler.profile(
        activities=[
            torch_npu.profiler.ProfilerActivity.CPU,
            torch_npu.profiler.ProfilerActivity.NPU,
        ],
        # NOTE(review): per the profiler schedule documentation this should
        # skip 1 step, wait 3 steps, then record 1 step once -- so `steps`
        # must be at least 5 for any data to be collected; confirm against
        # the installed torch_npu version.
        schedule=torch_npu.profiler.schedule(
            wait=3, warmup=0, active=1, repeat=1, skip_first=1
        ),
        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result"),
        record_shapes=False,
        profile_memory=False,
        with_stack=False,
        with_modules=False,
        with_flops=False,
        experimental_config=experimental_config,
    ) as prof:
        print("profiling start...")
        for _ in range(steps):
            output = model(in0, in1)
            # Mark the end of one iteration so the schedule can advance.
            prof.step()


if __name__ == "__main__":
    main()
父主题: msprof工具介绍