模型运行过程中开启Profiling采集性能数据会造成性能膨胀，具体表现为：开启Profiling后，处于性能数据采集阶段的step的打屏耗时比不开启Profiling时更长，两者的差值即为膨胀时间。
膨胀程度与Profiling的采集设置相关，以下按对模型性能的影响程度列举主要影响因素：
# Example: profile a training loop on both the CPU and NPU sides.
# Experimental options: text export, PipeUtilization AI Core metrics and
# profiler level Level1, with l2_cache and data_simplification switched off.
experimental_config = torch_npu.profiler._ExperimentalConfig(
    export_type=torch_npu.profiler.ExportType.Text,
    aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
    profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
    l2_cache=False,
    data_simplification=False,
)

# Activities to record: host (CPU) and device (NPU).
profiled_activities = [
    torch_npu.profiler.ProfilerActivity.CPU,
    torch_npu.profiler.ProfilerActivity.NPU,
]

# NOTE(review): presumably mirrors torch.profiler.schedule semantics -- skip
# the first 10 steps, then run 2 cycles of (1 wait, 1 warmup, 2 active)
# steps; confirm against the torch_npu documentation.
collection_schedule = torch_npu.profiler.schedule(
    wait=1, warmup=1, active=2, repeat=2, skip_first=10
)

# Handler invoked when a trace is ready; it is configured with "./result".
trace_handler = torch_npu.profiler.tensorboard_trace_handler("./result")

with torch_npu.profiler.profile(
    activities=profiled_activities,
    schedule=collection_schedule,
    on_trace_ready=trace_handler,
    experimental_config=experimental_config,
) as prof:
    for step in range(steps):
        train_one_step(step, steps, train_loader, model, optimizer, criterion)
        # Notify the profiler once per training step so the schedule advances.
        prof.step()
# Example: NPU-only collection with a single active step and no experimental
# config. NOTE(review): the output directory name suggests this is the
# lowest-inflation setup -- confirm against the surrounding documentation.
with torch_npu.profiler.profile(
    activities=[torch_npu.profiler.ProfilerActivity.NPU],
    schedule=torch_npu.profiler.schedule(
        wait=1,
        warmup=1,
        active=1,
        repeat=1,
        skip_first=20,
    ),
    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
        "./npu-profling-least-inflation"
    ),
) as prof:
    for step, x in enumerate(train_dataloader):
        # train_one_step
        ...
        # train_one_step
        # Notify the profiler once per training step so the schedule advances.
        prof.step()