Mitigating Performance Overhead When Flushing Profile Data to Shared Storage in Large-Scale Multi-rank PyTorch Clusters
Symptom
In large-scale multi-rank PyTorch clusters, tensorboard_trace_handler is triggered via on_trace_ready when the Ascend PyTorch Profiler API is used to collect profile data and flush it to disk.
As a result, significant performance overhead occurs.
Possible Causes
When tensorboard_trace_handler, triggered by on_trace_ready, is configured to flush profile data directly to shared storage, the large number of concurrent flush requests from multiple ranks causes the shared storage to respond slowly.
Troubleshooting
To mitigate performance overhead, implement a custom handler for on_trace_ready that first writes profile data to local storage and subsequently transfers it to shared storage.
import time
import torch
import torch_npu
import os
import random
import string
import shutil
import argparse
from torch_npu.profiler._profiler_path_creator import ProfPathCreator


def generate_random_filename(length=8):
    """Return a random name made of ``length`` lowercase ASCII letters.

    The letters are drawn from ``random.SystemRandom`` so the result does not
    depend on the global ``random`` seed. Ranks that seed ``random`` with the
    same value would otherwise all generate the same name.
    """
    rng = random.SystemRandom()
    return ''.join(rng.choice(string.ascii_lowercase) for _ in range(length))


def create_random_directory():
    """Return a randomly named path directly under ``/tmp/``.

    Only the path string is built here; this function does not create the
    directory itself.
    """
    return os.path.join("/tmp/", generate_random_filename())


def move_file_to_user_path(file_path, user_input_path):
    """Move every entry of ``file_path`` into ``user_input_path``.

    ``user_input_path`` is created when it is missing. After all entries have
    been moved, the (now empty) ``file_path`` directory is removed.

    Args:
        file_path: Directory whose entries are moved.
        user_input_path: Destination directory.

    Returns:
        True when every entry was moved and ``file_path`` was removed.
        False when a filesystem error occurred; the error text is printed.
    """
    try:
        # exist_ok=True: several ranks may create the same destination at the
        # same time, and a separate "exists?" check followed by makedirs can
        # fail with FileExistsError for the ranks that lose that race.
        os.makedirs(user_input_path, exist_ok=True)
        for item in os.listdir(file_path):
            source_item = os.path.join(file_path, item)
            destination_item = os.path.join(user_input_path, item)
            shutil.move(source_item, destination_item)
        os.rmdir(file_path)
    except OSError as e:
        # shutil.Error is a subclass of OSError, so failed moves land here too.
        print(str(e))
        return False
    return True


def train_one_step(i):
    """Run one dummy training step: add two random 2x3 tensors on the NPU.

    The call then sleeps for two seconds.
    """
    print(f"[APP]train: {i} step...")
    a = torch.rand(2, 3).to("npu")
    b = torch.rand(2, 3).to("npu")
    c = a + b  # the sum is not used afterwards
    time.sleep(2)


# user_tensorboard_trace_handler is the custom function of on_trace_ready. ori_path is the local path, and dst_path is the shared storage path.
def user_tensorboard_trace_handler(ori_path, dst_path, dir_name: str = None, worker_name: str = None, analyse_flag: bool = True):
    """Build an ``on_trace_ready`` callback that moves profile data afterwards.

    Args:
        ori_path: Directory whose contents are moved once the trace is ready
            (the local path).
        dst_path: Directory the contents are moved to (the shared storage
            path).
        dir_name: Forwarded to ``ProfPathCreator().init``. Presumably the
            directory the profiler writes into; confirm against torch_npu.
        worker_name: Forwarded to ``ProfPathCreator().init``.
        analyse_flag: When true, ``prof_inst.prof_if.analyse()`` is called
            before the data is moved.

    Returns:
        A callable taking the profiler instance, suitable for
        ``on_trace_ready``.
    """
    ProfPathCreator().init(worker_name=worker_name, dir_name=dir_name)

    def handler_fn(prof_inst) -> None:
        if analyse_flag:
            prof_inst.prof_if.analyse()
        result = move_file_to_user_path(ori_path, dst_path)
        if result is True:
            print(f"File moved to {dst_path} successfully.")
        else:
            # The helper only reports True/False, so name the paths involved
            # instead of echoing the boolean.
            print(f"Error occurred when moving the file from {ori_path} to {dst_path}.")

    return handler_fn


def main():
    """Profile five dummy training steps and move the results to ``path``.

    ``path`` is the single positional command-line argument.
    """
    parser = argparse.ArgumentParser(description="Generate a random txt file and move it to the specified path.")
    parser.add_argument("path", help="Target path where the file will be moved")
    args = parser.parse_args()
    local_dir = create_random_directory()
    # on_trace_ready receives the callback built by
    # user_tensorboard_trace_handler. The same local path is passed both as
    # the move source (ori_path) and as dir_name.
    prof = torch_npu.profiler.profile(
        on_trace_ready=user_tensorboard_trace_handler(local_dir, args.path, local_dir),
        schedule=torch_npu.profiler.schedule(skip_first=1, repeat=1, active=2, wait=0, warmup=0),
    )
    prof.start()
    try:
        step_num = 5
        for i in range(step_num):
            train_one_step(i)
            prof.step()
    finally:
        # Stop the profiler even when a training step raises.
        prof.stop()


if __name__ == "__main__":
    main()
Parent topic: FAQs