After MindX DL is installed, you can deliver a training job with a YAML file to check that the system runs properly.
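For example, if the job definition is saved as resnet50-train.yaml (a minimal sketch; the file name and the pod name below are placeholders, and your job may run in a dedicated namespace), the job can be delivered and observed with kubectl:

kubectl apply -f resnet50-train.yaml
kubectl get pods                      # the training pod should reach the Running state
kubectl logs -f <training-pod-name>   # follow the log to confirm that training starts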
Choose one of the following methods to obtain the training image.
The training image can be renamed, for example: mindspore:b035.
For details, refer to Container Image Security Hardening.
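Renaming is done with docker tag, as sketched below (the source image name is a placeholder for the image you actually pulled or built):

docker tag <source-image>:<tag> mindspore:b035
docker images | grep mindspore        # verify that the new tag exists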
root@ubuntu:/data/atlas_dls/public/dataset/imagenet# pwd
/data/atlas_dls/public/dataset/imagenet
root@ubuntu:/data/atlas_dls/public/dataset/imagenet# du -sh
176M    .
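If the dataset is not in place yet, it can be staged first, as sketched below (the archive name and its location are assumptions; any method that produces the directory shown above works):

mkdir -p /data/atlas_dls/public/dataset/
tar -xzf /tmp/imagenet.tar.gz -C /data/atlas_dls/public/dataset/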
mkdir /data/atlas_dls/code
root@ubuntu:/data/atlas_dls/code/ResNet50_for_MindSpore_1.4_code/scripts#
scripts/
├── docker_start.sh
├── run_standalone_train_gpu.sh
├── run_standalone_train.sh
...
├── rank_table.sh
├── utils.sh
└── train_start.sh
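The code can be put in place, for example, by unpacking the ModelZoo package into the directory created above (a sketch; the archive name and its location are assumptions):

unzip /tmp/ResNet50_for_MindSpore_1.4_code.zip -d /data/atlas_dls/code/
ls /data/atlas_dls/code/ResNet50_for_MindSpore_1.4_code/scripts/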
...
run_distribute: False
enable_profiling: False
data_path: "/cache/data"
output_path: "/cache/train"  # Checkpoint save path; modify to match your actual environment
load_path: "/cache/checkpoint_path/"
device_target: "Ascend"
checkpoint_path: "./checkpoint/"
checkpoint_file_path: ""
...
net_name: "resnet50"
dataset: "imagenet2012"
device_num: 1
pre_trained: "/job/code/output/checkpoint/ckpt_0"  # Path inside the container from which the pre-trained model is loaded (a directory or a file); modify to match the training YAML and your actual environment
run_eval: False
eval_dataset_path: ""
parameter_server: False
filter_weight: False
save_best_ckpt: True
eval_start_epoch: 40
...
network_dataset: "resnet50_imagenet2012"

# Retraining options
save_graphs: False            # Whether to save graph compilation results
save_graphs_path: "./graphs"  # Path for saving graph compilation results
has_trained_epoch: 0          # Number of epochs the model has already been trained for, default 0
has_trained_step: 0           # Number of steps the model has already been trained for, default 0

---
# Help description for each configuration item
enable_modelarts: "Whether training on modelarts, default: False"
...
batch_size: "Batch size for training and evaluation"
epoch_size: "Total training epochs."
checkpoint_path: "The location of the checkpoint file."
checkpoint_file_path: "The location of the checkpoint file."
save_graphs: "Whether save graphs during training, default: False."
save_graphs_path: "Path to save graphs."
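These values can be edited in the model's configuration YAML directly or scripted, as sketched below (the config file name is an assumption; use the file your training YAML actually references):

cd /data/atlas_dls/code/ResNet50_for_MindSpore_1.4_code
sed -i 's|^pre_trained:.*|pre_trained: "/job/code/output/checkpoint/ckpt_0"|' resnet50_imagenet2012_config.yaml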
...
def set_parameter():
    """set_parameter"""
    target = config.device_target
    if target == "CPU":
        config.run_distribute = False

    rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma")
    # Whether to save the graph compilation results
    config.save_graphs = not config.pre_trained

    # init context
    if config.mode_name == 'GRAPH':
        if target == "Ascend":
            rank_save_graphs_path = os.path.join(config.save_graphs_path, "soma",
                                                 str(os.getenv('DEVICE_ID')))
            context.set_context(mode=context.GRAPH_MODE, device_target=target,
                                save_graphs=config.save_graphs,
                                save_graphs_path=rank_save_graphs_path)
        else:
            context.set_context(mode=context.GRAPH_MODE, device_target=target,
                                save_graphs=config.save_graphs)
        set_graph_kernel_context(target, config.net_name)
...

def load_pre_trained_checkpoint():
    """
    Load checkpoint according to pre_trained path.
    """
    param_dict = None
    if config.pre_trained:
        if os.path.isdir(config.pre_trained):
            ckpt_save_dir = os.path.join(config.output_path, config.checkpoint_path, "ckpt_0")
            ckpt_pattern = os.path.join(ckpt_save_dir, "*.ckpt")
            ckpt_files = glob.glob(ckpt_pattern)
            if not ckpt_files:
                logger.warning(f"There is no ckpt file in {ckpt_save_dir}, "
                               f"pre_trained is unsupported.")
            else:
                # Load the most recently written checkpoint
                ckpt_files.sort(key=os.path.getmtime, reverse=True)
                time_stamp = datetime.datetime.now()
                print(f"time stamp {time_stamp.strftime('%Y.%m.%d-%H:%M:%S')}"
                      f" pre trained ckpt model {ckpt_files[0]} loading",
                      flush=True)
                param_dict = load_checkpoint(ckpt_files[0])
        elif os.path.isfile(config.pre_trained):
            param_dict = load_checkpoint(config.pre_trained)
        else:
            print(f"Invalid pre_trained {config.pre_trained} parameter.")
    return param_dict
...

@moxing_wrapper()
def train_net():
    """train net"""
    target = config.device_target
    set_parameter()
    ckpt_param_dict = load_pre_trained_checkpoint()
    dataset = create_dataset(dataset_path=config.data_path, do_train=True,
                             repeat_num=1,
                             batch_size=config.batch_size,
                             train_image_size=config.train_image_size,
                             eval_image_size=config.eval_image_size,
                             target=target,
                             distribute=config.run_distribute)
    step_size = dataset.get_dataset_size()
    ...
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(config.has_trained_epoch)
    cb = [time_cb, loss_cb]
    ckpt_save_dir = set_save_ckpt_dir()
    if config.save_checkpoint:
        # Record the already-trained epoch/step counts in every saved checkpoint
        ckpt_append_info = [{"epoch_num": config.has_trained_epoch,
                             "step_num": config.has_trained_step}]
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max,
                                     append_info=ckpt_append_info)
        ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]