根据模型框架选择对应的指导示例。
root@ubuntu:/data/atlas_dls/public/dataset/resnet50/imagenet_TF# pwd
/data/atlas_dls/public/dataset/resnet50/imagenet_TF
root@ubuntu:/data/atlas_dls/public/dataset/resnet50/imagenet_TF# du -sh
42G
/data/atlas_dls/public/code/ResNet50_for_TensorFlow_2.6_code/
├── scripts
│   ├── train_start.sh
│   ...
│   ...
├── tensorflow
│   ├── resnet_ctl_imagenet_main.py
│   ├── resnet_model.py
│   ├── resnet_runnable.py
│   ...
│   ...
├── benchmark.sh
├── modelzoo_level.txt
...
└── requirements.txt
root@ubuntu:/data/atlas_dls/public/dataset/resnet50/imagenet# pwd
/data/atlas_dls/public/dataset/resnet50/imagenet
root@ubuntu:/data/atlas_dls/public/dataset/resnet50/imagenet# du -sh
11G
def main():
    args = parser.parse_args()
    os.environ['MASTER_ADDR'] = args.addr
    #os.environ['MASTER_PORT'] = '29501'    # 注释或删除该行代码
    if os.getenv('ALLOW_FP32', False) and os.getenv('ALLOW_HF32', False):
        raise RuntimeError('ALLOW_FP32 and ALLOW_HF32 cannot be set at the same time!')
    elif os.getenv('ALLOW_HF32', False):
        torch.npu.conv.allow_hf32 = True
    elif os.getenv('ALLOW_FP32', False):
        torch.npu.conv.allow_hf32 = False
        torch.npu.matmul.allow_hf32 = False
root@ubuntu:/data/atlas_dls/public/code/ResNet50_ID4149_for_PyTorch/scripts#
scripts/
├── train_start.sh
root@ubuntu:/data/atlas_dls/public/dataset/imagenet# pwd
/data/atlas_dls/public/dataset/imagenet
root@ubuntu:/data/atlas_dls/public/dataset/imagenet# du -sh
11G
root@ubuntu:/data/atlas_dls/public/code/ResNet50_for_MindSpore_2.0_code/scripts/#
scripts/
├── docker_start.sh
├── run_standalone_train_gpu.sh
├── run_standalone_train.sh
...
└── train_start.sh
...
    if config.run_distribute:
        if target == "Ascend":
            #device_id = int(os.getenv('DEVICE_ID', '0'))    #注释该行代码
            #ms.set_context(device_id=device_id)             #注释该行代码
            ms.set_auto_parallel_context(device_num=config.device_num, parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
            set_algo_parameters(elementwise_op_strategy_follow=True)
            if config.net_name == "resnet50" or config.net_name == "se-resnet50":
                if config.boost_mode not in ["O1", "O2"]:
                    ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
            elif config.net_name in ["resnet101", "resnet152"]:
                ms.set_auto_parallel_context(all_reduce_fusion_config=config.all_reduce_fusion_config)
            init()
        # GPU target
...