请参考断点续训的YAML参数说明,并按照下面的示例在YAML配置文件中增加或修改相应字段。
# Excerpt of a Volcano Job manifest (batch.volcano.sh/v1alpha1).
# A line reading "# ..." marks content that is omitted from this excerpt.
# NOTE(review): the original excerpt was collapsed onto one line, so the
# nesting below is reconstructed from the Volcano Job / Kubernetes Container
# schemas; the tasks -> template -> spec -> containers path was elided in the
# original and is spelled out only so that the container-level fields sit at
# a valid depth. Confirm against the full manifest.

# ...
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  # Must correspond to the name of the ConfigMap.
  name: mindx-dls-test
  # Choose a namespace as needed; the ConfigMap and the Job must use the
  # same one.
  namespace: vcjob
  labels:
    # hccl_controller uses this label to tell Ascend910 scenarios apart from
    # non-Ascend910 ones.
    ring-controller.atlas: ascend-910
    fault-scheduling: "grace"
    # The double quotes are required (a bare on would be read as a boolean).
    elastic-scheduling: "on"
  annotations:
    minReplicas: "1"
spec:
  # Must be equal to the replicas value.
  minAvailable: 2
  # ...
  maxRetry: 0
  # ...
  tasks:
    # ...
    - template:
        spec:
          containers:
            # ...
            # The lifecycle section is what has to be added to use the
            # "last words" (preStop) feature; the original document marks
            # it in bold.
            - lifecycle:
                preStop:
                  exec:
                    command: ["/bin/bash", "-c", "cd /job/code/resnet/scripts; bash pre_stop.sh"]
              resources:
                requests:
                  # Set to 8 for distributed training across servers; for
                  # single-server training adjust it to the task.
                  huawei.com/Ascend910: 8
                limits:
                  # Must match the requested amount.
                  huawei.com/Ascend910: 8
# ...