昇腾社区首页
中文
注册

ClusterD安全加固

ClusterD运行后,会启动gRPC服务端,侦听训练容器内gRPC客户端的消息,以支持断点续训特性。默认情况下,ClusterD使用非安全的gRPC通信方式;用户可采用TLS/SSL加密方式进行通信,防止通信过程遭受攻击(如窃听、篡改)。

下面将以ClusterD和NodeD的双向认证为例,介绍ClusterD安全加固的详细步骤。在本示例中,ClusterD为服务端,NodeD为客户端。

前提条件

在进行双向认证前,用户需准备好以下证书文件。

  • rootCA.crt
  • client.crt
  • client.key
  • server.crt
  • server.key

操作步骤

  1. 拉取nginx镜像。
    docker pull nginx
  2. 在路径A下创建文件夹cert,将前提条件中的证书文件rootCA.crt、server.crt、server.key放入cert文件夹下。
  3. 在路径A下新建一个文件夹conf,在该文件夹下新建一个名为nginx.conf的文件,并将以下内容写入文件中:
       worker_processes 1;
       worker_cpu_affinity 0001;
    
       worker_rlimit_nofile 4096;
       events {
           worker_connections 4096;
       }
    
       http {
        port_in_redirect off;
        server_tokens off;
        autoindex off;
    
        access_log /var/log/nginx/access.log;
        error_log /var/log/nginx/error.log info;
    
        limit_req_zone global zone=req_zone:100m rate=20r/s;
        limit_conn_zone global zone=north_conn_zone:100m;
    
        server {
         listen <ClusterD的Pod IP>:8888 ssl;  # ClusterD的Pod IP地址,端口与ClusterD配置文件中的端口保持一致
         http2 on;
    
    
         proxy_ssl_session_reuse off;
    
         add_header Referrer-Policy "no-referrer";
         add_header X-XSS-Protection "1; mode=block";
         add_header X-Frame-Options DENY;
         add_header X-Content-Type-Options nosniff;
         add_header Strict-Transport-Security "max-age=31536000; includeSubDomains";
         add_header Content-Security-Policy "default-src 'self'";
         add_header Cache-control "no-cache, no-store, must-revalidate";
         add_header Pragma no-cache;
         add_header Expires 0;
    
         ssl_session_tickets off;
    
         ssl_certificate     /etc/nginx/conf.d/cert/server.crt;                     # 服务端证书路径(权限400)
         ssl_certificate_key /etc/nginx/conf.d/cert/server.key;              # 服务端私钥路径,私钥不能明文配置(权限400)
         ssl_client_certificate /etc/nginx/conf.d/cert/rootCA.crt;
         ssl_verify_client on;
         ssl_verify_depth 2; 
         send_timeout 60;
    
         limit_req zone=req_zone burst=20 nodelay;
         limit_conn north_conn_zone 20;
         keepalive_timeout  60;
         proxy_read_timeout 900;
         proxy_connect_timeout   60;
         proxy_send_timeout      60;
         client_header_timeout   60;
         client_body_timeout 10;
         client_header_buffer_size  2k;
         large_client_header_buffers 4 8k;
         client_body_buffer_size 16K;
         client_max_body_size 20m;
         ssl_protocols TLSv1.2 TLSv1.3;
         ssl_ciphers "ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256 !aNULL !eNULL !LOW !3DES !MD5 !EXP !PSK !SRP !DSS !RC4"; 
         ssl_session_timeout 10s;
         ssl_session_cache shared:SSL:10m;
    
         location / {
          grpc_pass grpc://<ClusterD的Pod IP>:8899;                    # ClusterD的Pod IP地址
           }
        }
       }
  4. 在ClusterD启动YAML文件中修改或新增以下加粗字段。
        # Deployment中的containers项增加
               - name: nginx
                 image: nginx:latest
                 imagePullPolicy: Never
                 command: [ "/bin/bash", "-c", "--"]
                 args: [ "sleep infinity" ]
                 volumeMounts:
                   - name: nginx-cert
                     mountPath: /etc/nginx/conf.d/cert
                   - name: nginx-conf
                     mountPath: /etc/nginx/conf
    
       # Deployment中的volumes项增加
               - name: nginx-cert
                 hostPath:
                   path: /{路径A}/cert           # x509证书、私钥目录路径,请将路径A替换成步骤2中的文件路径
               - name: nginx-conf
                 hostPath:
                   path: /{路径A}/conf         # nginx启动配置文件所在目录(即步骤3中创建的conf文件夹),请将路径A替换成步骤2中的文件路径
    
    
       # Service中的ports项改为如下
           - protocol: TCP
             port: 8888
             targetPort: 8888
  5. 执行以下命令启动ClusterD服务。
    kubectl apply -f clusterd-v{version}.yaml
  6. 执行以下命令查看ClusterD的Pod IP,并将查询到的Pod IP填入步骤3的nginx.conf文件中所有“<ClusterD的Pod IP>”占位处(listen与grpc_pass两处)。
    kubectl get pod -A -o wide | grep clusterd
  7. 启动nginx。
    ## 进入nginx容器 
    kubectl exec -it -n mindx-dl clusterd-{xxx} -c nginx -- bash      # 请将{xxx}替换为ClusterD的Pod启动以后K8s随机生成的Pod ID
    ## 执行以下命令启动nginx,并根据提示输入密钥口令 
    nginx -c /etc/nginx/conf/nginx.conf
  8. 启动NodeD服务。
    1. 在路径B中创建文件夹cert,将前提条件中的证书文件rootCA.crt、client.crt、client.key放入cert文件夹下。
    2. 在路径B下新建一个文件夹conf,在该文件夹下新建一个名为nginx.conf的文件,并将以下内容写入文件中:
         worker_processes 1;
         worker_cpu_affinity 0001;
      
         worker_rlimit_nofile 4096;
         events {
             worker_connections 4096;
         }
      
         http {
          port_in_redirect off;
          server_tokens off;
          autoindex off;
      
          access_log /var/log/nginx/access.log;
          error_log /var/log/nginx/error.log;
      
          grpc_buffer_size 16M;
      
          limit_req_zone global zone=req_zone:100m rate=20r/s;
          limit_conn_zone global zone=north_conn_zone:100m;
      
          server {
           listen 127.0.0.1:8899; 
           http2 on;     
      
           ssl_session_tickets off;
      
           limit_req zone=req_zone burst=20 nodelay;
           limit_conn north_conn_zone 20;
           keepalive_timeout  60;
           proxy_read_timeout 900;
           proxy_connect_timeout   60;
           proxy_send_timeout      60;
           client_header_timeout   60;
           client_body_timeout 10;
           client_header_buffer_size  200k;
           large_client_header_buffers 4 800k;
           client_body_buffer_size 160K;
           client_max_body_size 20m;
      
           location / {
            grpc_pass grpcs://<ClusterD的service IP>:8888;                    # ClusterD的service IP地址,service IP可通过以下命令进行查询:kubectl get svc -A | grep clusterd
            grpc_ssl_verify on;
            grpc_ssl_trusted_certificate /etc/nginx/conf.d/cert/rootCA.crt;
            grpc_ssl_verify_depth 2;
            grpc_ssl_certificate /etc/nginx/conf.d/cert/client.crt;
            grpc_ssl_certificate_key /etc/nginx/conf.d/cert/client.key;
            grpc_ssl_protocols TLSv1.2 TLSv1.3;
            grpc_ssl_ciphers "ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256 !aNULL !eNULL !LOW !3DES !MD5 !EXP !PSK !SRP !DSS !RC4";
            grpc_ssl_name <服务证书中的SAN或CN>;     # 服务证书的SAN或CN
             }
          }
         }
    3. 在NodeD的启动YAML文件中新增以下加粗字段。
         # 新增启动参数 sleep 150  
              args: [ "sleep 150; /usr/local/bin/noded -logFile=/var/log/mindx-dl/noded/noded.log -logLevel=0" ]
      
         # containers项增加
                 - name: nginx
                   image: nginx:latest
                   imagePullPolicy: Never
                   command: [ "/bin/bash", "-c", "--"]
                   args: [ "sleep infinity" ]
                   volumeMounts:
                     - name: nginx-cert
                       mountPath: /etc/nginx/conf.d/cert
                     - name: nginx-conf
                       mountPath: /etc/nginx/conf
      
         # volumes项增加
                 - name: nginx-cert
                   hostPath:
                     path: /{路径B}/cert          # x509证书、私钥目录路径
                 - name: nginx-conf
                   hostPath:
                     path: /{路径B}/conf        # nginx启动配置文件所在目录(即上文在路径B下创建的conf文件夹)
    4. 执行以下命令启动NodeD。
      kubectl apply -f noded-v{version}.yaml
    5. 进入NodeD容器,新增域名解析规则。
      ## 进入noded容器 
      kubectl exec -it -n <noded pod ns> <noded pod name> -- bash
      ## 新增域名映射规则 
      echo 127.0.0.1 clusterd-grpc-svc.mindx-dl.svc.cluster.local >> /etc/hosts
    6. 启动nginx。
      ## 进入nginx容器 
      kubectl exec -it -n mindx-dl noded-{xxx} -c nginx -- bash      # {xxx}表示NodeD的Pod启动以后K8s随机生成的Pod ID
      ## 启动nginx 
      nginx -c /etc/nginx/conf/nginx.conf