Skip to content
Snippets Groups Projects
Commit fb78b6e5 authored by CPCAlex's avatar CPCAlex
Browse files

update

parent 8a5aa7ae
No related branches found
No related tags found
No related merge requests found
Pipeline #43014 failed
---
# batch/v1 tells it to use the Job API
apiVersion: batch/v1
# we are running a Job, not a Pod
kind: Job
# set the name of the job
metadata:
  name: peicongjob
spec:
  # how many times should the system
  # retry before calling it a failure
  backoffLimit: 0
  template:
    spec:
      # should we restart on failure
      restartPolicy: Never
      # what containers will we need
      containers:
        # the name of the container
        - name: traffic-sign-recognition-system
          # the image: can be from any public-facing registry
          image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
          # the working dir when the container starts
          # NOTE(review): placeholder path — set to the real project dir before use
          workingDir: /path/to/mydir
          # should Kube pull it
          imagePullPolicy: IfNotPresent
          # we need to expose the port
          # that will be used for DDP
          ports:
            - containerPort: 8880
          # setting of env variables
          env:
            # which interface to use
            - name: NCCL_SOCKET_IFNAME
              value: eth0
            # prints some INFO level
            # NCCL logs
            - name: NCCL_DEBUG
              value: INFO
          # the command to run when the container starts
          # fix: `python -m train.py` is invalid — `-m` expects a module name
          # (it would try to import package "train", submodule "py" and fail);
          # run the script file directly instead
          command: ["python", "train.py", "./train_cfg.py"]
          # define the resources for this container
          resources:
            # limits - the max given to the container
            limits:
              # RAM
              memory: 64Gi
              # cores
              cpu: 32
              # NVIDIA GPUs
              nvidia.com/gpu: 4
            # requests - what we'd like
            requests:
              # RAM
              memory: 64Gi
              # CPU Cores
              cpu: 32
              # GPUs
              nvidia.com/gpu: 4
          # what volumes should we mount
          volumeMounts:
            # my datasets PVC should mount to /data
            - mountPath: /data
              name: peicong
            # IMPORTANT: we need SHM for DDP
            - mountPath: /dev/shm
              name: dshm
      # tell Kube where to find the volumes we want to use
      volumes:
        # which PVC is my data
        - name: peicong
          persistentVolumeClaim:
            claimName: peicong
        # setup shared memory as a RAM volume
        - name: dshm
          emptyDir:
            medium: Memory
      # Tell Kube what type of GPUs we want
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      # asking for 3090s only
                      - NVIDIA-GeForce-RTX-3090
---
# interactive Pod for debugging / manual work against the same image and PVC
apiVersion: v1
kind: Pod
metadata:
  # NOTE(review): a second Pod later in this file also uses the name
  # "peicongpod" — Pod names must be unique within a namespace, so only
  # one of the two can exist at a time
  name: peicongpod
spec:
  containers:
    - name: peicongpod
      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
      imagePullPolicy: IfNotPresent
      # keep the container alive so we can exec into it
      command: ["sleep", "infinity"]
      resources:
        limits:
          memory: 12Gi
          cpu: 2
          nvidia.com/gpu: 1
        requests:
          memory: 12Gi
          cpu: 2
          nvidia.com/gpu: 1
      volumeMounts:
        # my datasets PVC mounts to /data
        - mountPath: /data
          name: peicong
        # IMPORTANT: we need SHM for PyTorch
        - mountPath: /dev/shm
          name: dshm
  volumes:
    # which PVC is my data
    - name: peicong
      persistentVolumeClaim:
        claimName: peicong
    # setup shared memory as a RAM volume
    - name: dshm
      emptyDir:
        medium: Memory
  # Tell Kube what type of GPUs we want
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
            - key: nvidia.com/gpu.product
              operator: In
              values:
                - NVIDIA-GeForce-RTX-3090
                - Tesla-T4
---
# PersistentVolumeClaim backing the /data mounts used by the Job and Pods above
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: peicong
spec:
  # CephFS storage class (NRP Nautilus cluster)
  storageClassName: rook-cephfs
  accessModes:
    # ReadWriteMany so the Job and the debug Pod can mount it simultaneously
    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi
\ No newline at end of file
---
# CPU-only variant of the debug Pod (no GPU request, lower memory request)
apiVersion: v1
kind: Pod
metadata:
  # NOTE(review): this duplicates the name of the GPU Pod earlier in this
  # file — Pod names must be unique within a namespace; rename one of them
  # (e.g. peicongpod-cpu) before applying both
  name: peicongpod  # YOUR POD NAME HERE
spec:
  containers:
    - name: peicongpod  # YOUR CONTAINER NAME HERE
      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
      # print a marker, then keep the container alive so we can exec into it
      command: ["sh", "-c", "echo 'Im a new pod' && sleep infinity"]
      resources:
        limits:
          memory: 12Gi
          cpu: 2
        requests:
          memory: 10Gi
          cpu: 2
      volumeMounts:
        - mountPath: /data
          name: peicong  # YOUR PVC NAME HERE
  volumes:
    - name: peicong  # YOUR PVC NAME HERE
      persistentVolumeClaim:
        claimName: peicong  # YOUR PVC NAME HERE
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment