diff --git a/kubernestes-configs/gpu-job.yml b/kubernestes-configs/gpu-job.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a19453d29e5a2e8fd2a61a1e31d2753a83fb01b3
--- /dev/null
+++ b/kubernestes-configs/gpu-job.yml
@@ -0,0 +1,89 @@
+# batch/v1 selects the Job API group
+apiVersion: batch/v1
+# this manifest defines a Job, not a bare Pod
+kind: Job
+
+# name of the Job
+metadata:
+  name: peicongjob
+
+spec:
+  # number of retries before the Job is marked failed;
+  # 0 means a single failed run fails the whole Job
+  backoffLimit: 0
+  template:
+    spec:
+      # do not restart the container in place when it fails
+      restartPolicy: Never
+      # containers that make up the Job's Pod
+      containers:
+        # name of the training container
+        - name: traffic-sign-recognition-system
+          # the image: can be from any public-facing registry
+          image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
+          # working dir at container start; NOTE(review): looks like a placeholder, must hold train.py - confirm
+          workingDir: /path/to/mydir
+          # pull the image only when it is not already on the node
+          imagePullPolicy: IfNotPresent
+          # expose the port
+          # that will be used for DDP
+          ports:
+            - containerPort: 8880
+          # environment variables for the container
+          env:
+            # network interface NCCL should use
+            - name: NCCL_SOCKET_IFNAME
+              value: eth0
+            # print INFO-level
+            # NCCL logs
+            - name: NCCL_DEBUG
+              value: INFO
+          # run the script by path: "python -m" expects a module name, so "-m train.py" fails
+          command: ["python", "train.py", "./train_cfg.py"]
+          # compute resources for this container
+          resources:
+            # limits - the maximum the container may use
+            limits:
+              # RAM
+              memory: 64Gi
+              # CPU cores
+              cpu: 32
+              # NVIDIA GPUs
+              nvidia.com/gpu: 4
+            # requests - what the scheduler reserves
+            requests:
+              # RAM
+              memory: 64Gi
+              # CPU cores
+              cpu: 32
+              # NVIDIA GPUs
+              nvidia.com/gpu: 4
+          # volumes mounted into the container
+          volumeMounts:
+            # the datasets PVC is mounted at /data
+            - mountPath: /data
+              name: peicong
+            # IMPORTANT: we need SHM for DDP
+            - mountPath: /dev/shm
+              name: dshm
+      # backing definitions for the volumes mounted above
+      volumes:
+        # PVC that holds the data
+        - name: peicong
+          persistentVolumeClaim:
+            claimName: peicong
+        # shared memory, set up as a RAM-backed volume
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      # restrict scheduling to nodes with the GPU type we want
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: nvidia.com/gpu.product
+                    operator: In
+                    values:
+                      # asking for 3090s only
+                      - NVIDIA-GeForce-RTX-3090
diff --git a/kubernestes-configs/gpu-pod.yml b/kubernestes-configs/gpu-pod.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a2d3ff0bee7a7827e0d1a57f6f04623a3b9791e0
--- /dev/null
+++ b/kubernestes-configs/gpu-pod.yml
@@ -0,0 +1,45 @@
+apiVersion: v1
+kind: Pod
+
+metadata:
+  name: peicongpod
+
+spec:
+  containers:
+    - name: peicongpod
+      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
+      imagePullPolicy: IfNotPresent
+      command: ["sleep", "infinity"]
+      resources:
+        limits:
+          memory: 12Gi
+          cpu: 2
+          nvidia.com/gpu: 1
+        requests:
+          memory: 12Gi
+          cpu: 2
+          nvidia.com/gpu: 1
+      volumeMounts:
+        - mountPath: /data
+          name: peicong
+        # IMPORTANT: we need SHM for PyTorch
+        - mountPath: /dev/shm
+          name: dshm
+  volumes:
+    - name: peicong
+      persistentVolumeClaim:
+        claimName: peicong
+    # shared memory, set up as a RAM-backed volume
+    - name: dshm
+      emptyDir:
+        medium: Memory
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: nvidia.com/gpu.product
+                operator: In
+                values:
+                  - NVIDIA-GeForce-RTX-3090
+                  - Tesla-T4
diff --git a/kubernestes-configs/persistent_volume.yml b/kubernestes-configs/persistent_volume.yml
new file mode 100644
index 0000000000000000000000000000000000000000..354ef89e19b7e926290a45ff05e197f008094c19
--- /dev/null
+++ b/kubernestes-configs/persistent_volume.yml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: peicong
+spec:
+  storageClassName: rook-cephfs
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
diff --git a/kubernestes-configs/pod_pvc.yml b/kubernestes-configs/pod_pvc.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e9f963c97082567e60dabb3e617dc021efbce562
--- /dev/null
+++ b/kubernestes-configs/pod_pvc.yml
@@ -0,0 +1,24 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: peicongpod  # YOUR POD NAME HERE
+spec:
+  containers:
+    - name: peicongpod  # YOUR CONTAINER NAME HERE
+      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
+      command: ["sh", "-c", "echo 'Im a new pod' && sleep infinity"]
+      resources:
+        limits:
+          memory: 12Gi
+          cpu: 2
+        requests:
+          memory: 10Gi
+          cpu: 2
+      volumeMounts:
+        - mountPath: /data
+          name: peicong  # YOUR PVC NAME HERE
+  volumes:
+    - name: peicong  # YOUR PVC NAME HERE
+      persistentVolumeClaim:
+        claimName: peicong  # YOUR PVC NAME HERE
+