Skip to content
Snippets Groups Projects
Commit fb78b6e5 authored by CPCAlex's avatar CPCAlex
Browse files

update

parent 8a5aa7ae
No related branches found
No related tags found
No related merge requests found
Pipeline #43014 failed
---
# batch/v1 tells it to use the Job API
apiVersion: batch/v1
# we are running a Job, not a Pod
kind: Job
# set the name of the job
metadata:
  name: peicongjob
spec:
  # how many times should the system
  # retry before calling it a failure
  backoffLimit: 0
  template:
    spec:
      # should we restart on failure
      restartPolicy: Never
      # what containers will we need
      containers:
        # the name of the container
        - name: traffic-sign-recognition-system
          # the image: can be from any public-facing registry
          image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
          # the working dir when the container starts
          # NOTE(review): placeholder path — set to the real project dir before use
          workingDir: /path/to/mydir
          # should Kube pull it
          imagePullPolicy: IfNotPresent
          # we need to expose the port
          # that will be used for DDP
          ports:
            - containerPort: 8880
          # setting of env variables
          env:
            # which interface to use
            - name: NCCL_SOCKET_IFNAME
              value: eth0
            # prints some INFO level
            # NCCL logs
            - name: NCCL_DEBUG
              value: INFO
          # the command to run when the container starts
          # fix: `python -m train.py` is invalid — `-m` expects a module name
          # (it would try to import package "train", submodule "py" and fail);
          # run the script file directly instead
          command: ["python", "train.py", "./train_cfg.py"]
          # define the resources for this container
          resources:
            # limits - the max given to the container
            limits:
              # RAM
              memory: 64Gi
              # cores
              cpu: 32
              # NVIDIA GPUs
              nvidia.com/gpu: 4
            # requests - what we'd like
            requests:
              # RAM
              memory: 64Gi
              # CPU Cores
              cpu: 32
              # GPUs
              nvidia.com/gpu: 4
          # what volumes should we mount
          volumeMounts:
            # my datasets PVC should mount to /data
            - mountPath: /data
              name: peicong
            # IMPORTANT: we need SHM for DDP
            - mountPath: /dev/shm
              name: dshm
      # tell Kube where to find the volumes we want to use
      volumes:
        # which PVC is my data
        - name: peicong
          persistentVolumeClaim:
            claimName: peicong
        # setup shared memory as a RAM volume
        - name: dshm
          emptyDir:
            medium: Memory
      # Tell Kube what type of GPUs we want
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.product
                    operator: In
                    values:
                      # asking for 3090s only
                      - NVIDIA-GeForce-RTX-3090
---
# interactive Pod for debugging / manual work against the same image and PVC
apiVersion: v1
kind: Pod
metadata:
  # NOTE(review): a second Pod later in this file also uses the name
  # "peicongpod" — Pod names must be unique within a namespace, so only
  # one of the two can exist at a time
  name: peicongpod
spec:
  containers:
    - name: peicongpod
      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
      imagePullPolicy: IfNotPresent
      # keep the container alive so we can exec into it
      command: ["sleep", "infinity"]
      resources:
        limits:
          memory: 12Gi
          cpu: 2
          nvidia.com/gpu: 1
        requests:
          memory: 12Gi
          cpu: 2
          nvidia.com/gpu: 1
      volumeMounts:
        # my datasets PVC mounts to /data
        - mountPath: /data
          name: peicong
        # IMPORTANT: we need SHM for PyTorch
        - mountPath: /dev/shm
          name: dshm
  volumes:
    # which PVC is my data
    - name: peicong
      persistentVolumeClaim:
        claimName: peicong
    # setup shared memory as a RAM volume
    - name: dshm
      emptyDir:
        medium: Memory
  # Tell Kube what type of GPUs we want
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
            - key: nvidia.com/gpu.product
              operator: In
              values:
                - NVIDIA-GeForce-RTX-3090
                - Tesla-T4
---
# PersistentVolumeClaim backing the /data mounts used by the Job and Pods above
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: peicong
spec:
  # CephFS storage class (NRP Nautilus cluster)
  storageClassName: rook-cephfs
  accessModes:
    # ReadWriteMany so the Job and the debug Pod can mount it simultaneously
    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi
\ No newline at end of file
---
# CPU-only variant of the debug Pod (no GPU request, lower memory request)
apiVersion: v1
kind: Pod
metadata:
  # NOTE(review): this duplicates the name of the GPU Pod earlier in this
  # file — Pod names must be unique within a namespace; rename one of them
  # (e.g. peicongpod-cpu) before applying both
  name: peicongpod  # YOUR POD NAME HERE
spec:
  containers:
    - name: peicongpod  # YOUR CONTAINER NAME HERE
      image: gitlab-registry.nrp-nautilus.io/peicongcheng/traffic-sign-recognition-system
      # print a marker, then keep the container alive so we can exec into it
      command: ["sh", "-c", "echo 'Im a new pod' && sleep infinity"]
      resources:
        limits:
          memory: 12Gi
          cpu: 2
        requests:
          memory: 10Gi
          cpu: 2
      volumeMounts:
        - mountPath: /data
          name: peicong  # YOUR PVC NAME HERE
  volumes:
    - name: peicong  # YOUR PVC NAME HERE
      persistentVolumeClaim:
        claimName: peicong  # YOUR PVC NAME HERE
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment