Skip to content

Commit 9c36a11

Browse files
authored
feat: add support for jobset (#29)
* feat: add support for jobset Signed-off-by: vsoch <[email protected]>
1 parent e55d245 commit 9c36a11

File tree

30 files changed

+922
-130
lines changed

30 files changed

+922
-130
lines changed

api/v1alpha1/statemachine_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ type JobStep struct {
122122
// +optional
123123
Events JobEvents `json:"events,omitempty"`
124124

125+
// Environment for the job
126+
// +optional
127+
Environment map[string]string `json:"environment,omitempty"`
128+
125129
// Architecture (arm64 or amd64)
126130
// +kubebuilder:default="amd64"
127131
// +default="amd64"

config/crd/bases/state-machine.converged-computing.org_statemachines.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ spec:
109109
description: Walltime (in string format) for the job
110110
type: string
111111
type: object
112+
environment:
113+
additionalProperties:
114+
type: string
115+
description: Environment for the job
116+
type: object
112117
events:
113118
description: Event for a job
114119
properties:

config/rbac/role.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,42 @@ rules:
380380
- patch
381381
- update
382382
- watch
383+
- apiGroups:
384+
- jobset.x-k8s.io
385+
resources:
386+
- jobsets
387+
verbs:
388+
- create
389+
- delete
390+
- get
391+
- list
392+
- patch
393+
- update
394+
- watch
395+
- apiGroups:
396+
- jobset.x-k8s.io
397+
resources:
398+
- jobsets/finalizers
399+
verbs:
400+
- create
401+
- delete
402+
- get
403+
- list
404+
- patch
405+
- update
406+
- watch
407+
- apiGroups:
408+
- jobset.x-k8s.io
409+
resources:
410+
- jobsets/status
411+
verbs:
412+
- create
413+
- delete
414+
- get
415+
- list
416+
- patch
417+
- update
418+
- watch
383419
- apiGroups:
384420
- networking.k8s.io
385421
resources:

docker/manager/Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ RUN yum update -y && yum install -y gcc-g++ git which python3-pip && \
77
dnf update -y && dnf install -y epel-release && \
88
dnf install -y which vim htop
99

10+
# Install oras for easy interaction with registry
11+
RUN cd /tmp && VERSION="1.2.2" && \
12+
curl -LO "https://github.com/oras-project/oras/releases/download/v${VERSION}/oras_${VERSION}_linux_amd64.tar.gz" && \
13+
mkdir -p oras-install/ && \
14+
tar -zxf oras_${VERSION}_*.tar.gz -C oras-install/ && \
15+
mv oras-install/oras /usr/local/bin/ && \
16+
rm -rf oras_${VERSION}_*.tar.gz oras-install/
17+
1018
COPY ./docker/manager/entrypoint.sh /
1119
COPY ./examples/jobs /opt/jobs
1220
COPY ./python /opt/state-machine-operator

examples/dist/state-machine-operator-dev.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,11 @@ spec:
117117
description: Walltime (in string format) for the job
118118
type: string
119119
type: object
120+
environment:
121+
additionalProperties:
122+
type: string
123+
description: Environment for the job
124+
type: object
120125
events:
121126
description: Event for a job
122127
properties:
@@ -747,6 +752,42 @@ rules:
747752
- patch
748753
- update
749754
- watch
755+
- apiGroups:
756+
- jobset.x-k8s.io
757+
resources:
758+
- jobsets
759+
verbs:
760+
- create
761+
- delete
762+
- get
763+
- list
764+
- patch
765+
- update
766+
- watch
767+
- apiGroups:
768+
- jobset.x-k8s.io
769+
resources:
770+
- jobsets/finalizers
771+
verbs:
772+
- create
773+
- delete
774+
- get
775+
- list
776+
- patch
777+
- update
778+
- watch
779+
- apiGroups:
780+
- jobset.x-k8s.io
781+
resources:
782+
- jobsets/status
783+
verbs:
784+
- create
785+
- delete
786+
- get
787+
- list
788+
- patch
789+
- update
790+
- watch
750791
- apiGroups:
751792
- networking.k8s.io
752793
resources:

examples/test/jobset-mnist.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
apiVersion: state-machine.converged-computing.org/v1alpha1
2+
kind: StateMachine
3+
metadata:
4+
name: state-machine
5+
spec:
6+
manager:
7+
pullPolicy: Never
8+
workflow:
9+
completed: 10
10+
11+
cluster:
12+
maxSize: 4
13+
14+
# This example needs JobSet installed
15+
# kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.4.0/manifests.yaml
16+
jobs:
17+
- name: pytorch
18+
properties:
19+
jobset: "yes"
20+
ports: "3389"
21+
config:
22+
nodes: 4
23+
image: ghcr.io/converged-computing/mummi-experiments:mnist
24+
workdir: /workspace
25+
environment:
26+
MASTER_PORT: "3389"
27+
RANK: "from:metadata.annotations['batch.kubernetes.io/job-completion-index']"
28+
PYTHONUNBUFFERED: "0"
29+
epochs: "1"
30+
script: torchrun --rdzv_id=123 --nnodes=${nodes} --nproc_per_node=1 --master_addr=${jobname}-jobset-0-0.${jobname}.default.svc.cluster.local --master_port=$MASTER_PORT --node_rank=$RANK mnist.py --epochs=${epochs} --log-interval=1

examples/test/jobset-mnist/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
FROM gcr.io/k8s-staging-jobset/pytorch-mnist:latest
2+
# docker build -t ghcr.io/converged-computing/mummi-experiments:mnist .
3+
RUN apt-get update && apt-get install -y curl wget iputils-ping
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
apiVersion: state-machine.converged-computing.org/v1alpha1
2+
kind: StateMachine
3+
metadata:
4+
name: state-machine
5+
spec:
6+
manager:
7+
pullPolicy: Never
8+
workflow:
9+
completed: 10
10+
cluster:
11+
maxSize: 4
12+
13+
# This example needs JobSet installed
14+
# kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.4.0/manifests.yaml
15+
jobs:
16+
- name: pytorch
17+
properties:
18+
jobset: "yes"
19+
ports: "3389"
20+
config:
21+
nodes: 4
22+
# based off of gcr.io/k8s-staging-jobset/pytorch-mnist:latest
23+
image: ghcr.io/converged-computing/mummi-experiments:mnist
24+
workdir: /workspace
25+
environment:
26+
MASTER_PORT: "3389"
27+
RANK: "from:metadata.annotations['batch.kubernetes.io/job-completion-index']"
28+
PYTHONUNBUFFERED: "0"
29+
epochs: "1"
30+
script: torchrun --rdzv_id=123 --nnodes=${nodes} --nproc_per_node=1 --master_addr=${jobname}-jobset-0-0.${jobname}.default.svc.cluster.local --master_port=$MASTER_PORT --node_rank=$RANK mnist.py --epochs=${epochs} --log-interval=1
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
FROM gcr.io/k8s-staging-jobset/pytorch-resnet:latest
2+
# docker build -t ghcr.io/converged-computing/mummi-experiments:cifar .
3+
RUN /bin/bash -c "curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -"
4+
# RUN apt-get update && apt-get install -y curl wget iputils-ping
5+
COPY ./resnet.py /resnet.py

0 commit comments

Comments
 (0)