Skip to content

Commit e9a143f

Browse files
authored
feat: fix webhook bug, create tfconn in pod controller (#7)
1 parent 11dbcdf commit e9a143f

File tree

19 files changed

+337
-129
lines changed

19 files changed

+337
-129
lines changed

.github/workflows/release.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ jobs:
1111
permissions:
1212
# to create release tags (cycjimmy/semantic-release-action)
1313
contents: write
14+
issues: write
15+
pull-requests: write
1416

1517
runs-on: ubuntu-latest
1618
outputs:
@@ -43,6 +45,7 @@ jobs:
4345
with:
4446
images: tensorfusion/tensor-fusion-operator
4547
tags: type=semver,pattern={{version}},value=${{needs.release.outputs.version}}
48+
4649
- name: Login to DockerHub
4750
uses: docker/login-action@v2
4851
with:

.mirrord/mirrord.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"feature": {
3+
"network": {
4+
"incoming": "steal",
5+
"outgoing": true
6+
},
7+
"fs": "read",
8+
"env": true
9+
},
10+
"target": {
11+
"namespace": "tensor-fusion",
12+
"path": {
13+
"deployment": "tensor-fusion-operator-controller-manager",
14+
"container": "manager"
15+
}
16+
}
17+
}

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ endif
1414
# Be aware that the target commands are only tested with Docker which is
1515
# scaffolded by default. However, you might want to replace it to use other
1616
# tools (e.g. podman).
17-
CONTAINER_TOOL ?= docker
17+
CONTAINER_TOOL ?= $(shell command -v docker >/dev/null 2>&1 && echo docker || echo nerdctl)
1818

1919
# Setting SHELL to bash allows bash commands to be executed by recipes.
2020
# Options are set to exit when a recipe line exits non-zero or a piped command fails.

PROJECT

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ resources:
2323
kind: GPU
2424
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
2525
version: v1
26-
- core: true
26+
- controller: true
27+
core: true
2728
group: core
2829
kind: Pod
2930
path: k8s.io/api/core/v1

cmd/main.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ func main() {
6767
var secureMetrics bool
6868
var enableHTTP2 bool
6969
var tlsOpts []func(*tls.Config)
70+
var configFile string
71+
flag.StringVar(&configFile, "config", "/etc/tensor-fusion/config.yaml", "Config file of tensor-fusion-operator")
7072
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
7173
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
7274
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -152,7 +154,13 @@ func main() {
152154
}
153155

154156
ctx := context.Background()
155-
config := config.NewDefaultConfig()
157+
config, err := config.LoadConfig(configFile)
158+
if os.IsNotExist(err) {
159+
setupLog.Info("config file is not exists, use default config", "configFile", configFile)
160+
} else if err != nil {
161+
setupLog.Error(err, "unable to load config", "configFile", configFile, "err", err)
162+
os.Exit(1)
163+
}
156164
scheduler := scheduler.NewNaiveScheduler()
157165
if err = (&controller.TensorFusionConnectionReconciler{
158166
Client: mgr.GetClient(),
@@ -183,6 +191,7 @@ func main() {
183191
}
184192
}
185193

194+
186195
if err = (&controller.TensorFusionClusterReconciler{
187196
Client: mgr.GetClient(),
188197
Scheme: mgr.GetScheme(),
@@ -211,6 +220,13 @@ func main() {
211220
setupLog.Error(err, "unable to create controller", "controller", "GPUNodeClass")
212221
os.Exit(1)
213222
}
223+
if err = (&controller.PodReconciler{
224+
Client: mgr.GetClient(),
225+
Scheme: mgr.GetScheme(),
226+
}).SetupWithManager(mgr); err != nil {
227+
setupLog.Error(err, "unable to create controller", "controller", "Pod")
228+
os.Exit(1)
229+
}
214230
// +kubebuilder:scaffold:builder
215231
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
216232
setupLog.Error(err, "unable to set up health check")

config/default/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Adds namespace to all resources.
2-
namespace: tensor-fusion-operator-system
2+
namespace: tensor-fusion
33

44
# Value of this field is prepended to the
55
# names of all resources, e.g. a deployment named

config/default/manager_webhook_patch.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ apiVersion: apps/v1
22
kind: Deployment
33
metadata:
44
name: controller-manager
5-
namespace: system
65
labels:
76
app.kubernetes.io/name: tensor-fusion-operator
87
app.kubernetes.io/managed-by: kustomize

config/manager/kustomization.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,9 @@ namespace: tensor-fusion
22

33
resources:
44
- manager.yaml
5+
apiVersion: kustomize.config.k8s.io/v1beta1
6+
kind: Kustomization
7+
images:
8+
- name: controller
9+
newName: tensorfusion/tensor-fusion-operator
10+
newTag: latest

config/manager/manager.yaml

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,7 @@
1-
apiVersion: v1
2-
kind: Namespace
3-
metadata:
4-
labels:
5-
control-plane: controller-manager
6-
app.kubernetes.io/name: tensor-fusion-operator
7-
app.kubernetes.io/managed-by: kustomize
8-
name: system
9-
---
101
apiVersion: apps/v1
112
kind: Deployment
123
metadata:
134
name: controller-manager
14-
namespace: system
155
labels:
166
control-plane: controller-manager
177
app.kubernetes.io/name: tensor-fusion-operator
@@ -28,35 +18,6 @@ spec:
2818
labels:
2919
control-plane: controller-manager
3020
spec:
31-
# TODO(user): Uncomment the following code to configure the nodeAffinity expression
32-
# according to the platforms which are supported by your solution.
33-
# It is considered best practice to support multiple architectures. You can
34-
# build your manager image using the makefile target docker-buildx.
35-
# affinity:
36-
# nodeAffinity:
37-
# requiredDuringSchedulingIgnoredDuringExecution:
38-
# nodeSelectorTerms:
39-
# - matchExpressions:
40-
# - key: kubernetes.io/arch
41-
# operator: In
42-
# values:
43-
# - amd64
44-
# - arm64
45-
# - ppc64le
46-
# - s390x
47-
# - key: kubernetes.io/os
48-
# operator: In
49-
# values:
50-
# - linux
51-
securityContext:
52-
runAsNonRoot: true
53-
# TODO(user): For common cases that do not require escalating privileges
54-
# it is recommended to ensure that all your Pods/Containers are restrictive.
55-
# More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
56-
# Please uncomment the following code if your project does NOT have to work on old Kubernetes
57-
# versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
58-
# seccompProfile:
59-
# type: RuntimeDefault
6021
containers:
6122
- command:
6223
- /manager
@@ -65,11 +26,6 @@ spec:
6526
- --health-probe-bind-address=:8081
6627
image: controller:latest
6728
name: manager
68-
securityContext:
69-
allowPrivilegeEscalation: false
70-
capabilities:
71-
drop:
72-
- "ALL"
7329
livenessProbe:
7430
httpGet:
7531
path: /healthz
@@ -82,8 +38,6 @@ spec:
8238
port: 8081
8339
initialDelaySeconds: 5
8440
periodSeconds: 10
85-
# TODO(user): Configure the resources accordingly based on the project requirements.
86-
# More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
8741
resources:
8842
limits:
8943
cpu: 500m

config/rbac/role.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@ kind: ClusterRole
44
metadata:
55
name: manager-role
66
rules:
7+
- apiGroups:
8+
- ""
9+
resources:
10+
- pods
11+
verbs:
12+
- create
13+
- delete
14+
- get
15+
- list
16+
- patch
17+
- update
18+
- watch
719
- apiGroups:
820
- tensor-fusion.ai
921
resources:

0 commit comments

Comments
 (0)