
Commit 4c58287

feat: support alerting and integration with alert manager, optimize metrics (#223)

* feat: alert evaluation and integration with alert manager, init tsdb tables, optimize performance and metrics definitions
* fix: lint and init issues
* fix: init tsdb schema, add start params, alert-related helm updates
* fix: config watcher, merge alert rules into global config
* fix: add log rules, fix alert evaluator bugs
* chore: lint issues
* fix: port release until pod deleted bug, alert evaluator bugs
1 parent ff0bb47 commit 4c58287


51 files changed: +2539 −323 lines

.vscode/launch.json

Lines changed: 5 additions & 1 deletion
@@ -13,7 +13,11 @@
       "ENABLE_WEBHOOKS": "false"
     },
     "program": "${workspaceFolder}/cmd/main.go",
-    "args": ["--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml"]
+    "args": [
+      "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
+      "--alert-rule-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
+      "--enable-alert", "true"
+    ]
   },
   {
     "name": "Debug Discovery",

.vscode/settings.json

Lines changed: 14 additions & 0 deletions
@@ -1,6 +1,7 @@
 {
   "cSpell.words": [
     "alertmanager",
+    "alertname",
     "alicloud",
     "Aliyun",
     "AMDCDNA",
@@ -11,6 +12,7 @@
     "batchv",
     "burstable",
     "CDNA",
+    "certgen",
     "certificaterequests",
     "certmanager",
     "clientgoscheme",
@@ -28,17 +30,22 @@
     "Eventf",
     "finalizer",
     "Finalizers",
+    "FULLTEXT",
     "goconst",
+    "gocyclo",
     "golint",
     "Gomega",
     "gopsutil",
+    "gorm",
     "gosec",
     "gpuallocator",
     "gpunode",
     "gpunodeclasses",
     "gpunodes",
     "gpupool",
     "gpupools",
+    "GPUT",
+    "GPUVRAM",
     "greptime",
     "greptimedb",
     "healthz",
@@ -50,32 +57,39 @@
     "kustomization",
     "metav",
     "metricsserver",
+    "mito",
     "nindent",
     "nolint",
     "NVML",
     "omitempty",
     "onsi",
     "portallocator",
+    "Postable",
     "printcolumn",
     "prometheusagents",
     "prometheuses",
     "prometheusrules",
     "RDNA",
     "readyz",
+    "replicaset",
+    "runbook",
     "runpod",
+    "samber",
     "schedulingconfigtemplate",
     "schedulingconfigtemplates",
     "schedulingcorev",
     "shirou",
     "strategicpatches",
     "subresource",
+    "Tabler",
     "tensorfusion",
     "tensorfusionaiv",
     "tensorfusioncluster",
     "tensorfusionclusters",
     "tensorfusionworkload",
     "Tera",
     "tflops",
+    "timberio",
     "Tmpl",
     "Tolerations",
     "utilruntime",

api/v1/workloadprofile_types.go

Lines changed: 4 additions & 3 deletions
@@ -33,14 +33,16 @@ const (
 // WorkloadProfileSpec defines the desired state of WorkloadProfile.
 type WorkloadProfileSpec struct {
     // +optional
+    // If replicas not set, it will be dynamic based on pending Pod
+    // If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
     Replicas *int32 `json:"replicas,omitempty"`
 
     // +optional
     PoolName string `json:"poolName,omitempty"`
 
     // +optional
+    Resources Resources `json:"resources"`
 
-    Resources Resources `json:"resources,omitempty"`
     // +optional
     // Qos defines the quality of service level for the client.
     Qos QoSLevel `json:"qos,omitempty"`
@@ -57,9 +59,8 @@ type WorkloadProfileSpec struct {
     GPUCount uint `json:"gpuCount,omitempty"`
 
     // +optional
-    // TODO, not implemented
     // This mode is only available when `is-local-gpu` set to true, in this mode, TensorFusion will also inject vGPU worker into init container, so that to achieve best performance, trade-off is user might by-pass the vGPU worker and using physical GPU directly
-    NoStandaloneWorkerMode bool `json:"noStandaloneWorkerMode,omitempty"`
+    StandaloneWorkerMode bool `json:"standaloneWorkerMode,omitempty"`
 
     // +optional
     // AutoScalingConfig configured here will override Pool's schedulingConfig
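
For context, a minimal WorkloadProfile manifest exercising the renamed field and the new replicas semantics could look like the sketch below. The apiVersion, pool name, and resource keys/values are illustrative assumptions, not taken from this commit:

apiVersion: tensor-fusion.ai/v1
kind: WorkloadProfile
metadata:
  name: example-profile
spec:
  poolName: shared-pool              # assumed pool name
  # replicas omitted: the replica count stays dynamic, driven by pending Pods
  qos: critical                      # one of the enum values visible in the CRD changes below
  resources:                         # requests/limits are required by the CRD; keys below are assumptions
    requests:
      tflops: "10"
      vram: 8Gi
    limits:
      tflops: "20"
      vram: 16Gi
  standaloneWorkerMode: true         # only meaningful when `is-local-gpu` is set to true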

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.3.3
+version: 1.3.5
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 9 additions & 6 deletions
@@ -183,12 +183,6 @@ spec:
                description: Schedule the workload to the same GPU server that runs
                  vGPU worker for best performance, default to false
                type: boolean
-              noStandaloneWorkerMode:
-                description: This mode is only available when `is-local-gpu` set to
-                  true, in this mode, TensorFusion will also inject vGPU worker into
-                  init container, so that to achieve best performance, trade-off is
-                  user might by-pass the vGPU worker and using physical GPU directly
-                type: boolean
               poolName:
                 type: string
               qos:
@@ -200,6 +194,9 @@
                - critical
                type: string
              replicas:
+                description: |-
+                  If replicas not set, it will be dynamic based on pending Pod
+                  If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
                format: int32
                type: integer
              resources:
@@ -244,6 +241,12 @@
                - limits
                - requests
                type: object
+              standaloneWorkerMode:
+                description: This mode is only available when `is-local-gpu` set to
+                  true, in this mode, TensorFusion will also inject vGPU worker into
+                  init container, so that to achieve best performance, trade-off is
+                  user might by-pass the vGPU worker and using physical GPU directly
+                type: boolean
             type: object
         status:
          description: TensorFusionWorkloadStatus defines the observed state of

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 9 additions & 6 deletions
@@ -182,12 +182,6 @@ spec:
                description: Schedule the workload to the same GPU server that runs
                  vGPU worker for best performance, default to false
                type: boolean
-              noStandaloneWorkerMode:
-                description: This mode is only available when `is-local-gpu` set to
-                  true, in this mode, TensorFusion will also inject vGPU worker into
-                  init container, so that to achieve best performance, trade-off is
-                  user might by-pass the vGPU worker and using physical GPU directly
-                type: boolean
               poolName:
                 type: string
               qos:
@@ -199,6 +193,9 @@
                - critical
                type: string
              replicas:
+                description: |-
+                  If replicas not set, it will be dynamic based on pending Pod
+                  If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
                format: int32
                type: integer
              resources:
@@ -243,6 +240,12 @@
                - limits
                - requests
                type: object
+              standaloneWorkerMode:
+                description: This mode is only available when `is-local-gpu` set to
+                  true, in this mode, TensorFusion will also inject vGPU worker into
+                  init container, so that to achieve best performance, trade-off is
+                  user might by-pass the vGPU worker and using physical GPU directly
+                type: boolean
             type: object
         status:
          description: WorkloadProfileStatus defines the observed state of WorkloadProfile.
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+{{- if .Values.alert.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alert-manager-config
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+data:
+  alertmanager.yml: |
+    {{- toYaml .Values.alert.alertManagerConfig | nindent 4 }}
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: alert-manager
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.alert.replicaCount }}
+  selector:
+    matchLabels:
+      tensor-fusion.ai/component: alert-manager
+  template:
+    metadata:
+      creationTimestamp: null
+      labels:
+        tensor-fusion.ai/component: alert-manager
+        {{- include "tensor-fusion.labels" . | nindent 8 }}
+    spec:
+      enableServiceLinks: false
+      volumes:
+        - name: config
+          configMap:
+            name: alert-manager-config
+            defaultMode: 420
+        - name: storage
+          hostPath:
+            path: /data/alertmanager
+            type: DirectoryOrCreate
+      containers:
+        - name: alertmanager
+          image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
+          args:
+            - '--storage.path=/alertmanager'
+            - '--config.file=/etc/alertmanager/alertmanager.yml'
+          ports:
+            - name: http
+              containerPort: 9093
+              protocol: TCP
+          env:
+            - name: POD_IP
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: status.podIP
+          resources:
+            {{- toYaml .Values.alert.resources | nindent 12 }}
+          volumeMounts:
+            - name: config
+              mountPath: /etc/alertmanager
+            - name: storage
+              mountPath: /alertmanager
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+              scheme: HTTP
+            timeoutSeconds: 1
+            periodSeconds: 10
+            successThreshold: 1
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+              scheme: HTTP
+            timeoutSeconds: 1
+            periodSeconds: 10
+            successThreshold: 1
+            failureThreshold: 3
+      restartPolicy: Always
+      serviceAccountName: alert-manager
+  serviceName: alert-manager-headless
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      partition: 0
+  revisionHistoryLimit: 10
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alert-manager
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9093
+      targetPort: http
+  selector:
+    tensor-fusion.ai/component: alert-manager
+  type: ClusterIP
+  sessionAffinity: None
+  internalTrafficPolicy: Cluster
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alert-manager-headless
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager-headless
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9093
+      targetPort: http
+  selector:
+    tensor-fusion.ai/component: alert-manager
+  clusterIP: None
+  clusterIPs:
+    - None
+  type: ClusterIP
+  sessionAffinity: None
+  internalTrafficPolicy: Cluster
+
+{{- end }}
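
The template above reads several keys from .Values.alert. An illustrative values.yaml fragment covering those keys is sketched below; the image repository, tag, resource figures, and receiver are assumptions, since the chart's actual defaults are not visible in this view:

alert:
  enabled: true
  replicaCount: 1
  image:
    repository: prom/alertmanager      # assumed repository
    tag: v0.27.0                       # assumed tag
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi
  alertManagerConfig:                  # rendered verbatim into alertmanager.yml
    route:
      receiver: default
    receivers:
      - name: default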
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Release.Name }}-config
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: config
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+data:
+  # Read by tensor fusion operator, eval alert rules and send to alertmanager if enabledAlert is true
+  config.yaml: |
+    {{- toYaml .Values.dynamicConfig | nindent 4 }}
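
The operator loads this rendered config.yaml through the --alert-rule-config flag added in launch.json above and evaluates the alert rules it contains. The rule schema lives in Go code not shown in this view, so the fragment below only gestures at what .Values.dynamicConfig might hold; every key and value in it is hypothetical:

dynamicConfig:
  alertRules:                          # hypothetical key name
    - name: GPUVRAMUsageHigh           # hypothetical rule fields
      query: "..."                     # placeholder for a TSDB query the operator would evaluate
      threshold: 0.9
      severity: warning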
