
Commit 4c58287

feat: support alerting and integration with alert manager, optimize metrics (#223)

* feat: alert evaluation and integration with alert manager, init tsdb tables, optimize performance and metrics definitions
* fix: lint and init issues
* fix: init tsdb schema, add start params, alert-related helm updates
* fix: config watcher, merge alert rules into global config
* fix: add log rules, fix alert evaluator bugs
* chore: lint issues
* fix: port release until pod deleted bug, alert evaluator bugs
1 parent ff0bb47 commit 4c58287


51 files changed: +2539 −323 lines

.vscode/launch.json

Lines changed: 5 additions & 1 deletion
@@ -13,7 +13,11 @@
       "ENABLE_WEBHOOKS": "false"
     },
     "program": "${workspaceFolder}/cmd/main.go",
-    "args": ["--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml"]
+    "args": [
+      "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
+      "--alert-rule-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
+      "--enable-alert", "true"
+    ]
   },
   {
     "name": "Debug Discovery",

.vscode/settings.json

Lines changed: 14 additions & 0 deletions
@@ -1,6 +1,7 @@
 {
   "cSpell.words": [
     "alertmanager",
+    "alertname",
     "alicloud",
     "Aliyun",
     "AMDCDNA",
@@ -11,6 +12,7 @@
     "batchv",
     "burstable",
     "CDNA",
+    "certgen",
     "certificaterequests",
     "certmanager",
     "clientgoscheme",
@@ -28,17 +30,22 @@
     "Eventf",
     "finalizer",
     "Finalizers",
+    "FULLTEXT",
     "goconst",
+    "gocyclo",
     "golint",
     "Gomega",
     "gopsutil",
+    "gorm",
     "gosec",
     "gpuallocator",
     "gpunode",
     "gpunodeclasses",
     "gpunodes",
     "gpupool",
     "gpupools",
+    "GPUT",
+    "GPUVRAM",
     "greptime",
     "greptimedb",
     "healthz",
@@ -50,32 +57,39 @@
     "kustomization",
     "metav",
     "metricsserver",
+    "mito",
     "nindent",
     "nolint",
     "NVML",
     "omitempty",
     "onsi",
     "portallocator",
+    "Postable",
     "printcolumn",
     "prometheusagents",
     "prometheuses",
     "prometheusrules",
     "RDNA",
     "readyz",
+    "replicaset",
+    "runbook",
     "runpod",
+    "samber",
     "schedulingconfigtemplate",
     "schedulingconfigtemplates",
     "schedulingcorev",
     "shirou",
     "strategicpatches",
     "subresource",
+    "Tabler",
     "tensorfusion",
     "tensorfusionaiv",
     "tensorfusioncluster",
     "tensorfusionclusters",
     "tensorfusionworkload",
     "Tera",
     "tflops",
+    "timberio",
     "Tmpl",
     "Tolerations",
     "utilruntime",

api/v1/workloadprofile_types.go

Lines changed: 4 additions & 3 deletions
@@ -33,14 +33,16 @@ const (
 // WorkloadProfileSpec defines the desired state of WorkloadProfile.
 type WorkloadProfileSpec struct {
     // +optional
+    // If replicas not set, it will be dynamic based on pending Pod
+    // If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
     Replicas *int32 `json:"replicas,omitempty"`
 
     // +optional
     PoolName string `json:"poolName,omitempty"`
 
     // +optional
+    Resources Resources `json:"resources"`
 
-    Resources Resources `json:"resources,omitempty"`
     // +optional
     // Qos defines the quality of service level for the client.
     Qos QoSLevel `json:"qos,omitempty"`
@@ -57,9 +59,8 @@ type WorkloadProfileSpec struct {
     GPUCount uint `json:"gpuCount,omitempty"`
 
     // +optional
-    // TODO, not implemented
     // This mode is only available when `is-local-gpu` set to true, in this mode, TensorFusion will also inject vGPU worker into init container, so that to achieve best performance, trade-off is user might by-pass the vGPU worker and using physical GPU directly
-    NoStandaloneWorkerMode bool `json:"noStandaloneWorkerMode,omitempty"`
+    StandaloneWorkerMode bool `json:"standaloneWorkerMode,omitempty"`
 
     // +optional
     // AutoScalingConfig configured here will override Pool's schedulingConfig
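
For context, a minimal WorkloadProfile manifest exercising the renamed field and the new replicas semantics could look like the sketch below. The apiVersion, pool name, and resource keys/values are illustrative assumptions, not taken from this commit:

apiVersion: tensor-fusion.ai/v1
kind: WorkloadProfile
metadata:
  name: example-profile
spec:
  poolName: shared-pool              # assumed pool name
  # replicas omitted: the replica count stays dynamic, driven by pending Pods
  qos: critical                      # one of the enum values visible in the CRD changes below
  resources:                         # requests/limits are required by the CRD; keys below are assumptions
    requests:
      tflops: "10"
      vram: 8Gi
    limits:
      tflops: "20"
      vram: 16Gi
  standaloneWorkerMode: true         # only meaningful when `is-local-gpu` is set to true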

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.3.3
+version: 1.3.5
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 9 additions & 6 deletions
@@ -183,12 +183,6 @@ spec:
                description: Schedule the workload to the same GPU server that runs
                  vGPU worker for best performance, default to false
                type: boolean
-              noStandaloneWorkerMode:
-                description: This mode is only available when `is-local-gpu` set to
-                  true, in this mode, TensorFusion will also inject vGPU worker into
-                  init container, so that to achieve best performance, trade-off is
-                  user might by-pass the vGPU worker and using physical GPU directly
-                type: boolean
               poolName:
                 type: string
               qos:
@@ -200,6 +194,9 @@
                - critical
                type: string
              replicas:
+                description: |-
+                  If replicas not set, it will be dynamic based on pending Pod
+                  If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
                format: int32
                type: integer
              resources:
@@ -244,6 +241,12 @@
                - limits
                - requests
                type: object
+              standaloneWorkerMode:
+                description: This mode is only available when `is-local-gpu` set to
+                  true, in this mode, TensorFusion will also inject vGPU worker into
+                  init container, so that to achieve best performance, trade-off is
+                  user might by-pass the vGPU worker and using physical GPU directly
+                type: boolean
             type: object
         status:
          description: TensorFusionWorkloadStatus defines the observed state of

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 9 additions & 6 deletions
@@ -182,12 +182,6 @@ spec:
                description: Schedule the workload to the same GPU server that runs
                  vGPU worker for best performance, default to false
                type: boolean
-              noStandaloneWorkerMode:
-                description: This mode is only available when `is-local-gpu` set to
-                  true, in this mode, TensorFusion will also inject vGPU worker into
-                  init container, so that to achieve best performance, trade-off is
-                  user might by-pass the vGPU worker and using physical GPU directly
-                type: boolean
               poolName:
                 type: string
               qos:
@@ -199,6 +193,9 @@
                - critical
                type: string
              replicas:
+                description: |-
+                  If replicas not set, it will be dynamic based on pending Pod
+                  If isLocalGPU set to true, replicas must be dynamic, and this field will be ignored
                format: int32
                type: integer
              resources:
@@ -243,6 +240,12 @@
                - limits
                - requests
                type: object
+              standaloneWorkerMode:
+                description: This mode is only available when `is-local-gpu` set to
+                  true, in this mode, TensorFusion will also inject vGPU worker into
+                  init container, so that to achieve best performance, trade-off is
+                  user might by-pass the vGPU worker and using physical GPU directly
+                type: boolean
             type: object
         status:
          description: WorkloadProfileStatus defines the observed state of WorkloadProfile.
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+{{- if .Values.alert.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alert-manager-config
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+data:
+  alertmanager.yml: |
+    {{- toYaml .Values.alert.alertManagerConfig | nindent 4 }}
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: alert-manager
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.alert.replicaCount }}
+  selector:
+    matchLabels:
+      tensor-fusion.ai/component: alert-manager
+  template:
+    metadata:
+      creationTimestamp: null
+      labels:
+        tensor-fusion.ai/component: alert-manager
+        {{- include "tensor-fusion.labels" . | nindent 8 }}
+    spec:
+      enableServiceLinks: false
+      volumes:
+        - name: config
+          configMap:
+            name: alert-manager-config
+            defaultMode: 420
+        - name: storage
+          hostPath:
+            path: /data/alertmanager
+            type: DirectoryOrCreate
+      containers:
+        - name: alertmanager
+          image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
+          args:
+            - '--storage.path=/alertmanager'
+            - '--config.file=/etc/alertmanager/alertmanager.yml'
+          ports:
+            - name: http
+              containerPort: 9093
+              protocol: TCP
+          env:
+            - name: POD_IP
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: status.podIP
+          resources:
+            {{- toYaml .Values.alert.resources | nindent 12 }}
+          volumeMounts:
+            - name: config
+              mountPath: /etc/alertmanager
+            - name: storage
+              mountPath: /alertmanager
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+              scheme: HTTP
+            timeoutSeconds: 1
+            periodSeconds: 10
+            successThreshold: 1
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+              scheme: HTTP
+            timeoutSeconds: 1
+            periodSeconds: 10
+            successThreshold: 1
+            failureThreshold: 3
+      restartPolicy: Always
+      serviceAccountName: alert-manager
+  serviceName: alert-manager-headless
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      partition: 0
+  revisionHistoryLimit: 10
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alert-manager
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9093
+      targetPort: http
+  selector:
+    tensor-fusion.ai/component: alert-manager
+  type: ClusterIP
+  sessionAffinity: None
+  internalTrafficPolicy: Cluster
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alert-manager-headless
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: alert-manager-headless
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+spec:
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9093
+      targetPort: http
+  selector:
+    tensor-fusion.ai/component: alert-manager
+  clusterIP: None
+  clusterIPs:
+    - None
+  type: ClusterIP
+  sessionAffinity: None
+  internalTrafficPolicy: Cluster
+
+{{- end }}
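
The template above reads several keys from .Values.alert. An illustrative values.yaml fragment covering those keys is sketched below; the image repository, tag, resource figures, and receiver are assumptions, since the chart's actual defaults are not visible in this view:

alert:
  enabled: true
  replicaCount: 1
  image:
    repository: prom/alertmanager      # assumed repository
    tag: v0.27.0                       # assumed tag
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 200m
      memory: 256Mi
  alertManagerConfig:                  # rendered verbatim into alertmanager.yml
    route:
      receiver: default
    receivers:
      - name: default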
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ .Release.Name }}-config
+  namespace: {{ include "tensor-fusion.namespace" . }}
+  labels:
+    tensor-fusion.ai/component: config
+    {{- include "tensor-fusion.labels" . | nindent 4 }}
+data:
+  # Read by tensor fusion operator, eval alert rules and send to alertmanager if enabledAlert is true
+  config.yaml: |
+    {{- toYaml .Values.dynamicConfig | nindent 4 }}
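
The operator loads this rendered config.yaml through the --alert-rule-config flag added in launch.json above and evaluates the alert rules it contains. The rule schema lives in Go code not shown in this view, so the fragment below only gestures at what .Values.dynamicConfig might hold; every key and value in it is hypothetical:

dynamicConfig:
  alertRules:                          # hypothetical key name
    - name: GPUVRAMUsageHigh           # hypothetical rule fields
      query: "..."                     # placeholder for a TSDB query the operator would evaluate
      threshold: 0.9
      severity: warning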
