fix: add missing inject container for worker pod, remove unused annotation (#305)

Code2Life · web-flow · commit 87a0fcf67a06 · 2025-07-31T17:46:23.000+08:00
* fix: potential unit test (UT) issue

* fix: add missing inject container for worker pod, remove unused annotation

* fix: unit test (UT) issue
diff --git a/internal/controller/gpunodeclaim_controller_test.go b/internal/controller/gpunodeclaim_controller_test.go
@@ -54,7 +54,7 @@ var _ = Describe("GPUNodeClaim Controller", func() {
 				g.Expect(k8sClient.List(ctx, gpuNodes)).Should(Succeed())
 
 				// Add mock GPU for the provisioned nodes
-				tfEnv.AddMockGPU4ProvisionedNodes(gpuNodeClaimList, gpuNodes)
+				tfEnv.AddMockGPU4ProvisionedNodes(g, gpuNodeClaimList, gpuNodes)
 
 				k8sNodes := &corev1.NodeList{}
 				g.Expect(k8sClient.List(ctx, k8sNodes)).Should(Succeed())
diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go
@@ -172,7 +172,7 @@ func (r *PodReconciler) setPendingOwnedWorkload(ctx context.Context, pod *corev1
 }
 
 func buildTensorFusionConnectionObj(pod *corev1.Pod) *tfv1.TensorFusionConnection {
-	workloadName, ok := pod.Annotations[constants.WorkloadKey]
+	workloadName, ok := pod.Labels[constants.WorkloadKey]
 	if !ok {
 		return nil
 	}
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
@@ -454,7 +454,7 @@ func (c *TensorFusionEnv) UpdateHypervisorStatus(checkNodeNum bool) {
 	}
 }
 
-func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {
+func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(g Gomega, gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {
 	GinkgoHelper()
 	claimToGPUNodeMap := make(map[string]*tfv1.GPUNode)
 	for _, gpuNode := range gpuNodes.Items {
@@ -480,7 +480,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
 		_ = controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)
 		err := k8sClient.Get(ctx, client.ObjectKey{Name: gpu.Name}, &tfv1.GPU{})
 		if errors.IsNotFound(err) {
-			Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())
+			g.Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())
 
 			err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 				latest := &tfv1.GPU{}
@@ -508,7 +508,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
 				}
 				return nil
 			})
-			Expect(err).Should(Succeed())
+			g.Expect(err).Should(Succeed())
 		}
 
 		// update GPUNode status to trigger node level reconcile, simulate node discovery job
@@ -520,7 +520,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
 				TotalTFlops: gpuNodeClaim.Spec.TFlopsOffered,
 				TotalVRAM:   gpuNodeClaim.Spec.VRAMOffered,
 			}
-			Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())
+			g.Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())
 		}
 	}
 }
diff --git a/internal/metrics/recorder.go b/internal/metrics/recorder.go
@@ -76,7 +76,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
 	if _, ok := workerMetricsMap[pod.Name]; !ok {
 		workerMetricsMap[pod.Name] = &WorkerResourceMetrics{
 			WorkerName:     pod.Name,
-			WorkloadName:   pod.Annotations[constants.WorkloadKey],
+			WorkloadName:   pod.Labels[constants.WorkloadKey],
 			PoolName:       pod.Annotations[constants.GpuPoolKey],
 			Namespace:      pod.Namespace,
 			QoS:            pod.Annotations[constants.QoSLevelAnnotation],
@@ -98,7 +98,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
 	} else {
 		metricsItem.GPUCount = int(count)
 	}
-	metricsItem.WorkloadName = pod.Annotations[constants.WorkloadKey]
+	metricsItem.WorkloadName = pod.Labels[constants.WorkloadKey]
 }
 
 func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string) {
diff --git a/internal/utils/compose.go b/internal/utils/compose.go
@@ -84,10 +84,6 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
 	if pod.Annotations == nil {
 		pod.Annotations = map[string]string{}
 	}
-	// add workload to pod annotations just for additional information
-	// so that users will know which GPU workload this pod binds to
-	pod.Annotations[constants.WorkloadKey] = tfInfo.WorkloadName
-
 	// When it's worker, set workload key to label for triggering workload reconcile
 	if tfInfo.Profile.IsLocalGPU {
 		if pod.Labels == nil {
@@ -116,7 +112,11 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
 	pod.Annotations[constants.InjectContainerAnnotation] = strings.Join(tfInfo.ContainerNames, ",")
 }
 
-func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, workload *tfv1.TensorFusionWorkload) (map[string]string, map[string]string) {
+func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(
+	podTmpl *v1.PodTemplate,
+	workload *tfv1.TensorFusionWorkload,
+	containerName string,
+) (map[string]string, map[string]string) {
 	labels := maps.Clone(podTmpl.Template.Labels)
 	if labels == nil {
 		labels = map[string]string{}
@@ -132,6 +132,7 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, wo
 	annotations[constants.VRAMLimitAnnotation] = res.Limits.Vram.String()
 	annotations[constants.TFLOPSRequestAnnotation] = res.Requests.Tflops.String()
 	annotations[constants.VRAMRequestAnnotation] = res.Requests.Vram.String()
+	annotations[constants.InjectContainerAnnotation] = containerName
 	if workload.Spec.Qos == "" {
 		annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
 	} else {
@@ -595,7 +596,7 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
 	}
 }
 
-func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig, workload *tfv1.TensorFusionWorkload) {
+func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig, workload *tfv1.TensorFusionWorkload) string {
 	// NOTE: need to set environment variable to make all GPUs visible to the worker,
 	// vgpu.rs limiter will limit to specific devices after Pod started
 	spec.Containers[0].Name = constants.TFContainerNameWorker
@@ -689,4 +690,6 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
 	if len(spec.Containers[0].Resources.Requests) == 0 {
 		spec.Containers[0].Resources.Requests = workerDefaultRequests
 	}
+
+	return spec.Containers[0].Name
 }
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
@@ -345,10 +345,10 @@ var _ = Describe("TensorFusionPodMutator", func() {
 			Expect(scheduleMutation.Value).To(Equal(constants.SchedulerName))
 
 			workloadAnnotationMutation, found := lo.Find(resp.Patches, func(patch jsonpatch.JsonPatchOperation) bool {
-				return patch.Path == "/metadata/annotations/tensor-fusion.ai~1workload"
+				return patch.Path == "/metadata/annotations/tensor-fusion.ai~1tflops-limit"
 			})
 			Expect(found).To(BeTrue())
-			Expect(workloadAnnotationMutation.Value).To(Equal("test-pod-local-gpu"))
+			Expect(workloadAnnotationMutation.Value).To(Equal("100"))
 		})
 	})
 
diff --git a/internal/worker/worker.go b/internal/worker/worker.go
@@ -41,14 +41,14 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
 	}
 	spec := podTmpl.Template.Spec
 
-	utils.AddWorkerConfAfterTemplate(ctx, &spec, wg.WorkerConfig, wg.HypervisorConfig, workload)
+	containerName := utils.AddWorkerConfAfterTemplate(ctx, &spec, wg.WorkerConfig, wg.HypervisorConfig, workload)
 
 	// performance optimization, service link will cause high CPU usage when service number is large
 	spec.EnableServiceLinks = ptr.To(false)
 	spec.SchedulerName = constants.SchedulerName
 
 	// Add labels to identify this pod as part of the workload
-	labels, annotations := utils.AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl, workload)
+	labels, annotations := utils.AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl, workload, containerName)
 
 	return &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{

Original file line number	Diff line number	Diff line change
`@@ -172,7 +172,7 @@ func (r *PodReconciler) setPendingOwnedWorkload(ctx context.Context, pod *corev1`
`172`	`172`	`}`
`173`	`173`
`174`	`174`	`func buildTensorFusionConnectionObj(pod *corev1.Pod) *tfv1.TensorFusionConnection {`
`175`		`- workloadName, ok := pod.Annotations[constants.WorkloadKey]`
	`175`	`+ workloadName, ok := pod.Labels[constants.WorkloadKey]`
`176`	`176`	`if !ok {`
`177`	`177`	`return nil`
`178`	`178`	`}`
Original file line number	Diff line number	Diff line change
`@@ -454,7 +454,7 @@ func (c *TensorFusionEnv) UpdateHypervisorStatus(checkNodeNum bool) {`
`454`	`454`	`}`
`455`	`455`	`}`
`456`	`456`
`457`		`-func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {`
	`457`	`+func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(g Gomega, gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {`
`458`	`458`	`GinkgoHelper()`
`459`	`459`	`claimToGPUNodeMap := make(map[string]*tfv1.GPUNode)`
`460`	`460`	`for _, gpuNode := range gpuNodes.Items {`
`@@ -480,7 +480,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU`
`480`	`480`	`_ = controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)`
`481`	`481`	`err := k8sClient.Get(ctx, client.ObjectKey{Name: gpu.Name}, &tfv1.GPU{})`
`482`	`482`	`if errors.IsNotFound(err) {`
`483`		`- Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())`
	`483`	`+ g.Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())`
`484`	`484`
`485`	`485`	`err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {`
`486`	`486`	`latest := &tfv1.GPU{}`
`@@ -508,7 +508,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU`
`508`	`508`	`}`
`509`	`509`	`return nil`
`510`	`510`	`})`
`511`		`- Expect(err).Should(Succeed())`
	`511`	`+ g.Expect(err).Should(Succeed())`
`512`	`512`	`}`
`513`	`513`
`514`	`514`	`// update GPUNode status to trigger node level reconcile, simulate node discovery job`
`@@ -520,7 +520,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU`
`520`	`520`	`TotalTFlops: gpuNodeClaim.Spec.TFlopsOffered,`
`521`	`521`	`TotalVRAM: gpuNodeClaim.Spec.VRAMOffered,`
`522`	`522`	`}`
`523`		`- Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())`
	`523`	`+ g.Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())`
`524`	`524`	`}`
`525`	`525`	`}`
`526`	`526`	`}`