Skip to content

Commit 87a0fcf

Browse files
authored
fix: add missing inject container for worker pod, remove unused annotation (#305)
* fix: ut potential issue * fix: add missing inject container for worker pod, remove unused annotation * fix: ut issue
1 parent adcace4 commit 87a0fcf

File tree

7 files changed

+21
-18
lines changed

7 files changed

+21
-18
lines changed

internal/controller/gpunodeclaim_controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ var _ = Describe("GPUNodeClaim Controller", func() {
5454
g.Expect(k8sClient.List(ctx, gpuNodes)).Should(Succeed())
5555

5656
// Add mock GPU for the provisioned nodes
57-
tfEnv.AddMockGPU4ProvisionedNodes(gpuNodeClaimList, gpuNodes)
57+
tfEnv.AddMockGPU4ProvisionedNodes(g, gpuNodeClaimList, gpuNodes)
5858

5959
k8sNodes := &corev1.NodeList{}
6060
g.Expect(k8sClient.List(ctx, k8sNodes)).Should(Succeed())

internal/controller/pod_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func (r *PodReconciler) setPendingOwnedWorkload(ctx context.Context, pod *corev1
172172
}
173173

174174
func buildTensorFusionConnectionObj(pod *corev1.Pod) *tfv1.TensorFusionConnection {
175-
workloadName, ok := pod.Annotations[constants.WorkloadKey]
175+
workloadName, ok := pod.Labels[constants.WorkloadKey]
176176
if !ok {
177177
return nil
178178
}

internal/controller/suite_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ func (c *TensorFusionEnv) UpdateHypervisorStatus(checkNodeNum bool) {
454454
}
455455
}
456456

457-
func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {
457+
func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(g Gomega, gpuNodeClaimList *tfv1.GPUNodeClaimList, gpuNodes *tfv1.GPUNodeList) {
458458
GinkgoHelper()
459459
claimToGPUNodeMap := make(map[string]*tfv1.GPUNode)
460460
for _, gpuNode := range gpuNodes.Items {
@@ -480,7 +480,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
480480
_ = controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)
481481
err := k8sClient.Get(ctx, client.ObjectKey{Name: gpu.Name}, &tfv1.GPU{})
482482
if errors.IsNotFound(err) {
483-
Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())
483+
g.Expect(k8sClient.Create(ctx, gpu)).Should(Succeed())
484484

485485
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
486486
latest := &tfv1.GPU{}
@@ -508,7 +508,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
508508
}
509509
return nil
510510
})
511-
Expect(err).Should(Succeed())
511+
g.Expect(err).Should(Succeed())
512512
}
513513

514514
// update GPUNode status to trigger node level reconcile, simulate node discovery job
@@ -520,7 +520,7 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
520520
TotalTFlops: gpuNodeClaim.Spec.TFlopsOffered,
521521
TotalVRAM: gpuNodeClaim.Spec.VRAMOffered,
522522
}
523-
Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())
523+
g.Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())
524524
}
525525
}
526526
}

internal/metrics/recorder.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
7676
if _, ok := workerMetricsMap[pod.Name]; !ok {
7777
workerMetricsMap[pod.Name] = &WorkerResourceMetrics{
7878
WorkerName: pod.Name,
79-
WorkloadName: pod.Annotations[constants.WorkloadKey],
79+
WorkloadName: pod.Labels[constants.WorkloadKey],
8080
PoolName: pod.Annotations[constants.GpuPoolKey],
8181
Namespace: pod.Namespace,
8282
QoS: pod.Annotations[constants.QoSLevelAnnotation],
@@ -98,7 +98,7 @@ func SetWorkerMetricsByWorkload(pod *corev1.Pod) {
9898
} else {
9999
metricsItem.GPUCount = int(count)
100100
}
101-
metricsItem.WorkloadName = pod.Annotations[constants.WorkloadKey]
101+
metricsItem.WorkloadName = pod.Labels[constants.WorkloadKey]
102102
}
103103

104104
func SetNodeMetrics(node *tfv1.GPUNode, poolObj *tfv1.GPUPool, gpuModels []string) {

internal/utils/compose.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,6 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
8484
if pod.Annotations == nil {
8585
pod.Annotations = map[string]string{}
8686
}
87-
// add workload to pod annotations just for additional information
88-
// so that users will know which GPU workload this pod binds to
89-
pod.Annotations[constants.WorkloadKey] = tfInfo.WorkloadName
90-
9187
// When it's worker, set workload key to label for triggering workload reconcile
9288
if tfInfo.Profile.IsLocalGPU {
9389
if pod.Labels == nil {
@@ -116,7 +112,11 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
116112
pod.Annotations[constants.InjectContainerAnnotation] = strings.Join(tfInfo.ContainerNames, ",")
117113
}
118114

119-
func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, workload *tfv1.TensorFusionWorkload) (map[string]string, map[string]string) {
115+
func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(
116+
podTmpl *v1.PodTemplate,
117+
workload *tfv1.TensorFusionWorkload,
118+
containerName string,
119+
) (map[string]string, map[string]string) {
120120
labels := maps.Clone(podTmpl.Template.Labels)
121121
if labels == nil {
122122
labels = map[string]string{}
@@ -132,6 +132,7 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, wo
132132
annotations[constants.VRAMLimitAnnotation] = res.Limits.Vram.String()
133133
annotations[constants.TFLOPSRequestAnnotation] = res.Requests.Tflops.String()
134134
annotations[constants.VRAMRequestAnnotation] = res.Requests.Vram.String()
135+
annotations[constants.InjectContainerAnnotation] = containerName
135136
if workload.Spec.Qos == "" {
136137
annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
137138
} else {
@@ -595,7 +596,7 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
595596
}
596597
}
597598

598-
func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig, workload *tfv1.TensorFusionWorkload) {
599+
func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig, workload *tfv1.TensorFusionWorkload) string {
599600
// NOTE: need to set environment variable to make all GPUs visible to the worker,
600601
// vgpu.rs limiter will limit to specific devices after Pod started
601602
spec.Containers[0].Name = constants.TFContainerNameWorker
@@ -689,4 +690,6 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
689690
if len(spec.Containers[0].Resources.Requests) == 0 {
690691
spec.Containers[0].Resources.Requests = workerDefaultRequests
691692
}
693+
694+
return spec.Containers[0].Name
692695
}

internal/webhook/v1/pod_webhook_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,10 +345,10 @@ var _ = Describe("TensorFusionPodMutator", func() {
345345
Expect(scheduleMutation.Value).To(Equal(constants.SchedulerName))
346346

347347
workloadAnnotationMutation, found := lo.Find(resp.Patches, func(patch jsonpatch.JsonPatchOperation) bool {
348-
return patch.Path == "/metadata/annotations/tensor-fusion.ai~1workload"
348+
return patch.Path == "/metadata/annotations/tensor-fusion.ai~1tflops-limit"
349349
})
350350
Expect(found).To(BeTrue())
351-
Expect(workloadAnnotationMutation.Value).To(Equal("test-pod-local-gpu"))
351+
Expect(workloadAnnotationMutation.Value).To(Equal("100"))
352352
})
353353
})
354354

internal/worker/worker.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
4141
}
4242
spec := podTmpl.Template.Spec
4343

44-
utils.AddWorkerConfAfterTemplate(ctx, &spec, wg.WorkerConfig, wg.HypervisorConfig, workload)
44+
containerName := utils.AddWorkerConfAfterTemplate(ctx, &spec, wg.WorkerConfig, wg.HypervisorConfig, workload)
4545

4646
// performance optimization, service link will cause high CPU usage when service number is large
4747
spec.EnableServiceLinks = ptr.To(false)
4848
spec.SchedulerName = constants.SchedulerName
4949

5050
// Add labels to identify this pod as part of the workload
51-
labels, annotations := utils.AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl, workload)
51+
labels, annotations := utils.AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl, workload, containerName)
5252

5353
return &v1.Pod{
5454
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)