fix: bump helm chart version, fix progressive migration schedule bug (#286)

Code2Life · web-flow · commit 712bab7f94d8 · 2025-07-21T16:10:34.000+08:00
* fix: accidental unit test (UT) failure

* fix: bump helm chart version, fix progressive migration schedule bug
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.4.7
+version: 1.4.8
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.39.1"
+appVersion: "1.42.1"
diff --git a/internal/controller/tensorfusionworkload_controller_test.go b/internal/controller/tensorfusionworkload_controller_test.go
@@ -161,10 +161,12 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
 			Expect(originalPodTemplateHash).NotTo(BeEmpty())
 
 			workload := &tfv1.TensorFusionWorkload{}
-			Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
-			workload.Spec.Resources.Limits.Tflops = resource.MustParse("30")
-			workload.Spec.Resources.Limits.Vram = resource.MustParse("24Gi")
-			Expect(k8sClient.Update(ctx, workload)).To(Succeed())
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
+				workload.Spec.Resources.Limits.Tflops = resource.MustParse("30")
+				workload.Spec.Resources.Limits.Vram = resource.MustParse("24Gi")
+				g.Expect(k8sClient.Update(ctx, workload)).To(Succeed())
+			}).Should(Succeed())
 
 			Eventually(func(g Gomega) {
 				g.Expect(k8sClient.List(ctx, podList,
diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go
@@ -510,11 +510,14 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.
 	return tfv1.Resource{}, nil
 }
 
-func (s *GpuAllocator) ListNonTensorFusionNodes() sets.Set[string] {
+func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string] {
 	set := sets.New[string]()
-	for _, gpu := range s.gpuStore {
-		if gpu.Status.UsedBy != tfv1.UsedByTensorFusion {
-			set.Insert(gpu.Status.NodeSelector[constants.KubernetesHostNameLabel])
+	for nodeName, gpuNames := range s.nodeWorkerStore {
+		// If a node is in use by TF, it cannot be used by the original scheduler.
+		// If it is in use by another scheduler, it is not recorded as a TF worker, so the map is empty.
+		// Returning only the non-using nodes ensures the original scheduler does not conflict with TF.
+		if len(gpuNames) == 0 {
+			set.Insert(nodeName)
 		}
 	}
 	return set
diff --git a/internal/scheduler/gpuresources/gpuresources.go b/internal/scheduler/gpuresources/gpuresources.go
@@ -92,7 +92,7 @@ func (s *GPUFit) Name() string {
 func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
 	// Handle progressive migration case
 	if utils.IsProgressiveMigration() && utils.HasGPUResourceRequest(pod) {
-		nodeNames := s.allocator.ListNonTensorFusionNodes()
+		nodeNames := s.allocator.ListNonUsingNodes()
 		return &framework.PreFilterResult{
 			NodeNames: nodeNames,
 		}, framework.NewStatus(framework.Success, "progressive migration for native resources claim")
diff --git a/internal/scheduler/gpuresources/gpuresources_test.go b/internal/scheduler/gpuresources/gpuresources_test.go
@@ -356,13 +356,13 @@ func (s *GPUResourcesSuite) TestPreFilterForNonTensorFusionPod() {
 			name:           "pod requires 1 GPU, enough capacity",
 			pod:            s.makeNonTensorFusionPod("p1", 1),
 			expectedStatus: framework.Success,
-			expectedNodes:  "node-c",
+			expectedNodes:  "node-b node-c",
 		},
 		{
 			name:           "pod requires 2 GPU, enough capacity",
 			pod:            s.makeNonTensorFusionPod("p1", 2),
 			expectedStatus: framework.Success,
-			expectedNodes:  "node-c",
+			expectedNodes:  "node-b node-c",
 		},
 	}
 
diff --git a/internal/utils/compose.go b/internal/utils/compose.go
@@ -9,6 +9,7 @@ import (
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	constants "github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/samber/lo"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/utils/ptr"
@@ -202,10 +203,50 @@ func AddTFDefaultClientConfBeforePatch(
 
 	if tfInfo.Profile.IsLocalGPU {
 		for _, injectContainerIndex := range injectContainerIndices {
-			pod.Spec.Containers[injectContainerIndex].Env = append(pod.Spec.Containers[injectContainerIndex].Env, v1.EnvVar{
-				Name:  constants.NvidiaVisibleAllDeviceEnv,
-				Value: constants.NvidiaVisibleAllDeviceValue,
-			}, v1.EnvVar{
+			envList := pod.Spec.Containers[injectContainerIndex].Env
+			if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
+				return env.Name == constants.PodNamespaceEnv
+			}) {
+				envList = append(envList, v1.EnvVar{
+					Name: constants.PodNamespaceEnv,
+					ValueFrom: &v1.EnvVarSource{
+						FieldRef: &v1.ObjectFieldSelector{
+							FieldPath: constants.NamespaceFieldRef,
+						},
+					},
+				})
+			}
+			if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
+				return env.Name == constants.PodNameEnv
+			}) {
+				envList = append(envList, v1.EnvVar{
+					Name: constants.PodNameEnv,
+					ValueFrom: &v1.EnvVarSource{
+						FieldRef: &v1.ObjectFieldSelector{
+							FieldPath: constants.ResourceNameFieldRef,
+						},
+					},
+				})
+			}
+			if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
+				return env.Name == constants.ContainerNameEnv
+			}) {
+				envList = append(envList, v1.EnvVar{
+					Name:  constants.ContainerNameEnv,
+					Value: pod.Spec.Containers[injectContainerIndex].Name,
+				})
+			}
+
+			if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
+				return env.Name == constants.NvidiaVisibleAllDeviceEnv
+			}) {
+				envList = append(envList, v1.EnvVar{
+					Name:  constants.NvidiaVisibleAllDeviceEnv,
+					Value: constants.NvidiaVisibleAllDeviceValue,
+				})
+			}
+
+			envList = append(envList, v1.EnvVar{
 				Name:  constants.RealNvmlLibPathEnv,
 				Value: constants.RealNvmlLibPathValue,
 			}, v1.EnvVar{
@@ -221,23 +262,6 @@ func AddTFDefaultClientConfBeforePatch(
 			}, v1.EnvVar{
 				Name:  constants.HypervisorPortEnv,
 				Value: strconv.Itoa(int(getHypervisorPortNumber(pool.Spec.ComponentConfig.Hypervisor))),
-			}, v1.EnvVar{
-				Name: constants.PodNamespaceEnv,
-				ValueFrom: &v1.EnvVarSource{
-					FieldRef: &v1.ObjectFieldSelector{
-						FieldPath: constants.NamespaceFieldRef,
-					},
-				},
-			}, v1.EnvVar{
-				Name: constants.PodNameEnv,
-				ValueFrom: &v1.EnvVarSource{
-					FieldRef: &v1.ObjectFieldSelector{
-						FieldPath: constants.ResourceNameFieldRef,
-					},
-				},
-			}, v1.EnvVar{
-				Name:  constants.ContainerNameEnv,
-				Value: pod.Spec.Containers[injectContainerIndex].Name,
 			}, v1.EnvVar{
 				Name:  constants.NGPUPathEnv,
 				Value: constants.NGPUPathValue,
@@ -253,13 +277,15 @@ func AddTFDefaultClientConfBeforePatch(
 				features := strings.Split(pod.Annotations[constants.DisableFeaturesAnnotation], ",")
 				for _, feature := range features {
 					if feat, ok := featureShortcutMap[feature]; ok {
-						pod.Spec.Containers[injectContainerIndex].Env = append(pod.Spec.Containers[injectContainerIndex].Env, v1.EnvVar{
+						envList = append(envList, v1.EnvVar{
 							Name:  feat.EnvName,
 							Value: feat.EnvValue,
 						})
 					}
 				}
 			}
+
+			pod.Spec.Containers[injectContainerIndex].Env = envList
 		}
 	}
 }

Original file line number	Diff line number	Diff line change
`@@ -510,11 +510,14 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.`
`510`	`510`	`return tfv1.Resource{}, nil`
`511`	`511`	`}`
`512`	`512`
`513`		`-func (s *GpuAllocator) ListNonTensorFusionNodes() sets.Set[string] {`
	`513`	`+func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string] {`
`514`	`514`	`set := sets.New[string]()`
`515`		`- for _, gpu := range s.gpuStore {`
`516`		`- if gpu.Status.UsedBy != tfv1.UsedByTensorFusion {`
`517`		`- set.Insert(gpu.Status.NodeSelector[constants.KubernetesHostNameLabel])`
	`515`	`+ for nodeName, gpuNames := range s.nodeWorkerStore {`
	`516`	`+ // If a node is in use by TF, it cannot be used by the original scheduler.`
	`517`	`+ // If it is in use by another scheduler, it is not recorded as a TF worker, so the map is empty.`
	`518`	`+ // Returning only the non-using nodes ensures the original scheduler does not conflict with TF.`
	`519`	`+ if len(gpuNames) == 0 {`
	`520`	`+ set.Insert(nodeName)`
`518`	`521`	`}`
`519`	`522`	`}`
`520`	`523`	`return set`