Skip to content

Commit 712bab7

Browse files
authored
fix: bump helm chart version, fix progressive migration schedule bug (#286)
* fix: unit test (UT) accidental failure issue * fix: bump helm chart version, fix progressive migration schedule bug
1 parent f7a0623 commit 712bab7

File tree

6 files changed

+66
-35
lines changed

6 files changed

+66
-35
lines changed

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.7
18+
version: 1.4.8
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.39.1"
24+
appVersion: "1.42.1"

internal/controller/tensorfusionworkload_controller_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,12 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
161161
Expect(originalPodTemplateHash).NotTo(BeEmpty())
162162

163163
workload := &tfv1.TensorFusionWorkload{}
164-
Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
165-
workload.Spec.Resources.Limits.Tflops = resource.MustParse("30")
166-
workload.Spec.Resources.Limits.Vram = resource.MustParse("24Gi")
167-
Expect(k8sClient.Update(ctx, workload)).To(Succeed())
164+
Eventually(func(g Gomega) {
165+
g.Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
166+
workload.Spec.Resources.Limits.Tflops = resource.MustParse("30")
167+
workload.Spec.Resources.Limits.Vram = resource.MustParse("24Gi")
168+
g.Expect(k8sClient.Update(ctx, workload)).To(Succeed())
169+
}).Should(Succeed())
168170

169171
Eventually(func(g Gomega) {
170172
g.Expect(k8sClient.List(ctx, podList,

internal/gpuallocator/gpuallocator.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -510,11 +510,14 @@ func (s *GpuAllocator) AdjustAllocation(ctx context.Context, adjustRequest tfv1.
510510
return tfv1.Resource{}, nil
511511
}
512512

513-
func (s *GpuAllocator) ListNonTensorFusionNodes() sets.Set[string] {
513+
func (s *GpuAllocator) ListNonUsingNodes() sets.Set[string] {
514514
set := sets.New[string]()
515-
for _, gpu := range s.gpuStore {
516-
if gpu.Status.UsedBy != tfv1.UsedByTensorFusion {
517-
set.Insert(gpu.Status.NodeSelector[constants.KubernetesHostNameLabel])
515+
for nodeName, gpuNames := range s.nodeWorkerStore {
516+
// If a node is in use by TF, it cannot be used by the original scheduler.
517+
// If a node is in use by another scheduler, it is not recorded as having TF workers, so its entry in the map is empty.
518+
// Returning only the nodes that are not in use ensures the original scheduler does not conflict with TF.
519+
if len(gpuNames) == 0 {
520+
set.Insert(nodeName)
518521
}
519522
}
520523
return set

internal/scheduler/gpuresources/gpuresources.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func (s *GPUFit) Name() string {
9292
func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
9393
// Handle progressive migration case
9494
if utils.IsProgressiveMigration() && utils.HasGPUResourceRequest(pod) {
95-
nodeNames := s.allocator.ListNonTensorFusionNodes()
95+
nodeNames := s.allocator.ListNonUsingNodes()
9696
return &framework.PreFilterResult{
9797
NodeNames: nodeNames,
9898
}, framework.NewStatus(framework.Success, "progressive migration for native resources claim")

internal/scheduler/gpuresources/gpuresources_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,13 +356,13 @@ func (s *GPUResourcesSuite) TestPreFilterForNonTensorFusionPod() {
356356
name: "pod requires 1 GPU, enough capacity",
357357
pod: s.makeNonTensorFusionPod("p1", 1),
358358
expectedStatus: framework.Success,
359-
expectedNodes: "node-c",
359+
expectedNodes: "node-b node-c",
360360
},
361361
{
362362
name: "pod requires 2 GPU, enough capacity",
363363
pod: s.makeNonTensorFusionPod("p1", 2),
364364
expectedStatus: framework.Success,
365-
expectedNodes: "node-c",
365+
expectedNodes: "node-b node-c",
366366
},
367367
}
368368

internal/utils/compose.go

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
1111
constants "github.com/NexusGPU/tensor-fusion/internal/constants"
12+
"github.com/samber/lo"
1213
v1 "k8s.io/api/core/v1"
1314
"k8s.io/apimachinery/pkg/api/resource"
1415
"k8s.io/utils/ptr"
@@ -202,10 +203,50 @@ func AddTFDefaultClientConfBeforePatch(
202203

203204
if tfInfo.Profile.IsLocalGPU {
204205
for _, injectContainerIndex := range injectContainerIndices {
205-
pod.Spec.Containers[injectContainerIndex].Env = append(pod.Spec.Containers[injectContainerIndex].Env, v1.EnvVar{
206-
Name: constants.NvidiaVisibleAllDeviceEnv,
207-
Value: constants.NvidiaVisibleAllDeviceValue,
208-
}, v1.EnvVar{
206+
envList := pod.Spec.Containers[injectContainerIndex].Env
207+
if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
208+
return env.Name == constants.PodNamespaceEnv
209+
}) {
210+
envList = append(envList, v1.EnvVar{
211+
Name: constants.PodNamespaceEnv,
212+
ValueFrom: &v1.EnvVarSource{
213+
FieldRef: &v1.ObjectFieldSelector{
214+
FieldPath: constants.NamespaceFieldRef,
215+
},
216+
},
217+
})
218+
}
219+
if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
220+
return env.Name == constants.PodNameEnv
221+
}) {
222+
envList = append(envList, v1.EnvVar{
223+
Name: constants.PodNameEnv,
224+
ValueFrom: &v1.EnvVarSource{
225+
FieldRef: &v1.ObjectFieldSelector{
226+
FieldPath: constants.ResourceNameFieldRef,
227+
},
228+
},
229+
})
230+
}
231+
if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
232+
return env.Name == constants.ContainerNameEnv
233+
}) {
234+
envList = append(envList, v1.EnvVar{
235+
Name: constants.ContainerNameEnv,
236+
Value: pod.Spec.Containers[injectContainerIndex].Name,
237+
})
238+
}
239+
240+
if !lo.ContainsBy(envList, func(env v1.EnvVar) bool {
241+
return env.Name == constants.NvidiaVisibleAllDeviceEnv
242+
}) {
243+
envList = append(envList, v1.EnvVar{
244+
Name: constants.NvidiaVisibleAllDeviceEnv,
245+
Value: constants.NvidiaVisibleAllDeviceValue,
246+
})
247+
}
248+
249+
envList = append(envList, v1.EnvVar{
209250
Name: constants.RealNvmlLibPathEnv,
210251
Value: constants.RealNvmlLibPathValue,
211252
}, v1.EnvVar{
@@ -221,23 +262,6 @@ func AddTFDefaultClientConfBeforePatch(
221262
}, v1.EnvVar{
222263
Name: constants.HypervisorPortEnv,
223264
Value: strconv.Itoa(int(getHypervisorPortNumber(pool.Spec.ComponentConfig.Hypervisor))),
224-
}, v1.EnvVar{
225-
Name: constants.PodNamespaceEnv,
226-
ValueFrom: &v1.EnvVarSource{
227-
FieldRef: &v1.ObjectFieldSelector{
228-
FieldPath: constants.NamespaceFieldRef,
229-
},
230-
},
231-
}, v1.EnvVar{
232-
Name: constants.PodNameEnv,
233-
ValueFrom: &v1.EnvVarSource{
234-
FieldRef: &v1.ObjectFieldSelector{
235-
FieldPath: constants.ResourceNameFieldRef,
236-
},
237-
},
238-
}, v1.EnvVar{
239-
Name: constants.ContainerNameEnv,
240-
Value: pod.Spec.Containers[injectContainerIndex].Name,
241265
}, v1.EnvVar{
242266
Name: constants.NGPUPathEnv,
243267
Value: constants.NGPUPathValue,
@@ -253,13 +277,15 @@ func AddTFDefaultClientConfBeforePatch(
253277
features := strings.Split(pod.Annotations[constants.DisableFeaturesAnnotation], ",")
254278
for _, feature := range features {
255279
if feat, ok := featureShortcutMap[feature]; ok {
256-
pod.Spec.Containers[injectContainerIndex].Env = append(pod.Spec.Containers[injectContainerIndex].Env, v1.EnvVar{
280+
envList = append(envList, v1.EnvVar{
257281
Name: feat.EnvName,
258282
Value: feat.EnvValue,
259283
})
260284
}
261285
}
262286
}
287+
288+
pod.Spec.Containers[injectContainerIndex].Env = envList
263289
}
264290
}
265291
}

0 commit comments

Comments
 (0)