Skip to content

Commit 8b0c01f

Browse files
authored
fix: npe and workload not found after immediate creation bug (#272)
1 parent e3359b9 commit 8b0c01f

File tree

1 file changed

+5
-15
lines changed

1 file changed

+5
-15
lines changed

internal/gpuallocator/gpuallocator.go

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -310,14 +310,16 @@ func (s *GpuAllocator) Dealloc(
310310
log := log.FromContext(s.ctx)
311311

312312
request, exists := s.uniqueAllocation[podUID]
313-
if !exists {
313+
if !exists || request == nil {
314314
// should not block finalizer
315315
log.Error(fmt.Errorf("pod has not allocated GPUs"), "pod", podUID)
316+
return
316317
}
317318

318319
if _, exists := s.uniqueDeallocation[podUID]; exists {
319320
// should not block finalizer
320321
log.Error(fmt.Errorf("pod has already deallocated GPUs"), "pod", podUID)
322+
return
321323
}
322324

323325
s.storeMutex.Lock()
@@ -1057,16 +1059,6 @@ func removeRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace
10571059
}
10581060

10591061
func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (tfv1.AllocRequest, string, error) {
1060-
var tfWorkload tfv1.TensorFusionWorkload
1061-
1062-
err := s.Get(s.ctx, client.ObjectKey{
1063-
Name: pod.Labels[constants.WorkloadKey],
1064-
Namespace: pod.Namespace,
1065-
}, &tfWorkload)
1066-
if err != nil {
1067-
return tfv1.AllocRequest{}, "failed to get tf workload", err
1068-
}
1069-
10701062
gpuRequestResource, err := utils.GetGPUResource(pod, true)
10711063
if err != nil {
10721064
return tfv1.AllocRequest{}, "invalid gpu request annotation", err
@@ -1091,11 +1083,9 @@ func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (tfv1.AllocRequest,
10911083
Count: uint(count),
10921084
GPUModel: pod.Annotations[constants.GPUModelAnnotation],
10931085
WorkloadNameNamespace: tfv1.NameNamespace{
1094-
Name: tfWorkload.Name,
1095-
Namespace: tfWorkload.Namespace,
1086+
Name: pod.Labels[constants.WorkloadKey],
1087+
Namespace: pod.Namespace,
10961088
},
1097-
NodeAffinity: tfWorkload.Spec.NodeAffinity,
1098-
10991089
PodMeta: pod.ObjectMeta,
11001090
}
11011091
return allocRequest, "", nil

0 commit comments

Comments
 (0)