Skip to content

Commit f7a0623

Browse files
authored
feat: add GPUNodeClaim for cloud vendor integration and karpenter integration (#282)
* fix: bump kubernetes version * feat: add gpunodeclaim resource for cloud provisioning * fix: gpu node claim * fix: node claim management * fix: upgrade kubernetes version, add gpu node claim test * fix: reduce log rotation period * fix: merge karpenter integration code * fix: node claim provider nil issue * fix: log lib issue, node claim test * fix: add context to cloud providers and improve logging with controller-runtime logger * fix: add api version for node claim * fix: node pricing map refactor * fix: node claim owner ref issue * fix: provisioning mode duplicate creation issue * fix: node compaction and karpenter provision issue * fix: schedule simulation detail * fix: add killer switch for provisioning mode, fix provision bug * fix: ut issue * fix: global config not loaded bug * fix: ut issue * fix: lint issues * fix: ut issues, add serial test mode * fix: ut issues, compaction provision bug * fix: node filter test case issue
1 parent cf613ef commit f7a0623

File tree

80 files changed

+2649
-1282
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+2649
-1282
lines changed

.vscode/settings.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"Aliyun",
99
"AMDCDNA",
1010
"AMDRDNA",
11+
"apierrors",
1112
"apimachinery",
1213
"apimachineryruntime",
1314
"apiruntime",
@@ -37,8 +38,10 @@
3738
"CUDA",
3839
"cycjimmy",
3940
"datanode",
41+
"deepcopy",
4042
"defaultbinder",
4143
"dylib",
44+
"eastus",
4245
"envtest",
4346
"essd",
4447
"Eventf",
@@ -57,6 +60,7 @@
5760
"gosec",
5861
"gpuallocator",
5962
"gpunode",
63+
"gpunodeclaim",
6064
"gpunodeclaims",
6165
"gpunodeclasses",
6266
"gpunodes",
@@ -75,6 +79,7 @@
7579
"influxdata",
7680
"jsonpatch",
7781
"karpenter",
82+
"karpv",
7883
"klog",
7984
"Klogr",
8085
"kubebuilder",
@@ -86,13 +91,16 @@
8691
"libcuda",
8792
"libnvidia",
8893
"lineprotocol",
94+
"mapstructure",
8995
"metav",
9096
"metricsserver",
9197
"Milli",
98+
"mitchellh",
9299
"mito",
93100
"mutatingwebhookconfiguration",
94101
"ngpu",
95102
"nindent",
103+
"nodeclassref",
96104
"noderesources",
97105
"nolint",
98106
"NVML",
@@ -126,6 +134,7 @@
126134
"statefulsets",
127135
"strategicpatch",
128136
"strategicpatches",
137+
"stretchr",
129138
"subresource",
130139
"Tabler",
131140
"tensorfusion",

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ vet: ## Run go vet against code.
6464
test: manifests generate fmt vet envtest ## Run tests.
6565
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
6666

67+
.PHONY: test-serial
68+
test-serial: manifests generate fmt vet envtest ## Run tests serially (without ginkgo -p parallelism).
69+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -timeout 0 -r --skip-file ./test/e2e
70+
6771
.PHONY: ut
6872
ut: manifests generate ## Run unit tests by make ut F=<focus-file>
6973
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" cd internal/controller && GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 --focus-file $F && cd ../../

PROJECT

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,20 @@ resources:
8787
kind: TensorFusionWorkload
8888
path: github.com/NexusGPU/tensor-fusion/api/v1
8989
version: v1
90+
- api:
91+
crdVersion: v1
92+
namespaced: true
93+
controller: true
94+
domain: tensor-fusion.ai
95+
kind: GPUResourceQuota
96+
path: github.com/NexusGPU/tensor-fusion/api/v1
97+
version: v1
98+
- api:
99+
crdVersion: v1
100+
namespaced: true
101+
controller: true
102+
domain: tensor-fusion.ai
103+
kind: GPUNodeClaim
104+
path: github.com/NexusGPU/tensor-fusion/api/v1
105+
version: v1
90106
version: "3"

api/v1/gpunode_funcs.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
package v1
22

33
import (
4-
"time"
5-
64
"k8s.io/apimachinery/pkg/api/resource"
75
)
86

@@ -18,10 +16,3 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
1816
ObservedGeneration: node.Generation,
1917
}
2018
}
21-
22-
func (node *GPUNode) SetAnnotationToTriggerNodeSync() {
23-
if node.Annotations == nil {
24-
node.Annotations = make(map[string]string)
25-
}
26-
node.Annotations["tensor-fusion.ai/refresh-node-state"] = time.Now().String()
27-
}

api/v1/gpunode_types.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,6 @@ const (
5454

5555
// GPUNodeStatus defines the observed state of GPUNode.
5656
type GPUNodeStatus struct {
57-
// the identifier of the kubernetes node, in nodeSelector mode, GPUNode name is the same as kubernetes node name because of it's owned by the Kubernetes node, while in node provisioning mode owned by the GPUNode, and K8S Node name is uncontrollable
58-
KubernetesNodeName string `json:"kubernetesNodeName"`
59-
6057
// +kubebuilder:default=Pending
6158
Phase TensorFusionGPUNodePhase `json:"phase"`
6259

@@ -112,21 +109,6 @@ const (
112109
)
113110

114111
type GPUNodeInfo struct {
115-
// +optional
116-
// only set when node is managed by TensorFusion
117-
InstanceID string `json:"instanceID,omitempty"`
118-
Region string `json:"region,omitempty"`
119-
120-
Hostname string `json:"hostname,omitempty"`
121-
IP string `json:"ip,omitempty"`
122-
KernelVersion string `json:"kernelVersion,omitempty"`
123-
OSImage string `json:"osImage,omitempty"`
124-
GPUDriverVersion string `json:"gpuDriverVersion,omitempty"`
125-
GPUModel string `json:"gpuModel,omitempty"`
126-
GPUCount int32 `json:"gpuCount,omitempty"`
127-
OperatingSystem string `json:"operatingSystem,omitempty"`
128-
Architecture string `json:"architecture,omitempty"`
129-
130112
// Additional space for L1/L2 VRAM buffer
131113
RAMSize resource.Quantity `json:"ramSize,omitempty"`
132114
DataDiskSize resource.Quantity `json:"dataDiskSize,omitempty"`

api/v1/gpunodeclaim_types.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
"k8s.io/apimachinery/pkg/api/resource"
21+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
)
23+
24+
// GPUNodeClaimStatus defines the observed state of GPUNodeClaim.
25+
type GPUNodeClaimStatus struct {
26+
27+
// +kubebuilder:default=Pending
28+
Phase GPUNodeClaimPhase `json:"phase"`
29+
30+
InstanceID string `json:"instanceID,omitempty"`
31+
}
32+
33+
type GPUNodeClaimPhase string
34+
35+
const (
36+
GPUNodeClaimPending GPUNodeClaimPhase = "Pending"
37+
GPUNodeClaimCreating GPUNodeClaimPhase = "Creating"
38+
GPUNodeClaimBound GPUNodeClaimPhase = "Bound"
39+
)
40+
41+
const GPUNodeClaimKind = "GPUNodeClaim"
42+
43+
// +kubebuilder:object:root=true
44+
// +kubebuilder:subresource:status
45+
// +kubebuilder:resource:scope=Cluster
46+
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
47+
48+
// GPUNodeClaim is the Schema for the gpunodeclaims API.
49+
type GPUNodeClaim struct {
50+
metav1.TypeMeta `json:",inline"`
51+
metav1.ObjectMeta `json:"metadata"`
52+
53+
Spec GPUNodeClaimSpec `json:"spec,omitempty"`
54+
Status GPUNodeClaimStatus `json:"status,omitempty"`
55+
}
56+
57+
// +kubebuilder:object:root=true
58+
59+
// GPUNodeClaimList contains a list of GPUNodeClaim.
60+
type GPUNodeClaimList struct {
61+
metav1.TypeMeta `json:",inline"`
62+
metav1.ListMeta `json:"metadata"`
63+
Items []GPUNodeClaim `json:"items"`
64+
}
65+
66+
func init() {
67+
SchemeBuilder.Register(&GPUNodeClaim{}, &GPUNodeClaimList{})
68+
}
69+
70+
type CapacityTypeEnum string
71+
72+
const (
73+
CapacityTypeOnDemand CapacityTypeEnum = "OnDemand"
74+
75+
CapacityTypeReserved CapacityTypeEnum = "Reserved"
76+
77+
// Spot and Preemptive are aliases of each other, used by different providers
78+
CapacityTypeSpot CapacityTypeEnum = "Spot"
79+
)
80+
81+
// GPUNodeClaimSpec defines the desired state of GPUNodeClaim.
82+
type GPUNodeClaimSpec struct {
83+
NodeName string `json:"nodeName,omitempty"`
84+
Region string `json:"region,omitempty"`
85+
Zone string `json:"zone,omitempty"`
86+
InstanceType string `json:"instanceType,omitempty"`
87+
NodeClassRef GroupKindName `json:"nodeClassRef,omitempty"`
88+
CapacityType CapacityTypeEnum `json:"capacityType,omitempty"`
89+
90+
TFlopsOffered resource.Quantity `json:"tflopsOffered"`
91+
VRAMOffered resource.Quantity `json:"vramOffered"`
92+
GPUDeviceOffered int32 `json:"gpuDeviceOffered"`
93+
94+
ExtraParams map[string]string `json:"extraParams,omitempty"`
95+
}
96+
97+
type GroupKindName struct {
98+
Group string `json:"group"`
99+
Kind string `json:"kind"`
100+
Version string `json:"version"`
101+
Name string `json:"name"`
102+
}

api/v1/gpunodeclass_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ type GPUNodeClassList struct {
154154
Items []GPUNodeClass `json:"items"`
155155
}
156156

157+
const (
158+
GPUNodeClassKind = "GPUNodeClass"
159+
)
160+
157161
func init() {
158162
SchemeBuilder.Register(&GPUNodeClass{}, &GPUNodeClassList{})
159163
}

api/v1/gpupool_types.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,14 @@ const (
111111
// NodeProvisioner or NodeSelector, they are exclusive.
112112
// NodeSelector is for existing GPUs, NodeProvisioner is for Karpenter-like auto management.
113113
type NodeProvisioner struct {
114+
115+
// TensorFusion GPUNodeClass name
114116
NodeClass string `json:"nodeClass,omitempty"`
115117

118+
// Reference (group, kind, version, name) to the Karpenter NodeClass
119+
// +optional
120+
KarpenterNodeClassRef *GroupKindName `json:"karpenterNodeClassRef,omitempty"`
121+
116122
// +optional
117123
GPURequirements []Requirement `json:"gpuRequirements,omitempty"`
118124
// +optional
@@ -167,13 +173,13 @@ type Requirement struct {
167173
Values []string `json:"values,omitempty"`
168174
}
169175

170-
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
176+
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/region;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-vendor;tensor-fusion.ai/gpu-instance-family;tensor-fusion.ai/gpu-instance-size
171177
type NodeRequirementKey string
172178

173179
const (
174-
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
175-
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
176-
NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
180+
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
181+
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
182+
NodeRequirementKeyGPUVendor NodeRequirementKey = "tensor-fusion.ai/gpu-vendor"
177183

178184
NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
179185
NodeRequirementKeyRegion NodeRequirementKey = "topology.kubernetes.io/region"
@@ -401,6 +407,10 @@ type GPUPoolStatus struct {
401407
// TODO not implemented yet
402408
BudgetExceeded string `json:"budgetExceeded,omitempty"`
403409

410+
// +optional
411+
// +kubebuilder:default="None"
412+
ProvisioningPhase ProvisioningPhase `json:"provisioningPhase,omitempty"`
413+
404414
// +optional
405415
LastCompactionTime *metav1.Time `json:"lastCompactionTime,omitempty"`
406416
}
@@ -416,6 +426,21 @@ const (
416426
TensorFusionPoolPhaseDestroying = TensorFusionPoolPhase(constants.PhaseDestroying)
417427
)
418428

429+
// +kubebuilder:validation:Enum=None;Initializing;Provisioning;Completed
430+
type ProvisioningPhase string
431+
432+
const (
433+
// None means not in provisioning mode
434+
ProvisioningPhaseNone = ProvisioningPhase("None")
435+
436+
// Provisioning means GPUNodeClaims have been created and some of them are still pending.
437+
// The pool stays in this phase until all GPUNodeClaims are bound; the next scale up should not happen before then.
438+
ProvisioningPhaseProvisioning = ProvisioningPhase("Provisioning")
439+
440+
// When all GPUNodeClaims are bound, set to Completed
441+
ProvisioningPhaseCompleted = ProvisioningPhase("Completed")
442+
)
443+
419444
type PoolProvisioningStatus struct {
420445
InitializingNodes int32 `json:"initializingNodes,omitempty"`
421446
TerminatingNodes int32 `json:"terminatingNodes,omitempty"`

0 commit comments

Comments
 (0)