Skip to content

Commit aeaeca2

Browse files
feat: Support Karpenter Node Scaling integration and pricing (#262)
* support karpenter for cloudprovider * Karpenter Node Scaling integration * Karpenter Node Scaling integration * Karpenter Node Scaling integration * Support Karpenter Node Scaling integration and pricing * filter FractionalGPU and aws ec2 support pricing and nodeinfo * Fill AZ instance gpu memory * add mock node create/delete behavior and fix unit test issue * fix parameter name issue and remove Finalizer and GPUArchitectureEnum * 1. support customer annotation 2. support dynamic load aws-gpu/az-gpu file * fix launch.json * rollback dynamic pricing data logic --------- Co-authored-by: Joey Yang <[email protected]>
1 parent b311427 commit aeaeca2

28 files changed

+3893
-12
lines changed

api/v1/gpupool_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ type NodeProvisioner struct {
126126
CPUTaints []Taint `json:"cpuTaints,omitempty"`
127127
// +optional
128128
CPULabels map[string]string `json:"cpuNodeLabels,omitempty"`
129+
// +optional
130+
GPUAnnotation map[string]string `json:"gpuNodeAnnotations,omitempty"`
129131

130132
// +optional
131133
// NodeProvisioner will start a virtual billing based on public pricing or customized pricing; if the VM's costs exceed any budget constraint, the new VM will not be created and alerts will be generated

api/v1/tensorfusioncluster_types.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ const (
126126
AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
127127
)
128128

129-
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;mock
129+
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;alibaba;nvidia;tencent;runpod;karpenter;mock
130130
type ComputingVendorName string
131131

132132
const (
@@ -143,6 +143,7 @@ const (
143143
ComputingVendorNvidia ComputingVendorName = "nvidia"
144144
ComputingVendorTencent ComputingVendorName = "tencent"
145145
ComputingVendorRunPod ComputingVendorName = "runpod"
146+
ComputingVendorKarpenter ComputingVendorName = "karpenter"
146147

147148
// This is for unit/integration testing only, no cloud provider is involved
148149
ComputingVendorMock ComputingVendorName = "mock"

api/v1/zz_generated.deepcopy.go

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ spec:
359359
type: string
360360
type: object
361361
type: array
362+
gpuNodeAnnotations:
363+
additionalProperties:
364+
type: string
365+
type: object
362366
gpuNodeLabels:
363367
additionalProperties:
364368
type: string

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ spec:
119119
- nvidia
120120
- tencent
121121
- runpod
122+
- karpenter
122123
- mock
123124
type: string
124125
type: object
@@ -425,6 +426,10 @@ spec:
425426
type: string
426427
type: object
427428
type: array
429+
gpuNodeAnnotations:
430+
additionalProperties:
431+
type: string
432+
type: object
428433
gpuNodeLabels:
429434
additionalProperties:
430435
type: string

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import (
4848
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
4949
"github.com/NexusGPU/tensor-fusion/cmd/sched"
5050
"github.com/NexusGPU/tensor-fusion/internal/alert"
51+
"github.com/NexusGPU/tensor-fusion/internal/cloudprovider/pricing"
5152
"github.com/NexusGPU/tensor-fusion/internal/config"
5253
"github.com/NexusGPU/tensor-fusion/internal/constants"
5354
"github.com/NexusGPU/tensor-fusion/internal/controller"
@@ -564,6 +565,7 @@ func startWatchGPUInfoChanges(ctx context.Context, gpuInfos *[]config.GpuInfo, g
564565
for _, gpuInfo := range updatedGpuInfos {
565566
gpuPricingMap[gpuInfo.FullModelName] = gpuInfo.CostPerHour
566567
}
568+
pricing.SetTflopsMap(&updatedGpuInfos)
567569
}
568570
}()
569571
}

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ spec:
359359
type: string
360360
type: object
361361
type: array
362+
gpuNodeAnnotations:
363+
additionalProperties:
364+
type: string
365+
type: object
362366
gpuNodeLabels:
363367
additionalProperties:
364368
type: string

config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ spec:
119119
- nvidia
120120
- tencent
121121
- runpod
122+
- karpenter
122123
- mock
123124
type: string
124125
type: object
@@ -425,6 +426,10 @@ spec:
425426
type: string
426427
type: object
427428
type: array
429+
gpuNodeAnnotations:
430+
additionalProperties:
431+
type: string
432+
type: object
428433
gpuNodeLabels:
429434
additionalProperties:
430435
type: string

go.mod

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ require (
1212
github.com/gin-gonic/gin v1.10.1
1313
github.com/influxdata/line-protocol/v2 v2.2.1
1414
github.com/lithammer/shortuuid/v4 v4.2.0
15+
github.com/mitchellh/mapstructure v1.5.0
1516
github.com/onsi/ginkgo/v2 v2.23.4
1617
github.com/onsi/gomega v1.37.0
1718
github.com/pkg/errors v0.9.1
@@ -33,6 +34,7 @@ require (
3334
k8s.io/kubernetes v1.32.5
3435
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979
3536
sigs.k8s.io/controller-runtime v0.20.4
37+
sigs.k8s.io/karpenter v1.2.2
3638
sigs.k8s.io/scheduler-plugins v0.31.8
3739
sigs.k8s.io/yaml v1.5.0
3840
)
@@ -49,6 +51,7 @@ require (
4951
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.4 // indirect
5052
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17 // indirect
5153
github.com/aws/smithy-go v1.22.4 // indirect
54+
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 // indirect
5255
github.com/beorn7/perks v1.0.1 // indirect
5356
github.com/blang/semver/v4 v4.0.0 // indirect
5457
github.com/bytedance/sonic v1.13.2 // indirect
@@ -104,6 +107,7 @@ require (
104107
github.com/leodido/go-urn v1.4.0 // indirect
105108
github.com/mailru/easyjson v0.9.0 // indirect
106109
github.com/mattn/go-isatty v0.0.20 // indirect
110+
github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect
107111
github.com/moby/term v0.5.0 // indirect
108112
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
109113
github.com/modern-go/reflect2 v1.0.2 // indirect
@@ -116,6 +120,7 @@ require (
116120
github.com/prometheus/client_model v0.6.1 // indirect
117121
github.com/prometheus/common v0.62.0 // indirect
118122
github.com/prometheus/procfs v0.15.1 // indirect
123+
github.com/robfig/cron/v3 v3.0.1 // indirect
119124
github.com/spf13/cobra v1.8.1 // indirect
120125
github.com/spf13/pflag v1.0.5 // indirect
121126
github.com/stoewer/go-strcase v1.3.0 // indirect

go.sum

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,17 @@ github.com/NVIDIA/go-nvml v0.12.9-0 h1:e344UK8ZkeMeeLkdQtRhmXRxNf+u532LDZPGMtkdu
1313
github.com/NVIDIA/go-nvml v0.12.9-0/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4=
1414
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
1515
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
16+
github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg=
17+
github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y=
1618
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
1719
github.com/aliyun/alibaba-cloud-sdk-go v1.63.107 h1:qagvUyrgOnBIlVRQWOyCZGVKUIYbMBdGdJ104vBpRFU=
1820
github.com/aliyun/alibaba-cloud-sdk-go v1.63.107/go.mod h1:SOSDHfe1kX91v3W5QiBsWSLqeLxImobbMX1mxrFHsVQ=
1921
github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=
2022
github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw=
2123
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA=
2224
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
25+
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
26+
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
2327
github.com/aws/aws-sdk-go-v2 v1.36.5 h1:0OF9RiEMEdDdZEMqF9MRjevyxAQcf6gY+E7vwBILFj0=
2428
github.com/aws/aws-sdk-go-v2 v1.36.5/go.mod h1:EYrzvCCN9CMUTa5+6lf6MM4tq3Zjp8UhSGR/cBsjai0=
2529
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.36 h1:SsytQyTMHMDPspp+spo7XwXTP44aJZZAC7fBV2C5+5s=
@@ -34,6 +38,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17 h1:t0E6FzRE
3438
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17/go.mod h1:ygpklyoaypuyDvOM5ujWGrYWpAK3h7ugnmKCU/76Ys4=
3539
github.com/aws/smithy-go v1.22.4 h1:uqXzVZNuNexwc/xrh6Tb56u89WDlJY6HS+KC0S4QSjw=
3640
github.com/aws/smithy-go v1.22.4/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI=
41+
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 h1:9nhjY3dzCpEmhpQ0vMlhB7wqucAiftLjAIEQu8uT2J4=
42+
github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115/go.mod h1:TTs6HGuqmgdNyNlbdv29v1OoON+kQKVPojZgJaJVtNk=
3743
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
3844
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
3945
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
@@ -159,6 +165,8 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4
159165
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
160166
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 h1:VNqngBF40hVlDloBruUehVYC3ArSgIyScOAyMRqBxRg=
161167
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1/go.mod h1:RBRO7fro65R6tjKzYgLAFo0t1QEXY1Dp+i/bvpRiqiQ=
168+
github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
169+
github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
162170
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
163171
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
164172
github.com/influxdata/line-protocol-corpus v0.0.0-20210519164801-ca6fa5da0184/go.mod h1:03nmhxzZ7Xk2pdG+lmMd7mHDfeVOYFyhOgwO61qWU98=
@@ -211,6 +219,10 @@ github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4
211219
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
212220
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
213221
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
222+
github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4=
223+
github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE=
224+
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
225+
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
214226
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
215227
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
216228
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -229,6 +241,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
229241
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
230242
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b h1:FfH+VrHHk6Lxt9HdVS0PXzSXFyS2NbZKXv33FYPol0A=
231243
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b/go.mod h1:AC62GU6hc0BrNm+9RK9VSiwa/EUe1bkIeFORAMcHvJU=
244+
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
245+
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
232246
github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
233247
github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
234248
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
@@ -246,6 +260,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ
246260
github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
247261
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
248262
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
263+
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
264+
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
249265
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
250266
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
251267
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
@@ -494,6 +510,8 @@ sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+
494510
sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY=
495511
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE=
496512
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
513+
sigs.k8s.io/karpenter v1.2.2 h1:aiks+/3JHIMtEw+gQXdw7cf2M/ZTcpto4BOHE9dC40U=
514+
sigs.k8s.io/karpenter v1.2.2/go.mod h1:CTQOmDpbYMTY6uvRZ6XoyKNXWoIMVbCOGCkgvYeZSO4=
497515
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
498516
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
499517
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=

0 commit comments

Comments
 (0)