Skip to content

Commit 20979b2

Browse files
authored
Merge pull request #1884 from rootfs/nvidia-grace
feat(sensor): support NVIDIA Grace Hopper
2 parents c9524a8 + b9e42b8 commit 20979b2

File tree

4 files changed

+447
-1
lines changed

4 files changed

+447
-1
lines changed

pkg/sensors/accelerator/devices/device.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const (
3030
HABANA
3131
DCGM
3232
NVML
33+
GRACE
3334
)
3435

3536
var (
@@ -46,7 +47,7 @@ type (
4647
)
4748

4849
func (d DeviceType) String() string {
49-
return [...]string{"MOCK", "HABANA", "DCGM", "NVML"}[d]
50+
return [...]string{"MOCK", "HABANA", "DCGM", "NVML", "GRACE HOPPER"}[d]
5051
}
5152

5253
type Device interface {
@@ -110,6 +111,7 @@ func registerDevices(r *Registry) {
110111
dcgmCheck(r)
111112
habanaCheck(r)
112113
nvmlCheck(r)
114+
graceCheck(r)
113115
}
114116

115117
func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStartupFunc) {
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package devices
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"path/filepath"
23+
"strconv"
24+
"strings"
25+
"time"
26+
27+
"github.com/sustainable-computing-io/kepler/pkg/config"
28+
"k8s.io/klog/v2"
29+
)
30+
31+
const (
32+
// Grace ACPI power paths and identifiers. The files read for each hwmon
// node are <hwmon dir>/device/power1_oem_info (sensor label) and
// <hwmon dir>/device/power1_average (power reading).
33+
graceHwmonPathTemplate = "/sys/class/hwmon/hwmon*"
34+
graceDevicePath = "device/"
35+
gracePowerPrefix = "power1"
36+
graceOemInfoFile = "_oem_info"
37+
graceAverageFile = "_average"
38+
39+
// Grace Hopper module power identifier: the prefix a sensor label must
// have for the sensor to be used.
40+
graceModuleLabel = "Module Power Socket" // Total CG1 module power (GPU+HBM)
41+
42+
// Constants
43+
// Divisor applied to (power reading * elapsed seconds) to obtain
// millijoules; the reading is treated as microwatts, so the product is
// microjoules. The real elapsed time is used, not a fixed 1 second period.
microWattToMilliJoule = 1000
44+
// Hardware category under which the Grace device is registered.
graceHwType = config.GPU
45+
)
46+
47+
var (
48+
// graceAccImpl is the single package-level Grace device instance;
// graceDeviceStartup initializes it and hands out a pointer to it.
graceAccImpl = gpuGraceACPI{}
49+
// graceType is the device type reported by Name and DevType. It holds the
// zero DeviceType until graceCheck assigns GRACE to it.
graceType DeviceType
50+
)
51+
52+
// gpuGraceACPI is the Grace Hopper device implementation. It obtains module
// power from hwmon files and turns the readings into energy values.
type gpuGraceACPI struct {
53+
// collectionSupported is set by Init to whether at least one module power
// path was found; it can be overridden via SetDeviceCollectionSupported.
collectionSupported bool
54+
modulePowerPaths map[int]string // Module power paths indexed by socket
55+
// currTime is the time of the previous power reading. It is the zero
// time before the first reading and after Shutdown.
currTime time.Time
56+
}
57+
58+
func graceCheck(r *Registry) {
59+
if err := graceAccImpl.InitLib(); err != nil {
60+
klog.V(5).Infof("Error initializing Grace GPU: %v", err)
61+
return
62+
}
63+
graceType = GRACE
64+
if err := addDeviceInterface(r, graceType, graceHwType, graceDeviceStartup); err == nil {
65+
klog.Infof("Using %s to obtain Grace GPU power", graceAccImpl.Name())
66+
} else {
67+
klog.V(5).Infof("Error registering Grace GPU: %v", err)
68+
}
69+
}
70+
71+
func graceDeviceStartup() Device {
72+
if err := graceAccImpl.Init(); err != nil {
73+
klog.Errorf("failed to init Grace GPU device: %v", err)
74+
return nil
75+
}
76+
return &graceAccImpl
77+
}
78+
79+
func (g *gpuGraceACPI) findModulePowerPaths() error {
80+
g.modulePowerPaths = make(map[int]string)
81+
82+
hwmonDirs, err := filepath.Glob(graceHwmonPathTemplate)
83+
if err != nil {
84+
return fmt.Errorf("failed to find hwmon directories: %v", err)
85+
}
86+
87+
for _, hwmonDir := range hwmonDirs {
88+
deviceDir := hwmonDir + "/" + graceDevicePath
89+
oemFile := deviceDir + gracePowerPrefix + graceOemInfoFile
90+
data, err := os.ReadFile(oemFile)
91+
if err != nil {
92+
continue
93+
}
94+
label := strings.TrimSpace(string(data))
95+
96+
if !strings.HasPrefix(label, graceModuleLabel) {
97+
continue
98+
}
99+
100+
var socketNum int
101+
if strings.HasSuffix(label, "Socket 0") {
102+
socketNum = 0
103+
} else if strings.HasSuffix(label, "Socket 1") {
104+
socketNum = 1
105+
} else {
106+
continue
107+
}
108+
109+
avgFile := deviceDir + gracePowerPrefix + graceAverageFile
110+
g.modulePowerPaths[socketNum] = avgFile
111+
}
112+
113+
return nil
114+
}
115+
116+
func (g *gpuGraceACPI) readPowerFile(path string) (uint64, error) {
117+
if path == "" {
118+
return 0, fmt.Errorf("power path not initialized")
119+
}
120+
121+
data, err := os.ReadFile(path)
122+
if err != nil {
123+
return 0, fmt.Errorf("failed to read power file %s: %v", path, err)
124+
}
125+
126+
power, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
127+
if err != nil {
128+
return 0, fmt.Errorf("failed to parse power value: %v", err)
129+
}
130+
131+
now := time.Now()
132+
if g.currTime.IsZero() {
133+
g.currTime = now
134+
return 0, nil
135+
}
136+
137+
diff := now.Sub(g.currTime)
138+
seconds := diff.Seconds()
139+
g.currTime = now
140+
141+
energy := uint64(float64(power) * seconds / microWattToMilliJoule)
142+
return energy, nil
143+
}
144+
145+
// Name returns the string form of graceType.
func (g *gpuGraceACPI) Name() string {
146+
return graceType.String()
147+
}
148+
149+
// DevType returns graceType, the device type this implementation was
// registered under.
func (g *gpuGraceACPI) DevType() DeviceType {
150+
return graceType
151+
}
152+
153+
// HwType returns graceHwType, i.e. config.GPU.
func (g *gpuGraceACPI) HwType() string {
154+
return graceHwType
155+
}
156+
157+
func (g *gpuGraceACPI) InitLib() error {
158+
return nil
159+
}
160+
161+
func (g *gpuGraceACPI) Init() error {
162+
if err := g.findModulePowerPaths(); err != nil {
163+
return err
164+
}
165+
g.collectionSupported = len(g.modulePowerPaths) > 0
166+
if g.collectionSupported {
167+
klog.V(4).Infof("Detected Grace Hopper system with %d GPUs", len(g.modulePowerPaths))
168+
}
169+
return nil
170+
}
171+
172+
// IsDeviceCollectionSupported reports the current value of the
// collectionSupported flag, which Init sets to whether any module power path
// was found.
func (g *gpuGraceACPI) IsDeviceCollectionSupported() bool {
173+
return g.collectionSupported
174+
}
175+
176+
// SetDeviceCollectionSupported overrides the collectionSupported flag with
// the given value.
func (g *gpuGraceACPI) SetDeviceCollectionSupported(supported bool) {
177+
g.collectionSupported = supported
178+
}
179+
180+
func (g *gpuGraceACPI) AbsEnergyFromDevice() []uint32 {
181+
var energies []uint32
182+
for socketNum := 0; socketNum < len(g.modulePowerPaths); socketNum++ {
183+
if path, ok := g.modulePowerPaths[socketNum]; ok {
184+
energy, err := g.readPowerFile(path)
185+
if err != nil {
186+
klog.V(3).Infof("Failed to read GPU power for socket %d: %v", socketNum, err)
187+
energies = append(energies, 0)
188+
continue
189+
}
190+
energies = append(energies, uint32(energy))
191+
}
192+
}
193+
return energies
194+
}
195+
196+
func (g *gpuGraceACPI) DevicesByID() map[int]any {
197+
devs := make(map[int]any)
198+
for socketNum := range g.modulePowerPaths {
199+
devs[socketNum] = GPUDevice{
200+
ID: socketNum,
201+
IsSubdevice: false,
202+
}
203+
}
204+
return devs
205+
}
206+
207+
// DevicesByName always returns a new, empty map; this implementation keeps no
// devices by name.
func (g *gpuGraceACPI) DevicesByName() map[string]any {
208+
return make(map[string]any)
209+
}
210+
211+
// DeviceInstances always returns a new, empty map; this implementation
// reports no device instances.
func (g *gpuGraceACPI) DeviceInstances() map[int]map[int]any {
212+
return make(map[int]map[int]any)
213+
}
214+
215+
// DeviceUtilizationStats ignores dev and always returns a new, empty map and
// a nil error; this implementation reports no utilization statistics.
func (g *gpuGraceACPI) DeviceUtilizationStats(dev any) (map[any]any, error) {
216+
return make(map[any]any), nil
217+
}
218+
219+
// ProcessResourceUtilizationPerDevice ignores both arguments and always
// returns a new, empty map and a nil error.
func (g *gpuGraceACPI) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) {
220+
// Grace Hopper doesn't provide per-process GPU utilization through ACPI
221+
return make(map[uint32]any), nil
222+
}
223+
224+
// Shutdown resets the timestamp of the previous power reading to the zero
// time, so the next reading starts a fresh interval, and always returns true.
func (g *gpuGraceACPI) Shutdown() bool {
225+
g.currTime = time.Time{}
226+
return true
227+
}

pkg/sensors/components/power.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ func InitPowerImpl() {
7575
return
7676
}
7777

78+
graceACPIImpl := &source.GraceACPI{}
79+
if graceACPIImpl.IsSystemCollectionSupported() {
80+
klog.V(1).Infoln("use NVIDIA Grace ACPI to obtain power")
81+
powerImpl = graceACPIImpl
82+
return
83+
}
84+
7885
klog.V(1).Infoln("Unable to obtain power, use estimate method")
7986
estimateImpl := &source.PowerEstimate{}
8087
powerImpl = estimateImpl

0 commit comments

Comments
 (0)