
Commit 3bf901c

Merge pull request #2176 from sthaha/feat-track-all-terminated
feat(monitor): add terminated workload tracking and reporting
2 parents e33e5d4 + a2648cb commit 3bf901c


13 files changed: +2015 additions, -238 deletions


docs/metrics/metrics.md

Lines changed: 6 additions & 0 deletions
@@ -107,6 +107,7 @@ These metrics provide energy and power information for containers.
 - `container_id`
 - `container_name`
 - `runtime`
+- `state`
 - `zone`
 - `pod_id`
 - **Constant Labels**:
@@ -120,6 +121,7 @@ These metrics provide energy and power information for containers.
 - `container_id`
 - `container_name`
 - `runtime`
+- `state`
 - `zone`
 - `pod_id`
 - **Constant Labels**:
@@ -187,6 +189,7 @@ These metrics provide energy and power information for virtual machines.
 - `vm_id`
 - `vm_name`
 - `hypervisor`
+- `state`
 - `zone`
 - **Constant Labels**:
 - `node_name`
@@ -199,6 +202,7 @@ These metrics provide energy and power information for virtual machines.
 - `vm_id`
 - `vm_name`
 - `hypervisor`
+- `state`
 - `zone`
 - **Constant Labels**:
 - `node_name`
@@ -215,6 +219,7 @@ These metrics provide energy and power information for pods.
 - `pod_id`
 - `pod_name`
 - `pod_namespace`
+- `state`
 - `zone`
 - **Constant Labels**:
 - `node_name`
@@ -227,6 +232,7 @@ These metrics provide energy and power information for pods.
 - `pod_id`
 - `pod_name`
 - `pod_namespace`
+- `state`
 - `zone`
 - **Constant Labels**:
 - `node_name`
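
The net effect of the documentation change above is that every per-workload power series (containers, VMs, pods) now carries a `state` label alongside its existing identity labels. Below is a minimal Go sketch, purely illustrative, of how a consumer could pick terminated workloads out of a scrape using the `expfmt` text parser; the scrape fragment, IDs, and values are made up, and only the metric name and label names come from this change.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Illustrative scrape fragment only; the label set mirrors the documented
	// one above, but the IDs and values are invented.
	scrape := `# TYPE kepler_container_cpu_joules_total counter
kepler_container_cpu_joules_total{container_id="abc",container_name="web",runtime="docker",state="running",zone="package",pod_id="p1",node_name="n1"} 150
kepler_container_cpu_joules_total{container_id="def",container_name="batch",runtime="podman",state="terminated",zone="package",pod_id="p2",node_name="n1"} 300
`

	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(scrape))
	if err != nil {
		panic(err)
	}

	// Keep only samples whose new state label says "terminated".
	for _, m := range families["kepler_container_cpu_joules_total"].GetMetric() {
		labels := map[string]string{}
		for _, lp := range m.GetLabel() {
			labels[lp.GetName()] = lp.GetValue()
		}
		if labels["state"] == "terminated" {
			fmt.Printf("%s used %.0f J before exiting\n", labels["container_id"], m.GetCounter().GetValue())
		}
	}
}
```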

internal/exporter/prometheus/collector/power_collector.go

Lines changed: 24 additions & 21 deletions
@@ -129,14 +129,14 @@ func NewPowerCollector(monitor PowerDataProvider, nodeName string, logger *slog.
         processCPUWattsDescriptor: wattsDesc("process", "cpu", nodeName, []string{"pid", "comm", "exe", "type", "state", cntrID, vmID, zone}),
         processCPUTimeDescriptor:  timeDesc("process", "cpu", nodeName, []string{"pid", "comm", "exe", "type", cntrID, vmID}),
 
-        containerCPUJoulesDescriptor: joulesDesc("container", "cpu", nodeName, []string{cntrID, "container_name", "runtime", zone, podID}),
-        containerCPUWattsDescriptor:  wattsDesc("container", "cpu", nodeName, []string{cntrID, "container_name", "runtime", zone, podID}),
+        containerCPUJoulesDescriptor: joulesDesc("container", "cpu", nodeName, []string{cntrID, "container_name", "runtime", "state", zone, podID}),
+        containerCPUWattsDescriptor:  wattsDesc("container", "cpu", nodeName, []string{cntrID, "container_name", "runtime", "state", zone, podID}),
 
-        vmCPUJoulesDescriptor: joulesDesc("vm", "cpu", nodeName, []string{vmID, "vm_name", "hypervisor", zone}),
-        vmCPUWattsDescriptor:  wattsDesc("vm", "cpu", nodeName, []string{vmID, "vm_name", "hypervisor", zone}),
+        vmCPUJoulesDescriptor: joulesDesc("vm", "cpu", nodeName, []string{vmID, "vm_name", "hypervisor", "state", zone}),
+        vmCPUWattsDescriptor:  wattsDesc("vm", "cpu", nodeName, []string{vmID, "vm_name", "hypervisor", "state", zone}),
 
-        podCPUJoulesDescriptor: joulesDesc("pod", "cpu", nodeName, []string{podID, "pod_name", "pod_namespace", zone}),
-        podCPUWattsDescriptor:  wattsDesc("pod", "cpu", nodeName, []string{podID, "pod_name", "pod_namespace", zone}),
+        podCPUJoulesDescriptor: joulesDesc("pod", "cpu", nodeName, []string{podID, "pod_name", "pod_namespace", "state", zone}),
+        podCPUWattsDescriptor:  wattsDesc("pod", "cpu", nodeName, []string{podID, "pod_name", "pod_namespace", "state", zone}),
     }
 
     go c.waitForData()
@@ -228,15 +228,18 @@ func (c *PowerCollector) Collect(ch chan<- prometheus.Metric) {
     }
 
     if c.metricsLevel.IsContainerEnabled() {
-        c.collectContainerMetrics(ch, snapshot.Containers)
+        c.collectContainerMetrics(ch, "running", snapshot.Containers)
+        c.collectContainerMetrics(ch, "terminated", snapshot.TerminatedContainers)
     }
 
     if c.metricsLevel.IsVMEnabled() {
-        c.collectVMMetrics(ch, snapshot.VirtualMachines)
+        c.collectVMMetrics(ch, "running", snapshot.VirtualMachines)
+        c.collectVMMetrics(ch, "terminated", snapshot.TerminatedVirtualMachines)
     }
 
     if c.metricsLevel.IsPodEnabled() {
-        c.collectPodMetrics(ch, snapshot.Pods)
+        c.collectPodMetrics(ch, "running", snapshot.Pods)
+        c.collectPodMetrics(ch, "terminated", snapshot.TerminatedPods)
     }
 }
 
@@ -342,9 +345,9 @@ func (c *PowerCollector) collectProcessMetrics(ch chan<- prometheus.Metric, stat
 }
 
 // collectContainerMetrics collects container-level power metrics
-func (c *PowerCollector) collectContainerMetrics(ch chan<- prometheus.Metric, containers monitor.Containers) {
+func (c *PowerCollector) collectContainerMetrics(ch chan<- prometheus.Metric, state string, containers monitor.Containers) {
     if len(containers) == 0 {
-        c.logger.Debug("No containers to export metrics for")
+        c.logger.Debug("No containers to export metrics for", "state", state)
         return
     }
 
@@ -357,7 +360,7 @@ func (c *PowerCollector) collectContainerMetrics(ch chan<- prometheus.Metric, co
                 c.containerCPUJoulesDescriptor,
                 prometheus.CounterValue,
                 usage.EnergyTotal.Joules(),
-                id, container.Name, string(container.Runtime),
+                id, container.Name, string(container.Runtime), state,
                 zoneName,
                 container.PodID,
             )
@@ -366,7 +369,7 @@ func (c *PowerCollector) collectContainerMetrics(ch chan<- prometheus.Metric, co
                 c.containerCPUWattsDescriptor,
                 prometheus.GaugeValue,
                 usage.Power.Watts(),
-                id, container.Name, string(container.Runtime),
+                id, container.Name, string(container.Runtime), state,
                 zoneName,
                 container.PodID,
             )
@@ -375,9 +378,9 @@ func (c *PowerCollector) collectContainerMetrics(ch chan<- prometheus.Metric, co
 }
 
 // collectVMMetrics collects vm-level power metrics
-func (c *PowerCollector) collectVMMetrics(ch chan<- prometheus.Metric, vms monitor.VirtualMachines) {
+func (c *PowerCollector) collectVMMetrics(ch chan<- prometheus.Metric, state string, vms monitor.VirtualMachines) {
     if len(vms) == 0 {
-        c.logger.Debug("No vms to export metrics for")
+        c.logger.Debug("No vms to export metrics for", "state", state)
         return
     }
 
@@ -389,24 +392,24 @@ func (c *PowerCollector) collectVMMetrics(ch chan<- prometheus.Metric, vms monit
                 c.vmCPUJoulesDescriptor,
                 prometheus.CounterValue,
                 usage.EnergyTotal.Joules(),
-                id, vm.Name, string(vm.Hypervisor),
+                id, vm.Name, string(vm.Hypervisor), state,
                 zoneName,
             )
 
             ch <- prometheus.MustNewConstMetric(
                 c.vmCPUWattsDescriptor,
                 prometheus.GaugeValue,
                 usage.Power.Watts(),
-                id, vm.Name, string(vm.Hypervisor),
+                id, vm.Name, string(vm.Hypervisor), state,
                 zoneName,
             )
         }
     }
 }
 
-func (c *PowerCollector) collectPodMetrics(ch chan<- prometheus.Metric, pods monitor.Pods) {
+func (c *PowerCollector) collectPodMetrics(ch chan<- prometheus.Metric, state string, pods monitor.Pods) {
     if len(pods) == 0 {
-        c.logger.Debug("No pods to export metrics for")
+        c.logger.Debug("No pods to export metrics", "state", state)
         return
     }
 
@@ -418,15 +421,15 @@ func (c *PowerCollector) collectPodMetrics(ch chan<- prometheus.Metric, pods mon
                 c.podCPUJoulesDescriptor,
                 prometheus.CounterValue,
                 usage.EnergyTotal.Joules(),
-                id, pod.Name, pod.Namespace,
+                id, pod.Name, pod.Namespace, state,
                 zoneName,
             )
 
             ch <- prometheus.MustNewConstMetric(
                 c.podCPUWattsDescriptor,
                 prometheus.GaugeValue,
                 usage.Power.Watts(),
-                id, pod.Name, pod.Namespace,
+                id, pod.Name, pod.Namespace, state,
                 zoneName,
             )
         }
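
For reference, here is a condensed, self-contained sketch of the collection pattern the diff above introduces: a single descriptor per metric gains a `state` variable label, and the same collect helper runs once per state. This is a toy collector, not Kepler's `PowerCollector`; the `demo_workload_cpu_watts` name and the `workload`/`demoCollector` types are invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// workload stands in for Kepler's monitor.Container / VirtualMachine / Pod;
// only the fields needed for the sketch are kept.
type workload struct {
	id    string
	watts float64
}

// demoCollector mirrors the shape of the change: one descriptor with a
// "state" variable label, shared by running and terminated workloads.
type demoCollector struct {
	wattsDesc  *prometheus.Desc
	running    []workload
	terminated []workload
}

func (c *demoCollector) Describe(ch chan<- *prometheus.Desc) { ch <- c.wattsDesc }

func (c *demoCollector) Collect(ch chan<- prometheus.Metric) {
	// Same helper for both slices; the state string becomes a label value,
	// as in collectContainerMetrics(ch, "running"/"terminated", ...).
	c.collect(ch, "running", c.running)
	c.collect(ch, "terminated", c.terminated)
}

func (c *demoCollector) collect(ch chan<- prometheus.Metric, state string, ws []workload) {
	for _, w := range ws {
		ch <- prometheus.MustNewConstMetric(c.wattsDesc, prometheus.GaugeValue, w.watts, w.id, state)
	}
}

func main() {
	c := &demoCollector{
		wattsDesc:  prometheus.NewDesc("demo_workload_cpu_watts", "demo power", []string{"id", "state"}, nil),
		running:    []workload{{id: "a", watts: 15}},
		terminated: []workload{{id: "b", watts: 30}},
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(c)

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), m.GetLabel(), m.GetGauge().GetValue())
		}
	}
}
```

Keeping `state` as a label on the existing metric names, rather than introducing separate terminated-only metrics, means existing queries keep working; a `state="terminated"` filter is enough to isolate energy attributed to workloads that have already exited.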

internal/exporter/prometheus/collector/power_collector_test.go

Lines changed: 196 additions & 1 deletion
@@ -285,7 +285,8 @@ func TestPowerCollector(t *testing.T) {
     mockMonitor.On("Snapshot").Return(testData, nil)
 
     // Create collector
-    collector := NewPowerCollector(mockMonitor, "test-node", logger, metrics.MetricsLevelNode|metrics.MetricsLevelProcess|metrics.MetricsLevelContainer|metrics.MetricsLevelVM|metrics.MetricsLevelPod)
+    allLevels := metrics.MetricsLevelNode | metrics.MetricsLevelProcess | metrics.MetricsLevelContainer | metrics.MetricsLevelVM | metrics.MetricsLevelPod
+    collector := NewPowerCollector(mockMonitor, "test-node", logger, allLevels)
 
     // Trigger update to ensure descriptors are created
     mockMonitor.TriggerUpdate()
@@ -860,3 +861,197 @@ func TestPowerCollector_MetricsLevelFiltering(t *testing.T) {
         })
     }
 }
+
+func TestTerminatedContainerExport(t *testing.T) {
+    logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError}))
+    mockMonitor := NewMockPowerMonitor()
+
+    packageZone := device.NewMockRaplZone("package", 0, "/sys/class/powercap/intel-rapl/intel-rapl:0", 1000)
+
+    testSnapshot := &monitor.Snapshot{
+        Timestamp: time.Now(),
+        Node: &monitor.Node{
+            Zones: monitor.NodeZoneUsageMap{
+                packageZone: monitor.NodeUsage{
+                    EnergyTotal: 1000 * device.Joule,
+                    Power:       10 * device.Watt,
+                },
+            },
+        },
+        Processes: monitor.Processes{},
+        Containers: monitor.Containers{
+            "running-container": &monitor.Container{
+                ID:      "running-container",
+                Name:    "running-cont",
+                Runtime: resource.DockerRuntime,
+                Zones: monitor.ZoneUsageMap{
+                    packageZone: monitor.Usage{
+                        EnergyTotal: 150 * device.Joule,
+                        Power:       15 * device.Watt,
+                    },
+                },
+            },
+        },
+        TerminatedContainers: monitor.Containers{
+            "terminated-container": &monitor.Container{
+                ID:      "terminated-container",
+                Name:    "terminated-cont",
+                Runtime: resource.PodmanRuntime,
+                Zones: monitor.ZoneUsageMap{
+                    packageZone: monitor.Usage{
+                        EnergyTotal: 300 * device.Joule,
+                        Power:       30 * device.Watt,
+                    },
+                },
+            },
+        },
+        VirtualMachines: monitor.VirtualMachines{},
+        Pods:            monitor.Pods{},
+    }
+
+    mockMonitor.On("Snapshot").Return(testSnapshot, nil)
+
+    allLevels := metrics.MetricsLevelNode | metrics.MetricsLevelProcess | metrics.MetricsLevelContainer | metrics.MetricsLevelVM | metrics.MetricsLevelPod
+    collector := NewPowerCollector(mockMonitor, "test-node", logger, allLevels)
+
+    registry := prometheus.NewRegistry()
+    registry.MustRegister(collector)
+
+    mockMonitor.TriggerUpdate()
+    time.Sleep(10 * time.Millisecond)
+
+    t.Run("Terminated Container Metrics Export", func(t *testing.T) {
+        // Test running container metrics
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"container_id": "running-container", "state": "running"}, 150.0)
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_watts",
+            map[string]string{"container_id": "running-container", "state": "running"}, 15.0)
+
+        // Test terminated container metrics
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"container_id": "terminated-container", "state": "terminated"}, 300.0)
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_watts",
+            map[string]string{"container_id": "terminated-container", "state": "terminated"}, 30.0)
+
+        // Test additional labels for running container
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"container_id": "running-container", "container_name": "running-cont", "runtime": "docker"}, 150.0)
+
+        // Test additional labels for terminated container
+        assertMetricLabelValues(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"container_id": "terminated-container", "container_name": "terminated-cont", "runtime": "podman"}, 300.0)
+    })
+
+    t.Run("Container State Labels", func(t *testing.T) {
+        // Verify that the state label exists and has correct values
+        assertMetricExists(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"state": "running"})
+        assertMetricExists(t, registry, "kepler_container_cpu_joules_total",
+            map[string]string{"state": "terminated"})
+
+        // Also verify for watts metrics
+        assertMetricExists(t, registry, "kepler_container_cpu_watts",
+            map[string]string{"state": "running"})
+        assertMetricExists(t, registry, "kepler_container_cpu_watts",
+            map[string]string{"state": "terminated"})
+    })
+
+    mockMonitor.AssertExpectations(t)
+}
+
+func TestTerminatedVMExport(t *testing.T) {
+    logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError}))
+    mockMonitor := NewMockPowerMonitor()
+
+    packageZone := device.NewMockRaplZone("package", 0, "/sys/class/powercap/intel-rapl/intel-rapl:0", 1000)
+
+    testSnapshot := &monitor.Snapshot{
+        Timestamp: time.Now(),
+        Node: &monitor.Node{
+            Zones: monitor.NodeZoneUsageMap{
+                packageZone: monitor.NodeUsage{
+                    EnergyTotal: 1000 * device.Joule,
+                    Power:       10 * device.Watt,
+                },
+            },
+        },
+        Processes:  monitor.Processes{},
+        Containers: monitor.Containers{},
+        VirtualMachines: monitor.VirtualMachines{
+            "running-vm": &monitor.VirtualMachine{
+                ID:         "running-vm",
+                Name:       "running-virtual-machine",
+                Hypervisor: resource.KVMHypervisor,
+                Zones: monitor.ZoneUsageMap{
+                    packageZone: monitor.Usage{
+                        EnergyTotal: 250 * device.Joule,
+                        Power:       25 * device.Watt,
+                    },
+                },
+            },
+        },
+        TerminatedVirtualMachines: monitor.VirtualMachines{
+            "terminated-vm": &monitor.VirtualMachine{
+                ID:         "terminated-vm",
+                Name:       "terminated-virtual-machine",
+                Hypervisor: resource.KVMHypervisor,
+                Zones: monitor.ZoneUsageMap{
+                    packageZone: monitor.Usage{
+                        EnergyTotal: 400 * device.Joule,
+                        Power:       40 * device.Watt,
+                    },
+                },
+            },
+        },
+        Pods: monitor.Pods{},
+    }
+
+    mockMonitor.On("Snapshot").Return(testSnapshot, nil)
+
+    allLevels := metrics.MetricsLevelNode | metrics.MetricsLevelProcess | metrics.MetricsLevelContainer | metrics.MetricsLevelVM | metrics.MetricsLevelPod
+    collector := NewPowerCollector(mockMonitor, "test-node", logger, allLevels)
+
+    registry := prometheus.NewRegistry()
+    registry.MustRegister(collector)
+
+    mockMonitor.TriggerUpdate()
+    time.Sleep(10 * time.Millisecond)
+
+    t.Run("Terminated VM Metrics Export", func(t *testing.T) {
+        // Test running VM metrics
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"vm_id": "running-vm", "state": "running"}, 250.0)
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_watts",
+            map[string]string{"vm_id": "running-vm", "state": "running"}, 25.0)
+
+        // Test terminated VM metrics
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"vm_id": "terminated-vm", "state": "terminated"}, 400.0)
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_watts",
+            map[string]string{"vm_id": "terminated-vm", "state": "terminated"}, 40.0)
+
+        // Test additional labels for running VM
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"vm_id": "running-vm", "vm_name": "running-virtual-machine", "hypervisor": "kvm"}, 250.0)
+
+        // Test additional labels for terminated VM
+        assertMetricLabelValues(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"vm_id": "terminated-vm", "vm_name": "terminated-virtual-machine", "hypervisor": "kvm"}, 400.0)
+    })
+
+    t.Run("VM State Labels", func(t *testing.T) {
+        // Verify that the state label exists and has correct values
+        assertMetricExists(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"state": "running"})
+        assertMetricExists(t, registry, "kepler_vm_cpu_joules_total",
+            map[string]string{"state": "terminated"})
+
+        // Also verify for watts metrics
+        assertMetricExists(t, registry, "kepler_vm_cpu_watts",
+            map[string]string{"state": "running"})
+        assertMetricExists(t, registry, "kepler_vm_cpu_watts",
+            map[string]string{"state": "terminated"})
+    })
+
+    mockMonitor.AssertExpectations(t)
+}
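
The new tests rely on `assertMetricLabelValues` and `assertMetricExists`, which are not part of this diff. A rough sketch of how such helpers might be shaped is below, assuming they gather from the registry and match a label subset; the actual helpers live elsewhere in the collector test package and may differ.

```go
package collector // assumed package name; matches the test file's directory

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// findMetric returns the first sample of the named family whose labels
// contain want as a subset, or nil if there is none. (Hypothetical helper.)
func findMetric(t *testing.T, reg *prometheus.Registry, name string, want map[string]string) *dto.Metric {
	t.Helper()
	families, err := reg.Gather()
	if err != nil {
		t.Fatalf("gather failed: %v", err)
	}
	for _, mf := range families {
		if mf.GetName() != name {
			continue
		}
		for _, m := range mf.GetMetric() {
			have := map[string]string{}
			for _, lp := range m.GetLabel() {
				have[lp.GetName()] = lp.GetValue()
			}
			matched := true
			for k, v := range want {
				if have[k] != v {
					matched = false
					break
				}
			}
			if matched {
				return m
			}
		}
	}
	return nil
}

func assertMetricExists(t *testing.T, reg *prometheus.Registry, name string, labels map[string]string) {
	t.Helper()
	if findMetric(t, reg, name, labels) == nil {
		t.Errorf("expected metric %s with labels %v", name, labels)
	}
}

func assertMetricLabelValues(t *testing.T, reg *prometheus.Registry, name string, labels map[string]string, want float64) {
	t.Helper()
	m := findMetric(t, reg, name, labels)
	if m == nil {
		t.Errorf("no metric %s with labels %v", name, labels)
		return
	}
	// A sample is either a counter or a gauge; the nil-safe getters return 0
	// for whichever is unset.
	got := m.GetCounter().GetValue() + m.GetGauge().GetValue()
	if got != want {
		t.Errorf("%s%v = %v, want %v", name, labels, got, want)
	}
}
```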
