Skip to content

Commit 0cae469

Browse files
saintubeshenxin
and
shenxin
authored
scheduler: make loadaware debuggable (#2468)
Signed-off-by: saintube <[email protected]> Co-authored-by: shenxin <[email protected]>
1 parent c56107d commit 0cae469

File tree

3 files changed

+228
-2
lines changed

3 files changed

+228
-2
lines changed

pkg/scheduler/plugins/loadaware/load_aware.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ func (p *Plugin) Filter(ctx context.Context, state *framework.CycleState, pod *c
182182
klog.ErrorS(err, "GetEstimatedUsed failed!", "node", node.Name)
183183
return nil
184184
}
185-
return filterNodeUsage(usageThresholds, estimatedUsed, allocatable, prodPod, filterProfile)
185+
return filterNodeUsage(node.Name, pod, usageThresholds, estimatedUsed, allocatable, prodPod, filterProfile)
186186
}
187187

188188
func (p *Plugin) ScoreExtensions() framework.ScoreExtensions {
@@ -282,10 +282,12 @@ func (p *Plugin) GetEstimatedUsed(nodeName string, nodeMetric *slov1alpha1.NodeM
282282
}
283283
}
284284
}
285+
klog.V(6).Infof("GetEstimatedUsed: node %s, pod %s, estimatedUsed %+v, assignedPodEstimatedUsed %+v, estimatedPods: %+v",
286+
nodeName, klog.KObj(pod), estimatedUsed, assignedPodEstimatedUsed, estimatedPods)
285287
return estimatedUsed, nil
286288
}
287289

288-
func filterNodeUsage(usageThresholds, estimatedUsed map[corev1.ResourceName]int64, allocatable corev1.ResourceList, prodPod bool, filterProfile *usageThresholdsFilterProfile) *framework.Status {
290+
func filterNodeUsage(nodeName string, pod *corev1.Pod, usageThresholds, estimatedUsed map[corev1.ResourceName]int64, allocatable corev1.ResourceList, prodPod bool, filterProfile *usageThresholdsFilterProfile) *framework.Status {
289291
for resourceName, value := range usageThresholds {
290292
if value == 0 {
291293
continue
@@ -303,6 +305,8 @@ func filterNodeUsage(usageThresholds, estimatedUsed map[corev1.ResourceName]int6
303305
if !prodPod && filterProfile.AggregatedUsage != nil {
304306
reason = ErrReasonAggregatedUsageExceedThreshold
305307
}
308+
klog.V(5).InfoS("failed to filter node usage for pod", "pod", klog.KObj(pod), "node", nodeName,
309+
"resource", resourceName, "total", total, "usage", usage, "threshold", value)
306310
return framework.NewStatus(framework.Unschedulable, fmt.Sprintf(reason, resourceName))
307311
}
308312
return nil
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
Copyright 2022 The Koordinator Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package loadaware
18+
19+
import (
20+
"net/http"
21+
"time"
22+
23+
"github.com/gin-gonic/gin"
24+
corev1 "k8s.io/api/core/v1"
25+
26+
"github.com/koordinator-sh/koordinator/pkg/scheduler/frameworkext/services"
27+
)
28+
29+
var _ services.APIServiceProvider = &Plugin{}
30+
31+
type NodeAssignInfoData struct {
32+
Pods []PodAssignInfoData `json:"pods,omitempty"`
33+
}
34+
35+
type PodAssignInfoData struct {
36+
Timestamp time.Time `json:"timestamp,omitempty"`
37+
Pod *corev1.Pod `json:"pod,omitempty"`
38+
}
39+
40+
func (p *Plugin) RegisterEndpoints(group *gin.RouterGroup) {
41+
group.GET("/node/:nodeName", func(c *gin.Context) {
42+
nodeName := c.Param("nodeName")
43+
assignInfo := p.podAssignCache.getPodsAssignInfoOnNode(nodeName)
44+
if len(assignInfo) == 0 {
45+
c.JSON(http.StatusOK, &NodeAssignInfoData{})
46+
return
47+
}
48+
49+
resp := &NodeAssignInfoData{
50+
Pods: make([]PodAssignInfoData, 0, len(assignInfo)),
51+
}
52+
for i := range assignInfo {
53+
resp.Pods = append(resp.Pods, PodAssignInfoData{
54+
Timestamp: assignInfo[i].timestamp,
55+
Pod: assignInfo[i].pod,
56+
})
57+
}
58+
59+
c.JSON(http.StatusOK, resp)
60+
})
61+
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
Copyright 2022 The Koordinator Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package loadaware
18+
19+
import (
20+
"encoding/json"
21+
"net/http"
22+
"net/http/httptest"
23+
"testing"
24+
25+
"github.com/gin-gonic/gin"
26+
"github.com/stretchr/testify/assert"
27+
corev1 "k8s.io/api/core/v1"
28+
"k8s.io/apimachinery/pkg/api/resource"
29+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/apimachinery/pkg/util/uuid"
31+
)
32+
33+
func TestQueryNode(t *testing.T) {
34+
t.Run("test", func(t *testing.T) {
35+
36+
preTimeNowFn := timeNowFn
37+
defer func() {
38+
timeNowFn = preTimeNowFn
39+
}()
40+
timeNowFn = fakeTimeNowFn
41+
42+
pl := &Plugin{
43+
podAssignCache: newPodAssignCache(),
44+
}
45+
testNodeName := "test-node"
46+
pendingPod := &corev1.Pod{
47+
ObjectMeta: metav1.ObjectMeta{
48+
Name: "test-pod",
49+
UID: uuid.NewUUID(),
50+
},
51+
Spec: corev1.PodSpec{
52+
Containers: []corev1.Container{
53+
{
54+
Name: "main",
55+
Resources: corev1.ResourceRequirements{
56+
Requests: corev1.ResourceList{
57+
corev1.ResourceCPU: resource.MustParse("4"),
58+
corev1.ResourceMemory: resource.MustParse("8Gi"),
59+
},
60+
},
61+
},
62+
},
63+
},
64+
Status: corev1.PodStatus{
65+
Phase: corev1.PodPending,
66+
},
67+
}
68+
pl.podAssignCache.assign("", pendingPod)
69+
assignedPod := &corev1.Pod{
70+
ObjectMeta: metav1.ObjectMeta{
71+
Name: "test-pod-1",
72+
UID: uuid.NewUUID(),
73+
},
74+
Spec: corev1.PodSpec{
75+
Containers: []corev1.Container{
76+
{
77+
Name: "main",
78+
Resources: corev1.ResourceRequirements{
79+
Requests: corev1.ResourceList{
80+
corev1.ResourceCPU: resource.MustParse("4"),
81+
corev1.ResourceMemory: resource.MustParse("8Gi"),
82+
},
83+
},
84+
},
85+
},
86+
NodeName: testNodeName,
87+
},
88+
Status: corev1.PodStatus{
89+
Phase: corev1.PodRunning,
90+
},
91+
}
92+
pl.podAssignCache.assign(testNodeName, assignedPod)
93+
terminatedPod := &corev1.Pod{
94+
ObjectMeta: metav1.ObjectMeta{
95+
Name: "test-terminated-pod-1",
96+
UID: uuid.NewUUID(),
97+
},
98+
Spec: corev1.PodSpec{
99+
Containers: []corev1.Container{
100+
{
101+
Name: "main",
102+
Resources: corev1.ResourceRequirements{
103+
Requests: corev1.ResourceList{
104+
corev1.ResourceCPU: resource.MustParse("4"),
105+
corev1.ResourceMemory: resource.MustParse("8Gi"),
106+
},
107+
},
108+
},
109+
},
110+
NodeName: testNodeName,
111+
},
112+
Status: corev1.PodStatus{
113+
Phase: corev1.PodFailed,
114+
},
115+
}
116+
pl.podAssignCache.unAssign(testNodeName, terminatedPod)
117+
118+
// got unknown node
119+
engine := gin.Default()
120+
pl.RegisterEndpoints(engine.Group("/"))
121+
w := httptest.NewRecorder()
122+
req, _ := http.NewRequest("GET", "/node/unknown-node", nil)
123+
engine.ServeHTTP(w, req)
124+
assert.Equal(t, http.StatusOK, w.Result().StatusCode)
125+
got := &NodeAssignInfoData{}
126+
err := json.NewDecoder(w.Result().Body).Decode(got)
127+
assert.NoError(t, err)
128+
expectedResp := &NodeAssignInfoData{}
129+
assert.Equal(t, expectedResp, got)
130+
131+
// got the assign pod
132+
w = httptest.NewRecorder()
133+
req, _ = http.NewRequest("GET", "/node/test-node", nil)
134+
engine.ServeHTTP(w, req)
135+
assert.Equal(t, http.StatusOK, w.Result().StatusCode)
136+
got = &NodeAssignInfoData{}
137+
err = json.NewDecoder(w.Result().Body).Decode(got)
138+
assert.NoError(t, err)
139+
expectedResp = &NodeAssignInfoData{
140+
Pods: []PodAssignInfoData{
141+
{
142+
Timestamp: timeNowFn(),
143+
Pod: assignedPod,
144+
},
145+
},
146+
}
147+
assert.Equal(t, expectedResp, got)
148+
149+
// got no assign pod
150+
pl.podAssignCache.unAssign(testNodeName, assignedPod)
151+
w = httptest.NewRecorder()
152+
req, _ = http.NewRequest("GET", "/node/test-node", nil)
153+
engine.ServeHTTP(w, req)
154+
assert.Equal(t, http.StatusOK, w.Result().StatusCode)
155+
got = &NodeAssignInfoData{}
156+
err = json.NewDecoder(w.Result().Body).Decode(got)
157+
assert.NoError(t, err)
158+
expectedResp = &NodeAssignInfoData{}
159+
assert.Equal(t, expectedResp, got)
160+
})
161+
}

0 commit comments

Comments
 (0)