Skip to content

Commit 227d3ba

Browse files
authored
Feat(autoscaler): Add retry delays to RestMetricsFetcher (#1466)
* Add retry delays to RestMetricsFetcher
  Signed-off-by: Omer Aplatony <[email protected]>
* adjust comment
  Signed-off-by: Omer Aplatony <[email protected]>
* lint
  Signed-off-by: Omer Aplatony <[email protected]>
---------
Signed-off-by: Omer Aplatony <[email protected]>
1 parent b37a021 commit 227d3ba

File tree

1 file changed

+44
-4
lines changed

1 file changed

+44
-4
lines changed

pkg/controller/podautoscaler/metrics/fetcher.go

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"io"
2424
"net/http"
2525
"strings"
26+
"time"
2627

2728
autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
2829

@@ -43,6 +44,9 @@ const (
4344
ResourceMetrics MetricType = "resource"
4445
CustomMetrics MetricType = "custom"
4546
RawMetrics MetricType = "raw"
47+
maxRetries = 3
48+
baseDelay = 100 * time.Millisecond
49+
maxDelay = 5 * time.Second
4650
)
4751

4852
// MetricFetcher defines an interface for fetching metrics. The metrics could be Kubernetes metrics or Pod Prometheus metrics.
@@ -98,17 +102,53 @@ func (f *RestMetricsFetcher) FetchMetric(ctx context.Context, protocol autoscali
98102
return 0.0, fmt.Errorf("failed to create request to source %s: %v", url, err)
99103
}
100104

101-
// Send the request using the default client
102-
resp, err := f.client.Do(req)
103-
if err != nil {
104-
return 0.0, fmt.Errorf("failed to fetch metrics from source %s: %v", url, err)
105+
var resp *http.Response
106+
var lastErr error
107+
108+
for attempt := 0; attempt < maxRetries; attempt++ {
109+
if attempt > 0 {
110+
backoffDelay := baseDelay * time.Duration(1<<uint(attempt-1))
111+
// Cap the delay to prevent it from becoming too long
112+
if backoffDelay > maxDelay {
113+
backoffDelay = maxDelay
114+
}
115+
klog.V(4).InfoS("Backing off before retry", "attempt", attempt+1, "delay", backoffDelay)
116+
time.Sleep(backoffDelay)
117+
}
118+
var err error
119+
resp, err = f.client.Do(req)
120+
if err != nil {
121+
lastErr = err
122+
continue
123+
}
124+
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
125+
lastErr = nil // Success
126+
break
127+
}
128+
129+
if err := resp.Body.Close(); err != nil {
130+
klog.ErrorS(err, "error closing response body")
131+
}
132+
133+
// Don't retry on 4xx client errors.
134+
if resp.StatusCode >= 400 && resp.StatusCode < 500 {
135+
lastErr = fmt.Errorf("client error: %s", resp.Status)
136+
break
137+
}
138+
lastErr = fmt.Errorf("server error: %s", resp.Status)
105139
}
140+
141+
if lastErr != nil {
142+
return 0.0, fmt.Errorf("failed to fetch metrics from source %s: %v", url, lastErr)
143+
}
144+
106145
defer func() {
107146
if err := resp.Body.Close(); err != nil {
108147
// Handle the error here. For example, log it or take appropriate corrective action.
109148
klog.ErrorS(err, "error closing response body")
110149
}
111150
}()
151+
112152
body, err := io.ReadAll(resp.Body)
113153
if err != nil {
114154
return 0.0, fmt.Errorf("failed to read response from source %s: %v", url, err)

0 commit comments

Comments
 (0)