@@ -23,6 +23,7 @@ import (
23
23
"io"
24
24
"net/http"
25
25
"strings"
26
+ "time"
26
27
27
28
autoscalingv1alpha1 "github.com/vllm-project/aibrix/api/autoscaling/v1alpha1"
28
29
@@ -43,6 +44,9 @@ const (
43
44
ResourceMetrics MetricType = "resource"
44
45
CustomMetrics MetricType = "custom"
45
46
RawMetrics MetricType = "raw"
47
+ maxRetries = 3
48
+ baseDelay = 100 * time .Millisecond
49
+ maxDelay = 5 * time .Second
46
50
)
47
51
48
52
// MetricFetcher defines an interface for fetching metrics. The metrics can be Kubernetes metrics or Pod Prometheus metrics.
@@ -98,17 +102,53 @@ func (f *RestMetricsFetcher) FetchMetric(ctx context.Context, protocol autoscali
98
102
return 0.0 , fmt .Errorf ("failed to create request to source %s: %v" , url , err )
99
103
}
100
104
101
- // Send the request using the default client
102
- resp , err := f .client .Do (req )
103
- if err != nil {
104
- return 0.0 , fmt .Errorf ("failed to fetch metrics from source %s: %v" , url , err )
105
+ var resp * http.Response
106
+ var lastErr error
107
+
108
+ for attempt := 0 ; attempt < maxRetries ; attempt ++ {
109
+ if attempt > 0 {
110
+ backoffDelay := baseDelay * time .Duration (1 << uint (attempt - 1 ))
111
+ // Cap the delay to prevent it from becoming too long
112
+ if backoffDelay > maxDelay {
113
+ backoffDelay = maxDelay
114
+ }
115
+ klog .V (4 ).InfoS ("Backing off before retry" , "attempt" , attempt + 1 , "delay" , backoffDelay )
116
+ time .Sleep (backoffDelay )
117
+ }
118
+ var err error
119
+ resp , err = f .client .Do (req )
120
+ if err != nil {
121
+ lastErr = err
122
+ continue
123
+ }
124
+ if resp .StatusCode >= 200 && resp .StatusCode < 300 {
125
+ lastErr = nil // Success
126
+ break
127
+ }
128
+
129
+ if err := resp .Body .Close (); err != nil {
130
+ klog .ErrorS (err , "error closing response body" )
131
+ }
132
+
133
+ // Don't retry on 4xx client errors.
134
+ if resp .StatusCode >= 400 && resp .StatusCode < 500 {
135
+ lastErr = fmt .Errorf ("client error: %s" , resp .Status )
136
+ break
137
+ }
138
+ lastErr = fmt .Errorf ("server error: %s" , resp .Status )
105
139
}
140
+
141
+ if lastErr != nil {
142
+ return 0.0 , fmt .Errorf ("failed to fetch metrics from source %s: %v" , url , lastErr )
143
+ }
144
+
106
145
defer func () {
107
146
if err := resp .Body .Close (); err != nil {
108
147
// Handle the error here. For example, log it or take appropriate corrective action.
109
148
klog .ErrorS (err , "error closing response body" )
110
149
}
111
150
}()
151
+
112
152
body , err := io .ReadAll (resp .Body )
113
153
if err != nil {
114
154
return 0.0 , fmt .Errorf ("failed to read response from source %s: %v" , url , err )
0 commit comments