@@ -17,6 +17,7 @@ import (
17
17
"github.com/grafana/agent/service/http"
18
18
"github.com/grafana/agent/service/labelstore"
19
19
"github.com/grafana/ckit/shard"
20
+ "github.com/grafana/dskit/backoff"
20
21
"github.com/prometheus/common/model"
21
22
"github.com/prometheus/prometheus/config"
22
23
"github.com/prometheus/prometheus/discovery"
@@ -27,6 +28,7 @@ import (
27
28
toolscache "k8s.io/client-go/tools/cache"
28
29
"sigs.k8s.io/controller-runtime/pkg/cache"
29
30
"sigs.k8s.io/controller-runtime/pkg/client"
31
+ "sigs.k8s.io/controller-runtime/pkg/client/apiutil"
30
32
31
33
"github.com/grafana/agent/component/prometheus/operator"
32
34
"github.com/grafana/agent/component/prometheus/operator/configgen"
@@ -39,6 +41,13 @@ import (
39
41
// Generous timeout period for configuring all informers
40
42
const informerSyncTimeout = 10 * time .Second
41
43
44
+ // Backoff settings used when retrying informer setup
45
+ var informerBackoff = backoff.Config {
46
+ MinBackoff : 100 * time .Millisecond ,
47
+ MaxBackoff : time .Second ,
48
+ MaxRetries : 10 ,
49
+ }
50
+
42
51
// crdManager is all of the fields required to run a crd based component.
43
52
// on update, this entire thing should be recreated and restarted
44
53
type crdManager struct {
@@ -132,7 +141,7 @@ func (c *crdManager) Run(ctx context.Context) error {
132
141
if err := c .runInformers (restConfig , ctx ); err != nil {
133
142
return err
134
143
}
135
- level .Info (c .logger ).Log ("msg" , "informers started" )
144
+ level .Info (c .logger ).Log ("msg" , "informers started" )
136
145
137
146
var cachedTargets map [string ][]* targetgroup.Group
138
147
// Start the target discovery loop to update the scrape manager with new targets.
@@ -266,6 +275,20 @@ func (c *crdManager) runInformers(restConfig *rest.Config, ctx context.Context)
266
275
if ls != labels .Nothing () {
267
276
opts .DefaultLabelSelector = ls
268
277
}
278
+
279
+ // TODO: Remove the custom opts.Mapper once sigs.k8s.io/controller-runtime is upgraded to >= 0.17.0, where `NewDynamicRESTMapper` is the default
280
+ var err error
281
+ opts .HTTPClient , err = rest .HTTPClientFor (restConfig )
282
+ if err != nil {
283
+ return err
284
+ }
285
+
286
+ opts .Mapper , err = apiutil .NewDynamicRESTMapper (restConfig , opts .HTTPClient )
287
+ if err != nil {
288
+ return fmt .Errorf ("could not create RESTMapper from config: %w" , err )
289
+ }
290
+ // TODO: end custom opts.Mapper
291
+
269
292
cache , err := cache .New (restConfig , opts )
270
293
if err != nil {
271
294
return err
@@ -305,18 +328,34 @@ func (c *crdManager) configureInformers(ctx context.Context, informers cache.Inf
305
328
return fmt .Errorf ("unknown kind to configure Informers: %s" , c .kind )
306
329
}
307
330
308
- informerCtx , cancel := context .WithTimeout (ctx , informerSyncTimeout )
309
- defer cancel ()
331
+ var informer cache.Informer
332
+ var err error
333
+ bo := backoff .New (ctx , informerBackoff )
334
+
335
+ for bo .Ongoing () {
336
+ informerCtx , cancel := context .WithTimeout (ctx , informerSyncTimeout )
337
+ defer cancel ()
338
+
339
+ informer , err = informers .GetInformer (informerCtx , prototype )
340
+ if err == nil {
341
+ // Successfully got the informer
342
+ break
343
+ }
310
344
311
- informer , err := informers .GetInformer (informerCtx , prototype )
312
- if err != nil {
313
345
if errors .Is (informerCtx .Err (), context .DeadlineExceeded ) { // Check the context to prevent GetInformer returning a fake timeout
314
346
return fmt .Errorf ("timeout exceeded while configuring informers. Check the connection" +
315
347
" to the Kubernetes API is stable and that the Agent has appropriate RBAC permissions for %v" , prototype )
316
348
}
317
349
350
+ level .Warn (c .logger ).Log ("msg" , "failed to get informer - will retry" , "err" , err )
351
+
352
+ bo .Wait ()
353
+ }
354
+
355
+ if err != nil {
318
356
return err
319
357
}
358
+
320
359
const resync = 5 * time .Minute
321
360
switch c .kind {
322
361
case KindPodMonitor :
0 commit comments