Skip to content

Commit a97bd6f

Browse files
committed
fix(prometheus.operator): retry GetInformer when running prometheus operator components
1 parent fdbe1b5 commit a97bd6f

File tree

2 files changed

+45
-6
lines changed

2 files changed

+45
-6
lines changed

component/prometheus/operator/common/crdmanager.go

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/grafana/agent/service/http"
1818
"github.com/grafana/agent/service/labelstore"
1919
"github.com/grafana/ckit/shard"
20+
"github.com/grafana/dskit/backoff"
2021
"github.com/prometheus/common/model"
2122
"github.com/prometheus/prometheus/config"
2223
"github.com/prometheus/prometheus/discovery"
@@ -27,6 +28,7 @@ import (
2728
toolscache "k8s.io/client-go/tools/cache"
2829
"sigs.k8s.io/controller-runtime/pkg/cache"
2930
"sigs.k8s.io/controller-runtime/pkg/client"
31+
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
3032

3133
"github.com/grafana/agent/component/prometheus/operator"
3234
"github.com/grafana/agent/component/prometheus/operator/configgen"
@@ -39,6 +41,13 @@ import (
3941
// Generous timeout period for configuring all informers
4042
const informerSyncTimeout = 10 * time.Second
4143

44+
// Retry configuration for configuring informers
45+
var informerBackoff = backoff.Config{
46+
MinBackoff: 100 * time.Millisecond,
47+
MaxBackoff: time.Second,
48+
MaxRetries: 10,
49+
}
50+
4251
// crdManager is all of the fields required to run a crd based component.
4352
// on update, this entire thing should be recreated and restarted
4453
type crdManager struct {
@@ -132,7 +141,7 @@ func (c *crdManager) Run(ctx context.Context) error {
132141
if err := c.runInformers(restConfig, ctx); err != nil {
133142
return err
134143
}
135-
level.Info(c.logger).Log("msg", "informers started")
144+
level.Info(c.logger).Log("msg", "informers started")
136145

137146
var cachedTargets map[string][]*targetgroup.Group
138147
// Start the target discovery loop to update the scrape manager with new targets.
@@ -266,6 +275,20 @@ func (c *crdManager) runInformers(restConfig *rest.Config, ctx context.Context)
266275
if ls != labels.Nothing() {
267276
opts.DefaultLabelSelector = ls
268277
}
278+
279+
// TODO: Remove custom opts.Mapper when sigs.k8s.io/controller-runtime >= 0.17.0 as `NewDynamicRESTMapper` is the default in that version
280+
var err error
281+
opts.HTTPClient, err = rest.HTTPClientFor(restConfig)
282+
if err != nil {
283+
return err
284+
}
285+
286+
opts.Mapper, err = apiutil.NewDynamicRESTMapper(restConfig, opts.HTTPClient)
287+
if err != nil {
288+
return fmt.Errorf("could not create RESTMapper from config: %w", err)
289+
}
290+
// TODO: end custom opts.Mapper
291+
269292
cache, err := cache.New(restConfig, opts)
270293
if err != nil {
271294
return err
@@ -305,18 +328,34 @@ func (c *crdManager) configureInformers(ctx context.Context, informers cache.Inf
305328
return fmt.Errorf("unknown kind to configure Informers: %s", c.kind)
306329
}
307330

308-
informerCtx, cancel := context.WithTimeout(ctx, informerSyncTimeout)
309-
defer cancel()
331+
var informer cache.Informer
332+
var err error
333+
bo := backoff.New(ctx, informerBackoff)
334+
335+
for bo.Ongoing() {
336+
informerCtx, cancel := context.WithTimeout(ctx, informerSyncTimeout)
337+
defer cancel()
338+
339+
informer, err = informers.GetInformer(informerCtx, prototype)
340+
if err == nil {
341+
// Successfully got the informer
342+
break
343+
}
310344

311-
informer, err := informers.GetInformer(informerCtx, prototype)
312-
if err != nil {
313345
if errors.Is(informerCtx.Err(), context.DeadlineExceeded) { // Check the context to prevent GetInformer returning a fake timeout
314346
return fmt.Errorf("timeout exceeded while configuring informers. Check the connection"+
315347
" to the Kubernetes API is stable and that the Agent has appropriate RBAC permissions for %v", prototype)
316348
}
317349

350+
level.Warn(c.logger).Log("msg", "failed to get informer - will retry", "err", err)
351+
352+
bo.Wait()
353+
}
354+
355+
if err != nil {
318356
return err
319357
}
358+
320359
const resync = 5 * time.Minute
321360
switch c.kind {
322361
case KindPodMonitor:

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ require (
232232
k8s.io/component-base v0.28.1
233233
k8s.io/klog/v2 v2.100.1
234234
k8s.io/utils v0.0.0-20230726121419-3b25d923346b
235-
sigs.k8s.io/controller-runtime v0.16.2
235+
sigs.k8s.io/controller-runtime v0.16.2 // TODO: Remove custom rest mapper from component/prometheus/operator/common/crdmanager.go when upgrading to v0.17.0 or later
236236
sigs.k8s.io/yaml v1.3.0
237237
)
238238

0 commit comments

Comments
 (0)