diff --git a/config/config.go b/config/config.go
index 82885b233a..55aa05cf76 100644
--- a/config/config.go
+++ b/config/config.go
@@ -1007,6 +1007,7 @@ type Receiver struct {
 
     DiscordConfigs    []*DiscordConfig    `yaml:"discord_configs,omitempty" json:"discord_configs,omitempty"`
     EmailConfigs      []*EmailConfig      `yaml:"email_configs,omitempty" json:"email_configs,omitempty"`
+    IncidentioConfigs []*IncidentioConfig `yaml:"incidentio_configs,omitempty" json:"incidentio_configs,omitempty"`
     PagerdutyConfigs  []*PagerdutyConfig  `yaml:"pagerduty_configs,omitempty" json:"pagerduty_configs,omitempty"`
     SlackConfigs      []*SlackConfig      `yaml:"slack_configs,omitempty" json:"slack_configs,omitempty"`
     WebhookConfigs    []*WebhookConfig    `yaml:"webhook_configs,omitempty" json:"webhook_configs,omitempty"`
diff --git a/config/notifiers.go b/config/notifiers.go
index 87f806aa27..cfe148f1e0 100644
--- a/config/notifiers.go
+++ b/config/notifiers.go
@@ -28,6 +28,13 @@ import (
 )
 
 var (
+    // DefaultIncidentioConfig defines default values for Incident.io configurations.
+    DefaultIncidentioConfig = IncidentioConfig{
+        NotifierConfig: NotifierConfig{
+            VSendResolved: true,
+        },
+    }
+
     // DefaultWebhookConfig defines default values for Webhook configurations.
     DefaultWebhookConfig = WebhookConfig{
         NotifierConfig: NotifierConfig{
@@ -521,6 +528,57 @@ func (c *SlackConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
     return nil
 }
 
+// IncidentioConfig configures notifications via incident.io.
+type IncidentioConfig struct {
+    NotifierConfig `yaml:",inline" json:",inline"`
+
+    HTTPConfig *commoncfg.HTTPClientConfig `yaml:"http_config,omitempty" json:"http_config,omitempty"`
+
+    // URL to send POST request to.
+    URL     *URL   `yaml:"url" json:"url"`
+    URLFile string `yaml:"url_file" json:"url_file"`
+
+    // AlertSourceToken is the key used to authenticate with the alert source in incident.io.
+    AlertSourceToken     Secret `yaml:"alert_source_token,omitempty" json:"alert_source_token,omitempty"`
+    AlertSourceTokenFile string `yaml:"alert_source_token_file,omitempty" json:"alert_source_token_file,omitempty"`
+
+    // MaxAlerts is the maximum number of alerts to be sent per incident.io message.
+    // Alerts exceeding this threshold will be truncated. Setting this to 0
+    // allows an unlimited number of alerts. Note that if the payload exceeds
+    // incident.io's size limits, you will receive a 429 response and alerts
+    // will not be ingested.
+    MaxAlerts uint64 `yaml:"max_alerts" json:"max_alerts"`
+
+    // Timeout is the maximum time allowed to invoke incident.io. Setting this to 0
+    // does not impose a timeout.
+    Timeout time.Duration `yaml:"timeout" json:"timeout"`
+}
+
+// UnmarshalYAML implements the yaml.Unmarshaler interface.
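+// It applies DefaultIncidentioConfig before decoding and then validates that exactly one
+// of url/url_file and exactly one of alert_source_token/alert_source_token_file is set.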
+func (c *IncidentioConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+    *c = DefaultIncidentioConfig
+    type plain IncidentioConfig
+    if err := unmarshal((*plain)(c)); err != nil {
+        return err
+    }
+    if c.URL == nil && c.URLFile == "" {
+        return errors.New("one of url or url_file must be configured")
+    }
+    if c.URL != nil && c.URLFile != "" {
+        return errors.New("at most one of url & url_file must be configured")
+    }
+    if c.AlertSourceToken != "" && c.AlertSourceTokenFile != "" {
+        return errors.New("at most one of alert_source_token & alert_source_token_file must be configured")
+    }
+    if c.AlertSourceToken == "" && c.AlertSourceTokenFile == "" {
+        return errors.New("one of alert_source_token or alert_source_token_file must be configured")
+    }
+    if c.HTTPConfig != nil && c.HTTPConfig.Authorization != nil && (c.AlertSourceToken != "" || c.AlertSourceTokenFile != "") {
+        return errors.New("cannot specify both alert_source_token/alert_source_token_file and http_config.authorization")
+    }
+    return nil
+}
+
 // WebhookConfig configures notifications via a generic webhook.
 type WebhookConfig struct {
     NotifierConfig `yaml:",inline" json:",inline"`
diff --git a/config/receiver/receiver.go b/config/receiver/receiver.go
index d92a19a4c5..23be2b11c8 100644
--- a/config/receiver/receiver.go
+++ b/config/receiver/receiver.go
@@ -23,6 +23,7 @@ import (
     "github.com/prometheus/alertmanager/notify"
     "github.com/prometheus/alertmanager/notify/discord"
     "github.com/prometheus/alertmanager/notify/email"
+    "github.com/prometheus/alertmanager/notify/incidentio"
     "github.com/prometheus/alertmanager/notify/jira"
     "github.com/prometheus/alertmanager/notify/msteams"
     "github.com/prometheus/alertmanager/notify/msteamsv2"
@@ -106,6 +107,9 @@ func BuildReceiverIntegrations(nc config.Receiver, tmpl *template.Template, logg
     for i, c := range nc.JiraConfigs {
         add("jira", i, c, func(l *slog.Logger) (notify.Notifier, error) { return jira.New(c, tmpl, l, httpOpts...) })
     }
+    for i, c := range nc.IncidentioConfigs {
+        add("incidentio", i, c, func(l *slog.Logger) (notify.Notifier, error) { return incidentio.New(c, tmpl, l, httpOpts...) })
+    }
     for i, c := range nc.RocketchatConfigs {
         add("rocketchat", i, c, func(l *slog.Logger) (notify.Notifier, error) { return rocketchat.New(c, tmpl, l, httpOpts...) })
     }
diff --git a/docs/configuration.md b/docs/configuration.md
index 731437f765..ee1e047815 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -748,6 +748,8 @@ opsgenie_configs:
   [ - <opsgenie_config>, ... ]
 pagerduty_configs:
   [ - <pagerduty_config>, ... ]
+incidentio_configs:
+  [ - <incidentio_config>, ... ]
 pushover_configs:
   [ - <pushover_config>, ... ]
 rocketchat_configs:
@@ -1673,6 +1675,40 @@ There is a list of
 [integrations](https://prometheus.io/docs/operating/integrations/#alertmanager-webhook-receiver) with
 this feature.
 
+### `<incidentio_config>`
+
+incident.io notifications are sent via the [incident.io Alert Sources API](https://incident.io/docs/api/alert-sources).
+
+```yaml
+# Whether to notify about resolved alerts.
+[ send_resolved: <boolean> | default = true ]
+
+# The HTTP client's configuration.
+[ http_config: <http_config> | default = global.http_config ]
+
+# The URL to send the incident.io alert. This would typically be provided by the
+# incident.io team when setting up an alert source.
+# url and url_file are mutually exclusive.
+url: <string>
+url_file: <filepath>
+
+# The alert source token is used to authenticate with incident.io.
+# alert_source_token and alert_source_token_file are mutually exclusive.
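+# The token is sent to incident.io as a Bearer token in the HTTP Authorization header
+# and cannot be combined with http_config.authorization.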
+[ alert_source_token: <secret> ]
+[ alert_source_token_file: <filepath> ]
+
+# The maximum number of alerts to be sent per incident.io message.
+# Alerts exceeding this threshold will be truncated. Setting this to 0
+# allows an unlimited number of alerts. Note that if the payload exceeds
+# incident.io's size limits, you will receive a 429 response and alerts
+# will not be ingested.
+[ max_alerts: <int> | default = 0 ]
+
+# Timeout is the maximum time allowed to invoke incident.io. Setting this to 0
+# does not impose a timeout.
+[ timeout: <duration> | default = 0s ]
+```
+
 ### `<wechat_config>`
 
 WeChat notifications are sent via the [WeChat
diff --git a/notify/incidentio/incidentio.go b/notify/incidentio/incidentio.go
new file mode 100644
index 0000000000..cf0d347c4f
--- /dev/null
+++ b/notify/incidentio/incidentio.go
@@ -0,0 +1,205 @@
+// Copyright 2025 Prometheus Team
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package incidentio
+
+import (
+    "bytes"
+    "context"
+    "encoding/json"
+    "errors"
+    "fmt"
+    "io"
+    "log/slog"
+    "net/http"
+    "os"
+    "strings"
+
+    commoncfg "github.com/prometheus/common/config"
+
+    "github.com/prometheus/alertmanager/config"
+    "github.com/prometheus/alertmanager/notify"
+    "github.com/prometheus/alertmanager/template"
+    "github.com/prometheus/alertmanager/types"
+)
+
+// Notifier implements a Notifier for incident.io.
+type Notifier struct {
+    conf    *config.IncidentioConfig
+    tmpl    *template.Template
+    logger  *slog.Logger
+    client  *http.Client
+    retrier *notify.Retrier
+}
+
+// New returns a new incident.io notifier.
+func New(conf *config.IncidentioConfig, t *template.Template, l *slog.Logger, httpOpts ...commoncfg.HTTPClientOption) (*Notifier, error) {
+    // Handle authentication configuration
+    if conf.HTTPConfig == nil {
+        conf.HTTPConfig = &commoncfg.HTTPClientConfig{}
+    }
+
+    // Ensure one of AlertSourceToken or AlertSourceTokenFile is provided
+    if conf.AlertSourceToken == "" && conf.AlertSourceTokenFile == "" {
+        return nil, errors.New("one of alert_source_token or alert_source_token_file must be configured")
+    }
+
+    // Error if authorization is already set in HTTPConfig
+    if conf.HTTPConfig.Authorization != nil {
+        return nil, errors.New("cannot specify both alert_source_token/alert_source_token_file and http_config.authorization")
+    }
+
+    // Set authorization from token or token file
+    if conf.AlertSourceToken != "" {
+        conf.HTTPConfig.Authorization = &commoncfg.Authorization{
+            Type:        "Bearer",
+            Credentials: commoncfg.Secret(conf.AlertSourceToken),
+        }
+    } else if conf.AlertSourceTokenFile != "" {
+        content, err := os.ReadFile(conf.AlertSourceTokenFile)
+        if err != nil {
+            return nil, fmt.Errorf("failed to read alert_source_token_file: %w", err)
+        }
+
+        conf.HTTPConfig.Authorization = &commoncfg.Authorization{
+            Type:        "Bearer",
+            Credentials: commoncfg.Secret(strings.TrimSpace(string(content))),
+        }
+    }
+
+    client, err := commoncfg.NewClientFromConfig(*conf.HTTPConfig, "incidentio", httpOpts...)
+    if err != nil {
+        return nil, err
+    }
+
+    return &Notifier{
+        conf:   conf,
+        tmpl:   t,
+        logger: l,
+        client: client,
+        // Always retry on 429 (rate limiting) and 5xx response codes.
+        retrier: &notify.Retrier{
+            RetryCodes: []int{
+                http.StatusTooManyRequests, // 429
+            },
+            CustomDetailsFunc: errDetails,
+        },
+    }, nil
+}
+
+// Message defines the JSON object sent to incident.io endpoints.
+type Message struct {
+    *template.Data
+
+    // The protocol version.
+    Version         string `json:"version"`
+    GroupKey        string `json:"groupKey"`
+    TruncatedAlerts uint64 `json:"truncatedAlerts"`
+}
+
+func truncateAlerts(maxAlerts uint64, alerts []*types.Alert) ([]*types.Alert, uint64) {
+    if maxAlerts != 0 && uint64(len(alerts)) > maxAlerts {
+        return alerts[:maxAlerts], uint64(len(alerts)) - maxAlerts
+    }
+
+    return alerts, 0
+}
+
+// Notify implements the Notifier interface.
+func (n *Notifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
+    alerts, numTruncated := truncateAlerts(n.conf.MaxAlerts, alerts)
+    data := notify.GetTemplateData(ctx, n.tmpl, alerts, n.logger)
+
+    groupKey, err := notify.ExtractGroupKey(ctx)
+    if err != nil {
+        return false, err
+    }
+
+    n.logger.Debug("incident.io notification", "groupKey", groupKey)
+
+    msg := &Message{
+        Version:         "1",
+        Data:            data,
+        GroupKey:        groupKey.String(),
+        TruncatedAlerts: numTruncated,
+    }
+
+    var buf bytes.Buffer
+    if err := json.NewEncoder(&buf).Encode(msg); err != nil {
+        return false, err
+    }
+
+    var url string
+    if n.conf.URL != nil {
+        url = n.conf.URL.String()
+    } else {
+        content, err := os.ReadFile(n.conf.URLFile)
+        if err != nil {
+            return false, fmt.Errorf("read url_file: %w", err)
+        }
+        url = strings.TrimSpace(string(content))
+    }
+
+    if n.conf.Timeout > 0 {
+        postCtx, cancel := context.WithTimeoutCause(ctx, n.conf.Timeout, fmt.Errorf("configured incident.io timeout reached (%s)", n.conf.Timeout))
+        defer cancel()
+        ctx = postCtx
+    }
+
+    resp, err := notify.PostJSON(ctx, n.client, url, &buf)
+    if err != nil {
+        if ctx.Err() != nil {
+            err = fmt.Errorf("%w: %w", err, context.Cause(ctx))
+        }
+        return true, notify.RedactURL(err)
+    }
+    defer notify.Drain(resp)
+
+    shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
+    if err != nil {
+        return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
+    }
+    return shouldRetry, err
+}
+
+// errDetails extracts error details from the response for better error messages.
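+// The body is expected to be JSON with optional "message", "error", and "errors"
+// fields; bodies that cannot be decoded yield an empty string.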
+func errDetails(status int, body io.Reader) string {
+    if body == nil {
+        return ""
+    }
+
+    // Try to decode the error message from JSON response
+    var errorResponse struct {
+        Message string   `json:"message"`
+        Errors  []string `json:"errors"`
+        Error   string   `json:"error"`
+    }
+
+    if err := json.NewDecoder(body).Decode(&errorResponse); err != nil {
+        return ""
+    }
+
+    // Format the error message
+    var parts []string
+    if errorResponse.Message != "" {
+        parts = append(parts, errorResponse.Message)
+    }
+    if errorResponse.Error != "" {
+        parts = append(parts, errorResponse.Error)
+    }
+    if len(errorResponse.Errors) > 0 {
+        parts = append(parts, strings.Join(errorResponse.Errors, ", "))
+    }
+
+    return strings.Join(parts, ": ")
+}
diff --git a/notify/incidentio/incidentio_test.go b/notify/incidentio/incidentio_test.go
new file mode 100644
index 0000000000..730897b4f8
--- /dev/null
+++ b/notify/incidentio/incidentio_test.go
@@ -0,0 +1,303 @@
+// Copyright 2025 Prometheus Team
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package incidentio
+
+import (
+    "bytes"
+    "context"
+    "encoding/json"
+    "io"
+    "net/http"
+    "net/http/httptest"
+    "net/url"
+    "os"
+    "testing"
+    "time"
+
+    commoncfg "github.com/prometheus/common/config"
+    "github.com/prometheus/common/model"
+    "github.com/prometheus/common/promslog"
+    "github.com/stretchr/testify/require"
+
+    "github.com/prometheus/alertmanager/config"
+    "github.com/prometheus/alertmanager/notify"
+    "github.com/prometheus/alertmanager/notify/test"
+    "github.com/prometheus/alertmanager/types"
+)
+
+func TestIncidentIORetry(t *testing.T) {
+    notifier, err := New(
+        &config.IncidentioConfig{
+            URL:              &config.URL{URL: &url.URL{Scheme: "https", Host: "example.com"}},
+            HTTPConfig:       &commoncfg.HTTPClientConfig{},
+            AlertSourceToken: "test-token",
+        },
+        test.CreateTmpl(t),
+        promslog.NewNopLogger(),
+    )
+    require.NoError(t, err)
+
+    retryCodes := append(test.DefaultRetryCodes(), http.StatusTooManyRequests)
+    for statusCode, expected := range test.RetryTests(retryCodes) {
+        actual, _ := notifier.retrier.Check(statusCode, nil)
+        require.Equal(t, expected, actual, "retry - error on status %d", statusCode)
+    }
+}
+
+func TestIncidentIORedactedURL(t *testing.T) {
+    ctx, u, fn := test.GetContextWithCancelingURL()
+    defer fn()
+
+    notifier, err := New(
+        &config.IncidentioConfig{
+            URL:              &config.URL{URL: u},
+            HTTPConfig:       &commoncfg.HTTPClientConfig{},
+            AlertSourceToken: "test-token",
+        },
+        test.CreateTmpl(t),
+        promslog.NewNopLogger(),
+    )
+    require.NoError(t, err)
+
+    test.AssertNotifyLeaksNoSecret(ctx, t, notifier, u.String())
+}
+
+func TestIncidentIOURLFromFile(t *testing.T) {
+    ctx, u, fn := test.GetContextWithCancelingURL()
+    defer fn()
+
+    f, err := os.CreateTemp("", "incidentio_test")
+    require.NoError(t, err, "creating temp file failed")
+    _, err = f.WriteString(u.String() + "\n")
+    require.NoError(t, err, "writing to temp file failed")
+
+    notifier, err := New(
+        &config.IncidentioConfig{
+            URLFile:          f.Name(),
+            HTTPConfig:       &commoncfg.HTTPClientConfig{},
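+            // An alert source token is still required when the URL is read from a file.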
+            AlertSourceToken: "test-token",
+        },
+        test.CreateTmpl(t),
+        promslog.NewNopLogger(),
+    )
+    require.NoError(t, err)
+
+    test.AssertNotifyLeaksNoSecret(ctx, t, notifier, u.String())
+}
+
+func TestIncidentIOTruncateAlerts(t *testing.T) {
+    alerts := make([]*types.Alert, 10)
+
+    truncatedAlerts, numTruncated := truncateAlerts(0, alerts)
+    require.Len(t, truncatedAlerts, 10)
+    require.EqualValues(t, 0, numTruncated)
+
+    truncatedAlerts, numTruncated = truncateAlerts(4, alerts)
+    require.Len(t, truncatedAlerts, 4)
+    require.EqualValues(t, 6, numTruncated)
+
+    truncatedAlerts, numTruncated = truncateAlerts(100, alerts)
+    require.Len(t, truncatedAlerts, 10)
+    require.EqualValues(t, 0, numTruncated)
+}
+
+func TestIncidentIONotify(t *testing.T) {
+    // Test regular notifications are correctly sent
+    server := httptest.NewServer(http.HandlerFunc(
+        func(w http.ResponseWriter, r *http.Request) {
+            // Verify the content type header
+            contentType := r.Header.Get("Content-Type")
+            require.Equal(t, "application/json", contentType)
+
+            // Decode the webhook payload
+            var msg Message
+            require.NoError(t, json.NewDecoder(r.Body).Decode(&msg))
+
+            // Verify required fields
+            require.Equal(t, "1", msg.Version)
+            require.NotEmpty(t, msg.GroupKey)
+            w.WriteHeader(http.StatusOK)
+        },
+    ))
+    defer server.Close()
+
+    u, err := url.Parse(server.URL)
+    require.NoError(t, err)
+
+    notifier, err := New(
+        &config.IncidentioConfig{
+            URL:              &config.URL{URL: u},
+            HTTPConfig:       &commoncfg.HTTPClientConfig{},
+            AlertSourceToken: "test-token",
+        },
+        test.CreateTmpl(t),
+        promslog.NewNopLogger(),
+    )
+    require.NoError(t, err)
+
+    ctx := context.Background()
+    ctx = notify.WithGroupKey(ctx, "1")
+
+    alert := &types.Alert{
+        Alert: model.Alert{
+            Labels: model.LabelSet{
+                "alertname": "TestAlert",
+                "severity":  "critical",
+            },
+            StartsAt: time.Now(),
+            EndsAt:   time.Now().Add(time.Hour),
+        },
+    }
+
+    retry, err := notifier.Notify(ctx, alert)
+    require.NoError(t, err)
+    require.False(t, retry)
+}
+
+func TestIncidentIORetryScenarios(t *testing.T) {
+    testCases := []struct {
+        name                   string
+        statusCode             int
+        responseBody           []byte
+        expectRetry            bool
+        expectErrorMsgContains string
+    }{
+        {
+            name:                   "success response",
+            statusCode:             http.StatusOK,
+            responseBody:           []byte(`{"status":"success"}`),
+            expectRetry:            false,
+            expectErrorMsgContains: "",
+        },
+        {
+            name:                   "rate limit response",
+            statusCode:             http.StatusTooManyRequests,
+            responseBody:           []byte(`{"error":"rate limit exceeded","message":"Too many requests"}`),
+            expectRetry:            true,
+            expectErrorMsgContains: "rate limit exceeded",
+        },
+        {
+            name:                   "server error response",
+            statusCode:             http.StatusInternalServerError,
+            responseBody:           []byte(`{"error":"internal error"}`),
+            expectRetry:            true,
+            expectErrorMsgContains: "internal error",
+        },
+        {
+            name:                   "client error response",
+            statusCode:             http.StatusBadRequest,
+            responseBody:           []byte(`{"error":"invalid request","message":"Invalid payload format"}`),
+            expectRetry:            false,
+            expectErrorMsgContains: "invalid request",
+        },
+    }
+
+    for _, tc := range testCases {
+        t.Run(tc.name, func(t *testing.T) {
+            server := httptest.NewServer(http.HandlerFunc(
+                func(w http.ResponseWriter, r *http.Request) {
+                    w.WriteHeader(tc.statusCode)
+                    w.Write(tc.responseBody)
+                },
+            ))
+            defer server.Close()
+
+            u, err := url.Parse(server.URL)
+            require.NoError(t, err)
+
+            notifier, err := New(
+                &config.IncidentioConfig{
+                    URL:              &config.URL{URL: u},
+                    HTTPConfig:       &commoncfg.HTTPClientConfig{},
+                    AlertSourceToken: "test-token",
+                },
+                test.CreateTmpl(t),
+                promslog.NewNopLogger(),
+            )
+            require.NoError(t, err)
+
+            ctx := context.Background()
+            ctx = notify.WithGroupKey(ctx, "1")
+
+            alert := &types.Alert{
+                Alert: model.Alert{
+                    Labels: model.LabelSet{
+                        "alertname": "TestAlert",
+                        "severity":  "critical",
+                    },
+                    StartsAt: time.Now(),
+                    EndsAt:   time.Now().Add(time.Hour),
+                },
+            }
+
+            retry, err := notifier.Notify(ctx, alert)
+            if tc.expectErrorMsgContains == "" {
+                require.NoError(t, err)
+            } else {
+                require.Error(t, err)
+                require.Contains(t, err.Error(), tc.expectErrorMsgContains)
+            }
+            require.Equal(t, tc.expectRetry, retry)
+        })
+    }
+}
+
+func TestIncidentIOErrDetails(t *testing.T) {
+    for _, tc := range []struct {
+        name   string
+        status int
+        body   io.Reader
+        expect string
+    }{
+        {
+            name:   "empty body",
+            status: http.StatusBadRequest,
+            body:   nil,
+            expect: "",
+        },
+        {
+            name:   "single error field",
+            status: http.StatusBadRequest,
+            body:   bytes.NewBufferString(`{"error":"Invalid request"}`),
+            expect: "Invalid request",
+        },
+        {
+            name:   "message and errors",
+            status: http.StatusBadRequest,
+            body:   bytes.NewBufferString(`{"message":"Validation failed","errors":["Field is required","Value too long"]}`),
+            expect: "Validation failed: Field is required, Value too long",
+        },
+        {
+            name:   "message and error",
+            status: http.StatusTooManyRequests,
+            body:   bytes.NewBufferString(`{"message":"Too many requests","error":"Rate limit exceeded"}`),
+            expect: "Too many requests: Rate limit exceeded",
+        },
+        {
+            name:   "invalid JSON",
+            status: http.StatusBadRequest,
+            body:   bytes.NewBufferString(`{invalid}`),
+            expect: "",
+        },
+    } {
+        t.Run(tc.name, func(t *testing.T) {
+            result := errDetails(tc.status, tc.body)
+            if tc.expect == "" {
+                require.Equal(t, "", result)
+            } else {
+                require.Contains(t, result, tc.expect)
+            }
+        })
+    }
+}
diff --git a/notify/notify.go b/notify/notify.go
index 3973e7876b..6ab54a7ba2 100644
--- a/notify/notify.go
+++ b/notify/notify.go
@@ -365,6 +365,7 @@ func (m *Metrics) InitializeFor(receiver map[string][]Integration) {
         "webex",
         "msteams",
         "msteamsv2",
+        "incidentio",
         "jira",
         "rocketchat",
     } {
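
As a usage sketch (not part of the patch), a receiver wired to the new integration might look like the following; the alert source URL and token path are placeholders, and max_alerts/timeout are example values, all of which would come from the alert source you configure in incident.io:

```yaml
receivers:
  - name: incident-io
    incidentio_configs:
      - url: https://incidentio.example.com/alert-source   # placeholder; use the URL provided by your incident.io alert source
        alert_source_token_file: /etc/alertmanager/secrets/incidentio_token
        max_alerts: 20
        timeout: 30s
```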