diff --git a/traefik-mixin/README.md b/traefik-mixin/README.md
index ba5d1823a..7d1b03aae 100644
--- a/traefik-mixin/README.md
+++ b/traefik-mixin/README.md
@@ -1,10 +1,63 @@
-The Traefik mixin is a set of configurable, reusable, and extensible dashboards based on the metrics exported by Traefik itself. It also creates suitable dashboard descriptions for Grafana.
+# traefik-mixin
+
+The Traefik mixin is a set of configurable, reusable, and extensible dashboards based on the metrics exported by Traefik itself. It also creates suitable dashboard descriptions for Grafana. Lastly, some alerts are included.
 
 To use them, you need to have mixtool and jsonnetfmt installed. If you have a working Go development environment, it's easiest to run the following:
 
-$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
-$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
-You can then build the Prometheus rules files alerts.yaml and rules.yaml and a directory dashboard_out with the JSON dashboard files for Grafana:
+```shell
+go get github.com/monitoring-mixins/mixtool/cmd/mixtool
+go get github.com/google/go-jsonnet/cmd/jsonnetfmt
+```
+
+You can then build the Prometheus rules files and dashboards for Grafana:
+
+```shell
+make build
+```
+
+This will generate:
+
+- Prometheus alerts in `prometheus_rules_out/prometheus_alerts.yaml`
+- Prometheus rules in `prometheus_rules_out/prometheus_rules.yaml` (if you have rules defined)
+- Grafana dashboards in `dashboards_out/`
+
+## Included Alerts
+
+The following Prometheus alerts are included:
+
+- **TraefikConfigReloadFailuresIncreasing**: Fires if Traefik is failing to reload its config.
+- **TraefikTLSCertificatesExpiring**: Fires if Traefik is serving certificates that will expire very soon (critical, threshold configurable).
+- **TraefikTLSCertificatesExpiringSoon**: Fires if Traefik is serving certificates that will expire soon (warning, threshold configurable, only fires if the expiry is less than the warning threshold but greater than the critical threshold).
+
+## Configuration
+
+You can configure alert thresholds, selectors, and labels in `config.libsonnet`:
+
+```jsonnet
+{
+  _config+:: {
+    traefik_tls_expiry_days_critical: 7,  // critical threshold (days)
+    traefik_tls_expiry_days_warning: 14,  // warning threshold (days)
+    filteringSelector: '',  // optional metric label selector for all alerts
+    // Example:
+    // filteringSelector: "component=\"traefik\",environment=\"production\"",
+    groupLabels: 'job',  // aggregation labels for the config reload alert, e.g. 'job, environment'
+    instanceLabels: 'instance',  // aggregation labels for the TLS alerts
+
+    alertLabels: {},  // optional alert labels
+    // Example:
+    // alertLabels: {
+    //   environment: 'production',
+    //   component: 'traefik',
+    // },
+    alertAnnotations: {},  // optional alert annotations
+    // Example:
+    // alertAnnotations: {
+    //   runbook: 'https://runbooks.example.com/traefik-tls',
+    //   grafana: 'https://grafana.example.com/d/traefik',
+    // },
+  },
+}
+```
 
-$ make build
-For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
+For more advanced uses of mixins, see [monitoring-mixins/docs](https://github.com/monitoring-mixins/docs).
diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet
new file mode 100644
index 000000000..b7855e52b
--- /dev/null
+++ b/traefik-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,67 @@
+// Prometheus alerting rules for Traefik. Expressions and descriptions are
+// templated from $._config (thresholds, selectors and label lists).
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'traefik',
+        rules: [
+          {
+            alert: 'TraefikConfigReloadFailuresIncreasing',
+            expr: |||
+              sum by (%(groupLabels)s) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            } + std.get($._config, 'alertLabels', {}),
+            annotations: {
+              summary: 'Traefik is failing to reload its configuration.',
+              // Only the first entry of groupLabels is named in the text. It is
+              // trimmed so a value such as ' job, environment' still yields a
+              // valid `$labels.job` template reference.
+              description: |||
+                Traefik is failing to reload its config in {{ $labels.%(firstGroupLabel)s }}.
+              ||| % { firstGroupLabel: std.stripChars(std.split($._config.groupLabels, ',')[0], ' ') },
+            } + std.get($._config, 'alertAnnotations', {}),
+          },
+          {
+            alert: 'TraefikTLSCertificatesExpiring',
+            expr: |||
+              max by (%(instanceLabels)s, sans) ((last_over_time(traefik_tls_certs_not_after{%(filteringSelector)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_critical)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            } + std.get($._config, 'alertLabels', {}),
+            annotations: {
+              summary: 'A Traefik-served TLS certificate will expire very soon.',
+              description: |||
+                The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of %(traefik_tls_expiry_days_critical)s.
+              ||| % $._config,
+            } + std.get($._config, 'alertAnnotations', {}),
+          },
+          {
+            alert: 'TraefikTLSCertificatesExpiringSoon',
+            // `>=` makes this rule the exact complement of the critical rule
+            // (`< critical`): a value equal to the critical threshold is
+            // reported here instead of matching neither alert.
+            expr: |||
+              max by (%(instanceLabels)s, sans) ((last_over_time(traefik_tls_certs_not_after{%(filteringSelector)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s >= %(traefik_tls_expiry_days_critical)s
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            } + std.get($._config, 'alertLabels', {}),
+            annotations: {
+              summary: 'A Traefik-served TLS certificate will expire soon.',
+              description: |||
+                The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is less than %(traefik_tls_expiry_days_warning)s but at least %(traefik_tls_expiry_days_critical)s.
+              ||| % $._config,
+            } + std.get($._config, 'alertAnnotations', {}),
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/traefik-mixin/config.libsonnet b/traefik-mixin/config.libsonnet
new file mode 100644
index 000000000..6df3e5b93
--- /dev/null
+++ b/traefik-mixin/config.libsonnet
@@ -0,0 +1,26 @@
+{
+  _config+:: {
+    // alerts thresholds
+    traefik_tls_expiry_days_critical: 7,
+    traefik_tls_expiry_days_warning: 14,
+    filteringSelector: '',
+    // Example:
+    // filteringSelector: "component=\"traefik\",environment=\"production\"",
+    // for config reload alert
+    groupLabels: 'job',
+    // for TLS alerts
+    instanceLabels: 'instance',
+    alertLabels: {},
+    // Example:
+    // alertLabels: {
+    //   environment: 'production',
+    //   component: 'traefik',
+    // },
+    alertAnnotations: {},
+    // Example:
+    // alertAnnotations: {
+    //   runbook: 'https://runbooks.example.com/traefik-tls',
+    //   grafana: 'https://grafana.example.com/d/traefik',
+    // },
+  },
+}
diff --git a/traefik-mixin/mixin.libsonnet b/traefik-mixin/mixin.libsonnet
index 55a5e191a..0d1f96f96 100644
--- a/traefik-mixin/mixin.libsonnet
+++ b/traefik-mixin/mixin.libsonnet
@@ -2,4 +2,5 @@
   grafanaDashboards+:: {
     'traefikdash.json': (import 'dashboards/traefikdash.json'),
   },
-}
+} + (import 'alerts/alerts.libsonnet') +
+(import 'config.libsonnet')
diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml
index e69de29bb..da20526e7 100644
--- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml
+++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml
@@ -0,0 +1,33 @@
+groups:
+  - name: traefik
+    rules:
+      - alert: TraefikConfigReloadFailuresIncreasing
+        annotations:
+          description: |
+            Traefik is failing to reload its config in {{ $labels.job }}.
+          summary: Traefik is failing to reload its configuration.
+        expr: |
+          sum by (job) (rate(traefik_config_reloads_failure_total{}[5m])) > 0
+        for: 5m
+        labels:
+          severity: critical
+      - alert: TraefikTLSCertificatesExpiring
+        annotations:
+          description: |
+            The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7.
+          summary: A Traefik-served TLS certificate will expire very soon.
+        expr: |
+          max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7
+        for: 5m
+        labels:
+          severity: critical
+      - alert: TraefikTLSCertificatesExpiringSoon
+        annotations:
+          description: |
+            The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but at least 7.
+          summary: A Traefik-served TLS certificate will expire soon.
+        expr: |
+          max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 >= 7
+        for: 5m
+        labels:
+          severity: warning