From f4268dbbc9d1f12a63fe90ab50d6f79631c5dc5d Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 09:52:38 -0400 Subject: [PATCH 01/16] feat: traefik alerts Adds two basic Traefik alerts. One for config reloads failing and the other for TLS certificate expiry. --- traefik-mixin/alerts/alerts.libsonnet | 33 +++++++++++++++++++ traefik-mixin/mixin.libsonnet | 1 + .../prometheus_alerts.yaml | 18 ++++++++++ 3 files changed, 52 insertions(+) create mode 100644 traefik-mixin/alerts/alerts.libsonnet diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet new file mode 100644 index 000000000..33d5c78ff --- /dev/null +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -0,0 +1,33 @@ +{ + groups+: [ + { + name: 'traefik', + rules: [ + // TraefikConfigReloadFailuresIncreasing + { + alert: 'TraefikConfigReloadFailuresIncreasing', + expr: "sum(rate(traefik_config_reloads_failure_total[5m])) > 0", + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'Traefik is failing to reload its config', + }, + }, + // TraefikTLSCertificatesExpiring + { + alert: 'TraefikTLSCertificatesExpiring', + expr: "max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) / 86400) < 7", + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'Traefik is serving certificates that will expire soon', + }, + }, + ], + }, + ], +} diff --git a/traefik-mixin/mixin.libsonnet b/traefik-mixin/mixin.libsonnet index 55a5e191a..a63df9338 100644 --- a/traefik-mixin/mixin.libsonnet +++ b/traefik-mixin/mixin.libsonnet @@ -2,4 +2,5 @@ grafanaDashboards+:: { 'traefikdash.json': (import 'dashboards/traefikdash.json'), }, + prometheusAlerts+:: (import 'alerts/alerts.libsonnet'), } diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index e69de29bb..17fb283fc 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -0,0 +1,18 @@ +groups: +- name: traefik + rules: + - alert: TraefikConfigReloadFailuresIncreasing + annotations: + description: Traefik is failing to reload its config + expr: sum(rate(traefik_config_reloads_failure_total[5m])) > 0 + for: 5m + labels: + severity: critical + - alert: TraefikTLSCertificatesExpiring + annotations: + description: Traefik is serving certificates that will expire soon + expr: max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) + / 86400) < 7 + for: 5m + labels: + severity: critical From fe655d65dbd802c13ac422ec36cd8eb37b5b15b2 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 09:58:16 -0400 Subject: [PATCH 02/16] docs: update README for traefik --- traefik-mixin/README.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/traefik-mixin/README.md b/traefik-mixin/README.md index ba5d1823a..1ac5cf984 100644 --- a/traefik-mixin/README.md +++ b/traefik-mixin/README.md @@ -1,10 +1,31 @@ -The Traefik mixin is a set of configurable, reusable, and extensible dashboards based on the metrics exported by Traefik itself. It also creates suitable dashboard descriptions for Grafana. +# traefik-mixin + +The Traefik mixin is a set of configurable, reusable, and extensible dashboards based on the metrics exported by Traefik itself. It also creates suitable dashboard descriptions for Grafana. Lastly, some alerts are also included. To use them, you need to have mixtool and jsonnetfmt installed. If you have a working Go development environment, it's easiest to run the following: +```shell $ go get github.com/monitoring-mixins/mixtool/cmd/mixtool $ go get github.com/google/go-jsonnet/cmd/jsonnetfmt -You can then build the Prometheus rules files alerts.yaml and rules.yaml and a directory dashboard_out with the JSON dashboard files for Grafana: +``` + +You can then build the Prometheus rules files and dashboards for Grafana: +```shell $ make build +``` + +This will generate: + +- Prometheus alerts in `prometheus_rules_out/prometheus_alerts.yaml` +- Prometheus rules in `prometheus_rules_out/prometheus_rules.yaml` (if you have rules defined) +- Grafana dashboards in `dashboards_out/` + +## Included Alerts + +The following Prometheus alerts are included: + +- **TraefikConfigReloadFailuresIncreasing**: Fires if Traefik is failing to reload its config. +- **TraefikTLSCertificatesExpiring**: Fires if Traefik is serving certificates that will expire soon. + For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs. From a6c274b6b7968887ed11ccb6861f30a3a6c18ec1 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 10:51:11 -0400 Subject: [PATCH 03/16] chore: make fmt --- traefik-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 33d5c78ff..090e5aff5 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -6,7 +6,7 @@ // TraefikConfigReloadFailuresIncreasing { alert: 'TraefikConfigReloadFailuresIncreasing', - expr: "sum(rate(traefik_config_reloads_failure_total[5m])) > 0", + expr: 'sum(rate(traefik_config_reloads_failure_total[5m])) > 0', 'for': '5m', labels: { severity: 'critical', @@ -18,7 +18,7 @@ // TraefikTLSCertificatesExpiring { alert: 'TraefikTLSCertificatesExpiring', - expr: "max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) / 86400) < 7", + expr: 'max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) / 86400) < 7', 'for': '5m', labels: { severity: 'critical', From f485e827a01c40f1865a6b19fa1ac193bd06fdf2 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 11:35:20 -0400 Subject: [PATCH 04/16] ref: make everything configurable --- traefik-mixin/alerts/alerts.libsonnet | 80 ++++++++++++------- traefik-mixin/config.libsonnet | 26 ++++++ traefik-mixin/mixin.libsonnet | 4 +- .../prometheus_alerts.yaml | 19 ++++- 4 files changed, 95 insertions(+), 34 deletions(-) create mode 100644 traefik-mixin/config.libsonnet diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 090e5aff5..d2bc97615 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -1,33 +1,57 @@ { - groups+: [ - { - name: 'traefik', - rules: [ - // TraefikConfigReloadFailuresIncreasing - { - alert: 'TraefikConfigReloadFailuresIncreasing', - expr: 'sum(rate(traefik_config_reloads_failure_total[5m])) > 0', - 'for': '5m', - labels: { - severity: 'critical', + prometheusAlerts+:: { + groups+: [ + { + name: 'traefik', + rules: [ + // TraefikConfigReloadFailuresIncreasing + { + alert: 'TraefikConfigReloadFailuresIncreasing', + expr: ||| + sum by (%(sumByLabels)s) (rate(traefik_config_reloads_failure_total{%(timeSeriesLabels)s}[5m])) > 0 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + } + std.get($._config, 'alertLabels', {}), + annotations: { + description: 'Traefik is failing to reload its config', + } + std.get($._config, 'alertAnnotations', {}), }, - annotations: { - description: 'Traefik is failing to reload its config', + // TraefikTLSCertificatesExpiring (critical) + { + alert: 'TraefikTLSCertificatesExpiring', + expr: ||| + max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_critical)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + } + std.get($._config, 'alertLabels', {}), + annotations: { + description: ||| + The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of %(traefik_tls_expiry_days_critical)s. + ||| % $._config, + } + std.get($._config, 'alertAnnotations', {}), }, - }, - // TraefikTLSCertificatesExpiring - { - alert: 'TraefikTLSCertificatesExpiring', - expr: 'max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) / 86400) < 7', - 'for': '5m', - labels: { - severity: 'critical', + // TraefikTLSCertificatesExpiring (warning) + { + alert: 'TraefikTLSCertificatesExpiringSoon', + expr: ||| + max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + } + std.get($._config, 'alertLabels', {}), + annotations: { + description: ||| + The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is below the warning threshold of %(traefik_tls_expiry_days_warning)s. + ||| % $._config, + } + std.get($._config, 'alertAnnotations', {}), }, - annotations: { - description: 'Traefik is serving certificates that will expire soon', - }, - }, - ], - }, - ], + ], + }, + ], + }, } diff --git a/traefik-mixin/config.libsonnet b/traefik-mixin/config.libsonnet new file mode 100644 index 000000000..de6db71ce --- /dev/null +++ b/traefik-mixin/config.libsonnet @@ -0,0 +1,26 @@ +{ + _config+:: { + // alerts thresholds + traefik_tls_expiry_days_critical: 7, + traefik_tls_expiry_days_warning: 14, + timeSeriesLabels: '', + // Example: + // timeSeriesLabels: "component=\"traefik\",environment=\"production\"", + // for config alert + sumByLabels: 'instance', + // for TLS alerts + maxByLabels: 'sans', + alertLabels: {}, + // Example: + // alertLabels: { + // environment: 'production', + // component: 'traefik', + // }, + alertAnnotations: {}, + // Example: + // alertAnnotations: { + // runbook: 'https://runbooks.example.com/traefik-tls', + // grafana: 'https://grafana.example.com/d/traefik', + // }, + }, +} diff --git a/traefik-mixin/mixin.libsonnet b/traefik-mixin/mixin.libsonnet index a63df9338..0d1f96f96 100644 --- a/traefik-mixin/mixin.libsonnet +++ b/traefik-mixin/mixin.libsonnet @@ -2,5 +2,5 @@ grafanaDashboards+:: { 'traefikdash.json': (import 'dashboards/traefikdash.json'), }, - prometheusAlerts+:: (import 'alerts/alerts.libsonnet'), -} +} + (import 'alerts/alerts.libsonnet') + +(import 'config.libsonnet') diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index 17fb283fc..b28d0efe4 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -4,15 +4,26 @@ groups: - alert: TraefikConfigReloadFailuresIncreasing annotations: description: Traefik is failing to reload its config - expr: sum(rate(traefik_config_reloads_failure_total[5m])) > 0 + expr: | + sum by (instance) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 for: 5m labels: severity: critical - alert: TraefikTLSCertificatesExpiring annotations: - description: Traefik is serving certificates that will expire soon - expr: max by (sans) ((last_over_time(traefik_tls_certs_not_after[5m]) - time()) - / 86400) < 7 + description: | + The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7. + expr: | + max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 for: 5m labels: severity: critical + - alert: TraefikTLSCertificatesExpiringSoon + annotations: + description: | + The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the warning threshold of 14. + expr: | + max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 + for: 5m + labels: + severity: warning From 478d293ab4b9f0f8347e5b9271006f643acc2c59 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 11:45:53 -0400 Subject: [PATCH 05/16] ref: only have warning alert fire if above crit --- traefik-mixin/alerts/alerts.libsonnet | 4 ++-- traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index d2bc97615..aeff50f09 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -38,7 +38,7 @@ { alert: 'TraefikTLSCertificatesExpiringSoon', expr: ||| - max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s + max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s > %(traefik_tls_expiry_days_critical)s ||| % $._config, 'for': '5m', labels: { @@ -46,7 +46,7 @@ } + std.get($._config, 'alertLabels', {}), annotations: { description: ||| - The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is below the warning threshold of %(traefik_tls_expiry_days_warning)s. + The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is less than %(traefik_tls_expiry_days_warning)s but greater than %(traefik_tls_expiry_days_critical)s. ||| % $._config, } + std.get($._config, 'alertAnnotations', {}), }, diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index b28d0efe4..5d0862753 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -21,9 +21,9 @@ groups: - alert: TraefikTLSCertificatesExpiringSoon annotations: description: | - The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the warning threshold of 14. + The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but greater than 7. expr: | - max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 + max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 for: 5m labels: severity: warning From afe04fe1dc1302a3cfd2d06068b1385f80ffbbe5 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 1 Jul 2025 11:51:40 -0400 Subject: [PATCH 06/16] docs: mention config vars --- traefik-mixin/README.md | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/traefik-mixin/README.md b/traefik-mixin/README.md index 1ac5cf984..733d2ed76 100644 --- a/traefik-mixin/README.md +++ b/traefik-mixin/README.md @@ -5,14 +5,14 @@ The Traefik mixin is a set of configurable, reusable, and extensible dashboards To use them, you need to have mixtool and jsonnetfmt installed. If you have a working Go development environment, it's easiest to run the following: ```shell -$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool -$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt +go get github.com/monitoring-mixins/mixtool/cmd/mixtool +go get github.com/google/go-jsonnet/cmd/jsonnetfmt ``` You can then build the Prometheus rules files and dashboards for Grafana: ```shell -$ make build +make build ``` This will generate: @@ -26,6 +26,25 @@ This will generate: The following Prometheus alerts are included: - **TraefikConfigReloadFailuresIncreasing**: Fires if Traefik is failing to reload its config. -- **TraefikTLSCertificatesExpiring**: Fires if Traefik is serving certificates that will expire soon. +- **TraefikTLSCertificatesExpiring**: Fires if Traefik is serving certificates that will expire very soon (critical, threshold configurable). +- **TraefikTLSCertificatesExpiringSoon**: Fires if Traefik is serving certificates that will expire soon (warning, threshold configurable, only fires if the expiry is less than the warning threshold but greater than the critical threshold). + +## Configuration + +You can configure alert thresholds and labels in `config.libsonnet`: + +```jsonnet +{ + _config+:: { + traefik_tls_expiry_days_critical: 7, // critical threshold (days) + traefik_tls_expiry_days_warning: 14, // warning threshold (days) + alertLabels: {}, // optional alert labels + alertAnnotations: {}, // optional alert annotations + timeSeriesLabels: "component=\"traefik\",environment=\"production\"", // optional time series labels + sumByLabels: "instance", // optional sum by labels + maxByLabels: "sans", // optional max by labels + }, +} +``` For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs. From a113742d4a58e3e9656c81abb2973107462cd66e Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 09:46:36 -0400 Subject: [PATCH 07/16] ref: match convention of config vars --- traefik-mixin/alerts/alerts.libsonnet | 6 +++--- traefik-mixin/config.libsonnet | 10 +++++----- .../prometheus_rules_out/prometheus_alerts.yaml | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index aeff50f09..5375aa0c4 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -8,7 +8,7 @@ { alert: 'TraefikConfigReloadFailuresIncreasing', expr: ||| - sum by (%(sumByLabels)s) (rate(traefik_config_reloads_failure_total{%(timeSeriesLabels)s}[5m])) > 0 + sum by (%(groupLabels)s) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0 ||| % $._config, 'for': '5m', labels: { @@ -22,7 +22,7 @@ { alert: 'TraefikTLSCertificatesExpiring', expr: ||| - max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_critical)s + max by (%(instanceLabels)s, sans) ((last_over_time(traefik_tls_certs_not_after{%(filteringSelector)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_critical)s ||| % $._config, 'for': '5m', labels: { @@ -38,7 +38,7 @@ { alert: 'TraefikTLSCertificatesExpiringSoon', expr: ||| - max by (%(maxByLabels)s) ((last_over_time(traefik_tls_certs_not_after{%(timeSeriesLabels)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s > %(traefik_tls_expiry_days_critical)s + max by (%(instanceLabels)s, sans) ((last_over_time(traefik_tls_certs_not_after{%(filteringSelector)s}[5m]) - time()) / 86400) < %(traefik_tls_expiry_days_warning)s > %(traefik_tls_expiry_days_critical)s ||| % $._config, 'for': '5m', labels: { diff --git a/traefik-mixin/config.libsonnet b/traefik-mixin/config.libsonnet index de6db71ce..38e4a5812 100644 --- a/traefik-mixin/config.libsonnet +++ b/traefik-mixin/config.libsonnet @@ -3,13 +3,13 @@ // alerts thresholds traefik_tls_expiry_days_critical: 7, traefik_tls_expiry_days_warning: 14, - timeSeriesLabels: '', + filteringSelector: '', // Example: - // timeSeriesLabels: "component=\"traefik\",environment=\"production\"", - // for config alert - sumByLabels: 'instance', + // filteringSelector: "component=\"traefik\",environment=\"production\"", + // for config reload alert + groupLabels: 'job, environment', // for TLS alerts - maxByLabels: 'sans', + instanceLabels: 'instance', alertLabels: {}, // Example: // alertLabels: { diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index 5d0862753..a2eb589c9 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -5,7 +5,7 @@ groups: annotations: description: Traefik is failing to reload its config expr: | - sum by (instance) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 + sum by (job, environment) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 for: 5m labels: severity: critical @@ -14,7 +14,7 @@ groups: description: | The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7. expr: | - max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 + max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 for: 5m labels: severity: critical @@ -23,7 +23,7 @@ groups: description: | The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but greater than 7. expr: | - max by (sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 + max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 for: 5m labels: severity: warning From 0a440a61e3991b0b10f08c5772248ceb8840d392 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 11:43:06 -0400 Subject: [PATCH 08/16] docs: match new var names --- traefik-mixin/README.md | 22 +++++++++++++++++----- traefik-mixin/alerts/alerts.libsonnet | 3 --- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/traefik-mixin/README.md b/traefik-mixin/README.md index 733d2ed76..65970d703 100644 --- a/traefik-mixin/README.md +++ b/traefik-mixin/README.md @@ -31,20 +31,32 @@ The following Prometheus alerts are included: ## Configuration -You can configure alert thresholds and labels in `config.libsonnet`: +You can configure alert thresholds, selectors, and labels in `config.libsonnet`: ```jsonnet { _config+:: { traefik_tls_expiry_days_critical: 7, // critical threshold (days) traefik_tls_expiry_days_warning: 14, // warning threshold (days) + filteringSelector: '', // optional metric label selector for all alerts + // Example: + // filteringSelector: "component=\"traefik\",environment=\"production\"", + groupLabels: 'job, environment', // for config reload alert (sum by) + instanceLabels: 'instance', // for TLS alerts (max by) alertLabels: {}, // optional alert labels + // Example: + // alertLabels: { + // environment: 'production', + // component: 'traefik', + // }, alertAnnotations: {}, // optional alert annotations - timeSeriesLabels: "component=\"traefik\",environment=\"production\"", // optional time series labels - sumByLabels: "instance", // optional sum by labels - maxByLabels: "sans", // optional max by labels + // Example: + // alertAnnotations: { + // runbook: 'https://runbooks.example.com/traefik-tls', + // grafana: 'https://grafana.example.com/d/traefik', + // }, }, } ``` -For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs. +For more advanced uses of mixins, see [monitoring-mixins/docs](https://github.com/monitoring-mixins/docs). diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 5375aa0c4..d67ea8b8e 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -4,7 +4,6 @@ { name: 'traefik', rules: [ - // TraefikConfigReloadFailuresIncreasing { alert: 'TraefikConfigReloadFailuresIncreasing', expr: ||| @@ -18,7 +17,6 @@ description: 'Traefik is failing to reload its config', } + std.get($._config, 'alertAnnotations', {}), }, - // TraefikTLSCertificatesExpiring (critical) { alert: 'TraefikTLSCertificatesExpiring', expr: ||| @@ -34,7 +32,6 @@ ||| % $._config, } + std.get($._config, 'alertAnnotations', {}), }, - // TraefikTLSCertificatesExpiring (warning) { alert: 'TraefikTLSCertificatesExpiringSoon', expr: ||| From c3ebbc91b11ce40af82042c41116bf2246eb7112 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 11:50:22 -0400 Subject: [PATCH 09/16] fix: add summary --- traefik-mixin/alerts/alerts.libsonnet | 3 +++ traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index d67ea8b8e..e65a2215a 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -14,6 +14,7 @@ severity: 'critical', } + std.get($._config, 'alertLabels', {}), annotations: { + summary: 'Traefik is failing to reload its configuration.', description: 'Traefik is failing to reload its config', } + std.get($._config, 'alertAnnotations', {}), }, @@ -27,6 +28,7 @@ severity: 'critical', } + std.get($._config, 'alertLabels', {}), annotations: { + summary: 'A Traefik-served TLS certificate will expire very soon.', description: ||| The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of %(traefik_tls_expiry_days_critical)s. ||| % $._config, @@ -42,6 +44,7 @@ severity: 'warning', } + std.get($._config, 'alertLabels', {}), annotations: { + summary: 'A Traefik-served TLS certificate will expire soon.', description: ||| The minimum number of days until a Traefik-served certificate expires is {{ printf "%%.0f" $value }} days on {{ $labels.sans }} which is less than %(traefik_tls_expiry_days_warning)s but greater than %(traefik_tls_expiry_days_critical)s. ||| % $._config, diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index a2eb589c9..2352b478f 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -4,6 +4,7 @@ groups: - alert: TraefikConfigReloadFailuresIncreasing annotations: description: Traefik is failing to reload its config + summary: Traefik is failing to reload its configuration. expr: | sum by (job, environment) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 for: 5m @@ -13,6 +14,7 @@ groups: annotations: description: | The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7. + summary: A Traefik-served TLS certificate will expire very soon. expr: | max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 for: 5m @@ -22,6 +24,7 @@ groups: annotations: description: | The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but greater than 7. + summary: A Traefik-served TLS certificate will expire soon. expr: | max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 for: 5m From f1ba22103d6b3cd7aeef72b2e9073152fd4f61b4 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:06:00 -0400 Subject: [PATCH 10/16] fix: template config reload with environment label --- traefik-mixin/alerts/alerts.libsonnet | 6 ++++-- traefik-mixin/config.libsonnet | 2 +- traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index e65a2215a..797b76783 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -7,7 +7,7 @@ { alert: 'TraefikConfigReloadFailuresIncreasing', expr: ||| - sum by (%(groupLabels)s) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0 + sum by (%(groupLabels)s, environment) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0 ||| % $._config, 'for': '5m', labels: { @@ -15,7 +15,9 @@ } + std.get($._config, 'alertLabels', {}), annotations: { summary: 'Traefik is failing to reload its configuration.', - description: 'Traefik is failing to reload its config', + description: ||| + Traefik is failing to reload its config in {{ $labels.environment }}. + |||, } + std.get($._config, 'alertAnnotations', {}), }, { diff --git a/traefik-mixin/config.libsonnet b/traefik-mixin/config.libsonnet index 38e4a5812..6df3e5b93 100644 --- a/traefik-mixin/config.libsonnet +++ b/traefik-mixin/config.libsonnet @@ -7,7 +7,7 @@ // Example: // filteringSelector: "component=\"traefik\",environment=\"production\"", // for config reload alert - groupLabels: 'job, environment', + groupLabels: 'job', // for TLS alerts instanceLabels: 'instance', alertLabels: {}, diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index 2352b478f..da1e65901 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -3,7 +3,8 @@ groups: rules: - alert: TraefikConfigReloadFailuresIncreasing annotations: - description: Traefik is failing to reload its config + description: | + Traefik is failing to reload its config in {{ $labels.environment }}. summary: Traefik is failing to reload its configuration. expr: | sum by (job, environment) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 From 7b4b7a1b7943234c29a390f4193202f21606ac09 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:13:49 -0400 Subject: [PATCH 11/16] fix: better grouping for template Co-authored-by: v-zhuravlev --- traefik-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 797b76783..4f0dbea57 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -16,8 +16,8 @@ annotations: { summary: 'Traefik is failing to reload its configuration.', description: ||| - Traefik is failing to reload its config in {{ $labels.environment }}. - |||, + Traefik is failing to reload its config in {{ $labels.%(firstGroupLabel)s }}. + ||| % {firstGroupLabel: std.split($._config.groupLabels, ',')[0]}, } + std.get($._config, 'alertAnnotations', {}), }, { From 5b0f366d9c7c450a0d775efad9f701102155a58f Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:14:11 -0400 Subject: [PATCH 12/16] fix: make all --- traefik-mixin/alerts/alerts.libsonnet | 2 +- traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 4f0dbea57..005d82e45 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -17,7 +17,7 @@ summary: 'Traefik is failing to reload its configuration.', description: ||| Traefik is failing to reload its config in {{ $labels.%(firstGroupLabel)s }}. - ||| % {firstGroupLabel: std.split($._config.groupLabels, ',')[0]}, + ||| % { firstGroupLabel: std.split($._config.groupLabels, ',')[0] }, } + std.get($._config, 'alertAnnotations', {}), }, { diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index da1e65901..0d9d1e41f 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -4,7 +4,7 @@ groups: - alert: TraefikConfigReloadFailuresIncreasing annotations: description: | - Traefik is failing to reload its config in {{ $labels.environment }}. + Traefik is failing to reload its config in {{ $labels.job }}. summary: Traefik is failing to reload its configuration. expr: | sum by (job, environment) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 From 65f86e22e6bd700038e941dec186b1bb34a50263 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:32:36 -0400 Subject: [PATCH 13/16] ref: remove config comments Co-authored-by: v-zhuravlev --- traefik-mixin/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/traefik-mixin/README.md b/traefik-mixin/README.md index 65970d703..7d1b03aae 100644 --- a/traefik-mixin/README.md +++ b/traefik-mixin/README.md @@ -41,8 +41,9 @@ You can configure alert thresholds, selectors, and labels in `config.libsonnet`: filteringSelector: '', // optional metric label selector for all alerts // Example: // filteringSelector: "component=\"traefik\",environment=\"production\"", - groupLabels: 'job, environment', // for config reload alert (sum by) - instanceLabels: 'instance', // for TLS alerts (max by) + groupLabels: 'job, environment', + instanceLabels: 'instance', + alertLabels: {}, // optional alert labels // Example: // alertLabels: { From 899ab9c88e086c34d2e4d16664e7b9fef8fd2479 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:33:05 -0400 Subject: [PATCH 14/16] ref: remove environment label Co-authored-by: v-zhuravlev --- traefik-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traefik-mixin/alerts/alerts.libsonnet b/traefik-mixin/alerts/alerts.libsonnet index 005d82e45..b7855e52b 100644 --- a/traefik-mixin/alerts/alerts.libsonnet +++ b/traefik-mixin/alerts/alerts.libsonnet @@ -7,7 +7,7 @@ { alert: 'TraefikConfigReloadFailuresIncreasing', expr: ||| - sum by (%(groupLabels)s, environment) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0 + sum by (%(groupLabels)s) (rate(traefik_config_reloads_failure_total{%(filteringSelector)s}[5m])) > 0 ||| % $._config, 'for': '5m', labels: { From 2a612f0edf5ac0332d95c5bcb570f792ba299b7c Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 12:33:18 -0400 Subject: [PATCH 15/16] fix: make all --- traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index 0d9d1e41f..66795872a 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -7,7 +7,7 @@ groups: Traefik is failing to reload its config in {{ $labels.job }}. summary: Traefik is failing to reload its configuration. expr: | - sum by (job, environment) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 + sum by (job) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 for: 5m labels: severity: critical From dae09873c50709d38a2409117be4aa144565ec2b Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 2 Jul 2025 14:10:23 -0400 Subject: [PATCH 16/16] fix: make fmt --- .../prometheus_alerts.yaml | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml index 66795872a..da20526e7 100644 --- a/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/traefik-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -1,33 +1,33 @@ groups: -- name: traefik - rules: - - alert: TraefikConfigReloadFailuresIncreasing - annotations: - description: | - Traefik is failing to reload its config in {{ $labels.job }}. - summary: Traefik is failing to reload its configuration. - expr: | - sum by (job) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 - for: 5m - labels: - severity: critical - - alert: TraefikTLSCertificatesExpiring - annotations: - description: | - The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7. - summary: A Traefik-served TLS certificate will expire very soon. - expr: | - max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 - for: 5m - labels: - severity: critical - - alert: TraefikTLSCertificatesExpiringSoon - annotations: - description: | - The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but greater than 7. - summary: A Traefik-served TLS certificate will expire soon. - expr: | - max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 - for: 5m - labels: - severity: warning + - name: traefik + rules: + - alert: TraefikConfigReloadFailuresIncreasing + annotations: + description: | + Traefik is failing to reload its config in {{ $labels.job }}. + summary: Traefik is failing to reload its configuration. + expr: | + sum by (job) (rate(traefik_config_reloads_failure_total{}[5m])) > 0 + for: 5m + labels: + severity: critical + - alert: TraefikTLSCertificatesExpiring + annotations: + description: | + The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is below the critical threshold of 7. + summary: A Traefik-served TLS certificate will expire very soon. + expr: | + max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 7 + for: 5m + labels: + severity: critical + - alert: TraefikTLSCertificatesExpiringSoon + annotations: + description: | + The minimum number of days until a Traefik-served certificate expires is {{ printf "%.0f" $value }} days on {{ $labels.sans }} which is less than 14 but greater than 7. + summary: A Traefik-served TLS certificate will expire soon. + expr: | + max by (instance, sans) ((last_over_time(traefik_tls_certs_not_after{}[5m]) - time()) / 86400) < 14 > 7 + for: 5m + labels: + severity: warning