Skip to content

Commit 30e4c7e

Browse files
committed
Add alerts and dashboards for the new replication metric
Signed-off-by: Kemal Akkoyun <kakkoyun@gmail.com>
1 parent 2a6ff71 commit 30e4c7e

File tree

9 files changed

+292
-35
lines changed

9 files changed

+292
-35
lines changed

examples/alerts/alerts.md

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -412,16 +412,18 @@ rules:
412412
for: 10m
413413
labels:
414414
severity: critical
415-
- alert: ThanosReceiveHighForwardRequestFailures
415+
- alert: ThanosReceiveHighReplicationFailures
416416
annotations:
417-
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
417+
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize
418418
}}% of requests.
419419
expr: |
420+
thanos_receive_replication_factor > 1
421+
and
420422
(
421423
(
422-
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
424+
sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
423425
/
424-
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
426+
sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
425427
)
426428
>
427429
(
@@ -433,6 +435,19 @@ rules:
433435
for: 5m
434436
labels:
435437
severity: warning
438+
- alert: ThanosReceiveHighForwardRequestFailures
439+
annotations:
440+
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
441+
}}% of requests.
442+
expr: |
443+
(
444+
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
445+
/
446+
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
447+
) * 100 > 20
448+
for: 5m
449+
labels:
450+
severity: warning
436451
- alert: ThanosReceiveHighHashringFileRefreshFailures
437452
annotations:
438453
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{

examples/alerts/alerts.yaml

Lines changed: 20 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -173,16 +173,18 @@ groups:
173173
for: 10m
174174
labels:
175175
severity: critical
176-
- alert: ThanosReceiveHighForwardRequestFailures
176+
- alert: ThanosReceiveHighReplicationFailures
177177
annotations:
178-
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
179-
}}% of requests.
178+
message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
179+
humanize }}% of requests.
180180
expr: |
181+
thanos_receive_replication_factor > 1
182+
and
181183
(
182184
(
183-
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
185+
sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
184186
/
185-
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
187+
sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
186188
)
187189
>
188190
(
@@ -194,6 +196,19 @@ groups:
194196
for: 5m
195197
labels:
196198
severity: warning
199+
- alert: ThanosReceiveHighForwardRequestFailures
200+
annotations:
201+
message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
202+
}}% of requests.
203+
expr: |
204+
(
205+
sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
206+
/
207+
sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
208+
) * 100 > 20
209+
for: 5m
210+
labels:
211+
severity: warning
197212
- alert: ThanosReceiveHighHashringFileRefreshFailures
198213
annotations:
199214
message: Thanos Receive {{$labels.job}} is failing to refresh hashring file,

examples/alerts/rules.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,14 @@ groups:
7272
labels:
7373
quantile: "0.99"
7474
record: :http_request_duration_seconds:histogram_quantile
75+
- expr: |
76+
(
77+
sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
78+
/
79+
sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
80+
)
81+
labels: {}
82+
record: :thanos_receive_replication_failure_per_requests:sum_rate
7583
- expr: |
7684
(
7785
sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))

0 commit comments

Comments
 (0)