File tree Expand file tree Collapse file tree 9 files changed +292
-35
lines changed
Expand file tree Collapse file tree 9 files changed +292
-35
lines changed Original file line number Diff line number Diff line change @@ -412,16 +412,18 @@ rules:
412412 for : 10m
413413 labels :
414414 severity : critical
415- - alert : ThanosReceiveHighForwardRequestFailures
415+ - alert : ThanosReceiveHighReplicationFailures
416416 annotations :
417- message : Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
417+ message : Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize
418418 }}% of requests.
419419 expr : |
420+ thanos_receive_replication_factor > 1
421+ and
420422 (
421423 (
422- sum by (job) (rate(thanos_receive_forward_requests_total {result="error", job=~"thanos-receive.*"}[5m]))
424+ sum by (job) (rate(thanos_receive_replications_total {result="error", job=~"thanos-receive.*"}[5m]))
423425 /
424- sum by (job) (rate(thanos_receive_forward_requests_total {job=~"thanos-receive.*"}[5m]))
426+ sum by (job) (rate(thanos_receive_replications_total {job=~"thanos-receive.*"}[5m]))
425427 )
426428 >
427429 (
@@ -433,6 +435,19 @@ rules:
433435 for : 5m
434436 labels :
435437 severity : warning
438+ - alert : ThanosReceiveHighForwardRequestFailures
439+ annotations :
440+ message : Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
441+ }}% of requests.
442+ expr : |
443+ (
444+ sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
445+ /
446+ sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
447+ ) * 100 > 20
448+ for : 5m
449+ labels :
450+ severity : warning
436451- alert : ThanosReceiveHighHashringFileRefreshFailures
437452 annotations :
438453 message : Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{
Original file line number Diff line number Diff line change @@ -173,16 +173,18 @@ groups:
173173 for : 10m
174174 labels :
175175 severity : critical
176- - alert : ThanosReceiveHighForwardRequestFailures
176+ - alert : ThanosReceiveHighReplicationFailures
177177 annotations :
178- message : Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
179- }}% of requests.
178+ message : Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
179+ humanize }}% of requests.
180180 expr : |
181+ thanos_receive_replication_factor > 1
182+ and
181183 (
182184 (
183- sum by (job) (rate(thanos_receive_forward_requests_total {result="error", job=~"thanos-receive.*"}[5m]))
185+ sum by (job) (rate(thanos_receive_replications_total {result="error", job=~"thanos-receive.*"}[5m]))
184186 /
185- sum by (job) (rate(thanos_receive_forward_requests_total {job=~"thanos-receive.*"}[5m]))
187+ sum by (job) (rate(thanos_receive_replications_total {job=~"thanos-receive.*"}[5m]))
186188 )
187189 >
188190 (
@@ -194,6 +196,19 @@ groups:
194196 for : 5m
195197 labels :
196198 severity : warning
199+ - alert : ThanosReceiveHighForwardRequestFailures
200+ annotations :
201+ message : Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
202+ }}% of requests.
203+ expr : |
204+ (
205+ sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
206+ /
207+ sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
208+ ) * 100 > 20
209+ for : 5m
210+ labels :
211+ severity : warning
197212 - alert : ThanosReceiveHighHashringFileRefreshFailures
198213 annotations :
199214 message : Thanos Receive {{$labels.job}} is failing to refresh hashring file,
Original file line number Diff line number Diff line change @@ -72,6 +72,14 @@ groups:
7272 labels:
7373 quantile: "0.99"
7474 record: :http_request_duration_seconds:histogram_quantile
75+ - expr : |
76+ (
77+ sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
78+ /
79+ sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
80+ )
81+ labels: {}
82+ record: :thanos_receive_replication_failure_per_requests:sum_rate
7583 - expr : |
7684 (
7785 sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
You can’t perform that action at this time.
0 commit comments