Mixin: Add and update alerts #2644
@@ -21,7 +21,7 @@
           },
           annotations: {
             summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
           },
         },
         {

@@ -41,7 +41,7 @@
           },
           annotations: {
             summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
           },
         },
         {

@@ -59,7 +59,7 @@
           },
           annotations: {
             summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config,
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
           },
         },
         {

@@ -77,7 +77,7 @@
           },
           annotations: {
             summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config,
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
           },
         },
         {

@@ -97,7 +97,7 @@
           },
           annotations: {
             summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
           },
         },
         {

@@ -117,7 +117,7 @@
           },
           annotations: {
             summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
           },
         },
         {

@@ -135,7 +135,7 @@
           },
           annotations: {
             summary: 'Filesystem has less than 5% inodes left.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
           },
         },
         {

@@ -153,13 +153,13 @@
           },
           annotations: {
             summary: 'Filesystem has less than 3% inodes left.',
-            description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
+            description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
           },
         },
         {
           alert: 'NodeNetworkReceiveErrs',
           expr: |||
-            rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+            rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
           ||| % $._config,
           'for': '1h',
           labels: {

@@ -173,7 +173,7 @@
         {
           alert: 'NodeNetworkTransmitErrs',
           expr: |||
-            rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+            rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
           ||| % $._config,
           'for': '1h',
           labels: {

@@ -187,7 +187,7 @@
         {
           alert: 'NodeHighNumberConntrackEntriesUsed',
           expr: |||
-            (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+            (node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75
           ||| % $._config,
           annotations: {
             summary: 'Number of conntrack are getting close to the limit.',

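The `%(nodeExporterSelector)s` placeholders added in the three expressions above are substituted from the mixin's `_config` object when the Jsonnet is rendered, which is what scopes these queries to node-exporter targets instead of any similarly named metric scraped by the same Prometheus. A minimal, runnable sketch of the mechanics (the `job="node"` value is illustrative, not necessarily the mixin's default):

```jsonnet
// interpolation.jsonnet -- render with: jsonnet interpolation.jsonnet
// Jsonnet's `%` operator fills %(key)s placeholders from an object,
// analogous to Python's string formatting.
local config = {
  nodeExporterSelector: 'job="node"',  // illustrative selector value
};

{
  expr: |||
    rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
  ||| % config,
}
```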
@@ -204,7 +204,7 @@
           ||| % $._config,
           annotations: {
             summary: 'Node Exporter text file collector failed to scrape.',
-            description: 'Node Exporter text file collector failed to scrape.',
+            description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.',
           },
           labels: {
             severity: 'warning',

@@ -231,7 +231,7 @@
           },
           annotations: {
             summary: 'Clock skew detected.',
-            description: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.',
+            description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.',
           },
         },
         {

@@ -247,7 +247,7 @@
           },
           annotations: {
             summary: 'Clock not synchronising.',
-            description: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.',
+            description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.',
           },
         },
         {

@@ -260,8 +260,8 @@
             severity: 'critical',
           },
           annotations: {
-            summary: 'RAID Array is degraded',
-            description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
+            summary: 'RAID Array is degraded.',
+            description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
           },
         },
         {

@@ -273,8 +273,8 @@
             severity: 'warning',
           },
           annotations: {
-            summary: 'Failed device in RAID array',
-            description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
+            summary: 'Failed device in RAID array.',
+            description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
           },
         },
         {

@@ -309,6 +309,104 @@
             description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.',
           },
         },
+        {
+          alert: 'NodeCPUHighUsage',

Member: High CPU usage is not a problem and can just be an indicator of properly utilizing your machine, so I'd remove these.

Contributor (Author): Perhaps, as long as we can alert on high system load (saturation).

Member: Yeah, CPU usage is good. :) I mean, this would be a case for the "info" level alerts that I like to promote, but I don't think we have them here in the mixin. (Info level alerts notify nobody, but you could look at the alerts page while troubleshooting. They point to things that are not problems per se and might be OK, but which you might be interested in while there is an actual incident happening.)

Member: Yeah, I'd be fine with an 'info' level severity. No reason not to introduce that now that we're on this.

Contributor (Author): I'll make it an info according to this guideline:

Reviewer: Perhaps a warning if the usage stays above 98% for 1h would be viable? That would be a case where the host is at capacity and scheduling more tasks there would result in performance degradation. It is a risk folks can accept, but something that should be considered as part of the capacity plan.

+          expr: |||
+            sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d
+          ||| % $._config,
+          'for': '15m',
+          labels: {
+            severity: 'info',
+          },
+          annotations: {
+            summary: 'High CPU usage.',
+            description: |||
+              CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
+            ||| % $._config,
+          },
+        },
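
Note the doubled percent signs in the new description blocks: because each string is passed through Jsonnet's `%` formatting operator (`||| ... ||| % $._config`), every literal `%`, including the one inside the Go template `printf "%.2f"`, has to be escaped as `%%`. A small runnable sketch of this behavior (the threshold value is an assumption, not a default from this PR):

```jsonnet
// escaping.jsonnet -- render with: jsonnet escaping.jsonnet
local config = { cpuHighUsageThreshold: 90 };  // assumed value for illustration

// Evaluates to: above 90% for 15m, currently {{ printf "%.2f" $value }}%
'above %(cpuHighUsageThreshold)d%% for 15m, currently {{ printf "%%.2f" $value }}%%' % config
```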
| { | ||
| alert: 'NodeSystemSaturation', | ||
|

Member: Not sure about this, this might make sense, but I'm also leaning towards not doing it. @SuperQ wdyt?

Contributor (Author): I think this is really helpful to detect system performance degradation: https://www.brendangregg.com/blog/2017-08-08/linux-load-averages.html

Member: The question is whether there are, in pragmatic reality, scenarios where a high load is nothing to warn about. And the article explains how other, non-CPU-related saturation will also increase this. (But thanks for the link, super interesting to see the actual patch that introduced this confusion :))

Contributor (Author): This alert triggers on load per core. There are always exceptions; in such situations a silence could also help.

Contributor (Author): That's why this alert is called NodeSystemSaturation, not CPUSaturation, btw :)

Member: I totally remember scenarios where the load metric was high in legitimate use cases without being a problem, but that's long ago, and a lot has changed in the kernel since then. I've heard opinions that load average is now somewhat useful as a metric, others state the opposite, and I don't feel qualified to make the call.

Member: Maybe also info severity? Alerting/paging on this kinda goes against alerting on actual impact (as opposed to alerting on response times of services running on the overloaded node).

Member: If the load metric here actually tells us about actual saturation, then I would say "warning" is fine. It is an actionable alert then, at least for the many scenarios where you don't want to run your systems over-saturated all the time. It's just not urgent enough to wake someone up. IMHO "info" level alerts are for conditions that are completely fine on their own but could be hints towards a possible cause while an incident is happening.

Member: Yeah, let's convert it to warning.

Contributor (Author): OK, decreased to warning, agree.

+          expr: |||
+            node_load1{%(nodeExporterSelector)s}
+            / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d
+          ||| % $._config,
+          'for': '15m',
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            summary: 'System saturated, load per core is very high.',
+            description: |||
+              System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
+              This might indicate this instance resources saturation and can cause it becoming unresponsive.
+            ||| % $._config,
+          },
+        },
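
To make the arithmetic concrete: the expression divides `node_load1` by the number of CPUs (counted from the per-core `mode="idle"` series), so the threshold is expressed as load per core, which lets one number work for both small and large machines. A hypothetical worked example (all values assumed for illustration, not mixin defaults):

```jsonnet
// load-per-core.jsonnet -- hypothetical values, not mixin defaults.
local cores = 16;                            // count of node_cpu_seconds_total{mode="idle"} series
local systemSaturationPerCoreThreshold = 2;  // assumed threshold
local node_load1 = 40;                       // sampled 1-minute load average

// Mirrors the PromQL: node_load1 / cores > threshold
// 40 / 16 = 2.5 > 2, so the alert would fire once this held for 15m.
{ loadPerCore: node_load1 / cores, firing: node_load1 / cores > systemSaturationPerCoreThreshold }
```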
| { | ||
| alert: 'NodeMemoryMajorPagesFaults', | ||
| expr: ||| | ||
| rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d | ||
| ||| % $._config, | ||
| 'for': '15m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| annotations: { | ||
| summary: 'Memory major page faults are occurring at very high rate.', | ||
| description: ||| | ||
| Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. | ||
| Please check that there is enough memory available at this instance. | ||
| ||| % $._config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'NodeMemoryHighUtilization', | ||
| expr: ||| | ||
| 100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d | ||
| ||| % $._config, | ||
| 'for': '15m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| annotations: { | ||
| summary: 'Host is running out of memory.', | ||
| description: ||| | ||
| Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. | ||
| ||| % $._config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'NodeDiskIOSaturation', | ||
| expr: ||| | ||
| rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d | ||
| ||| % $._config, | ||
| 'for': '30m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| annotations: { | ||
| summary: 'Disk IO queue is high.', | ||
| description: ||| | ||
| Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. | ||
| This symptom might indicate disk saturation. | ||
| ||| % $._config, | ||
| }, | ||
| }, | ||
| { | ||
| alert: 'NodeSystemdServiceFailed', | ||
| expr: ||| | ||
| node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1 | ||
| ||| % $._config, | ||
| 'for': '5m', | ||
| labels: { | ||
| severity: 'warning', | ||
| }, | ||
| annotations: { | ||
| summary: 'Systemd service has entered failed state.', | ||
| description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', | ||
| }, | ||
| }, | ||
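
All of the new alerts are parameterized through `$._config`, so consumers can tune the thresholds without patching this file. A sketch of a consumer-side override, assuming the standard monitoring-mixins layout with a `prometheusAlerts` entry point; the import path and every numeric value below are assumptions for illustration, not defaults taken from this PR:

```jsonnet
// thresholds.jsonnet -- illustrative overrides; all values are assumptions.
local nodeMixin = (import 'node-mixin/mixin.libsonnet') + {
  _config+:: {
    cpuHighUsageThreshold: 90,             // NodeCPUHighUsage: percent busy
    systemSaturationPerCoreThreshold: 2,   // NodeSystemSaturation: load per core
    memoryMajorPagesFaultsThreshold: 500,  // NodeMemoryMajorPagesFaults: faults/s
    memoryHighUtilizationThreshold: 90,    // NodeMemoryHighUtilization: percent used
    diskIOSaturationThreshold: 10,         // NodeDiskIOSaturation: weighted IO time rate
  },
};

// Render, e.g.: jsonnet -J vendor thresholds.jsonnet
nodeMixin.prometheusAlerts
```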
| ], | ||
| }, | ||
| ], | ||