Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update mixin for linux-integration (#18)
* Add mountpoint to NodeFilesystem alerts
This helps to identify alerting filesystem.

* Decrease NodeFilesystem pending time to 15m
30m is too long and there is a risk of running out of disk space/inodes completely if something is filling up disk very fast (like log file).

* Add CPU and memory alerts
* Add failed systemd service alert
* Decrease NodeNetwork*Errs pending period
* Set 'at' everywhere as preposition for instance
* Add NodeDiskIOSaturation alert
* Add %(nodeExporterSelector)s to Network and conntrack alerts
* Add diskDevice selector
* Fix NodeMemoryHighUtilization alert
* Add NodeSystemSaturation and NodeMemoryMajorPagesFaults
* Decrease NodeSystemdServiceFailed severity to warning
* Extend alert description
* Add comma after 'mounted on'
* Add thresholds for memory alerts
* Add thresholds for memory, disk and system alerts
* Set severity to NodeCPUHighUsage to info
* Convert graph panels to timeseries panel
...With default style (opacity, tooltip etc).
Also:
Change 'logical core' line style to dotted
Update Disk I/O time metric to dots
* Move dashboard parameters to config
* Add overview row
* Add Cpu Usage stat panel
* Add network dash
- Add interfaces overview panel
- Add oper status timeline
- Add common lib with reused elements (templates, queries)
- Add common panels with shared style to be used across this mixin
* Remove external panels lib
* Add fleet dashboard
* Update fleet dash
* Add CPU and memory to fleet
* Add common cpu/memory/disk/network panels on fleet
* add network errors panel as points
* Fix alerts column in fleet table
* Add support for multiple group and instance labels
* Add sockstat to network dashboard
* Add netstat to network dashboard
* Change span to gridPos. Make overview row smaller.
- gridPos supports tiny panels height.
* add reboot annotation
* Add system dashboard
* add filesystem row
* Add disk and fs dashboard
* Add memory dashboard
* Add memory generic counters to memory dashboard
* Update common lib
* Update OOM killer panel
* Add common annotations: kernelChange, OOMkill
* Add mountpoint to NodeFilesystem alerts
- This helps to identify alerting filesystem.
* Add CPU and memory alerts
* Add failed systemd service alert
* Decrease NodeNetwork*Errs pending period
* Set 'at' everywhere as preposition for instance
* Add NodeDiskIOSaturation alert
* Add %(nodeExporterSelector)s to Network and conntrack alerts
* Add diskDevice selector
* Fix NodeMemoryHighUtilization alert
* Add NodeSystemSaturation and NodeMemoryMajorPagesFaults
* Decrease NodeSystemdServiceFailed severity to warning
* Remove unused import
* Add ability to set custom dashboardUID
* Add mountpoint to NodeFilesystem alerts
* Add failed systemd service alert
* Remove systemd panel
- systemd collector is disabled by default
* Add some lint exclusions.
- Add UIDs to all dashboards.
- Add units and descriptions to all panels which were missing them.
- Modify alerts descriptions and summaries as needed for linting.
* Add multi-cluster dashboard lint exclusions
* Extend alert description
* Add thresholds for memory, disk and system alerts
* Set severity to NodeCPUHighUsage to info
* Fix broken diskSpaceUsage link
* Fix cpuIdle panel units
* Change cpuUsage to use $__rate_interval
* Fix cpu usage (replace with nodeQuerySelector)
* Fix units (seconds->s)
* Fix iops units
* Add %(nodeQuerySelector)s to alerts queries
* Add support for multi in job
* Fix Pagesout metric
* Add total and available memory metrics
* Update context switches description
* Add network descriptions
* Change pipe to | from / in AxisLabel
* Update network descriptions
* Add timezone metric

---------

Signed-off-by: Vitaly Zhuravlev <[email protected]>
Signed-off-by: Ryan J. Geyer <[email protected]>
  • Loading branch information
v-zhuravlev committed Jul 15, 2023
commit 2059552b2d6badf37aa5cd128edb11e6c8472b46
16 changes: 8 additions & 8 deletions docs/node-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
||| % $._config,
'for': '30m',
'for': '15m',
labels: {
severity: 'warning',
},
Expand All @@ -71,7 +71,7 @@
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
||| % $._config,
'for': '30m',
'for': '15m',
labels: {
severity: '%(nodeCriticalSeverity)s' % $._config,
},
Expand Down Expand Up @@ -129,7 +129,7 @@
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
||| % $._config,
'for': '1h',
'for': '15m',
labels: {
severity: 'warning',
},
Expand All @@ -147,7 +147,7 @@
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
||| % $._config,
'for': '1h',
'for': '15m',
labels: {
severity: '%(nodeCriticalSeverity)s' % $._config,
},
Expand All @@ -161,7 +161,7 @@
expr: |||
rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
||| % $._config,
'for': '1h',
'for': '15m',
labels: {
severity: 'warning',
},
Expand All @@ -175,7 +175,7 @@
expr: |||
rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
||| % $._config,
'for': '1h',
'for': '15m',
labels: {
severity: 'warning',
},
Expand Down Expand Up @@ -261,7 +261,7 @@
},
annotations: {
summary: 'RAID Array is degraded.',
description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
},
},
{
Expand All @@ -274,7 +274,7 @@
},
annotations: {
summary: 'Failed device in RAID array.',
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
},
},
{
Expand Down
27 changes: 27 additions & 0 deletions docs/node-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,36 @@
rateInterval: '5m',
// Opt-in for multi-cluster support.
showMultiCluster: false,

clusterLabel: 'cluster',

// groupLabels is a string with comma-separated
// labels that are common labels of instances belonging to the
// same logical group. Include not only enough labels to
// identify cluster members, but also all common labels you want
// to keep for resulting cluster-level alerts.
groupLabels: 'job',
// Comma-separated list of labels identifying a single instance:
instanceLabels: 'instance',

dashboardNamePrefix: 'Node Exporter / ',
dashboardTags: ['node-exporter-mixin'],
dashboardRefresh: '30s',
dashboardTimezone: 'utc',
dashboardInterval: 'now-2h',

// Grafana dashboard IDs are necessary for stable links for dashboards
grafanaDashboardIDs: {
'node-rsrc-use.json': 'node-rsrc-use',
'node-cluster-rsrc-use.json': 'node-cluster-rsrc-use',
'node-multicluster-rsrc-use.json': 'node-multicluster-rsrc-use',
'nodes.json': 'nodes',
'nodes-darwin.json': 'nodes-darwin',
'nodes-system.json': 'node-system',
'nodes-memory.json': 'node-memory',
'nodes-network.json': 'node-network',
'nodes-disk.json': 'node-disk',
'nodes-fleet.json': 'node-fleet',
},
},
}
165 changes: 165 additions & 0 deletions docs/node-mixin/dashboards/disk.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local nodePanels = import '../lib/panels/panels.libsonnet';
local commonPanels = import '../lib/panels/common/panels.libsonnet';
local nodeTimeseries = nodePanels.timeseries;
local common = import '../lib/common.libsonnet';

{

  // Builds the "Node Filesystem and Disk" dashboard.
  //
  // Background on the filesystem metrics:
  // https://www.robustperception.io/filesystem-metrics-from-the-node-exporter/
  //
  // Parameters:
  //   config   - mixin configuration object. The fields read here are
  //              dashboardNamePrefix, dashboardInterval, dashboardTags,
  //              dashboardTimezone, dashboardRefresh and
  //              grafanaDashboardIDs['nodes-disk.json']. It is also forwarded
  //              to common.new().
  //   platform - 'Linux' or 'Darwin'. It is also forwarded to common.new().
  //
  // Returns an object with a single visible field, `dashboard`:
  //   - platform == 'Linux'  -> the assembled Grafana dashboard
  //   - platform == 'Darwin' -> an empty object
  //   - anything else        -> null
  new(config=null, platform=null):: {
    // Shared helpers (query builder, templates, queries, links, annotations)
    // come from the mixin's common library.
    local c = common.new(config=config, platform=platform),
    local commonPromTarget = c.commonPromTarget,
    local templates = c.templates,
    local q = c.queries,

    // ---- Filesystem panels -------------------------------------------------

    local fsAvailable =
      nodeTimeseries.new(
        'Filesystem Space Available',
        // The description previously said "utilisation", which contradicted
        // the panel title and the plotted "avail" query.
        description=|||
          Filesystem space available in bytes, by mountpoint.
        |||
      )
      .withUnits('decbytes')
      .withFillOpacity(5)
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_avail_bytes,
        legendFormat='{{ mountpoint }}',
      )),

    local fsInodes =
      nodeTimeseries.new(
        'Free inodes',
        description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.',
      )
      .withUnits('short')
      // The free and total series share this panel, so each legend carries a
      // suffix. With the bare mountpoint legend they were indistinguishable.
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_files_free,
        legendFormat='{{ mountpoint }} free'
      ))
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_files,
        legendFormat='{{ mountpoint }} total'
      )),

    local fsInodesTotal =
      nodeTimeseries.new(
        'Total inodes',
        description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.',
      )
      .withUnits('short')
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_files,
        legendFormat='{{ mountpoint }}'
      )),

    local fsErrorsandRO =
      nodeTimeseries.new('Filesystems with errors / read-only')
      .withMax(1)
      // Two different conditions are plotted per mountpoint, so the legend
      // names the condition to keep the series distinguishable.
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_readonly,
        legendFormat='{{ mountpoint }} read-only'
      ))
      .addTarget(commonPromTarget(
        expr=q.node_filesystem_device_error,
        legendFormat='{{ mountpoint }} device error'
      )),

    local fileDescriptors =
      nodeTimeseries.new(
        'File Descriptors',
        description=|||
          File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe.
          The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them.
        |||
      )
      .addTarget(commonPromTarget(
        expr=q.process_max_fds,
        legendFormat='Maximum open file descriptors',
      ))
      .addTarget(commonPromTarget(
        expr=q.process_open_fds,
        legendFormat='Open file descriptors',
      )),

    // ---- Disk panels -------------------------------------------------------

    local diskIOcompleted =
      nodeTimeseries.new(
        title='Disk IOps completed',
        description='The number (after merges) of I/O requests completed per second for the device'
      )
      .withUnits('iops')
      // NOTE(review): presumably draws series whose legend matches the regex
      // below the axis, as the axis label suggests. Confirm in the panels lib.
      .withNegativeYByRegex('reads')
      .withAxisLabel('read(-) | write(+)')
      .addTarget(commonPromTarget(
        expr=q.node_disk_reads_completed_total,
        legendFormat='{{device}} reads completed',
      ))
      .addTarget(commonPromTarget(
        expr=q.node_disk_writes_completed_total,
        legendFormat='{{device}} writes completed',
      )),

    local diskAvgWaitTime =
      nodeTimeseries.new(
        title='Disk Average Wait Time',
        description='The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.'
      )
      .withUnits('s')
      .withNegativeYByRegex('read')
      .withAxisLabel('read(-) | write(+)')
      .addTarget(commonPromTarget(
        expr=q.diskWaitReadTime,
        legendFormat='{{device}} read wait time avg',
      ))
      .addTarget(commonPromTarget(
        expr=q.diskWaitWriteTime,
        legendFormat='{{device}} write wait time avg',
      )),

    local diskAvgQueueSize =
      nodeTimeseries.new(
        title='Average Queue Size (aqu-sz)',
        description='The average queue length of the requests that were issued to the device.'
      )
      .addTarget(commonPromTarget(
        expr=q.diskAvgQueueSize,
        legendFormat='{{device}}',
      )),

    // ---- Layout ------------------------------------------------------------

    // Panels are half-width (w: 12) and listed in display order. Every panel
    // in a section carries the same y as its row header.
    // NOTE(review): this presumably relies on Grafana's grid wrapping panels
    // that share a y into successive lines. Confirm.
    local panelsGrid =
      [
        { type: 'row', title: 'Filesystem', gridPos: { y: 0 } },
        fsAvailable { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
        c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
        fsInodes { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
        fsInodesTotal { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
        fsErrorsandRO { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
        fileDescriptors { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
        { type: 'row', title: 'Disk', gridPos: { y: 25 } },
        c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 25 } },
        diskIOcompleted { gridPos: { x: 12, w: 12, h: 8, y: 25 } },
        diskAvgWaitTime { gridPos: { x: 0, w: 12, h: 8, y: 25 } },
        diskAvgQueueSize { gridPos: { x: 12, w: 12, h: 8, y: 25 } },
      ],

    // ---- Dashboard ---------------------------------------------------------

    dashboard:
      if platform == 'Linux' then
        dashboard.new(
          // The prefix is read from config extended with the node query
          // selector. The parentheses only make the original precedence
          // explicit: extend the object, then index, then format.
          '%sNode Filesystem and Disk' % (config { nodeQuerySelector: c.nodeQuerySelector }).dashboardNamePrefix,
          time_from=config.dashboardInterval,
          tags=config.dashboardTags,
          timezone=config.dashboardTimezone,
          refresh=config.dashboardRefresh,
          graphTooltip='shared_crosshair',
          uid=config.grafanaDashboardIDs['nodes-disk.json']
        )
        .addLink(c.links.fleetDash)
        .addLink(c.links.nodeDash)
        .addLink(c.links.otherDashes)
        .addAnnotations(c.annotations)
        .addTemplates(templates)
        .addPanels(panelsGrid)
      else if platform == 'Darwin' then
        {}
      else
        // A Jsonnet `if` without `else` evaluates to null. The branch is
        // spelled out so the result for other platforms is not a surprise.
        null,
  },
}
Loading