Update mixin for linux-integration (#18)

* Add mountpoint to NodeFilesystem alerts This helps to identify alerting filesystem. * Decrease NodeFilesystem pending time to 15m 30m is too long and there is a risk of running out of disk space/inodes completely if something is filling up disk very fast (like log file). * Add CPU and memory alerts * Add failed systemd service alert * Decrease NodeNetwork*Errs pending period * Set 'at' everywhere as preposition for instance * Add NodeDiskIOSaturation alert * Add %(nodeExporterSelector)s to Network and conntrack alerts * Add diskDevice selector * Fix NodeMemoryHighUtilization alert * Add NodeSystemSaturation and NodeMemoryMajorPagesFaults * Decrease NodeSystemdServiceFailed severity to warning * Extend alert description * Add comma after 'mounted on' * Add thresholds for memory alerts * Add thresholds for memory, disk and system alerts * Set severity to NodeCPUHighUsage to info * Convert graph panels to timeseries panel ...With default style (opacity, tooltip etc). Also: Change 'logical core' line style to dotted Update Disk I/O time metric to dots * Move dashboard parameters to config * Add overview row * Add Cpu Usage stat panel * Add network dash - Add interfaces overview panel - Add oper status timeline - Add common lib with reused elements (templates, queries) - Add common panels with shared style to be used across this mixin * Remove external panels lib * Add fleet dashboard * Update fleet dash * Add CPU and memory to fleet * Add common cpu/memory/disk/network panels on fleet * add network errors panel as points * Fix alerts column in fleet table * Add support for multiple group and instance labels * Add sockstat to network dashboard * Add netstat to network dashboard * Change span to gridPos. Make overview row smaller. - gridPos supports tiny panels height. 
* add reboot annotation * Add system dashboard * add filesystem row * Add disk and fs dashboard * Add memory dashboard * Add memory generic counters to memory dashboard * Update common lib * Update OOM killer panel * Add common annotations: kernelChange, OOMkill * Add mountpoint to NodeFilesystem alerts - This helps to identify alerting filesystem. * Add CPU and memory alerts * Add failed systemd service alert * Decrease NodeNetwork*Errs pending period * Set 'at' everywhere as preposition for instance * Add NodeDiskIOSaturation alert * Add %(nodeExporterSelector)s to Network and conntrack alerts * Add diskDevice selector * Fix NodeMemoryHighUtilization alert * Add NodeSystemSaturation and NodeMemoryMajorPagesFaults * Decrease NodeSystemdServiceFailed severity to warning * Remove unused import * Add ability to set custom dashboardUID * Add mountpoint to NodeFilesystem alerts * Add failed systemd service alert * Remove systemd panel - systemd collector is disabled by default * Add some lint exclusions. - Add UIDs to all dashboards. - Add units and descriptions to all panels which were missing them. - Modify alerts descriptions and summaries as needed for linting. * Add multi-cluster dashboard lint exclusions * Extend alert description * Add thresholds for memory, disk and system alerts * Set severity to NodeCPUHighUsage to info * Fix broken diskSpaceUsage link * Fix cpuIdle panel units * Change cpuUsage to use $__rate_interval * Fix cpu usage (replace with nodeQuerySelector) * Fix units (seconds->s) * Fix iops units * Add %(nodeQuerySelector)s to alerts queries * Add support for multi in job * Fix Pagesout metric * Add total and available memory metrics * Update context switches description * Add network descriptions * Change pipe to | from / in AxisLabel * Update network descriptions * Add timezone metric --------- Signed-off-by: Vitaly Zhuravlev <[email protected]> Signed-off-by: Ryan J. Geyer <[email protected]>
prometheus · v-zhuravlev · Oct 27, 2022 · Oct 27, 2022 · Apr 14, 2023 · Apr 14, 2023
commit 2059552b2d6badf37aa5cd128edb11e6c8472b46
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
@@ -53,7 +53,7 @@
                 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
               )
             ||| % $._config,
-            'for': '30m',
+            'for': '15m',
             labels: {
               severity: 'warning',
             },
@@ -71,7 +71,7 @@
                 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
               )
             ||| % $._config,
-            'for': '30m',
+            'for': '15m',
             labels: {
               severity: '%(nodeCriticalSeverity)s' % $._config,
             },
@@ -129,7 +129,7 @@
                 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
               )
             ||| % $._config,
-            'for': '1h',
+            'for': '15m',
             labels: {
               severity: 'warning',
             },
@@ -147,7 +147,7 @@
                 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
               )
             ||| % $._config,
-            'for': '1h',
+            'for': '15m',
             labels: {
               severity: '%(nodeCriticalSeverity)s' % $._config,
             },
@@ -161,7 +161,7 @@
             expr: |||
               rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
             ||| % $._config,
-            'for': '1h',
+            'for': '15m',
             labels: {
               severity: 'warning',
             },
@@ -175,7 +175,7 @@
             expr: |||
               rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
             ||| % $._config,
-            'for': '1h',
+            'for': '15m',
             labels: {
               severity: 'warning',
             },
@@ -261,7 +261,7 @@
             },
             annotations: {
               summary: 'RAID Array is degraded.',
-              description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
+              description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
             },
           },
           {
@@ -274,7 +274,7 @@
             },
             annotations: {
               summary: 'Failed device in RAID array.',
-              description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
+              description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
             },
           },
           {

diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
@@ -82,9 +82,36 @@
     rateInterval: '5m',
     // Opt-in for multi-cluster support.
     showMultiCluster: false,
+
     clusterLabel: 'cluster',
 
+    // groupLabels is a string with comma-separated
+    // labels that are common labels of instances belonging to the
+    // same logical group. Include not only enough labels to
+    // identify cluster members, but also all common labels you want
+    // to keep for resulting cluster-level alerts.
+    groupLabels: 'job',
+    // instanceLabels is a string with comma-separated labels identifying a single instance:
+    instanceLabels: 'instance',
+
     dashboardNamePrefix: 'Node Exporter / ',
     dashboardTags: ['node-exporter-mixin'],
+    dashboardRefresh: '30s',
+    dashboardTimezone: 'utc',
+    dashboardInterval: 'now-2h',
+
+    // Grafana dashboard IDs are necessary for stable links for dashboards
+    grafanaDashboardIDs: {
+      'node-rsrc-use.json': 'node-rsrc-use',
+      'node-cluster-rsrc-use.json': 'node-cluster-rsrc-use',
+      'node-multicluster-rsrc-use.json': 'node-multicluster-rsrc-use',
+      'nodes.json': 'nodes',
+      'nodes-darwin.json': 'nodes-darwin',
+      'nodes-system.json': 'node-system',
+      'nodes-memory.json': 'node-memory',
+      'nodes-network.json': 'node-network',
+      'nodes-disk.json': 'node-disk',
+      'nodes-fleet.json': 'node-fleet',
+    },
   },
 }
diff --git a/docs/node-mixin/dashboards/disk.libsonnet b/docs/node-mixin/dashboards/disk.libsonnet
@@ -0,0 +1,192 @@
+local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+local graphPanel = grafana.graphPanel;
+local nodePanels = import '../lib/panels/panels.libsonnet';
+local commonPanels = import '../lib/panels/common/panels.libsonnet';
+local nodeTimeseries = nodePanels.timeseries;
+local common = import '../lib/common.libsonnet';
+
+{
+
+  // Builds the "Node Filesystem and Disk" Grafana dashboard.
+  //
+  // config:   mixin config object. It is passed on to common.new(), and the
+  //           dashboard itself reads dashboardNamePrefix, dashboardInterval,
+  //           dashboardTags, dashboardTimezone, dashboardRefresh and
+  //           grafanaDashboardIDs from it.
+  // platform: 'Linux' produces the full dashboard and 'Darwin' an empty object;
+  //           for any other value (including the default null) the `dashboard`
+  //           field evaluates to null because the conditional has no final else.
+  //
+  // Background on the filesystem metrics:
+  // https://www.robustperception.io/filesystem-metrics-from-the-node-exporter/
+  new(config=null, platform=null):: {
+    local c = common.new(config=config, platform=platform),
+    local commonPromTarget = c.commonPromTarget,
+    local templates = c.templates,
+    local q = c.queries,
+
+    // Filesystem panels.
+    local fsAvailable =
+      nodeTimeseries.new(
+        'Filesystem Space Available',
+        description=|||
+          Filesystem space available in bytes, by mountpoint.
+        |||
+      )
+      .withUnits('decbytes')
+      .withFillOpacity(5)
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_avail_bytes,
+        legendFormat='{{ mountpoint }}',
+      )),
+
+    // Free and total inode counts share this panel, so each legend carries a
+    // suffix; otherwise both series of a mountpoint would get the same name.
+    local fsInodes =
+      nodeTimeseries.new(
+        'Free inodes',
+        description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.',
+      )
+      .withUnits('short')
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_files_free,
+        legendFormat='{{ mountpoint }} free'
+      ))
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_files,
+        legendFormat='{{ mountpoint }} total'
+      )),
+
+    local fsInodesTotal =
+      nodeTimeseries.new(
+        'Total inodes',
+        description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.',
+      )
+      .withUnits('short')
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_files,
+        legendFormat='{{ mountpoint }}'
+      )),
+
+    // Read-only state and device errors are plotted together; the legend suffix
+    // tells which of the two a series reports.
+    local fsErrorsandRO =
+      nodeTimeseries.new('Filesystems with errors / read-only')
+      .withMax(1)
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_readonly,
+        legendFormat='{{ mountpoint }} read-only'
+      ))
+      .addTarget(commonPromTarget(
+        expr=q.node_filesystem_device_error,
+        legendFormat='{{ mountpoint }} device error'
+      )),
+
+    local fileDescriptors =
+      nodeTimeseries.new(
+        'File Descriptors',
+        description=|||
+          File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe.
+          The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them.
+        |||
+      )
+      .addTarget(commonPromTarget(
+        expr=q.process_max_fds,
+        legendFormat='Maximum open file descriptors',
+      ))
+      .addTarget(commonPromTarget(
+        expr=q.process_open_fds,
+        legendFormat='Open file descriptors',
+      )),
+
+    // Disk panels.
+    // NOTE(review): withNegativeYByRegex presumably draws series whose legend
+    // matches the regex below the axis, in line with the 'read(-) | write(+)'
+    // axis label - confirm against the panels lib.
+    local diskIOcompleted =
+      nodeTimeseries.new(
+        title='Disk IOps completed',
+        description='The number (after merges) of I/O requests completed per second for the device'
+      )
+      .withUnits('iops')
+      .withNegativeYByRegex('reads')
+      .withAxisLabel('read(-) | write(+)')
+      .addTarget(commonPromTarget(
+        expr=q.node_disk_reads_completed_total,
+        legendFormat='{{device}} reads completed',
+      ))
+      .addTarget(commonPromTarget(
+        expr=q.node_disk_writes_completed_total,
+        legendFormat='{{device}} writes completed',
+      )),
+
+    local diskAvgWaitTime =
+      nodeTimeseries.new(
+        title='Disk Average Wait Time',
+        description='The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.'
+      )
+      .withUnits('s')
+      .withNegativeYByRegex('read')
+      .withAxisLabel('read(-) | write(+)')
+      .addTarget(commonPromTarget(
+        expr=q.diskWaitReadTime,
+        legendFormat='{{device}} read wait time avg',
+      ))
+      .addTarget(commonPromTarget(
+        expr=q.diskWaitWriteTime,
+        legendFormat='{{device}} write wait time avg',
+      )),
+
+    local diskAvgQueueSize =
+      nodeTimeseries.new(
+        title='Average Queue Size (aqu-sz)',
+        description='The average queue length of the requests that were issued to the device.'
+      )
+      .addTarget(commonPromTarget(
+        expr=q.diskAvgQueueSize,
+        legendFormat='{{device}}',
+      )),
+
+    // Panels are listed in display order, two per line (x=0 and x=12, each 12
+    // wide). Every panel of a row reuses the y of its row header.
+    // NOTE(review): this presumably relies on Grafana stacking panels that
+    // share a y value - confirm.
+    local panelsGrid =
+      [
+        { type: 'row', title: 'Filesystem', gridPos: { y: 0 } },
+        fsAvailable { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
+        c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
+        fsInodes { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
+        fsInodesTotal { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
+        fsErrorsandRO { gridPos: { x: 0, w: 12, h: 8, y: 0 } },
+        fileDescriptors { gridPos: { x: 12, w: 12, h: 8, y: 0 } },
+        { type: 'row', title: 'Disk', gridPos: { y: 25 } },
+        c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 25 } },
+        diskIOcompleted { gridPos: { x: 12, w: 12, h: 8, y: 25 } },
+        diskAvgWaitTime { gridPos: { x: 0, w: 12, h: 8, y: 25 } },
+        diskAvgQueueSize { gridPos: { x: 12, w: 12, h: 8, y: 25 } },
+      ],
+
+    dashboard: if platform == 'Linux' then
+      dashboard.new(
+        '%sNode Filesystem and Disk' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix,
+        time_from=config.dashboardInterval,
+        tags=(config.dashboardTags),
+        timezone=config.dashboardTimezone,
+        refresh=config.dashboardRefresh,
+        graphTooltip='shared_crosshair',
+        uid=config.grafanaDashboardIDs['nodes-disk.json']
+      )
+      .addLink(c.links.fleetDash)
+      .addLink(c.links.nodeDash)
+      .addLink(c.links.otherDashes)
+      .addAnnotations(c.annotations)
+      .addTemplates(templates)
+      .addPanels(panelsGrid)
+    else if platform == 'Darwin' then {},
+  },
+}