Merged (changes from 1 commit)

Refactor config
v-zhuravlev committed Nov 13, 2023
commit 784cf59803acc21cd458fc3a5c90151f9e690c82
34 changes: 18 additions & 16 deletions docs/node-observ-lib/README.md
Member:
Minor nitpicks throughout: I don't mind shortening observability to observ for the folder name, but it feels pretty awkward everywhere else

Author:
Good one, updated readme

@@ -20,14 +20,15 @@ You can use observ-lib to fill in monitoring-mixin structure:
 local nodelib = import 'node-observ-lib/main.libsonnet';

 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
Member:
Maybe we should add a little explanation/example with separate configuration?

Author:
added

+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + nodelib.withConfigMixin(
     {
       // enable loki logs
@@ -51,14 +52,15 @@ local g = import './g.libsonnet';
 local nodelib = import 'node-observ-lib/main.libsonnet';

 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  )
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  })
   + {
     grafana+: {
       panels+: {
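Following the review thread above, here is a minimal sketch of the "separate configuration" pattern: keep the overrides in their own file and mix them in at the entrypoint. The file name overrides.libsonnet is hypothetical, not part of this PR; the keys used are the ones documented in config.libsonnet below.

    // overrides.libsonnet (hypothetical file name): a plain config object,
    // with no library code in it.
    {
      filteringSelector: 'job=~".*node.*"',
      enableLokiLogs: true,
    }

    // entrypoint: layer the separate config over the library defaults.
    local nodelib = import 'node-observ-lib/main.libsonnet';
    local overrides = import './overrides.libsonnet';

    local linux = nodelib.new() + nodelib.withConfigMixin(overrides);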
104 changes: 104 additions & 0 deletions docs/node-observ-lib/config.libsonnet
@@ -0,0 +1,104 @@
{

  // any modular observability library should include as inputs:
  // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups
  // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules.
  // 'groupLabels' - one or more labels that can be used to identify a 'group' of instances. In simple cases, can be 'job' or 'cluster'.
  // 'instanceLabels' - one or more labels that can be used to identify a single entity of instances. In simple cases, can be 'instance' or 'pod'.
  // 'uid' - UID to prefix all dashboards' original uids

  filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'),
  groupLabels: ['job'],
  instanceLabels: ['instance'],
  dashboardNamePrefix: 'Node exporter / ',
  uid: 'node',

  dashboardTags: [self.uid],

  // Select the fstype for filesystem-related queries. If left
  // empty, all filesystems are selected. If you have unusual
  // filesystems you don't want to include in dashboards and
  // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
  fsSelector: 'fstype!=""',

  // Select the mountpoint for filesystem-related queries. If left
  // empty, all mountpoints are selected. For example, if you have a
  // special-purpose tmpfs instance that has a fixed size and will
  // always be 100% full, but you still want alerts and dashboards for
  // other tmpfs instances, you can exclude those by mountpoint prefix
  // like so: 'mountpoint!~"/var/lib/foo.*"'.
  fsMountpointSelector: 'mountpoint!=""',

  // Select the device for disk-related queries. If left empty, all
  // devices are selected. If you have unusual devices you don't
  // want to include in dashboards and alerting, you can exclude
  // them here, e.g. 'device!="tmpfs"'.
  diskDeviceSelector: 'device!=""',

  // Some of the alerts are meant to fire if a critical failure of a
  // node is imminent (e.g. the disk is about to run full). In a
  // true “cloud native” setup, failures of a single node should be
  // tolerated. Hence, even imminent failure of a single node is no
  // reason to create a paging alert. However, in practice there are
  // still many situations where operators like to get paged in time
  // before a node runs out of disk space. nodeCriticalSeverity can
  // be set to the desired severity for this kind of alert. This
  // can even be templated to depend on labels of the node, e.g. you
  // could make this critical for traditional database masters but
  // just a warning for K8s nodes.
  nodeCriticalSeverity: 'critical',

  // CPU utilization (%) on which to trigger the
  // 'NodeCPUHighUsage' alert.
  cpuHighUsageThreshold: 90,
  // Load average 1m (per core) on which to trigger the
  // 'NodeSystemSaturation' alert.
  systemSaturationPerCoreThreshold: 2,

  // Available disk space (%) thresholds on which to trigger the
  // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
  // usage grows in a way that it is predicted to run out in 4h or 1d
  // and if the provided thresholds have been reached right now.
  // In some cases you'll want to adjust these, e.g. by default Kubernetes
  // runs the image garbage collection when the disk usage reaches 85%
  // of its available space. In that case, you'll want to reduce the
  // critical threshold below to something like 14 or 15, otherwise
  // the alert could fire under normal node usage.
  fsSpaceFillingUpWarningThreshold: 40,
  fsSpaceFillingUpCriticalThreshold: 20,

  // Available disk space (%) thresholds on which to trigger the
  // 'NodeFilesystemAlmostOutOfSpace' alerts.
  fsSpaceAvailableWarningThreshold: 5,
  fsSpaceAvailableCriticalThreshold: 3,

  // Memory utilization (%) level on which to trigger the
  // 'NodeMemoryHighUtilization' alert.
  memoryHighUtilizationThreshold: 90,

  // Threshold for the rate of memory major page faults to trigger the
  // 'NodeMemoryMajorPagesFaults' alert.
  memoryMajorPagesFaultsThreshold: 500,

  // Disk IO queue level above which to trigger the
  // 'NodeDiskIOSaturation' alert.
  diskIOSaturationThreshold: 10,

  rateInterval: '5m',

  dashboardPeriod: 'now-1h',
  dashboardTimezone: 'default',
  dashboardRefresh: '1m',

  // logs lib related
  enableLokiLogs: false,
  extraLogLabels: ['transport', 'unit', 'level'],
  logsVolumeGroupBy: 'level',
  showLogsVolume: true,
  logsFilteringSelector: self.filteringSelector,
  logsExtraFilters:
    |||
      | label_format timestamp="{{__timestamp__}}"
      | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
    |||,
}
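To show how these defaults are meant to be consumed, a short sketch of overriding a few of them at instantiation time, following the guidance in the comments above (the values are illustrative, not recommendations):

    local nodelib = import 'node-observ-lib/main.libsonnet';

    local linux =
      nodelib.new()
      + nodelib.withConfigMixin({
        filteringSelector: 'job="node"',
        // Kubernetes image GC starts at 85% disk usage, so lower the
        // critical 'filling up' threshold as the comment above suggests.
        fsSpaceFillingUpCriticalThreshold: 15,
        // Tolerate single-node failures in a cloud-native fleet.
        nodeCriticalSeverity: 'warning',
      });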
127 changes: 3 additions & 124 deletions docs/node-observ-lib/main.libsonnet
@@ -1,5 +1,6 @@
 local alerts = import './alerts.libsonnet';
 local annotations = import './annotations.libsonnet';
+local config = import './config.libsonnet';
 local dashboards = import './dashboards.libsonnet';
 local datasources = import './datasources.libsonnet';
 local g = import './g.libsonnet';
@@ -15,132 +16,10 @@ local commonlib = import 'common-lib/common/main.libsonnet';
     config+: config,
   },

-  // any modular observability library should inlcude as inputs:
-  // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups
-  // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules.
-  // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'.
-  // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'.
-  // 'uid' - UID to prefix all dashboards original uids
-
-  new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=[uid],
-    uid,
-  ): {
+  new(): {

     local this = self,
-    config: {
-
-      groupLabels: groupLabels,
-      instanceLabels: instanceLabels,
-
-      dashboardTags: dashboardTags,
-      uid: uid,
-      dashboardNamePrefix: dashboardNamePrefix,
-
-      // optional
-
-      // Selectors are inserted between {} in Prometheus queries.
-      // Select the metrics coming from the node exporter. Note that all
-      // the selected metrics are shown stacked on top of each other in
-      // the 'USE Method / Cluster' dashboard. Consider disabling that
-      // dashboard if mixing up all those metrics in the same dashboard
-      // doesn't make sense (e.g. because they are coming from different
-      // clusters).
-      nodeExporterSelector: filteringSelector,
-      filteringSelector: self.nodeExporterSelector,
-
-      // Select the fstype for filesystem-related queries. If left
-      // empty, all filesystems are selected. If you have unusual
-      // filesystem you don't want to include in dashboards and
-      // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
-      fsSelector: 'fstype!=""',
-
-      // Select the mountpoint for filesystem-related queries. If left
-      // empty, all mountpoints are selected. For example if you have a
-      // special purpose tmpfs instance that has a fixed size and will
-      // always be 100% full, but you still want alerts and dashboards for
-      // other tmpfs instances, you can exclude those by mountpoint prefix
-      // like so: 'mountpoint!~"/var/lib/foo.*"'.
-      fsMountpointSelector: 'mountpoint!=""',
-
-      // Select the device for disk-related queries. If left empty, all
-      // devices are selected. If you have unusual devices you don't
-      // want to include in dashboards and alerting, you can exclude
-      // them here, e.g. 'device!="tmpfs"'.
-      diskDeviceSelector: 'device!=""',
-
-      // Some of the alerts are meant to fire if a criticadiskDeviceSelector failure of a
-      // node is imminent (e.g. the disk is about to run full). In a
-      // true “cloud native” setup, failures of a single node should be
-      // tolerated. Hence, even imminent failure of a single node is no
-      // reason to create a paging alert. However, in practice there are
-      // still many situations where operators like to get paged in time
-      // before a node runs out of disk space. nodeCriticalSeverity can
-      // be set to the desired severity for this kind of alerts. This
-      // can even be templated to depend on labels of the node, e.g. you
-      // could make this critical for traditional database masters but
-      // just a warning for K8s nodes.
-      nodeCriticalSeverity: 'critical',
-
-      // CPU utilization (%) on which to trigger the
-      // 'NodeCPUHighUsage' alert.
-      cpuHighUsageThreshold: 90,
-      // Load average 1m (per core) on which to trigger the
-      // 'NodeSystemSaturation' alert.
-      systemSaturationPerCoreThreshold: 2,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
-      // usage grows in a way that it is predicted to run out in 4h or 1d
-      // and if the provided thresholds have been reached right now.
-      // In some cases you'll want to adjust these, e.g. by default Kubernetes
-      // runs the image garbage collection when the disk usage reaches 85%
-      // of its available space. In that case, you'll want to reduce the
-      // critical threshold below to something like 14 or 15, otherwise
-      // the alert could fire under normal node usage.
-      fsSpaceFillingUpWarningThreshold: 40,
-      fsSpaceFillingUpCriticalThreshold: 20,
-
-      // Available disk space (%) thresholds on which to trigger the
-      // 'NodeFilesystemAlmostOutOfSpace' alerts.
-      fsSpaceAvailableWarningThreshold: 5,
-      fsSpaceAvailableCriticalThreshold: 3,
-
-      // Memory utilzation (%) level on which to trigger the
-      // 'NodeMemoryHighUtilization' alert.
-      memoryHighUtilizationThreshold: 90,
-
-      // Threshold for the rate of memory major page faults to trigger
-      // 'NodeMemoryMajorPagesFaults' alert.
-      memoryMajorPagesFaultsThreshold: 500,
-
-      // Disk IO queue level above which to trigger
-      // 'NodeDiskIOSaturation' alert.
-      diskIOSaturationThreshold: 10,
-
-      rateInterval: '5m',
-
-      dashboardPeriod: 'now-1h',
-      dashboardTimezone: 'default',
-      dashboardRefresh: '1m',
-
-      // logs lib related
-      enableLokiLogs: false,
-      extraLogLabels: ['transport', 'unit', 'level'],
-      logsVolumeGroupBy: 'level',
-      showLogsVolume: true,
-      logsFilteringSelector: self.filteringSelector,
-      logsExtraFilters:
-        |||
-          | label_format timestamp="{{__timestamp__}}"
-          | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}`
-        |||,
-    },
-
+    config: config,
     grafana: {
       variables: variables.new(this),
       targets: targets.new(this),
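The withConfigMixin helper itself sits mostly outside the visible hunk; only its tail (config+: config,) appears as context at the top of this diff. A sketch of its presumed shape, inferred from that context and from how the README now calls it:

    // Presumed definition (not fully shown in the diff): merge user-supplied
    // keys over the defaults imported from config.libsonnet.
    withConfigMixin(config): {
      config+: config,
    },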
18 changes: 10 additions & 8 deletions docs/node-observ-lib/mixin.libsonnet
@@ -1,15 +1,17 @@
 local g = import './g.libsonnet';
 local nodelib = import './main.libsonnet';


 local linux =
-  nodelib.new(
-    filteringSelector='job="node"',
-    groupLabels=['job'],
-    instanceLabels=['instance'],
-    dashboardNamePrefix='Node exporter / ',
-    dashboardTags=['node-exporter-mixin'],
-    uid='node'
-  );
+  nodelib.new()
+  + nodelib.withConfigMixin({
+    filteringSelector: 'job=~".*node.*"',
+    groupLabels: ['job'],
+    instanceLabels: ['instance'],
+    dashboardNamePrefix: 'Node exporter / ',
+    dashboardTags: ['node-exporter-mixin'],
+    uid: 'node',
+  });

 {
   grafanaDashboards+:: linux.grafana.dashboards,
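The hunk is truncated at grafanaDashboards; in the standard monitoring-mixin layout the remaining fields would expose alerts and recording rules the same way. A sketch under that assumption (the exact field paths below are guesses, not taken from this diff):

    {
      grafanaDashboards+:: linux.grafana.dashboards,
      // Hypothetical: alert/rule field names are not visible in this hunk.
      prometheusAlerts+:: linux.prometheus.alerts,
      prometheusRules+:: linux.prometheus.recordingRules,
    }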