Skip to content
This repository was archived by the owner on Nov 15, 2023. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
40762b6
Refactor rebase master prometheus_v0.3
nodebreaker0-0 Nov 28, 2019
472aca1
Milestone1: Final Version of v0.3
nodebreaker0-0 Dec 10, 2019
1974f94
no-std or warm compatibility issues, grapana-data -source code refere…
nodebreaker0-0 Jan 2, 2020
62a873d
Cargo.lock paritytech/master rebase
nodebreaker0-0 Jan 2, 2020
97e37e4
prometheus networking.rs del, grafana-data-source networking.rs pub e…
nodebreaker0-0 Jan 3, 2020
f3e444e
chore: reflect various feedback
nodebreaker0-0 Jan 10, 2020
8145df7
Spaces to tabs.
gavofyork Jan 6, 2020
fca71be
Replace grafana and tidy
expenses Jan 10, 2020
855ceed
Add generics
expenses Jan 10, 2020
70c6bcc
Add photo back
expenses Jan 10, 2020
5941041
Re-fix spaces in primitives/consensus/babe/src/inherents.rs
expenses Jan 10, 2020
7d9d341
Refactor rebase master prometheus_v0.3
nodebreaker0-0 Nov 28, 2019
52316ae
Milestone1: Final Version of v0.3
nodebreaker0-0 Dec 10, 2019
82cd8cf
no-std or warm compatibility issues, grapana-data -source code refere…
nodebreaker0-0 Jan 2, 2020
b052829
prometheus networking.rs del, grafana-data-source networking.rs pub e…
nodebreaker0-0 Jan 3, 2020
4e831e9
chore: reflect various feedback
nodebreaker0-0 Jan 10, 2020
0b15bc4
Replace grafana and tidy
expenses Jan 10, 2020
8025361
Add generics
expenses Jan 10, 2020
fd081c9
Add photo back
expenses Jan 10, 2020
f00bb9b
Re-fix spaces in primitives/consensus/babe/src/inherents.rs
expenses Jan 10, 2020
398da97
chore: revert this file back to paritytech/master inherents.rs.
nodebreaker0-0 Jan 14, 2020
84c458f
Add newline at EOF
expenses Jan 14, 2020
f931588
Merge remote-tracking branch 'nodebreaker/prometheus_v0.3' into ashle…
expenses Jan 16, 2020
5796c85
Merge remote-tracking branch 'parity/master' into ashley-prometheus
expenses Jan 16, 2020
2a45de8
Merge remote-tracking branch 'parity/master' into ashley-prometheus
expenses Jan 16, 2020
8bae73b
Merge remote-tracking branch 'parity/master' into ashley-prometheus
expenses Jan 17, 2020
1caa0f1
Merge remote-tracking branch 'parity/master' into ashley-prometheus
expenses Jan 17, 2020
ffb4746
Tidy
expenses Jan 17, 2020
8a6e3c5
Use local registry
expenses Jan 17, 2020
53c95de
fix typo
hskang9 Jan 17, 2020
23cb72e
chore: Apply review feedback
nodebreaker0-0 Jan 20, 2020
3d1634b
Merge remote-tracking branch 'nodebreaker/prometheus_v0.3' into ashle…
expenses Jan 20, 2020
55ac4f8
Merge remote-tracking branch 'parity/master' into ashley-prometheus
expenses Jan 20, 2020
3cfe43c
endpoint -> exporter
expenses Jan 20, 2020
0af2369
fix readme
expenses Jan 20, 2020
44f8c0e
Merge pull request #3 from paritytech/ashley-prometheus
nodebreaker0-0 Jan 21, 2020
d2bba61
Remove lazy_static, use ServiceMetrics struct instead
expenses Jan 21, 2020
5d3d9a7
Merge pull request #4 from paritytech/ashley-prometheus
nodebreaker0-0 Jan 22, 2020
32c04b4
Merge branch 'paritytech/master' into prometheus_v0.3
mxinden Jan 23, 2020
9bde830
Switch to using GaugeVecs
expenses Jan 23, 2020
97faad6
Merge pull request #5 from mxinden/prometheus_v0.3
nodebreaker0-0 Jan 24, 2020
f64a482
chore: without nightly , edit README
nodebreaker0-0 Jan 24, 2020
5d56d93
Merge branch 'prometheus_v0.3' into ashley-prometheus
nodebreaker0-0 Jan 24, 2020
50f2928
block_height -> block_height_number
expenses Jan 27, 2020
8edb710
Merge branch 'ashley-prometheus' of github.com:paritytech/substrate i…
expenses Jan 27, 2020
6a25ea8
Switch to a ready_transactions_number gauge
expenses Jan 29, 2020
18cf2be
Merge pull request #7 from paritytech/ashley-prometheus
nodebreaker0-0 Jan 29, 2020
bb388f5
Update utils/prometheus/src/lib.rs
hskang9 Jan 31, 2020
77ba252
no-prometheus flag add
nodebreaker0-0 Jan 31, 2020
5e327c2
/metrics url Input check
nodebreaker0-0 Jan 31, 2020
ccb3179
remove prometheus in Tracing
nodebreaker0-0 Feb 5, 2020
152176f
remove prometheus in Tracing
nodebreaker0-0 Feb 14, 2020
8de57c2
Merge branch 'master' into prometheus_v0.3
nodebreaker0-0 Feb 14, 2020
e76f200
chore: master code rebase edit
nodebreaker0-0 Feb 14, 2020
55428b7
gitlab-check-web-wasm edit code
nodebreaker0-0 Feb 14, 2020
d1003e6
From:from and cargo.lock update
nodebreaker0-0 Feb 17, 2020
5c2e085
Merge commit 'db1ab7d18fbe7876cdea43bbf30f147ddd263f94' into promethe…
nodebreaker0-0 Feb 17, 2020
b4b4432
with_prometheus_registry add background_tasks
nodebreaker0-0 Feb 17, 2020
ee421a6
Merge commit '419e5fd0026cfd528cd3b327789bb0a3a8215703' into promethe…
nodebreaker0-0 Feb 18, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Tidy
  • Loading branch information
expenses committed Jan 17, 2020
commit ffb4746ddc76139d67d6659e968fc0b0e1a2d7a8
4 changes: 2 additions & 2 deletions client/cli/src/informant/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ use prometheus_endpoint::{create_gauge, Gauge, U64};

prometheus_endpoint::lazy_static! {
pub static ref SYNC_TARGET: Gauge<U64> = create_gauge(
"sync_target_number",
"block sync target number"
"substrate_sync_target_number",
"Block sync target number"
);
}

Expand Down
2 changes: 1 addition & 1 deletion client/cli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ where
config.rpc_http = Some(parse_address(&format!("{}:{}", rpc_interface, 9933), cli.rpc_port)?);
config.rpc_ws = Some(parse_address(&format!("{}:{}", ws_interface, 9944), cli.ws_port)?);
config.prometheus_port = Some(
parse_address(&format!("{}:{}", prometheus_interface, 9955), cli.prometheus_port)?
parse_address(&format!("{}:{}", prometheus_interface, 9615), cli.prometheus_port)?
);

config.rpc_ws_max_connections = cli.ws_max_connections;
Expand Down
42 changes: 21 additions & 21 deletions client/service/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,36 +55,36 @@ use prometheus_endpoint::{create_gauge, Gauge, U64, F64};

prometheus_endpoint::lazy_static! {
pub static ref FINALITY_HEIGHT: Gauge<U64> = create_gauge(
"consensus_finality_block_height_number",
"block is finality HEIGHT"
"substrate_finality_block_height_number",
"Height of the highest finalized block"
);
pub static ref BEST_HEIGHT: Gauge<U64> = create_gauge(
"consensus_best_block_height_number",
"block is best HEIGHT"
"substrate_best_block_height_number",
"Height of the highest block"
);
pub static ref P2P_PEERS_NUM: Gauge<U64> = create_gauge(
"p2p_peers_number",
"network gosip peers number"
pub static ref PEERS_NUM: Gauge<U64> = create_gauge(
"substrate_peers_count",
"Number of network gossip peers"
);
pub static ref TX_COUNT: Gauge<U64> = create_gauge(
"consensus_num_txs",
"substrate_transaction_count",
"Number of transactions"
);
pub static ref NODE_MEMORY: Gauge<U64> = create_gauge(
"consensus_node_memory",
"node memory"
"substrate_memory_usage",
"Node memory usage"
);
pub static ref NODE_CPU: Gauge<F64> = create_gauge(
"consensus_node_cpu",
"node cpu"
"substrate_cpu_usage",
"Node CPU usage"
);
pub static ref P2P_NODE_DOWNLOAD: Gauge<U64> = create_gauge(
"p2p_peers_receive_byte_per_sec",
"p2p_node_download_per_sec_byte"
pub static ref NODE_DOWNLOAD: Gauge<U64> = create_gauge(
"substrate_receive_byte_per_sec",
"Received bytes per second"
);
pub static ref P2P_NODE_UPLOAD: Gauge<U64> = create_gauge(
"p2p_peers_send_byte_per_sec",
"p2p_node_upload_per_sec_byte"
pub static ref NODE_UPLOAD: Gauge<U64> = create_gauge(
"substrate_sent_byte_per_sec",
"Sent bytes per second"
);
}
/// Aggregator for the components required to build a service.
Expand Down Expand Up @@ -1006,9 +1006,9 @@ ServiceBuilder<
TX_COUNT.set(txpool_status.ready as u64);
FINALITY_HEIGHT.set(finalized_number);
BEST_HEIGHT.set(best_number);
P2P_PEERS_NUM.set(num_peers as u64);
P2P_NODE_DOWNLOAD.set(net_status.average_download_per_sec);
P2P_NODE_UPLOAD.set(net_status.average_upload_per_sec);
PEERS_NUM.set(num_peers as u64);
NODE_DOWNLOAD.set(net_status.average_download_per_sec);
NODE_UPLOAD.set(net_status.average_upload_per_sec);

ready(())
});
Expand Down
2 changes: 1 addition & 1 deletion client/tracing/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,6 @@ fn send_telemetry(span_datum: SpanDatum) {
);
}

fn send_prometheus(span_datum: SpanDatum) {
fn send_prometheus(_span_datum: SpanDatum) {
unimplemented!()
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mxinden do you know of a way that we could implement this API? It seems like prometheus gauges etc are intended to be created as static refs etc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My guess is that you can't implement it without creating a macro.

Or you can simply make a static reference.
https://github.com/tikv/rust-prometheus/blob/master/examples/example_edition_2018.rs

Copy link
Contributor

@mattrutherford mattrutherford Jan 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't yet understand fully how prometheus works, but to properly support this I think we would need to be able to:

  • batch together all values (spans) collected since last scrape
  • partition by target and name (by the use of labels perhaps).

With all targets currently in the code enabled, we can be approaching 200 measurements per second just across the runtime. Is this feasible to do while preserving timestamp (which we would need to explicitly provide for this receiver)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. I'll have a go at this later.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

be approaching 200 measurements

@mattrutherford 200 measurements are not an issue for Prometheus. Prometheus can easily handle couple of mega bytes of metric data per scrape. See kubernetes/kube-state-metrics#498 for some numbers.

batch together all values (spans) collected since last scrape

This sounds more like we want a Histogram, right?


Would it be possible to tackle the effort within client/tracing as a follow up pull request? I would like to keep this one small. What do you think @mattrutherford @expenses @nodebreaker0-0?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree it's good to be clear on terminology, but that wasn't my main concern - only that if we're ripping out the Grafana server as part of the introduction of this Prometheus feature, is whether we'd lose this functionality to plot individual observations* - however that seems unavoidable based on what you said.

*For my use-case it's OK because I use the Telemetry Receiver in substrate-tracing to send the data to substrate-analytics which uses a PostgreSQL datasource, which in turn can be queried by Grafana (because I want to archive the data); however this is not trivial to set up, so the question is - do we really want to kill the substrate Grafana server yet? It's a relatively new feature so maybe not many people use it, but I think it's something we should consider as part of our decision to remove it. Particularly as the current primary use-case for substrate-tracing is for profiling, and it might be handy for developers to have an easy way to look at this.

Pinging @marcio-diaz @shawntabrizi @DarkEld3r in case you have opinion on this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imo the Grafana server in its current state isn't really good at doing anything. If Prometheus isn't going to work for tracing, we should probably find something that is. I'm happy enough to leave the Grafana server in for the moment until that happens.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, good to know - and just to be clear that I have no problem removing it, if it's not going to be a big detriment to people - just wanted to make sure everyone was aware of full implications

Copy link

@Hyung-bharvest Hyung-bharvest Jan 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think tracing others' nodes is not (and should not be) the purpose of the prometheus exporter. It is purely for monitoring an operator's own nodes. Extracting information from nodes operated by others does not sound right from the perspective of information privacy.

I guess tracing functionality should remain in telemetry, and it should also be "off by default" for the privacy of node operators.

And storing and querying historical data for own nodes can be done on monitoring server(grafana), and it should not burden the node itself.

Please let me know if I misunderstood this context.

Copy link
Contributor

@mattrutherford mattrutherford Jan 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - I can see what the misunderstanding is here and it warrants some more explanation - tracing is only capturing local data; which we then have the option to send somewhere, either Log (output tracing data via logger), Grafana server (built-in) or Telemetry. When using telemetry to send the tracing data, it is not expected (or desired) to send to the default telemetry url, so we override that via the cli, with eg: --telemetry-url 'ws://localhost:8080 9' to send the data to an analytics server (often running on the same machine, but can be anywhere you want).

40 changes: 5 additions & 35 deletions utils/prometheus/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Substrate Prometheus Node Exporter
![grants](./photo_2019-12-13_16-32-53.jpg)
# Substrate Prometheus Exporter

## Introduction

Prometheus is one of the most widely used monitoring tool for managing high availability services supported by [Cloud Native Computing Foundation](https://www.cncf.io/). By providing Prometheus metrics in Substrate, node operators can easily adopt widely used display/alert tools such as Grafana and Alertmanager without setting-up/operating external Prometheus push gateways (which is an antipattern in the first place) through RPC connections. Easy access to such monitoring tools will benefit parachain developers/operators and validators to have much higher availability of their services.
Prometheus is one of the most widely used monitoring tool for managing highly available services supported by [Cloud Native Computing Foundation](https://www.cncf.io/). By providing Prometheus metrics in Substrate, node operators can easily adopt widely used display/alert tools such as Grafana and Alertmanager without setting-up/operating external Prometheus push gateways (which is an antipattern in the first place) through RPC connections. Easy access to such monitoring tools will benefit parachain developers/operators and validators to have much higher availability of their services.

## Table of Contents

Expand All @@ -24,37 +24,7 @@ Start Grafana

## Metrics

substrate can report and serve the Prometheus metrics, which in their turn can be consumed by Prometheus collector(s).

This functionality is disabled by default.

To enable the Prometheus metrics, set in your cli command (--prometheus-addr,--prometheus-port ).
Metrics will be served under /metrics on 33333 port by default.

### List of available metrics


Consensus metrics, namespace: `substrate`

| **Name** | **Type** | **Tags** | **Description** |
| -------------------------------------- | --------- | -------- | --------------------------------------------------------------- |
| consensus_finality_block_height_number | IntGauge | | finality Height of the chain |
| consensus_best_block_height_number | IntGauge | | best Height of the chain |
| consensus_target_syn_number | IntGauge | | syning Height target number |
| consensus_num_txs | Gauge | | Number of transactions |
| consensus_node_memory | IntGauge | | Node's primary memory |
| consensus_node_cpu | IntGauge | | Node's cpu load |
| consensus_state_cache_size | IntGauge | | used state cache size |
| p2p_peers_number | IntGauge | | Number of peers node's connected to |
| p2p_peer_receive_bytes_per_sec | IntGauge | | number of bytes received from a given peer |
| p2p_peer_send_bytes_per_sec | IntGauge | | number of bytes sent to a given peer |
| Resource_receive_bytes_per_sec(Future) | IntGauge | | Operating System of bytes received |
| Resource_send_bytes_per_sec(Future) | IntGauge | | Operating System of bytes sent |
| Resource_cpu_use(Future) | IntGauge | | Operating System cpu load |
| Resource_disk_use(Future) | IntGauge | | Operating System disk use |
| validator_sign_prevote(Future) | IntGauge | validator addr | validator sign vote list |
| validator_sign_precommit(Future) | IntGauge | validator addr | validator sign commit list |

Substrate can report and serve the Prometheus metrics, which in turn can be consumed by Prometheus collector(s). Metrics will be served under /metrics on 9615 port by default.

## Start Prometheus
### Install prometheus
Expand Down Expand Up @@ -84,7 +54,7 @@ Then edit `prometheus.yml` and add `jobs` :
### Start Prometheus

```bash
cd <prometheus file>
cd <prometheus folder>
./prometheus
```

Expand Down
Binary file removed utils/prometheus/photo_2019-12-13_16-32-53.jpg
Binary file not shown.
39 changes: 13 additions & 26 deletions utils/prometheus/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,22 @@ pub enum Error {
/// Http request error.
Http(hyper::http::Error),
/// i/o error.
Io(std::io::Error)
Io(std::io::Error),
#[display(fmt = "Prometheus export port {} already in use.", _0)]
PortInUse(SocketAddr)
}
impl std::error::Error for Error {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Error::Hyper(error) => Some(error),
Error::Http(error) => Some(error),
Error::Io(error) => Some(error)
Error::Io(error) => Some(error),
Error::PortInUse(_) => None
}
}
}

async fn request_metrics(req: Request<Body>) -> Result<Response<Body>, Error> {
async fn request_metrics(_req: Request<Body>) -> Result<Response<Body>, Error> {
let metric_families = prometheus::gather();
let mut buffer = vec![];
let encoder = TextEncoder::new();
Expand Down Expand Up @@ -84,40 +87,24 @@ impl<T> hyper::rt::Executor<T> for Executor
/// Initializes the metrics context, and starts an HTTP server
/// to serve metrics.
#[cfg(not(target_os = "unknown"))]
pub async fn init_prometheus(mut prometheus_addr: SocketAddr) -> Result<(), Error>{
use async_std::{net, io};
pub async fn init_prometheus(prometheus_addr: SocketAddr) -> Result<(), Error>{
use networking::Incoming;
let listener = loop {
let listener = net::TcpListener::bind(&prometheus_addr).await;
match listener {
Ok(listener) => {
log::info!("Prometheus server started at {}", prometheus_addr);
break listener
},
Err(err) => match err.kind() {
io::ErrorKind::AddrInUse | io::ErrorKind::PermissionDenied if prometheus_addr.port() != 0 => {
log::warn!(
"Prometheus server to already {} port.", prometheus_addr.port()
);
prometheus_addr.set_port(0);
continue;
},
_ => return Err(err.into())
}
}
};
let listener = async_std::net::TcpListener::bind(&prometheus_addr)
.await
.map_err(|_| Error::PortInUse(prometheus_addr))?;

let service = make_service_fn(|_| {
async {
Ok::<_, Error>(service_fn(request_metrics))
}
});

let _server = Server::builder(Incoming(listener.incoming()))
let server = Server::builder(Incoming(listener.incoming()))
.executor(Executor)
.serve(service)
.boxed();

let result = _server.await.map_err(Into::into);
let result = server.await.map_err(Into::into);

result
}
Expand Down