From 01af6307b794cbf4a7d3bc02299414edd8ac3203 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 25 Oct 2023 11:50:45 +0300 Subject: [PATCH 01/45] skeleton Signed-off-by: Andrei Sandu --- Cargo.lock | 25 +++++++++++++ Cargo.toml | 1 + polkadot/node/subsystem-bench/Cargo.toml | 46 ++++++++++++++++++++++++ polkadot/node/subsystem-bench/README.md | 6 ++++ polkadot/node/subsystem-bench/build.rs | 22 ++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 polkadot/node/subsystem-bench/Cargo.toml create mode 100644 polkadot/node/subsystem-bench/README.md create mode 100644 polkadot/node/subsystem-bench/build.rs diff --git a/Cargo.lock b/Cargo.lock index a8d679c6ce8b0..e846ae53a543d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12990,6 +12990,31 @@ dependencies = [ "sp-core", ] +[[package]] +name = "polkadot-subsystem-bench" +version = "1.0.0" +dependencies = [ + "assert_matches", + "async-trait", + "clap 4.4.6", + "color-eyre", + "futures", + "futures-timer", + "polkadot-erasure-coding", + "polkadot-node-core-backing", + "polkadot-node-primitives", + "polkadot-node-subsystem", + "polkadot-node-subsystem-test-helpers", + "polkadot-node-subsystem-types", + "polkadot-node-subsystem-util", + "polkadot-primitives", + "rand 0.8.5", + "sp-core", + "sp-keystore", + "substrate-build-script-utils", + "tracing-gum", +] + [[package]] name = "polkadot-test-client" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index c98fe6d1a3acf..2c5acccd5cfb6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -125,6 +125,7 @@ members = [ "polkadot/node/gum/proc-macro", "polkadot/node/jaeger", "polkadot/node/malus", + "polkadot/node/subsystem-bench", "polkadot/node/metrics", "polkadot/node/network/approval-distribution", "polkadot/node/network/availability-distribution", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml new file mode 100644 index 0000000000000..b2cc88ff057d5 --- /dev/null +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ 
-0,0 +1,46 @@ +[package] +name = "polkadot-subsystem-bench" +description = "Subsystem performance benchmark client" +version = "1.0.0" +authors.workspace = true +edition.workspace = true +license.workspace = true +readme = "README.md" +publish = false + +[[bin]] +name = "subsystem-bench" +path = "src/subsystem-bench.rs" + +# Prevent rustdoc error. Already documented from top-level Cargo.toml. +doc = false + +[dependencies] +polkadot-node-subsystem = { path = "../subsystem" } +polkadot-node-subsystem-util = { path = "../subsystem-util" } +polkadot-node-subsystem-types = { path = "../subsystem-types" } +polkadot-node-core-backing = { path = "../core/backing" } +polkadot-node-primitives = { path = "../primitives" } +polkadot-primitives = { path = "../../primitives" } +color-eyre = { version = "0.6.1", default-features = false } +assert_matches = "1.5" +async-trait = "0.1.57" +sp-keystore = { path = "../../../substrate/primitives/keystore" } +sp-core = { path = "../../../substrate/primitives/core" } +clap = { version = "4.4.6", features = ["derive"] } +futures = "0.3.21" +futures-timer = "3.0.2" +gum = { package = "tracing-gum", path = "../gum" } +erasure = { package = "polkadot-erasure-coding", path = "../../erasure-coding" } +rand = "0.8.5" + +[dev-dependencies] +polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" } +sp-core = { path = "../../../substrate/primitives/core" } +futures = { version = "0.3.21", features = ["thread-pool"] } + +[build-dependencies] +substrate-build-script-utils = { path = "../../../substrate/utils/build-script-utils" } + +[features] +default = [] diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md new file mode 100644 index 0000000000000..8843f9883116f --- /dev/null +++ b/polkadot/node/subsystem-bench/README.md @@ -0,0 +1,6 @@ +# Subsystem benchmark client + +Run subsystem performance tests in isolation. 
+ +Currently implemented benchmarks: +* `availability-recovery` diff --git a/polkadot/node/subsystem-bench/build.rs b/polkadot/node/subsystem-bench/build.rs new file mode 100644 index 0000000000000..84fe22e23ed6f --- /dev/null +++ b/polkadot/node/subsystem-bench/build.rs @@ -0,0 +1,22 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see <http://www.gnu.org/licenses/>. + +fn main() { + substrate_build_script_utils::generate_cargo_keys(); + // For the node/worker version check, make sure we always rebuild the node and binary workers + // when the version changes. 
+ substrate_build_script_utils::rerun_if_git_head_changed(); +} From 7c22abeb76ed0db47f7df9ac475c2ccab7743b67 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Nov 2023 19:15:38 +0200 Subject: [PATCH 02/45] wip Signed-off-by: Andrei Sandu --- Cargo.lock | 18 +- .../network/availability-recovery/src/lib.rs | 13 +- polkadot/node/subsystem-bench/Cargo.toml | 21 +- .../node/subsystem-bench/src/availability.rs | 501 ++++++++++++++++++ .../subsystem-bench/src/subsystem-bench.rs | 133 +++++ 5 files changed, 674 insertions(+), 12 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/availability.rs create mode 100644 polkadot/node/subsystem-bench/src/subsystem-bench.rs diff --git a/Cargo.lock b/Cargo.lock index e846ae53a543d..d113fd7e43cd3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12998,20 +12998,32 @@ dependencies = [ "async-trait", "clap 4.4.6", "color-eyre", + "env_logger 0.9.3", "futures", "futures-timer", + "log", + "parity-scale-codec", + "polkadot-availability-recovery", "polkadot-erasure-coding", - "polkadot-node-core-backing", + "polkadot-node-metrics", + "polkadot-node-network-protocol", "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-node-subsystem-test-helpers", "polkadot-node-subsystem-types", "polkadot-node-subsystem-util", "polkadot-primitives", + "polkadot-primitives-test-helpers", + "prometheus", "rand 0.8.5", + "sc-network", + "sc-service", + "sp-application-crypto", "sp-core", + "sp-keyring", "sp-keystore", "substrate-build-script-utils", + "tokio", "tracing-gum", ] @@ -18675,9 +18687,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.32.0" +version = "1.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" dependencies = [ "backtrace", "bytes", diff --git 
a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index e2146981da926..156a8cbbc82e6 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -65,7 +65,7 @@ mod error; mod futures_undead; mod metrics; mod task; -use metrics::Metrics; +pub use metrics::Metrics; #[cfg(test)] mod tests; @@ -582,7 +582,7 @@ impl AvailabilityRecoverySubsystem { } } - async fn run(self, mut ctx: Context) -> SubsystemResult<()> { + pub async fn run(self, mut ctx: Context) -> SubsystemResult<()> { let mut state = State::default(); let Self { mut req_receiver, metrics, recovery_strategy_kind, bypass_availability_store } = self; @@ -617,9 +617,12 @@ impl AvailabilityRecoverySubsystem { .into_iter() .cycle(); + gum::debug!("Subsystem running"); loop { let recv_req = req_receiver.recv(|| vec![COST_INVALID_REQUEST]).fuse(); pin_mut!(recv_req); + gum::debug!("waiting for message"); + futures::select! { erasure_task = erasure_task_rx.next() => { match erasure_task { @@ -640,7 +643,7 @@ impl AvailabilityRecoverySubsystem { } }, None => { - gum::debug!( + gum::trace!( target: LOG_TARGET, "Erasure task channel closed", ); @@ -655,6 +658,7 @@ impl AvailabilityRecoverySubsystem { &mut state, signal, ).await? { + gum::debug!(target: LOG_TARGET, "subsystem concluded"); return Ok(()); } FromOrchestra::Communication { msg } => { @@ -818,10 +822,11 @@ async fn erasure_task_thread( let _ = sender.send(maybe_data); }, None => { - gum::debug!( + gum::trace!( target: LOG_TARGET, "Erasure task channel closed. 
Node shutting down ?", ); + break }, } } diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index b2cc88ff057d5..729749ab153bf 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -19,9 +19,10 @@ doc = false polkadot-node-subsystem = { path = "../subsystem" } polkadot-node-subsystem-util = { path = "../subsystem-util" } polkadot-node-subsystem-types = { path = "../subsystem-types" } -polkadot-node-core-backing = { path = "../core/backing" } polkadot-node-primitives = { path = "../primitives" } polkadot-primitives = { path = "../../primitives" } +polkadot-node-network-protocol = { path = "../network/protocol" } +polkadot-availability-recovery = { path = "../network/availability-recovery" } color-eyre = { version = "0.6.1", default-features = false } assert_matches = "1.5" async-trait = "0.1.57" @@ -31,13 +32,23 @@ clap = { version = "4.4.6", features = ["derive"] } futures = "0.3.21" futures-timer = "3.0.2" gum = { package = "tracing-gum", path = "../gum" } -erasure = { package = "polkadot-erasure-coding", path = "../../erasure-coding" } +polkadot-erasure-coding = { package = "polkadot-erasure-coding", path = "../../erasure-coding" } +log = "0.4.17" +env_logger = "0.9.0" rand = "0.8.5" +parity-scale-codec = { version = "3.6.1", features = ["std", "derive"] } +tokio = "1.24.2" -[dev-dependencies] polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" } -sp-core = { path = "../../../substrate/primitives/core" } -futures = { version = "0.3.21", features = ["thread-pool"] } +sp-keyring = { path = "../../../substrate/primitives/keyring" } +sp-application-crypto = { path = "../../../substrate/primitives/application-crypto" } +sc-network = { path = "../../../substrate/client/network" } +sc-service = { path = "../../../substrate/client/service" } +polkadot-node-metrics = { path = "../metrics" } + +polkadot-primitives-test-helpers = { path = 
"../../primitives/test-helpers" } +# prometheus = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } +prometheus = { version = "0.13.0", default-features = false } [build-dependencies] substrate-build-script-utils = { path = "../../../substrate/utils/build-script-utils" } diff --git a/polkadot/node/subsystem-bench/src/availability.rs b/polkadot/node/subsystem-bench/src/availability.rs new file mode 100644 index 0000000000000..d5cb9515ca687 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability.rs @@ -0,0 +1,501 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . 
+ +use std::{sync::Arc, time::Duration}; + +use assert_matches::assert_matches; +use env_logger::Env; +use futures::{ + channel::{mpsc, oneshot}, + executor, future, Future, FutureExt, SinkExt, +}; +use futures_timer::Delay; +use polkadot_node_metrics::metrics::Metrics; + +use polkadot_availability_recovery::{AvailabilityRecoverySubsystem, Metrics as SubsystemMetrics}; + +use parity_scale_codec::Encode; +use polkadot_node_network_protocol::request_response::{ + self as req_res, v1::ChunkResponse, IncomingRequest, Recipient, ReqProtocolNames, Requests, +}; + +use prometheus::Registry; +use sc_network::{config::RequestResponseConfig, IfDisconnected, OutboundFailure, RequestFailure}; + +use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; +use polkadot_node_primitives::{BlockData, PoV, Proof}; +use polkadot_node_subsystem::{ + errors::RecoveryError, + jaeger, + messages::{ + AllMessages, AvailabilityRecoveryMessage, AvailabilityStoreMessage, NetworkBridgeTxMessage, + RuntimeApiMessage, RuntimeApiRequest, + }, + overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem, Subsystem, + SubsystemContext, SubsystemError, SubsystemResult, +}; + +const LOG_TARGET: &str = "subsystem-bench::availability"; + +use polkadot_erasure_coding::recovery_threshold; +use polkadot_node_primitives::{AvailableData, ErasureChunk}; +// use polkadot_node_subsystem::{ +// errors::RecoveryError, +// jaeger, +// messages::{AvailabilityRecoveryMessage, AvailabilityStoreMessage}, +// overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem, +// SubsystemContext, SubsystemError, SubsystemResult, +// }; +use polkadot_node_subsystem_test_helpers::{ + make_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, +}; +use polkadot_node_subsystem_util::TimeoutExt; +use polkadot_primitives::{ + AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, + PersistedValidationData, SessionIndex, 
SessionInfo, ValidatorId, ValidatorIndex, +}; +use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; +use sc_service::{SpawnTaskHandle, TaskManager}; + +type VirtualOverseer = TestSubsystemContextHandle<AvailabilityRecoveryMessage>; + +// Deterministic genesis hash for protocol names +const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); + +struct AvailabilityRecoverySubsystemInstance { + protocol_config: RequestResponseConfig, +} + +pub struct EnvParams { + // The candidate we will recover in the benchmark. + candidate: CandidateReceipt, +} + +// Implements a mockup of NetworkBridge and AvailabilityStore to provide state for +// `AvailabilityRecoverySubsystemInstance` +pub struct TestEnvironment { + // A tokio runtime to use in the test + runtime: tokio::runtime::Handle, + // A task manager that tracks task poll durations. + task_manager: TaskManager, + // The Prometheus metrics registry + registry: Registry, + // A test overseer. + to_subsystem: mpsc::Sender<FromOrchestra<AvailabilityRecoveryMessage>>, + // Parameters + params: EnvParams, + // Subsystem instance, currently keeps req/response protocol channel senders. + instance: AvailabilityRecoverySubsystemInstance, +} + +impl TestEnvironment { + pub fn new(runtime: tokio::runtime::Handle, mut params: EnvParams, registry: Registry) -> Self { + let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(&registry)).unwrap(); + let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( + &registry, + task_manager.spawn_handle(), + runtime.clone(), + ); + + // TODO: support parametrization of initial test state + // n_validator, n_cores. + let state = TestState::new(params.candidate.clone()); + // Override candidate after computing erasure in `TestState::new` + params.candidate = state.candidate(); + + // Create channel to inject messages int the subsystem. + let to_subsystem = virtual_overseer.tx.clone(); + + // We need to start a receiver to process messages from the subsystem. 
+ task_manager.spawn_handle().spawn_blocking( + "test-environment", + "test-environment", + async move { Self::env_task(virtual_overseer, state).await }, + ); + + TestEnvironment { runtime, task_manager, registry, to_subsystem, params, instance } + } + + pub fn params(&self) -> &EnvParams { + &self.params + } + + async fn respond_to_send_request(state: &mut TestState, request: Requests) { + match request { + Requests::ChunkFetchingV1(outgoing_request) => { + let validator_index = outgoing_request.payload.index.0 as usize; + let chunk: ChunkResponse = state.chunks[validator_index].clone().into(); + + let _ = outgoing_request + .pending_response + .send(Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode())); + }, + _ => panic!("received an unexpected request"), + } + } + + // A task that mocks dependent subsystems based on environment configuration. + // TODO: Spawn real subsystems, user overseer builder. + async fn env_task( + mut ctx: TestSubsystemContextHandle, + mut state: TestState, + ) { + loop { + futures::select! { + message = ctx.recv().fuse() => { + gum::debug!(target: LOG_TARGET, ?message, "Env task received message"); + + match message { + AllMessages::NetworkBridgeTx( + NetworkBridgeTxMessage::SendRequests( + requests, + _if_disconnected, + ) + ) => { + for request in requests { + // TODO: add latency variance when answering requests. This should be an env parameter. + Self::respond_to_send_request(&mut state, request).await; + } + }, + AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx)) => { + // TODO: Simulate av store load by delaying the response. + state.respond_none_to_available_data_query(tx).await; + }, + AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAllChunks(_candidate_hash, tx)) => { + // Test env: We always have our own chunk. 
+ state.respond_to_query_all_request(|index| index == state.validator_index.0 as usize, tx).await; + }, + AllMessages::AvailabilityStore( + AvailabilityStoreMessage::QueryChunkSize(_, tx) + ) => { + let chunk_size = state.chunks[0].encoded_size(); + let _ = tx.send(Some(chunk_size)); + } + AllMessages::RuntimeApi(RuntimeApiMessage::Request( + relay_parent, + RuntimeApiRequest::SessionInfo( + session_index, + tx, + ) + )) => { + tx.send(Ok(Some(state.session_info()))).unwrap(); + } + _ => panic!("Unexpected input") + } + } + } + } + } + + // Send a message to the subsystem under test environment. + pub async fn send_message(&mut self, msg: AvailabilityRecoveryMessage) { + gum::trace!(msg = ?msg, "sending message"); + self.to_subsystem + .send(FromOrchestra::Communication { msg }) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) + }) + .unwrap(); + } + + // Send a signal to the subsystem under test environment. + pub async fn send_signal(&mut self, signal: OverseerSignal) { + self.to_subsystem + .send(FromOrchestra::Signal(signal)) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms is more than enough for sending signals.", TIMEOUT.as_millis()) + }) + .unwrap(); + } +} + +/// Implementation for chunks only +/// TODO: all recovery methods. 
+impl AvailabilityRecoverySubsystemInstance { + pub fn new( + registry: &Registry, + spawn_task_handle: SpawnTaskHandle, + runtime: tokio::runtime::Handle, + ) -> (Self, TestSubsystemContextHandle<AvailabilityRecoveryMessage>) { + let (context, virtual_overseer) = make_subsystem_context(spawn_task_handle.clone()); + let (collation_req_receiver, req_cfg) = + IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); + let subsystem = AvailabilityRecoverySubsystem::with_chunks_only( + collation_req_receiver, + Metrics::try_register(&registry).unwrap(), + ); + + let spawned_subsystem = subsystem.start(context); + let subsystem_future = async move { + spawned_subsystem.future.await.unwrap(); + }; + + spawn_task_handle.spawn_blocking( + spawned_subsystem.name, + spawned_subsystem.name, + subsystem_future, + ); + + (Self { protocol_config: req_cfg }, virtual_overseer) + } +} + +const TIMEOUT: Duration = Duration::from_millis(300); + +// We use this to bail out sending messages to the subsystem if it is overloaded such that +// the time of flight breaches 5s. +// This should eventually be a test parameter. +const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); + +macro_rules! delay { + ($delay:expr) => { + Delay::new(Duration::from_millis($delay)).await; + }; +} + +use sp_keyring::Sr25519Keyring; + +#[derive(Debug)] +enum Has { + No, + Yes, + NetworkError(RequestFailure), + /// Make request not return at all, instead the sender is returned from the function. + /// + /// Note, if you use `DoesNotReturn` you have to keep the returned senders alive, otherwise the + /// subsystem will receive a cancel event and the request actually does return. + DoesNotReturn, +} + +impl Has { + fn timeout() -> Self { + Has::NetworkError(RequestFailure::Network(OutboundFailure::Timeout)) + } +} + +#[derive(Clone)] +struct TestState { + validators: Vec<Sr25519Keyring>, + validator_public: IndexedVec<ValidatorIndex, ValidatorId>, + validator_authority_id: Vec<AuthorityDiscoveryId>, + // The test node validator index. 
+ validator_index: ValidatorIndex, + candidate: CandidateReceipt, + session_index: SessionIndex, + + persisted_validation_data: PersistedValidationData, + + available_data: AvailableData, + chunks: Vec, + invalid_chunks: Vec, +} + +impl TestState { + fn candidate(&self) -> CandidateReceipt { + self.candidate.clone() + } + + fn threshold(&self) -> usize { + recovery_threshold(self.validators.len()).unwrap() + } + + fn impossibility_threshold(&self) -> usize { + self.validators.len() - self.threshold() + 1 + } + + async fn respond_to_available_data_query(&self, tx: oneshot::Sender>) { + let _ = tx.send(Some(self.available_data.clone())); + } + + async fn respond_none_to_available_data_query( + &self, + tx: oneshot::Sender>, + ) { + let _ = tx.send(None); + } + + fn session_info(&self) -> SessionInfo { + SessionInfo { + validators: self.validator_public.clone(), + discovery_keys: self.validator_authority_id.clone(), + // all validators in the same group. + validator_groups: IndexedVec::>::from(vec![(0..self + .validators + .len()) + .map(|i| ValidatorIndex(i as _)) + .collect()]), + assignment_keys: vec![], + n_cores: 0, + zeroth_delay_tranche_width: 0, + relay_vrf_modulo_samples: 0, + n_delay_tranches: 0, + no_show_slots: 0, + needed_approvals: 0, + active_validator_indices: vec![], + dispute_period: 6, + random_seed: [0u8; 32], + } + } + async fn respond_to_query_all_request( + &self, + send_chunk: impl Fn(usize) -> bool, + tx: oneshot::Sender>, + ) { + let v = self.chunks.iter().filter(|c| send_chunk(c.index.0 as usize)).cloned().collect(); + + let _ = tx.send(v); + } +} + +fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> IndexedVec { + val_ids.iter().map(|v| v.public().into()).collect() +} + +fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { + val_ids.iter().map(|v| v.public().into()).collect() +} + +fn derive_erasure_chunks_with_proofs_and_root( + n_validators: usize, + available_data: &AvailableData, + alter_chunk: impl Fn(usize, &mut Vec), +) -> 
(Vec, Hash) { + let mut chunks: Vec> = obtain_chunks(n_validators, available_data).unwrap(); + + for (i, chunk) in chunks.iter_mut().enumerate() { + alter_chunk(i, chunk) + } + + // create proofs for each erasure chunk + let branches = branches(chunks.as_ref()); + + let root = branches.root(); + let erasure_chunks = branches + .enumerate() + .map(|(index, (proof, chunk))| ErasureChunk { + chunk: chunk.to_vec(), + index: ValidatorIndex(index as _), + proof: Proof::try_from(proof).unwrap(), + }) + .collect::>(); + + (erasure_chunks, root) +} + +impl TestState { + fn new(mut candidate: CandidateReceipt) -> Self { + let validators = vec![ + Sr25519Keyring::Ferdie, // <- this node, role: validator + Sr25519Keyring::Alice, + Sr25519Keyring::Bob, + Sr25519Keyring::Charlie, + Sr25519Keyring::Dave, + ]; + + let validator_public = validator_pubkeys(&validators); + let validator_authority_id = validator_authority_id(&validators); + let validator_index = ValidatorIndex(0); + + let session_index = 10; + + let persisted_validation_data = PersistedValidationData { + parent_head: HeadData(vec![7, 8, 9]), + relay_parent_number: Default::default(), + max_pov_size: 1024, + relay_parent_storage_root: Default::default(), + }; + + /// A 5MB PoV. 
+ let pov = PoV { block_data: BlockData(vec![42; 1024 * 1024 * 5]) }; + + let available_data = AvailableData { + validation_data: persisted_validation_data.clone(), + pov: Arc::new(pov), + }; + + let (chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( + validators.len(), + &available_data, + |_, _| {}, + ); + // Mess around: + let invalid_chunks = chunks + .iter() + .cloned() + .map(|mut chunk| { + if chunk.chunk.len() >= 2 && chunk.chunk[0] != chunk.chunk[1] { + chunk.chunk[0] = chunk.chunk[1]; + } else if chunk.chunk.len() >= 1 { + chunk.chunk[0] = !chunk.chunk[0]; + } else { + chunk.proof = Proof::dummy_proof(); + } + chunk + }) + .collect(); + debug_assert_ne!(chunks, invalid_chunks); + + candidate.descriptor.erasure_root = erasure_root; + + Self { + validators, + validator_public, + validator_authority_id, + validator_index, + candidate, + session_index, + persisted_validation_data, + available_data, + chunks, + invalid_chunks, + } + } +} + +pub fn bench_chunk_recovery_params() -> EnvParams { + let mut candidate = dummy_candidate_receipt(dummy_hash()); + EnvParams { candidate } +} +pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { + env.send_signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(new_leaf( + Hash::repeat_byte(1), + 1, + )))) + .await; + + let mut candidate = env.params().candidate.clone(); + + for candidate_num in 0..10u64 { + let (tx, rx) = oneshot::channel(); + + candidate.descriptor.relay_parent = Hash::from_low_u64_be(candidate_num); + + env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( + candidate.clone(), + 1, + Some(GroupIndex(0)), + tx, + )) + .await; + + let available_data = rx.await.unwrap().unwrap(); + } + env.send_signal(OverseerSignal::Conclude).await; +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs new file mode 100644 index 0000000000000..3acf561e0dafd --- /dev/null +++ 
b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -0,0 +1,133 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see <http://www.gnu.org/licenses/>. + +//! A tool for running subsystem benchmark tests designed for development and +//! CI regression testing. + +use clap::Parser; +use color_eyre::eyre; +use prometheus::proto::LabelPair; +use sc_service::TaskManager; + +pub(crate) mod availability; + +use availability::{EnvParams, TestEnvironment}; +const LOG_TARGET: &str = "subsystem-bench"; + +/// Define the supported benchmarks targets +#[derive(Debug, Parser)] +#[command(about = "Target subsystems", version, rename_all = "kebab-case")] +enum BenchmarkTarget { + /// Benchmark availability recovery strategies. + AvailabilityRecovery, +} + +#[derive(Debug, Parser)] +#[allow(missing_docs)] +struct BenchCli { + #[command(subcommand)] + pub target: BenchmarkTarget, +} + +fn new_runtime() -> tokio::runtime::Runtime { + tokio::runtime::Builder::new_multi_thread() + .thread_name("subsystem-bench") + .enable_all() + .thread_stack_size(3 * 1024 * 1024) + .build() + .unwrap() +} + +impl BenchCli { + /// Launch the selected benchmark. 
+ fn launch(self) -> eyre::Result<()> { + use prometheus::{proto::MetricType, Counter, Encoder, Opts, Registry, TextEncoder}; + + let encoder = TextEncoder::new(); + + println!("Preparing {:?} benchmarks", self.target); + + let runtime = new_runtime(); + let registry = Registry::new(); + + let params = availability::bench_chunk_recovery_params(); + let mut env = TestEnvironment::new(runtime.handle().clone(), params, registry.clone()); + + runtime.block_on(availability::bench_chunk_recovery(&mut env)); + + let metric_families = registry.gather(); + let total_subsystem_cpu = 0; + + for familiy in metric_families { + let metric_type = familiy.get_field_type(); + + for metric in familiy.get_metric() { + match metric_type { + MetricType::HISTOGRAM => { + let h = metric.get_histogram(); + + let mut inf_seen = false; + + let labels = metric.get_label(); + // Skip test env usage. + let mut env_label = LabelPair::default(); + env_label.set_name("task_group".into()); + env_label.set_value("test-environment".into()); + + let mut is_env_metric = false; + for label_pair in labels { + if &env_label == label_pair { + is_env_metric = true; + break + } + } + + if !is_env_metric { + println!( + "{:?} CPU seconds used: {:?}", + familiy.get_name(), + h.get_sample_sum() + ); + } + }, + _ => {}, + } + } + } + // encoder.encode(&metric_families, &mut buffer).unwrap(); + + // Output to the standard output. 
+ // println!("Metrics: {}", String::from_utf8(buffer).unwrap()); + Ok(()) + } +} + +fn main() -> eyre::Result<()> { + color_eyre::install()?; + let _ = env_logger::builder() + .is_test(true) + .filter(Some(LOG_TARGET), log::LevelFilter::Debug) + .try_init(); + + let cli: BenchCli = BenchCli::parse(); + cli.launch()?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; +} From c3adc77f2920363d0df458c26f1b9a2a70e8ad2b Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 6 Nov 2023 22:54:39 +0200 Subject: [PATCH 03/45] measure tput and fixes Signed-off-by: Andrei Sandu --- .../node/subsystem-bench/src/availability.rs | 183 +++++++++++------- .../subsystem-bench/src/subsystem-bench.rs | 10 +- 2 files changed, 118 insertions(+), 75 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability.rs b/polkadot/node/subsystem-bench/src/availability.rs index d5cb9515ca687..72c8a736217d0 100644 --- a/polkadot/node/subsystem-bench/src/availability.rs +++ b/polkadot/node/subsystem-bench/src/availability.rs @@ -14,10 +14,13 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; use assert_matches::assert_matches; -use env_logger::Env; +use color_eyre::owo_colors::colors::xterm; use futures::{ channel::{mpsc, oneshot}, executor, future, Future, FutureExt, SinkExt, @@ -52,20 +55,14 @@ const LOG_TARGET: &str = "subsystem-bench::availability"; use polkadot_erasure_coding::recovery_threshold; use polkadot_node_primitives::{AvailableData, ErasureChunk}; -// use polkadot_node_subsystem::{ -// errors::RecoveryError, -// jaeger, -// messages::{AvailabilityRecoveryMessage, AvailabilityStoreMessage}, -// overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem, -// SubsystemContext, SubsystemError, SubsystemResult, -// }; + use polkadot_node_subsystem_test_helpers::{ - make_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, + make_buffered_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, }; use polkadot_node_subsystem_util::TimeoutExt; use polkadot_primitives::{ - AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, - PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, + AuthorityDiscoveryId, CandidateHash, CandidateReceipt, CoreIndex, GroupIndex, Hash, HeadData, + IndexedVec, PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, }; use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; @@ -97,12 +94,18 @@ pub struct TestEnvironment { to_subsystem: mpsc::Sender>, // Parameters params: EnvParams, - // Subsystem instance, currently keeps req/response protocol channel senders. + // Subsystem instance, currently keeps req/response protocol channel senders + // for the whole duration of the test. instance: AvailabilityRecoverySubsystemInstance, + // The test intial state. 
The current state is owned by the task doing the overseer/subsystem + // mockings. + state: TestState, } impl TestEnvironment { - pub fn new(runtime: tokio::runtime::Handle, mut params: EnvParams, registry: Registry) -> Self { + // Create a new test environment with specified initial state and prometheus registry. + // We use prometheus metrics to collect per job task poll time and subsystem metrics. + pub fn new(runtime: tokio::runtime::Handle, state: TestState, registry: Registry) -> Self { let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(®istry)).unwrap(); let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( ®istry, @@ -112,26 +115,29 @@ impl TestEnvironment { // TODO: support parametrization of initial test state // n_validator, n_cores. - let state = TestState::new(params.candidate.clone()); - // Override candidate after computing erasure in `TestState::new` - params.candidate = state.candidate(); + let params = EnvParams { candidate: state.candidate() }; - // Create channel to inject messages int the subsystem. + // Copy sender for later when we need to inject messages in to the subsystem. let to_subsystem = virtual_overseer.tx.clone(); + let task_state = state.clone(); // We need to start a receiver to process messages from the subsystem. 
+ // This mocks an overseer and all dependent subsystems task_manager.spawn_handle().spawn_blocking( "test-environment", "test-environment", - async move { Self::env_task(virtual_overseer, state).await }, + async move { Self::env_task(virtual_overseer, task_state).await }, ); - TestEnvironment { runtime, task_manager, registry, to_subsystem, params, instance } + TestEnvironment { runtime, task_manager, registry, to_subsystem, params, instance, state } } pub fn params(&self) -> &EnvParams { &self.params } + pub fn input(&self) -> &TestInput { + self.state.input() + } async fn respond_to_send_request(state: &mut TestState, request: Requests) { match request { @@ -234,7 +240,8 @@ impl AvailabilityRecoverySubsystemInstance { spawn_task_handle: SpawnTaskHandle, runtime: tokio::runtime::Handle, ) -> (Self, TestSubsystemContextHandle) { - let (context, virtual_overseer) = make_subsystem_context(spawn_task_handle.clone()); + let (context, virtual_overseer) = + make_buffered_subsystem_context(spawn_task_handle.clone(), 4096); let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); let subsystem = AvailabilityRecoverySubsystem::with_chunks_only( @@ -291,7 +298,7 @@ impl Has { } #[derive(Clone)] -struct TestState { +pub struct TestState { validators: Vec, validator_public: IndexedVec, validator_authority_id: Vec, @@ -305,9 +312,14 @@ struct TestState { available_data: AvailableData, chunks: Vec, invalid_chunks: Vec, + input: TestInput, } impl TestState { + fn input(&self) -> &TestInput { + &self.input + } + fn candidate(&self) -> CandidateReceipt { self.candidate.clone() } @@ -362,53 +374,14 @@ impl TestState { let _ = tx.send(v); } -} -fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> IndexedVec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -fn 
derive_erasure_chunks_with_proofs_and_root( - n_validators: usize, - available_data: &AvailableData, - alter_chunk: impl Fn(usize, &mut Vec), -) -> (Vec, Hash) { - let mut chunks: Vec> = obtain_chunks(n_validators, available_data).unwrap(); - - for (i, chunk) in chunks.iter_mut().enumerate() { - alter_chunk(i, chunk) - } - - // create proofs for each erasure chunk - let branches = branches(chunks.as_ref()); - - let root = branches.root(); - let erasure_chunks = branches - .enumerate() - .map(|(index, (proof, chunk))| ErasureChunk { - chunk: chunk.to_vec(), - index: ValidatorIndex(index as _), - proof: Proof::try_from(proof).unwrap(), - }) - .collect::>(); - - (erasure_chunks, root) -} - -impl TestState { - fn new(mut candidate: CandidateReceipt) -> Self { - let validators = vec![ - Sr25519Keyring::Ferdie, // <- this node, role: validator - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - ]; + pub fn new(input: TestInput) -> Self { + let validators = (0..input.n_validators as u64) + .into_iter() + .map(|v| Sr25519Keyring::Alice) + .collect::>(); + let mut candidate = dummy_candidate_receipt(dummy_hash()); let validator_public = validator_pubkeys(&validators); let validator_authority_id = validator_authority_id(&validators); let validator_index = ValidatorIndex(0); @@ -465,15 +438,66 @@ impl TestState { available_data, chunks, invalid_chunks, + input, } } } -pub fn bench_chunk_recovery_params() -> EnvParams { - let mut candidate = dummy_candidate_receipt(dummy_hash()); - EnvParams { candidate } +fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> IndexedVec { + val_ids.iter().map(|v| v.public().into()).collect() } + +fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { + val_ids.iter().map(|v| v.public().into()).collect() +} + +fn derive_erasure_chunks_with_proofs_and_root( + n_validators: usize, + available_data: &AvailableData, + alter_chunk: impl Fn(usize, &mut Vec), +) -> (Vec, Hash) { + let mut chunks: 
Vec> = obtain_chunks(n_validators, available_data).unwrap(); + + for (i, chunk) in chunks.iter_mut().enumerate() { + alter_chunk(i, chunk) + } + + // create proofs for each erasure chunk + let branches = branches(chunks.as_ref()); + + let root = branches.root(); + let erasure_chunks = branches + .enumerate() + .map(|(index, (proof, chunk))| ErasureChunk { + chunk: chunk.to_vec(), + index: ValidatorIndex(index as _), + proof: Proof::try_from(proof).unwrap(), + }) + .collect::>(); + + (erasure_chunks, root) +} + +/// The test input parameters +#[derive(Clone)] +pub struct TestInput { + pub n_validators: usize, + pub n_cores: usize, + pub pov_size: usize, + // This parameter is used to determine how many recoveries we batch in parallel + // similarly to how in practice tranche0 assignments work. + pub vrf_modulo_samples: usize, +} + +impl Default for TestInput { + fn default() -> Self { + Self { n_validators: 300, n_cores: 50, pov_size: 5 * 1024 * 1024, vrf_modulo_samples: 6 } + } +} + pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { + let input = env.input().clone(); + env.send_signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(new_leaf( Hash::repeat_byte(1), 1, @@ -482,8 +506,12 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let mut candidate = env.params().candidate.clone(); - for candidate_num in 0..10u64 { + let start_marker = Instant::now(); + + let mut batch = Vec::new(); + for candidate_num in 0..input.n_cores as u64 { let (tx, rx) = oneshot::channel(); + batch.push(rx); candidate.descriptor.relay_parent = Hash::from_low_u64_be(candidate_num); @@ -495,7 +523,20 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { )) .await; + if batch.len() >= input.vrf_modulo_samples { + for rx in std::mem::take(&mut batch) { + let available_data = rx.await.unwrap().unwrap(); + } + } + } + + for rx in std::mem::take(&mut batch) { let available_data = rx.await.unwrap().unwrap(); } + 
env.send_signal(OverseerSignal::Conclude).await; + let duration = start_marker.elapsed().as_millis(); + let tput = ((input.n_cores * input.pov_size) as u128) / duration * 1000; + println!("Benchmark completed in {:?}ms", duration); + println!("Throughput: {}KiB/s", tput / 1024); } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 3acf561e0dafd..bfc0b63e86d33 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -24,7 +24,7 @@ use sc_service::TaskManager; pub(crate) mod availability; -use availability::{EnvParams, TestEnvironment}; +use availability::{EnvParams, TestEnvironment, TestInput, TestState}; const LOG_TARGET: &str = "subsystem-bench"; /// Define the supported benchmarks targets @@ -45,6 +45,7 @@ struct BenchCli { fn new_runtime() -> tokio::runtime::Runtime { tokio::runtime::Builder::new_multi_thread() .thread_name("subsystem-bench") + .max_blocking_threads(32) .enable_all() .thread_stack_size(3 * 1024 * 1024) .build() @@ -63,8 +64,9 @@ impl BenchCli { let runtime = new_runtime(); let registry = Registry::new(); - let params = availability::bench_chunk_recovery_params(); - let mut env = TestEnvironment::new(runtime.handle().clone(), params, registry.clone()); + let state = TestState::new(TestInput::default()); + + let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); runtime.block_on(availability::bench_chunk_recovery(&mut env)); @@ -119,7 +121,7 @@ fn main() -> eyre::Result<()> { color_eyre::install()?; let _ = env_logger::builder() .is_test(true) - .filter(Some(LOG_TARGET), log::LevelFilter::Debug) + .filter(Some(LOG_TARGET), log::LevelFilter::Info) .try_init(); let cli: BenchCli = BenchCli::parse(); From 31b0351eaea643f181fe3216e03aae5d4da12ed6 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Nov 2023 15:37:33 +0200 Subject: [PATCH 04/45] add network emulation 
Signed-off-by: Andrei Sandu --- .../{availability.rs => availability/mod.rs} | 50 ++++- .../src/availability/network.rs | 212 ++++++++++++++++++ .../subsystem-bench/src/subsystem-bench.rs | 3 +- 3 files changed, 255 insertions(+), 10 deletions(-) rename polkadot/node/subsystem-bench/src/{availability.rs => availability/mod.rs} (92%) create mode 100644 polkadot/node/subsystem-bench/src/availability/network.rs diff --git a/polkadot/node/subsystem-bench/src/availability.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs similarity index 92% rename from polkadot/node/subsystem-bench/src/availability.rs rename to polkadot/node/subsystem-bench/src/availability/mod.rs index 72c8a736217d0..cdc2bf5ce6444 100644 --- a/polkadot/node/subsystem-bench/src/availability.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -16,6 +16,7 @@ use std::{ sync::Arc, + thread::sleep, time::{Duration, Instant}, }; @@ -67,6 +68,8 @@ use polkadot_primitives::{ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; +mod network; + type VirtualOverseer = TestSubsystemContextHandle; // Deterministic genesis hash for protocol names @@ -121,12 +124,13 @@ impl TestEnvironment { let to_subsystem = virtual_overseer.tx.clone(); let task_state = state.clone(); + let spawn_task_handle = task_manager.spawn_handle(); // We need to start a receiver to process messages from the subsystem. 
// This mocks an overseer and all dependent subsystems task_manager.spawn_handle().spawn_blocking( "test-environment", "test-environment", - async move { Self::env_task(virtual_overseer, task_state).await }, + async move { Self::env_task(virtual_overseer, task_state, spawn_task_handle).await }, ); TestEnvironment { runtime, task_manager, registry, to_subsystem, params, instance, state } @@ -139,15 +143,20 @@ impl TestEnvironment { self.state.input() } - async fn respond_to_send_request(state: &mut TestState, request: Requests) { + pub fn respond_to_send_request(state: &mut TestState, request: Requests) -> NetworkAction { match request { Requests::ChunkFetchingV1(outgoing_request) => { let validator_index = outgoing_request.payload.index.0 as usize; let chunk: ChunkResponse = state.chunks[validator_index].clone().into(); + let size = chunk.encoded_size(); + let future = async move { + let _ = outgoing_request + .pending_response + .send(Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode())); + } + .boxed(); - let _ = outgoing_request - .pending_response - .send(Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode())); + NetworkAction::new(validator_index, future, size) }, _ => panic!("received an unexpected request"), } @@ -158,7 +167,15 @@ impl TestEnvironment { async fn env_task( mut ctx: TestSubsystemContextHandle, mut state: TestState, + spawn_task_handle: SpawnTaskHandle, ) { + // Emulate `n_validators` each with 1MiB of bandwidth available. + let mut network = NetworkEmulator::new( + state.input().n_validators, + state.input().bandwidth, + spawn_task_handle, + ); + loop { futures::select! { message = ctx.recv().fuse() => { @@ -173,7 +190,9 @@ impl TestEnvironment { ) => { for request in requests { // TODO: add latency variance when answering requests. This should be an env parameter. 
- Self::respond_to_send_request(&mut state, request).await; + let action = Self::respond_to_send_request(&mut state, request); + // action.run().await; + network.submit_peer_action(action.index(), action); } }, AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx)) => { @@ -241,7 +260,7 @@ impl AvailabilityRecoverySubsystemInstance { runtime: tokio::runtime::Handle, ) -> (Self, TestSubsystemContextHandle) { let (context, virtual_overseer) = - make_buffered_subsystem_context(spawn_task_handle.clone(), 4096); + make_buffered_subsystem_context(spawn_task_handle.clone(), 4096 * 4); let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); let subsystem = AvailabilityRecoverySubsystem::with_chunks_only( @@ -279,6 +298,10 @@ macro_rules! delay { use sp_keyring::Sr25519Keyring; +use crate::availability::network::NetworkAction; + +use self::network::NetworkEmulator; + #[derive(Debug)] enum Has { No, @@ -479,7 +502,7 @@ fn derive_erasure_chunks_with_proofs_and_root( } /// The test input parameters -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct TestInput { pub n_validators: usize, pub n_cores: usize, @@ -487,11 +510,19 @@ pub struct TestInput { // This parameter is used to determine how many recoveries we batch in parallel // similarly to how in practice tranche0 assignments work. pub vrf_modulo_samples: usize, + // The amount of bandiwdht remote validators have. 
+ pub bandwidth: usize, } impl Default for TestInput { fn default() -> Self { - Self { n_validators: 300, n_cores: 50, pov_size: 5 * 1024 * 1024, vrf_modulo_samples: 6 } + Self { + n_validators: 10, + n_cores: 10, + pov_size: 5 * 1024 * 1024, + vrf_modulo_samples: 6, + bandwidth: 15 * 1024 * 1024, + } } } @@ -535,6 +566,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { } env.send_signal(OverseerSignal::Conclude).await; + delay!(5); let duration = start_marker.elapsed().as_millis(); let tput = ((input.n_cores * input.pov_size) as u128) / duration * 1000; println!("Benchmark completed in {:?}ms", duration); diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs new file mode 100644 index 0000000000000..268de5d828eb1 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -0,0 +1,212 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::*; +use futures::stream::FuturesOrdered; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; + +// An emulated node egress traffic rate_limiter. +#[derive(Debug)] +struct RateLimit { + // How often we refill credits in buckets + tick_rate: usize, + // Total ticks + total_ticks: usize, + // Max refill per tick + max_refill: usize, + // Available credit. 
We allow for bursts over 1/tick_rate of `cps` budget, but we + // account it by negative credit. + credits: isize, + // When last refilled. + last_refill: Instant, +} + +impl RateLimit { + // Create a new `RateLimit` from a `cps` (credits per second) budget and + // `tick_rate`. + pub fn new(tick_rate: usize, cps: usize) -> Self { + // Compute how much refill for each tick + let max_refill = cps / tick_rate; + RateLimit { + tick_rate, + total_ticks: 0, + max_refill, + // A fresh start + credits: max_refill as isize, + last_refill: Instant::now(), + } + } + + pub async fn refill(&mut self) { + // If this is called to early, we need to sleep until next tick. + let now = Instant::now(); + let next_tick_delta = + (self.last_refill + Duration::from_millis(1000 / self.tick_rate as u64)) - now; + + // Sleep until next tick. + if !next_tick_delta.is_zero() { + gum::trace!(target: LOG_TARGET, "need to sleep {}ms", next_tick_delta.as_millis()); + tokio::time::sleep(next_tick_delta).await; + } + + self.total_ticks += 1; + self.credits += self.max_refill as isize; + self.last_refill = Instant::now(); + } + + // Reap credits from the bucket. + // Blocks if credits budged goes negative during call. 
+ pub async fn reap(&mut self, amount: usize) { + self.credits -= amount as isize; + + if self.credits >= 0 { + return + } + + while self.credits < 0 { + gum::trace!(target: LOG_TARGET, "Before refill: {:?}", &self); + self.refill().await; + gum::trace!(target: LOG_TARGET, "After refill: {:?}", &self); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use polkadot_node_metrics::metered::CoarseDuration; + use std::time::Instant; + + use super::RateLimit; + + #[tokio::test] + async fn test_expected_rate() { + let tick_rate = 200; + let budget = 1_000_000; + // rate must not exceeed 100 credits per second + let mut rate_limiter = RateLimit::new(tick_rate, budget); + let mut total_sent = 0usize; + let start = Instant::now(); + + let mut reap_amount = 0; + while rate_limiter.total_ticks < tick_rate { + reap_amount += 1; + reap_amount = reap_amount % 100; + + rate_limiter.reap(reap_amount).await; + total_sent += reap_amount; + } + + let end = Instant::now(); + + // assert_eq!(end - start, Duration::from_secs(1)); + println!("duration: {}", (end - start).as_millis()); + + // Allow up to `budget/max_refill` error tolerance + let lower_bound = budget as u128 * ((end - start).as_millis() / 1000u128); + let upper_bound = budget as u128 * + ((end - start).as_millis() / 1000u128 + rate_limiter.max_refill as u128); + assert!(total_sent as u128 >= lower_bound); + assert!(total_sent as u128 <= upper_bound); + } +} +// A network peer emulator +struct PeerEmulator { + // The queue of requests waiting to be served by the emulator + actions_tx: UnboundedSender, +} + +impl PeerEmulator { + pub fn new(bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { + let (actions_tx, mut actions_rx) = tokio::sync::mpsc::unbounded_channel(); + + spawn_task_handle.spawn("peer-emulator", "test-environment", async move { + let mut rate_limiter = RateLimit::new(20, bandwidth); + loop { + let maybe_action: Option = actions_rx.recv().await; + if let Some(action) = maybe_action { + let 
size = action.size(); + rate_limiter.reap(size).await; + action.run().await; + } else { + break + } + } + }); + + Self { actions_tx } + } + + // Queue a send request from the emulated peer. + pub fn send(&mut self, action: NetworkAction) { + self.actions_tx.send(action).expect("peer emulator task lives"); + } +} + +pub type ActionFuture = std::pin::Pin + std::marker::Send>>; +// An network action to be completed by the emulator task. +pub struct NetworkAction { + // The function that performs the action + run: ActionFuture, + // The payload size that we simulate sending from a peer + size: usize, + // Peer index + index: usize, +} + +impl NetworkAction { + pub fn new(index: usize, run: ActionFuture, size: usize) -> Self { + Self { run, size, index } + } + pub fn size(&self) -> usize { + self.size + } + + pub async fn run(self) { + self.run.await; + } + + pub fn index(&self) -> usize { + self.index + } +} + +// Mocks the network bridge and an arbitrary number of connected peer nodes. +// Implements network latency, bandwidth and error. +pub struct NetworkEmulator { + // Number of peers connected on validation protocol + n_peers: usize, + // The maximum Rx/Tx bandwidth in bytes per second. 
+ bandwidth: usize, + // Per peer network emulation + peers: Vec, +} + +impl NetworkEmulator { + pub fn new(n_peers: usize, bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { + Self { + n_peers, + bandwidth, + peers: (0..n_peers) + .map(|index| PeerEmulator::new(bandwidth, spawn_task_handle.clone())) + .collect::>(), + } + } + + pub fn submit_peer_action(&mut self, index: usize, action: NetworkAction) { + let _ = self.peers[index].send(action); + } +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index bfc0b63e86d33..d58f0bccba9be 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -45,7 +45,6 @@ struct BenchCli { fn new_runtime() -> tokio::runtime::Runtime { tokio::runtime::Builder::new_multi_thread() .thread_name("subsystem-bench") - .max_blocking_threads(32) .enable_all() .thread_stack_size(3 * 1024 * 1024) .build() @@ -68,6 +67,8 @@ impl BenchCli { let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); + println!("{:?}", env.input()); + runtime.block_on(availability::bench_chunk_recovery(&mut env)); let metric_families = registry.gather(); From e4bb037260e1fee1f06dee1d5581f9d7763e2548 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Nov 2023 15:51:50 +0200 Subject: [PATCH 05/45] cleanup Signed-off-by: Andrei Sandu --- Cargo.lock | 1 - polkadot/node/subsystem-bench/Cargo.toml | 3 - polkadot/node/subsystem-bench/build.rs | 22 ----- .../subsystem-bench/src/availability/mod.rs | 90 +++++-------------- .../src/availability/network.rs | 5 +- .../subsystem-bench/src/subsystem-bench.rs | 10 +-- 6 files changed, 24 insertions(+), 107 deletions(-) delete mode 100644 polkadot/node/subsystem-bench/build.rs diff --git a/Cargo.lock b/Cargo.lock index d113fd7e43cd3..4645aeee6aab7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13022,7 +13022,6 @@ dependencies = [ "sp-core", 
"sp-keyring", "sp-keystore", - "substrate-build-script-utils", "tokio", "tracing-gum", ] diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 729749ab153bf..7408397f930c2 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -50,8 +50,5 @@ polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } # prometheus = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } prometheus = { version = "0.13.0", default-features = false } -[build-dependencies] -substrate-build-script-utils = { path = "../../../substrate/utils/build-script-utils" } - [features] default = [] diff --git a/polkadot/node/subsystem-bench/build.rs b/polkadot/node/subsystem-bench/build.rs deleted file mode 100644 index 84fe22e23ed6f..0000000000000 --- a/polkadot/node/subsystem-bench/build.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -fn main() { - substrate_build_script_utils::generate_cargo_keys(); - // For the node/worker version check, make sure we always rebuild the node and binary workers - // when the version changes. 
- substrate_build_script_utils::rerun_if_git_head_changed(); -} diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index cdc2bf5ce6444..c6e9dead09c1d 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -16,40 +16,34 @@ use std::{ sync::Arc, - thread::sleep, time::{Duration, Instant}, }; -use assert_matches::assert_matches; -use color_eyre::owo_colors::colors::xterm; use futures::{ channel::{mpsc, oneshot}, - executor, future, Future, FutureExt, SinkExt, + FutureExt, SinkExt, }; use futures_timer::Delay; use polkadot_node_metrics::metrics::Metrics; -use polkadot_availability_recovery::{AvailabilityRecoverySubsystem, Metrics as SubsystemMetrics}; +use polkadot_availability_recovery::AvailabilityRecoverySubsystem; use parity_scale_codec::Encode; use polkadot_node_network_protocol::request_response::{ - self as req_res, v1::ChunkResponse, IncomingRequest, Recipient, ReqProtocolNames, Requests, + self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, }; use prometheus::Registry; -use sc_network::{config::RequestResponseConfig, IfDisconnected, OutboundFailure, RequestFailure}; +use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; use polkadot_node_primitives::{BlockData, PoV, Proof}; use polkadot_node_subsystem::{ - errors::RecoveryError, - jaeger, messages::{ AllMessages, AvailabilityRecoveryMessage, AvailabilityStoreMessage, NetworkBridgeTxMessage, RuntimeApiMessage, RuntimeApiRequest, }, - overseer, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, SpawnedSubsystem, Subsystem, - SubsystemContext, SubsystemError, SubsystemResult, + ActiveLeavesUpdate, FromOrchestra, OverseerSignal, Subsystem, }; const LOG_TARGET: &str = "subsystem-bench::availability"; @@ -62,41 +56,30 @@ use 
polkadot_node_subsystem_test_helpers::{ }; use polkadot_node_subsystem_util::TimeoutExt; use polkadot_primitives::{ - AuthorityDiscoveryId, CandidateHash, CandidateReceipt, CoreIndex, GroupIndex, Hash, HeadData, - IndexedVec, PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, + AuthorityDiscoveryId, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, + PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, }; use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; mod network; -type VirtualOverseer = TestSubsystemContextHandle; - // Deterministic genesis hash for protocol names const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); struct AvailabilityRecoverySubsystemInstance { - protocol_config: RequestResponseConfig, -} - -pub struct EnvParams { - // The candidate we will recover in the benchmark. - candidate: CandidateReceipt, + _protocol_config: RequestResponseConfig, } // Implements a mockup of NetworkBridge and AvilabilityStore to support provide state for // `AvailabilityRecoverySubsystemInstance` pub struct TestEnvironment { - // A tokio runtime to use in the test - runtime: tokio::runtime::Handle, // A task manager that tracks task poll durations. task_manager: TaskManager, // The Prometheus metrics registry registry: Registry, // A test overseer. to_subsystem: mpsc::Sender>, - // Parameters - params: EnvParams, // Subsystem instance, currently keeps req/response protocol channel senders // for the whole duration of the test. instance: AvailabilityRecoverySubsystemInstance, @@ -110,15 +93,8 @@ impl TestEnvironment { // We use prometheus metrics to collect per job task poll time and subsystem metrics. 
pub fn new(runtime: tokio::runtime::Handle, state: TestState, registry: Registry) -> Self { let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(®istry)).unwrap(); - let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( - ®istry, - task_manager.spawn_handle(), - runtime.clone(), - ); - - // TODO: support parametrization of initial test state - // n_validator, n_cores. - let params = EnvParams { candidate: state.candidate() }; + let (instance, virtual_overseer) = + AvailabilityRecoverySubsystemInstance::new(®istry, task_manager.spawn_handle()); // Copy sender for later when we need to inject messages in to the subsystem. let to_subsystem = virtual_overseer.tx.clone(); @@ -133,12 +109,9 @@ impl TestEnvironment { async move { Self::env_task(virtual_overseer, task_state, spawn_task_handle).await }, ); - TestEnvironment { runtime, task_manager, registry, to_subsystem, params, instance, state } + TestEnvironment { task_manager, registry, to_subsystem, instance, state } } - pub fn params(&self) -> &EnvParams { - &self.params - } pub fn input(&self) -> &TestInput { self.state.input() } @@ -189,9 +162,7 @@ impl TestEnvironment { ) ) => { for request in requests { - // TODO: add latency variance when answering requests. This should be an env parameter. 
let action = Self::respond_to_send_request(&mut state, request); - // action.run().await; network.submit_peer_action(action.index(), action); } }, @@ -210,9 +181,9 @@ impl TestEnvironment { let _ = tx.send(Some(chunk_size)); } AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, + _relay_parent, RuntimeApiRequest::SessionInfo( - session_index, + _session_index, tx, ) )) => { @@ -257,7 +228,6 @@ impl AvailabilityRecoverySubsystemInstance { pub fn new( registry: &Registry, spawn_task_handle: SpawnTaskHandle, - runtime: tokio::runtime::Handle, ) -> (Self, TestSubsystemContextHandle) { let (context, virtual_overseer) = make_buffered_subsystem_context(spawn_task_handle.clone(), 4096 * 4); @@ -279,7 +249,7 @@ impl AvailabilityRecoverySubsystemInstance { subsystem_future, ); - (Self { protocol_config: req_cfg }, virtual_overseer) + (Self { _protocol_config: req_cfg }, virtual_overseer) } } @@ -302,24 +272,6 @@ use crate::availability::network::NetworkAction; use self::network::NetworkEmulator; -#[derive(Debug)] -enum Has { - No, - Yes, - NetworkError(RequestFailure), - /// Make request not return at all, instead the sender is returned from the function. - /// - /// Note, if you use `DoesNotReturn` you have to keep the returned senders alive, otherwise the - /// subsystem will receive a cancel event and the request actually does return. - DoesNotReturn, -} - -impl Has { - fn timeout() -> Self { - Has::NetworkError(RequestFailure::Network(OutboundFailure::Timeout)) - } -} - #[derive(Clone)] pub struct TestState { validators: Vec, @@ -401,7 +353,7 @@ impl TestState { pub fn new(input: TestInput) -> Self { let validators = (0..input.n_validators as u64) .into_iter() - .map(|v| Sr25519Keyring::Alice) + .map(|_v| Sr25519Keyring::Alice) .collect::>(); let mut candidate = dummy_candidate_receipt(dummy_hash()); @@ -418,8 +370,8 @@ impl TestState { relay_parent_storage_root: Default::default(), }; - /// A 5MB PoV. 
- let pov = PoV { block_data: BlockData(vec![42; 1024 * 1024 * 5]) }; + // A 5MB PoV. + let pov = PoV { block_data: BlockData(vec![42; input.pov_size]) }; let available_data = AvailableData { validation_data: persisted_validation_data.clone(), @@ -535,10 +487,8 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { )))) .await; - let mut candidate = env.params().candidate.clone(); - let start_marker = Instant::now(); - + let mut candidate = env.state.candidate(); let mut batch = Vec::new(); for candidate_num in 0..input.n_cores as u64 { let (tx, rx) = oneshot::channel(); @@ -556,13 +506,13 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { if batch.len() >= input.vrf_modulo_samples { for rx in std::mem::take(&mut batch) { - let available_data = rx.await.unwrap().unwrap(); + let _available_data = rx.await.unwrap().unwrap(); } } } for rx in std::mem::take(&mut batch) { - let available_data = rx.await.unwrap().unwrap(); + let _available_data = rx.await.unwrap().unwrap(); } env.send_signal(OverseerSignal::Conclude).await; diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs index 268de5d828eb1..1889e971cc1e1 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -15,8 +15,7 @@ // along with Polkadot. If not, see . use super::*; -use futures::stream::FuturesOrdered; -use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; +use tokio::sync::mpsc::UnboundedSender; // An emulated node egress traffic rate_limiter. 
#[derive(Debug)] @@ -201,7 +200,7 @@ impl NetworkEmulator { n_peers, bandwidth, peers: (0..n_peers) - .map(|index| PeerEmulator::new(bandwidth, spawn_task_handle.clone())) + .map(|_index| PeerEmulator::new(bandwidth, spawn_task_handle.clone())) .collect::>(), } } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index d58f0bccba9be..30a9dff02757f 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -20,11 +20,10 @@ use clap::Parser; use color_eyre::eyre; use prometheus::proto::LabelPair; -use sc_service::TaskManager; pub(crate) mod availability; -use availability::{EnvParams, TestEnvironment, TestInput, TestState}; +use availability::{TestEnvironment, TestInput, TestState}; const LOG_TARGET: &str = "subsystem-bench"; /// Define the supported benchmarks targets @@ -54,9 +53,7 @@ fn new_runtime() -> tokio::runtime::Runtime { impl BenchCli { /// Launch a malus node. fn launch(self) -> eyre::Result<()> { - use prometheus::{proto::MetricType, Counter, Encoder, Opts, Registry, TextEncoder}; - - let encoder = TextEncoder::new(); + use prometheus::{proto::MetricType, Registry, TextEncoder}; println!("Preparing {:?} benchmarks", self.target); @@ -72,7 +69,6 @@ impl BenchCli { runtime.block_on(availability::bench_chunk_recovery(&mut env)); let metric_families = registry.gather(); - let total_subsystem_cpu = 0; for familiy in metric_families { let metric_type = familiy.get_field_type(); @@ -82,8 +78,6 @@ impl BenchCli { MetricType::HISTOGRAM => { let h = metric.get_histogram(); - let mut inf_seen = false; - let labels = metric.get_label(); // Skip test env usage. 
let mut env_label = LabelPair::default(); From a69492481061bebdfbcfe9bb834a5c24a1160a33 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 7 Nov 2023 18:31:15 +0200 Subject: [PATCH 06/45] Add latency emulation Signed-off-by: Andrei Sandu --- .../src/availability/configuration.rs | 107 ++++++++++++++++++ .../subsystem-bench/src/availability/mod.rs | 89 +++++++-------- .../src/availability/network.rs | 50 ++++---- .../subsystem-bench/src/subsystem-bench.rs | 9 +- 4 files changed, 182 insertions(+), 73 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/availability/configuration.rs diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs new file mode 100644 index 0000000000000..14e8f55128d96 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -0,0 +1,107 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::*; + +/// Peer response latency configuration. +#[derive(Clone, Debug)] +pub struct PeerLatency { + /// Min latency for `NetworkAction` completion. + pub min_latency: Duration, + /// Max latency or `NetworkAction` completion. 
+ pub max_latency: Duration, +} + +/// The test input parameters +#[derive(Clone, Debug)] +pub struct TestConfiguration { + /// Number of validators + pub n_validators: usize, + /// Number of cores + pub n_cores: usize, + /// The PoV size + pub pov_size: usize, + /// This parameter is used to determine how many recoveries we batch in parallel + /// similarly to how in practice tranche0 assignments work. + pub vrf_modulo_samples: usize, + /// The amount of bandiwdht remote validators have. + pub bandwidth: usize, + /// Optional peer emulation latency + pub latency: Option, +} + +impl Default for TestConfiguration { + fn default() -> Self { + Self { + n_validators: 10, + n_cores: 10, + pov_size: 5 * 1024 * 1024, + vrf_modulo_samples: 6, + bandwidth: 15 * 1024 * 1024, + latency: None, + } + } +} + +impl TestConfiguration { + /// An unconstrained standard configuration matching Polkadot/Kusama + pub fn unconstrained_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + Self { + n_validators: 300, + n_cores: 60, + pov_size, + vrf_modulo_samples: 6, + // HW specs node bandwidth + bandwidth: 60 * 1024 * 1024, + // No latency + latency: None, + } + } + + /// Polkadot/Kusama configuration with typical latency constraints. + pub fn healthy_network_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + Self { + n_validators: 300, + n_cores: 60, + pov_size, + vrf_modulo_samples: 6, + // HW specs node bandwidth + bandwidth: 60 * 1024 * 1024, + latency: Some(PeerLatency { + min_latency: Duration::from_millis(1), + max_latency: Duration::from_millis(50), + }), + } + } + + /// Polkadot/Kusama configuration with degraded due to latencies. + /// TODO: implement errors. 
+ pub fn degraded_network_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + Self { + n_validators: 300, + n_cores: 60, + pov_size, + vrf_modulo_samples: 6, + // HW specs node bandwidth + bandwidth: 60 * 1024 * 1024, + // A range of latencies to expect in a degraded network + latency: Some(PeerLatency { + min_latency: Duration::from_millis(1), + max_latency: Duration::from_millis(1000), + }), + } + } +} diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index c6e9dead09c1d..6c0c41c86c0f8 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -32,6 +32,7 @@ use parity_scale_codec::Encode; use polkadot_node_network_protocol::request_response::{ self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, }; +use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; use prometheus::Registry; use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; @@ -62,8 +63,11 @@ use polkadot_primitives::{ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; +mod configuration; mod network; +pub use configuration::TestConfiguration; + // Deterministic genesis hash for protocol names const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); @@ -112,8 +116,20 @@ impl TestEnvironment { TestEnvironment { task_manager, registry, to_subsystem, instance, state } } - pub fn input(&self) -> &TestInput { - self.state.input() + pub fn config(&self) -> &TestConfiguration { + self.state.config() + } + + /// Produce a randomized duration between `min` and `max`. 
+ fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option { + if let Some(peer_latency) = maybe_peer_latency { + Some( + Uniform::from(peer_latency.min_latency..=peer_latency.max_latency) + .sample(&mut thread_rng()), + ) + } else { + None + } } pub fn respond_to_send_request(state: &mut TestState, request: Requests) -> NetworkAction { @@ -129,7 +145,13 @@ impl TestEnvironment { } .boxed(); - NetworkAction::new(validator_index, future, size) + NetworkAction::new( + validator_index, + future, + size, + // Generate a random latency based on configuration. + Self::random_latency(state.config().latency.as_ref()), + ) }, _ => panic!("received an unexpected request"), } @@ -144,8 +166,8 @@ impl TestEnvironment { ) { // Emulate `n_validators` each with 1MiB of bandwidth available. let mut network = NetworkEmulator::new( - state.input().n_validators, - state.input().bandwidth, + state.config().n_validators, + state.config().bandwidth, spawn_task_handle, ); @@ -270,7 +292,7 @@ use sp_keyring::Sr25519Keyring; use crate::availability::network::NetworkAction; -use self::network::NetworkEmulator; +use self::{configuration::PeerLatency, network::NetworkEmulator}; #[derive(Clone)] pub struct TestState { @@ -287,26 +309,18 @@ pub struct TestState { available_data: AvailableData, chunks: Vec, invalid_chunks: Vec, - input: TestInput, + config: TestConfiguration, } impl TestState { - fn input(&self) -> &TestInput { - &self.input + fn config(&self) -> &TestConfiguration { + &self.config } fn candidate(&self) -> CandidateReceipt { self.candidate.clone() } - fn threshold(&self) -> usize { - recovery_threshold(self.validators.len()).unwrap() - } - - fn impossibility_threshold(&self) -> usize { - self.validators.len() - self.threshold() + 1 - } - async fn respond_to_available_data_query(&self, tx: oneshot::Sender>) { let _ = tx.send(Some(self.available_data.clone())); } @@ -350,8 +364,8 @@ impl TestState { let _ = tx.send(v); } - pub fn new(input: TestInput) -> Self { - let 
validators = (0..input.n_validators as u64) + pub fn new(config: TestConfiguration) -> Self { + let validators = (0..config.n_validators as u64) .into_iter() .map(|_v| Sr25519Keyring::Alice) .collect::>(); @@ -371,7 +385,7 @@ impl TestState { }; // A 5MB PoV. - let pov = PoV { block_data: BlockData(vec![42; input.pov_size]) }; + let pov = PoV { block_data: BlockData(vec![42; config.pov_size]) }; let available_data = AvailableData { validation_data: persisted_validation_data.clone(), @@ -413,7 +427,7 @@ impl TestState { available_data, chunks, invalid_chunks, - input, + config, } } } @@ -453,33 +467,8 @@ fn derive_erasure_chunks_with_proofs_and_root( (erasure_chunks, root) } -/// The test input parameters -#[derive(Clone, Debug)] -pub struct TestInput { - pub n_validators: usize, - pub n_cores: usize, - pub pov_size: usize, - // This parameter is used to determine how many recoveries we batch in parallel - // similarly to how in practice tranche0 assignments work. - pub vrf_modulo_samples: usize, - // The amount of bandiwdht remote validators have. 
- pub bandwidth: usize, -} - -impl Default for TestInput { - fn default() -> Self { - Self { - n_validators: 10, - n_cores: 10, - pov_size: 5 * 1024 * 1024, - vrf_modulo_samples: 6, - bandwidth: 15 * 1024 * 1024, - } - } -} - pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { - let input = env.input().clone(); + let config = env.config().clone(); env.send_signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(new_leaf( Hash::repeat_byte(1), @@ -490,7 +479,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let start_marker = Instant::now(); let mut candidate = env.state.candidate(); let mut batch = Vec::new(); - for candidate_num in 0..input.n_cores as u64 { + for candidate_num in 0..config.n_cores as u64 { let (tx, rx) = oneshot::channel(); batch.push(rx); @@ -504,7 +493,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { )) .await; - if batch.len() >= input.vrf_modulo_samples { + if batch.len() >= config.vrf_modulo_samples { for rx in std::mem::take(&mut batch) { let _available_data = rx.await.unwrap().unwrap(); } @@ -518,7 +507,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { env.send_signal(OverseerSignal::Conclude).await; delay!(5); let duration = start_marker.elapsed().as_millis(); - let tput = ((input.n_cores * input.pov_size) as u128) / duration * 1000; + let tput = ((config.n_cores * config.pov_size) as u128) / duration * 1000; println!("Benchmark completed in {:?}ms", duration); println!("Throughput: {}KiB/s", tput / 1024); } diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs index 1889e971cc1e1..d6fc175c859ba 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -122,6 +122,7 @@ mod tests { assert!(total_sent as u128 <= upper_bound); } } + // A network peer emulator struct PeerEmulator { // The queue of requests waiting 
to be served by the emulator @@ -132,19 +133,32 @@ impl PeerEmulator { pub fn new(bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { let (actions_tx, mut actions_rx) = tokio::sync::mpsc::unbounded_channel(); - spawn_task_handle.spawn("peer-emulator", "test-environment", async move { - let mut rate_limiter = RateLimit::new(20, bandwidth); - loop { - let maybe_action: Option = actions_rx.recv().await; - if let Some(action) = maybe_action { - let size = action.size(); - rate_limiter.reap(size).await; - action.run().await; - } else { - break + spawn_task_handle + .clone() + .spawn("peer-emulator", "test-environment", async move { + let mut rate_limiter = RateLimit::new(20, bandwidth); + loop { + let maybe_action: Option = actions_rx.recv().await; + if let Some(action) = maybe_action { + let size = action.size(); + rate_limiter.reap(size).await; + if let Some(latency) = action.latency { + spawn_task_handle.spawn( + "peer-emulator-latency", + "test-environment", + async move { + tokio::time::sleep(latency).await; + action.run().await; + }, + ) + } else { + action.run().await; + } + } else { + break + } } - } - }); + }); Self { actions_tx } } @@ -164,11 +178,13 @@ pub struct NetworkAction { size: usize, // Peer index index: usize, + // The amount of time to delay the polling `run` + latency: Option, } impl NetworkAction { - pub fn new(index: usize, run: ActionFuture, size: usize) -> Self { - Self { run, size, index } + pub fn new(index: usize, run: ActionFuture, size: usize, latency: Option) -> Self { + Self { run, size, index, latency } } pub fn size(&self) -> usize { self.size @@ -186,10 +202,6 @@ impl NetworkAction { // Mocks the network bridge and an arbitrary number of connected peer nodes. // Implements network latency, bandwidth and error. pub struct NetworkEmulator { - // Number of peers connected on validation protocol - n_peers: usize, - // The maximum Rx/Tx bandwidth in bytes per second. 
- bandwidth: usize, // Per peer network emulation peers: Vec, } @@ -197,8 +209,6 @@ pub struct NetworkEmulator { impl NetworkEmulator { pub fn new(n_peers: usize, bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { Self { - n_peers, - bandwidth, peers: (0..n_peers) .map(|_index| PeerEmulator::new(bandwidth, spawn_task_handle.clone())) .collect::>(), diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 30a9dff02757f..52c5227267992 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -23,7 +23,7 @@ use prometheus::proto::LabelPair; pub(crate) mod availability; -use availability::{TestEnvironment, TestInput, TestState}; +use availability::{TestConfiguration, TestEnvironment, TestState}; const LOG_TARGET: &str = "subsystem-bench"; /// Define the supported benchmarks targets @@ -60,11 +60,14 @@ impl BenchCli { let runtime = new_runtime(); let registry = Registry::new(); - let state = TestState::new(TestInput::default()); + let test_config = + TestConfiguration::degraded_network_300_validators_60_cores(1024 * 1024); + + let state = TestState::new(test_config); let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); - println!("{:?}", env.input()); + println!("{:?}", env.config()); runtime.block_on(availability::bench_chunk_recovery(&mut env)); From 7ca4dbadf6d24bd94abb0a06bf25d3aaacea8e9f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 8 Nov 2023 13:15:02 +0200 Subject: [PATCH 07/45] support multiple pov sizes Signed-off-by: Andrei Sandu --- .../src/availability/configuration.rs | 44 ++-- .../subsystem-bench/src/availability/mod.rs | 222 ++++++++++++------ .../subsystem-bench/src/subsystem-bench.rs | 6 +- 3 files changed, 176 insertions(+), 96 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs 
b/polkadot/node/subsystem-bench/src/availability/configuration.rs index 14e8f55128d96..3df496ad04280 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -28,80 +28,92 @@ pub struct PeerLatency { /// The test input parameters #[derive(Clone, Debug)] pub struct TestConfiguration { + /// Configuration for the `availability-recovery` subsystem. + pub use_fast_path: bool, /// Number of validators pub n_validators: usize, /// Number of cores pub n_cores: usize, /// The PoV size - pub pov_size: usize, + pub pov_sizes: Vec, /// This parameter is used to determine how many recoveries we batch in parallel - /// similarly to how in practice tranche0 assignments work. - pub vrf_modulo_samples: usize, + /// to simulate tranche0 recoveries. + pub max_parallel_recoveries: usize, /// The amount of bandiwdht remote validators have. pub bandwidth: usize, /// Optional peer emulation latency pub latency: Option, + /// Error probability + pub error: usize, } impl Default for TestConfiguration { fn default() -> Self { Self { + use_fast_path: false, n_validators: 10, n_cores: 10, - pov_size: 5 * 1024 * 1024, - vrf_modulo_samples: 6, - bandwidth: 15 * 1024 * 1024, + pov_sizes: vec![5 * 1024 * 1024], + max_parallel_recoveries: 6, + bandwidth: 60 * 1024 * 1024, latency: None, + error: 0, } } } impl TestConfiguration { /// An unconstrained standard configuration matching Polkadot/Kusama - pub fn unconstrained_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + pub fn unconstrained_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { Self { + use_fast_path: false, n_validators: 300, n_cores: 60, - pov_size, - vrf_modulo_samples: 6, + pov_sizes, + max_parallel_recoveries: 20, // HW specs node bandwidth bandwidth: 60 * 1024 * 1024, // No latency latency: None, + error: 0, } } /// Polkadot/Kusama configuration with typical latency constraints. 
- pub fn healthy_network_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + pub fn healthy_network_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { Self { + use_fast_path: true, n_validators: 300, n_cores: 60, - pov_size, - vrf_modulo_samples: 6, + pov_sizes, + max_parallel_recoveries: 6, // HW specs node bandwidth bandwidth: 60 * 1024 * 1024, latency: Some(PeerLatency { min_latency: Duration::from_millis(1), max_latency: Duration::from_millis(50), }), + error: 5, } } /// Polkadot/Kusama configuration with degraded due to latencies. /// TODO: implement errors. - pub fn degraded_network_300_validators_60_cores(pov_size: usize) -> TestConfiguration { + pub fn degraded_network_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { Self { + use_fast_path: true, n_validators: 300, n_cores: 60, - pov_size, - vrf_modulo_samples: 6, + pov_sizes, + max_parallel_recoveries: 6, // HW specs node bandwidth bandwidth: 60 * 1024 * 1024, // A range of latencies to expect in a degraded network latency: Some(PeerLatency { min_latency: Duration::from_millis(1), - max_latency: Duration::from_millis(1000), + max_latency: Duration::from_millis(500), }), + error: 30, } } } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 6c0c41c86c0f8..dcfeb42877803 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -15,6 +15,7 @@ // along with Polkadot. If not, see . 
use std::{ + collections::HashMap, sync::Arc, time::{Duration, Instant}, }; @@ -23,7 +24,6 @@ use futures::{ channel::{mpsc, oneshot}, FutureExt, SinkExt, }; -use futures_timer::Delay; use polkadot_node_metrics::metrics::Metrics; use polkadot_availability_recovery::AvailabilityRecoverySubsystem; @@ -49,7 +49,6 @@ use polkadot_node_subsystem::{ const LOG_TARGET: &str = "subsystem-bench::availability"; -use polkadot_erasure_coding::recovery_threshold; use polkadot_node_primitives::{AvailableData, ErasureChunk}; use polkadot_node_subsystem_test_helpers::{ @@ -57,7 +56,7 @@ use polkadot_node_subsystem_test_helpers::{ }; use polkadot_node_subsystem_util::TimeoutExt; use polkadot_primitives::{ - AuthorityDiscoveryId, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, + AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, }; use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; @@ -97,8 +96,11 @@ impl TestEnvironment { // We use prometheus metrics to collect per job task poll time and subsystem metrics. pub fn new(runtime: tokio::runtime::Handle, state: TestState, registry: Registry) -> Self { let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(®istry)).unwrap(); - let (instance, virtual_overseer) = - AvailabilityRecoverySubsystemInstance::new(®istry, task_manager.spawn_handle()); + let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( + ®istry, + task_manager.spawn_handle(), + state.config().use_fast_path, + ); // Copy sender for later when we need to inject messages in to the subsystem. let to_subsystem = virtual_overseer.tx.clone(); @@ -132,16 +134,60 @@ impl TestEnvironment { } } + /// Generate a random error based on `probability`. + /// `probability` should be a number between 0 and 100. 
+ fn random_error(probability: usize) -> bool { + Uniform::from(0..=99).sample(&mut thread_rng()) < probability + } + pub fn respond_to_send_request(state: &mut TestState, request: Requests) -> NetworkAction { match request { Requests::ChunkFetchingV1(outgoing_request) => { let validator_index = outgoing_request.payload.index.0 as usize; - let chunk: ChunkResponse = state.chunks[validator_index].clone().into(); + let chunk: ChunkResponse = + state.chunks.get(&outgoing_request.payload.candidate_hash).unwrap() + [validator_index] + .clone() + .into(); let size = chunk.encoded_size(); + + let response = if Self::random_error(state.config().error) { + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) + }; + let future = async move { - let _ = outgoing_request - .pending_response - .send(Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode())); + let _ = outgoing_request.pending_response.send(response); + } + .boxed(); + + NetworkAction::new( + validator_index, + future, + size, + // Generate a random latency based on configuration. 
+ Self::random_latency(state.config().latency.as_ref()), + ) + }, + Requests::AvailableDataFetchingV1(outgoing_request) => { + // TODO: do better, by implementing diff authority ids and mapping network actions + // to authority id, + let validator_index = + Uniform::from(0..state.config().n_validators).sample(&mut thread_rng()); + let available_data = + state.candidates.get(&outgoing_request.payload.candidate_hash).unwrap().clone(); + let size = available_data.encoded_size(); + + let response = if Self::random_error(state.config().error) { + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::AvailableDataFetchingResponse::from(Some(available_data)) + .encode()) + }; + + let future = async move { + let _ = outgoing_request.pending_response.send(response); } .boxed(); @@ -192,14 +238,14 @@ impl TestEnvironment { // TODO: Simulate av store load by delaying the response. state.respond_none_to_available_data_query(tx).await; }, - AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAllChunks(_candidate_hash, tx)) => { + AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAllChunks(candidate_hash, tx)) => { // Test env: We always have our own chunk. 
+ state.respond_to_query_all_request(candidate_hash, |index| index == state.validator_index.0 as usize, tx).await; }, AllMessages::AvailabilityStore( - AvailabilityStoreMessage::QueryChunkSize(_, tx) + AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) ) => { - let chunk_size = state.chunks[0].encoded_size(); + let chunk_size = state.chunks.get(&candidate_hash).unwrap()[0].encoded_size(); let _ = tx.send(Some(chunk_size)); } AllMessages::RuntimeApi(RuntimeApiMessage::Request( @@ -250,15 +296,24 @@ impl AvailabilityRecoverySubsystemInstance { pub fn new( registry: &Registry, spawn_task_handle: SpawnTaskHandle, + use_fast_path: bool, ) -> (Self, TestSubsystemContextHandle) { let (context, virtual_overseer) = make_buffered_subsystem_context(spawn_task_handle.clone(), 4096 * 4); let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); - let subsystem = AvailabilityRecoverySubsystem::with_chunks_only( - collation_req_receiver, - Metrics::try_register(&registry).unwrap(), - ); + + let subsystem = if use_fast_path { + AvailabilityRecoverySubsystem::with_fast_path( + collation_req_receiver, + Metrics::try_register(&registry).unwrap(), + ) + } else { + AvailabilityRecoverySubsystem::with_chunks_only( + collation_req_receiver, + Metrics::try_register(&registry).unwrap(), + ) + }; let spawned_subsystem = subsystem.start(context); let subsystem_future = async move { @@ -282,12 +337,6 @@ const TIMEOUT: Duration = Duration::from_millis(300); // This should eventually be a test parameter. const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); -macro_rules!
delay { - ($delay:expr) => { - Delay::new(Duration::from_millis($delay)).await; - }; -} - use sp_keyring::Sr25519Keyring; use crate::availability::network::NetworkAction; @@ -301,14 +350,15 @@ pub struct TestState { validator_authority_id: Vec, // The test node validator index. validator_index: ValidatorIndex, - candidate: CandidateReceipt, + // Per core candidates receipts. + candidate_receipts: Vec, session_index: SessionIndex, persisted_validation_data: PersistedValidationData, + /// A per size pov mapping to available data. + candidates: HashMap, - available_data: AvailableData, - chunks: Vec, - invalid_chunks: Vec, + chunks: HashMap>, config: TestConfiguration, } @@ -317,12 +367,8 @@ impl TestState { &self.config } - fn candidate(&self) -> CandidateReceipt { - self.candidate.clone() - } - - async fn respond_to_available_data_query(&self, tx: oneshot::Sender>) { - let _ = tx.send(Some(self.available_data.clone())); + fn candidate(&self, candidate_index: usize) -> CandidateReceipt { + self.candidate_receipts.get(candidate_index).unwrap().clone() } async fn respond_none_to_available_data_query( @@ -337,9 +383,7 @@ impl TestState { validators: self.validator_public.clone(), discovery_keys: self.validator_authority_id.clone(), // all validators in the same group. 
- validator_groups: IndexedVec::>::from(vec![(0..self - .validators - .len()) + validator_groups: IndexedVec::>::from(vec![(0..5) .map(|i| ValidatorIndex(i as _)) .collect()]), assignment_keys: vec![], @@ -356,10 +400,18 @@ impl TestState { } async fn respond_to_query_all_request( &self, + candidate_hash: CandidateHash, send_chunk: impl Fn(usize) -> bool, tx: oneshot::Sender>, ) { - let v = self.chunks.iter().filter(|c| send_chunk(c.index.0 as usize)).cloned().collect(); + let v = self + .chunks + .get(&candidate_hash) + .unwrap() + .iter() + .filter(|c| send_chunk(c.index.0 as usize)) + .cloned() + .collect(); let _ = tx.send(v); } @@ -370,13 +422,15 @@ impl TestState { .map(|_v| Sr25519Keyring::Alice) .collect::>(); - let mut candidate = dummy_candidate_receipt(dummy_hash()); let validator_public = validator_pubkeys(&validators); let validator_authority_id = validator_authority_id(&validators); let validator_index = ValidatorIndex(0); - + let mut pov_size_to_candidate = HashMap::new(); + let mut chunks = HashMap::new(); + let mut candidates = HashMap::new(); let session_index = 10; + // we use it for all candidates. let persisted_validation_data = PersistedValidationData { parent_head: HeadData(vec![7, 8, 9]), relay_parent_number: Default::default(), @@ -384,49 +438,57 @@ impl TestState { relay_parent_storage_root: Default::default(), }; - // A 5MB PoV. - let pov = PoV { block_data: BlockData(vec![42; config.pov_size]) }; + // Create initial candidate receipts + let mut candidate_receipts = config + .pov_sizes + .iter() + .map(|_index| dummy_candidate_receipt(dummy_hash())) + .collect::>(); - let available_data = AvailableData { - validation_data: persisted_validation_data.clone(), - pov: Arc::new(pov), - }; + for (index, pov_size) in config.pov_sizes.iter().enumerate() { + let mut candidate = &mut candidate_receipts[index]; + // a hack to make candidate unique. 
+ candidate.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); - let (chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( - validators.len(), - &available_data, - |_, _| {}, - ); - // Mess around: - let invalid_chunks = chunks - .iter() - .cloned() - .map(|mut chunk| { - if chunk.chunk.len() >= 2 && chunk.chunk[0] != chunk.chunk[1] { - chunk.chunk[0] = chunk.chunk[1]; - } else if chunk.chunk.len() >= 1 { - chunk.chunk[0] = !chunk.chunk[0]; - } else { - chunk.proof = Proof::dummy_proof(); - } - chunk - }) - .collect(); - debug_assert_ne!(chunks, invalid_chunks); + // We reuse candidates of same size, to speed up the test startup. + let (erasure_root, available_data, new_chunks) = + pov_size_to_candidate.entry(pov_size).or_insert_with(|| { + let pov = PoV { block_data: BlockData(vec![index as u8; *pov_size]) }; + + let available_data = AvailableData { + validation_data: persisted_validation_data.clone(), + pov: Arc::new(pov), + }; + + let (new_chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( + validators.len(), + &available_data, + |_, _| {}, + ); + + candidate.descriptor.erasure_root = erasure_root; - candidate.descriptor.erasure_root = erasure_root; + chunks.insert(candidate.hash(), new_chunks.clone()); + candidates.insert(candidate.hash(), available_data.clone()); + + (erasure_root, available_data, new_chunks) + }); + + candidate.descriptor.erasure_root = *erasure_root; + candidates.insert(candidate.hash(), available_data.clone()); + chunks.insert(candidate.hash(), new_chunks.clone()); + } Self { validators, validator_public, validator_authority_id, validator_index, - candidate, + candidate_receipts, session_index, persisted_validation_data, - available_data, + candidates, chunks, - invalid_chunks, config, } } @@ -467,6 +529,9 @@ fn derive_erasure_chunks_with_proofs_and_root( (erasure_chunks, root) } +pub async fn bench_with_chunks_if_pov_large(env: &mut TestEnvironment) {} + +pub async fn bench_inner(env: &mut 
TestEnvironment) {} pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let config = env.config().clone(); @@ -477,14 +542,14 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { .await; let start_marker = Instant::now(); - let mut candidate = env.state.candidate(); let mut batch = Vec::new(); + let mut availability_bytes = 0; for candidate_num in 0..config.n_cores as u64 { + let candidate = env.state.candidate_receipts[candidate_num as usize].clone(); + let (tx, rx) = oneshot::channel(); batch.push(rx); - candidate.descriptor.relay_parent = Hash::from_low_u64_be(candidate_num); - env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( candidate.clone(), 1, @@ -493,21 +558,22 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { )) .await; - if batch.len() >= config.vrf_modulo_samples { + if batch.len() >= config.max_parallel_recoveries { for rx in std::mem::take(&mut batch) { - let _available_data = rx.await.unwrap().unwrap(); + let available_data = rx.await.unwrap().unwrap(); + availability_bytes += available_data.encoded_size(); } } } for rx in std::mem::take(&mut batch) { - let _available_data = rx.await.unwrap().unwrap(); + let available_data = rx.await.unwrap().unwrap(); + availability_bytes += available_data.encoded_size(); } env.send_signal(OverseerSignal::Conclude).await; - delay!(5); let duration = start_marker.elapsed().as_millis(); - let tput = ((config.n_cores * config.pov_size) as u128) / duration * 1000; + let tput = (availability_bytes as u128) / duration * 1000; println!("Benchmark completed in {:?}ms", duration); println!("Throughput: {}KiB/s", tput / 1024); } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 52c5227267992..2a5edf7cf197d 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -60,8 +60,10 @@ impl BenchCli { let runtime = new_runtime(); 
let registry = Registry::new(); - let test_config = - TestConfiguration::degraded_network_300_validators_60_cores(1024 * 1024); + let mut pov_sizes = Vec::new(); + pov_sizes.append(&mut vec![1024 * 1024 * 5; 60]); + + let test_config = TestConfiguration::unconstrained_300_validators_60_cores(pov_sizes); let state = TestState::new(test_config); From 0430b5b909b84abc5bb4c078924ce46e944dc18a Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 8 Nov 2023 15:07:22 +0200 Subject: [PATCH 08/45] new metric in recovery and more testing Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + .../availability-recovery/src/metrics.rs | 17 +++++++++++--- .../network/availability-recovery/src/task.rs | 3 ++- polkadot/node/subsystem-bench/Cargo.toml | 2 +- .../src/availability/configuration.rs | 22 +++++++++++++++---- .../subsystem-bench/src/availability/mod.rs | 10 +++++++-- .../subsystem-bench/src/subsystem-bench.rs | 14 ++++++++++-- .../node/subsystem-test-helpers/src/lib.rs | 12 ++++++---- 8 files changed, 64 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4645aeee6aab7..5b54745cdcc39 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13022,6 +13022,7 @@ dependencies = [ "sp-core", "sp-keyring", "sp-keystore", + "substrate-prometheus-endpoint", "tokio", "tracing-gum", ] diff --git a/polkadot/node/network/availability-recovery/src/metrics.rs b/polkadot/node/network/availability-recovery/src/metrics.rs index aa72167395076..d82a8f9ae5faf 100644 --- a/polkadot/node/network/availability-recovery/src/metrics.rs +++ b/polkadot/node/network/availability-recovery/src/metrics.rs @@ -29,7 +29,10 @@ struct MetricsInner { /// /// Gets incremented on each sent chunk requests. chunk_requests_issued: Counter, - + /// Total number of bytes recovered + /// + /// Gets incremented on each succesful recovery + recovered_bytes_total: Counter, /// A counter for finished chunk requests. /// /// Split by result: @@ -133,9 +136,10 @@ impl Metrics { } /// A full recovery succeeded. 
- pub fn on_recovery_succeeded(&self) { + pub fn on_recovery_succeeded(&self, bytes: usize) { if let Some(metrics) = &self.0 { - metrics.full_recoveries_finished.with_label_values(&["success"]).inc() + metrics.full_recoveries_finished.with_label_values(&["success"]).inc(); + metrics.recovered_bytes_total.inc_by(bytes as u64) } } @@ -171,6 +175,13 @@ impl metrics::Metrics for Metrics { )?, registry, )?, + recovered_bytes_total: prometheus::register( + Counter::new( + "polkadot_parachain_availability_recovery_bytes_total", + "Total number of bytes recovered", + )?, + registry, + )?, chunk_requests_finished: prometheus::register( CounterVec::new( Opts::new( diff --git a/polkadot/node/network/availability-recovery/src/task.rs b/polkadot/node/network/availability-recovery/src/task.rs index d5bc2da84944a..9ed911f3b5a70 100644 --- a/polkadot/node/network/availability-recovery/src/task.rs +++ b/polkadot/node/network/availability-recovery/src/task.rs @@ -23,6 +23,7 @@ use crate::{ LOG_TARGET, }; use futures::{channel::oneshot, SinkExt}; +use parity_scale_codec::Encode; #[cfg(not(test))] use polkadot_node_network_protocol::request_response::CHUNK_REQUEST_TIMEOUT; use polkadot_node_network_protocol::request_response::{ @@ -426,7 +427,7 @@ where return Err(err) }, Ok(data) => { - self.params.metrics.on_recovery_succeeded(); + self.params.metrics.on_recovery_succeeded(data.encoded_size()); return Ok(data) }, } diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 7408397f930c2..2de978234a63c 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -47,7 +47,7 @@ sc-service = { path = "../../../substrate/client/service" } polkadot-node-metrics = { path = "../metrics" } polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } -# prometheus = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } +prometheus_endpoint = { package 
= "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } prometheus = { version = "0.13.0", default-features = false } [features] diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index 3df496ad04280..9a93bf12e114f 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -65,12 +65,26 @@ impl Default for TestConfiguration { impl TestConfiguration { /// An unconstrained standard configuration matching Polkadot/Kusama pub fn unconstrained_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { + Self { + use_fast_path: false, + n_validators: 300, + n_cores: 100, + pov_sizes, + max_parallel_recoveries: 100, + // HW specs node bandwidth + bandwidth: 60 * 1024 * 1024, + // No latency + latency: None, + error: 0, + } + } + pub fn unconstrained_1000_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { Self { use_fast_path: false, n_validators: 300, n_cores: 60, pov_sizes, - max_parallel_recoveries: 20, + max_parallel_recoveries: 30, // HW specs node bandwidth bandwidth: 60 * 1024 * 1024, // No latency @@ -101,11 +115,11 @@ impl TestConfiguration { /// TODO: implement errors. 
pub fn degraded_network_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { Self { - use_fast_path: true, + use_fast_path: false, n_validators: 300, - n_cores: 60, + n_cores: 100, pov_sizes, - max_parallel_recoveries: 6, + max_parallel_recoveries: 20, // HW specs node bandwidth bandwidth: 60 * 1024 * 1024, // A range of latencies to expect in a degraded network diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index dcfeb42877803..8f8eca104385b 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -298,8 +298,11 @@ impl AvailabilityRecoverySubsystemInstance { spawn_task_handle: SpawnTaskHandle, use_fast_path: bool, ) -> (Self, TestSubsystemContextHandle) { - let (context, virtual_overseer) = - make_buffered_subsystem_context(spawn_task_handle.clone(), 4096 * 4); + let (context, virtual_overseer) = make_buffered_subsystem_context( + spawn_task_handle.clone(), + 4096 * 4, + "availability-recovery", + ); let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); @@ -558,6 +561,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { )) .await; + // TODO: select between futures unordered of rx await and timer to send next request. 
if batch.len() >= config.max_parallel_recoveries { for rx in std::mem::take(&mut batch) { let available_data = rx.await.unwrap().unwrap(); @@ -576,4 +580,6 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let tput = (availability_bytes as u128) / duration * 1000; println!("Benchmark completed in {:?}ms", duration); println!("Throughput: {}KiB/s", tput / 1024); + + tokio::time::sleep(Duration::from_secs(1)).await; } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 2a5edf7cf197d..4dc0936291b1c 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -20,6 +20,7 @@ use clap::Parser; use color_eyre::eyre; use prometheus::proto::LabelPair; +use std::net::{Ipv4Addr, SocketAddr}; pub(crate) mod availability; @@ -59,16 +60,25 @@ impl BenchCli { let runtime = new_runtime(); let registry = Registry::new(); + let registry_clone = registry.clone(); let mut pov_sizes = Vec::new(); - pov_sizes.append(&mut vec![1024 * 1024 * 5; 60]); + pov_sizes.append(&mut vec![1024 * 1024; 100]); - let test_config = TestConfiguration::unconstrained_300_validators_60_cores(pov_sizes); + let test_config = TestConfiguration::unconstrained_1000_validators_60_cores(pov_sizes); let state = TestState::new(test_config); let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); + let handle = runtime.spawn(async move { + prometheus_endpoint::init_prometheus( + SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), + registry_clone, + ) + .await + }); + println!("{:?}", env.config()); runtime.block_on(availability::bench_chunk_recovery(&mut env)); diff --git a/polkadot/node/subsystem-test-helpers/src/lib.rs b/polkadot/node/subsystem-test-helpers/src/lib.rs index 3f92513498c41..5393ccafa6f38 100644 --- a/polkadot/node/subsystem-test-helpers/src/lib.rs +++ 
b/polkadot/node/subsystem-test-helpers/src/lib.rs @@ -187,6 +187,7 @@ pub struct TestSubsystemContext { tx: TestSubsystemSender, rx: mpsc::Receiver>, spawn: S, + name: &'static str, } #[async_trait::async_trait] @@ -223,7 +224,7 @@ where name: &'static str, s: Pin + Send>>, ) -> SubsystemResult<()> { - self.spawn.spawn(name, None, s); + self.spawn.spawn(name, Some(self.name), s); Ok(()) } @@ -232,7 +233,7 @@ where name: &'static str, s: Pin + Send>>, ) -> SubsystemResult<()> { - self.spawn.spawn_blocking(name, None, s); + self.spawn.spawn_blocking(name, Some(self.name), s); Ok(()) } @@ -292,8 +293,9 @@ impl TestSubsystemContextHandle { /// of the tests. pub fn make_subsystem_context( spawner: S, + name: &'static str, ) -> (TestSubsystemContext>, TestSubsystemContextHandle) { - make_buffered_subsystem_context(spawner, 0) + make_buffered_subsystem_context(spawner, 0, name) } /// Make a test subsystem context with buffered overseer channel. Some tests (e.g. @@ -302,6 +304,7 @@ pub fn make_subsystem_context( pub fn make_buffered_subsystem_context( spawner: S, buffer_size: usize, + name: &'static str, ) -> (TestSubsystemContext>, TestSubsystemContextHandle) { let (overseer_tx, overseer_rx) = mpsc::channel(buffer_size); let (all_messages_tx, all_messages_rx) = mpsc::unbounded(); @@ -311,6 +314,7 @@ pub fn make_buffered_subsystem_context( tx: TestSubsystemSender { tx: all_messages_tx }, rx: overseer_rx, spawn: SpawnGlue(spawner), + name, }, TestSubsystemContextHandle { tx: overseer_tx, rx: all_messages_rx }, ) @@ -332,7 +336,7 @@ pub fn subsystem_test_harness( Test: Future, { let pool = TaskExecutor::new(); - let (context, handle) = make_subsystem_context(pool); + let (context, handle) = make_subsystem_context(pool, "default"); let overseer = overseer_factory(handle); let test = test_factory(context); From 027bcd862eef7d2f776ceb0d2bf3dc11ef490b5c Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Nov 2023 23:15:01 +0200 Subject: [PATCH 09/45] CLI update and fixes 
Signed-off-by: Andrei Sandu --- Cargo.lock | 11 ++ cumulus/pallets/xcmp-queue/src/tests.rs | 22 ++- .../network/availability-recovery/Cargo.toml | 4 + .../network/availability-recovery/src/lib.rs | 10 +- polkadot/node/subsystem-bench/Cargo.toml | 4 +- .../src/availability/configuration.rs | 94 ++++++------ .../subsystem-bench/src/availability/mod.rs | 140 ++++++++++++------ .../subsystem-bench/src/subsystem-bench.rs | 125 ++++++++++++++-- 8 files changed, 292 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b54745cdcc39..05355cad0e2c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2561,6 +2561,15 @@ dependencies = [ "clap_derive 4.4.2", ] +[[package]] +name = "clap-num" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488557e97528174edaa2ee268b23a809e0c598213a4bbcb4f34575a46fda147e" +dependencies = [ + "num-traits", +] + [[package]] name = "clap_builder" version = "4.4.6" @@ -11715,6 +11724,7 @@ dependencies = [ "sp-core", "sp-keyring", "thiserror", + "tokio", "tracing-gum", ] @@ -12997,6 +13007,7 @@ dependencies = [ "assert_matches", "async-trait", "clap 4.4.6", + "clap-num", "color-eyre", "env_logger 0.9.3", "futures", diff --git a/cumulus/pallets/xcmp-queue/src/tests.rs b/cumulus/pallets/xcmp-queue/src/tests.rs index cf6d947609d2f..bab7e92ca2de8 100644 --- a/cumulus/pallets/xcmp-queue/src/tests.rs +++ b/cumulus/pallets/xcmp-queue/src/tests.rs @@ -410,9 +410,11 @@ fn verify_fee_factor_increase_and_decrease() { assert_eq!(DeliveryFeeFactor::::get(sibling_para_id), initial); // Sending the message right now is cheap - let (_, delivery_fees) = validate_send::(destination, xcm.clone()) - .expect("message can be sent; qed"); - let Fungible(delivery_fee_amount) = delivery_fees.inner()[0].fun else { unreachable!("asset is fungible; qed"); }; + let (_, delivery_fees) = + validate_send::(destination, xcm.clone()).expect("message can be sent; qed"); + let Fungible(delivery_fee_amount) = 
delivery_fees.inner()[0].fun else { + unreachable!("asset is fungible; qed"); + }; assert_eq!(delivery_fee_amount, 402_000_000); let smaller_xcm = Xcm(vec![ClearOrigin; 30]); @@ -422,19 +424,23 @@ fn verify_fee_factor_increase_and_decrease() { assert_ok!(send_xcm::(destination, xcm.clone())); // Size 520 assert_eq!(DeliveryFeeFactor::::get(sibling_para_id), FixedU128::from_float(1.05)); - for _ in 0..12 { // We finish at size 929 + for _ in 0..12 { + // We finish at size 929 assert_ok!(send_xcm::(destination, smaller_xcm.clone())); } assert!(DeliveryFeeFactor::::get(sibling_para_id) > FixedU128::from_float(1.88)); // Sending the message right now is expensive - let (_, delivery_fees) = validate_send::(destination, xcm.clone()) - .expect("message can be sent; qed"); - let Fungible(delivery_fee_amount) = delivery_fees.inner()[0].fun else { unreachable!("asset is fungible; qed"); }; + let (_, delivery_fees) = + validate_send::(destination, xcm.clone()).expect("message can be sent; qed"); + let Fungible(delivery_fee_amount) = delivery_fees.inner()[0].fun else { + unreachable!("asset is fungible; qed"); + }; assert_eq!(delivery_fee_amount, 758_030_955); // Fee factor only decreases in `take_outbound_messages` - for _ in 0..5 { // We take 5 100 byte pages + for _ in 0..5 { + // We take 5 100 byte pages XcmpQueue::take_outbound_messages(1); } assert!(DeliveryFeeFactor::::get(sibling_para_id) < FixedU128::from_float(1.72)); diff --git a/polkadot/node/network/availability-recovery/Cargo.toml b/polkadot/node/network/availability-recovery/Cargo.toml index 42c3abef547b9..5f3df09c2bd9e 100644 --- a/polkadot/node/network/availability-recovery/Cargo.toml +++ b/polkadot/node/network/availability-recovery/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] futures = "0.3.21" +tokio = "1.24.2" schnellru = "0.2.1" rand = "0.8.5" fatality = "0.0.6" @@ -36,3 +37,6 @@ sc-network = { path = "../../../../substrate/client/network" } polkadot-node-subsystem-test-helpers = { 
path = "../../subsystem-test-helpers" } polkadot-primitives-test-helpers = { path = "../../../primitives/test-helpers" } + +[features] +subsystem-benchmarks = [] \ No newline at end of file diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index 156a8cbbc82e6..ffb634ad76e2f 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -617,12 +617,9 @@ impl AvailabilityRecoverySubsystem { .into_iter() .cycle(); - gum::debug!("Subsystem running"); loop { let recv_req = req_receiver.recv(|| vec![COST_INVALID_REQUEST]).fuse(); pin_mut!(recv_req); - gum::debug!("waiting for message"); - futures::select! { erasure_task = erasure_task_rx.next() => { match erasure_task { @@ -729,6 +726,8 @@ impl AvailabilityRecoverySubsystem { } } output = state.ongoing_recoveries.select_next_some() => { + // No caching for benchmark. + #[cfg(not(feature = "subsystem-benchmarks"))] if let Some((candidate_hash, result)) = output { if let Ok(recovery) = CachedRecovery::try_from(result) { state.availability_lru.insert(candidate_hash, recovery); @@ -829,5 +828,10 @@ async fn erasure_task_thread( break }, } + + // In benchmarks this is a very hot loop not yielding at all. + // To update promehteus metrics for the task we need to yield. 
+ #[cfg(feature = "subsystem-benchmarks")] + tokio::task::yield_now().await; } } diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 2de978234a63c..01b992d15fc66 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -22,7 +22,7 @@ polkadot-node-subsystem-types = { path = "../subsystem-types" } polkadot-node-primitives = { path = "../primitives" } polkadot-primitives = { path = "../../primitives" } polkadot-node-network-protocol = { path = "../network/protocol" } -polkadot-availability-recovery = { path = "../network/availability-recovery" } +polkadot-availability-recovery = { path = "../network/availability-recovery", features=["subsystem-benchmarks"]} color-eyre = { version = "0.6.1", default-features = false } assert_matches = "1.5" async-trait = "0.1.57" @@ -38,7 +38,7 @@ env_logger = "0.9.0" rand = "0.8.5" parity-scale-codec = { version = "3.6.1", features = ["std", "derive"] } tokio = "1.24.2" - +clap-num = "1.0.2" polkadot-node-subsystem-test-helpers = { path = "../subsystem-test-helpers" } sp-keyring = { path = "../../../substrate/primitives/keyring" } sp-application-crypto = { path = "../../../substrate/primitives/application-crypto" } diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index 9a93bf12e114f..1355c67edea06 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -17,7 +17,7 @@ use super::*; /// Peer response latency configuration. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct PeerLatency { /// Min latency for `NetworkAction` completion. 
pub min_latency: Duration, @@ -36,15 +36,15 @@ pub struct TestConfiguration { pub n_cores: usize, /// The PoV size pub pov_sizes: Vec, - /// This parameter is used to determine how many recoveries we batch in parallel - /// to simulate tranche0 recoveries. - pub max_parallel_recoveries: usize, - /// The amount of bandiwdht remote validators have. + /// The amount of bandiwdth remote validators have. pub bandwidth: usize, /// Optional peer emulation latency pub latency: Option, /// Error probability pub error: usize, + /// Number of loops + /// In one loop `n_cores` candidates are recovered + pub num_loops: usize, } impl Default for TestConfiguration { @@ -54,80 +54,78 @@ impl Default for TestConfiguration { n_validators: 10, n_cores: 10, pov_sizes: vec![5 * 1024 * 1024], - max_parallel_recoveries: 6, bandwidth: 60 * 1024 * 1024, latency: None, error: 0, + num_loops: 1, } } } impl TestConfiguration { /// An unconstrained standard configuration matching Polkadot/Kusama - pub fn unconstrained_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { + pub fn ideal_network( + num_loops: usize, + use_fast_path: bool, + n_validators: usize, + n_cores: usize, + pov_sizes: Vec, + ) -> TestConfiguration { Self { - use_fast_path: false, - n_validators: 300, - n_cores: 100, + use_fast_path, + n_cores, + n_validators, pov_sizes, - max_parallel_recoveries: 100, // HW specs node bandwidth - bandwidth: 60 * 1024 * 1024, - // No latency - latency: None, - error: 0, - } - } - pub fn unconstrained_1000_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { - Self { - use_fast_path: false, - n_validators: 300, - n_cores: 60, - pov_sizes, - max_parallel_recoveries: 30, - // HW specs node bandwidth - bandwidth: 60 * 1024 * 1024, + bandwidth: 50 * 1024 * 1024, // No latency latency: None, error: 0, + num_loops, } } - /// Polkadot/Kusama configuration with typical latency constraints. 
- pub fn healthy_network_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { + pub fn healthy_network( + num_loops: usize, + use_fast_path: bool, + n_validators: usize, + n_cores: usize, + pov_sizes: Vec, + ) -> TestConfiguration { Self { - use_fast_path: true, - n_validators: 300, - n_cores: 60, + use_fast_path, + n_cores, + n_validators, pov_sizes, - max_parallel_recoveries: 6, - // HW specs node bandwidth - bandwidth: 60 * 1024 * 1024, + bandwidth: 50 * 1024 * 1024, latency: Some(PeerLatency { min_latency: Duration::from_millis(1), - max_latency: Duration::from_millis(50), + max_latency: Duration::from_millis(100), }), - error: 5, + error: 3, + num_loops, } } - /// Polkadot/Kusama configuration with degraded due to latencies. - /// TODO: implement errors. - pub fn degraded_network_300_validators_60_cores(pov_sizes: Vec) -> TestConfiguration { + pub fn degraded_network( + num_loops: usize, + use_fast_path: bool, + n_validators: usize, + n_cores: usize, + pov_sizes: Vec, + ) -> TestConfiguration { Self { - use_fast_path: false, - n_validators: 300, - n_cores: 100, + use_fast_path, + n_cores, + n_validators, pov_sizes, - max_parallel_recoveries: 20, - // HW specs node bandwidth - bandwidth: 60 * 1024 * 1024, - // A range of latencies to expect in a degraded network + bandwidth: 50 * 1024 * 1024, latency: Some(PeerLatency { - min_latency: Duration::from_millis(1), + min_latency: Duration::from_millis(10), max_latency: Duration::from_millis(500), }), - error: 30, + error: 33, + num_loops, } } } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 8f8eca104385b..7b9b64c07096d 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -22,7 +22,8 @@ use std::{ use futures::{ channel::{mpsc, oneshot}, - FutureExt, SinkExt, + stream::FuturesUnordered, + FutureExt, SinkExt, StreamExt, }; use 
polkadot_node_metrics::metrics::Metrics; @@ -32,7 +33,7 @@ use parity_scale_codec::Encode; use polkadot_node_network_protocol::request_response::{ self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, }; -use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; +use rand::{distributions::Uniform, prelude::Distribution, seq::IteratorRandom, thread_rng}; use prometheus::Registry; use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; @@ -46,6 +47,7 @@ use polkadot_node_subsystem::{ }, ActiveLeavesUpdate, FromOrchestra, OverseerSignal, Subsystem, }; +use std::net::{Ipv4Addr, SocketAddr}; const LOG_TARGET: &str = "subsystem-bench::availability"; @@ -74,20 +76,47 @@ struct AvailabilityRecoverySubsystemInstance { _protocol_config: RequestResponseConfig, } -// Implements a mockup of NetworkBridge and AvilabilityStore to support provide state for -// `AvailabilityRecoverySubsystemInstance` +/// The test environment is responsible for creating an instance of the availability recovery +/// subsystem and connecting it to an emulated overseer. +/// +/// ## Mockups +/// We emulate the following subsystems: +/// - runtime api +/// - network bridge +/// - availability store +/// +/// As the subsystem's performance depends on network connectivity, the test environment +/// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation +/// is configurable in terms of peer bandwidth, latency and connection error rate using +/// uniform distribution sampling. +/// +/// The mockup logic is implemented in `env_task` which owns and advances the `TestState`. +/// +/// ## Usage +/// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem +/// under test. +/// +/// ## Collecting test metrics +/// +/// ### Prometheus +/// A prometheus endpoint is exposed while the test is running. 
A local Prometheus instance +/// can scrape it every 1s and a Grafana dashboard is the preferred way of visualizing +/// the performance characteristics of the subsystem. +/// +/// ### CLI +/// A subset of the Prometheus metrics are printed at the end of the test. pub struct TestEnvironment { - // A task manager that tracks task poll durations. + // A task manager that tracks task poll durations allows us to measure + // per task CPU usage as we do in the Polkadot node. task_manager: TaskManager, // The Prometheus metrics registry registry: Registry, - // A test overseer. + // A channel to the availability recovery subsystem to_subsystem: mpsc::Sender>, // Subsystem instance, currently keeps req/response protocol channel senders // for the whole duration of the test. instance: AvailabilityRecoverySubsystemInstance, - // The test intial state. The current state is owned by the task doing the overseer/subsystem - // mockings. + // The test intial state. The current state is owned by `env_task`. 
state: TestState, } @@ -115,6 +144,18 @@ impl TestEnvironment { async move { Self::env_task(virtual_overseer, task_state, spawn_task_handle).await }, ); + let registry_clone = registry.clone(); + task_manager + .spawn_handle() + .spawn_blocking("prometheus", "test-environment", async move { + prometheus_endpoint::init_prometheus( + SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), + registry_clone, + ) + .await + .unwrap(); + }); + TestEnvironment { task_manager, registry, to_subsystem, instance, state } } @@ -284,7 +325,10 @@ impl TestEnvironment { .timeout(MAX_TIME_OF_FLIGHT) .await .unwrap_or_else(|| { - panic!("{}ms is more than enough for sending signals.", TIMEOUT.as_millis()) + panic!( + "{}ms is more than enough for sending signals.", + MAX_TIME_OF_FLIGHT.as_millis() + ) }) .unwrap(); } @@ -382,15 +426,18 @@ impl TestState { } fn session_info(&self) -> SessionInfo { + let my_vec = (0..self.config().n_validators) + .map(|i| ValidatorIndex(i as _)) + .collect::>(); + + let validator_groups = my_vec.chunks(5).map(|x| Vec::from(x)).collect::>(); + SessionInfo { validators: self.validator_public.clone(), discovery_keys: self.validator_authority_id.clone(), - // all validators in the same group. - validator_groups: IndexedVec::>::from(vec![(0..5) - .map(|i| ValidatorIndex(i as _)) - .collect()]), + validator_groups: IndexedVec::>::from(validator_groups), assignment_keys: vec![], - n_cores: 0, + n_cores: self.config().n_cores as u32, zeroth_delay_tranche_width: 0, relay_vrf_modulo_samples: 0, n_delay_tranches: 0, @@ -449,7 +496,7 @@ impl TestState { .collect::>(); for (index, pov_size) in config.pov_sizes.iter().enumerate() { - let mut candidate = &mut candidate_receipts[index]; + let candidate = &mut candidate_receipts[index]; // a hack to make candidate unique. 
candidate.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); @@ -532,9 +579,6 @@ fn derive_erasure_chunks_with_proofs_and_root( (erasure_chunks, root) } -pub async fn bench_with_chunks_if_pov_large(env: &mut TestEnvironment) {} - -pub async fn bench_inner(env: &mut TestEnvironment) {} pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let config = env.config().clone(); @@ -545,39 +589,45 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { .await; let start_marker = Instant::now(); - let mut batch = Vec::new(); - let mut availability_bytes = 0; - for candidate_num in 0..config.n_cores as u64 { - let candidate = env.state.candidate_receipts[candidate_num as usize].clone(); - - let (tx, rx) = oneshot::channel(); - batch.push(rx); - - env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( - candidate.clone(), - 1, - Some(GroupIndex(0)), - tx, - )) - .await; - - // TODO: select between futures unordered of rx await and timer to send next request. - if batch.len() >= config.max_parallel_recoveries { - for rx in std::mem::take(&mut batch) { - let available_data = rx.await.unwrap().unwrap(); - availability_bytes += available_data.encoded_size(); - } + let mut batch = FuturesUnordered::new(); + let mut availability_bytes = 0u128; + + for loop_num in 0..env.config().num_loops { + gum::info!(target: LOG_TARGET, loop_num, "Starting loop"); + + for candidate_num in 0..config.n_cores as u64 { + let candidate = env.state.candidate(candidate_num as usize); + + let (tx, rx) = oneshot::channel(); + batch.push(rx); + + env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( + candidate.clone(), + 1, + Some(GroupIndex(candidate_num as u32 % (config.n_cores / 5) as u32)), + tx, + )) + .await; + + // // TODO: select between futures unordered of rx await and timer to send next request. 
+ // if batch.len() >= config.max_parallel_recoveries { + // for rx in std::mem::take(&mut batch) { + // let available_data = rx.await.unwrap().unwrap(); + // availability_bytes += available_data.encoded_size() as u128; + // } + // } } - } - for rx in std::mem::take(&mut batch) { - let available_data = rx.await.unwrap().unwrap(); - availability_bytes += available_data.encoded_size(); + while let Some(completed) = batch.next().await { + let available_data = completed.unwrap().unwrap(); + availability_bytes += available_data.encoded_size() as u128; + } } + println!("Waiting for subsystem to complete work... {} requests ", batch.len()); env.send_signal(OverseerSignal::Conclude).await; let duration = start_marker.elapsed().as_millis(); - let tput = (availability_bytes as u128) / duration * 1000; + let tput = ((availability_bytes) / duration) * 1000; println!("Benchmark completed in {:?}ms", duration); println!("Throughput: {}KiB/s", tput / 1024); diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 4dc0936291b1c..f5180004840c4 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -20,24 +20,89 @@ use clap::Parser; use color_eyre::eyre; use prometheus::proto::LabelPair; -use std::net::{Ipv4Addr, SocketAddr}; +use std::time::Duration; pub(crate) mod availability; use availability::{TestConfiguration, TestEnvironment, TestState}; const LOG_TARGET: &str = "subsystem-bench"; +use clap_num::number_range; + +fn le_100(s: &str) -> Result { + number_range(s, 0, 100) +} + +fn le_5000(s: &str) -> Result { + number_range(s, 0, 5000) +} + +#[derive(Debug, clap::Parser, Clone)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct NetworkOptions {} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq)] +#[value(rename_all = "kebab-case")] +#[non_exhaustive] +pub enum NetworkEmulation { + Ideal, + Healthy, + 
Degraded, +} + +#[derive(Debug, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct DataAvailabilityReadOptions { + #[clap(long, ignore_case = true, default_value_t = 100)] + /// Number of cores to fetch availability for. + pub n_cores: usize, + + #[clap(long, ignore_case = true, default_value_t = 500)] + /// Number of validators to fetch chunks from. + pub n_validators: usize, + + #[clap(short, long, default_value_t = false)] + /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes + /// have enough bandwidth. + pub fetch_from_backers: bool, + + #[clap(short, long, ignore_case = true, default_value_t = 1)] + /// Number of times to loop fetching for each core. + pub num_loops: usize, +} /// Define the supported benchmarks targets #[derive(Debug, Parser)] #[command(about = "Target subsystems", version, rename_all = "kebab-case")] enum BenchmarkTarget { /// Benchmark availability recovery strategies. - AvailabilityRecovery, + DataAvailabilityRead(DataAvailabilityReadOptions), } #[derive(Debug, Parser)] #[allow(missing_docs)] struct BenchCli { + #[arg(long, value_enum, ignore_case = true, default_value_t = NetworkEmulation::Ideal)] + /// The type of network to be emulated + pub network: NetworkEmulation, + + #[clap(short, long)] + /// The bandwidth of simulated remote peers in KiB + pub peer_bandwidth: Option, + + #[clap(long, value_parser=le_100)] + /// Simulated connection error rate [0-100]. + pub peer_error: Option, + + #[clap(long, value_parser=le_5000)] + /// Minimum remote peer latency in milliseconds [0-5000]. + pub peer_min_latency: Option, + + #[clap(long, value_parser=le_5000)] + /// Maximum remote peer latency in milliseconds [0-5000]. 
+ pub peer_max_latency: Option, + #[command(subcommand)] pub target: BenchmarkTarget, } @@ -63,21 +128,57 @@ impl BenchCli { let registry_clone = registry.clone(); let mut pov_sizes = Vec::new(); - pov_sizes.append(&mut vec![1024 * 1024; 100]); + pov_sizes.append(&mut vec![5 * 1024 * 1024; 200]); + + let mut test_config = match self.target { + BenchmarkTarget::DataAvailabilityRead(options) => match self.network { + NetworkEmulation::Healthy => TestConfiguration::healthy_network( + options.num_loops, + options.fetch_from_backers, + options.n_validators, + options.n_cores, + pov_sizes, + ), + NetworkEmulation::Degraded => TestConfiguration::degraded_network( + options.num_loops, + options.fetch_from_backers, + options.n_validators, + options.n_cores, + pov_sizes, + ), + NetworkEmulation::Ideal => TestConfiguration::ideal_network( + options.num_loops, + options.fetch_from_backers, + options.n_validators, + options.n_cores, + pov_sizes, + ), + }, + }; + + let mut latency_config = test_config.latency.clone().unwrap_or_default(); + + if let Some(latency) = self.peer_min_latency { + latency_config.min_latency = Duration::from_millis(latency); + } - let test_config = TestConfiguration::unconstrained_1000_validators_60_cores(pov_sizes); + if let Some(latency) = self.peer_max_latency { + latency_config.max_latency = Duration::from_millis(latency); + } - let state = TestState::new(test_config); + if let Some(error) = self.peer_error { + test_config.error = error; + } + if let Some(bandwidth) = self.peer_bandwidth { + // CLI expects bw in KiB + test_config.bandwidth = bandwidth * 1024; + } + + let state = TestState::new(test_config); let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); - let handle = runtime.spawn(async move { - prometheus_endpoint::init_prometheus( - SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), - registry_clone, - ) - .await - }); + let runtime_handle = runtime.handle().clone(); println!("{:?}", 
env.config()); From 5a05da0f6c87e7e19ff1940d4b9f035cbb4cf7e9 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Thu, 9 Nov 2023 23:51:58 +0200 Subject: [PATCH 10/45] peer stats Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 28 ++++++----- .../src/availability/network.rs | 49 +++++++++++++++++-- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 7b9b64c07096d..a4980ffc5fdd2 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -118,6 +118,8 @@ pub struct TestEnvironment { instance: AvailabilityRecoverySubsystemInstance, // The test intial state. The current state is owned by `env_task`. state: TestState, + // A handle to the network emulator. + network: NetworkEmulator, } impl TestEnvironment { @@ -131,17 +133,24 @@ impl TestEnvironment { state.config().use_fast_path, ); + let mut network = NetworkEmulator::new( + state.config().n_validators, + state.config().bandwidth, + task_manager.spawn_handle(), + ); + // Copy sender for later when we need to inject messages in to the subsystem. let to_subsystem = virtual_overseer.tx.clone(); let task_state = state.clone(); - let spawn_task_handle = task_manager.spawn_handle(); + let task_network = network.clone(); + // We need to start a receiver to process messages from the subsystem. 
// This mocks an overseer and all dependent subsystems task_manager.spawn_handle().spawn_blocking( "test-environment", "test-environment", - async move { Self::env_task(virtual_overseer, task_state, spawn_task_handle).await }, + async move { Self::env_task(virtual_overseer, task_state, task_network).await }, ); let registry_clone = registry.clone(); @@ -156,7 +165,7 @@ impl TestEnvironment { .unwrap(); }); - TestEnvironment { task_manager, registry, to_subsystem, instance, state } + TestEnvironment { task_manager, registry, to_subsystem, instance, state, network } } pub fn config(&self) -> &TestConfiguration { @@ -249,15 +258,8 @@ impl TestEnvironment { async fn env_task( mut ctx: TestSubsystemContextHandle, mut state: TestState, - spawn_task_handle: SpawnTaskHandle, + mut network: NetworkEmulator, ) { - // Emulate `n_validators` each with 1MiB of bandwidth available. - let mut network = NetworkEmulator::new( - state.config().n_validators, - state.config().bandwidth, - spawn_task_handle, - ); - loop { futures::select! 
{ message = ctx.recv().fuse() => { @@ -631,5 +633,9 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { println!("Benchmark completed in {:?}ms", duration); println!("Throughput: {}KiB/s", tput / 1024); + let stats = env.network.stats().await; + for (index, stat) in stats.iter().enumerate() { + println!("Validator #{} : {:?}", index, stat); + } tokio::time::sleep(Duration::from_secs(1)).await; } diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs index d6fc175c859ba..544ecf06372af 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -124,6 +124,7 @@ mod tests { } // A network peer emulator +#[derive(Clone)] struct PeerEmulator { // The queue of requests waiting to be served by the emulator actions_tx: UnboundedSender, @@ -137,11 +138,15 @@ impl PeerEmulator { .clone() .spawn("peer-emulator", "test-environment", async move { let mut rate_limiter = RateLimit::new(20, bandwidth); + let rx_bytes_total = 0; + let mut tx_bytes_total = 0u128; + loop { let maybe_action: Option = actions_rx.recv().await; if let Some(action) = maybe_action { let size = action.size(); rate_limiter.reap(size).await; + tx_bytes_total += size as u128; if let Some(latency) = action.latency { spawn_task_handle.spawn( "peer-emulator-latency", @@ -152,7 +157,12 @@ impl PeerEmulator { }, ) } else { - action.run().await; + // Send stats if requested + if let Some(stats_sender) = action.stats { + stats_sender.send(PeerEmulatorStats { rx_bytes_total, tx_bytes_total }).unwrap(); + } else { + action.run().await; + } } } else { break @@ -170,7 +180,7 @@ impl PeerEmulator { } pub type ActionFuture = std::pin::Pin + std::marker::Send>>; -// An network action to be completed by the emulator task. +/// An network action to be completed by the emulator task. 
pub struct NetworkAction { // The function that performs the action run: ActionFuture, @@ -180,12 +190,28 @@ pub struct NetworkAction { index: usize, // The amount of time to delay the polling `run` latency: Option, + // An optional request of rx/tx statistics for the peer at `index` + stats: Option>, +} + +/// Book keeping of sent and received bytes. +#[derive(Debug, Clone)] +pub struct PeerEmulatorStats { + pub rx_bytes_total: u128, + pub tx_bytes_total: u128, } impl NetworkAction { pub fn new(index: usize, run: ActionFuture, size: usize, latency: Option) -> Self { - Self { run, size, index, latency } + Self { run, size, index, latency, stats: None } + } + + pub fn stats(index: usize, stats_sender:oneshot::Sender) -> Self { + let run = async move {}.boxed(); + + Self { run, size: 0, index, latency: None, stats: Some(stats_sender) } } + pub fn size(&self) -> usize { self.size } @@ -201,6 +227,7 @@ impl NetworkAction { // Mocks the network bridge and an arbitrary number of connected peer nodes. // Implements network latency, bandwidth and error. +#[derive(Clone)] pub struct NetworkEmulator { // Per peer network emulation peers: Vec, @@ -218,4 +245,20 @@ impl NetworkEmulator { pub fn submit_peer_action(&mut self, index: usize, action: NetworkAction) { let _ = self.peers[index].send(action); } + + // Returns the sent/received stats for all peers. 
+ pub async fn stats(&mut self) -> Vec { + let receivers = (0..self.peers.len()).map(|peer_index| { + let (stats_tx, stats_rx) = oneshot::channel(); + self.submit_peer_action(peer_index, NetworkAction::stats(peer_index, stats_tx)); + stats_rx + }).collect::>(); + + let mut stats = Vec::new(); + for receiver in receivers { + stats.push(receiver.await.unwrap()); + } + + stats + } } From 895e8d6a627334b46025a212242e31684eb8a9fc Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 10 Nov 2023 12:42:51 +0200 Subject: [PATCH 11/45] Switch stats to atomics Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 1 - .../src/availability/network.rs | 83 ++++++++++--------- .../subsystem-bench/src/subsystem-bench.rs | 1 - 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index a4980ffc5fdd2..7903ba08b6169 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -13,7 +13,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . - use std::{ collections::HashMap, sync::Arc, diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs index 544ecf06372af..02af817e691fd 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -13,10 +13,11 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . - use super::*; +use prometheus_endpoint::U64; +use sc_network::network_state::Peer; +use std::sync::atomic::{AtomicU64, Ordering}; use tokio::sync::mpsc::UnboundedSender; - // An emulated node egress traffic rate_limiter. 
#[derive(Debug)] struct RateLimit { @@ -131,7 +132,11 @@ struct PeerEmulator { } impl PeerEmulator { - pub fn new(bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { + pub fn new( + bandwidth: usize, + spawn_task_handle: SpawnTaskHandle, + stats: Arc, + ) -> Self { let (actions_tx, mut actions_rx) = tokio::sync::mpsc::unbounded_channel(); spawn_task_handle @@ -140,13 +145,12 @@ impl PeerEmulator { let mut rate_limiter = RateLimit::new(20, bandwidth); let rx_bytes_total = 0; let mut tx_bytes_total = 0u128; - loop { + let stats_clone = stats.clone(); let maybe_action: Option = actions_rx.recv().await; if let Some(action) = maybe_action { let size = action.size(); rate_limiter.reap(size).await; - tx_bytes_total += size as u128; if let Some(latency) = action.latency { spawn_task_handle.spawn( "peer-emulator-latency", @@ -154,15 +158,14 @@ impl PeerEmulator { async move { tokio::time::sleep(latency).await; action.run().await; + stats_clone + .tx_bytes_total + .fetch_add(size as u64, Ordering::Relaxed); }, ) } else { - // Send stats if requested - if let Some(stats_sender) = action.stats { - stats_sender.send(PeerEmulatorStats { rx_bytes_total, tx_bytes_total }).unwrap(); - } else { - action.run().await; - } + action.run().await; + stats_clone.tx_bytes_total.fetch_add(size as u64, Ordering::Relaxed); } } else { break @@ -190,26 +193,23 @@ pub struct NetworkAction { index: usize, // The amount of time to delay the polling `run` latency: Option, - // An optional request of rx/tx statistics for the peer at `index` - stats: Option>, } /// Book keeping of sent and received bytes. 
-#[derive(Debug, Clone)] +#[derive(Debug, Default)] pub struct PeerEmulatorStats { - pub rx_bytes_total: u128, - pub tx_bytes_total: u128, + pub rx_bytes_total: AtomicU64, + pub tx_bytes_total: AtomicU64, } +#[derive(Debug, Default)] +pub struct PeerStats { + pub rx_bytes_total: u64, + pub tx_bytes_total: u64, +} impl NetworkAction { pub fn new(index: usize, run: ActionFuture, size: usize, latency: Option) -> Self { - Self { run, size, index, latency, stats: None } - } - - pub fn stats(index: usize, stats_sender:oneshot::Sender) -> Self { - let run = async move {}.boxed(); - - Self { run, size: 0, index, latency: None, stats: Some(stats_sender) } + Self { run, size, index, latency } } pub fn size(&self) -> usize { @@ -231,15 +231,19 @@ impl NetworkAction { pub struct NetworkEmulator { // Per peer network emulation peers: Vec, + stats: Vec>, } impl NetworkEmulator { pub fn new(n_peers: usize, bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { - Self { - peers: (0..n_peers) - .map(|_index| PeerEmulator::new(bandwidth, spawn_task_handle.clone())) - .collect::>(), - } + let (stats, peers) = (0..n_peers) + .map(|_index| { + let stats = Arc::new(PeerEmulatorStats::default()); + (stats.clone(), PeerEmulator::new(bandwidth, spawn_task_handle.clone(), stats)) + }) + .unzip(); + + Self { peers, stats } } pub fn submit_peer_action(&mut self, index: usize, action: NetworkAction) { @@ -247,18 +251,15 @@ impl NetworkEmulator { } // Returns the sent/received stats for all peers. 
- pub async fn stats(&mut self) -> Vec { - let receivers = (0..self.peers.len()).map(|peer_index| { - let (stats_tx, stats_rx) = oneshot::channel(); - self.submit_peer_action(peer_index, NetworkAction::stats(peer_index, stats_tx)); - stats_rx - }).collect::>(); - - let mut stats = Vec::new(); - for receiver in receivers { - stats.push(receiver.await.unwrap()); - } - - stats + pub async fn stats(&mut self) -> Vec { + let r = self + .stats + .iter() + .map(|stats| PeerStats { + rx_bytes_total: stats.rx_bytes_total.load(Ordering::Relaxed), + tx_bytes_total: stats.tx_bytes_total.load(Ordering::Relaxed), + }) + .collect::>(); + r } } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index f5180004840c4..ba66d06fe320d 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -16,7 +16,6 @@ //! A tool for running subsystem benchmark tests designed for development and //! CI regression testing. 
- use clap::Parser; use color_eyre::eyre; use prometheus::proto::LabelPair; From a2fb0c95d08c17ad5647c904cb48854ec30ba470 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Sun, 12 Nov 2023 03:14:34 +0200 Subject: [PATCH 12/45] add more network metrics, new load generator Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + polkadot/node/subsystem-bench/Cargo.toml | 2 +- .../src/availability/configuration.rs | 7 +- .../subsystem-bench/src/availability/mod.rs | 335 +++++++++++++----- .../src/availability/network.rs | 149 +++++++- .../src/availability/test_env.rs | 63 ++++ .../subsystem-bench/src/subsystem-bench.rs | 25 +- 7 files changed, 461 insertions(+), 121 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/availability/test_env.rs diff --git a/Cargo.lock b/Cargo.lock index 05355cad0e2c1..ee80ffb6e8157 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13012,6 +13012,7 @@ dependencies = [ "env_logger 0.9.3", "futures", "futures-timer", + "itertools 0.11.0", "log", "parity-scale-codec", "polkadot-availability-recovery", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 01b992d15fc66..c5d62d3aa74f2 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -45,7 +45,7 @@ sp-application-crypto = { path = "../../../substrate/primitives/application-cryp sc-network = { path = "../../../substrate/client/network" } sc-service = { path = "../../../substrate/client/service" } polkadot-node-metrics = { path = "../metrics" } - +itertools = "0.11.0" polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } prometheus_endpoint = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } prometheus = { version = "0.13.0", default-features = false } diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index 1355c67edea06..cf142de06634c 
100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -37,6 +37,8 @@ pub struct TestConfiguration { /// The PoV size pub pov_sizes: Vec, /// The amount of bandiwdth remote validators have. + pub peer_bandwidth: usize, + /// The amount of bandiwdth our node has. pub bandwidth: usize, /// Optional peer emulation latency pub latency: Option, @@ -55,6 +57,7 @@ impl Default for TestConfiguration { n_cores: 10, pov_sizes: vec![5 * 1024 * 1024], bandwidth: 60 * 1024 * 1024, + peer_bandwidth: 60 * 1024 * 1024, latency: None, error: 0, num_loops: 1, @@ -76,8 +79,8 @@ impl TestConfiguration { n_cores, n_validators, pov_sizes, - // HW specs node bandwidth bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, // No latency latency: None, error: 0, @@ -98,6 +101,7 @@ impl TestConfiguration { n_validators, pov_sizes, bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, latency: Some(PeerLatency { min_latency: Duration::from_millis(1), max_latency: Duration::from_millis(100), @@ -120,6 +124,7 @@ impl TestConfiguration { n_validators, pov_sizes, bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, latency: Some(PeerLatency { min_latency: Duration::from_millis(10), max_latency: Duration::from_millis(500), diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 7903ba08b6169..4f821f8199089 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -13,8 +13,11 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
+use itertools::Itertools; use std::{ collections::HashMap, + iter::Cycle, + ops::{Div, Sub}, sync::Arc, time::{Duration, Instant}, }; @@ -24,6 +27,8 @@ use futures::{ stream::FuturesUnordered, FutureExt, SinkExt, StreamExt, }; +use futures_timer::Delay; + use polkadot_node_metrics::metrics::Metrics; use polkadot_availability_recovery::AvailabilityRecoverySubsystem; @@ -48,6 +53,8 @@ use polkadot_node_subsystem::{ }; use std::net::{Ipv4Addr, SocketAddr}; +mod test_env; + const LOG_TARGET: &str = "subsystem-bench::availability"; use polkadot_node_primitives::{AvailableData, ErasureChunk}; @@ -119,6 +126,8 @@ pub struct TestEnvironment { state: TestState, // A handle to the network emulator. network: NetworkEmulator, + // Configuration/env metrics + metrics: TestEnvironmentMetrics, } impl TestEnvironment { @@ -131,11 +140,13 @@ impl TestEnvironment { task_manager.spawn_handle(), state.config().use_fast_path, ); - + let metrics = + TestEnvironmentMetrics::new(®istry).expect("Metrics need to be registered"); let mut network = NetworkEmulator::new( state.config().n_validators, - state.config().bandwidth, + state.config().peer_bandwidth, task_manager.spawn_handle(), + ®istry, ); // Copy sender for later when we need to inject messages in to the subsystem. @@ -143,13 +154,31 @@ impl TestEnvironment { let task_state = state.clone(); let task_network = network.clone(); + let spawn_handle = task_manager.spawn_handle(); + + // Our node rate limiting + let mut rx_limiter = RateLimit::new(10, state.config.bandwidth); + let (ingress_tx, mut ingress_rx) = tokio::sync::mpsc::unbounded_channel::(); + let our_network_stats = network.peer_stats(0); + + spawn_handle.spawn_blocking("our-node-rx", "test-environment", async move { + while let Some(action) = ingress_rx.recv().await { + let size = action.size(); + + // account for our node receiving the data. 
+ our_network_stats.inc_received(size); + + rx_limiter.reap(size).await; + action.run().await; + } + }); // We need to start a receiver to process messages from the subsystem. // This mocks an overseer and all dependent subsystems task_manager.spawn_handle().spawn_blocking( "test-environment", "test-environment", - async move { Self::env_task(virtual_overseer, task_state, task_network).await }, + async move { Self::env_task(virtual_overseer, task_state, task_network, ingress_tx).await }, ); let registry_clone = registry.clone(); @@ -164,13 +193,17 @@ impl TestEnvironment { .unwrap(); }); - TestEnvironment { task_manager, registry, to_subsystem, instance, state, network } + TestEnvironment { task_manager, registry, to_subsystem, instance, state, network, metrics } } pub fn config(&self) -> &TestConfiguration { self.state.config() } + pub fn network(&self) -> &NetworkEmulator { + &self.network + } + /// Produce a randomized duration between `min` and `max`. fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option { if let Some(peer_latency) = maybe_peer_latency { @@ -183,24 +216,51 @@ impl TestEnvironment { } } + pub fn metrics(&self) -> &TestEnvironmentMetrics { + &self.metrics + } + /// Generate a random error based on `probability`. /// `probability` should be a number between 0 and 100. 
fn random_error(probability: usize) -> bool { Uniform::from(0..=99).sample(&mut thread_rng()) < probability } - pub fn respond_to_send_request(state: &mut TestState, request: Requests) -> NetworkAction { + pub fn request_size(request: &Requests) -> u64 { + match request { + Requests::ChunkFetchingV1(outgoing_request) => + outgoing_request.payload.encoded_size() as u64, + Requests::AvailableDataFetchingV1(outgoing_request) => + outgoing_request.payload.encoded_size() as u64, + _ => panic!("received an unexpected request"), + } + } + + pub fn respond_to_send_request( + state: &mut TestState, + request: Requests, + ingress_tx: tokio::sync::mpsc::UnboundedSender, + ) -> NetworkAction { match request { Requests::ChunkFetchingV1(outgoing_request) => { let validator_index = outgoing_request.payload.index.0 as usize; - let chunk: ChunkResponse = - state.chunks.get(&outgoing_request.payload.candidate_hash).unwrap() - [validator_index] - .clone() - .into(); - let size = chunk.encoded_size(); + let candidate_hash = outgoing_request.payload.candidate_hash; + + let candidate_index = state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk: ChunkResponse = state.chunks.get(*candidate_index as usize).unwrap() + [validator_index] + .clone() + .into(); + let mut size = chunk.encoded_size(); let response = if Self::random_error(state.config().error) { + // Error will not account to any bandwidth used. + size = 0; Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) } else { Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) @@ -211,21 +271,39 @@ impl TestEnvironment { } .boxed(); + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. 
+ let action = NetworkAction::new(validator_index, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + NetworkAction::new( validator_index, - future, + future_wrapper, size, // Generate a random latency based on configuration. Self::random_latency(state.config().latency.as_ref()), ) }, Requests::AvailableDataFetchingV1(outgoing_request) => { + println!("{:?}", outgoing_request); // TODO: do better, by implementing diff authority ids and mapping network actions // to authority id, let validator_index = Uniform::from(0..state.config().n_validators).sample(&mut thread_rng()); + + let candidate_hash = outgoing_request.payload.candidate_hash; + let candidate_index = state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + let available_data = - state.candidates.get(&outgoing_request.payload.candidate_hash).unwrap().clone(); + state.available_data.get(*candidate_index as usize).unwrap().clone(); + let size = available_data.encoded_size(); let response = if Self::random_error(state.config().error) { @@ -240,9 +318,17 @@ impl TestEnvironment { } .boxed(); + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. + let action = NetworkAction::new(validator_index, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + NetworkAction::new( validator_index, - future, + future_wrapper, size, // Generate a random latency based on configuration. Self::random_latency(state.config().latency.as_ref()), @@ -258,11 +344,12 @@ impl TestEnvironment { mut ctx: TestSubsystemContextHandle, mut state: TestState, mut network: NetworkEmulator, + ingress_tx: tokio::sync::mpsc::UnboundedSender, ) { loop { futures::select! 
{ message = ctx.recv().fuse() => { - gum::debug!(target: LOG_TARGET, ?message, "Env task received message"); + gum::trace!(target: LOG_TARGET, ?message, "Env task received message"); match message { AllMessages::NetworkBridgeTx( @@ -272,7 +359,9 @@ impl TestEnvironment { ) ) => { for request in requests { - let action = Self::respond_to_send_request(&mut state, request); + network.inc_sent(Self::request_size(&request)); + let action = Self::respond_to_send_request(&mut state, request, ingress_tx.clone()); + // Account for our node sending the request over the emulated network. network.submit_peer_action(action.index(), action); } }, @@ -287,7 +376,10 @@ impl TestEnvironment { AllMessages::AvailabilityStore( AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) ) => { - let chunk_size = state.chunks.get(&candidate_hash).unwrap()[0].encoded_size(); + let candidate_index = state.candidate_hashes.get(&candidate_hash).expect("candidate was generated previously; qed"); + gum::info!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk_size = state.chunks.get(*candidate_index as usize).unwrap()[0].encoded_size(); let _ = tx.send(Some(chunk_size)); } AllMessages::RuntimeApi(RuntimeApiMessage::Request( @@ -345,8 +437,8 @@ impl AvailabilityRecoverySubsystemInstance { ) -> (Self, TestSubsystemContextHandle) { let (context, virtual_overseer) = make_buffered_subsystem_context( spawn_task_handle.clone(), - 4096 * 4, - "availability-recovery", + 128, + "availability-recovery-subsystem", ); let (collation_req_receiver, req_cfg) = IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); @@ -378,8 +470,6 @@ impl AvailabilityRecoverySubsystemInstance { } } -const TIMEOUT: Duration = Duration::from_millis(300); - // We use this to bail out sending messages to the subsystem if it is overloaded such that // the time of flight is breaches 5s. // This should eventually be a test parameter. 
@@ -387,9 +477,13 @@ const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); use sp_keyring::Sr25519Keyring; -use crate::availability::network::NetworkAction; +use crate::availability::network::{ActionFuture, NetworkAction}; -use self::{configuration::PeerLatency, network::NetworkEmulator}; +use self::{ + configuration::PeerLatency, + network::{NetworkEmulator, RateLimit}, + test_env::TestEnvironmentMetrics, +}; #[derive(Clone)] pub struct TestState { @@ -398,15 +492,22 @@ pub struct TestState { validator_authority_id: Vec, // The test node validator index. validator_index: ValidatorIndex, - // Per core candidates receipts. - candidate_receipts: Vec, session_index: SessionIndex, - + pov_sizes: Cycle>, + // Generated candidate receipts to be used in the test + candidates: Cycle>, + candidates_generated: usize, + // Map from pov size to candidate index + pov_size_to_candidate: HashMap, + // Map from generated candidate hashes to candidate index in `available_data` + // and `chunks`. + candidate_hashes: HashMap, persisted_validation_data: PersistedValidationData, - /// A per size pov mapping to available data. 
- candidates: HashMap, - chunks: HashMap>, + candidate_receipts: Vec, + available_data: Vec, + chunks: Vec>, + /// Next candidate index in config: TestConfiguration, } @@ -415,10 +516,6 @@ impl TestState { &self.config } - fn candidate(&self, candidate_index: usize) -> CandidateReceipt { - self.candidate_receipts.get(candidate_index).unwrap().clone() - } - async fn respond_none_to_available_data_query( &self, tx: oneshot::Sender>, @@ -455,9 +552,17 @@ impl TestState { send_chunk: impl Fn(usize) -> bool, tx: oneshot::Sender>, ) { + gum::info!(target: LOG_TARGET, ?candidate_hash, "respond_to_query_all_request"); + + let candidate_index = self + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::info!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + let v = self .chunks - .get(&candidate_hash) + .get(*candidate_index as usize) .unwrap() .iter() .filter(|c| send_chunk(c.index.0 as usize)) @@ -467,6 +572,41 @@ impl TestState { let _ = tx.send(v); } + pub fn next_candidate(&mut self) -> Option { + let candidate = self.candidates.next(); + let candidate_hash = candidate.as_ref().unwrap().hash(); + gum::trace!(target: LOG_TARGET, "Next candidate selected {:?}", candidate_hash); + candidate + } + + /// Generate candidates to be used in the test. + pub fn generate_candidates(&mut self, count: usize) { + gum::info!(target: LOG_TARGET, "Pre-generating {} candidates.", count); + + // Generate all candidates + self.candidates = (0..count) + .map(|index| { + let pov_size = self.pov_sizes.next().expect("This is a cycle; qed"); + let candidate_index = *self + .pov_size_to_candidate + .get(&pov_size) + .expect("pov_size always exists; qed"); + let mut candidate_receipt = self.candidate_receipts[candidate_index].clone(); + + // Make it unique. 
+ candidate_receipt.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); + // Store the new candidate in the state + self.candidate_hashes.insert(candidate_receipt.hash(), candidate_index); + + gum::info!(target: LOG_TARGET, candidate_hash = ?candidate_receipt.hash(), "new candidate"); + + candidate_receipt + }) + .collect::>() + .into_iter() + .cycle(); + } + pub fn new(config: TestConfiguration) -> Self { let validators = (0..config.n_validators as u64) .into_iter() @@ -476,9 +616,10 @@ impl TestState { let validator_public = validator_pubkeys(&validators); let validator_authority_id = validator_authority_id(&validators); let validator_index = ValidatorIndex(0); + let mut chunks = Vec::new(); + let mut available_data = Vec::new(); + let mut candidate_receipts = Vec::new(); let mut pov_size_to_candidate = HashMap::new(); - let mut chunks = HashMap::new(); - let mut candidates = HashMap::new(); let session_index = 10; // we use it for all candidates. @@ -489,59 +630,54 @@ impl TestState { relay_parent_storage_root: Default::default(), }; - // Create initial candidate receipts - let mut candidate_receipts = config - .pov_sizes - .iter() - .map(|_index| dummy_candidate_receipt(dummy_hash())) - .collect::>(); - - for (index, pov_size) in config.pov_sizes.iter().enumerate() { - let candidate = &mut candidate_receipts[index]; - // a hack to make candidate unique. - candidate.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); - - // We reuse candidates of same size, to speed up the test startup. - let (erasure_root, available_data, new_chunks) = - pov_size_to_candidate.entry(pov_size).or_insert_with(|| { - let pov = PoV { block_data: BlockData(vec![index as u8; *pov_size]) }; - - let available_data = AvailableData { - validation_data: persisted_validation_data.clone(), - pov: Arc::new(pov), - }; + // For each unique pov we create a candidate receipt. 
+ for (index, pov_size) in config.pov_sizes.iter().cloned().unique().enumerate() { + gum::info!(target: LOG_TARGET, index, pov_size, "Generating template candidates"); - let (new_chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( - validators.len(), - &available_data, - |_, _| {}, - ); + let mut candidate_receipt = dummy_candidate_receipt(dummy_hash()); + let pov = PoV { block_data: BlockData(vec![index as u8; pov_size]) }; - candidate.descriptor.erasure_root = erasure_root; + let new_available_data = AvailableData { + validation_data: persisted_validation_data.clone(), + pov: Arc::new(pov), + }; - chunks.insert(candidate.hash(), new_chunks.clone()); - candidates.insert(candidate.hash(), available_data.clone()); + let (new_chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( + validators.len(), + &new_available_data, + |_, _| {}, + ); - (erasure_root, available_data, new_chunks) - }); + candidate_receipt.descriptor.erasure_root = erasure_root; - candidate.descriptor.erasure_root = *erasure_root; - candidates.insert(candidate.hash(), available_data.clone()); - chunks.insert(candidate.hash(), new_chunks.clone()); + chunks.push(new_chunks); + available_data.push(new_available_data); + pov_size_to_candidate.insert(pov_size, index); + candidate_receipts.push(candidate_receipt); } - Self { + let pov_sizes = config.pov_sizes.clone().into_iter().cycle(); + let mut state = Self { validators, validator_public, validator_authority_id, validator_index, - candidate_receipts, session_index, persisted_validation_data, - candidates, + available_data, + candidate_receipts, chunks, config, - } + pov_size_to_candidate, + pov_sizes, + candidates_generated: 0, + candidate_hashes: HashMap::new(), + candidates: Vec::new().into_iter().cycle(), + }; + + gum::info!(target: LOG_TARGET, "Created test environment."); + + state } } @@ -593,12 +729,19 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let mut batch = FuturesUnordered::new(); let mut 
availability_bytes = 0u128; + env.metrics().set_n_validators(config.n_validators); + env.metrics().set_n_cores(config.n_cores); + env.metrics().set_pov_size(config.pov_sizes[0]); + let mut completed_count = 0; + for loop_num in 0..env.config().num_loops { gum::info!(target: LOG_TARGET, loop_num, "Starting loop"); + env.metrics().set_current_loop(loop_num); + let loop_start_ts = Instant::now(); for candidate_num in 0..config.n_cores as u64 { - let candidate = env.state.candidate(candidate_num as usize); - + let candidate = + env.state.next_candidate().expect("We always send up to n_cores*num_loops; qed"); let (tx, rx) = oneshot::channel(); batch.push(rx); @@ -609,32 +752,40 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { tx, )) .await; - - // // TODO: select between futures unordered of rx await and timer to send next request. - // if batch.len() >= config.max_parallel_recoveries { - // for rx in std::mem::take(&mut batch) { - // let available_data = rx.await.unwrap().unwrap(); - // availability_bytes += available_data.encoded_size() as u128; - // } - // } } + gum::info!("{} requests pending, {} completed", batch.len(), completed_count); while let Some(completed) = batch.next().await { let available_data = completed.unwrap().unwrap(); availability_bytes += available_data.encoded_size() as u128; } + + let block_time_delta = + Duration::from_secs(6).saturating_sub(Instant::now().sub(loop_start_ts)); + gum::info!(target: LOG_TARGET, "Sleeping till end of block {}ms", block_time_delta.as_millis()); + tokio::time::sleep(block_time_delta).await; } - println!("Waiting for subsystem to complete work... 
{} requests ", batch.len()); env.send_signal(OverseerSignal::Conclude).await; let duration = start_marker.elapsed().as_millis(); - let tput = ((availability_bytes) / duration) * 1000; - println!("Benchmark completed in {:?}ms", duration); - println!("Throughput: {}KiB/s", tput / 1024); + let availability_bytes = availability_bytes / 1024; + gum::info!("Benchmark completed in {:?}ms", duration); + gum::info!("Throughput: {} KiB/block", availability_bytes / env.config().num_loops as u128); + gum::info!( + "Block time: {} ms", + start_marker.elapsed().as_millis() / env.config().num_loops as u128 + ); + + let stats = env.network.stats(); + gum::info!( + "Total received from network: {} MiB", + stats + .iter() + .enumerate() + .map(|(index, stats)| stats.tx_bytes_total as u128) + .sum::() / + (1024 * 1024) + ); - let stats = env.network.stats().await; - for (index, stat) in stats.iter().enumerate() { - println!("Validator #{} : {:?}", index, stat); - } tokio::time::sleep(Duration::from_secs(1)).await; } diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/availability/network.rs index 02af817e691fd..948fbae445e1b 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/availability/network.rs @@ -20,7 +20,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use tokio::sync::mpsc::UnboundedSender; // An emulated node egress traffic rate_limiter. #[derive(Debug)] -struct RateLimit { +pub struct RateLimit { // How often we refill credits in buckets tick_rate: usize, // Total ticks @@ -142,9 +142,8 @@ impl PeerEmulator { spawn_task_handle .clone() .spawn("peer-emulator", "test-environment", async move { - let mut rate_limiter = RateLimit::new(20, bandwidth); - let rx_bytes_total = 0; - let mut tx_bytes_total = 0u128; + // Rate limit peer send. 
+ let mut rate_limiter = RateLimit::new(10, bandwidth); loop { let stats_clone = stats.clone(); let maybe_action: Option = actions_rx.recv().await; @@ -158,14 +157,12 @@ impl PeerEmulator { async move { tokio::time::sleep(latency).await; action.run().await; - stats_clone - .tx_bytes_total - .fetch_add(size as u64, Ordering::Relaxed); + stats_clone.inc_sent(size); }, ) } else { action.run().await; - stats_clone.tx_bytes_total.fetch_add(size as u64, Ordering::Relaxed); + stats_clone.inc_sent(size); } } else { break @@ -195,11 +192,43 @@ pub struct NetworkAction { latency: Option, } +unsafe impl Send for NetworkAction {} + /// Book keeping of sent and received bytes. -#[derive(Debug, Default)] pub struct PeerEmulatorStats { - pub rx_bytes_total: AtomicU64, - pub tx_bytes_total: AtomicU64, + rx_bytes_total: AtomicU64, + tx_bytes_total: AtomicU64, + metrics: Metrics, + peer_index: usize, +} + +impl PeerEmulatorStats { + pub(crate) fn new(peer_index: usize, metrics: Metrics) -> Self { + Self { + metrics, + rx_bytes_total: AtomicU64::from(0), + tx_bytes_total: AtomicU64::from(0), + peer_index, + } + } + + pub fn inc_sent(&self, bytes: usize) { + self.tx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); + self.metrics.on_peer_sent(self.peer_index, bytes as u64); + } + + pub fn inc_received(&self, bytes: usize) { + self.rx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); + self.metrics.on_peer_received(self.peer_index, bytes as u64); + } + + pub fn sent(&self) -> u64 { + self.tx_bytes_total.load(Ordering::Relaxed) + } + + pub fn received(&self) -> u64 { + self.rx_bytes_total.load(Ordering::Relaxed) + } } #[derive(Debug, Default)] @@ -229,21 +258,31 @@ impl NetworkAction { // Implements network latency, bandwidth and error. #[derive(Clone)] pub struct NetworkEmulator { - // Per peer network emulation + // Per peer network emulation. peers: Vec, + // Per peer stats. 
stats: Vec>, + // Metrics + metrics: Metrics, } impl NetworkEmulator { - pub fn new(n_peers: usize, bandwidth: usize, spawn_task_handle: SpawnTaskHandle) -> Self { + pub fn new( + n_peers: usize, + bandwidth: usize, + spawn_task_handle: SpawnTaskHandle, + registry: &Registry, + ) -> Self { + let metrics = Metrics::new(®istry).expect("Metrics always register succesfully"); + let (stats, peers) = (0..n_peers) - .map(|_index| { - let stats = Arc::new(PeerEmulatorStats::default()); + .map(|peer_index| { + let stats = Arc::new(PeerEmulatorStats::new(peer_index, metrics.clone())); (stats.clone(), PeerEmulator::new(bandwidth, spawn_task_handle.clone(), stats)) }) .unzip(); - Self { peers, stats } + Self { peers, stats, metrics } } pub fn submit_peer_action(&mut self, index: usize, action: NetworkAction) { @@ -251,15 +290,87 @@ impl NetworkEmulator { } // Returns the sent/received stats for all peers. - pub async fn stats(&mut self) -> Vec { + pub fn peer_stats(&mut self, peer_index: usize) -> Arc { + self.stats[peer_index].clone() + } + + // Returns the sent/received stats for all peers. + pub fn stats(&mut self) -> Vec { let r = self .stats .iter() .map(|stats| PeerStats { - rx_bytes_total: stats.rx_bytes_total.load(Ordering::Relaxed), - tx_bytes_total: stats.tx_bytes_total.load(Ordering::Relaxed), + rx_bytes_total: stats.received(), + tx_bytes_total: stats.sent(), }) .collect::>(); r } + + // Increment bytes sent by our node (the node that contains the subsystem under test) + pub fn inc_sent(&self, bytes: u64) { + // Our node always is peer 0. + self.metrics.on_peer_sent(0, bytes); + } + + // Increment bytes received by our node (the node that contains the subsystem under test) + pub fn inc_received(&self, bytes: u64) { + // Our node always is peer 0. 
+ self.metrics.on_peer_received(0, bytes); + } +} + +use polkadot_node_subsystem_util::metrics::{ + self, + prometheus::{self, Counter, CounterVec, Histogram, Opts, PrometheusError, Registry}, +}; + +/// Emulated network metrics. +#[derive(Clone)] +pub(crate) struct Metrics { + /// Number of bytes sent per peer. + peer_total_sent: CounterVec, + /// Number of received sent per peer. + peer_total_received: CounterVec, +} + +impl Metrics { + pub fn new(registry: &Registry) -> Result { + Ok(Self { + peer_total_sent: prometheus::register( + CounterVec::new( + Opts::new( + "subsystem_benchmark_network_peer_total_bytes_sent", + "Total number of bytes a peer has sent.", + ), + &["peer"], + )?, + registry, + )?, + peer_total_received: prometheus::register( + CounterVec::new( + Opts::new( + "subsystem_benchmark_network_peer_total_bytes_received", + "Total number of bytes a peer has received.", + ), + &["peer"], + )?, + registry, + )?, + }) + } + + /// Increment total sent for a peer. + pub fn on_peer_sent(&self, peer_index: usize, bytes: u64) { + self.peer_total_sent + .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) + .inc_by(bytes); + } + + /// Increment total receioved for a peer. + pub fn on_peer_received(&self, peer_index: usize, bytes: u64) { + self.peer_total_received + .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) + .inc_by(bytes); + } } diff --git a/polkadot/node/subsystem-bench/src/availability/test_env.rs b/polkadot/node/subsystem-bench/src/availability/test_env.rs new file mode 100644 index 0000000000000..f67c132f4eb47 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/test_env.rs @@ -0,0 +1,63 @@ +use super::*; +use polkadot_node_subsystem_util::metrics::{ + self, + prometheus::{self, Counter, Gauge, Histogram, Opts, PrometheusError, Registry, U64}, +}; + +/// Test environment/configuration metrics +#[derive(Clone)] +pub struct TestEnvironmentMetrics { + /// Number of bytes sent per peer. 
+ n_validators: Gauge, + /// Number of received sent per peer. + n_cores: Gauge, + /// PoV size + pov_size: Gauge, + /// Current loop + current_loop: Gauge, +} + +impl TestEnvironmentMetrics { + pub fn new(registry: &Registry) -> Result { + Ok(Self { + n_validators: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_validators", + "Total number of validators in the test", + )?, + registry, + )?, + n_cores: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_cores", + "Number of cores we fetch availability for each loop", + )?, + registry, + )?, + pov_size: prometheus::register( + Gauge::new("subsystem_benchmark_pov_size", "The pov size")?, + registry, + )?, + current_loop: prometheus::register( + Gauge::new("subsystem_benchmark_current_loop", "The current test loop")?, + registry, + )?, + }) + } + + pub fn set_n_validators(&self, n_validators: usize) { + self.n_validators.set(n_validators as u64); + } + + pub fn set_n_cores(&self, n_cores: usize) { + self.n_cores.set(n_cores as u64); + } + + pub fn set_current_loop(&self, current_loop: usize) { + self.current_loop.set(current_loop as u64); + } + + pub fn set_pov_size(&self, pov_size: usize) { + self.pov_size.set(pov_size as u64); + } +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index ba66d06fe320d..bdd8d93313bb0 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -90,6 +90,10 @@ struct BenchCli { /// The bandwidth of simulated remote peers in KiB pub peer_bandwidth: Option, + #[clap(short, long)] + /// The bandwidth of our simulated node in KiB + pub bandwidth: Option, + #[clap(long, value_parser=le_100)] /// Simulated connection error rate [0-100]. 
pub peer_error: Option, @@ -124,10 +128,9 @@ impl BenchCli { let runtime = new_runtime(); let registry = Registry::new(); - let registry_clone = registry.clone(); let mut pov_sizes = Vec::new(); - pov_sizes.append(&mut vec![5 * 1024 * 1024; 200]); + pov_sizes.append(&mut vec![10 * 1024 * 1024; 200]); let mut test_config = match self.target { BenchmarkTarget::DataAvailabilityRead(options) => match self.network { @@ -170,14 +173,20 @@ impl BenchCli { } if let Some(bandwidth) = self.peer_bandwidth { + // CLI expects bw in KiB + test_config.peer_bandwidth = bandwidth * 1024; + } + + if let Some(bandwidth) = self.bandwidth { // CLI expects bw in KiB test_config.bandwidth = bandwidth * 1024; } - let state = TestState::new(test_config); - let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); + let candidate_count = test_config.n_cores * test_config.num_loops; - let runtime_handle = runtime.handle().clone(); + let mut state = TestState::new(test_config); + state.generate_candidates(candidate_count); + let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); println!("{:?}", env.config()); @@ -230,9 +239,9 @@ impl BenchCli { fn main() -> eyre::Result<()> { color_eyre::install()?; let _ = env_logger::builder() - .is_test(true) - .filter(Some(LOG_TARGET), log::LevelFilter::Info) - .try_init(); + .filter(Some("hyper"), log::LevelFilter::Info) + .try_init() + .unwrap(); let cli: BenchCli = BenchCli::parse(); cli.launch()?; From d1b9fa39aaa98cf7e20b2108399a887780255d3b Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 14 Nov 2023 12:20:24 +0200 Subject: [PATCH 13/45] refactor Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + polkadot/node/subsystem-bench/Cargo.toml | 1 + .../subsystem-bench/src/availability/mod.rs | 105 ++++++++++-------- .../src/availability/test_env.rs | 63 ----------- .../node/subsystem-bench/src/core/display.rs | 15 +++ .../node/subsystem-bench/src/core/keyring.rs | 46 ++++++++ 
polkadot/node/subsystem-bench/src/core/mod.rs | 80 +++++++++++++ .../src/{availability => core}/network.rs | 48 +++++--- .../node/subsystem-bench/src/core/test_env.rs | 102 +++++++++++++++++ .../subsystem-bench/src/subsystem-bench.rs | 41 +++++-- 10 files changed, 371 insertions(+), 131 deletions(-) delete mode 100644 polkadot/node/subsystem-bench/src/availability/test_env.rs create mode 100644 polkadot/node/subsystem-bench/src/core/display.rs create mode 100644 polkadot/node/subsystem-bench/src/core/keyring.rs create mode 100644 polkadot/node/subsystem-bench/src/core/mod.rs rename polkadot/node/subsystem-bench/src/{availability => core}/network.rs (86%) create mode 100644 polkadot/node/subsystem-bench/src/core/test_env.rs diff --git a/Cargo.lock b/Cargo.lock index ee80ffb6e8157..9e93536d4f327 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13028,6 +13028,7 @@ dependencies = [ "polkadot-primitives-test-helpers", "prometheus", "rand 0.8.5", + "sc-keystore", "sc-network", "sc-service", "sp-application-crypto", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index c5d62d3aa74f2..72c8c3ac3c4d8 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -27,6 +27,7 @@ color-eyre = { version = "0.6.1", default-features = false } assert_matches = "1.5" async-trait = "0.1.57" sp-keystore = { path = "../../../substrate/primitives/keystore" } +sc-keystore = { path = "../../../substrate/client/keystore" } sp-core = { path = "../../../substrate/primitives/core" } clap = { version = "4.4.6", features = ["derive"] } futures = "0.3.21" diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 4f821f8199089..5f856ec1780fa 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -22,6 +22,10 @@ use std::{ time::{Duration, Instant}, }; +use 
sc_keystore::LocalKeystore; +use sp_application_crypto::AppCrypto; +use sp_keystore::{Keystore, KeystorePtr}; + use futures::{ channel::{mpsc, oneshot}, stream::FuturesUnordered, @@ -53,7 +57,7 @@ use polkadot_node_subsystem::{ }; use std::net::{Ipv4Addr, SocketAddr}; -mod test_env; +use super::core::{keyring::Keyring, network::*, test_env::TestEnvironmentMetrics}; const LOG_TARGET: &str = "subsystem-bench::availability"; @@ -71,9 +75,8 @@ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; mod configuration; -mod network; -pub use configuration::TestConfiguration; +pub use configuration::{PeerLatency, TestConfiguration}; // Deterministic genesis hash for protocol names const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); @@ -140,10 +143,12 @@ impl TestEnvironment { task_manager.spawn_handle(), state.config().use_fast_path, ); + let metrics = TestEnvironmentMetrics::new(®istry).expect("Metrics need to be registered"); let mut network = NetworkEmulator::new( state.config().n_validators, + state.validator_authority_id.clone(), state.config().peer_bandwidth, task_manager.spawn_handle(), ®istry, @@ -243,7 +248,7 @@ impl TestEnvironment { ) -> NetworkAction { match request { Requests::ChunkFetchingV1(outgoing_request) => { - let validator_index = outgoing_request.payload.index.0 as usize; + let validator_index: usize = outgoing_request.payload.index.0 as usize; let candidate_hash = outgoing_request.payload.candidate_hash; let candidate_index = state @@ -266,6 +271,12 @@ impl TestEnvironment { Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) }; + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => panic!("Peer recipient not supported yet"), + }; + let authority_discovery_id_clone = authority_discovery_id.clone(); + let future = async move { let _ = 
outgoing_request.pending_response.send(response); } @@ -274,13 +285,14 @@ impl TestEnvironment { let future_wrapper = async move { // Forward the response to the ingress channel of our node. // On receive side we apply our node receiving rate limit. - let action = NetworkAction::new(validator_index, future, size, None); + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); ingress_tx.send(action).unwrap(); } .boxed(); NetworkAction::new( - validator_index, + authority_discovery_id, future_wrapper, size, // Generate a random latency based on configuration. @@ -288,12 +300,6 @@ impl TestEnvironment { ) }, Requests::AvailableDataFetchingV1(outgoing_request) => { - println!("{:?}", outgoing_request); - // TODO: do better, by implementing diff authority ids and mapping network actions - // to authority id, - let validator_index = - Uniform::from(0..state.config().n_validators).sample(&mut thread_rng()); - let candidate_hash = outgoing_request.payload.candidate_hash; let candidate_index = state .candidate_hashes @@ -318,16 +324,23 @@ impl TestEnvironment { } .boxed(); + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => panic!("Peer recipient not supported yet"), + }; + let authority_discovery_id_clone = authority_discovery_id.clone(); + let future_wrapper = async move { // Forward the response to the ingress channel of our node. // On receive side we apply our node receiving rate limit. - let action = NetworkAction::new(validator_index, future, size, None); + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); ingress_tx.send(action).unwrap(); } .boxed(); NetworkAction::new( - validator_index, + authority_discovery_id, future_wrapper, size, // Generate a random latency based on configuration. 
@@ -362,7 +375,7 @@ impl TestEnvironment { network.inc_sent(Self::request_size(&request)); let action = Self::respond_to_send_request(&mut state, request, ingress_tx.clone()); // Account for our node sending the request over the emulated network. - network.submit_peer_action(action.index(), action); + network.submit_peer_action(action.peer(), action); } }, AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx)) => { @@ -470,25 +483,24 @@ impl AvailabilityRecoverySubsystemInstance { } } +pub fn random_pov_size(min_pov_size: usize, max_pov_size: usize) -> usize { + random_uniform_sample(min_pov_size, max_pov_size) +} + +fn random_uniform_sample + From>(min_value: T, max_value: T) -> T { + Uniform::from(min_value.into()..=max_value.into()) + .sample(&mut thread_rng()) + .into() +} + // We use this to bail out sending messages to the subsystem if it is overloaded such that // the time of flight is breaches 5s. // This should eventually be a test parameter. const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); -use sp_keyring::Sr25519Keyring; - -use crate::availability::network::{ActionFuture, NetworkAction}; - -use self::{ - configuration::PeerLatency, - network::{NetworkEmulator, RateLimit}, - test_env::TestEnvironmentMetrics, -}; - #[derive(Clone)] pub struct TestState { - validators: Vec, - validator_public: IndexedVec, + validator_public: Vec, validator_authority_id: Vec, // The test node validator index. 
validator_index: ValidatorIndex, @@ -531,7 +543,7 @@ impl TestState { let validator_groups = my_vec.chunks(5).map(|x| Vec::from(x)).collect::>(); SessionInfo { - validators: self.validator_public.clone(), + validators: self.validator_public.clone().into(), discovery_keys: self.validator_authority_id.clone(), validator_groups: IndexedVec::>::from(validator_groups), assignment_keys: vec![], @@ -608,13 +620,24 @@ impl TestState { } pub fn new(config: TestConfiguration) -> Self { - let validators = (0..config.n_validators as u64) - .into_iter() - .map(|_v| Sr25519Keyring::Alice) + let keystore: KeystorePtr = Arc::new(LocalKeystore::in_memory()); + + let keyrings = (0..config.n_validators) + .map(|peer_index| Keyring::new(format!("Node{}", peer_index).into())) .collect::>(); - let validator_public = validator_pubkeys(&validators); - let validator_authority_id = validator_authority_id(&validators); + // Generate `AuthorityDiscoveryId`` for each peer + let validator_public: Vec = keyrings + .iter() + .map(|keyring: &Keyring| keyring.clone().public().into()) + .collect::>(); + + let validator_authority_id: Vec = keyrings + .iter() + .map({ |keyring| keyring.clone().public().into() }) + .collect::>() + .into(); + let validator_index = ValidatorIndex(0); let mut chunks = Vec::new(); let mut available_data = Vec::new(); @@ -643,7 +666,7 @@ impl TestState { }; let (new_chunks, erasure_root) = derive_erasure_chunks_with_proofs_and_root( - validators.len(), + config.n_validators, &new_available_data, |_, _| {}, ); @@ -658,7 +681,6 @@ impl TestState { let pov_sizes = config.pov_sizes.clone().into_iter().cycle(); let mut state = Self { - validators, validator_public, validator_authority_id, validator_index, @@ -681,14 +703,6 @@ impl TestState { } } -fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> IndexedVec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| 
v.public().into()).collect() -} - fn derive_erasure_chunks_with_proofs_and_root( n_validators: usize, available_data: &AvailableData, @@ -731,8 +745,6 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { env.metrics().set_n_validators(config.n_validators); env.metrics().set_n_cores(config.n_cores); - env.metrics().set_pov_size(config.pov_sizes[0]); - let mut completed_count = 0; for loop_num in 0..env.config().num_loops { gum::info!(target: LOG_TARGET, loop_num, "Starting loop"); @@ -754,9 +766,10 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { .await; } - gum::info!("{} requests pending, {} completed", batch.len(), completed_count); + gum::info!("{} requests pending", batch.len()); while let Some(completed) = batch.next().await { let available_data = completed.unwrap().unwrap(); + env.metrics().on_pov_size(available_data.encoded_size()); availability_bytes += available_data.encoded_size() as u128; } diff --git a/polkadot/node/subsystem-bench/src/availability/test_env.rs b/polkadot/node/subsystem-bench/src/availability/test_env.rs deleted file mode 100644 index f67c132f4eb47..0000000000000 --- a/polkadot/node/subsystem-bench/src/availability/test_env.rs +++ /dev/null @@ -1,63 +0,0 @@ -use super::*; -use polkadot_node_subsystem_util::metrics::{ - self, - prometheus::{self, Counter, Gauge, Histogram, Opts, PrometheusError, Registry, U64}, -}; - -/// Test environment/configuration metrics -#[derive(Clone)] -pub struct TestEnvironmentMetrics { - /// Number of bytes sent per peer. - n_validators: Gauge, - /// Number of received sent per peer. 
- n_cores: Gauge, - /// PoV size - pov_size: Gauge, - /// Current loop - current_loop: Gauge, -} - -impl TestEnvironmentMetrics { - pub fn new(registry: &Registry) -> Result { - Ok(Self { - n_validators: prometheus::register( - Gauge::new( - "subsystem_benchmark_n_validators", - "Total number of validators in the test", - )?, - registry, - )?, - n_cores: prometheus::register( - Gauge::new( - "subsystem_benchmark_n_cores", - "Number of cores we fetch availability for each loop", - )?, - registry, - )?, - pov_size: prometheus::register( - Gauge::new("subsystem_benchmark_pov_size", "The pov size")?, - registry, - )?, - current_loop: prometheus::register( - Gauge::new("subsystem_benchmark_current_loop", "The current test loop")?, - registry, - )?, - }) - } - - pub fn set_n_validators(&self, n_validators: usize) { - self.n_validators.set(n_validators as u64); - } - - pub fn set_n_cores(&self, n_cores: usize) { - self.n_cores.set(n_cores as u64); - } - - pub fn set_current_loop(&self, current_loop: usize) { - self.current_loop.set(current_loop as u64); - } - - pub fn set_pov_size(&self, pov_size: usize) { - self.pov_size.set(pov_size as u64); - } -} diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs new file mode 100644 index 0000000000000..47483d33a42a9 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -0,0 +1,15 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . diff --git a/polkadot/node/subsystem-bench/src/core/keyring.rs b/polkadot/node/subsystem-bench/src/core/keyring.rs new file mode 100644 index 0000000000000..40e8d60d0cd1e --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/keyring.rs @@ -0,0 +1,46 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +pub use sp_core::sr25519; +use sp_core::{ + sr25519::{Pair, Public, Signature}, + ByteArray, Pair as PairT, H256, +}; +use std::{collections::HashMap, ops::Deref}; + +/// Set of test accounts. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Keyring { + name: String, +} + +impl Keyring { + pub fn new(name: String) -> Keyring { + Self { name } + } + + pub fn pair(self) -> Pair { + Pair::from_string(&format!("//{}", self.name), None).expect("input is always good; qed") + } + + pub fn public(self) -> Public { + self.pair().public() + } + + pub fn to_seed(self) -> String { + format!("//{}", self.name) + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs new file mode 100644 index 0000000000000..4b9db3144f54a --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -0,0 +1,80 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . 
+ +use itertools::Itertools; +use std::{ + collections::HashMap, + iter::Cycle, + ops::{Div, Sub}, + sync::Arc, + time::{Duration, Instant}, +}; + +use sc_keystore::LocalKeystore; +use sp_application_crypto::AppCrypto; +use sp_keystore::{Keystore, KeystorePtr}; + +use futures::{ + channel::{mpsc, oneshot}, + stream::FuturesUnordered, + FutureExt, SinkExt, StreamExt, +}; +use futures_timer::Delay; + +use polkadot_node_metrics::metrics::Metrics; + +use polkadot_availability_recovery::AvailabilityRecoverySubsystem; + +use parity_scale_codec::Encode; +use polkadot_node_network_protocol::request_response::{ + self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, +}; +use rand::{distributions::Uniform, prelude::Distribution, seq::IteratorRandom, thread_rng}; + +use prometheus::Registry; +use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; + +use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; +use polkadot_node_primitives::{BlockData, PoV, Proof}; +use polkadot_node_subsystem::{ + messages::{ + AllMessages, AvailabilityRecoveryMessage, AvailabilityStoreMessage, NetworkBridgeTxMessage, + RuntimeApiMessage, RuntimeApiRequest, + }, + ActiveLeavesUpdate, FromOrchestra, OverseerSignal, Subsystem, +}; +use std::net::{Ipv4Addr, SocketAddr}; + +use super::core::{keyring::Keyring, network::*, test_env::TestEnvironmentMetrics}; + +const LOG_TARGET: &str = "subsystem-bench::core"; + +use polkadot_node_primitives::{AvailableData, ErasureChunk}; + +use polkadot_node_subsystem_test_helpers::{ + make_buffered_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, +}; +use polkadot_node_subsystem_util::TimeoutExt; +use polkadot_primitives::{ + AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, + PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, +}; +use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; 
+use sc_service::{SpawnTaskHandle, TaskManager}; + +pub mod keyring; +pub mod network; +pub mod test_env; diff --git a/polkadot/node/subsystem-bench/src/availability/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs similarity index 86% rename from polkadot/node/subsystem-bench/src/availability/network.rs rename to polkadot/node/subsystem-bench/src/core/network.rs index 948fbae445e1b..170ab45e35a39 100644 --- a/polkadot/node/subsystem-bench/src/availability/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -124,7 +124,9 @@ mod tests { } } -// A network peer emulator +// A network peer emulator. It spawns a task that accepts `NetworkActions` and +// executes them with a configurable delay and bandwidth constraints. Tipically +// these actions wrap a future that performs a channel send to the subsystem(s) under test. #[derive(Clone)] struct PeerEmulator { // The queue of requests waiting to be served by the emulator @@ -186,8 +188,8 @@ pub struct NetworkAction { run: ActionFuture, // The payload size that we simulate sending from a peer size: usize, - // Peer index - index: usize, + // Peer which should run the action. + peer: AuthorityDiscoveryId, // The amount of time to delay the polling `run` latency: Option, } @@ -237,8 +239,13 @@ pub struct PeerStats { pub tx_bytes_total: u64, } impl NetworkAction { - pub fn new(index: usize, run: ActionFuture, size: usize, latency: Option) -> Self { - Self { run, size, index, latency } + pub fn new( + peer: AuthorityDiscoveryId, + run: ActionFuture, + size: usize, + latency: Option, + ) -> Self { + Self { run, size, peer, latency } } pub fn size(&self) -> usize { @@ -249,44 +256,55 @@ impl NetworkAction { self.run.await; } - pub fn index(&self) -> usize { - self.index + pub fn peer(&self) -> AuthorityDiscoveryId { + self.peer.clone() } } -// Mocks the network bridge and an arbitrary number of connected peer nodes. -// Implements network latency, bandwidth and error. 
+/// Mocks the network bridge and an arbitrary number of connected peer nodes. +/// Implements network latency, bandwidth and connection errors. #[derive(Clone)] pub struct NetworkEmulator { // Per peer network emulation. peers: Vec, - // Per peer stats. + /// Per peer stats. stats: Vec>, - // Metrics + /// Network throughput metrics metrics: Metrics, + /// Each emulated peer is a validator. + validator_authority_ids: HashMap, } impl NetworkEmulator { pub fn new( n_peers: usize, + validator_authority_ids: Vec, bandwidth: usize, spawn_task_handle: SpawnTaskHandle, registry: &Registry, ) -> Self { let metrics = Metrics::new(®istry).expect("Metrics always register succesfully"); + let mut validator_authority_id_mapping = HashMap::new(); + // Create a `PeerEmulator` for each peer. let (stats, peers) = (0..n_peers) - .map(|peer_index| { + .zip(validator_authority_ids.into_iter()) + .map(|(peer_index, authority_id)| { + validator_authority_id_mapping.insert(authority_id, peer_index); let stats = Arc::new(PeerEmulatorStats::new(peer_index, metrics.clone())); (stats.clone(), PeerEmulator::new(bandwidth, spawn_task_handle.clone(), stats)) }) .unzip(); - Self { peers, stats, metrics } + Self { peers, stats, metrics, validator_authority_ids: validator_authority_id_mapping } } - pub fn submit_peer_action(&mut self, index: usize, action: NetworkAction) { - let _ = self.peers[index].send(action); + pub fn submit_peer_action(&mut self, peer: AuthorityDiscoveryId, action: NetworkAction) { + let index = self + .validator_authority_ids + .get(&peer) + .expect("all test authorities are valid; qed"); + self.peers[*index].send(action); } // Returns the sent/received stats for all peers. 
diff --git a/polkadot/node/subsystem-bench/src/core/test_env.rs b/polkadot/node/subsystem-bench/src/core/test_env.rs new file mode 100644 index 0000000000000..c20b96d642afe --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/test_env.rs @@ -0,0 +1,102 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::*; +use polkadot_node_subsystem_util::metrics::{ + self, + prometheus::{ + self, Counter, Gauge, Histogram, HistogramVec, Opts, PrometheusError, Registry, U64, + }, +}; + +const MIB: f64 = 1024.0*1024.0; + +/// Test environment/configuration metrics +#[derive(Clone)] +pub struct TestEnvironmentMetrics { + /// Number of bytes sent per peer. + n_validators: Gauge, + /// Number of received sent per peer. 
+ n_cores: Gauge, + /// PoV size + pov_size: Histogram, + /// Current loop + current_loop: Gauge, +} + +impl TestEnvironmentMetrics { + pub fn new(registry: &Registry) -> Result { + let mut buckets = prometheus::exponential_buckets(16384.0, 2.0, 9) + .expect("arguments are always valid; qed"); + buckets.extend(vec![ + 5.0 * MIB, + 6.0 * MIB, + 7.0 * MIB, + 8.0 * MIB, + 9.0 * MIB, + 10.0 * MIB, + ]); + + Ok(Self { + n_validators: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_validators", + "Total number of validators in the test", + )?, + registry, + )?, + n_cores: prometheus::register( + Gauge::new( + "subsystem_benchmark_n_cores", + "Number of cores we fetch availability for each loop", + )?, + registry, + )?, + current_loop: prometheus::register( + Gauge::new("subsystem_benchmark_current_loop", "The current test loop")?, + registry, + )?, + pov_size: prometheus::register( + Histogram::with_opts( + prometheus::HistogramOpts::new( + "subsystem_benchmark_pov_size", + "The compressed size of the proof of validity of a candidate", + ) + .buckets( + buckets + ), + )?, + registry, + )?, + }) + } + + pub fn set_n_validators(&self, n_validators: usize) { + self.n_validators.set(n_validators as u64); + } + + pub fn set_n_cores(&self, n_cores: usize) { + self.n_cores.set(n_cores as u64); + } + + pub fn set_current_loop(&self, current_loop: usize) { + self.current_loop.set(current_loop as u64); + } + + pub fn on_pov_size(&self, pov_size: usize) { + self.pov_size.observe(pov_size as f64); + } +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index bdd8d93313bb0..9e581555d7614 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -22,8 +22,9 @@ use prometheus::proto::LabelPair; use std::time::Duration; pub(crate) mod availability; +pub(crate) mod core; -use availability::{TestConfiguration, TestEnvironment, 
TestState}; +use availability::{random_pov_size, TestConfiguration, TestEnvironment, TestState}; const LOG_TARGET: &str = "subsystem-bench"; use clap_num::number_range; @@ -62,6 +63,14 @@ pub struct DataAvailabilityReadOptions { /// Number of validators to fetch chunks from. pub n_validators: usize, + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The minimum pov size in KiB + pub min_pov_size: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The maximum pov size bytes + pub max_pov_size: usize, + #[clap(short, long, default_value_t = false)] /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes /// have enough bandwidth. @@ -129,9 +138,6 @@ impl BenchCli { let runtime = new_runtime(); let registry = Registry::new(); - let mut pov_sizes = Vec::new(); - pov_sizes.append(&mut vec![10 * 1024 * 1024; 200]); - let mut test_config = match self.target { BenchmarkTarget::DataAvailabilityRead(options) => match self.network { NetworkEmulation::Healthy => TestConfiguration::healthy_network( @@ -139,21 +145,42 @@ impl BenchCli { options.fetch_from_backers, options.n_validators, options.n_cores, - pov_sizes, + (0..options.n_cores) + .map(|_| { + random_pov_size( + options.min_pov_size * 1024, + options.max_pov_size * 1024, + ) + }) + .collect(), ), NetworkEmulation::Degraded => TestConfiguration::degraded_network( options.num_loops, options.fetch_from_backers, options.n_validators, options.n_cores, - pov_sizes, + (0..options.n_cores) + .map(|_| { + random_pov_size( + options.min_pov_size * 1024, + options.max_pov_size * 1024, + ) + }) + .collect(), ), NetworkEmulation::Ideal => TestConfiguration::ideal_network( options.num_loops, options.fetch_from_backers, options.n_validators, options.n_cores, - pov_sizes, + (0..options.n_cores) + .map(|_| { + random_pov_size( + options.min_pov_size * 1024, + options.max_pov_size * 1024, + ) + }) + .collect(), ), }, }; From 
c5937ab840c56a812f840332fdbb295b23c10823 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 15 Nov 2023 08:42:41 +0200 Subject: [PATCH 14/45] pretty cli + minor refactor + remove unused Signed-off-by: Andrei Sandu --- Cargo.lock | 61 +++- cumulus/pallets/xcmp-queue/src/bridging.rs | 4 +- polkadot/node/subsystem-bench/Cargo.toml | 3 + .../src/availability/configuration.rs | 83 +++++- .../subsystem-bench/src/availability/mod.rs | 120 ++++---- .../node/subsystem-bench/src/core/display.rs | 276 ++++++++++++++++++ .../node/subsystem-bench/src/core/keyring.rs | 10 +- polkadot/node/subsystem-bench/src/core/mod.rs | 56 +--- .../node/subsystem-bench/src/core/network.rs | 4 +- .../node/subsystem-bench/src/core/test_env.rs | 39 +-- .../subsystem-bench/src/subsystem-bench.rs | 99 +++---- .../node/subsystem-test-helpers/src/lib.rs | 7 + 12 files changed, 537 insertions(+), 225 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e93536d4f327..73fc3cbdeccc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2764,6 +2764,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "colored" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" +dependencies = [ + "is-terminal", + "lazy_static", + "windows-sys 0.48.0", +] + [[package]] name = "comfy-table" version = "7.0.1" @@ -8568,7 +8579,7 @@ dependencies = [ "itertools 0.10.5", "tar", "tempfile", - "toml_edit", + "toml_edit 0.19.14", ] [[package]] @@ -13009,6 +13020,7 @@ dependencies = [ "clap 4.4.6", "clap-num", "color-eyre", + "colored", "env_logger 0.9.3", "futures", "futures-timer", @@ -13031,12 +13043,14 @@ dependencies = [ "sc-keystore", "sc-network", "sc-service", + "serde", "sp-application-crypto", "sp-core", "sp-keyring", "sp-keystore", "substrate-prometheus-endpoint", "tokio", + 
"toml 0.8.8", "tracing-gum", ] @@ -13441,7 +13455,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" dependencies = [ "once_cell", - "toml_edit", + "toml_edit 0.19.14", ] [[package]] @@ -16276,18 +16290,18 @@ checksum = "f97841a747eef040fcd2e7b3b9a220a7205926e60488e673d9e4926d27772ce5" [[package]] name = "serde" -version = "1.0.188" +version = "1.0.192" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +checksum = "bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.188" +version = "1.0.192" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1" dependencies = [ "proc-macro2", "quote", @@ -16316,9 +16330,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96426c9936fd7a0124915f9185ea1d20aa9445cc9821142f0a73bc9207a2e186" +checksum = "12022b835073e5b11e90a14f86838ceb1c8fb0325b72416845c487ac0fa95e80" dependencies = [ "serde", ] @@ -18819,14 +18833,26 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_edit 0.19.14", +] + +[[package]] +name = "toml" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1a195ec8c9da26928f773888e0742ca3ca1040c6cd859c919c9f59c1954ab35" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit 0.21.0", ] [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" dependencies = [ "serde", ] @@ -18844,6 +18870,19 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_edit" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34d383cd00a163b4a5b85053df514d45bc330f6de7737edfe0a93311d1eaa03" +dependencies = [ + "indexmap 2.0.0", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.4.13" diff --git a/cumulus/pallets/xcmp-queue/src/bridging.rs b/cumulus/pallets/xcmp-queue/src/bridging.rs index 0fc3f1f39ea38..53238fe2bf7a9 100644 --- a/cumulus/pallets/xcmp-queue/src/bridging.rs +++ b/cumulus/pallets/xcmp-queue/src/bridging.rs @@ -55,7 +55,9 @@ impl, Runtime: crate::Config> let sibling_bridge_hub_id: ParaId = SiblingBridgeHubParaId::get(); // let's find the channel's state with the sibling parachain, - let Some((outbound_state, queued_pages)) = pallet::Pallet::::outbound_channel_state(sibling_bridge_hub_id) else { + let Some((outbound_state, queued_pages)) = + pallet::Pallet::::outbound_channel_state(sibling_bridge_hub_id) + else { return false }; // suspended channel => it is congested diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 72c8c3ac3c4d8..3308b6fe1052b 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -24,6 +24,7 @@ polkadot-primitives = { path = "../../primitives" } polkadot-node-network-protocol = { path = "../network/protocol" } polkadot-availability-recovery = { path = "../network/availability-recovery", features=["subsystem-benchmarks"]} color-eyre = { version = "0.6.1", default-features = false } +colored = "2.0.4" assert_matches = "1.5" async-trait = "0.1.57" sp-keystore = { path = "../../../substrate/primitives/keystore" } @@ -50,6 +51,8 @@ itertools = "0.11.0" 
polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } prometheus_endpoint = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } prometheus = { version = "0.13.0", default-features = false } +toml = "0.8.8" +serde = "1.0.192" [features] default = [] diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index cf142de06634c..2d29d23811da6 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -14,10 +14,12 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use super::*; +use std::path::Path; +use super::*; +use serde::{Deserialize,Serialize}; /// Peer response latency configuration. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct PeerLatency { /// Min latency for `NetworkAction` completion. pub min_latency: Duration, @@ -26,7 +28,7 @@ pub struct PeerLatency { } /// The test input parameters -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct TestConfiguration { /// Configuration for the `availability-recovery` subsystem. pub use_fast_path: bool, @@ -34,8 +36,13 @@ pub struct TestConfiguration { pub n_validators: usize, /// Number of cores pub n_cores: usize, - /// The PoV size - pub pov_sizes: Vec, + /// The min PoV size + pub min_pov_size: usize, + /// The max PoV size, + pub max_pov_size: usize, + /// Randomly sampled pov_sizes + #[serde(skip)] + pov_sizes: Vec, /// The amount of bandiwdth remote validators have. pub peer_bandwidth: usize, /// The amount of bandiwdth our node has. 
@@ -44,31 +51,72 @@ pub struct TestConfiguration { pub latency: Option, /// Error probability pub error: usize, - /// Number of loops - /// In one loop `n_cores` candidates are recovered - pub num_loops: usize, + /// Number of blocks + /// In one block `n_cores` candidates are recovered + pub num_blocks: usize, } + impl Default for TestConfiguration { fn default() -> Self { Self { use_fast_path: false, - n_validators: 10, + n_validators: 100, n_cores: 10, pov_sizes: vec![5 * 1024 * 1024], bandwidth: 60 * 1024 * 1024, peer_bandwidth: 60 * 1024 * 1024, latency: None, error: 0, - num_loops: 1, + num_blocks: 1, + min_pov_size: 5*1024*1024, + max_pov_size: 5*1024*1024, } } } +fn generate_pov_sizes(count: usize, min: usize, max: usize) -> Vec { + (0..count).map(|_| random_pov_size(min, max)).collect() +} + +#[derive(Serialize,Deserialize)] +pub struct TestSequence { + #[serde(rename(serialize = "TestConfiguration", deserialize = "TestConfiguration"))] + test_configurations: Vec +} + +impl TestSequence { + pub fn to_vec(mut self) -> Vec { + // Generate Pov sizes + + for config in self.test_configurations.iter_mut() { + config.pov_sizes = generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); + } + + self.test_configurations + } +} + +impl TestSequence { + pub fn new_from_file(path: &Path) -> std::io::Result { + let string = String::from_utf8(std::fs::read(&path)?).expect("File is valid UTF8"); + Ok(toml::from_str(&string).expect("File is valid test sequence TOML")) + } +} + impl TestConfiguration { + pub fn write_to_disk(&self) { + // Serialize a slice of configurations + let toml = toml::to_string(&TestSequence{ test_configurations: vec![self.clone()] }).unwrap(); + std::fs::write("last_test.toml", toml).unwrap(); + } + + pub fn pov_sizes(&self) -> &[usize] { + &self.pov_sizes + } /// An unconstrained standard configuration matching Polkadot/Kusama pub fn ideal_network( - num_loops: usize, + num_blocks: usize, use_fast_path: bool, n_validators: 
usize, n_cores: usize, @@ -84,12 +132,13 @@ impl TestConfiguration { // No latency latency: None, error: 0, - num_loops, + num_blocks, + ..Default::default() } } pub fn healthy_network( - num_loops: usize, + num_blocks: usize, use_fast_path: bool, n_validators: usize, n_cores: usize, @@ -107,12 +156,13 @@ impl TestConfiguration { max_latency: Duration::from_millis(100), }), error: 3, - num_loops, + num_blocks, + ..Default::default() } } pub fn degraded_network( - num_loops: usize, + num_blocks: usize, use_fast_path: bool, n_validators: usize, n_cores: usize, @@ -130,7 +180,8 @@ impl TestConfiguration { max_latency: Duration::from_millis(500), }), error: 33, - num_loops, + num_blocks, + ..Default::default() } } } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 5f856ec1780fa..2c9f3e735afb1 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -17,22 +17,18 @@ use itertools::Itertools; use std::{ collections::HashMap, iter::Cycle, - ops::{Div, Sub}, + ops::Sub, sync::Arc, time::{Duration, Instant}, }; -use sc_keystore::LocalKeystore; -use sp_application_crypto::AppCrypto; -use sp_keystore::{Keystore, KeystorePtr}; +use colored::Colorize; use futures::{ channel::{mpsc, oneshot}, stream::FuturesUnordered, FutureExt, SinkExt, StreamExt, }; -use futures_timer::Delay; - use polkadot_node_metrics::metrics::Metrics; use polkadot_availability_recovery::AvailabilityRecoverySubsystem; @@ -41,7 +37,7 @@ use parity_scale_codec::Encode; use polkadot_node_network_protocol::request_response::{ self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, }; -use rand::{distributions::Uniform, prelude::Distribution, seq::IteratorRandom, thread_rng}; +use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; use prometheus::Registry; use sc_network::{config::RequestResponseConfig, OutboundFailure, 
RequestFailure}; @@ -74,9 +70,9 @@ use polkadot_primitives::{ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; -mod configuration; +pub mod configuration; -pub use configuration::{PeerLatency, TestConfiguration}; +pub use configuration::{PeerLatency, TestConfiguration, TestSequence}; // Deterministic genesis hash for protocol names const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); @@ -205,8 +201,12 @@ impl TestEnvironment { self.state.config() } - pub fn network(&self) -> &NetworkEmulator { - &self.network + pub fn network(&mut self) -> &mut NetworkEmulator { + &mut self.network + } + + pub fn registry(&self) -> &Registry { + &self.registry } /// Produce a randomized duration between `min` and `max`. @@ -361,7 +361,14 @@ impl TestEnvironment { ) { loop { futures::select! { - message = ctx.recv().fuse() => { + maybe_message = ctx.maybe_recv().fuse() => { + let message = if let Some(message) = maybe_message{ + message + } else { + gum::info!("{}", "Test completed".bright_blue()); + return + }; + gum::trace!(target: LOG_TARGET, ?message, "Env task received message"); match message { @@ -390,7 +397,7 @@ impl TestEnvironment { AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) ) => { let candidate_index = state.candidate_hashes.get(&candidate_hash).expect("candidate was generated previously; qed"); - gum::info!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); let chunk_size = state.chunks.get(*candidate_index as usize).unwrap()[0].encoded_size(); let _ = tx.send(Some(chunk_size)); @@ -564,13 +571,11 @@ impl TestState { send_chunk: impl Fn(usize) -> bool, tx: oneshot::Sender>, ) { - gum::info!(target: LOG_TARGET, ?candidate_hash, "respond_to_query_all_request"); - let candidate_index = self .candidate_hashes .get(&candidate_hash) .expect("candidate was 
generated previously; qed"); - gum::info!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); let v = self .chunks @@ -593,7 +598,7 @@ impl TestState { /// Generate candidates to be used in the test. pub fn generate_candidates(&mut self, count: usize) { - gum::info!(target: LOG_TARGET, "Pre-generating {} candidates.", count); + gum::info!(target: LOG_TARGET,"{}", format!("Pre-generating {} candidates.", count).bright_blue()); // Generate all candidates self.candidates = (0..count) @@ -610,7 +615,7 @@ impl TestState { // Store the new candidate in the state self.candidate_hashes.insert(candidate_receipt.hash(), candidate_index); - gum::info!(target: LOG_TARGET, candidate_hash = ?candidate_receipt.hash(), "new candidate"); + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_receipt.hash(), "new candidate"); candidate_receipt }) @@ -620,8 +625,6 @@ impl TestState { } pub fn new(config: TestConfiguration) -> Self { - let keystore: KeystorePtr = Arc::new(LocalKeystore::in_memory()); - let keyrings = (0..config.n_validators) .map(|peer_index| Keyring::new(format!("Node{}", peer_index).into())) .collect::>(); @@ -634,7 +637,7 @@ impl TestState { let validator_authority_id: Vec = keyrings .iter() - .map({ |keyring| keyring.clone().public().into() }) + .map(|keyring| keyring.clone().public().into()) .collect::>() .into(); @@ -654,8 +657,8 @@ impl TestState { }; // For each unique pov we create a candidate receipt. 
- for (index, pov_size) in config.pov_sizes.iter().cloned().unique().enumerate() { - gum::info!(target: LOG_TARGET, index, pov_size, "Generating template candidates"); + for (index, pov_size) in config.pov_sizes().iter().cloned().unique().enumerate() { + gum::info!(target: LOG_TARGET, index, pov_size, "{}", "Generating template candidate".bright_blue()); let mut candidate_receipt = dummy_candidate_receipt(dummy_hash()); let pov = PoV { block_data: BlockData(vec![index as u8; pov_size]) }; @@ -679,8 +682,10 @@ impl TestState { candidate_receipts.push(candidate_receipt); } - let pov_sizes = config.pov_sizes.clone().into_iter().cycle(); - let mut state = Self { + let pov_sizes = config.pov_sizes().to_vec().into_iter().cycle(); + gum::info!(target: LOG_TARGET, "{}","Created test environment.".bright_blue()); + + Self { validator_public, validator_authority_id, validator_index, @@ -695,11 +700,7 @@ impl TestState { candidates_generated: 0, candidate_hashes: HashMap::new(), candidates: Vec::new().into_iter().cycle(), - }; - - gum::info!(target: LOG_TARGET, "Created test environment."); - - state + } } } @@ -746,27 +747,29 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { env.metrics().set_n_validators(config.n_validators); env.metrics().set_n_cores(config.n_cores); - for loop_num in 0..env.config().num_loops { - gum::info!(target: LOG_TARGET, loop_num, "Starting loop"); - env.metrics().set_current_loop(loop_num); + for block_num in 0..env.config().num_blocks { + gum::info!(target: LOG_TARGET, "Current block {}/{}", block_num, env.config().num_blocks); + env.metrics().set_current_block(block_num); - let loop_start_ts = Instant::now(); + let block_start_ts = Instant::now(); for candidate_num in 0..config.n_cores as u64 { let candidate = - env.state.next_candidate().expect("We always send up to n_cores*num_loops; qed"); + env.state.next_candidate().expect("We always send up to n_cores*num_blocks; qed"); let (tx, rx) = oneshot::channel(); batch.push(rx); 
env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( candidate.clone(), 1, - Some(GroupIndex(candidate_num as u32 % (config.n_cores / 5) as u32)), + Some(GroupIndex( + candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, + )), tx, )) .await; } - gum::info!("{} requests pending", batch.len()); + gum::info!("{}", format!("{} requests pending", batch.len()).bright_black()); while let Some(completed) = batch.next().await { let available_data = completed.unwrap().unwrap(); env.metrics().on_pov_size(available_data.encoded_size()); @@ -774,31 +777,44 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { } let block_time_delta = - Duration::from_secs(6).saturating_sub(Instant::now().sub(loop_start_ts)); - gum::info!(target: LOG_TARGET, "Sleeping till end of block {}ms", block_time_delta.as_millis()); + Duration::from_secs(6).saturating_sub(Instant::now().sub(block_start_ts)); + gum::info!(target: LOG_TARGET,"{}", format!("Sleeping till end of block ({}ms)", block_time_delta.as_millis()).bright_black()); tokio::time::sleep(block_time_delta).await; } env.send_signal(OverseerSignal::Conclude).await; let duration = start_marker.elapsed().as_millis(); let availability_bytes = availability_bytes / 1024; - gum::info!("Benchmark completed in {:?}ms", duration); - gum::info!("Throughput: {} KiB/block", availability_bytes / env.config().num_loops as u128); + gum::info!("Benchmark completed in {}", format!("{:?}ms", duration).cyan()); gum::info!( - "Block time: {} ms", - start_marker.elapsed().as_millis() / env.config().num_loops as u128 + "Throughput: {}", + format!("{} KiB/block", availability_bytes / env.config().num_blocks as u128).bright_red() + ); + gum::info!( + "Block time: {}", + format!("{} ms", start_marker.elapsed().as_millis() / env.config().num_blocks as u128).red() ); - let stats = env.network.stats(); + let stats = env.network().stats(); gum::info!( - "Total received from network: {} MiB", - stats - .iter() - .enumerate() - 
.map(|(index, stats)| stats.tx_bytes_total as u128) - .sum::() / - (1024 * 1024) + "Total received from network: {}", + format!( + "{} MiB", + stats + .iter() + .enumerate() + .map(|(_index, stats)| stats.tx_bytes_total as u128) + .sum::() / (1024 * 1024) + ) + .cyan() ); - tokio::time::sleep(Duration::from_secs(1)).await; + let test_metrics = super::core::display::parse_metrics(&env.registry()); + let subsystem_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "availability-recovery-subsystem"); + gum::info!(target: LOG_TARGET, "Total subsystem CPU usage {}", format!("{:.2}s", subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum")).bright_purple()); + + let test_env_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "test-environment"); + gum::info!(target: LOG_TARGET, "Total test environment CPU usage {}", format!("{:.2}s", test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum")).bright_purple()); } diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 47483d33a42a9..4b63f45c5f8aa 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -13,3 +13,279 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +//! Some helper methods for parsing prometheus metrics to a format that can be +//! displayed in the CLI. +//! +//! Currently histogram buckets are skipped. 
+use super::LOG_TARGET; +use colored::Colorize; +use prometheus::{ + proto::{MetricFamily, MetricType}, + Registry, +}; +use std::fmt::Display; + +#[derive(Default)] +pub struct MetricCollection(Vec); + +impl From> for MetricCollection { + fn from(metrics: Vec) -> Self { + MetricCollection(metrics) + } +} + +impl MetricCollection { + pub fn get(&self, name: &str) -> Vec<&TestMetric> { + self.all().into_iter().filter(|metric| &metric.name == name).collect() + } + + pub fn all(&self) -> &Vec { + &self.0 + } + + /// Sums up all metrics with the given name in the collection + pub fn sum_by(&self, name: &str) -> f64 { + self.all() + .into_iter() + .filter(|metric| &metric.name == name) + .map(|metric| metric.value) + .sum() + } + + pub fn subset_with_label_value(&self, label_name: &str, label_value: &str) -> MetricCollection { + self.0 + .iter() + .filter_map(|metric| { + if let Some(index) = metric.label_names.iter().position(|label| label == label_name) + { + if Some(&String::from(label_value)) == metric.label_values.get(index) { + Some(metric.clone()) + } else { + None + } + } else { + None + } + }) + .collect::>() + .into() + } +} + +impl Display for MetricCollection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "")?; + let metrics = self.all(); + for metric in metrics { + writeln!(f, "{}", metric)?; + } + Ok(()) + } +} +#[derive(Debug, Clone)] +pub struct TestMetric { + name: String, + label_names: Vec, + label_values: Vec, + value: f64, +} + +impl Display for TestMetric { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "({} = {}) [{:?}, {:?}]", + self.name.cyan(), + format!("{}", self.value).white(), + self.label_names, + self.label_values + ) + } +} + +// fn encode_impl( +// &self, +// metric_families: &[MetricFamily], +// writer: &mut dyn WriteUtf8, +// ) -> Result<()> { for mf in metric_families { // Fail-fast checks. check_metric_family(mf)?; + +// // Write `# HELP` header. 
+// let name = mf.get_name(); +// let help = mf.get_help(); +// if !help.is_empty() { +// writer.write_all("# HELP ")?; +// writer.write_all(name)?; +// writer.write_all(" ")?; +// writer.write_all(&escape_string(help, false))?; +// writer.write_all("\n")?; +// } + +// // Write `# TYPE` header. +// let metric_type = mf.get_field_type(); +// let lowercase_type = format!("{:?}", metric_type).to_lowercase(); +// writer.write_all("# TYPE ")?; +// writer.write_all(name)?; +// writer.write_all(" ")?; +// writer.write_all(&lowercase_type)?; +// writer.write_all("\n")?; + +// for m in mf.get_metric() { +// match metric_type { +// MetricType::COUNTER => { +// write_sample(writer, name, None, m, None, m.get_counter().get_value())?; +// } +// MetricType::GAUGE => { +// write_sample(writer, name, None, m, None, m.get_gauge().get_value())?; +// } +// MetricType::HISTOGRAM => { +// let h = m.get_histogram(); + +// let mut inf_seen = false; +// for b in h.get_bucket() { +// let upper_bound = b.get_upper_bound(); +// write_sample( +// writer, +// name, +// Some("_bucket"), +// m, +// Some((BUCKET_LABEL, &upper_bound.to_string())), +// b.get_cumulative_count() as f64, +// )?; +// if upper_bound.is_sign_positive() && upper_bound.is_infinite() { +// inf_seen = true; +// } +// } +// if !inf_seen { +// write_sample( +// writer, +// name, +// Some("_bucket"), +// m, +// Some((BUCKET_LABEL, POSITIVE_INF)), +// h.get_sample_count() as f64, +// )?; +// } + +// write_sample(writer, name, Some("_sum"), m, None, h.get_sample_sum())?; + +// write_sample( +// writer, +// name, +// Some("_count"), +// m, +// None, +// h.get_sample_count() as f64, +// )?; +// } +// MetricType::SUMMARY => { +// let s = m.get_summary(); + +// for q in s.get_quantile() { +// write_sample( +// writer, +// name, +// None, +// m, +// Some((QUANTILE, &q.get_quantile().to_string())), +// q.get_value(), +// )?; +// } + +// write_sample(writer, name, Some("_sum"), m, None, s.get_sample_sum())?; + +// write_sample( +// 
writer, +// name, +// Some("_count"), +// m, +// None, +// s.get_sample_count() as f64, +// )?; +// } +// MetricType::UNTYPED => { +// unimplemented!(); +// } +// } +// } +// } + +// Ok(()) +// } + +// Returns `false` if metric should be skipped. +fn check_metric_family(mf: &MetricFamily) -> bool { + if mf.get_metric().is_empty() { + gum::error!(target: LOG_TARGET, "MetricFamily has no metrics: {:?}", mf); + return false + } + if mf.get_name().is_empty() { + gum::error!(target: LOG_TARGET, "MetricFamily has no name: {:?}", mf); + return false + } + + true +} + +pub fn parse_metrics(registry: &Registry) -> MetricCollection { + let metric_families = registry.gather(); + let mut test_metrics = Vec::new(); + for mf in metric_families { + if !check_metric_family(&mf) { + continue + } + + let name: String = mf.get_name().into(); + let metric_type = mf.get_field_type(); + for m in mf.get_metric() { + let (label_names, label_values): (Vec, Vec) = m + .get_label() + .iter() + .map(|pair| (String::from(pair.get_name()), String::from(pair.get_value()))) + .unzip(); + + match metric_type { + MetricType::COUNTER => { + test_metrics.push(TestMetric { + name: name.clone(), + label_names, + label_values, + value: m.get_counter().get_value(), + }); + }, + MetricType::GAUGE => { + test_metrics.push(TestMetric { + name: name.clone(), + label_names, + label_values, + value: m.get_gauge().get_value(), + }); + }, + MetricType::HISTOGRAM => { + let h = m.get_histogram(); + let h_name = name.clone() + "_sum".into(); + test_metrics.push(TestMetric { + name: h_name, + label_names: label_names.clone(), + label_values: label_values.clone(), + value: h.get_sample_sum(), + }); + + let h_name = name.clone() + "_count".into(); + test_metrics.push(TestMetric { + name: h_name, + label_names, + label_values, + value: h.get_sample_sum(), + }); + }, + MetricType::SUMMARY => { + unimplemented!(); + }, + MetricType::UNTYPED => { + unimplemented!(); + }, + } + } + } + test_metrics.into() +} diff --git 
a/polkadot/node/subsystem-bench/src/core/keyring.rs b/polkadot/node/subsystem-bench/src/core/keyring.rs index 40e8d60d0cd1e..2d9aa348a922b 100644 --- a/polkadot/node/subsystem-bench/src/core/keyring.rs +++ b/polkadot/node/subsystem-bench/src/core/keyring.rs @@ -16,11 +16,9 @@ pub use sp_core::sr25519; use sp_core::{ - sr25519::{Pair, Public, Signature}, - ByteArray, Pair as PairT, H256, + sr25519::{Pair, Public}, + Pair as PairT, }; -use std::{collections::HashMap, ops::Deref}; - /// Set of test accounts. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Keyring { @@ -39,8 +37,4 @@ impl Keyring { pub fn public(self) -> Public { self.pair().public() } - - pub fn to_seed(self) -> String { - format!("//{}", self.name) - } } diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 4b9db3144f54a..0d7b5c3c40157 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -14,67 +14,17 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use itertools::Itertools; use std::{ collections::HashMap, - iter::Cycle, - ops::{Div, Sub}, sync::Arc, time::{Duration, Instant}, }; - -use sc_keystore::LocalKeystore; -use sp_application_crypto::AppCrypto; -use sp_keystore::{Keystore, KeystorePtr}; - -use futures::{ - channel::{mpsc, oneshot}, - stream::FuturesUnordered, - FutureExt, SinkExt, StreamExt, -}; -use futures_timer::Delay; - -use polkadot_node_metrics::metrics::Metrics; - -use polkadot_availability_recovery::AvailabilityRecoverySubsystem; - -use parity_scale_codec::Encode; -use polkadot_node_network_protocol::request_response::{ - self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, -}; -use rand::{distributions::Uniform, prelude::Distribution, seq::IteratorRandom, thread_rng}; - -use prometheus::Registry; -use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; - -use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; -use polkadot_node_primitives::{BlockData, PoV, Proof}; -use polkadot_node_subsystem::{ - messages::{ - AllMessages, AvailabilityRecoveryMessage, AvailabilityStoreMessage, NetworkBridgeTxMessage, - RuntimeApiMessage, RuntimeApiRequest, - }, - ActiveLeavesUpdate, FromOrchestra, OverseerSignal, Subsystem, -}; -use std::net::{Ipv4Addr, SocketAddr}; - -use super::core::{keyring::Keyring, network::*, test_env::TestEnvironmentMetrics}; - const LOG_TARGET: &str = "subsystem-bench::core"; -use polkadot_node_primitives::{AvailableData, ErasureChunk}; - -use polkadot_node_subsystem_test_helpers::{ - make_buffered_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, -}; -use polkadot_node_subsystem_util::TimeoutExt; -use polkadot_primitives::{ - AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, - PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, -}; -use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; -use 
sc_service::{SpawnTaskHandle, TaskManager}; +use polkadot_primitives::AuthorityDiscoveryId; +use sc_service::SpawnTaskHandle; pub mod keyring; pub mod network; pub mod test_env; +pub mod display; \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 170ab45e35a39..9250762f99871 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -15,7 +15,6 @@ // along with Polkadot. If not, see . use super::*; use prometheus_endpoint::U64; -use sc_network::network_state::Peer; use std::sync::atomic::{AtomicU64, Ordering}; use tokio::sync::mpsc::UnboundedSender; // An emulated node egress traffic rate_limiter. @@ -339,8 +338,7 @@ impl NetworkEmulator { } use polkadot_node_subsystem_util::metrics::{ - self, - prometheus::{self, Counter, CounterVec, Histogram, Opts, PrometheusError, Registry}, + prometheus::{CounterVec, Opts, PrometheusError, Registry}, }; /// Emulated network metrics. diff --git a/polkadot/node/subsystem-bench/src/core/test_env.rs b/polkadot/node/subsystem-bench/src/core/test_env.rs index c20b96d642afe..153d5bdf95c77 100644 --- a/polkadot/node/subsystem-bench/src/core/test_env.rs +++ b/polkadot/node/subsystem-bench/src/core/test_env.rs @@ -14,15 +14,11 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use super::*; -use polkadot_node_subsystem_util::metrics::{ - self, - prometheus::{ - self, Counter, Gauge, Histogram, HistogramVec, Opts, PrometheusError, Registry, U64, - }, +use polkadot_node_subsystem_util::metrics::prometheus::{ + self, Gauge, Histogram, PrometheusError, Registry, U64, }; -const MIB: f64 = 1024.0*1024.0; +const MIB: f64 = 1024.0 * 1024.0; /// Test environment/configuration metrics #[derive(Clone)] @@ -33,22 +29,15 @@ pub struct TestEnvironmentMetrics { n_cores: Gauge, /// PoV size pov_size: Histogram, - /// Current loop - current_loop: Gauge, + /// Current block + current_block: Gauge, } impl TestEnvironmentMetrics { pub fn new(registry: &Registry) -> Result { let mut buckets = prometheus::exponential_buckets(16384.0, 2.0, 9) - .expect("arguments are always valid; qed"); - buckets.extend(vec![ - 5.0 * MIB, - 6.0 * MIB, - 7.0 * MIB, - 8.0 * MIB, - 9.0 * MIB, - 10.0 * MIB, - ]); + .expect("arguments are always valid; qed"); + buckets.extend(vec![5.0 * MIB, 6.0 * MIB, 7.0 * MIB, 8.0 * MIB, 9.0 * MIB, 10.0 * MIB]); Ok(Self { n_validators: prometheus::register( @@ -61,12 +50,12 @@ impl TestEnvironmentMetrics { n_cores: prometheus::register( Gauge::new( "subsystem_benchmark_n_cores", - "Number of cores we fetch availability for each loop", + "Number of cores we fetch availability for each block", )?, registry, )?, - current_loop: prometheus::register( - Gauge::new("subsystem_benchmark_current_loop", "The current test loop")?, + current_block: prometheus::register( + Gauge::new("subsystem_benchmark_current_block", "The current test block")?, registry, )?, pov_size: prometheus::register( @@ -75,9 +64,7 @@ impl TestEnvironmentMetrics { "subsystem_benchmark_pov_size", "The compressed size of the proof of validity of a candidate", ) - .buckets( - buckets - ), + .buckets(buckets), )?, registry, )?, @@ -92,8 +79,8 @@ impl TestEnvironmentMetrics { self.n_cores.set(n_cores as u64); } - pub fn set_current_loop(&self, current_loop: usize) { - 
self.current_loop.set(current_loop as u64); + pub fn set_current_block(&self, current_block: usize) { + self.current_block.set(current_block as u64); } pub fn on_pov_size(&self, pov_size: usize) { diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 9e581555d7614..3cffd2ec427e6 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -18,8 +18,9 @@ //! CI regression testing. use clap::Parser; use color_eyre::eyre; -use prometheus::proto::LabelPair; -use std::time::Duration; + +use colored::Colorize; +use std::{time::Duration, path::Path}; pub(crate) mod availability; pub(crate) mod core; @@ -77,15 +78,29 @@ pub struct DataAvailabilityReadOptions { pub fetch_from_backers: bool, #[clap(short, long, ignore_case = true, default_value_t = 1)] - /// Number of times to loop fetching for each core. - pub num_loops: usize, + /// Number of times to block fetching for each core. + pub num_blocks: usize, } + + +#[derive(Debug, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct TestSequenceOptions { + #[clap(short, long, ignore_case = true)] + pub path: String, +} + + + /// Define the supported benchmarks targets #[derive(Debug, Parser)] #[command(about = "Target subsystems", version, rename_all = "kebab-case")] enum BenchmarkTarget { /// Benchmark availability recovery strategies. DataAvailabilityRead(DataAvailabilityReadOptions), + /// Run a test sequence specified in a file + TestSequence(TestSequenceOptions), } #[derive(Debug, Parser)] @@ -131,17 +146,31 @@ fn new_runtime() -> tokio::runtime::Runtime { impl BenchCli { /// Launch a malus node. 
fn launch(self) -> eyre::Result<()> { - use prometheus::{proto::MetricType, Registry, TextEncoder}; - - println!("Preparing {:?} benchmarks", self.target); + use prometheus::Registry; let runtime = new_runtime(); - let registry = Registry::new(); let mut test_config = match self.target { + BenchmarkTarget::TestSequence(options) => { + let test_sequence = availability::TestSequence::new_from_file(Path::new(&options.path)).expect("File exists").to_vec(); + let num_steps = test_sequence.len(); + gum::info!("{}", format!("Sequence contains {} step(s)",num_steps).bright_purple()); + for (index, test_config) in test_sequence.into_iter().enumerate(){ + gum::info!("{}", format!("Current step {}/{}", index + 1, num_steps).bright_purple()); + + let candidate_count = test_config.n_cores * test_config.num_blocks; + + let mut state = TestState::new(test_config); + state.generate_candidates(candidate_count); + let mut env = TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); + + runtime.block_on(availability::bench_chunk_recovery(&mut env)); + } + return Ok(()) + } BenchmarkTarget::DataAvailabilityRead(options) => match self.network { NetworkEmulation::Healthy => TestConfiguration::healthy_network( - options.num_loops, + options.num_blocks, options.fetch_from_backers, options.n_validators, options.n_cores, @@ -155,7 +184,7 @@ impl BenchCli { .collect(), ), NetworkEmulation::Degraded => TestConfiguration::degraded_network( - options.num_loops, + options.num_blocks, options.fetch_from_backers, options.n_validators, options.n_cores, @@ -169,7 +198,7 @@ impl BenchCli { .collect(), ), NetworkEmulation::Ideal => TestConfiguration::ideal_network( - options.num_loops, + options.num_blocks, options.fetch_from_backers, options.n_validators, options.n_cores, @@ -209,56 +238,15 @@ impl BenchCli { test_config.bandwidth = bandwidth * 1024; } - let candidate_count = test_config.n_cores * test_config.num_loops; + let candidate_count = test_config.n_cores * 
test_config.num_blocks; + test_config.write_to_disk(); let mut state = TestState::new(test_config); state.generate_candidates(candidate_count); - let mut env = TestEnvironment::new(runtime.handle().clone(), state, registry.clone()); - - println!("{:?}", env.config()); + let mut env = TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); runtime.block_on(availability::bench_chunk_recovery(&mut env)); - let metric_families = registry.gather(); - - for familiy in metric_families { - let metric_type = familiy.get_field_type(); - - for metric in familiy.get_metric() { - match metric_type { - MetricType::HISTOGRAM => { - let h = metric.get_histogram(); - - let labels = metric.get_label(); - // Skip test env usage. - let mut env_label = LabelPair::default(); - env_label.set_name("task_group".into()); - env_label.set_value("test-environment".into()); - - let mut is_env_metric = false; - for label_pair in labels { - if &env_label == label_pair { - is_env_metric = true; - break - } - } - - if !is_env_metric { - println!( - "{:?} CPU seconds used: {:?}", - familiy.get_name(), - h.get_sample_sum() - ); - } - }, - _ => {}, - } - } - } - // encoder.encode(&metric_families, &mut buffer).unwrap(); - - // Output to the standard output. - // println!("Metrics: {}", String::from_utf8(buffer).unwrap()); Ok(()) } } @@ -267,6 +255,7 @@ fn main() -> eyre::Result<()> { color_eyre::install()?; let _ = env_logger::builder() .filter(Some("hyper"), log::LevelFilter::Info) + .filter(None, log::LevelFilter::Info) .try_init() .unwrap(); diff --git a/polkadot/node/subsystem-test-helpers/src/lib.rs b/polkadot/node/subsystem-test-helpers/src/lib.rs index 5393ccafa6f38..1c3c47150ac6a 100644 --- a/polkadot/node/subsystem-test-helpers/src/lib.rs +++ b/polkadot/node/subsystem-test-helpers/src/lib.rs @@ -279,6 +279,13 @@ impl TestSubsystemContextHandle { .expect("Test subsystem no longer live") } + /// Receive the next message from the subsystem. 
+ pub async fn maybe_recv(&mut self) -> Option { + self.try_recv() + .timeout(Self::TIMEOUT) + .await + .expect("`fn recv` does not timeout") + } /// Receive the next message from the subsystem, or `None` if the channel has been closed. pub async fn try_recv(&mut self) -> Option { self.rx From d6c259df9ff7eaaa5f7207364b87a7f3a76b165e Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 15 Nov 2023 18:39:29 +0200 Subject: [PATCH 15/45] update Signed-off-by: Andrei Sandu --- .../network/availability-recovery/src/lib.rs | 3 +- .../subsystem-bench/src/availability/cli.rs | 17 ++++ .../src/availability/configuration.rs | 19 ++--- .../subsystem-bench/src/availability/mod.rs | 27 +++++-- polkadot/node/subsystem-bench/src/core/mod.rs | 2 +- .../node/subsystem-bench/src/core/network.rs | 8 +- .../node/subsystem-bench/src/core/test_env.rs | 10 +++ .../subsystem-bench/src/subsystem-bench.rs | 41 ++++++---- .../node/subsystem-bench/test_sequence.toml | 77 +++++++++++++++++++ 9 files changed, 165 insertions(+), 39 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/availability/cli.rs create mode 100644 polkadot/node/subsystem-bench/test_sequence.toml diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index ffb634ad76e2f..6dafcf4ccfc81 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -582,6 +582,7 @@ impl AvailabilityRecoverySubsystem { } } + /// Starts the inner subsystem loop. pub async fn run(self, mut ctx: Context) -> SubsystemResult<()> { let mut state = State::default(); let Self { mut req_receiver, metrics, recovery_strategy_kind, bypass_availability_store } = @@ -726,8 +727,6 @@ impl AvailabilityRecoverySubsystem { } } output = state.ongoing_recoveries.select_next_some() => { - // No caching for benchmark. 
- #[cfg(not(feature = "subsystem-benchmarks"))] if let Some((candidate_hash, result)) = output { if let Ok(recovery) = CachedRecovery::try_from(result) { state.availability_lru.insert(candidate_hash, recovery); diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs new file mode 100644 index 0000000000000..43a938f2abeaa --- /dev/null +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -0,0 +1,17 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::*; diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index 2d29d23811da6..cbad4a2dc1b84 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -17,7 +17,7 @@ use std::path::Path; use super::*; -use serde::{Deserialize,Serialize}; +use serde::{Deserialize, Serialize}; /// Peer response latency configuration. 
#[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct PeerLatency { @@ -56,7 +56,6 @@ pub struct TestConfiguration { pub num_blocks: usize, } - impl Default for TestConfiguration { fn default() -> Self { Self { @@ -69,8 +68,8 @@ impl Default for TestConfiguration { latency: None, error: 0, num_blocks: 1, - min_pov_size: 5*1024*1024, - max_pov_size: 5*1024*1024, + min_pov_size: 5 * 1024 * 1024, + max_pov_size: 5 * 1024 * 1024, } } } @@ -79,10 +78,10 @@ fn generate_pov_sizes(count: usize, min: usize, max: usize) -> Vec { (0..count).map(|_| random_pov_size(min, max)).collect() } -#[derive(Serialize,Deserialize)] +#[derive(Serialize, Deserialize)] pub struct TestSequence { #[serde(rename(serialize = "TestConfiguration", deserialize = "TestConfiguration"))] - test_configurations: Vec + test_configurations: Vec, } impl TestSequence { @@ -90,14 +89,15 @@ impl TestSequence { // Generate Pov sizes for config in self.test_configurations.iter_mut() { - config.pov_sizes = generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); + config.pov_sizes = + generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); } self.test_configurations } } -impl TestSequence { +impl TestSequence { pub fn new_from_file(path: &Path) -> std::io::Result { let string = String::from_utf8(std::fs::read(&path)?).expect("File is valid UTF8"); Ok(toml::from_str(&string).expect("File is valid test sequence TOML")) @@ -107,7 +107,8 @@ impl TestSequence { impl TestConfiguration { pub fn write_to_disk(&self) { // Serialize a slice of configurations - let toml = toml::to_string(&TestSequence{ test_configurations: vec![self.clone()] }).unwrap(); + let toml = + toml::to_string(&TestSequence { test_configurations: vec![self.clone()] }).unwrap(); std::fs::write("last_test.toml", toml).unwrap(); } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 2c9f3e735afb1..0a0830ff99753 100644 
--- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -748,13 +748,15 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { env.metrics().set_n_cores(config.n_cores); for block_num in 0..env.config().num_blocks { - gum::info!(target: LOG_TARGET, "Current block {}/{}", block_num, env.config().num_blocks); + gum::info!(target: LOG_TARGET, "Current block {}/{}", block_num + 1, env.config().num_blocks); env.metrics().set_current_block(block_num); let block_start_ts = Instant::now(); for candidate_num in 0..config.n_cores as u64 { - let candidate = - env.state.next_candidate().expect("We always send up to n_cores*num_blocks; qed"); + let candidate = env + .state + .next_candidate() + .expect("We always send up to n_cores*num_blocks; qed"); let (tx, rx) = oneshot::channel(); batch.push(rx); @@ -769,7 +771,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { .await; } - gum::info!("{}", format!("{} requests pending", batch.len()).bright_black()); + gum::info!("{}", format!("{} recoveries pending", batch.len()).bright_black()); while let Some(completed) = batch.next().await { let available_data = completed.unwrap().unwrap(); env.metrics().on_pov_size(available_data.encoded_size()); @@ -778,6 +780,10 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let block_time_delta = Duration::from_secs(6).saturating_sub(Instant::now().sub(block_start_ts)); + + let block_time = Instant::now().sub(block_start_ts).as_millis() as u64; + env.metrics().set_block_time(block_time); + gum::info!("Block time {}", format!("{:?}ms", block_time).cyan()); gum::info!(target: LOG_TARGET,"{}", format!("Sleeping till end of block ({}ms)", block_time_delta.as_millis()).bright_black()); tokio::time::sleep(block_time_delta).await; } @@ -785,14 +791,15 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { env.send_signal(OverseerSignal::Conclude).await; let duration = 
start_marker.elapsed().as_millis(); let availability_bytes = availability_bytes / 1024; - gum::info!("Benchmark completed in {}", format!("{:?}ms", duration).cyan()); + gum::info!("All blocks processed in {}", format!("{:?}ms", duration).cyan()); gum::info!( "Throughput: {}", format!("{} KiB/block", availability_bytes / env.config().num_blocks as u128).bright_red() ); gum::info!( "Block time: {}", - format!("{} ms", start_marker.elapsed().as_millis() / env.config().num_blocks as u128).red() + format!("{} ms", start_marker.elapsed().as_millis() / env.config().num_blocks as u128) + .red() ); let stats = env.network().stats(); @@ -812,9 +819,13 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let test_metrics = super::core::display::parse_metrics(&env.registry()); let subsystem_cpu_metrics = test_metrics.subset_with_label_value("task_group", "availability-recovery-subsystem"); - gum::info!(target: LOG_TARGET, "Total subsystem CPU usage {}", format!("{:.2}s", subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum")).bright_purple()); + let total_cpu = subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + gum::info!(target: LOG_TARGET, "Total subsystem CPU usage {}", format!("{:.2}s", total_cpu).bright_purple()); + gum::info!(target: LOG_TARGET, "CPU usage per block {}", format!("{:.2}s", total_cpu/env.config().num_blocks as f64).bright_purple()); let test_env_cpu_metrics = test_metrics.subset_with_label_value("task_group", "test-environment"); - gum::info!(target: LOG_TARGET, "Total test environment CPU usage {}", format!("{:.2}s", test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum")).bright_purple()); + let total_cpu = test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + gum::info!(target: LOG_TARGET, "Total test environment CPU usage {}", format!("{:.2}s", total_cpu).bright_purple()); + gum::info!(target: LOG_TARGET, "CPU usage per block {}", format!("{:.2}s", 
total_cpu/env.config().num_blocks as f64).bright_purple()); } diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 0d7b5c3c40157..2e9e0364273e2 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -24,7 +24,7 @@ const LOG_TARGET: &str = "subsystem-bench::core"; use polkadot_primitives::AuthorityDiscoveryId; use sc_service::SpawnTaskHandle; +pub mod display; pub mod keyring; pub mod network; pub mod test_env; -pub mod display; \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 9250762f99871..629d09df694cb 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -14,9 +14,11 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . use super::*; +use colored::Colorize; use prometheus_endpoint::U64; use std::sync::atomic::{AtomicU64, Ordering}; use tokio::sync::mpsc::UnboundedSender; + // An emulated node egress traffic rate_limiter. #[derive(Debug)] pub struct RateLimit { @@ -282,6 +284,8 @@ impl NetworkEmulator { spawn_task_handle: SpawnTaskHandle, registry: &Registry, ) -> Self { + gum::info!(target: LOG_TARGET, "{}",format!("Initializing network emulation for {} peers.", n_peers).bright_blue()); + let metrics = Metrics::new(®istry).expect("Metrics always register succesfully"); let mut validator_authority_id_mapping = HashMap::new(); @@ -337,8 +341,8 @@ impl NetworkEmulator { } } -use polkadot_node_subsystem_util::metrics::{ - prometheus::{CounterVec, Opts, PrometheusError, Registry}, +use polkadot_node_subsystem_util::metrics::prometheus::{ + self, CounterVec, Opts, PrometheusError, Registry, }; /// Emulated network metrics. 
diff --git a/polkadot/node/subsystem-bench/src/core/test_env.rs b/polkadot/node/subsystem-bench/src/core/test_env.rs index 153d5bdf95c77..e6b09a1c13e63 100644 --- a/polkadot/node/subsystem-bench/src/core/test_env.rs +++ b/polkadot/node/subsystem-bench/src/core/test_env.rs @@ -31,6 +31,8 @@ pub struct TestEnvironmentMetrics { pov_size: Histogram, /// Current block current_block: Gauge, + /// Current block + block_time: Gauge, } impl TestEnvironmentMetrics { @@ -58,6 +60,10 @@ impl TestEnvironmentMetrics { Gauge::new("subsystem_benchmark_current_block", "The current test block")?, registry, )?, + block_time: prometheus::register( + Gauge::new("subsystem_benchmark_block_time", "The time it takes for the target subsystems(s) to complete all the requests in a block")?, + registry, + )?, pov_size: prometheus::register( Histogram::with_opts( prometheus::HistogramOpts::new( @@ -83,6 +89,10 @@ impl TestEnvironmentMetrics { self.current_block.set(current_block as u64); } + pub fn set_block_time(&self, block_time_ms: u64) { + self.block_time.set(block_time_ms); + } + pub fn on_pov_size(&self, pov_size: usize) { self.pov_size.observe(pov_size as f64); } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 3cffd2ec427e6..280172662453f 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -20,7 +20,7 @@ use clap::Parser; use color_eyre::eyre; use colored::Colorize; -use std::{time::Duration, path::Path}; +use std::{path::Path, time::Duration}; pub(crate) mod availability; pub(crate) mod core; @@ -82,7 +82,6 @@ pub struct DataAvailabilityReadOptions { pub num_blocks: usize, } - #[derive(Debug, clap::Parser)] #[clap(rename_all = "kebab-case")] #[allow(missing_docs)] @@ -91,12 +90,10 @@ pub struct TestSequenceOptions { pub path: String, } - - /// Define the supported benchmarks targets #[derive(Debug, Parser)] -#[command(about = "Target 
subsystems", version, rename_all = "kebab-case")] -enum BenchmarkTarget { +#[command(about = "Test objectives", version, rename_all = "kebab-case")] +enum TestObjective { /// Benchmark availability recovery strategies. DataAvailabilityRead(DataAvailabilityReadOptions), /// Run a test sequence specified in a file @@ -131,7 +128,7 @@ struct BenchCli { pub peer_max_latency: Option, #[command(subcommand)] - pub target: BenchmarkTarget, + pub objective: TestObjective, } fn new_runtime() -> tokio::runtime::Runtime { @@ -150,25 +147,35 @@ impl BenchCli { let runtime = new_runtime(); - let mut test_config = match self.target { - BenchmarkTarget::TestSequence(options) => { - let test_sequence = availability::TestSequence::new_from_file(Path::new(&options.path)).expect("File exists").to_vec(); + let mut test_config = match self.objective { + TestObjective::TestSequence(options) => { + let test_sequence = + availability::TestSequence::new_from_file(Path::new(&options.path)) + .expect("File exists") + .to_vec(); let num_steps = test_sequence.len(); - gum::info!("{}", format!("Sequence contains {} step(s)",num_steps).bright_purple()); - for (index, test_config) in test_sequence.into_iter().enumerate(){ - gum::info!("{}", format!("Current step {}/{}", index + 1, num_steps).bright_purple()); + gum::info!( + "{}", + format!("Sequence contains {} step(s)", num_steps).bright_purple() + ); + for (index, test_config) in test_sequence.into_iter().enumerate() { + gum::info!( + "{}", + format!("Current step {}/{}", index + 1, num_steps).bright_purple() + ); let candidate_count = test_config.n_cores * test_config.num_blocks; let mut state = TestState::new(test_config); state.generate_candidates(candidate_count); - let mut env = TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); - + let mut env = + TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); + runtime.block_on(availability::bench_chunk_recovery(&mut env)); } return Ok(()) - } - 
BenchmarkTarget::DataAvailabilityRead(options) => match self.network { + }, + TestObjective::DataAvailabilityRead(options) => match self.network { NetworkEmulation::Healthy => TestConfiguration::healthy_network( options.num_blocks, options.fetch_from_backers, diff --git a/polkadot/node/subsystem-bench/test_sequence.toml b/polkadot/node/subsystem-bench/test_sequence.toml new file mode 100644 index 0000000000000..d32477b9efe96 --- /dev/null +++ b/polkadot/node/subsystem-bench/test_sequence.toml @@ -0,0 +1,77 @@ +[[TestConfiguration]] +use_fast_path = false +n_validators = 300 +n_cores = 20 +min_pov_size = 5242880 +max_pov_size = 5242880 +peer_bandwidth = 128000 +bandwidth = 52428800 +error = 33 +num_blocks = 5 + +[TestConfiguration.latency.min_latency] +secs = 0 +nanos = 1000000 + +[TestConfiguration.latency.max_latency] +secs = 0 +nanos = 100000000 + +[[TestConfiguration]] +use_fast_path = false +n_validators = 500 +n_cores = 20 +min_pov_size = 5242880 +max_pov_size = 5242880 +peer_bandwidth = 128000 +bandwidth = 52428800 +error = 33 +num_blocks = 5 + +[TestConfiguration.latency.min_latency] +secs = 0 +nanos = 1000000 + +[TestConfiguration.latency.max_latency] +secs = 0 +nanos = 1000000000 + + +[[TestConfiguration]] +use_fast_path = false +n_validators = 1000 +n_cores = 20 +min_pov_size = 5242880 +max_pov_size = 5242880 +peer_bandwidth = 128000 +bandwidth = 52428800 +error = 33 +num_blocks = 5 + +[TestConfiguration.latency.min_latency] +secs = 0 +nanos = 1000000 + +[TestConfiguration.latency.max_latency] +secs = 0 +nanos = 1000000000 + + +[[TestConfiguration]] +use_fast_path = false +n_validators = 2000 +n_cores = 20 +min_pov_size = 5242880 +max_pov_size = 5242880 +peer_bandwidth = 128000 +bandwidth = 52428800 +error = 33 +num_blocks = 5 + +[TestConfiguration.latency.min_latency] +secs = 0 +nanos = 1000000 + +[TestConfiguration.latency.max_latency] +secs = 0 +nanos = 1000000000 From 050529b68ca2402e79e593968571606580d5ca7a Mon Sep 17 00:00:00 2001 From: Andrei Sandu 
Date: Wed, 15 Nov 2023 18:47:05 +0200 Subject: [PATCH 16/45] remove comment Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/src/subsystem-bench.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 280172662453f..7dcc8a15074af 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -141,7 +141,6 @@ fn new_runtime() -> tokio::runtime::Runtime { } impl BenchCli { - /// Launch a malus node. fn launch(self) -> eyre::Result<()> { use prometheus::Registry; From cb38be5c505863df72567efa3a0e3489b9bc42eb Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 17 Nov 2023 11:38:23 +0200 Subject: [PATCH 17/45] separate cli options for availability Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/cli.rs | 44 +++++++++++++++- .../subsystem-bench/src/availability/mod.rs | 7 +-- .../src/core/{test_env.rs => environment.rs} | 2 + polkadot/node/subsystem-bench/src/core/mod.rs | 2 +- .../subsystem-bench/src/subsystem-bench.rs | 52 ++----------------- 5 files changed, 55 insertions(+), 52 deletions(-) rename polkadot/node/subsystem-bench/src/core/{test_env.rs => environment.rs} (98%) diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs index 43a938f2abeaa..ef4d7e6f631a4 100644 --- a/polkadot/node/subsystem-bench/src/availability/cli.rs +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -14,4 +14,46 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use super::*; +#[derive(Debug, clap::Parser, Clone)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct NetworkOptions {} + +#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq)] +#[value(rename_all = "kebab-case")] +#[non_exhaustive] +pub enum NetworkEmulation { + Ideal, + Healthy, + Degraded, +} + +#[derive(Debug, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct DataAvailabilityReadOptions { + #[clap(long, ignore_case = true, default_value_t = 100)] + /// Number of cores to fetch availability for. + pub n_cores: usize, + + #[clap(long, ignore_case = true, default_value_t = 500)] + /// Number of validators to fetch chunks from. + pub n_validators: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The minimum pov size in KiB + pub min_pov_size: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The maximum pov size bytes + pub max_pov_size: usize, + + #[clap(short, long, default_value_t = false)] + /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes + /// have enough bandwidth. + pub fetch_from_backers: bool, + + #[clap(short, long, ignore_case = true, default_value_t = 1)] + /// Number of times to block fetching for each core. 
+ pub num_blocks: usize, +} diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 0a0830ff99753..8866348ea22bb 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -53,7 +53,7 @@ use polkadot_node_subsystem::{ }; use std::net::{Ipv4Addr, SocketAddr}; -use super::core::{keyring::Keyring, network::*, test_env::TestEnvironmentMetrics}; +use super::core::{environment::TestEnvironmentMetrics, keyring::Keyring, network::*}; const LOG_TARGET: &str = "subsystem-bench::availability"; @@ -70,8 +70,9 @@ use polkadot_primitives::{ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; +mod cli; pub mod configuration; - +pub use cli::{DataAvailabilityReadOptions, NetworkEmulation, NetworkOptions}; pub use configuration::{PeerLatency, TestConfiguration, TestSequence}; // Deterministic genesis hash for protocol names @@ -162,7 +163,7 @@ impl TestEnvironment { let (ingress_tx, mut ingress_rx) = tokio::sync::mpsc::unbounded_channel::(); let our_network_stats = network.peer_stats(0); - spawn_handle.spawn_blocking("our-node-rx", "test-environment", async move { + spawn_handle.spawn_blocking("node0-rx", "test-environment", async move { while let Some(action) = ingress_rx.recv().await { let size = action.size(); diff --git a/polkadot/node/subsystem-bench/src/core/test_env.rs b/polkadot/node/subsystem-bench/src/core/environment.rs similarity index 98% rename from polkadot/node/subsystem-bench/src/core/test_env.rs rename to polkadot/node/subsystem-bench/src/core/environment.rs index e6b09a1c13e63..6a680799972d7 100644 --- a/polkadot/node/subsystem-bench/src/core/test_env.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -14,6 +14,8 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
+use super::*; +use network::NetworkEmulator; use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 2e9e0364273e2..564fb7148fa0a 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -25,6 +25,6 @@ use polkadot_primitives::AuthorityDiscoveryId; use sc_service::SpawnTaskHandle; pub mod display; +pub mod environment; pub mod keyring; pub mod network; -pub mod test_env; diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 7dcc8a15074af..42efb7fd63c8b 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -25,10 +25,13 @@ use std::{path::Path, time::Duration}; pub(crate) mod availability; pub(crate) mod core; -use availability::{random_pov_size, TestConfiguration, TestEnvironment, TestState}; -const LOG_TARGET: &str = "subsystem-bench"; +use availability::{ + random_pov_size, DataAvailabilityReadOptions, NetworkEmulation, TestConfiguration, + TestEnvironment, TestState, +}; use clap_num::number_range; +const LOG_TARGET: &str = "subsystem-bench"; fn le_100(s: &str) -> Result { number_range(s, 0, 100) @@ -37,51 +40,6 @@ fn le_100(s: &str) -> Result { fn le_5000(s: &str) -> Result { number_range(s, 0, 5000) } - -#[derive(Debug, clap::Parser, Clone)] -#[clap(rename_all = "kebab-case")] -#[allow(missing_docs)] -pub struct NetworkOptions {} - -#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq)] -#[value(rename_all = "kebab-case")] -#[non_exhaustive] -pub enum NetworkEmulation { - Ideal, - Healthy, - Degraded, -} - -#[derive(Debug, clap::Parser)] -#[clap(rename_all = "kebab-case")] -#[allow(missing_docs)] -pub struct DataAvailabilityReadOptions { - #[clap(long, ignore_case = true, 
default_value_t = 100)] - /// Number of cores to fetch availability for. - pub n_cores: usize, - - #[clap(long, ignore_case = true, default_value_t = 500)] - /// Number of validators to fetch chunks from. - pub n_validators: usize, - - #[clap(long, ignore_case = true, default_value_t = 5120)] - /// The minimum pov size in KiB - pub min_pov_size: usize, - - #[clap(long, ignore_case = true, default_value_t = 5120)] - /// The maximum pov size bytes - pub max_pov_size: usize, - - #[clap(short, long, default_value_t = false)] - /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes - /// have enough bandwidth. - pub fetch_from_backers: bool, - - #[clap(short, long, ignore_case = true, default_value_t = 1)] - /// Number of times to block fetching for each core. - pub num_blocks: usize, -} - #[derive(Debug, clap::Parser)] #[clap(rename_all = "kebab-case")] #[allow(missing_docs)] From 24a736afb7727f2cc4780748edcc873692928503 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 17 Nov 2023 13:07:40 +0200 Subject: [PATCH 18/45] implement unified and extensible configuration Signed-off-by: Andrei Sandu --- Cargo.lock | 52 +++-- polkadot/node/subsystem-bench/Cargo.toml | 2 +- .../subsystem-bench/src/availability/cli.rs | 23 +-- .../src/availability/configuration.rs | 169 +--------------- .../subsystem-bench/src/availability/mod.rs | 32 +-- polkadot/node/subsystem-bench/src/cli.rs | 65 ++++++ .../subsystem-bench/src/core/configuration.rs | 190 ++++++++++++++++++ polkadot/node/subsystem-bench/src/core/mod.rs | 1 + .../node/subsystem-bench/src/core/network.rs | 1 - .../subsystem-bench/src/subsystem-bench.rs | 100 ++++----- .../node/subsystem-bench/test_sequence.toml | 77 ------- .../node/subsystem-bench/test_sequence.yaml | 56 ++++++ 12 files changed, 398 insertions(+), 370 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/cli.rs create mode 100644 polkadot/node/subsystem-bench/src/core/configuration.rs delete mode 100644 
polkadot/node/subsystem-bench/test_sequence.toml create mode 100644 polkadot/node/subsystem-bench/test_sequence.yaml diff --git a/Cargo.lock b/Cargo.lock index 73fc3cbdeccc9..b40a40db47b57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8579,7 +8579,7 @@ dependencies = [ "itertools 0.10.5", "tar", "tempfile", - "toml_edit 0.19.14", + "toml_edit", ] [[package]] @@ -13044,13 +13044,13 @@ dependencies = [ "sc-network", "sc-service", "serde", + "serde_yaml", "sp-application-crypto", "sp-core", "sp-keyring", "sp-keystore", "substrate-prometheus-endpoint", "tokio", - "toml 0.8.8", "tracing-gum", ] @@ -13455,7 +13455,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" dependencies = [ "once_cell", - "toml_edit 0.19.14", + "toml_edit", ] [[package]] @@ -16349,6 +16349,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cc7a1570e38322cfe4154732e5110f887ea57e22b76f4bfd32b5bdd3368666c" +dependencies = [ + "indexmap 2.0.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "serial_test" version = "2.0.0" @@ -18833,19 +18846,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.19.14", -] - -[[package]] -name = "toml" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a195ec8c9da26928f773888e0742ca3ca1040c6cd859c919c9f59c1954ab35" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime", - "toml_edit 0.21.0", + "toml_edit", ] [[package]] @@ -18870,19 +18871,6 @@ dependencies = [ "winnow", ] -[[package]] -name = "toml_edit" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34d383cd00a163b4a5b85053df514d45bc330f6de7737edfe0a93311d1eaa03" -dependencies = [ - "indexmap 2.0.0", - "serde", - "serde_spanned", - 
"toml_datetime", - "winnow", -] - [[package]] name = "tower" version = "0.4.13" @@ -19325,6 +19313,12 @@ dependencies = [ "subtle 2.4.1", ] +[[package]] +name = "unsafe-libyaml" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28467d3e1d3c6586d8f25fa243f544f5800fec42d97032474e17222c2b75cfa" + [[package]] name = "unsigned-varint" version = "0.7.1" diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 3308b6fe1052b..9dab8dce8455e 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -51,8 +51,8 @@ itertools = "0.11.0" polkadot-primitives-test-helpers = { path = "../../primitives/test-helpers" } prometheus_endpoint = { package = "substrate-prometheus-endpoint", path = "../../../substrate/utils/prometheus" } prometheus = { version = "0.13.0", default-features = false } -toml = "0.8.8" serde = "1.0.192" +serde_yaml = "0.9" [features] default = [] diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs index ef4d7e6f631a4..06fb2966d878c 100644 --- a/polkadot/node/subsystem-bench/src/availability/cli.rs +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +use serde::{Deserialize, Serialize}; #[derive(Debug, clap::Parser, Clone)] #[clap(rename_all = "kebab-case")] #[allow(missing_docs)] @@ -28,32 +29,12 @@ pub enum NetworkEmulation { Degraded, } -#[derive(Debug, clap::Parser)] +#[derive(Debug, Clone, Serialize, Deserialize, clap::Parser)] #[clap(rename_all = "kebab-case")] #[allow(missing_docs)] pub struct DataAvailabilityReadOptions { - #[clap(long, ignore_case = true, default_value_t = 100)] - /// Number of cores to fetch availability for. 
- pub n_cores: usize, - - #[clap(long, ignore_case = true, default_value_t = 500)] - /// Number of validators to fetch chunks from. - pub n_validators: usize, - - #[clap(long, ignore_case = true, default_value_t = 5120)] - /// The minimum pov size in KiB - pub min_pov_size: usize, - - #[clap(long, ignore_case = true, default_value_t = 5120)] - /// The maximum pov size bytes - pub max_pov_size: usize, - #[clap(short, long, default_value_t = false)] /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes /// have enough bandwidth. pub fetch_from_backers: bool, - - #[clap(short, long, ignore_case = true, default_value_t = 1)] - /// Number of times to block fetching for each core. - pub num_blocks: usize, } diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index cbad4a2dc1b84..f96b8e2cb7cea 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -14,175 +14,12 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use std::path::Path; - use super::*; use serde::{Deserialize, Serialize}; -/// Peer response latency configuration. -#[derive(Clone, Debug, Default, Serialize, Deserialize)] -pub struct PeerLatency { - /// Min latency for `NetworkAction` completion. - pub min_latency: Duration, - /// Max latency or `NetworkAction` completion. - pub max_latency: Duration, -} /// The test input parameters -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct TestConfiguration { - /// Configuration for the `availability-recovery` subsystem. 
+#[derive(Clone, Default, Debug, Serialize, Deserialize)] +pub struct AvailabilityRecoveryConfiguration { + /// Prefer the fast path (try fetch from backers first) pub use_fast_path: bool, - /// Number of validators - pub n_validators: usize, - /// Number of cores - pub n_cores: usize, - /// The min PoV size - pub min_pov_size: usize, - /// The max PoV size, - pub max_pov_size: usize, - /// Randomly sampled pov_sizes - #[serde(skip)] - pov_sizes: Vec, - /// The amount of bandiwdth remote validators have. - pub peer_bandwidth: usize, - /// The amount of bandiwdth our node has. - pub bandwidth: usize, - /// Optional peer emulation latency - pub latency: Option, - /// Error probability - pub error: usize, - /// Number of blocks - /// In one block `n_cores` candidates are recovered - pub num_blocks: usize, -} - -impl Default for TestConfiguration { - fn default() -> Self { - Self { - use_fast_path: false, - n_validators: 100, - n_cores: 10, - pov_sizes: vec![5 * 1024 * 1024], - bandwidth: 60 * 1024 * 1024, - peer_bandwidth: 60 * 1024 * 1024, - latency: None, - error: 0, - num_blocks: 1, - min_pov_size: 5 * 1024 * 1024, - max_pov_size: 5 * 1024 * 1024, - } - } -} - -fn generate_pov_sizes(count: usize, min: usize, max: usize) -> Vec { - (0..count).map(|_| random_pov_size(min, max)).collect() -} - -#[derive(Serialize, Deserialize)] -pub struct TestSequence { - #[serde(rename(serialize = "TestConfiguration", deserialize = "TestConfiguration"))] - test_configurations: Vec, -} - -impl TestSequence { - pub fn to_vec(mut self) -> Vec { - // Generate Pov sizes - - for config in self.test_configurations.iter_mut() { - config.pov_sizes = - generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); - } - - self.test_configurations - } -} - -impl TestSequence { - pub fn new_from_file(path: &Path) -> std::io::Result { - let string = String::from_utf8(std::fs::read(&path)?).expect("File is valid UTF8"); - Ok(toml::from_str(&string).expect("File is valid test 
sequence TOML")) - } -} - -impl TestConfiguration { - pub fn write_to_disk(&self) { - // Serialize a slice of configurations - let toml = - toml::to_string(&TestSequence { test_configurations: vec![self.clone()] }).unwrap(); - std::fs::write("last_test.toml", toml).unwrap(); - } - - pub fn pov_sizes(&self) -> &[usize] { - &self.pov_sizes - } - /// An unconstrained standard configuration matching Polkadot/Kusama - pub fn ideal_network( - num_blocks: usize, - use_fast_path: bool, - n_validators: usize, - n_cores: usize, - pov_sizes: Vec, - ) -> TestConfiguration { - Self { - use_fast_path, - n_cores, - n_validators, - pov_sizes, - bandwidth: 50 * 1024 * 1024, - peer_bandwidth: 50 * 1024 * 1024, - // No latency - latency: None, - error: 0, - num_blocks, - ..Default::default() - } - } - - pub fn healthy_network( - num_blocks: usize, - use_fast_path: bool, - n_validators: usize, - n_cores: usize, - pov_sizes: Vec, - ) -> TestConfiguration { - Self { - use_fast_path, - n_cores, - n_validators, - pov_sizes, - bandwidth: 50 * 1024 * 1024, - peer_bandwidth: 50 * 1024 * 1024, - latency: Some(PeerLatency { - min_latency: Duration::from_millis(1), - max_latency: Duration::from_millis(100), - }), - error: 3, - num_blocks, - ..Default::default() - } - } - - pub fn degraded_network( - num_blocks: usize, - use_fast_path: bool, - n_validators: usize, - n_cores: usize, - pov_sizes: Vec, - ) -> TestConfiguration { - Self { - use_fast_path, - n_cores, - n_validators, - pov_sizes, - bandwidth: 50 * 1024 * 1024, - peer_bandwidth: 50 * 1024 * 1024, - latency: Some(PeerLatency { - min_latency: Duration::from_millis(10), - max_latency: Duration::from_millis(500), - }), - error: 33, - num_blocks, - ..Default::default() - } - } } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 8866348ea22bb..9be15c576e3aa 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ 
b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -53,12 +53,18 @@ use polkadot_node_subsystem::{ }; use std::net::{Ipv4Addr, SocketAddr}; -use super::core::{environment::TestEnvironmentMetrics, keyring::Keyring, network::*}; +use super::core::{ + configuration::{PeerLatency, TestConfiguration}, + environment::TestEnvironmentMetrics, + keyring::Keyring, + network::*, +}; const LOG_TARGET: &str = "subsystem-bench::availability"; use polkadot_node_primitives::{AvailableData, ErasureChunk}; +use super::cli::TestObjective; use polkadot_node_subsystem_test_helpers::{ make_buffered_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, }; @@ -73,7 +79,7 @@ use sc_service::{SpawnTaskHandle, TaskManager}; mod cli; pub mod configuration; pub use cli::{DataAvailabilityReadOptions, NetworkEmulation, NetworkOptions}; -pub use configuration::{PeerLatency, TestConfiguration, TestSequence}; +pub use configuration::AvailabilityRecoveryConfiguration; // Deterministic genesis hash for protocol names const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); @@ -138,7 +144,10 @@ impl TestEnvironment { let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( ®istry, task_manager.spawn_handle(), - state.config().use_fast_path, + match &state.config().objective { + TestObjective::DataAvailabilityRead(options) => options.fetch_from_backers, + _ => panic!("Unexpected objective"), + }, ); let metrics = @@ -491,16 +500,6 @@ impl AvailabilityRecoverySubsystemInstance { } } -pub fn random_pov_size(min_pov_size: usize, max_pov_size: usize) -> usize { - random_uniform_sample(min_pov_size, max_pov_size) -} - -fn random_uniform_sample + From>(min_value: T, max_value: T) -> T { - Uniform::from(min_value.into()..=max_value.into()) - .sample(&mut thread_rng()) - .into() -} - // We use this to bail out sending messages to the subsystem if it is overloaded such that // the time of flight is breaches 5s. // This should eventually be a test parameter. 
@@ -508,6 +507,9 @@ const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); #[derive(Clone)] pub struct TestState { + // Full test configuration + config: TestConfiguration, + // State starts here. validator_public: Vec, validator_authority_id: Vec, // The test node validator index. @@ -527,8 +529,6 @@ pub struct TestState { candidate_receipts: Vec, available_data: Vec, chunks: Vec>, - /// Next candidate index in - config: TestConfiguration, } impl TestState { @@ -687,6 +687,7 @@ impl TestState { gum::info!(target: LOG_TARGET, "{}","Created test environment.".bright_blue()); Self { + config, validator_public, validator_authority_id, validator_index, @@ -695,7 +696,6 @@ impl TestState { available_data, candidate_receipts, chunks, - config, pov_size_to_candidate, pov_sizes, candidates_generated: 0, diff --git a/polkadot/node/subsystem-bench/src/cli.rs b/polkadot/node/subsystem-bench/src/cli.rs new file mode 100644 index 0000000000000..2f00ad2f35857 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/cli.rs @@ -0,0 +1,65 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . 
+use super::availability::{ + AvailabilityRecoveryConfiguration, DataAvailabilityReadOptions, NetworkEmulation, + TestEnvironment, TestState, +}; +use serde::{Deserialize, Serialize}; + +use super::core::configuration::{PeerLatency, TestConfiguration, TestSequence}; + +#[derive(Debug, Clone, Serialize, Deserialize, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct TestSequenceOptions { + #[clap(short, long, ignore_case = true)] + pub path: String, +} + +/// Define the supported benchmarks targets +#[derive(Debug, Clone, clap::Parser, Serialize, Deserialize)] +#[command(about = "Test objectives", version, rename_all = "kebab-case")] +pub enum TestObjective { + /// Benchmark availability recovery strategies. + DataAvailabilityRead(DataAvailabilityReadOptions), + /// Run a test sequence specified in a file + TestSequence(TestSequenceOptions), +} + +#[derive(Debug, clap::Parser)] +#[clap(rename_all = "kebab-case")] +#[allow(missing_docs)] +pub struct StandardTestOptions { + #[clap(long, ignore_case = true, default_value_t = 100)] + /// Number of cores to fetch availability for. + pub n_cores: usize, + + #[clap(long, ignore_case = true, default_value_t = 500)] + /// Number of validators to fetch chunks from. + pub n_validators: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The minimum pov size in KiB + pub min_pov_size: usize, + + #[clap(long, ignore_case = true, default_value_t = 5120)] + /// The maximum pov size bytes + pub max_pov_size: usize, + + #[clap(short, long, ignore_case = true, default_value_t = 1)] + /// The number of blocks the test is going to run. 
+ pub num_blocks: usize, +} diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs new file mode 100644 index 0000000000000..017d4023ef654 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -0,0 +1,190 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +use std::path::Path; + +use crate::availability::AvailabilityRecoveryConfiguration; + +use super::*; +pub use crate::cli::TestObjective; +use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; +use serde::{Deserialize, Serialize}; + +pub fn random_pov_size(min_pov_size: usize, max_pov_size: usize) -> usize { + random_uniform_sample(min_pov_size, max_pov_size) +} + +fn random_uniform_sample + From>(min_value: T, max_value: T) -> T { + Uniform::from(min_value.into()..=max_value.into()) + .sample(&mut thread_rng()) + .into() +} + +/// Peer response latency configuration. +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct PeerLatency { + /// Min latency for `NetworkAction` completion. + pub min_latency: Duration, + /// Max latency or `NetworkAction` completion. 
+ pub max_latency: Duration, +} + +/// The test input parameters +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct TestConfiguration { + /// The test objective + pub objective: TestObjective, + /// Number of validators + pub n_validators: usize, + /// Number of cores + pub n_cores: usize, + /// The min PoV size + pub min_pov_size: usize, + /// The max PoV size, + pub max_pov_size: usize, + /// Randomly sampled pov_sizes + #[serde(skip)] + pov_sizes: Vec, + /// The amount of bandiwdth remote validators have. + pub peer_bandwidth: usize, + /// The amount of bandiwdth our node has. + pub bandwidth: usize, + /// Optional peer emulation latency + pub latency: Option, + /// Error probability + pub error: usize, + /// Number of blocks + /// In one block `n_cores` candidates are recovered + pub num_blocks: usize, +} + +fn generate_pov_sizes(count: usize, min_kib: usize, max_kib: usize) -> Vec { + (0..count).map(|_| random_pov_size(min_kib * 1024, max_kib * 1024)).collect() +} + +#[derive(Serialize, Deserialize)] +pub struct TestSequence { + #[serde(rename(serialize = "TestConfiguration", deserialize = "TestConfiguration"))] + test_configurations: Vec, +} + +impl TestSequence { + pub fn to_vec(mut self) -> Vec { + self.test_configurations + .into_iter() + .map(|mut config| { + config.pov_sizes = + generate_pov_sizes(config.n_cores, config.min_pov_size, config.max_pov_size); + config + }) + .collect() + } +} + +impl TestSequence { + pub fn new_from_file(path: &Path) -> std::io::Result { + let string = String::from_utf8(std::fs::read(&path)?).expect("File is valid UTF8"); + Ok(serde_yaml::from_str(&string).expect("File is valid test sequence YA")) + } +} + +impl TestConfiguration { + pub fn write_to_disk(&self) { + // Serialize a slice of configurations + let yaml = serde_yaml::to_string(&TestSequence { test_configurations: vec![self.clone()] }) + .unwrap(); + std::fs::write("last_test.yaml", yaml).unwrap(); + } + + pub fn pov_sizes(&self) -> &[usize] { + 
&self.pov_sizes + } + /// An unconstrained standard configuration matching Polkadot/Kusama + pub fn ideal_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + // No latency + latency: None, + error: 0, + num_blocks, + min_pov_size, + max_pov_size, + } + } + + pub fn healthy_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + latency: Some(PeerLatency { + min_latency: Duration::from_millis(1), + max_latency: Duration::from_millis(100), + }), + error: 3, + num_blocks, + min_pov_size, + max_pov_size, + } + } + + pub fn degraded_network( + objective: TestObjective, + num_blocks: usize, + n_validators: usize, + n_cores: usize, + min_pov_size: usize, + max_pov_size: usize, + ) -> TestConfiguration { + Self { + objective, + n_cores, + n_validators, + pov_sizes: generate_pov_sizes(n_cores, min_pov_size, max_pov_size), + bandwidth: 50 * 1024 * 1024, + peer_bandwidth: 50 * 1024 * 1024, + latency: Some(PeerLatency { + min_latency: Duration::from_millis(10), + max_latency: Duration::from_millis(500), + }), + error: 33, + num_blocks, + min_pov_size, + max_pov_size, + } + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 564fb7148fa0a..06aa58f7256b0 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -24,6 +24,7 @@ const LOG_TARGET: &str = 
"subsystem-bench::core"; use polkadot_primitives::AuthorityDiscoveryId; use sc_service::SpawnTaskHandle; +pub mod configuration; pub mod display; pub mod environment; pub mod keyring; diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 629d09df694cb..f20bb919dedba 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -113,7 +113,6 @@ mod tests { let end = Instant::now(); - // assert_eq!(end - start, Duration::from_secs(1)); println!("duration: {}", (end - start).as_millis()); // Allow up to `budget/max_refill` error tolerance diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 42efb7fd63c8b..b94e594e59452 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -23,12 +23,16 @@ use colored::Colorize; use std::{path::Path, time::Duration}; pub(crate) mod availability; +pub(crate) mod cli; pub(crate) mod core; use availability::{ - random_pov_size, DataAvailabilityReadOptions, NetworkEmulation, TestConfiguration, + AvailabilityRecoveryConfiguration, DataAvailabilityReadOptions, NetworkEmulation, TestEnvironment, TestState, }; +use cli::TestObjective; + +use core::configuration::{PeerLatency, TestConfiguration, TestSequence}; use clap_num::number_range; const LOG_TARGET: &str = "subsystem-bench"; @@ -40,23 +44,6 @@ fn le_100(s: &str) -> Result { fn le_5000(s: &str) -> Result { number_range(s, 0, 5000) } -#[derive(Debug, clap::Parser)] -#[clap(rename_all = "kebab-case")] -#[allow(missing_docs)] -pub struct TestSequenceOptions { - #[clap(short, long, ignore_case = true)] - pub path: String, -} - -/// Define the supported benchmarks targets -#[derive(Debug, Parser)] -#[command(about = "Test objectives", version, rename_all = "kebab-case")] -enum TestObjective { - /// Benchmark availability 
recovery strategies. - DataAvailabilityRead(DataAvailabilityReadOptions), - /// Run a test sequence specified in a file - TestSequence(TestSequenceOptions), -} #[derive(Debug, Parser)] #[allow(missing_docs)] @@ -65,6 +52,9 @@ struct BenchCli { /// The type of network to be emulated pub network: NetworkEmulation, + #[clap(flatten)] + pub standard_configuration: cli::StandardTestOptions, + #[clap(short, long)] /// The bandwidth of simulated remote peers in KiB pub peer_bandwidth: Option, @@ -86,7 +76,7 @@ struct BenchCli { pub peer_max_latency: Option, #[command(subcommand)] - pub objective: TestObjective, + pub objective: cli::TestObjective, } fn new_runtime() -> tokio::runtime::Runtime { @@ -104,10 +94,11 @@ impl BenchCli { let runtime = new_runtime(); + let configuration = self.standard_configuration; let mut test_config = match self.objective { TestObjective::TestSequence(options) => { let test_sequence = - availability::TestSequence::new_from_file(Path::new(&options.path)) + core::configuration::TestSequence::new_from_file(Path::new(&options.path)) .expect("File exists") .to_vec(); let num_steps = test_sequence.len(); @@ -117,8 +108,17 @@ impl BenchCli { ); for (index, test_config) in test_sequence.into_iter().enumerate() { gum::info!( - "{}", - format!("Current step {}/{}", index + 1, num_steps).bright_purple() + "{}, {}, {}, {}, {}, {}", + format!("Step {}/{}", index + 1, num_steps).bright_purple(), + format!("n_validators = {}", test_config.n_validators).blue(), + format!("n_cores = {}", test_config.n_cores).blue(), + format!( + "pov_size = {} - {}", + test_config.min_pov_size, test_config.max_pov_size + ) + .bright_black(), + format!("error = {}", test_config.error).bright_black(), + format!("latency = {:?}", test_config.latency).bright_black(), ); let candidate_count = test_config.n_cores * test_config.num_blocks; @@ -132,48 +132,30 @@ impl BenchCli { } return Ok(()) }, - TestObjective::DataAvailabilityRead(options) => match self.network { + 
TestObjective::DataAvailabilityRead(ref options) => match self.network { NetworkEmulation::Healthy => TestConfiguration::healthy_network( - options.num_blocks, - options.fetch_from_backers, - options.n_validators, - options.n_cores, - (0..options.n_cores) - .map(|_| { - random_pov_size( - options.min_pov_size * 1024, - options.max_pov_size * 1024, - ) - }) - .collect(), + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, ), NetworkEmulation::Degraded => TestConfiguration::degraded_network( - options.num_blocks, - options.fetch_from_backers, - options.n_validators, - options.n_cores, - (0..options.n_cores) - .map(|_| { - random_pov_size( - options.min_pov_size * 1024, - options.max_pov_size * 1024, - ) - }) - .collect(), + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, ), NetworkEmulation::Ideal => TestConfiguration::ideal_network( - options.num_blocks, - options.fetch_from_backers, - options.n_validators, - options.n_cores, - (0..options.n_cores) - .map(|_| { - random_pov_size( - options.min_pov_size * 1024, - options.max_pov_size * 1024, - ) - }) - .collect(), + self.objective, + configuration.num_blocks, + configuration.n_validators, + configuration.n_cores, + configuration.min_pov_size, + configuration.max_pov_size, ), }, }; diff --git a/polkadot/node/subsystem-bench/test_sequence.toml b/polkadot/node/subsystem-bench/test_sequence.toml deleted file mode 100644 index d32477b9efe96..0000000000000 --- a/polkadot/node/subsystem-bench/test_sequence.toml +++ /dev/null @@ -1,77 +0,0 @@ -[[TestConfiguration]] -use_fast_path = false -n_validators = 300 -n_cores = 20 -min_pov_size = 5242880 -max_pov_size = 5242880 -peer_bandwidth = 128000 -bandwidth = 52428800 -error = 33 -num_blocks = 5 - -[TestConfiguration.latency.min_latency] -secs = 0 -nanos = 
1000000 - -[TestConfiguration.latency.max_latency] -secs = 0 -nanos = 100000000 - -[[TestConfiguration]] -use_fast_path = false -n_validators = 500 -n_cores = 20 -min_pov_size = 5242880 -max_pov_size = 5242880 -peer_bandwidth = 128000 -bandwidth = 52428800 -error = 33 -num_blocks = 5 - -[TestConfiguration.latency.min_latency] -secs = 0 -nanos = 1000000 - -[TestConfiguration.latency.max_latency] -secs = 0 -nanos = 1000000000 - - -[[TestConfiguration]] -use_fast_path = false -n_validators = 1000 -n_cores = 20 -min_pov_size = 5242880 -max_pov_size = 5242880 -peer_bandwidth = 128000 -bandwidth = 52428800 -error = 33 -num_blocks = 5 - -[TestConfiguration.latency.min_latency] -secs = 0 -nanos = 1000000 - -[TestConfiguration.latency.max_latency] -secs = 0 -nanos = 1000000000 - - -[[TestConfiguration]] -use_fast_path = false -n_validators = 2000 -n_cores = 20 -min_pov_size = 5242880 -max_pov_size = 5242880 -peer_bandwidth = 128000 -bandwidth = 52428800 -error = 33 -num_blocks = 5 - -[TestConfiguration.latency.min_latency] -secs = 0 -nanos = 1000000 - -[TestConfiguration.latency.max_latency] -secs = 0 -nanos = 1000000000 diff --git a/polkadot/node/subsystem-bench/test_sequence.yaml b/polkadot/node/subsystem-bench/test_sequence.yaml new file mode 100644 index 0000000000000..088a7e15729b2 --- /dev/null +++ b/polkadot/node/subsystem-bench/test_sequence.yaml @@ -0,0 +1,56 @@ +TestConfiguration: +# Test 1 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 300 + n_cores: 10 + min_pov_size: 1120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 10 +# Test 2 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 500 + n_cores: 10 + min_pov_size: 1120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + 
secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 10 + +# Test 2 +- objective: !DataAvailabilityRead + fetch_from_backers: false + n_validators: 1000 + n_cores: 10 + min_pov_size: 1120 + max_pov_size: 5120 + peer_bandwidth: 52428800 + bandwidth: 52428800 + latency: + min_latency: + secs: 0 + nanos: 1000000 + max_latency: + secs: 0 + nanos: 100000000 + error: 3 + num_blocks: 10 From 28438650ef36a17528cf45883712786ca9dc034d Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 24 Nov 2023 16:05:51 +0200 Subject: [PATCH 19/45] Prepare to swtich to overseer Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + polkadot/node/subsystem-bench/Cargo.toml | 1 + .../subsystem-bench/src/availability/mod.rs | 20 ++--- .../subsystem-bench/src/core/mock/dummy.rs | 89 +++++++++++++++++++ .../node/subsystem-bench/src/core/mock/mod.rs | 76 ++++++++++++++++ polkadot/node/subsystem-bench/src/core/mod.rs | 1 + .../subsystem-bench/src/core/subsystem.rs | 16 ++++ .../subsystem-bench/src/subsystem-bench.rs | 8 +- .../procedural/src/pallet/expand/warnings.rs | 8 +- 9 files changed, 199 insertions(+), 21 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/core/mock/dummy.rs create mode 100644 polkadot/node/subsystem-bench/src/core/mock/mod.rs create mode 100644 polkadot/node/subsystem-bench/src/core/subsystem.rs diff --git a/Cargo.lock b/Cargo.lock index b40a40db47b57..44a9093710ecc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13027,6 +13027,7 @@ dependencies = [ "itertools 0.11.0", "log", "parity-scale-codec", + "paste", "polkadot-availability-recovery", "polkadot-erasure-coding", "polkadot-node-metrics", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 9dab8dce8455e..d1c68c6e5f54f 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -53,6 +53,7 @@ prometheus_endpoint = { package = "substrate-prometheus-endpoint", path = "../.. 
prometheus = { version = "0.13.0", default-features = false } serde = "1.0.192" serde_yaml = "0.9" +paste = "1.0.14" [features] default = [] diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 9be15c576e3aa..8bd28b02bd73c 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -129,7 +129,7 @@ pub struct TestEnvironment { // for the whole duration of the test. instance: AvailabilityRecoverySubsystemInstance, // The test intial state. The current state is owned by `env_task`. - state: TestState, + config: TestConfiguration, // A handle to the network emulator. network: NetworkEmulator, // Configuration/env metrics @@ -140,6 +140,7 @@ impl TestEnvironment { // Create a new test environment with specified initial state and prometheus registry. // We use prometheus metrics to collect per job task poll time and subsystem metrics. pub fn new(runtime: tokio::runtime::Handle, state: TestState, registry: Registry) -> Self { + let config = state.config().clone(); let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(&registry)).unwrap(); let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( &registry, @@ -153,9 +154,9 @@ impl TestEnvironment { let metrics = TestEnvironmentMetrics::new(&registry).expect("Metrics need to be registered"); let mut network = NetworkEmulator::new( - state.config().n_validators, + config.n_validators, state.validator_authority_id.clone(), - state.config().peer_bandwidth, + config.peer_bandwidth, task_manager.spawn_handle(), &registry, ); @@ -168,7 +169,7 @@ impl TestEnvironment { let spawn_handle = task_manager.spawn_handle(); // Our node rate limiting - let mut rx_limiter = RateLimit::new(10, state.config.bandwidth); + let mut rx_limiter = RateLimit::new(10, config.bandwidth); let (ingress_tx, mut ingress_rx) = tokio::sync::mpsc::unbounded_channel::(); let
our_network_stats = network.peer_stats(0); @@ -204,11 +205,11 @@ impl TestEnvironment { .unwrap(); }); - TestEnvironment { task_manager, registry, to_subsystem, instance, state, network, metrics } + TestEnvironment { task_manager, registry, to_subsystem, instance, config, network, metrics } } pub fn config(&self) -> &TestConfiguration { - self.state.config() + &self.config } pub fn network(&mut self) -> &mut NetworkEmulator { @@ -457,8 +458,6 @@ impl TestEnvironment { } } -/// Implementation for chunks only -/// TODO: all recovery methods. impl AvailabilityRecoverySubsystemInstance { pub fn new( registry: &Registry, @@ -732,7 +731,7 @@ fn derive_erasure_chunks_with_proofs_and_root( (erasure_chunks, root) } -pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { +pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestState) { let config = env.config().clone(); env.send_signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(new_leaf( @@ -754,8 +753,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment) { let block_start_ts = Instant::now(); for candidate_num in 0..config.n_cores as u64 { - let candidate = env - .state + let candidate = state .next_candidate() .expect("We always send up to n_cores*num_blocks; qed"); let (tx, rx) = oneshot::channel(); diff --git a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs new file mode 100644 index 0000000000000..122fc23ac52f2 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs @@ -0,0 +1,89 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+ +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! Dummy subsystem mocks. +//! +use paste::paste; + +use futures::{channel::oneshot, select, Future, FutureExt}; +use polkadot_node_subsystem::{ + overseer, AllMessages, FromOrchestra, HeadSupportsParachains, Overseer, OverseerConnector, + OverseerHandle, SpawnGlue, SpawnedSubsystem, Subsystem, SubsystemError, +}; +use std::time::Duration; +use tokio::time::sleep; + +macro_rules! mock { + // Just query by relay parent + ($subsystem_name:ident) => { + paste! { + pub struct [] {} + #[overseer::subsystem($subsystem_name, error=SubsystemError, prefix=self::overseer)] + impl [] { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: stringify!($subsystem_name), future } + } + } + + #[overseer::contextbounds($subsystem_name, prefix = self::overseer)] + impl [] { + async fn run(self, mut ctx: Context) { + let mut count_total_msg = 0; + loop { + futures::select!{ + _msg = ctx.recv().fuse() => { + count_total_msg +=1; + } + _ = sleep(Duration::from_secs(6)).fuse() => { + if count_total_msg > 0 { + gum::info!(target: "mock-subsystems", "Subsystem {} processed {} messages since last time", stringify!($subsystem_name), count_total_msg); + } + count_total_msg = 0; + } + } + } + } + } + } + }; +} + +mock!(AvailabilityStore); +mock!(StatementDistribution); +mock!(BitfieldSigning); +mock!(BitfieldDistribution); +mock!(Provisioner); +mock!(NetworkBridgeRx); +mock!(CollationGeneration); +mock!(CollatorProtocol); +mock!(GossipSupport); +mock!(DisputeDistribution); +mock!(DisputeCoordinator); +mock!(ProspectiveParachains); +mock!(PvfChecker); 
+mock!(CandidateBacking); +mock!(AvailabilityDistribution); +mock!(CandidateValidation); +mock!(AvailabilityRecovery); +mock!(NetworkBridgeTx); +mock!(ChainApi); +mock!(ChainSelection); +mock!(ApprovalVoting); +mock!(ApprovalDistribution); +mock!(RuntimeApi); + + diff --git a/polkadot/node/subsystem-bench/src/core/mock/mod.rs b/polkadot/node/subsystem-bench/src/core/mock/mod.rs new file mode 100644 index 0000000000000..f13e87c8683b3 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/mod.rs @@ -0,0 +1,76 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use polkadot_node_subsystem::{ + overseer, AllMessages, FromOrchestra, HeadSupportsParachains, Overseer, OverseerConnector, + OverseerHandle, SpawnGlue, SpawnedSubsystem, Subsystem, +}; +use polkadot_node_subsystem_types::Hash; + +pub mod dummy; +mod temp; + +use dummy::*; +use sc_service::SpawnTaskHandle; + +struct AlwaysSupportsParachains {} +#[async_trait::async_trait] +impl HeadSupportsParachains for AlwaysSupportsParachains { + async fn head_supports_parachains(&self, _head: &Hash) -> bool { + true + } +} + +pub fn new_overseer_with_dummy_subsystems(spawn_task_handle: SpawnTaskHandle) { + // Initialize a mock overseer. + // All subsystem except approval_voting and approval_distribution are mock subsystems. 
+ let spawner_glue = SpawnGlue(spawn_task_handle); + let overseer_connector = OverseerConnector::with_event_capacity(64000); + let builder = Overseer::builder() + .approval_voting(MockApprovalVoting {}) + .approval_distribution(MockApprovalDistribution {}) + .availability_recovery(MockAvailabilityRecovery {}) + .candidate_validation(MockCandidateValidation {}) + .chain_api(MockChainApi { }) + .chain_selection(MockChainSelection {}) + .dispute_coordinator(MockDisputeCoordinator {}) + .runtime_api(MockRuntimeApi { }) + .network_bridge_tx(MockNetworkBridgeTx {}) + .availability_distribution(MockAvailabilityDistribution {}) + .availability_store(MockAvailabilityStore {}) + .pvf_checker(MockPvfChecker {}) + .candidate_backing(MockCandidateBacking {}) + .statement_distribution(MockStatementDistribution {}) + .bitfield_signing(MockBitfieldSigning {}) + .bitfield_distribution(MockBitfieldDistribution {}) + .provisioner(MockProvisioner {}) + .network_bridge_rx(MockNetworkBridgeRx {}) + .collation_generation(MockCollationGeneration {}) + .collator_protocol(MockCollatorProtocol {}) + .gossip_support(MockGossipSupport {}) + .dispute_distribution(MockDisputeDistribution {}) + .prospective_parachains(MockProspectiveParachains {}) + .activation_external_listeners(Default::default()) + .span_per_active_leaf(Default::default()) + .active_leaves(Default::default()) + .metrics(Default::default()) + .supports_parachains(AlwaysSupportsParachains {}) + .spawner(spawner_glue); + + let (mock_overseer, mock_overseer_handle) = + builder.build_with_connector(overseer_connector).expect("Should not fail"); + +} \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 06aa58f7256b0..af2abf0860cdf 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -29,3 +29,4 @@ pub mod display; pub mod environment; pub mod keyring; pub mod network; +pub mod mock; 
diff --git a/polkadot/node/subsystem-bench/src/core/subsystem.rs b/polkadot/node/subsystem-bench/src/core/subsystem.rs new file mode 100644 index 0000000000000..c61e641d255d8 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/subsystem.rs @@ -0,0 +1,16 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index b94e594e59452..ca561e5c4955e 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -126,9 +126,9 @@ impl BenchCli { let mut state = TestState::new(test_config); state.generate_candidates(candidate_count); let mut env = - TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); + TestEnvironment::new(runtime.handle().clone(), state.clone(), Registry::new()); - runtime.block_on(availability::bench_chunk_recovery(&mut env)); + runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); } return Ok(()) }, @@ -189,9 +189,9 @@ impl BenchCli { let mut state = TestState::new(test_config); state.generate_candidates(candidate_count); - let mut env = TestEnvironment::new(runtime.handle().clone(), state, Registry::new()); + let mut env = TestEnvironment::new(runtime.handle().clone(), state.clone(), 
Registry::new()); - runtime.block_on(availability::bench_chunk_recovery(&mut env)); + runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); Ok(()) } diff --git a/substrate/frame/support/procedural/src/pallet/expand/warnings.rs b/substrate/frame/support/procedural/src/pallet/expand/warnings.rs index 6ce2097c26847..030e3ddaf3232 100644 --- a/substrate/frame/support/procedural/src/pallet/expand/warnings.rs +++ b/substrate/frame/support/procedural/src/pallet/expand/warnings.rs @@ -33,9 +33,7 @@ pub(crate) fn weight_witness_warning( if dev_mode { return } - let CallWeightDef::Immediate(w) = &method.weight else { - return - }; + let CallWeightDef::Immediate(w) = &method.weight else { return }; let partial_warning = Warning::new_deprecated("UncheckedWeightWitness") .old("not check weight witness data") @@ -66,9 +64,7 @@ pub(crate) fn weight_constant_warning( if dev_mode { return } - let syn::Expr::Lit(lit) = weight else { - return - }; + let syn::Expr::Lit(lit) = weight else { return }; let warning = Warning::new_deprecated("ConstantWeight") .index(warnings.len()) From b17a1477ede5840d882a69d44f8e0a40eb986c56 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 11:28:53 +0200 Subject: [PATCH 20/45] add mocked subsystems Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + polkadot/node/subsystem-bench/Cargo.toml | 1 + .../src/availability/configuration.rs | 1 - .../subsystem-bench/src/availability/mod.rs | 5 +- polkadot/node/subsystem-bench/src/cli.rs | 7 +- .../subsystem-bench/src/core/configuration.rs | 22 +- .../subsystem-bench/src/core/environment.rs | 2 - .../subsystem-bench/src/core/mock/av_store.rs | 127 +++++++++ .../subsystem-bench/src/core/mock/dummy.rs | 10 +- .../node/subsystem-bench/src/core/mock/mod.rs | 97 ++++--- .../src/core/mock/network_bridge.rs | 262 ++++++++++++++++++ .../src/core/mock/runtime_api.rs | 107 +++++++ polkadot/node/subsystem-bench/src/core/mod.rs | 2 +- .../subsystem-bench/src/subsystem-bench.rs | 21 +- 14 files 
changed, 587 insertions(+), 78 deletions(-) create mode 100644 polkadot/node/subsystem-bench/src/core/mock/av_store.rs create mode 100644 polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs create mode 100644 polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs diff --git a/Cargo.lock b/Cargo.lock index 15cda46316f92..b349886761ad1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13433,6 +13433,7 @@ dependencies = [ "futures-timer", "itertools 0.11.0", "log", + "orchestra", "parity-scale-codec", "paste", "polkadot-availability-recovery", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index d1c68c6e5f54f..8296874c0dab5 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -54,6 +54,7 @@ prometheus = { version = "0.13.0", default-features = false } serde = "1.0.192" serde_yaml = "0.9" paste = "1.0.14" +orchestra = { version = "0.3.3", default-features = false, features=["futures_channel"] } [features] default = [] diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs index f96b8e2cb7cea..1274862a8e4a1 100644 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ b/polkadot/node/subsystem-bench/src/availability/configuration.rs @@ -14,7 +14,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use super::*; use serde::{Deserialize, Serialize}; /// The test input parameters diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 8bd28b02bd73c..3f95985050746 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -753,9 +753,8 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat let block_start_ts = Instant::now(); for candidate_num in 0..config.n_cores as u64 { - let candidate = state - .next_candidate() - .expect("We always send up to n_cores*num_blocks; qed"); + let candidate = + state.next_candidate().expect("We always send up to n_cores*num_blocks; qed"); let (tx, rx) = oneshot::channel(); batch.push(rx); diff --git a/polkadot/node/subsystem-bench/src/cli.rs b/polkadot/node/subsystem-bench/src/cli.rs index 2f00ad2f35857..ee67a01d449e3 100644 --- a/polkadot/node/subsystem-bench/src/cli.rs +++ b/polkadot/node/subsystem-bench/src/cli.rs @@ -13,14 +13,9 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use super::availability::{ - AvailabilityRecoveryConfiguration, DataAvailabilityReadOptions, NetworkEmulation, - TestEnvironment, TestState, -}; +use super::availability::DataAvailabilityReadOptions; use serde::{Deserialize, Serialize}; -use super::core::configuration::{PeerLatency, TestConfiguration, TestSequence}; - #[derive(Debug, Clone, Serialize, Deserialize, clap::Parser)] #[clap(rename_all = "kebab-case")] #[allow(missing_docs)] diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs index 017d4023ef654..4526505c3a64d 100644 --- a/polkadot/node/subsystem-bench/src/core/configuration.rs +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -15,8 +15,6 @@ // along with Polkadot. If not, see . 
use std::path::Path; -use crate::availability::AvailabilityRecoveryConfiguration; - use super::*; pub use crate::cli::TestObjective; use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; @@ -81,7 +79,7 @@ pub struct TestSequence { } impl TestSequence { - pub fn to_vec(mut self) -> Vec { + pub fn to_vec(self) -> Vec { self.test_configurations .into_iter() .map(|mut config| { @@ -188,3 +186,21 @@ impl TestConfiguration { } } } + +/// Produce a randomized duration between `min` and `max`. +pub fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option { + if let Some(peer_latency) = maybe_peer_latency { + Some( + Uniform::from(peer_latency.min_latency..=peer_latency.max_latency) + .sample(&mut thread_rng()), + ) + } else { + None + } +} + +/// Generate a random error based on `probability`. +/// `probability` should be a number between 0 and 100. +pub fn random_error(probability: usize) -> bool { + Uniform::from(0..=99).sample(&mut thread_rng()) < probability +} diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index 6a680799972d7..e6b09a1c13e63 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -14,8 +14,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use super::*; -use network::NetworkEmulator; use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; diff --git a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs new file mode 100644 index 0000000000000..e84aeba5b6b7d --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs @@ -0,0 +1,127 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. 
+ +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic av store subsystem mockup suitable to be used in benchmarks. + +use parity_scale_codec::Encode; +use polkadot_primitives::CandidateHash; + +use std::collections::HashMap; + +use futures::{channel::oneshot, FutureExt}; + +use polkadot_node_primitives::ErasureChunk; + +use polkadot_node_subsystem::{ + messages::AvailabilityStoreMessage, overseer, SpawnedSubsystem, SubsystemError, +}; + +pub struct AvailabilityStoreState { + candidate_hashes: HashMap, + chunks: Vec>, +} + +const LOG_TARGET: &str = "subsystem-bench::av-store-mock"; + +/// A mock of the availability store subsystem. 
This one also generates all the +/// candidates that a +pub struct MockAvailabilityStore { + state: AvailabilityStoreState, +} + +impl MockAvailabilityStore { + pub fn new( + chunks: Vec>, + candidate_hashes: HashMap, + ) -> MockAvailabilityStore { + Self { state: AvailabilityStoreState { chunks, candidate_hashes } } + } + + async fn respond_to_query_all_request( + &self, + candidate_hash: CandidateHash, + send_chunk: impl Fn(usize) -> bool, + tx: oneshot::Sender>, + ) { + let candidate_index = self + .state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let v = self + .state + .chunks + .get(*candidate_index as usize) + .unwrap() + .iter() + .filter(|c| send_chunk(c.index.0 as usize)) + .cloned() + .collect(); + + let _ = tx.send(v); + } +} + +#[overseer::subsystem(AvailabilityStore, error=SubsystemError, prefix=self::overseer)] +impl MockAvailabilityStore { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "av-store-mock-subsystem", future } + } +} + +#[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)] +impl MockAvailabilityStore { + async fn run(self, mut ctx: Context) { + loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Communication { msg } => match msg { + AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx) => { + // We never have the full available data. + let _ = tx.send(None); + }, + AvailabilityStoreMessage::QueryAllChunks(candidate_hash, tx) => { + // We always have our own chunk. 
+ self.respond_to_query_all_request(candidate_hash, |index| index == 0, tx) + .await; + }, + AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) => { + let candidate_index = self + .state + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk_size = self.state.chunks.get(*candidate_index as usize).unwrap() + [0] + .encoded_size(); + let _ = tx.send(Some(chunk_size)); + }, + _ => { + unimplemented!("Unexpected runtime-api message") + }, + }, + } + } + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs index 122fc23ac52f2..196cc81f1e822 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs @@ -14,14 +14,10 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . //! Dummy subsystem mocks. -//! use paste::paste; -use futures::{channel::oneshot, select, Future, FutureExt}; -use polkadot_node_subsystem::{ - overseer, AllMessages, FromOrchestra, HeadSupportsParachains, Overseer, OverseerConnector, - OverseerHandle, SpawnGlue, SpawnedSubsystem, Subsystem, SubsystemError, -}; +use futures::FutureExt; +use polkadot_node_subsystem::{overseer, SpawnedSubsystem, SubsystemError}; use std::time::Duration; use tokio::time::sleep; @@ -85,5 +81,3 @@ mock!(ChainSelection); mock!(ApprovalVoting); mock!(ApprovalDistribution); mock!(RuntimeApi); - - diff --git a/polkadot/node/subsystem-bench/src/core/mock/mod.rs b/polkadot/node/subsystem-bench/src/core/mock/mod.rs index f13e87c8683b3..df874de31a7c5 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/mod.rs @@ -15,18 +15,20 @@ // along with Polkadot. If not, see . 
use polkadot_node_subsystem::{ - overseer, AllMessages, FromOrchestra, HeadSupportsParachains, Overseer, OverseerConnector, - OverseerHandle, SpawnGlue, SpawnedSubsystem, Subsystem, + HeadSupportsParachains, Overseer, OverseerConnector, OverseerHandle, SpawnGlue, }; use polkadot_node_subsystem_types::Hash; +pub mod av_store; pub mod dummy; -mod temp; +pub mod network_bridge; +pub mod runtime_api; + +pub(crate) use dummy::*; -use dummy::*; use sc_service::SpawnTaskHandle; -struct AlwaysSupportsParachains {} +pub struct AlwaysSupportsParachains {} #[async_trait::async_trait] impl HeadSupportsParachains for AlwaysSupportsParachains { async fn head_supports_parachains(&self, _head: &Hash) -> bool { @@ -34,43 +36,50 @@ impl HeadSupportsParachains for AlwaysSupportsParachains { } } -pub fn new_overseer_with_dummy_subsystems(spawn_task_handle: SpawnTaskHandle) { - // Initialize a mock overseer. - // All subsystem except approval_voting and approval_distribution are mock subsystems. - let spawner_glue = SpawnGlue(spawn_task_handle); - let overseer_connector = OverseerConnector::with_event_capacity(64000); - let builder = Overseer::builder() - .approval_voting(MockApprovalVoting {}) - .approval_distribution(MockApprovalDistribution {}) - .availability_recovery(MockAvailabilityRecovery {}) - .candidate_validation(MockCandidateValidation {}) - .chain_api(MockChainApi { }) - .chain_selection(MockChainSelection {}) - .dispute_coordinator(MockDisputeCoordinator {}) - .runtime_api(MockRuntimeApi { }) - .network_bridge_tx(MockNetworkBridgeTx {}) - .availability_distribution(MockAvailabilityDistribution {}) - .availability_store(MockAvailabilityStore {}) - .pvf_checker(MockPvfChecker {}) - .candidate_backing(MockCandidateBacking {}) - .statement_distribution(MockStatementDistribution {}) - .bitfield_signing(MockBitfieldSigning {}) - .bitfield_distribution(MockBitfieldDistribution {}) - .provisioner(MockProvisioner {}) - .network_bridge_rx(MockNetworkBridgeRx {}) - 
.collation_generation(MockCollationGeneration {}) - .collator_protocol(MockCollatorProtocol {}) - .gossip_support(MockGossipSupport {}) - .dispute_distribution(MockDisputeDistribution {}) - .prospective_parachains(MockProspectiveParachains {}) - .activation_external_listeners(Default::default()) - .span_per_active_leaf(Default::default()) - .active_leaves(Default::default()) - .metrics(Default::default()) - .supports_parachains(AlwaysSupportsParachains {}) - .spawner(spawner_glue); - - let (mock_overseer, mock_overseer_handle) = - builder.build_with_connector(overseer_connector).expect("Should not fail"); +// An orchestra with dummy subsystems +macro_rules! dummy_builder { + ($spawn_task_handle: ident) => { + // Initialize a mock overseer. + // All subsystem except approval_voting and approval_distribution are mock subsystems. + Overseer::builder() + .approval_voting(MockApprovalVoting {}) + .approval_distribution(MockApprovalDistribution {}) + .availability_recovery(MockAvailabilityRecovery {}) + .candidate_validation(MockCandidateValidation {}) + .chain_api(MockChainApi {}) + .chain_selection(MockChainSelection {}) + .dispute_coordinator(MockDisputeCoordinator {}) + .runtime_api(MockRuntimeApi {}) + .network_bridge_tx(MockNetworkBridgeTx {}) + .availability_distribution(MockAvailabilityDistribution {}) + .availability_store(MockAvailabilityStore {}) + .pvf_checker(MockPvfChecker {}) + .candidate_backing(MockCandidateBacking {}) + .statement_distribution(MockStatementDistribution {}) + .bitfield_signing(MockBitfieldSigning {}) + .bitfield_distribution(MockBitfieldDistribution {}) + .provisioner(MockProvisioner {}) + .network_bridge_rx(MockNetworkBridgeRx {}) + .collation_generation(MockCollationGeneration {}) + .collator_protocol(MockCollatorProtocol {}) + .gossip_support(MockGossipSupport {}) + .dispute_distribution(MockDisputeDistribution {}) + .prospective_parachains(MockProspectiveParachains {}) + .activation_external_listeners(Default::default()) + 
.span_per_active_leaf(Default::default()) + .active_leaves(Default::default()) + .metrics(Default::default()) + .supports_parachains(AlwaysSupportsParachains {}) + .spawner(SpawnGlue($spawn_task_handle)) + }; +} -} \ No newline at end of file +pub fn new_overseer_with_dummy_subsystems( + spawn_task_handle: SpawnTaskHandle, +) -> (Overseer, AlwaysSupportsParachains>, OverseerHandle) { + let overseer_connector = OverseerConnector::with_event_capacity(64000); + let dummy = dummy_builder!(spawn_task_handle); + let builder = dummy.replace_chain_api(|_| MockChainApi {}); + // let (mock_overseer, mock_overseer_handle) = + builder.build_with_connector(overseer_connector).expect("Should not fail") +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs new file mode 100644 index 0000000000000..cd374f8c18db3 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -0,0 +1,262 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic av store subsystem mockup suitable to be used in benchmarks. 
+ +use parity_scale_codec::Encode; + +use std::collections::HashMap; + +use futures::FutureExt; + +use polkadot_node_primitives::{AvailableData, ErasureChunk}; + +use polkadot_primitives::CandidateHash; +use sc_network::{OutboundFailure, RequestFailure}; + +use polkadot_node_subsystem::{ + messages::NetworkBridgeTxMessage, overseer, SpawnedSubsystem, SubsystemError, +}; + +use polkadot_node_network_protocol::request_response::{ + self as req_res, v1::ChunkResponse, Requests, +}; + +use crate::core::{ + configuration::{random_error, random_latency, TestConfiguration}, + network::{NetworkAction, NetworkEmulator, RateLimit}, +}; + +/// The availability store state of all emulated peers. +/// The network bridge tx mock will respond to requests as if the request is being serviced +/// by a remote peer on the network +pub struct NetworkAvailabilityState { + candidate_hashes: HashMap, + available_data: Vec, + chunks: Vec>, +} + +const LOG_TARGET: &str = "subsystem-bench::network-bridge-tx-mock"; + +/// A mock of the network bridge tx subsystem. 
+pub struct MockNetworkBridgeTx { + /// The test configuration + config: TestConfiguration, + /// The network availability state + availabilty: NetworkAvailabilityState, + /// A network emulator instance + network: NetworkEmulator, +} + +impl MockNetworkBridgeTx { + pub fn new( + config: TestConfiguration, + availabilty: NetworkAvailabilityState, + network: NetworkEmulator, + ) -> MockNetworkBridgeTx { + Self { config, availabilty, network } + } + + pub fn respond_to_send_request( + &mut self, + request: Requests, + ingress_tx: &mut tokio::sync::mpsc::UnboundedSender, + ) -> NetworkAction { + let ingress_tx = ingress_tx.clone(); + + match request { + Requests::ChunkFetchingV1(outgoing_request) => { + let validator_index: usize = outgoing_request.payload.index.0 as usize; + let candidate_hash = outgoing_request.payload.candidate_hash; + + let candidate_index = self + .availabilty + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let chunk: ChunkResponse = + self.availabilty.chunks.get(*candidate_index as usize).unwrap() + [validator_index] + .clone() + .into(); + let mut size = chunk.encoded_size(); + + let response = if random_error(self.config.error) { + // Error will not account to any bandwidth used.
+ size = 0; + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) + }; + + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + let authority_discovery_id_clone = authority_discovery_id.clone(); + + let future = async move { + let _ = outgoing_request.pending_response.send(response); + } + .boxed(); + + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + + NetworkAction::new( + authority_discovery_id, + future_wrapper, + size, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + }, + Requests::AvailableDataFetchingV1(outgoing_request) => { + let candidate_hash = outgoing_request.payload.candidate_hash; + let candidate_index = self + .availabilty + .candidate_hashes + .get(&candidate_hash) + .expect("candidate was generated previously; qed"); + gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let available_data = + self.availabilty.available_data.get(*candidate_index as usize).unwrap().clone(); + + let size = available_data.encoded_size(); + + let response = if random_error(self.config.error) { + Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) + } else { + Ok(req_res::v1::AvailableDataFetchingResponse::from(Some(available_data)) + .encode()) + }; + + let future = async move { + let _ = outgoing_request.pending_response.send(response); + } + .boxed(); + + let authority_discovery_id = match outgoing_request.peer { + 
req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + let authority_discovery_id_clone = authority_discovery_id.clone(); + + let future_wrapper = async move { + // Forward the response to the ingress channel of our node. + // On receive side we apply our node receiving rate limit. + let action = + NetworkAction::new(authority_discovery_id_clone, future, size, None); + ingress_tx.send(action).unwrap(); + } + .boxed(); + + NetworkAction::new( + authority_discovery_id, + future_wrapper, + size, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + }, + _ => panic!("received an unexpected request"), + } + } +} + +#[overseer::subsystem(NetworkBridgeTx, error=SubsystemError, prefix=self::overseer)] +impl MockNetworkBridgeTx { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "network-bridge-tx-mock-subsystem", future } + } +} + +#[overseer::contextbounds(NetworkBridgeTx, prefix = self::overseer)] +impl MockNetworkBridgeTx { + async fn run(mut self, mut ctx: Context) { + let (mut ingress_tx, mut ingress_rx) = + tokio::sync::mpsc::unbounded_channel::(); + + // Initialize our node bandwidth limits. + let mut rx_limiter = RateLimit::new(10, self.config.bandwidth); + + // Get a handle to our node network emulation stats. + let our_network_stats = self.network.peer_stats(0); + // This task will handle receipt of messages on our simulated network of the node. + let _ = ctx + .spawn_blocking( + "node0-rx", + async move { + while let Some(action) = ingress_rx.recv().await { + let size = action.size(); + + // account for our node receiving the data. + our_network_stats.inc_received(size); + + rx_limiter.reap(size).await; + action.run().await; + } + } + .boxed(), + ) + .expect("We never fail to spawn tasks"); + + // Main subsystem loop. 
+ loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Communication { msg } => match msg { + NetworkBridgeTxMessage::SendRequests(requests, _if_disconnected) => { + for request in requests { + self.network.inc_sent(request_size(&request)); + let action = self.respond_to_send_request(request, &mut ingress_tx); + // Will account for our node sending the request over the emulated + // network. + self.network.submit_peer_action(action.peer(), action); + } + }, + _ => { + unimplemented!("Unexpected runtime-api message") + }, + }, + } + } + } +} + +// A helper to determine the request payload size. +fn request_size(request: &Requests) -> u64 { + match request { + Requests::ChunkFetchingV1(outgoing_request) => + outgoing_request.payload.encoded_size() as u64, + Requests::AvailableDataFetchingV1(outgoing_request) => + outgoing_request.payload.encoded_size() as u64, + _ => panic!("received an unexpected request"), + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs new file mode 100644 index 0000000000000..e8c1098b97f03 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs @@ -0,0 +1,107 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+ +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . +//! +//! A generic runtime api subsystem mockup suitable to be used in benchmarks. + +use polkadot_primitives::{ + AuthorityDiscoveryId, GroupIndex, IndexedVec, SessionInfo, ValidatorId, ValidatorIndex, +}; + +use polkadot_node_subsystem::{ + messages::{RuntimeApiMessage, RuntimeApiRequest}, + overseer, SpawnedSubsystem, SubsystemError, +}; + +use crate::core::configuration::TestConfiguration; +use futures::FutureExt; + +pub struct RuntimeApiState { + validator_public: Vec, + validator_authority_id: Vec, +} + +pub struct MockRuntimeApi { + state: RuntimeApiState, + config: TestConfiguration, +} + +impl MockRuntimeApi { + pub fn new( + config: TestConfiguration, + validator_public: Vec, + validator_authority_id: Vec, + ) -> MockRuntimeApi { + Self { state: RuntimeApiState { validator_public, validator_authority_id }, config } + } + + fn session_info(&self) -> SessionInfo { + let all_validators = (0..self.config.n_validators) + .map(|i| ValidatorIndex(i as _)) + .collect::>(); + + let validator_groups = all_validators.chunks(5).map(|x| Vec::from(x)).collect::>(); + + SessionInfo { + validators: self.state.validator_public.clone().into(), + discovery_keys: self.state.validator_authority_id.clone(), + validator_groups: IndexedVec::>::from(validator_groups), + assignment_keys: vec![], + n_cores: self.config.n_cores as u32, + zeroth_delay_tranche_width: 0, + relay_vrf_modulo_samples: 0, + n_delay_tranches: 0, + no_show_slots: 0, + needed_approvals: 0, + active_validator_indices: vec![], + dispute_period: 6, + random_seed: [0u8; 32], + } + } +} + +#[overseer::subsystem(RuntimeApi, error=SubsystemError, prefix=self::overseer)] +impl MockRuntimeApi { + fn start(self, ctx: Context) -> SpawnedSubsystem { + let future = self.run(ctx).map(|_| Ok(())).boxed(); + + SpawnedSubsystem { name: "runtime-api-mock-subsystem", future } + } +} + 
+#[overseer::contextbounds(RuntimeApi, prefix = self::overseer)] +impl MockRuntimeApi { + async fn run(self, mut ctx: Context) { + loop { + let msg = ctx.recv().await.expect("Overseer never fails us"); + + match msg { + orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Communication { msg } => match msg { + RuntimeApiMessage::Request( + _request, + RuntimeApiRequest::SessionInfo(_session_index, sender), + ) => { + let _ = sender.send(Ok(Some(self.session_info()))); + }, + // Long term TODO: implement more as needed. + _ => { + unimplemented!("Unexpected runtime-api message") + }, + }, + } + } + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index af2abf0860cdf..11ca03dbda4c2 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -28,5 +28,5 @@ pub mod configuration; pub mod display; pub mod environment; pub mod keyring; -pub mod network; pub mod mock; +pub mod network; diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index ca561e5c4955e..ce9e8aa3a8be0 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -26,16 +26,13 @@ pub(crate) mod availability; pub(crate) mod cli; pub(crate) mod core; -use availability::{ - AvailabilityRecoveryConfiguration, DataAvailabilityReadOptions, NetworkEmulation, - TestEnvironment, TestState, -}; +use availability::{NetworkEmulation, TestEnvironment, TestState}; use cli::TestObjective; -use core::configuration::{PeerLatency, TestConfiguration, TestSequence}; +use core::configuration::TestConfiguration; use clap_num::number_range; -const LOG_TARGET: &str = "subsystem-bench"; +// const LOG_TARGET: &str = "subsystem-bench"; fn le_100(s: &str) -> Result { number_range(s, 0, 100) @@ -125,14 +122,17 @@ impl BenchCli { let mut state = 
TestState::new(test_config); state.generate_candidates(candidate_count); - let mut env = - TestEnvironment::new(runtime.handle().clone(), state.clone(), Registry::new()); + let mut env = TestEnvironment::new( + runtime.handle().clone(), + state.clone(), + Registry::new(), + ); runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); } return Ok(()) }, - TestObjective::DataAvailabilityRead(ref options) => match self.network { + TestObjective::DataAvailabilityRead(ref _options) => match self.network { NetworkEmulation::Healthy => TestConfiguration::healthy_network( self.objective, configuration.num_blocks, @@ -189,7 +189,8 @@ impl BenchCli { let mut state = TestState::new(test_config); state.generate_candidates(candidate_count); - let mut env = TestEnvironment::new(runtime.handle().clone(), state.clone(), Registry::new()); + let mut env = + TestEnvironment::new(runtime.handle().clone(), state.clone(), Registry::new()); runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); From 4724d8c98d6b47643cbccb9e00dc7e550dad3a78 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 19:43:15 +0200 Subject: [PATCH 21/45] full overseer based implementation complete Signed-off-by: Andrei Sandu --- polkadot/node/overseer/src/lib.rs | 2 + .../subsystem-bench/src/availability/mod.rs | 637 ++++++------------ .../subsystem-bench/src/core/configuration.rs | 33 + .../subsystem-bench/src/core/environment.rs | 28 + .../subsystem-bench/src/core/mock/av_store.rs | 8 +- .../subsystem-bench/src/core/mock/dummy.rs | 25 +- .../node/subsystem-bench/src/core/mock/mod.rs | 26 +- .../src/core/mock/network_bridge.rs | 7 +- .../src/core/mock/runtime_api.rs | 28 +- .../node/subsystem-bench/src/core/network.rs | 2 +- .../subsystem-bench/src/subsystem-bench.rs | 40 +- .../node/subsystem-test-helpers/src/mock.rs | 12 +- 12 files changed, 347 insertions(+), 501 deletions(-) diff --git a/polkadot/node/overseer/src/lib.rs b/polkadot/node/overseer/src/lib.rs index 
da99546a44f75..f4eddf1f41ceb 100644 --- a/polkadot/node/overseer/src/lib.rs +++ b/polkadot/node/overseer/src/lib.rs @@ -276,6 +276,7 @@ impl From> for BlockInfo { /// An event from outside the overseer scope, such /// as the substrate framework or user interaction. +#[derive(Debug)] pub enum Event { /// A new block was imported. /// @@ -300,6 +301,7 @@ pub enum Event { } /// Some request from outer world. +#[derive(Debug)] pub enum ExternalRequest { /// Wait for the activation of a particular hash /// and be notified by means of the return channel. diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 3f95985050746..54a3cd9613198 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -21,14 +21,16 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; +use tokio::runtime::{Handle, Runtime}; + +use polkadot_node_subsystem::{ + BlockInfo, Event, Overseer, OverseerConnector, OverseerHandle, SpawnGlue, +}; +use sc_network::request_responses::ProtocolConfig; use colored::Colorize; -use futures::{ - channel::{mpsc, oneshot}, - stream::FuturesUnordered, - FutureExt, SinkExt, StreamExt, -}; +use futures::{channel::oneshot, stream::FuturesUnordered, FutureExt, SinkExt, StreamExt}; use polkadot_node_metrics::metrics::Metrics; use polkadot_availability_recovery::AvailabilityRecoverySubsystem; @@ -40,23 +42,30 @@ use polkadot_node_network_protocol::request_response::{ use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; use prometheus::Registry; -use sc_network::{config::RequestResponseConfig, OutboundFailure, RequestFailure}; +use sc_network::{OutboundFailure, RequestFailure}; use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; use polkadot_node_primitives::{BlockData, PoV, Proof}; use polkadot_node_subsystem::{ - messages::{ - AllMessages, AvailabilityRecoveryMessage, 
AvailabilityStoreMessage, NetworkBridgeTxMessage, - RuntimeApiMessage, RuntimeApiRequest, - }, - ActiveLeavesUpdate, FromOrchestra, OverseerSignal, Subsystem, + messages::{AllMessages, AvailabilityRecoveryMessage}, + ActiveLeavesUpdate, OverseerSignal, }; use std::net::{Ipv4Addr, SocketAddr}; +use crate::core::{ + configuration::TestAuthorities, + environment::TestEnvironmentDependencies, + mock::{ + av_store, + network_bridge::{self, MockNetworkBridgeTx, NetworkAvailabilityState}, + runtime_api, MockAvailabilityStore, MockRuntimeApi, + }, +}; + use super::core::{ configuration::{PeerLatency, TestConfiguration}, environment::TestEnvironmentMetrics, - keyring::Keyring, + mock::dummy_builder, network::*, }; @@ -64,14 +73,12 @@ const LOG_TARGET: &str = "subsystem-bench::availability"; use polkadot_node_primitives::{AvailableData, ErasureChunk}; -use super::cli::TestObjective; -use polkadot_node_subsystem_test_helpers::{ - make_buffered_subsystem_context, mock::new_leaf, TestSubsystemContextHandle, -}; +use super::{cli::TestObjective, core::mock::AlwaysSupportsParachains}; +use polkadot_node_subsystem_test_helpers::mock::new_block_import_event; use polkadot_node_subsystem_util::TimeoutExt; use polkadot_primitives::{ - AuthorityDiscoveryId, CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, IndexedVec, - PersistedValidationData, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex, + CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, PersistedValidationData, + SessionIndex, ValidatorIndex, }; use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::{SpawnTaskHandle, TaskManager}; @@ -81,28 +88,22 @@ pub mod configuration; pub use cli::{DataAvailabilityReadOptions, NetworkEmulation, NetworkOptions}; pub use configuration::AvailabilityRecoveryConfiguration; -// Deterministic genesis hash for protocol names +// A dummy genesis hash const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); -struct 
AvailabilityRecoverySubsystemInstance { - _protocol_config: RequestResponseConfig, -} - -/// The test environment is responsible for creating an instance of the availability recovery -/// subsystem and connecting it to an emulated overseer. +/// The test environment is the high level wrapper of all things required to test +/// a certain subsystem. /// /// ## Mockups -/// We emulate the following subsystems: -/// - runtime api -/// - network bridge -/// - availability store +/// The overseer is passed in during construction and it can host an arbitrary number of +/// real subsystems instances and the corresponding mocked instances such that the real +/// subsystems can get their messages answered. /// /// As the subsystem's performance depends on network connectivity, the test environment /// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation /// is configurable in terms of peer bandwidth, latency and connection error rate using /// uniform distribution sampling. /// -/// The mockup logic is implemented in `env_task` which owns and advances the `TestState`. /// /// ## Usage /// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem @@ -121,13 +122,14 @@ pub struct TestEnvironment { // A task manager that tracks task poll durations allows us to measure // per task CPU usage as we do in the Polkadot node. task_manager: TaskManager, + // Our runtime + runtime: tokio::runtime::Runtime, + // A runtime handle + runtime_handle: tokio::runtime::Handle, // The Prometheus metrics registry registry: Registry, - // A channel to the availability recovery subsystem - to_subsystem: mpsc::Sender>, - // Subsystem instance, currently keeps req/response protocol channel senders - // for the whole duration of the test. - instance: AvailabilityRecoverySubsystemInstance, + // A handle to the lovely overseer + overseer_handle: OverseerHandle, // The test intial state. The current state is owned by `env_task`. 
config: TestConfiguration, // A handle to the network emulator. @@ -136,62 +138,142 @@ pub struct TestEnvironment { metrics: TestEnvironmentMetrics, } -impl TestEnvironment { - // Create a new test environment with specified initial state and prometheus registry. - // We use prometheus metrics to collect per job task poll time and subsystem metrics. - pub fn new(runtime: tokio::runtime::Handle, state: TestState, registry: Registry) -> Self { - let config = state.config().clone(); - let task_manager: TaskManager = TaskManager::new(runtime.clone(), Some(®istry)).unwrap(); - let (instance, virtual_overseer) = AvailabilityRecoverySubsystemInstance::new( - ®istry, - task_manager.spawn_handle(), - match &state.config().objective { - TestObjective::DataAvailabilityRead(options) => options.fetch_from_backers, - _ => panic!("Unexpected objective"), - }, - ); +fn build_overseer( + spawn_task_handle: SpawnTaskHandle, + runtime_api: MockRuntimeApi, + av_store: MockAvailabilityStore, + network_bridge: MockNetworkBridgeTx, + availability_recovery: AvailabilityRecoverySubsystem, +) -> (Overseer, AlwaysSupportsParachains>, OverseerHandle) { + let overseer_connector = OverseerConnector::with_event_capacity(64000); + let dummy = dummy_builder!(spawn_task_handle); + let builder = dummy + .replace_runtime_api(|_| runtime_api) + .replace_availability_store(|_| av_store) + .replace_network_bridge_tx(|_| network_bridge) + .replace_availability_recovery(|_| availability_recovery); + + builder.build_with_connector(overseer_connector).expect("Should not fail") +} - let metrics = - TestEnvironmentMetrics::new(®istry).expect("Metrics need to be registered"); - let mut network = NetworkEmulator::new( - config.n_validators, - state.validator_authority_id.clone(), - config.peer_bandwidth, - task_manager.spawn_handle(), - ®istry, - ); - - // Copy sender for later when we need to inject messages in to the subsystem. 
- let to_subsystem = virtual_overseer.tx.clone(); - - let task_state = state.clone(); - let task_network = network.clone(); - let spawn_handle = task_manager.spawn_handle(); +/// Takes a test configuration and uses it to creates the `TestEnvironment`. +pub fn prepare_test( + config: TestConfiguration, + state: &mut TestState, +) -> (TestEnvironment, ProtocolConfig) { + prepare_test_inner(config, state, TestEnvironmentDependencies::default()) +} + +/// Takes a test configuration and uses it to creates the `TestEnvironment`. +pub fn prepare_test_with_dependencies( + config: TestConfiguration, + state: &mut TestState, + dependencies: TestEnvironmentDependencies, +) -> (TestEnvironment, ProtocolConfig) { + prepare_test_inner(config, state, dependencies) +} + +fn prepare_test_inner( + config: TestConfiguration, + state: &mut TestState, + dependencies: TestEnvironmentDependencies, +) -> (TestEnvironment, ProtocolConfig) { + // We need to first create the high level test state object. + // This will then be decomposed into per subsystem states. + let candidate_count = config.n_cores * config.num_blocks; + state.generate_candidates(candidate_count); + + // Generate test authorities. 
+ let test_authorities = config.generate_authorities(); + + let runtime_api = runtime_api::MockRuntimeApi::new( + config.clone(), + test_authorities.validator_public.clone(), + test_authorities.validator_authority_id.clone(), + ); - // Our node rate limiting - let mut rx_limiter = RateLimit::new(10, config.bandwidth); - let (ingress_tx, mut ingress_rx) = tokio::sync::mpsc::unbounded_channel::(); - let our_network_stats = network.peer_stats(0); + let av_store = + av_store::MockAvailabilityStore::new(state.chunks.clone(), state.candidate_hashes.clone()); + + let availability_state = NetworkAvailabilityState { + candidate_hashes: state.candidate_hashes.clone(), + available_data: state.available_data.clone(), + chunks: state.chunks.clone(), + }; + + let network = NetworkEmulator::new( + config.n_validators.clone(), + test_authorities.validator_authority_id.clone(), + config.peer_bandwidth, + dependencies.task_manager.spawn_handle(), + &dependencies.registry, + ); - spawn_handle.spawn_blocking("node0-rx", "test-environment", async move { - while let Some(action) = ingress_rx.recv().await { - let size = action.size(); + let network_bridge_tx = network_bridge::MockNetworkBridgeTx::new( + config.clone(), + availability_state, + network.clone(), + ); + + let use_fast_path = match &state.config().objective { + TestObjective::DataAvailabilityRead(options) => options.fetch_from_backers, + _ => panic!("Unexpected objective"), + }; + + let (collation_req_receiver, req_cfg) = + IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); + + let subsystem = if use_fast_path { + AvailabilityRecoverySubsystem::with_fast_path( + collation_req_receiver, + Metrics::try_register(&dependencies.registry).unwrap(), + ) + } else { + AvailabilityRecoverySubsystem::with_chunks_only( + collation_req_receiver, + Metrics::try_register(&dependencies.registry).unwrap(), + ) + }; + + let (overseer, overseer_handle) = build_overseer( + dependencies.task_manager.spawn_handle(), 
+ runtime_api, + av_store, + network_bridge_tx, + subsystem, + ); - // account for our node receiving the data. - our_network_stats.inc_received(size); + ( + TestEnvironment::new( + dependencies.task_manager, + config, + dependencies.registry, + dependencies.runtime, + network, + overseer, + overseer_handle, + ), + req_cfg, + ) +} - rx_limiter.reap(size).await; - action.run().await; - } - }); +impl TestEnvironment { + // Create a new test environment with specified initial state and prometheus registry. + // We use prometheus metrics to collect per job task poll time and subsystem metrics. + pub fn new( + task_manager: TaskManager, + config: TestConfiguration, + registry: Registry, + runtime: Runtime, + network: NetworkEmulator, + overseer: Overseer, AlwaysSupportsParachains>, + overseer_handle: OverseerHandle, + ) -> Self { + let metrics = + TestEnvironmentMetrics::new(®istry).expect("Metrics need to be registered"); - // We need to start a receiver to process messages from the subsystem. - // This mocks an overseer and all dependent subsystems - task_manager.spawn_handle().spawn_blocking( - "test-environment", - "test-environment", - async move { Self::env_task(virtual_overseer, task_state, task_network, ingress_tx).await }, - ); + let spawn_handle = task_manager.spawn_handle(); + spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); let registry_clone = registry.clone(); task_manager @@ -205,7 +287,16 @@ impl TestEnvironment { .unwrap(); }); - TestEnvironment { task_manager, registry, to_subsystem, instance, config, network, metrics } + TestEnvironment { + task_manager, + runtime_handle: runtime.handle().clone(), + runtime, + registry, + overseer_handle, + config, + network, + metrics, + } } pub fn config(&self) -> &TestConfiguration { @@ -236,266 +327,20 @@ impl TestEnvironment { &self.metrics } - /// Generate a random error based on `probability`. - /// `probability` should be a number between 0 and 100. 
- fn random_error(probability: usize) -> bool { - Uniform::from(0..=99).sample(&mut thread_rng()) < probability - } - - pub fn request_size(request: &Requests) -> u64 { - match request { - Requests::ChunkFetchingV1(outgoing_request) => - outgoing_request.payload.encoded_size() as u64, - Requests::AvailableDataFetchingV1(outgoing_request) => - outgoing_request.payload.encoded_size() as u64, - _ => panic!("received an unexpected request"), - } - } - - pub fn respond_to_send_request( - state: &mut TestState, - request: Requests, - ingress_tx: tokio::sync::mpsc::UnboundedSender, - ) -> NetworkAction { - match request { - Requests::ChunkFetchingV1(outgoing_request) => { - let validator_index: usize = outgoing_request.payload.index.0 as usize; - let candidate_hash = outgoing_request.payload.candidate_hash; - - let candidate_index = state - .candidate_hashes - .get(&candidate_hash) - .expect("candidate was generated previously; qed"); - gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); - - let chunk: ChunkResponse = state.chunks.get(*candidate_index as usize).unwrap() - [validator_index] - .clone() - .into(); - let mut size = chunk.encoded_size(); - - let response = if Self::random_error(state.config().error) { - // Error will not account to any bandwidth used. - size = 0; - Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) - } else { - Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) - }; - - let authority_discovery_id = match outgoing_request.peer { - req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, - _ => panic!("Peer recipient not supported yet"), - }; - let authority_discovery_id_clone = authority_discovery_id.clone(); - - let future = async move { - let _ = outgoing_request.pending_response.send(response); - } - .boxed(); - - let future_wrapper = async move { - // Forward the response to the ingress channel of our node. 
- // On receive side we apply our node receiving rate limit. - let action = - NetworkAction::new(authority_discovery_id_clone, future, size, None); - ingress_tx.send(action).unwrap(); - } - .boxed(); - - NetworkAction::new( - authority_discovery_id, - future_wrapper, - size, - // Generate a random latency based on configuration. - Self::random_latency(state.config().latency.as_ref()), - ) - }, - Requests::AvailableDataFetchingV1(outgoing_request) => { - let candidate_hash = outgoing_request.payload.candidate_hash; - let candidate_index = state - .candidate_hashes - .get(&candidate_hash) - .expect("candidate was generated previously; qed"); - gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); - - let available_data = - state.available_data.get(*candidate_index as usize).unwrap().clone(); - - let size = available_data.encoded_size(); - - let response = if Self::random_error(state.config().error) { - Err(RequestFailure::Network(OutboundFailure::ConnectionClosed)) - } else { - Ok(req_res::v1::AvailableDataFetchingResponse::from(Some(available_data)) - .encode()) - }; - - let future = async move { - let _ = outgoing_request.pending_response.send(response); - } - .boxed(); - - let authority_discovery_id = match outgoing_request.peer { - req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, - _ => panic!("Peer recipient not supported yet"), - }; - let authority_discovery_id_clone = authority_discovery_id.clone(); - - let future_wrapper = async move { - // Forward the response to the ingress channel of our node. - // On receive side we apply our node receiving rate limit. - let action = - NetworkAction::new(authority_discovery_id_clone, future, size, None); - ingress_tx.send(action).unwrap(); - } - .boxed(); - - NetworkAction::new( - authority_discovery_id, - future_wrapper, - size, - // Generate a random latency based on configuration. 
- Self::random_latency(state.config().latency.as_ref()), - ) - }, - _ => panic!("received an unexpected request"), - } - } - - // A task that mocks dependent subsystems based on environment configuration. - // TODO: Spawn real subsystems, user overseer builder. - async fn env_task( - mut ctx: TestSubsystemContextHandle, - mut state: TestState, - mut network: NetworkEmulator, - ingress_tx: tokio::sync::mpsc::UnboundedSender, - ) { - loop { - futures::select! { - maybe_message = ctx.maybe_recv().fuse() => { - let message = if let Some(message) = maybe_message{ - message - } else { - gum::info!("{}", "Test completed".bright_blue()); - return - }; - - gum::trace!(target: LOG_TARGET, ?message, "Env task received message"); - - match message { - AllMessages::NetworkBridgeTx( - NetworkBridgeTxMessage::SendRequests( - requests, - _if_disconnected, - ) - ) => { - for request in requests { - network.inc_sent(Self::request_size(&request)); - let action = Self::respond_to_send_request(&mut state, request, ingress_tx.clone()); - // Account for our node sending the request over the emulated network. - network.submit_peer_action(action.peer(), action); - } - }, - AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx)) => { - // TODO: Simulate av store load by delaying the response. - state.respond_none_to_available_data_query(tx).await; - }, - AllMessages::AvailabilityStore(AvailabilityStoreMessage::QueryAllChunks(candidate_hash, tx)) => { - // Test env: We always have our own chunk. 
- state.respond_to_query_all_request(candidate_hash, |index| index == state.validator_index.0 as usize, tx).await; - }, - AllMessages::AvailabilityStore( - AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) - ) => { - let candidate_index = state.candidate_hashes.get(&candidate_hash).expect("candidate was generated previously; qed"); - gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); - - let chunk_size = state.chunks.get(*candidate_index as usize).unwrap()[0].encoded_size(); - let _ = tx.send(Some(chunk_size)); - } - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - _relay_parent, - RuntimeApiRequest::SessionInfo( - _session_index, - tx, - ) - )) => { - tx.send(Ok(Some(state.session_info()))).unwrap(); - } - _ => panic!("Unexpected input") - } - } - } - } + pub fn runtime(&self) -> Handle { + self.runtime_handle.clone() } // Send a message to the subsystem under test environment. - pub async fn send_message(&mut self, msg: AvailabilityRecoveryMessage) { - gum::trace!(msg = ?msg, "sending message"); - self.to_subsystem - .send(FromOrchestra::Communication { msg }) + pub async fn send_message(&mut self, msg: Event) { + self.overseer_handle + .send(msg) .timeout(MAX_TIME_OF_FLIGHT) .await .unwrap_or_else(|| { panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) }) - .unwrap(); - } - - // Send a signal to the subsystem under test environment. 
- pub async fn send_signal(&mut self, signal: OverseerSignal) { - self.to_subsystem - .send(FromOrchestra::Signal(signal)) - .timeout(MAX_TIME_OF_FLIGHT) - .await - .unwrap_or_else(|| { - panic!( - "{}ms is more than enough for sending signals.", - MAX_TIME_OF_FLIGHT.as_millis() - ) - }) - .unwrap(); - } -} - -impl AvailabilityRecoverySubsystemInstance { - pub fn new( - registry: &Registry, - spawn_task_handle: SpawnTaskHandle, - use_fast_path: bool, - ) -> (Self, TestSubsystemContextHandle) { - let (context, virtual_overseer) = make_buffered_subsystem_context( - spawn_task_handle.clone(), - 128, - "availability-recovery-subsystem", - ); - let (collation_req_receiver, req_cfg) = - IncomingRequest::get_config_receiver(&ReqProtocolNames::new(&GENESIS_HASH, None)); - - let subsystem = if use_fast_path { - AvailabilityRecoverySubsystem::with_fast_path( - collation_req_receiver, - Metrics::try_register(®istry).unwrap(), - ) - } else { - AvailabilityRecoverySubsystem::with_chunks_only( - collation_req_receiver, - Metrics::try_register(®istry).unwrap(), - ) - }; - - let spawned_subsystem = subsystem.start(context); - let subsystem_future = async move { - spawned_subsystem.future.await.unwrap(); - }; - - spawn_task_handle.spawn_blocking( - spawned_subsystem.name, - spawned_subsystem.name, - subsystem_future, - ); - - (Self { _protocol_config: req_cfg }, virtual_overseer) + .expect("send never fails"); } } @@ -509,8 +354,7 @@ pub struct TestState { // Full test configuration config: TestConfiguration, // State starts here. - validator_public: Vec, - validator_authority_id: Vec, + test_authorities: TestAuthorities, // The test node validator index. 
validator_index: ValidatorIndex, session_index: SessionIndex, @@ -535,60 +379,6 @@ impl TestState { &self.config } - async fn respond_none_to_available_data_query( - &self, - tx: oneshot::Sender>, - ) { - let _ = tx.send(None); - } - - fn session_info(&self) -> SessionInfo { - let my_vec = (0..self.config().n_validators) - .map(|i| ValidatorIndex(i as _)) - .collect::>(); - - let validator_groups = my_vec.chunks(5).map(|x| Vec::from(x)).collect::>(); - - SessionInfo { - validators: self.validator_public.clone().into(), - discovery_keys: self.validator_authority_id.clone(), - validator_groups: IndexedVec::>::from(validator_groups), - assignment_keys: vec![], - n_cores: self.config().n_cores as u32, - zeroth_delay_tranche_width: 0, - relay_vrf_modulo_samples: 0, - n_delay_tranches: 0, - no_show_slots: 0, - needed_approvals: 0, - active_validator_indices: vec![], - dispute_period: 6, - random_seed: [0u8; 32], - } - } - async fn respond_to_query_all_request( - &self, - candidate_hash: CandidateHash, - send_chunk: impl Fn(usize) -> bool, - tx: oneshot::Sender>, - ) { - let candidate_index = self - .candidate_hashes - .get(&candidate_hash) - .expect("candidate was generated previously; qed"); - gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); - - let v = self - .chunks - .get(*candidate_index as usize) - .unwrap() - .iter() - .filter(|c| send_chunk(c.index.0 as usize)) - .cloned() - .collect(); - - let _ = tx.send(v); - } - pub fn next_candidate(&mut self) -> Option { let candidate = self.candidates.next(); let candidate_hash = candidate.as_ref().unwrap().hash(); @@ -596,6 +386,10 @@ impl TestState { candidate } + pub fn authorities(&self) -> &TestAuthorities { + &self.test_authorities + } + /// Generate candidates to be used in the test. 
pub fn generate_candidates(&mut self, count: usize) { gum::info!(target: LOG_TARGET,"{}", format!("Pre-generating {} candidates.", count).bright_blue()); @@ -624,22 +418,9 @@ impl TestState { .cycle(); } - pub fn new(config: TestConfiguration) -> Self { - let keyrings = (0..config.n_validators) - .map(|peer_index| Keyring::new(format!("Node{}", peer_index).into())) - .collect::>(); - - // Generate `AuthorityDiscoveryId`` for each peer - let validator_public: Vec = keyrings - .iter() - .map(|keyring: &Keyring| keyring.clone().public().into()) - .collect::>(); - - let validator_authority_id: Vec = keyrings - .iter() - .map(|keyring| keyring.clone().public().into()) - .collect::>() - .into(); + pub fn new(config: &TestConfiguration) -> Self { + let config = config.clone(); + let test_authorities = config.generate_authorities(); let validator_index = ValidatorIndex(0); let mut chunks = Vec::new(); @@ -687,8 +468,7 @@ impl TestState { Self { config, - validator_public, - validator_authority_id, + test_authorities, validator_index, session_index, persisted_validation_data, @@ -734,11 +514,7 @@ fn derive_erasure_chunks_with_proofs_and_root( pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestState) { let config = env.config().clone(); - env.send_signal(OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::start_work(new_leaf( - Hash::repeat_byte(1), - 1, - )))) - .await; + env.send_message(new_block_import_event(Hash::repeat_byte(1), 1)).await; let start_marker = Instant::now(); let mut batch = FuturesUnordered::new(); @@ -758,15 +534,20 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat let (tx, rx) = oneshot::channel(); batch.push(rx); - env.send_message(AvailabilityRecoveryMessage::RecoverAvailableData( - candidate.clone(), - 1, - Some(GroupIndex( - candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, - )), - tx, - )) - .await; + let message = Event::MsgToSubsystem { + msg: 
AllMessages::AvailabilityRecovery( + AvailabilityRecoveryMessage::RecoverAvailableData( + candidate.clone(), + 1, + Some(GroupIndex( + candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, + )), + tx, + ), + ), + origin: LOG_TARGET, + }; + env.send_message(message).await; } gum::info!("{}", format!("{} recoveries pending", batch.len()).bright_black()); @@ -786,8 +567,8 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat tokio::time::sleep(block_time_delta).await; } - env.send_signal(OverseerSignal::Conclude).await; - let duration = start_marker.elapsed().as_millis(); + env.send_message(Event::Stop).await; + let duration: u128 = start_marker.elapsed().as_millis(); let availability_bytes = availability_bytes / 1024; gum::info!("All blocks processed in {}", format!("{:?}ms", duration).cyan()); gum::info!( diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs index 4526505c3a64d..f8fdcf2973ebe 100644 --- a/polkadot/node/subsystem-bench/src/core/configuration.rs +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -16,7 +16,10 @@ use std::path::Path; use super::*; +use keyring::Keyring; + pub use crate::cli::TestObjective; +use polkadot_primitives::ValidatorId; use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; use serde::{Deserialize, Serialize}; @@ -98,6 +101,14 @@ impl TestSequence { } } +/// Helper struct for authority related state. +#[derive(Clone)] +pub struct TestAuthorities { + pub keyrings: Vec, + pub validator_public: Vec, + pub validator_authority_id: Vec, +} + impl TestConfiguration { pub fn write_to_disk(&self) { // Serialize a slice of configurations @@ -109,6 +120,28 @@ impl TestConfiguration { pub fn pov_sizes(&self) -> &[usize] { &self.pov_sizes } + + /// Generates the authority keys we need for the network emulation. 
+ pub fn generate_authorities(&self) -> TestAuthorities { + let keyrings = (0..self.n_validators) + .map(|peer_index| Keyring::new(format!("Node{}", peer_index).into())) + .collect::>(); + + // Generate `AuthorityDiscoveryId`` for each peer + let validator_public: Vec = keyrings + .iter() + .map(|keyring: &Keyring| keyring.clone().public().into()) + .collect::>(); + + let validator_authority_id: Vec = keyrings + .iter() + .map(|keyring| keyring.clone().public().into()) + .collect::>() + .into(); + + TestAuthorities { keyrings, validator_public, validator_authority_id } + } + /// An unconstrained standard configuration matching Polkadot/Kusama pub fn ideal_network( objective: TestObjective, diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index e6b09a1c13e63..c9cc6ae404107 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -17,6 +17,7 @@ use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; +use sc_service::TaskManager; const MIB: f64 = 1024.0 * 1024.0; @@ -97,3 +98,30 @@ impl TestEnvironmentMetrics { self.pov_size.observe(pov_size as f64); } } + +fn new_runtime() -> tokio::runtime::Runtime { + tokio::runtime::Builder::new_multi_thread() + .thread_name("subsystem-bench") + .enable_all() + .thread_stack_size(3 * 1024 * 1024) + .build() + .unwrap() +} + +/// Wrapper for dependencies +pub struct TestEnvironmentDependencies { + pub registry: Registry, + pub task_manager: TaskManager, + pub runtime: tokio::runtime::Runtime, +} + +impl Default for TestEnvironmentDependencies { + fn default() -> Self { + let runtime = new_runtime(); + let registry = Registry::new(); + let task_manager: TaskManager = + TaskManager::new(runtime.handle().clone(), Some(®istry)).unwrap(); + + Self { runtime, registry, task_manager } + } +} diff --git 
a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs index e84aeba5b6b7d..7f6ff2abfe9e0 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs @@ -89,22 +89,28 @@ impl MockAvailabilityStore { #[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)] impl MockAvailabilityStore { async fn run(self, mut ctx: Context) { + gum::debug!(target: LOG_TARGET, "Subsystem running"); loop { let msg = ctx.recv().await.expect("Overseer never fails us"); match msg { orchestra::FromOrchestra::Signal(_) => {}, orchestra::FromOrchestra::Communication { msg } => match msg { - AvailabilityStoreMessage::QueryAvailableData(_candidate_hash, tx) => { + AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx) => { + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryAvailableData"); + // We never have the full available data. let _ = tx.send(None); }, AvailabilityStoreMessage::QueryAllChunks(candidate_hash, tx) => { // We always have our own chunk. 
+ gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryAllChunks"); self.respond_to_query_all_request(candidate_hash, |index| index == 0, tx) .await; }, AvailabilityStoreMessage::QueryChunkSize(candidate_hash, tx) => { + gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryChunkSize"); + let candidate_index = self .state .candidate_hashes diff --git a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs index 196cc81f1e822..998153875ede0 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs @@ -21,6 +21,8 @@ use polkadot_node_subsystem::{overseer, SpawnedSubsystem, SubsystemError}; use std::time::Duration; use tokio::time::sleep; +const LOG_TARGET: &str = "subsystem-bench::mockery"; + macro_rules! mock { // Just query by relay parent ($subsystem_name:ident) => { @@ -41,15 +43,22 @@ macro_rules! mock { let mut count_total_msg = 0; loop { futures::select!{ - _msg = ctx.recv().fuse() => { - count_total_msg +=1; - } - _ = sleep(Duration::from_secs(6)).fuse() => { - if count_total_msg > 0 { - gum::info!(target: "mock-subsystems", "Subsystem {} processed {} messages since last time", stringify!($subsystem_name), count_total_msg); + msg = ctx.recv().fuse() => { + match msg.unwrap() { + orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Communication { msg } => { + gum::debug!(target: LOG_TARGET, msg = ?msg, "mocked subsystem received message"); + } + } + + count_total_msg +=1; + } + _ = sleep(Duration::from_secs(6)).fuse() => { + if count_total_msg > 0 { + gum::trace!(target: LOG_TARGET, "Subsystem {} processed {} messages since last time", stringify!($subsystem_name), count_total_msg); + } + count_total_msg = 0; } - count_total_msg = 0; - } } } } diff --git a/polkadot/node/subsystem-bench/src/core/mock/mod.rs b/polkadot/node/subsystem-bench/src/core/mock/mod.rs 
index df874de31a7c5..d59642e960586 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/mod.rs @@ -14,9 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use polkadot_node_subsystem::{ - HeadSupportsParachains, Overseer, OverseerConnector, OverseerHandle, SpawnGlue, -}; +use polkadot_node_subsystem::HeadSupportsParachains; use polkadot_node_subsystem_types::Hash; pub mod av_store; @@ -24,9 +22,9 @@ pub mod dummy; pub mod network_bridge; pub mod runtime_api; -pub(crate) use dummy::*; - -use sc_service::SpawnTaskHandle; +pub use av_store::*; +pub use network_bridge::*; +pub use runtime_api::*; pub struct AlwaysSupportsParachains {} #[async_trait::async_trait] @@ -38,7 +36,9 @@ impl HeadSupportsParachains for AlwaysSupportsParachains { // An orchestra with dummy subsystems macro_rules! dummy_builder { - ($spawn_task_handle: ident) => { + ($spawn_task_handle: ident) => {{ + use super::core::mock::dummy::*; + // Initialize a mock overseer. // All subsystem except approval_voting and approval_distribution are mock subsystems. Overseer::builder() @@ -71,15 +71,7 @@ macro_rules! 
dummy_builder { .metrics(Default::default()) .supports_parachains(AlwaysSupportsParachains {}) .spawner(SpawnGlue($spawn_task_handle)) - }; + }}; } -pub fn new_overseer_with_dummy_subsystems( - spawn_task_handle: SpawnTaskHandle, -) -> (Overseer, AlwaysSupportsParachains>, OverseerHandle) { - let overseer_connector = OverseerConnector::with_event_capacity(64000); - let dummy = dummy_builder!(spawn_task_handle); - let builder = dummy.replace_chain_api(|_| MockChainApi {}); - // let (mock_overseer, mock_overseer_handle) = - builder.build_with_connector(overseer_connector).expect("Should not fail") -} +pub(crate) use dummy_builder; diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index cd374f8c18db3..a6d07c3d4a20a 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -44,9 +44,9 @@ use crate::core::{ /// The network bridge tx mock will respond to requests as if the request is being serviced /// by a remote peer on the network pub struct NetworkAvailabilityState { - candidate_hashes: HashMap, - available_data: Vec, - chunks: Vec>, + pub candidate_hashes: HashMap, + pub available_data: Vec, + pub chunks: Vec>, } const LOG_TARGET: &str = "subsystem-bench::network-bridge-tx-mock"; @@ -234,6 +234,7 @@ impl MockNetworkBridgeTx { orchestra::FromOrchestra::Communication { msg } => match msg { NetworkBridgeTxMessage::SendRequests(requests, _if_disconnected) => { for request in requests { + gum::debug!(target: LOG_TARGET, request = ?request, "Processing request"); self.network.inc_sent(request_size(&request)); let action = self.respond_to_send_request(request, &mut ingress_tx); // Will account for our node sending the request over the emulated diff --git a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs index 
e8c1098b97f03..a106eb1309914 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs @@ -28,6 +28,8 @@ use polkadot_node_subsystem::{ use crate::core::configuration::TestConfiguration; use futures::FutureExt; +const LOG_TARGET: &str = "subsystem-bench::runtime-api-mock"; + pub struct RuntimeApiState { validator_public: Vec, validator_authority_id: Vec, @@ -89,17 +91,21 @@ impl MockRuntimeApi { match msg { orchestra::FromOrchestra::Signal(_) => {}, - orchestra::FromOrchestra::Communication { msg } => match msg { - RuntimeApiMessage::Request( - _request, - RuntimeApiRequest::SessionInfo(_session_index, sender), - ) => { - let _ = sender.send(Ok(Some(self.session_info()))); - }, - // Long term TODO: implement more as needed. - _ => { - unimplemented!("Unexpected runtime-api message") - }, + orchestra::FromOrchestra::Communication { msg } => { + gum::debug!(target: LOG_TARGET, msg=?msg, "recv message"); + + match msg { + RuntimeApiMessage::Request( + _request, + RuntimeApiRequest::SessionInfo(_session_index, sender), + ) => { + let _ = sender.send(Ok(Some(self.session_info()))); + }, + // Long term TODO: implement more as needed. + _ => { + unimplemented!("Unexpected runtime-api message") + }, + } }, } } diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index f20bb919dedba..80d961babe03c 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -186,7 +186,7 @@ pub type ActionFuture = std::pin::Pin + std pub struct NetworkAction { // The function that performs the action run: ActionFuture, - // The payload size that we simulate sending from a peer + // The payload size that we simulate sending/receiving from a peer size: usize, // Peer which should run the action. 
peer: AuthorityDiscoveryId, diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index ce9e8aa3a8be0..4460315c35c5e 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -26,7 +26,7 @@ pub(crate) mod availability; pub(crate) mod cli; pub(crate) mod core; -use availability::{NetworkEmulation, TestEnvironment, TestState}; +use availability::{prepare_test, NetworkEmulation, TestEnvironment, TestState}; use cli::TestObjective; use core::configuration::TestConfiguration; @@ -76,21 +76,8 @@ struct BenchCli { pub objective: cli::TestObjective, } -fn new_runtime() -> tokio::runtime::Runtime { - tokio::runtime::Builder::new_multi_thread() - .thread_name("subsystem-bench") - .enable_all() - .thread_stack_size(3 * 1024 * 1024) - .build() - .unwrap() -} - impl BenchCli { fn launch(self) -> eyre::Result<()> { - use prometheus::Registry; - - let runtime = new_runtime(); - let configuration = self.standard_configuration; let mut test_config = match self.objective { TestObjective::TestSequence(options) => { @@ -120,15 +107,9 @@ impl BenchCli { let candidate_count = test_config.n_cores * test_config.num_blocks; - let mut state = TestState::new(test_config); - state.generate_candidates(candidate_count); - let mut env = TestEnvironment::new( - runtime.handle().clone(), - state.clone(), - Registry::new(), - ); - - runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); + let mut state = TestState::new(&test_config); + let (mut env, _protocol_config) = prepare_test(test_config, &mut state); + env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); } return Ok(()) }, @@ -185,14 +166,11 @@ impl BenchCli { } let candidate_count = test_config.n_cores * test_config.num_blocks; - test_config.write_to_disk(); - - let mut state = TestState::new(test_config); - state.generate_candidates(candidate_count); - let mut env = 
- TestEnvironment::new(runtime.handle().clone(), state.clone(), Registry::new()); + // test_config.write_to_disk(); - runtime.block_on(availability::bench_chunk_recovery(&mut env, state)); + let mut state = TestState::new(&test_config); + let (mut env, _protocol_config) = prepare_test(test_config, &mut state); + env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); Ok(()) } @@ -202,7 +180,7 @@ fn main() -> eyre::Result<()> { color_eyre::install()?; let _ = env_logger::builder() .filter(Some("hyper"), log::LevelFilter::Info) - .filter(None, log::LevelFilter::Info) + // .filter(None, log::LevelFilter::Trace) .try_init() .unwrap(); diff --git a/polkadot/node/subsystem-test-helpers/src/mock.rs b/polkadot/node/subsystem-test-helpers/src/mock.rs index 522bc3c2cc4f4..11e77b6e8968e 100644 --- a/polkadot/node/subsystem-test-helpers/src/mock.rs +++ b/polkadot/node/subsystem-test-helpers/src/mock.rs @@ -16,7 +16,7 @@ use std::sync::Arc; -use polkadot_node_subsystem::{jaeger, ActivatedLeaf}; +use polkadot_node_subsystem::{jaeger, ActivatedLeaf, Event, BlockInfo}; use sc_client_api::UnpinHandle; use sc_keystore::LocalKeystore; use sc_utils::mpsc::tracing_unbounded; @@ -59,3 +59,13 @@ pub fn new_leaf(hash: Hash, number: BlockNumber) -> ActivatedLeaf { span: Arc::new(jaeger::Span::Disabled), } } + +/// Create a new leaf with the given hash and number. 
+pub fn new_block_import_event(hash: Hash, number: BlockNumber) -> Event { + Event::BlockImported(BlockInfo { + hash, + parent_hash: Hash::default(), + number, + unpin_handle: dummy_unpin_handle(hash), + }) +} From 7aed30f13be1c8cf6de43e49945dd419613037f2 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 19:55:18 +0200 Subject: [PATCH 22/45] make clean Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 185 +----------------- .../subsystem-bench/src/core/environment.rs | 157 ++++++++++++++- .../subsystem-bench/src/core/subsystem.rs | 16 -- .../subsystem-bench/src/subsystem-bench.rs | 13 +- 4 files changed, 171 insertions(+), 200 deletions(-) delete mode 100644 polkadot/node/subsystem-bench/src/core/subsystem.rs diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 54a3cd9613198..e1974794cb8d0 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -21,36 +21,24 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; -use tokio::runtime::{Handle, Runtime}; -use polkadot_node_subsystem::{ - BlockInfo, Event, Overseer, OverseerConnector, OverseerHandle, SpawnGlue, -}; +use crate::TestEnvironment; +use polkadot_node_subsystem::{Event, Overseer, OverseerConnector, OverseerHandle, SpawnGlue}; use sc_network::request_responses::ProtocolConfig; use colored::Colorize; -use futures::{channel::oneshot, stream::FuturesUnordered, FutureExt, SinkExt, StreamExt}; +use futures::{channel::oneshot, stream::FuturesUnordered, StreamExt}; use polkadot_node_metrics::metrics::Metrics; use polkadot_availability_recovery::AvailabilityRecoverySubsystem; +use crate::GENESIS_HASH; use parity_scale_codec::Encode; -use polkadot_node_network_protocol::request_response::{ - self as req_res, v1::ChunkResponse, IncomingRequest, ReqProtocolNames, Requests, -}; -use rand::{distributions::Uniform, 
prelude::Distribution, thread_rng}; - -use prometheus::Registry; -use sc_network::{OutboundFailure, RequestFailure}; - use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; +use polkadot_node_network_protocol::request_response::{IncomingRequest, ReqProtocolNames}; use polkadot_node_primitives::{BlockData, PoV, Proof}; -use polkadot_node_subsystem::{ - messages::{AllMessages, AvailabilityRecoveryMessage}, - ActiveLeavesUpdate, OverseerSignal, -}; -use std::net::{Ipv4Addr, SocketAddr}; +use polkadot_node_subsystem::messages::{AllMessages, AvailabilityRecoveryMessage}; use crate::core::{ configuration::TestAuthorities, @@ -62,12 +50,7 @@ use crate::core::{ }, }; -use super::core::{ - configuration::{PeerLatency, TestConfiguration}, - environment::TestEnvironmentMetrics, - mock::dummy_builder, - network::*, -}; +use super::core::{configuration::TestConfiguration, mock::dummy_builder, network::*}; const LOG_TARGET: &str = "subsystem-bench::availability"; @@ -75,69 +58,18 @@ use polkadot_node_primitives::{AvailableData, ErasureChunk}; use super::{cli::TestObjective, core::mock::AlwaysSupportsParachains}; use polkadot_node_subsystem_test_helpers::mock::new_block_import_event; -use polkadot_node_subsystem_util::TimeoutExt; use polkadot_primitives::{ CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, PersistedValidationData, - SessionIndex, ValidatorIndex, + ValidatorIndex, }; use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; -use sc_service::{SpawnTaskHandle, TaskManager}; +use sc_service::SpawnTaskHandle; mod cli; pub mod configuration; pub use cli::{DataAvailabilityReadOptions, NetworkEmulation, NetworkOptions}; pub use configuration::AvailabilityRecoveryConfiguration; -// A dummy genesis hash -const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); - -/// The test environment is the high level wrapper of all things required to test -/// a certain subsystem. 
-/// -/// ## Mockups -/// The overseer is passed in during construction and it can host an arbitrary number of -/// real subsystems instances and the corresponding mocked instances such that the real -/// subsystems can get their messages answered. -/// -/// As the subsystem's performance depends on network connectivity, the test environment -/// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation -/// is configurable in terms of peer bandwidth, latency and connection error rate using -/// uniform distribution sampling. -/// -/// -/// ## Usage -/// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem -/// under test. -/// -/// ## Collecting test metrics -/// -/// ### Prometheus -/// A prometheus endpoint is exposed while the test is running. A local Prometheus instance -/// can scrape it every 1s and a Grafana dashboard is the preferred way of visualizing -/// the performance characteristics of the subsystem. -/// -/// ### CLI -/// A subset of the Prometheus metrics are printed at the end of the test. -pub struct TestEnvironment { - // A task manager that tracks task poll durations allows us to measure - // per task CPU usage as we do in the Polkadot node. - task_manager: TaskManager, - // Our runtime - runtime: tokio::runtime::Runtime, - // A runtime handle - runtime_handle: tokio::runtime::Handle, - // The Prometheus metrics registry - registry: Registry, - // A handle to the lovely overseer - overseer_handle: OverseerHandle, - // The test intial state. The current state is owned by `env_task`. - config: TestConfiguration, - // A handle to the network emulator. 
- network: NetworkEmulator, - // Configuration/env metrics - metrics: TestEnvironmentMetrics, -} - fn build_overseer( spawn_task_handle: SpawnTaskHandle, runtime_api: MockRuntimeApi, @@ -257,107 +189,12 @@ fn prepare_test_inner( ) } -impl TestEnvironment { - // Create a new test environment with specified initial state and prometheus registry. - // We use prometheus metrics to collect per job task poll time and subsystem metrics. - pub fn new( - task_manager: TaskManager, - config: TestConfiguration, - registry: Registry, - runtime: Runtime, - network: NetworkEmulator, - overseer: Overseer, AlwaysSupportsParachains>, - overseer_handle: OverseerHandle, - ) -> Self { - let metrics = - TestEnvironmentMetrics::new(®istry).expect("Metrics need to be registered"); - - let spawn_handle = task_manager.spawn_handle(); - spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); - - let registry_clone = registry.clone(); - task_manager - .spawn_handle() - .spawn_blocking("prometheus", "test-environment", async move { - prometheus_endpoint::init_prometheus( - SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), - registry_clone, - ) - .await - .unwrap(); - }); - - TestEnvironment { - task_manager, - runtime_handle: runtime.handle().clone(), - runtime, - registry, - overseer_handle, - config, - network, - metrics, - } - } - - pub fn config(&self) -> &TestConfiguration { - &self.config - } - - pub fn network(&mut self) -> &mut NetworkEmulator { - &mut self.network - } - - pub fn registry(&self) -> &Registry { - &self.registry - } - - /// Produce a randomized duration between `min` and `max`. 
- fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option { - if let Some(peer_latency) = maybe_peer_latency { - Some( - Uniform::from(peer_latency.min_latency..=peer_latency.max_latency) - .sample(&mut thread_rng()), - ) - } else { - None - } - } - - pub fn metrics(&self) -> &TestEnvironmentMetrics { - &self.metrics - } - - pub fn runtime(&self) -> Handle { - self.runtime_handle.clone() - } - - // Send a message to the subsystem under test environment. - pub async fn send_message(&mut self, msg: Event) { - self.overseer_handle - .send(msg) - .timeout(MAX_TIME_OF_FLIGHT) - .await - .unwrap_or_else(|| { - panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) - }) - .expect("send never fails"); - } -} - -// We use this to bail out sending messages to the subsystem if it is overloaded such that -// the time of flight is breaches 5s. -// This should eventually be a test parameter. -const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); - #[derive(Clone)] pub struct TestState { // Full test configuration config: TestConfiguration, // State starts here. test_authorities: TestAuthorities, - // The test node validator index. - validator_index: ValidatorIndex, - session_index: SessionIndex, pov_sizes: Cycle>, // Generated candidate receipts to be used in the test candidates: Cycle>, @@ -422,12 +259,10 @@ impl TestState { let config = config.clone(); let test_authorities = config.generate_authorities(); - let validator_index = ValidatorIndex(0); let mut chunks = Vec::new(); let mut available_data = Vec::new(); let mut candidate_receipts = Vec::new(); let mut pov_size_to_candidate = HashMap::new(); - let session_index = 10; // we use it for all candidates. 
let persisted_validation_data = PersistedValidationData { @@ -469,8 +304,6 @@ impl TestState { Self { config, test_authorities, - validator_index, - session_index, persisted_validation_data, available_data, candidate_receipts, diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index c9cc6ae404107..4fd752675074d 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -14,10 +14,23 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +use crate::{ + core::{configuration::PeerLatency, mock::AlwaysSupportsParachains, network::NetworkEmulator}, + TestConfiguration, +}; +use core::time::Duration; +use polkadot_node_subsystem::{Event, Overseer, OverseerHandle, SpawnGlue, TimeoutExt}; +use polkadot_node_subsystem_types::Hash; use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; -use sc_service::TaskManager; +use rand::{ + distributions::{Distribution, Uniform}, + thread_rng, +}; +use sc_service::{SpawnTaskHandle, TaskManager}; +use std::net::{Ipv4Addr, SocketAddr}; +use tokio::runtime::{Handle, Runtime}; const MIB: f64 = 1024.0 * 1024.0; @@ -125,3 +138,145 @@ impl Default for TestEnvironmentDependencies { Self { runtime, registry, task_manager } } } + +// A dummy genesis hash +pub const GENESIS_HASH: Hash = Hash::repeat_byte(0xff); + +// We use this to bail out sending messages to the subsystem if it is overloaded such that +// the time of flight is breaches 5s. +// This should eventually be a test parameter. +const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); + +/// The test environment is the high level wrapper of all things required to test +/// a certain subsystem. 
+/// +/// ## Mockups +/// The overseer is passed in during construction and it can host an arbitrary number of +/// real subsystems instances and the corresponding mocked instances such that the real +/// subsystems can get their messages answered. +/// +/// As the subsystem's performance depends on network connectivity, the test environment +/// emulates validator nodes on the network, see `NetworkEmulator`. The network emulation +/// is configurable in terms of peer bandwidth, latency and connection error rate using +/// uniform distribution sampling. +/// +/// +/// ## Usage +/// `TestEnvironment` is used in tests to send `Overseer` messages or signals to the subsystem +/// under test. +/// +/// ## Collecting test metrics +/// +/// ### Prometheus +/// A prometheus endpoint is exposed while the test is running. A local Prometheus instance +/// can scrape it every 1s and a Grafana dashboard is the preferred way of visualizing +/// the performance characteristics of the subsystem. +/// +/// ### CLI +/// A subset of the Prometheus metrics are printed at the end of the test. +pub struct TestEnvironment { + // A task manager that tracks task poll durations allows us to measure + // per task CPU usage as we do in the Polkadot node. + task_manager: TaskManager, + // Our runtime + runtime: tokio::runtime::Runtime, + // A runtime handle + runtime_handle: tokio::runtime::Handle, + // The Prometheus metrics registry + registry: Registry, + // A handle to the lovely overseer + overseer_handle: OverseerHandle, + // The test intial state. The current state is owned by `env_task`. + config: TestConfiguration, + // A handle to the network emulator. + network: NetworkEmulator, + // Configuration/env metrics + metrics: TestEnvironmentMetrics, +} + +impl TestEnvironment { + // Create a new test environment with specified initial state and prometheus registry. + // We use prometheus metrics to collect per job task poll time and subsystem metrics. 
+ pub fn new( + task_manager: TaskManager, + config: TestConfiguration, + registry: Registry, + runtime: Runtime, + network: NetworkEmulator, + overseer: Overseer<SpawnGlue<SpawnTaskHandle>, AlwaysSupportsParachains>, + overseer_handle: OverseerHandle, + ) -> Self { + let metrics = + TestEnvironmentMetrics::new(&registry).expect("Metrics need to be registered"); + + let spawn_handle = task_manager.spawn_handle(); + spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); + + let registry_clone = registry.clone(); + task_manager + .spawn_handle() + .spawn_blocking("prometheus", "test-environment", async move { + prometheus_endpoint::init_prometheus( + SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), + registry_clone, + ) + .await + .unwrap(); + }); + + TestEnvironment { + task_manager, + runtime_handle: runtime.handle().clone(), + runtime, + registry, + overseer_handle, + config, + network, + metrics, + } + } + + pub fn config(&self) -> &TestConfiguration { + &self.config + } + + pub fn network(&mut self) -> &mut NetworkEmulator { + &mut self.network + } + + pub fn registry(&self) -> &Registry { + &self.registry + } + + /// Produce a randomized duration between `min` and `max`. + fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option<Duration> { + if let Some(peer_latency) = maybe_peer_latency { + Some( + Uniform::from(peer_latency.min_latency..=peer_latency.max_latency) + .sample(&mut thread_rng()), + ) + } else { + None + } + } + + pub fn metrics(&self) -> &TestEnvironmentMetrics { + &self.metrics + } + + pub fn runtime(&self) -> Handle { + self.runtime_handle.clone() + } + + // Send a message to the subsystem under test environment. 
+ pub async fn send_message(&mut self, msg: Event) { + self.overseer_handle + .send(msg) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) + }) + .expect("send never fails"); + } +} diff --git a/polkadot/node/subsystem-bench/src/core/subsystem.rs b/polkadot/node/subsystem-bench/src/core/subsystem.rs deleted file mode 100644 index c61e641d255d8..0000000000000 --- a/polkadot/node/subsystem-bench/src/core/subsystem.rs +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . 
- diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 4460315c35c5e..51ce8fc1d5eaa 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -26,10 +26,13 @@ pub(crate) mod availability; pub(crate) mod cli; pub(crate) mod core; -use availability::{prepare_test, NetworkEmulation, TestEnvironment, TestState}; +use availability::{prepare_test, NetworkEmulation, TestState}; use cli::TestObjective; -use core::configuration::TestConfiguration; +use core::{ + configuration::TestConfiguration, + environment::{TestEnvironment, GENESIS_HASH}, +}; use clap_num::number_range; // const LOG_TARGET: &str = "subsystem-bench"; @@ -105,8 +108,6 @@ impl BenchCli { format!("latency = {:?}", test_config.latency).bright_black(), ); - let candidate_count = test_config.n_cores * test_config.num_blocks; - let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); @@ -165,11 +166,9 @@ impl BenchCli { test_config.bandwidth = bandwidth * 1024; } - let candidate_count = test_config.n_cores * test_config.num_blocks; - // test_config.write_to_disk(); - let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); + // test_config.write_to_disk(); env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); Ok(()) From b51485bfe995c8891107b8c8618dc4b972c1d0c0 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 20:00:10 +0200 Subject: [PATCH 23/45] more cleaning Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 13 +----- .../subsystem-bench/src/core/environment.rs | 41 ++++++++----------- 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs 
b/polkadot/node/subsystem-bench/src/availability/mod.rs index e1974794cb8d0..02ec794dc7458 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -175,18 +175,7 @@ fn prepare_test_inner( subsystem, ); - ( - TestEnvironment::new( - dependencies.task_manager, - config, - dependencies.registry, - dependencies.runtime, - network, - overseer, - overseer_handle, - ), - req_cfg, - ) + (TestEnvironment::new(dependencies, config, network, overseer, overseer_handle), req_cfg) } #[derive(Clone)] diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index 4fd752675074d..d213d24c9af70 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -30,7 +30,7 @@ use rand::{ }; use sc_service::{SpawnTaskHandle, TaskManager}; use std::net::{Ipv4Addr, SocketAddr}; -use tokio::runtime::{Handle, Runtime}; +use tokio::runtime::Handle; const MIB: f64 = 1024.0 * 1024.0; @@ -175,15 +175,10 @@ const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); /// ### CLI /// A subset of the Prometheus metrics are printed at the end of the test. pub struct TestEnvironment { - // A task manager that tracks task poll durations allows us to measure - // per task CPU usage as we do in the Polkadot node. - task_manager: TaskManager, - // Our runtime - runtime: tokio::runtime::Runtime, + // Test dependencies + dependencies: TestEnvironmentDependencies, // A runtime handle runtime_handle: tokio::runtime::Handle, - // The Prometheus metrics registry - registry: Registry, // A handle to the lovely overseer overseer_handle: OverseerHandle, // The test intial state. The current state is owned by `env_task`. @@ -198,37 +193,35 @@ impl TestEnvironment { // Create a new test environment with specified initial state and prometheus registry. 
// We use prometheus metrics to collect per job task poll time and subsystem metrics. pub fn new( - task_manager: TaskManager, + dependencies: TestEnvironmentDependencies, config: TestConfiguration, - registry: Registry, - runtime: Runtime, network: NetworkEmulator, overseer: Overseer<SpawnGlue<SpawnTaskHandle>, AlwaysSupportsParachains>, overseer_handle: OverseerHandle, ) -> Self { - let metrics = - TestEnvironmentMetrics::new(&registry).expect("Metrics need to be registered"); + let metrics = TestEnvironmentMetrics::new(&dependencies.registry) + .expect("Metrics need to be registered"); - let spawn_handle = task_manager.spawn_handle(); + let spawn_handle = dependencies.task_manager.spawn_handle(); spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); - let registry_clone = registry.clone(); - task_manager - .spawn_handle() - .spawn_blocking("prometheus", "test-environment", async move { + let registry_clone = dependencies.registry.clone(); + dependencies.task_manager.spawn_handle().spawn_blocking( + "prometheus", + "test-environment", + async move { prometheus_endpoint::init_prometheus( SocketAddr::new(std::net::IpAddr::V4(Ipv4Addr::LOCALHOST), 9999), registry_clone, ) .await .unwrap(); - }); + }, + ); TestEnvironment { - task_manager, - runtime_handle: runtime.handle().clone(), - runtime, - registry, + runtime_handle: dependencies.runtime.handle().clone(), + dependencies, overseer_handle, config, network, @@ -245,7 +238,7 @@ impl TestEnvironment { } pub fn registry(&self) -> &Registry { - &self.registry + &self.dependencies.registry } /// Produce a randomized duration between `min` and `max`. 
From 7e464447d7db427223089a2d01aa38048f7c8927 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 20:11:06 +0200 Subject: [PATCH 24/45] more cleaning Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 5 +- .../subsystem-bench/src/core/configuration.rs | 1 + .../node/subsystem-bench/src/core/display.rs | 109 ------------------ .../subsystem-bench/src/core/environment.rs | 20 +--- .../node/subsystem-bench/src/core/network.rs | 1 + 5 files changed, 5 insertions(+), 131 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 02ec794dc7458..5546d9cc357f3 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -97,6 +97,7 @@ pub fn prepare_test( } /// Takes a test configuration and uses it to creates the `TestEnvironment`. +#[allow(unused)] pub fn prepare_test_with_dependencies( config: TestConfiguration, state: &mut TestState, @@ -212,10 +213,6 @@ impl TestState { candidate } - pub fn authorities(&self) -> &TestAuthorities { - &self.test_authorities - } - /// Generate candidates to be used in the test. 
pub fn generate_candidates(&mut self, count: usize) { gum::info!(target: LOG_TARGET,"{}", format!("Pre-generating {} candidates.", count).bright_blue()); diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs index f8fdcf2973ebe..35fa51790c911 100644 --- a/polkadot/node/subsystem-bench/src/core/configuration.rs +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -110,6 +110,7 @@ pub struct TestAuthorities { } impl TestConfiguration { + #[allow(unused)] pub fn write_to_disk(&self) { // Serialize a slice of configurations let yaml = serde_yaml::to_string(&TestSequence { test_configurations: vec![self.clone()] }) diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 4b63f45c5f8aa..921c22b2059e7 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -103,115 +103,6 @@ impl Display for TestMetric { } } -// fn encode_impl( -// &self, -// metric_families: &[MetricFamily], -// writer: &mut dyn WriteUtf8, -// ) -> Result<()> { for mf in metric_families { // Fail-fast checks. check_metric_family(mf)?; - -// // Write `# HELP` header. -// let name = mf.get_name(); -// let help = mf.get_help(); -// if !help.is_empty() { -// writer.write_all("# HELP ")?; -// writer.write_all(name)?; -// writer.write_all(" ")?; -// writer.write_all(&escape_string(help, false))?; -// writer.write_all("\n")?; -// } - -// // Write `# TYPE` header. 
-// let metric_type = mf.get_field_type(); -// let lowercase_type = format!("{:?}", metric_type).to_lowercase(); -// writer.write_all("# TYPE ")?; -// writer.write_all(name)?; -// writer.write_all(" ")?; -// writer.write_all(&lowercase_type)?; -// writer.write_all("\n")?; - -// for m in mf.get_metric() { -// match metric_type { -// MetricType::COUNTER => { -// write_sample(writer, name, None, m, None, m.get_counter().get_value())?; -// } -// MetricType::GAUGE => { -// write_sample(writer, name, None, m, None, m.get_gauge().get_value())?; -// } -// MetricType::HISTOGRAM => { -// let h = m.get_histogram(); - -// let mut inf_seen = false; -// for b in h.get_bucket() { -// let upper_bound = b.get_upper_bound(); -// write_sample( -// writer, -// name, -// Some("_bucket"), -// m, -// Some((BUCKET_LABEL, &upper_bound.to_string())), -// b.get_cumulative_count() as f64, -// )?; -// if upper_bound.is_sign_positive() && upper_bound.is_infinite() { -// inf_seen = true; -// } -// } -// if !inf_seen { -// write_sample( -// writer, -// name, -// Some("_bucket"), -// m, -// Some((BUCKET_LABEL, POSITIVE_INF)), -// h.get_sample_count() as f64, -// )?; -// } - -// write_sample(writer, name, Some("_sum"), m, None, h.get_sample_sum())?; - -// write_sample( -// writer, -// name, -// Some("_count"), -// m, -// None, -// h.get_sample_count() as f64, -// )?; -// } -// MetricType::SUMMARY => { -// let s = m.get_summary(); - -// for q in s.get_quantile() { -// write_sample( -// writer, -// name, -// None, -// m, -// Some((QUANTILE, &q.get_quantile().to_string())), -// q.get_value(), -// )?; -// } - -// write_sample(writer, name, Some("_sum"), m, None, s.get_sample_sum())?; - -// write_sample( -// writer, -// name, -// Some("_count"), -// m, -// None, -// s.get_sample_count() as f64, -// )?; -// } -// MetricType::UNTYPED => { -// unimplemented!(); -// } -// } -// } -// } - -// Ok(()) -// } - // Returns `false` if metric should be skipped. 
fn check_metric_family(mf: &MetricFamily) -> bool { if mf.get_metric().is_empty() { diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index d213d24c9af70..28e98c6b42d06 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -15,7 +15,7 @@ // along with Polkadot. If not, see . use crate::{ - core::{configuration::PeerLatency, mock::AlwaysSupportsParachains, network::NetworkEmulator}, + core::{mock::AlwaysSupportsParachains, network::NetworkEmulator}, TestConfiguration, }; use core::time::Duration; @@ -24,10 +24,6 @@ use polkadot_node_subsystem_types::Hash; use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; -use rand::{ - distributions::{Distribution, Uniform}, - thread_rng, -}; use sc_service::{SpawnTaskHandle, TaskManager}; use std::net::{Ipv4Addr, SocketAddr}; use tokio::runtime::Handle; @@ -181,7 +177,7 @@ pub struct TestEnvironment { runtime_handle: tokio::runtime::Handle, // A handle to the lovely overseer overseer_handle: OverseerHandle, - // The test intial state. The current state is owned by `env_task`. + // The test configuration. config: TestConfiguration, // A handle to the network emulator. network: NetworkEmulator, @@ -241,18 +237,6 @@ impl TestEnvironment { &self.dependencies.registry } - /// Produce a randomized duration between `min` and `max`. 
- fn random_latency(maybe_peer_latency: Option<&PeerLatency>) -> Option<Duration> { - if let Some(peer_latency) = maybe_peer_latency { - Some( - Uniform::from(peer_latency.min_latency..=peer_latency.max_latency) - .sample(&mut thread_rng()), - ) - } else { - None - } - } - pub fn metrics(&self) -> &TestEnvironmentMetrics { &self.metrics } diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 80d961babe03c..f5532087e35cd 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -334,6 +334,7 @@ impl NetworkEmulator { } // Increment bytes received by our node (the node that contains the subsystem under test) + #[allow(unused)] pub fn inc_received(&self, bytes: u64) { // Our node always is peer 0. self.metrics.on_peer_received(0, bytes); From d3df9279adbe6bfb78856e16ff4502d09245c25f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 22:37:48 +0200 Subject: [PATCH 25/45] proper overseer control Signed-off-by: Andrei Sandu --- Cargo.lock | 1 + polkadot/node/subsystem-bench/Cargo.toml | 1 + .../subsystem-bench/src/availability/mod.rs | 56 ++++++++----------- .../node/subsystem-bench/src/core/display.rs | 1 + .../subsystem-bench/src/core/environment.rs | 31 ++++++++-- .../subsystem-bench/src/core/mock/av_store.rs | 9 ++- .../subsystem-bench/src/core/mock/dummy.rs | 10 +++- .../src/core/mock/network_bridge.rs | 8 ++- .../src/core/mock/runtime_api.rs | 29 +++++----- .../subsystem-bench/src/subsystem-bench.rs | 2 + .../node/subsystem-test-helpers/src/mock.rs | 8 +-- 11 files changed, 91 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b349886761ad1..197807b7fa8d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13445,6 +13445,7 @@ dependencies = [ "polkadot-node-subsystem-test-helpers", "polkadot-node-subsystem-types", "polkadot-node-subsystem-util", + "polkadot-overseer", "polkadot-primitives", 
"polkadot-primitives-test-helpers", "prometheus", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index 8296874c0dab5..f775a1ff9efee 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -24,6 +24,7 @@ polkadot-primitives = { path = "../../primitives" } polkadot-node-network-protocol = { path = "../network/protocol" } polkadot-availability-recovery = { path = "../network/availability-recovery", features=["subsystem-benchmarks"]} color-eyre = { version = "0.6.1", default-features = false } +polkadot-overseer = { path = "../overseer" } colored = "2.0.4" assert_matches = "1.5" async-trait = "0.1.57" diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 5546d9cc357f3..6282a6b63f01b 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -23,7 +23,9 @@ use std::{ }; use crate::TestEnvironment; -use polkadot_node_subsystem::{Event, Overseer, OverseerConnector, OverseerHandle, SpawnGlue}; +use polkadot_node_subsystem::{Overseer, OverseerConnector, SpawnGlue}; +use polkadot_overseer::Handle as OverseerHandle; + use sc_network::request_responses::ProtocolConfig; use colored::Colorize; @@ -41,7 +43,6 @@ use polkadot_node_primitives::{BlockData, PoV, Proof}; use polkadot_node_subsystem::messages::{AllMessages, AvailabilityRecoveryMessage}; use crate::core::{ - configuration::TestAuthorities, environment::TestEnvironmentDependencies, mock::{ av_store, @@ -57,7 +58,7 @@ const LOG_TARGET: &str = "subsystem-bench::availability"; use polkadot_node_primitives::{AvailableData, ErasureChunk}; use super::{cli::TestObjective, core::mock::AlwaysSupportsParachains}; -use polkadot_node_subsystem_test_helpers::mock::new_block_import_event; +use polkadot_node_subsystem_test_helpers::mock::new_block_import_info; use polkadot_primitives::{ 
CandidateHash, CandidateReceipt, GroupIndex, Hash, HeadData, PersistedValidationData, ValidatorIndex, @@ -85,7 +86,10 @@ fn build_overseer( .replace_network_bridge_tx(|_| network_bridge) .replace_availability_recovery(|_| availability_recovery); - builder.build_with_connector(overseer_connector).expect("Should not fail") + let (overseer, raw_handle) = + builder.build_with_connector(overseer_connector).expect("Should not fail"); + + (overseer, OverseerHandle::new(raw_handle)) } /// Takes a test configuration and uses it to creates the `TestEnvironment`. @@ -119,11 +123,7 @@ fn prepare_test_inner( // Generate test authorities. let test_authorities = config.generate_authorities(); - let runtime_api = runtime_api::MockRuntimeApi::new( - config.clone(), - test_authorities.validator_public.clone(), - test_authorities.validator_authority_id.clone(), - ); + let runtime_api = runtime_api::MockRuntimeApi::new(config.clone(), test_authorities.clone()); let av_store = av_store::MockAvailabilityStore::new(state.chunks.clone(), state.candidate_hashes.clone()); @@ -136,7 +136,7 @@ fn prepare_test_inner( let network = NetworkEmulator::new( config.n_validators.clone(), - test_authorities.validator_authority_id.clone(), + test_authorities.validator_authority_id, config.peer_bandwidth, dependencies.task_manager.spawn_handle(), &dependencies.registry, @@ -183,18 +183,14 @@ fn prepare_test_inner( pub struct TestState { // Full test configuration config: TestConfiguration, - // State starts here. - test_authorities: TestAuthorities, pov_sizes: Cycle>, // Generated candidate receipts to be used in the test candidates: Cycle>, - candidates_generated: usize, // Map from pov size to candidate index pov_size_to_candidate: HashMap, // Map from generated candidate hashes to candidate index in `available_data` // and `chunks`. 
candidate_hashes: HashMap, - persisted_validation_data: PersistedValidationData, candidate_receipts: Vec, available_data: Vec, @@ -243,7 +239,6 @@ impl TestState { pub fn new(config: &TestConfiguration) -> Self { let config = config.clone(); - let test_authorities = config.generate_authorities(); let mut chunks = Vec::new(); let mut available_data = Vec::new(); @@ -289,14 +284,11 @@ impl TestState { Self { config, - test_authorities, - persisted_validation_data, available_data, candidate_receipts, chunks, pov_size_to_candidate, pov_sizes, - candidates_generated: 0, candidate_hashes: HashMap::new(), candidates: Vec::new().into_iter().cycle(), } @@ -333,7 +325,7 @@ fn derive_erasure_chunks_with_proofs_and_root( pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestState) { let config = env.config().clone(); - env.send_message(new_block_import_event(Hash::repeat_byte(1), 1)).await; + env.import_block(new_block_import_info(Hash::repeat_byte(1), 1)).await; let start_marker = Instant::now(); let mut batch = FuturesUnordered::new(); @@ -353,19 +345,16 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat let (tx, rx) = oneshot::channel(); batch.push(rx); - let message = Event::MsgToSubsystem { - msg: AllMessages::AvailabilityRecovery( - AvailabilityRecoveryMessage::RecoverAvailableData( - candidate.clone(), - 1, - Some(GroupIndex( - candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, - )), - tx, - ), + let message = AllMessages::AvailabilityRecovery( + AvailabilityRecoveryMessage::RecoverAvailableData( + candidate.clone(), + 1, + Some(GroupIndex( + candidate_num as u32 % (std::cmp::max(5, config.n_cores) / 5) as u32, + )), + tx, ), - origin: LOG_TARGET, - }; + ); env.send_message(message).await; } @@ -386,7 +375,8 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat tokio::time::sleep(block_time_delta).await; } - env.send_message(Event::Stop).await; + env.stop().await; + 
let duration: u128 = start_marker.elapsed().as_millis(); let availability_bytes = availability_bytes / 1024; gum::info!("All blocks processed in {}", format!("{:?}ms", duration).cyan()); @@ -416,7 +406,7 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat let test_metrics = super::core::display::parse_metrics(&env.registry()); let subsystem_cpu_metrics = - test_metrics.subset_with_label_value("task_group", "availability-recovery-subsystem"); + test_metrics.subset_with_label_value("task_group", "availability-recovery"); let total_cpu = subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); gum::info!(target: LOG_TARGET, "Total subsystem CPU usage {}", format!("{:.2}s", total_cpu).bright_purple()); gum::info!(target: LOG_TARGET, "CPU usage per block {}", format!("{:.2}s", total_cpu/env.config().num_blocks as f64).bright_purple()); diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 921c22b2059e7..13ea7d375e95b 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -13,6 +13,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +// //! Some helper methods for parsing prometheus metrics to a format that can be //! displayed in the CLI. //! 
diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index 28e98c6b42d06..fd09de9169a42 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -19,11 +19,15 @@ use crate::{ TestConfiguration, }; use core::time::Duration; -use polkadot_node_subsystem::{Event, Overseer, OverseerHandle, SpawnGlue, TimeoutExt}; +use polkadot_overseer::{BlockInfo, Handle as OverseerHandle}; + +use polkadot_node_subsystem::{messages::AllMessages, Overseer, SpawnGlue, TimeoutExt}; use polkadot_node_subsystem_types::Hash; use polkadot_node_subsystem_util::metrics::prometheus::{ self, Gauge, Histogram, PrometheusError, Registry, U64, }; + +use sc_network::peer_store::LOG_TARGET; use sc_service::{SpawnTaskHandle, TaskManager}; use std::net::{Ipv4Addr, SocketAddr}; use tokio::runtime::Handle; @@ -199,8 +203,8 @@ impl TestEnvironment { .expect("Metrics need to be registered"); let spawn_handle = dependencies.task_manager.spawn_handle(); - spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); + spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); let registry_clone = dependencies.registry.clone(); dependencies.task_manager.spawn_handle().spawn_blocking( "prometheus", @@ -246,14 +250,29 @@ impl TestEnvironment { } // Send a message to the subsystem under test environment. - pub async fn send_message(&mut self, msg: Event) { + pub async fn send_message(&mut self, msg: AllMessages) { self.overseer_handle - .send(msg) + .send_msg(msg, LOG_TARGET) .timeout(MAX_TIME_OF_FLIGHT) .await .unwrap_or_else(|| { panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) - }) - .expect("send never fails"); + }); + } + + // Send a signal to the subsystem under test environment. 
+ pub async fn import_block(&mut self, block: BlockInfo) { + self.overseer_handle + .block_imported(block) + .timeout(MAX_TIME_OF_FLIGHT) + .await + .unwrap_or_else(|| { + panic!("{}ms maximum time of flight breached", MAX_TIME_OF_FLIGHT.as_millis()) + }); + } + + // Stop overseer and subsystems. + pub async fn stop(&mut self) { + self.overseer_handle.stop().await; } } diff --git a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs index 7f6ff2abfe9e0..1ff7d1728af98 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs @@ -29,6 +29,8 @@ use polkadot_node_subsystem::{ messages::AvailabilityStoreMessage, overseer, SpawnedSubsystem, SubsystemError, }; +use polkadot_node_subsystem_types::OverseerSignal; + pub struct AvailabilityStoreState { candidate_hashes: HashMap, chunks: Vec>, @@ -82,7 +84,7 @@ impl MockAvailabilityStore { fn start(self, ctx: Context) -> SpawnedSubsystem { let future = self.run(ctx).map(|_| Ok(())).boxed(); - SpawnedSubsystem { name: "av-store-mock-subsystem", future } + SpawnedSubsystem { name: "test-environment", future } } } @@ -94,7 +96,10 @@ impl MockAvailabilityStore { let msg = ctx.recv().await.expect("Overseer never fails us"); match msg { - orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Signal(signal) => match signal { + OverseerSignal::Conclude => return, + _ => {}, + }, orchestra::FromOrchestra::Communication { msg } => match msg { AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx) => { gum::debug!(target: LOG_TARGET, candidate_hash = ?candidate_hash, "Responding to QueryAvailableData"); diff --git a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs index 998153875ede0..0628368a49c08 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/dummy.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/dummy.rs 
@@ -33,7 +33,8 @@ macro_rules! mock { fn start(self, ctx: Context) -> SpawnedSubsystem { let future = self.run(ctx).map(|_| Ok(())).boxed(); - SpawnedSubsystem { name: stringify!($subsystem_name), future } + // The name will appear in substrate CPU task metrics as `task_group`.` + SpawnedSubsystem { name: "test-environment", future } } } @@ -45,7 +46,12 @@ macro_rules! mock { futures::select!{ msg = ctx.recv().fuse() => { match msg.unwrap() { - orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Signal(signal) => { + match signal { + polkadot_node_subsystem_types::OverseerSignal::Conclude => {return}, + _ => {} + } + }, orchestra::FromOrchestra::Communication { msg } => { gum::debug!(target: LOG_TARGET, msg = ?msg, "mocked subsystem received message"); } diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index a6d07c3d4a20a..144a16b9f14b5 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -17,6 +17,7 @@ //! A generic av store subsystem mockup suitable to be used in benchmarks. 
use parity_scale_codec::Encode; +use polkadot_node_subsystem_types::OverseerSignal; use std::collections::HashMap; @@ -191,7 +192,7 @@ impl MockNetworkBridgeTx { fn start(self, ctx: Context) -> SpawnedSubsystem { let future = self.run(ctx).map(|_| Ok(())).boxed(); - SpawnedSubsystem { name: "network-bridge-tx-mock-subsystem", future } + SpawnedSubsystem { name: "test-environment", future } } } @@ -230,7 +231,10 @@ impl MockNetworkBridgeTx { let msg = ctx.recv().await.expect("Overseer never fails us"); match msg { - orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Signal(signal) => match signal { + OverseerSignal::Conclude => return, + _ => {}, + }, orchestra::FromOrchestra::Communication { msg } => match msg { NetworkBridgeTxMessage::SendRequests(requests, _if_disconnected) => { for request in requests { diff --git a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs index a106eb1309914..9cbe025ae8060 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/runtime_api.rs @@ -16,23 +16,21 @@ //! //! A generic runtime api subsystem mockup suitable to be used in benchmarks. 
-use polkadot_primitives::{ - AuthorityDiscoveryId, GroupIndex, IndexedVec, SessionInfo, ValidatorId, ValidatorIndex, -}; +use polkadot_primitives::{GroupIndex, IndexedVec, SessionInfo, ValidatorIndex}; use polkadot_node_subsystem::{ messages::{RuntimeApiMessage, RuntimeApiRequest}, overseer, SpawnedSubsystem, SubsystemError, }; +use polkadot_node_subsystem_types::OverseerSignal; -use crate::core::configuration::TestConfiguration; +use crate::core::configuration::{TestAuthorities, TestConfiguration}; use futures::FutureExt; const LOG_TARGET: &str = "subsystem-bench::runtime-api-mock"; pub struct RuntimeApiState { - validator_public: Vec, - validator_authority_id: Vec, + authorities: TestAuthorities, } pub struct MockRuntimeApi { @@ -41,12 +39,8 @@ pub struct MockRuntimeApi { } impl MockRuntimeApi { - pub fn new( - config: TestConfiguration, - validator_public: Vec, - validator_authority_id: Vec, - ) -> MockRuntimeApi { - Self { state: RuntimeApiState { validator_public, validator_authority_id }, config } + pub fn new(config: TestConfiguration, authorities: TestAuthorities) -> MockRuntimeApi { + Self { state: RuntimeApiState { authorities }, config } } fn session_info(&self) -> SessionInfo { @@ -57,8 +51,8 @@ impl MockRuntimeApi { let validator_groups = all_validators.chunks(5).map(|x| Vec::from(x)).collect::>(); SessionInfo { - validators: self.state.validator_public.clone().into(), - discovery_keys: self.state.validator_authority_id.clone(), + validators: self.state.authorities.validator_public.clone().into(), + discovery_keys: self.state.authorities.validator_authority_id.clone(), validator_groups: IndexedVec::>::from(validator_groups), assignment_keys: vec![], n_cores: self.config.n_cores as u32, @@ -79,7 +73,7 @@ impl MockRuntimeApi { fn start(self, ctx: Context) -> SpawnedSubsystem { let future = self.run(ctx).map(|_| Ok(())).boxed(); - SpawnedSubsystem { name: "runtime-api-mock-subsystem", future } + SpawnedSubsystem { name: "test-environment", future } } } 
@@ -90,7 +84,10 @@ impl MockRuntimeApi { let msg = ctx.recv().await.expect("Overseer never fails us"); match msg { - orchestra::FromOrchestra::Signal(_) => {}, + orchestra::FromOrchestra::Signal(signal) => match signal { + OverseerSignal::Conclude => return, + _ => {}, + }, orchestra::FromOrchestra::Communication { msg } => { gum::debug!(target: LOG_TARGET, msg=?msg, "recv message"); diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 51ce8fc1d5eaa..f9261d848778b 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -179,6 +179,8 @@ fn main() -> eyre::Result<()> { color_eyre::install()?; let _ = env_logger::builder() .filter(Some("hyper"), log::LevelFilter::Info) + // Avoid `Terminating due to subsystem exit subsystem` warnings + .filter(Some("polkadot_overseer"), log::LevelFilter::Error) // .filter(None, log::LevelFilter::Trace) .try_init() .unwrap(); diff --git a/polkadot/node/subsystem-test-helpers/src/mock.rs b/polkadot/node/subsystem-test-helpers/src/mock.rs index 11e77b6e8968e..fc2dd6a4e34e6 100644 --- a/polkadot/node/subsystem-test-helpers/src/mock.rs +++ b/polkadot/node/subsystem-test-helpers/src/mock.rs @@ -16,7 +16,7 @@ use std::sync::Arc; -use polkadot_node_subsystem::{jaeger, ActivatedLeaf, Event, BlockInfo}; +use polkadot_node_subsystem::{jaeger, ActivatedLeaf,BlockInfo}; use sc_client_api::UnpinHandle; use sc_keystore::LocalKeystore; use sc_utils::mpsc::tracing_unbounded; @@ -61,11 +61,11 @@ pub fn new_leaf(hash: Hash, number: BlockNumber) -> ActivatedLeaf { } /// Create a new leaf with the given hash and number. 
-pub fn new_block_import_event(hash: Hash, number: BlockNumber) -> Event { - Event::BlockImported(BlockInfo { +pub fn new_block_import_info(hash: Hash, number: BlockNumber) -> BlockInfo { + BlockInfo { hash, parent_hash: Hash::default(), number, unpin_handle: dummy_unpin_handle(hash), - }) + } } From 7557768d740a87336cb0479d78ebe2c7b816e6b2 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 27 Nov 2023 23:06:17 +0200 Subject: [PATCH 26/45] refactor CLI display of env stats Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 27 +------- .../subsystem-bench/src/core/environment.rs | 63 +++++++++++++++++-- .../src/core/mock/network_bridge.rs | 2 + .../node/subsystem-bench/src/core/network.rs | 2 +- 4 files changed, 63 insertions(+), 31 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 6282a6b63f01b..ae4e743205e38 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -390,30 +390,5 @@ pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestStat .red() ); - let stats = env.network().stats(); - gum::info!( - "Total received from network: {}", - format!( - "{} MiB", - stats - .iter() - .enumerate() - .map(|(_index, stats)| stats.tx_bytes_total as u128) - .sum::() / (1024 * 1024) - ) - .cyan() - ); - - let test_metrics = super::core::display::parse_metrics(&env.registry()); - let subsystem_cpu_metrics = - test_metrics.subset_with_label_value("task_group", "availability-recovery"); - let total_cpu = subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); - gum::info!(target: LOG_TARGET, "Total subsystem CPU usage {}", format!("{:.2}s", total_cpu).bright_purple()); - gum::info!(target: LOG_TARGET, "CPU usage per block {}", format!("{:.2}s", total_cpu/env.config().num_blocks as f64).bright_purple()); - - let test_env_cpu_metrics = - 
test_metrics.subset_with_label_value("task_group", "test-environment"); - let total_cpu = test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); - gum::info!(target: LOG_TARGET, "Total test environment CPU usage {}", format!("{:.2}s", total_cpu).bright_purple()); - gum::info!(target: LOG_TARGET, "CPU usage per block {}", format!("{:.2}s", total_cpu/env.config().num_blocks as f64).bright_purple()); + gum::info!("{}", &env); } diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index fd09de9169a42..24d10ecb1fa1d 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -18,6 +18,7 @@ use crate::{ core::{mock::AlwaysSupportsParachains, network::NetworkEmulator}, TestConfiguration, }; +use colored::Colorize; use core::time::Duration; use polkadot_overseer::{BlockInfo, Handle as OverseerHandle}; @@ -29,7 +30,10 @@ use polkadot_node_subsystem_util::metrics::prometheus::{ use sc_network::peer_store::LOG_TARGET; use sc_service::{SpawnTaskHandle, TaskManager}; -use std::net::{Ipv4Addr, SocketAddr}; +use std::{ + fmt::Display, + net::{Ipv4Addr, SocketAddr}, +}; use tokio::runtime::Handle; const MIB: f64 = 1024.0 * 1024.0; @@ -233,8 +237,8 @@ impl TestEnvironment { &self.config } - pub fn network(&mut self) -> &mut NetworkEmulator { - &mut self.network + pub fn network(&self) -> &NetworkEmulator { + &self.network } pub fn registry(&self) -> &Registry { @@ -260,7 +264,7 @@ impl TestEnvironment { }); } - // Send a signal to the subsystem under test environment. + // Send an `ActiveLeavesUpdate` signal to all subsystems under test. 
pub async fn import_block(&mut self, block: BlockInfo) { self.overseer_handle .block_imported(block) @@ -276,3 +280,54 @@ impl TestEnvironment { self.overseer_handle.stop().await; } } + +impl Display for TestEnvironment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let stats = self.network().stats(); + + writeln!(f, "\n")?; + writeln!( + f, + "Total received from network: {}", + format!( + "{} MiB", + stats + .iter() + .enumerate() + .map(|(_index, stats)| stats.tx_bytes_total as u128) + .sum::() / (1024 * 1024) + ) + .cyan() + )?; + writeln!( + f, + "Total sent to network: {}", + format!("{} KiB", stats[0].tx_bytes_total / (1024)).cyan() + )?; + + let test_metrics = super::display::parse_metrics(self.registry()); + let subsystem_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "availability-recovery"); + let total_cpu = subsystem_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + writeln!(f, "Total subsystem CPU usage {}", format!("{:.2}s", total_cpu).bright_purple())?; + writeln!( + f, + "CPU usage per block {}", + format!("{:.2}s", total_cpu / self.config().num_blocks as f64).bright_purple() + )?; + + let test_env_cpu_metrics = + test_metrics.subset_with_label_value("task_group", "test-environment"); + let total_cpu = test_env_cpu_metrics.sum_by("substrate_tasks_polling_duration_sum"); + writeln!( + f, + "Total test environment CPU usage {}", + format!("{:.2}s", total_cpu).bright_purple() + )?; + writeln!( + f, + "CPU usage per block {}", + format!("{:.2}s", total_cpu / self.config().num_blocks as f64).bright_purple() + ) + } +} diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index 144a16b9f14b5..a45cacd0241a5 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -80,6 +80,8 @@ impl MockNetworkBridgeTx { match request { 
Requests::ChunkFetchingV1(outgoing_request) => { + self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + let validator_index: usize = outgoing_request.payload.index.0 as usize; let candidate_hash = outgoing_request.payload.candidate_hash; diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index f5532087e35cd..f36c0967466bc 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -315,7 +315,7 @@ impl NetworkEmulator { } // Returns the sent/received stats for all peers. - pub fn stats(&mut self) -> Vec { + pub fn stats(&self) -> Vec { let r = self .stats .iter() From 787dc00bc7c411becbc17e04eea29fa91d1f8e00 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 10:13:11 +0200 Subject: [PATCH 27/45] Add grafana dashboards for DA read Signed-off-by: Andrei Sandu --- .../src/grafana/availability-read.json | 1872 +++++++++++++++++ .../src/grafana/task-cpu-usage.json | 755 +++++++ 2 files changed, 2627 insertions(+) create mode 100644 polkadot/node/subsystem-bench/src/grafana/availability-read.json create mode 100644 polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json diff --git a/polkadot/node/subsystem-bench/src/grafana/availability-read.json b/polkadot/node/subsystem-bench/src/grafana/availability-read.json new file mode 100644 index 0000000000000..4fbbe1f58731c --- /dev/null +++ b/polkadot/node/subsystem-bench/src/grafana/availability-read.json @@ -0,0 +1,1872 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Subsystem and test environment metrics", + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": 60000, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 90, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_validators{}", + "instant": false, + "legendFormat": "n_validators", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_cores{}", + "hide": false, + "instant": false, + "legendFormat": "n_cores", + "range": true, + "refId": "B" + } + ], + "title": "Test configuration", + "type": "timeseries" + }, + { + "collapsed": false, 
+ "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 31, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 57, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "repeat": "nodename", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[2s])) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_group}}", + "range": true, + "refId": "A" + } + ], + "title": "All tasks CPU usage breakdown", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 6 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 93, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_polling_duration_sum{task_group=\"availability-recovery-subsystem\"}[6s])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Availability subsystem CPU usage per block", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 94, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(substrate_tasks_polling_duration_sum{}) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total CPU burn", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "area" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 6000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 95, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_block_time", + "interval": "", + "legendFormat": "Instant block time", + "range": true, + "refId": "A" + } + ], + "title": "Block time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "hue", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 89, + "interval": "1s", + 
"options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_sent{}[5s]))", + "instant": false, + "legendFormat": "Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_received{}[5s]))", + "hide": false, + "instant": false, + "legendFormat": "Sent", + "range": true, + "refId": "B" + } + ], + "title": "Emulated network throughput ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 88, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_received{}[10s])", + "instant": false, + "legendFormat": "Received by {{peer}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_sent{}[10s])", + "hide": false, + "instant": false, + "legendFormat": "Sent by {{peer}}", + "range": true, + "refId": "B" + } + ], + "title": "Emulated peer throughput", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 12, + "x": 12, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 92, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + 
"showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "bytes" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(subsystem_benchmark_pov_size_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Recovered PoV sizes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. 
", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "chunks/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 43, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_chunk_requests_issued{}[10s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Chunks requested", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Availability", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 35, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + 
"title": "Availability subystem metrics", + "type": "row" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 68, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_total_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Time to recover a PoV", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + 
"transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 67, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_chunk_request_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + 
"title": "Chunk request duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 88 + }, + "id": 85, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(polkadot_parachain_availability_recovery_bytes_total{}[30s])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Bytes recovered", + "queryType": "randomWalk", +
"refId": "B" + } + ], + "title": "Recovery throughput", + "transformations": [], + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 88 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 84, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_reencode_chunks_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Re-encoding chunks timing", + "tooltip": { + "show": true, +
"showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 98 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 83, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_erasure_recovery_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": 
"{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Erasure recovery (no I/O)", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Per-second rate of availability recoveries started and finished.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 108 + }, + "id": 86, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recoveries_finished{}[1s]))", +
"format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Finished", + "queryType": "randomWalk", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recovieries_started{}[1s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Started", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Recoveries", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 118 + }, + "id": 2, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Approval voting", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "subsystem", + "benchmark" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "hide": 0, + "includeAll": false, + "label": "Source of data", + "multi": false, + "name": "data_source", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "task_name", + "value": "task_name" + }, + "description": "Sum CPU usage by task name or task group.", + "hide": 0, + "includeAll": false, + "label": "Group CPU usage", + "multi": false, + "name": "cpu_group_by", + "options": [ + { + "selected": true, + "text": "task_name", + "value": "task_name" + }, + { + "selected": false, + "text": "task_group", + "value": "task_group" + } + ], + "query": "task_name, task_group", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" 
+ } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s" + ] + }, + "timezone": "utc", + "title": "Data Availability Read", + "uid": "asdadasd1", + "version": 56, + "weekStart": "" + } \ No newline at end of file diff --git a/polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json b/polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json new file mode 100644 index 0000000000000..90763444abf19 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json @@ -0,0 +1,755 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:326", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "$$hashKey": "object:327", + "datasource": { + "uid": "$data_source" + }, + "enable": true, + "expr": "increase(${metric_namespace}_tasks_ended_total{reason=\"panic\", node=~\"${nodename}\"}[10m])", + "hide": true, + "iconColor": "rgba(255, 96, 96, 1)", + "limit": 100, + "name": "Task panics", + "rawQuery": "SELECT\n extract(epoch from time_column) AS time,\n text_column as text,\n tags_column as tags\nFROM\n metric_table\nWHERE\n $__timeFilter(time_column)\n", + "showIn": 0, + "step": "10m", + "tags": [], + "textFormat": "{{node}} - {{task_name}}", + "titleFormat": "Panic!", + "type": "tags" + }, + { + "$$hashKey": "object:621", + "datasource": { + "uid": "$data_source" + }, + "enable": true, + "expr": "changes(${metric_namespace}_process_start_time_seconds{node=~\"${nodename}\"}[10m])", + "hide": false, + "iconColor": "#8AB8FF", + "name": "Node reboots", + "showIn": 0, + "step": "10m", + "textFormat": "{{node}}", + "titleFormat": "Reboots" + } + ] + }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 29, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Tasks", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 11, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[$__rate_interval])) by (task_name)", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU time spent on each task", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": 
"object:2721", + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2722", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 30, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "rate(substrate_tasks_polling_duration_count{}[$__rate_interval])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Task polling rate per second", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2571", + "format": "cps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2572", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 43, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_polling_duration_sum{}[$__rate_interval]) / increase(substrate_tasks_polling_duration_count{}[$__rate_interval])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Average time it takes to call Future::poll()", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2571", + "format": "s", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:2572", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + 
}, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 15, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": true, + "values": true + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": true, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_spawned_total{}[$__rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of tasks started", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:771", + "format": "short", + "logBase": 10, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:772", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + 
"total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "substrate_tasks_spawned_total{} - sum(substrate_tasks_ended_total{}) without(reason)\n\n# Fallback if tasks_ended_total is null for that task\nor on(task_name) substrate_tasks_spawned_total{}", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of tasks running", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:919", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:920", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 34 + }, + "hiddenSeries": false, + "id": 7, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + 
}, + "percentage": false, + "pluginVersion": "10.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "nodename", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": true, + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "irate(substrate_tasks_polling_duration_bucket{le=\"+Inf\"}[$__rate_interval])\n - ignoring(le)\n irate(substrate_tasks_polling_duration_bucket{le=\"1.024\"}[$__rate_interval]) > 0", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Number of calls to `Future::poll` that took more than one second", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3040", + "format": "cps", + "label": "Calls to `Future::poll`/second", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3041", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 27, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Unbounded Channels", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "utc", + "title": "Substrate Service Tasks with substrate prefix", + "uid": "S7sc-M_Gk", + "version": 17, + "weekStart": "" + } \ No newline at end of 
file From cd18f8de2d4963c4fafe05423290c25a667be190 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 10:14:09 +0200 Subject: [PATCH 28/45] network stats fixes Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 10 ------ .../subsystem-bench/src/core/configuration.rs | 7 +++-- .../node/subsystem-bench/src/core/display.rs | 4 +-- .../subsystem-bench/src/core/environment.rs | 2 +- .../src/core/mock/network_bridge.rs | 31 +++++++++++++------ polkadot/node/subsystem-bench/src/core/mod.rs | 8 ----- .../node/subsystem-bench/src/core/network.rs | 23 ++++++++++++-- 7 files changed, 49 insertions(+), 36 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index ae4e743205e38..a5f1a0866a5ba 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -100,16 +100,6 @@ pub fn prepare_test( prepare_test_inner(config, state, TestEnvironmentDependencies::default()) } -/// Takes a test configuration and uses it to creates the `TestEnvironment`. -#[allow(unused)] -pub fn prepare_test_with_dependencies( - config: TestConfiguration, - state: &mut TestState, - dependencies: TestEnvironmentDependencies, -) -> (TestEnvironment, ProtocolConfig) { - prepare_test_inner(config, state, dependencies) -} - fn prepare_test_inner( config: TestConfiguration, state: &mut TestState, diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs index 35fa51790c911..340b5c03ab84a 100644 --- a/polkadot/node/subsystem-bench/src/core/configuration.rs +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -13,13 +13,14 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use std::path::Path; - +// +//! Test configuration definition and helpers. 
use super::*; use keyring::Keyring; +use std::{path::Path, time::Duration}; pub use crate::cli::TestObjective; -use polkadot_primitives::ValidatorId; +use polkadot_primitives::{AuthorityDiscoveryId, ValidatorId}; use rand::{distributions::Uniform, prelude::Distribution, thread_rng}; use serde::{Deserialize, Serialize}; diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 13ea7d375e95b..f21a8b907d118 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -14,8 +14,8 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . // -//! Some helper methods for parsing prometheus metrics to a format that can be -//! displayed in the CLI. +//! Display implementations and helper methods for parsing prometheus metrics +//! to a format that can be displayed in the CLI. //! //! Currently histogram buckets are skipped. use super::LOG_TARGET; diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index 24d10ecb1fa1d..5c04071c442f7 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -13,7 +13,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . - +//! 
Test environment implementation use crate::{ core::{mock::AlwaysSupportsParachains, network::NetworkEmulator}, TestConfiguration, diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index a45cacd0241a5..c14a3895e238c 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -80,7 +80,16 @@ impl MockNetworkBridgeTx { match request { Requests::ChunkFetchingV1(outgoing_request) => { + let authority_discovery_id = match outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + // Account our sent request bytes. self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + // Account for remote received request bytes. + self.network + .peer_stats_by_id(authority_discovery_id.clone()) + .inc_received(outgoing_request.payload.encoded_size()); let validator_index: usize = outgoing_request.payload.index.0 as usize; let candidate_hash = outgoing_request.payload.candidate_hash; @@ -107,10 +116,6 @@ impl MockNetworkBridgeTx { Ok(req_res::v1::ChunkFetchingResponse::from(Some(chunk)).encode()) }; - let authority_discovery_id = match outgoing_request.peer { - req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, - _ => unimplemented!("Peer recipient not supported yet"), - }; let authority_discovery_id_clone = authority_discovery_id.clone(); let future = async move { @@ -142,7 +147,18 @@ impl MockNetworkBridgeTx { .candidate_hashes .get(&candidate_hash) .expect("candidate was generated previously; qed"); - gum::warn!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + gum::debug!(target: LOG_TARGET, ?candidate_hash, candidate_index, "Candidate mapped to index"); + + let authority_discovery_id = match 
outgoing_request.peer { + req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, + _ => unimplemented!("Peer recipient not supported yet"), + }; + // Account our sent request bytes. + self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + // Account for remote received request bytes. + self.network + .peer_stats_by_id(authority_discovery_id.clone()) + .inc_received(outgoing_request.payload.encoded_size()); let available_data = self.availabilty.available_data.get(*candidate_index as usize).unwrap().clone(); @@ -161,10 +177,6 @@ impl MockNetworkBridgeTx { } .boxed(); - let authority_discovery_id = match outgoing_request.peer { - req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, - _ => unimplemented!("Peer recipient not supported yet"), - }; let authority_discovery_id_clone = authority_discovery_id.clone(); let future_wrapper = async move { @@ -243,6 +255,7 @@ impl MockNetworkBridgeTx { gum::debug!(target: LOG_TARGET, request = ?request, "Processing request"); self.network.inc_sent(request_size(&request)); let action = self.respond_to_send_request(request, &mut ingress_tx); + // Will account for our node sending the request over the emulated // network. self.network.submit_peer_action(action.peer(), action); diff --git a/polkadot/node/subsystem-bench/src/core/mod.rs b/polkadot/node/subsystem-bench/src/core/mod.rs index 11ca03dbda4c2..282788d143b44 100644 --- a/polkadot/node/subsystem-bench/src/core/mod.rs +++ b/polkadot/node/subsystem-bench/src/core/mod.rs @@ -14,16 +14,8 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . 
-use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, Instant}, -}; const LOG_TARGET: &str = "subsystem-bench::core"; -use polkadot_primitives::AuthorityDiscoveryId; -use sc_service::SpawnTaskHandle; - pub mod configuration; pub mod display; pub mod environment; diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index f36c0967466bc..40809ce36e8dd 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -15,8 +15,17 @@ // along with Polkadot. If not, see . use super::*; use colored::Colorize; +use polkadot_primitives::AuthorityDiscoveryId; use prometheus_endpoint::U64; -use std::sync::atomic::{AtomicU64, Ordering}; +use sc_service::SpawnTaskHandle; +use std::{ + collections::HashMap, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; use tokio::sync::mpsc::UnboundedSender; // An emulated node egress traffic rate_limiter. @@ -309,11 +318,20 @@ impl NetworkEmulator { self.peers[*index].send(action); } - // Returns the sent/received stats for all peers. + // Returns the sent/received stats for `peer_index`. pub fn peer_stats(&mut self, peer_index: usize) -> Arc { self.stats[peer_index].clone() } + // Returns the sent/received stats for `peer`. + pub fn peer_stats_by_id(&mut self, peer: AuthorityDiscoveryId) -> Arc { + let peer_index = self + .validator_authority_ids + .get(&peer) + .expect("all test authorities are valid; qed"); + self.stats[*peer_index].clone() + } + // Returns the sent/received stats for all peers. pub fn stats(&self) -> Vec { let r = self @@ -334,7 +352,6 @@ impl NetworkEmulator { } // Increment bytes received by our node (the node that contains the subsystem under test) - #[allow(unused)] pub fn inc_received(&self, bytes: u64) { // Our node always is peer 0. 
self.metrics.on_peer_received(0, bytes); From e8506b3d663a408b67cbff21749c3d273aa0c031 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 10:16:41 +0200 Subject: [PATCH 29/45] move examples and grafana Signed-off-by: Andrei Sandu --- .../{src => }/grafana/availability-read.json | 0 .../{src => }/grafana/task-cpu-usage.json | 0 .../examples/availability_read.yaml} | 27 ++++++++++--------- 3 files changed, 14 insertions(+), 13 deletions(-) rename polkadot/node/subsystem-bench/{src => }/grafana/availability-read.json (100%) rename polkadot/node/subsystem-bench/{src => }/grafana/task-cpu-usage.json (100%) rename polkadot/node/subsystem-bench/{test_sequence.yaml => src/examples/availability_read.yaml} (75%) diff --git a/polkadot/node/subsystem-bench/src/grafana/availability-read.json b/polkadot/node/subsystem-bench/grafana/availability-read.json similarity index 100% rename from polkadot/node/subsystem-bench/src/grafana/availability-read.json rename to polkadot/node/subsystem-bench/grafana/availability-read.json diff --git a/polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json b/polkadot/node/subsystem-bench/grafana/task-cpu-usage.json similarity index 100% rename from polkadot/node/subsystem-bench/src/grafana/task-cpu-usage.json rename to polkadot/node/subsystem-bench/grafana/task-cpu-usage.json diff --git a/polkadot/node/subsystem-bench/test_sequence.yaml b/polkadot/node/subsystem-bench/src/examples/availability_read.yaml similarity index 75% rename from polkadot/node/subsystem-bench/test_sequence.yaml rename to polkadot/node/subsystem-bench/src/examples/availability_read.yaml index 088a7e15729b2..889309e64a2bd 100644 --- a/polkadot/node/subsystem-bench/test_sequence.yaml +++ b/polkadot/node/subsystem-bench/src/examples/availability_read.yaml @@ -1,10 +1,10 @@ TestConfiguration: # Test 1 - objective: !DataAvailabilityRead - fetch_from_backers: false + fetch_from_backers: true n_validators: 300 - n_cores: 10 - min_pov_size: 1120 + n_cores: 20 + 
min_pov_size: 5120 max_pov_size: 5120 peer_bandwidth: 52428800 bandwidth: 52428800 @@ -16,13 +16,14 @@ TestConfiguration: secs: 0 nanos: 100000000 error: 3 - num_blocks: 10 + num_blocks: 3 + # Test 2 - objective: !DataAvailabilityRead - fetch_from_backers: false + fetch_from_backers: true n_validators: 500 - n_cores: 10 - min_pov_size: 1120 + n_cores: 20 + min_pov_size: 5120 max_pov_size: 5120 peer_bandwidth: 52428800 bandwidth: 52428800 @@ -34,14 +35,14 @@ TestConfiguration: secs: 0 nanos: 100000000 error: 3 - num_blocks: 10 + num_blocks: 3 -# Test 2 +# Test 3 - objective: !DataAvailabilityRead - fetch_from_backers: false + fetch_from_backers: true n_validators: 1000 - n_cores: 10 - min_pov_size: 1120 + n_cores: 20 + min_pov_size: 5120 max_pov_size: 5120 peer_bandwidth: 52428800 bandwidth: 52428800 @@ -53,4 +54,4 @@ TestConfiguration: secs: 0 nanos: 100000000 error: 3 - num_blocks: 10 + num_blocks: 3 From cbb677202c14fe04c013100c7910dfeecdf26b5b Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 14:52:42 +0200 Subject: [PATCH 30/45] Add readme Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/README.md | 182 +++++++++++++++++- .../{src => }/examples/availability_read.yaml | 0 .../subsystem-bench/src/availability/cli.rs | 5 +- .../subsystem-bench/src/availability/mod.rs | 2 +- polkadot/node/subsystem-bench/src/cli.rs | 2 +- .../node/subsystem-bench/src/core/display.rs | 18 +- .../subsystem-bench/src/subsystem-bench.rs | 24 +-- 7 files changed, 211 insertions(+), 22 deletions(-) rename polkadot/node/subsystem-bench/{src => }/examples/availability_read.yaml (100%) diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index 8843f9883116f..4ed25ff9078c5 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -1,6 +1,182 @@ # Subsystem benchmark client -Run subsystem performance tests in isolation. 
+Run parachain consensus stress and performance tests on your development machine. + +## Motivation +The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or `dispute-coordinator`. In the absence of this client, we would run large test nets in order to load/stress test these parts of the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink. + +This tool aims to solve this problem by making it easy to: +- set up and run core subsystem load tests locally on your development machine +- iterate and conclude faster when benchmarking new optimizations or comparing implementations +- automate and keep track of performance regressions in CI runs +- simulate various networking topologies, bandwidth and connectivity issues + +## Test environment setup + +`cargo build --profile=testnet --bin subsystem-bench -p polkadot-subsystem-bench` + +The output binary will be placed in `target/testnet/subsystem-bench`. + +### Test metrics +Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution. +A small subset of these collected metrics are displayed in the CLI, but for an in-depth analysis of the test results, a local Grafana/Prometheus stack is needed. + +### Install Prometheus +Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your platform/OS. + +After successfully installing and starting up Prometheus, we need to alter its configuration such that it +will scrape the benchmark prometheus endpoint `127.0.0.1:9999`.
Please check the prometheus official documentation +regarding the location of `prometheus.yml`. On MacOS for example the full path `/opt/homebrew/etc/prometheus.yml` + +prometheus.yml: +``` +global: + scrape_interval: 5s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "subsystem-bench" + scrape_interval: 0s500ms + static_configs: + - targets: ['localhost:9999'] +``` + +To complete this step restart Prometheus server such that it picks up the new configuration. +### Install and setup Grafana + +Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant +to your operating system. + +Once you have the installation up and running, configure the local Prometheus as a data source by following +[this guide](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/) + +#### Import dashboards + +Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) to import the dashboards from the repository `grafana` folder. + +## Running existing tests + +To run a test, you need to first choose a test objective. Currently, we support the following: + +``` +target/testnet/subsystem-bench --help +The almighty Subsystem Benchmark Tool™️ + +Usage: subsystem-bench [OPTIONS] + +Commands: + data-availability-read Benchmark availability recovery strategies + test-sequence Run a test sequence specified in a file + help Print this message or the help of the given subcommand(s) + +``` + +The `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is tipically used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). 
+ +### Standard test options + +``` +Options: + --network The type of network to be emulated [default: ideal] [possible values: ideal, + healthy, degraded] + --n-cores Number of cores to fetch availability for [default: 100] + --n-validators Number of validators to fetch chunks from [default: 500] + --min-pov-size The minimum pov size in KiB [default: 5120] + --max-pov-size The maximum pov size bytes [default: 5120] + -n, --num-blocks The number of blocks the test is going to run [default: 1] + -p, --peer-bandwidth The bandwidth of simulated remote peers in KiB + -b, --bandwidth The bandwidth of our simulated node in KiB + --peer-error Simulated conection error ratio [0-100] + --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] + --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] + -h, --help Print help + -V, --version Print version +``` + +These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file. + +### Test objectives +Each test objective can have its own specific configuration options, in contrast with the standard test options. + +For `data-availability-read` the recovery strategy to be used is configurable. +``` +target/testnet/subsystem-bench data-availability-read --help +Benchmark availability recovery strategies + +Usage: subsystem-bench data-availability-read [OPTIONS] + +Options: + -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as we + don't need to re-construct from chunks. Tipically this is only faster if nodes have enough + bandwidth + -h, --help Print help +``` +### Understanding the test configuration +A single test configuration `TestConfiguration` struct applies to a single run of a certain test objective.
+ +The configuration describes the following important parameters that influence the test duration and resource +usage: +- how many validators are on the emulated network (`n_validators`) +- how many cores per block the subsystem will have to do work on (`n_cores`) +- for how many blocks the test should run (`num_blocks`) + +From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal +followed by an arbitrary amount of messages. The process repeat itself for `num_blocks`. These messages are generally test payloads pre-generated before the test run, or constructed on pre-genereated paylods. For example the `AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before the test is started. + +### Example run + +Let's run an availability read test which will recover availability for 10 cores with max PoV size on a 500 +node validator network. + +``` + target/testnet/subsystem-bench --n-cores 10 data-availability-read +[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = None +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880 +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment. +[2023-11-28T09:01:59Z INFO subsystem-bench::availability] Pre-generating 10 candidates. +[2023-11-28T09:02:01Z INFO subsystem-bench::core] Initializing network emulation for 500 peers.
+[2023-11-28T09:02:01Z INFO substrate_prometheus_endpoint] 〽️ Prometheus exporter started at 127.0.0.1:9999 +[2023-11-28T09:02:01Z INFO subsystem-bench::availability] Current block 1/1 +[2023-11-28T09:02:01Z INFO subsystem_bench::availability] 10 recoveries pending +[2023-11-28T09:02:04Z INFO subsystem_bench::availability] Block time 3231ms +[2023-11-28T09:02:04Z INFO subsystem-bench::availability] Sleeping till end of block (2768ms) +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] + + Total received from network: 66 MiB + Total sent to network: 58 KiB + Total subsystem CPU usage 4.16s + CPU usage per block 4.16s + Total test environment CPU usage 0.00s + CPU usage per block 0.00s +``` +### Test logs + +You can select node cateogries and verbosity as with the Polkadot clien, simply setting `RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. + +### View test metrics + +Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to +view the test progress in real time by accessing [this link](http://localhost:3000/goto/i1vzLpNSR?orgId=1). + +Now run `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` and view the metrics in real time and spot differences between different `n_valiator` values. + +## Create new test objectives +This tool is intended to make it easy to write new test objectives that focus individual subsystems, +or even multiple subsystems (for example `approval-distribution` and `approval-voting`). + +A special kind of test objectives are performance regression tests for the CI pipeline. 
These should be sequences +of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both happy and negative scenarios (low bandwidth, network errors and low connectivity). + +### Reuaseble test components +To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment` `TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will +need to also build an `Overseer`, but that should be easy using the mockups for subsystems in`core::mock`. + +### Mocking +Ideally we want to have a single mock implementation for subsystems that can be minimally configured to +be used in different tests. A good example is `runtime-api` which currently only responds to session information requests based on static data. It can be easily extended to service other requests. -Currently implemented benchmarks: -* `availability-recovery` diff --git a/polkadot/node/subsystem-bench/src/examples/availability_read.yaml b/polkadot/node/subsystem-bench/examples/availability_read.yaml similarity index 100% rename from polkadot/node/subsystem-bench/src/examples/availability_read.yaml rename to polkadot/node/subsystem-bench/examples/availability_read.yaml diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs index 06fb2966d878c..f86f1bfb700dc 100644 --- a/polkadot/node/subsystem-bench/src/availability/cli.rs +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -34,7 +34,8 @@ pub enum NetworkEmulation { #[allow(missing_docs)] pub struct DataAvailabilityReadOptions { #[clap(short, long, default_value_t = false)] - /// Turbo boost AD Read by fetching from backers first. Tipically this is only faster if nodes - /// have enough bandwidth. + /// Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as + /// we don't need to re-construct from chunks. 
Tipically this is only faster if nodes have + /// enough bandwidth. pub fetch_from_backers: bool, } diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index a5f1a0866a5ba..e5543e5d39042 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -312,7 +312,7 @@ fn derive_erasure_chunks_with_proofs_and_root( (erasure_chunks, root) } -pub async fn bench_chunk_recovery(env: &mut TestEnvironment, mut state: TestState) { +pub async fn benchmark_availability_read(env: &mut TestEnvironment, mut state: TestState) { let config = env.config().clone(); env.import_block(new_block_import_info(Hash::repeat_byte(1), 1)).await; diff --git a/polkadot/node/subsystem-bench/src/cli.rs b/polkadot/node/subsystem-bench/src/cli.rs index ee67a01d449e3..3352f33a3503b 100644 --- a/polkadot/node/subsystem-bench/src/cli.rs +++ b/polkadot/node/subsystem-bench/src/cli.rs @@ -26,7 +26,7 @@ pub struct TestSequenceOptions { /// Define the supported benchmarks targets #[derive(Debug, Clone, clap::Parser, Serialize, Deserialize)] -#[command(about = "Test objectives", version, rename_all = "kebab-case")] +#[command(rename_all = "kebab-case")] pub enum TestObjective { /// Benchmark availability recovery strategies. DataAvailabilityRead(DataAvailabilityReadOptions), diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index f21a8b907d118..629fb2edc4146 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -18,7 +18,7 @@ //! to a format that can be displayed in the CLI. //! //! Currently histogram buckets are skipped. 
-use super::LOG_TARGET; +use super::{LOG_TARGET, configuration::TestConfiguration}; use colored::Colorize; use prometheus::{ proto::{MetricFamily, MetricType}, @@ -181,3 +181,19 @@ pub fn parse_metrics(registry: &Registry) -> MetricCollection { } test_metrics.into() } + + +pub fn display_configuration(test_config: &TestConfiguration) { + gum::info!( + "{}, {}, {}, {}, {}", + format!("n_validators = {}", test_config.n_validators).blue(), + format!("n_cores = {}", test_config.n_cores).blue(), + format!( + "pov_size = {} - {}", + test_config.min_pov_size, test_config.max_pov_size + ) + .bright_black(), + format!("error = {}", test_config.error).bright_black(), + format!("latency = {:?}", test_config.latency).bright_black(), + ); +} diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index f9261d848778b..a666ee06ad55d 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -35,7 +35,8 @@ use core::{ }; use clap_num::number_range; -// const LOG_TARGET: &str = "subsystem-bench"; + +use crate::core::display::display_configuration; fn le_100(s: &str) -> Result { number_range(s, 0, 100) @@ -64,7 +65,7 @@ struct BenchCli { pub bandwidth: Option, #[clap(long, value_parser=le_100)] - /// Simulated connection error rate [0-100]. + /// Simulated conection error ratio [0-100]. 
pub peer_error: Option, #[clap(long, value_parser=le_5000)] @@ -95,22 +96,14 @@ impl BenchCli { ); for (index, test_config) in test_sequence.into_iter().enumerate() { gum::info!( - "{}, {}, {}, {}, {}, {}", + "{}", format!("Step {}/{}", index + 1, num_steps).bright_purple(), - format!("n_validators = {}", test_config.n_validators).blue(), - format!("n_cores = {}", test_config.n_cores).blue(), - format!( - "pov_size = {} - {}", - test_config.min_pov_size, test_config.max_pov_size - ) - .bright_black(), - format!("error = {}", test_config.error).bright_black(), - format!("latency = {:?}", test_config.latency).bright_black(), ); + display_configuration(&test_config); let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); - env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); + env.runtime().block_on(availability::benchmark_availability_read(&mut env, state)); } return Ok(()) }, @@ -166,10 +159,12 @@ impl BenchCli { test_config.bandwidth = bandwidth * 1024; } + display_configuration(&test_config); + let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); // test_config.write_to_disk(); - env.runtime().block_on(availability::bench_chunk_recovery(&mut env, state)); + env.runtime().block_on(availability::benchmark_availability_read(&mut env, state)); Ok(()) } @@ -181,6 +176,7 @@ fn main() -> eyre::Result<()> { .filter(Some("hyper"), log::LevelFilter::Info) // Avoid `Terminating due to subsystem exit subsystem` warnings .filter(Some("polkadot_overseer"), log::LevelFilter::Error) + .filter(None, log::LevelFilter::Info) // .filter(None, log::LevelFilter::Trace) .try_init() .unwrap(); From 1a8087010d36a8712482ba9f03d383e17e547623 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 15:04:32 +0200 Subject: [PATCH 31/45] fmt + readme updates Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/README.md | 
22 ++++++++++--------- .../subsystem-bench/src/availability/cli.rs | 4 ++-- .../node/subsystem-bench/src/core/display.rs | 12 ++++------ .../subsystem-bench/src/subsystem-bench.rs | 11 +++++----- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index 4ed25ff9078c5..5b58dc3a5be48 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -3,9 +3,9 @@ Run parachain consensus stress and performance tests on your development machine. ## Motivation -The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or `dispute-coordinator`. In the absence of this client, we would run large test nets in order to load/stress test these parts of the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink. +The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or `dispute-coordinator`. In the absence of such a tool, we would run large test nets to load/stress test these parts of the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink.
-This tool aims to solve this problem by making it easy to: +This tool aims to solve the problem by making it easy to: - set up and run core subsystem load tests locally on your development machine - iterate and conclude faster when benchmarking new optimizations or comparing implementations - automate and keep track of performance regressions in CI runs @@ -56,7 +56,7 @@ Once you have the installation up and running, configure the local Prometheus as Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) to import the dashboards from the repository `grafana` folder. -## Running existing tests +## How to run a test To run a test, you need to first choose a test objective. Currently, we support the following: @@ -68,12 +68,10 @@ Usage: subsystem-bench [OPTIONS] Commands: data-availability-read Benchmark availability recovery strategies - test-sequence Run a test sequence specified in a file - help Print this message or the help of the given subcommand(s) ``` -The `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is tipically used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). +Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is typically used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). ### Standard test options @@ -123,7 +121,7 @@ usage: - for how many blocks the test should run (`num_blocks`) From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal -followed by an arbitrary amount of messages. The process repeat itself for `num_blocks`. These messages are generally test payloads pre-generated before the test run, or constructed on pre-genereated paylods.
For example the `AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before the test is started. +followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. The messages are generally test payloads pre-generated before the test run, or constructed on pre-generated payloads. For example the `AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before the test is started. ### Example run @@ -154,14 +152,18 @@ node validator network. Total test environment CPU usage 0.00s CPU usage per block 0.00s ``` + +`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it took the subsystem to finish processing all of the messages sent in the context of the current test block. + + ### Test logs -You can select node cateogries and verbosity as with the Polkadot clien, simply setting `RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. +You can select log target, subtarget and verbosity just like with the Polkadot node CLI; simply setting `RUST_LOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. ### View test metrics Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to -view the test progress in real time by accessing [this link](http://localhost:3000/goto/i1vzLpNSR?orgId=1). +view the test progress in real time by accessing [this link](http://localhost:3000/goto/SM5B8pNSR?orgId=1). Now run `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` and view the metrics in real time and spot differences between different `n_valiator` values.
@@ -172,7 +174,7 @@ or even multiple subsystems (for example `approval-distribution` and `approval-v A special kind of test objectives are performance regression tests for the CI pipeline. These should be sequences of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both happy and negative scenarios (low bandwidth, network errors and low connectivity). -### Reuaseble test components +### Reusable test components To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment` `TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will need to also build an `Overseer`, but that should be easy using the mockups for subsystems in`core::mock`. diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs index f86f1bfb700dc..8da4a59253c62 100644 --- a/polkadot/node/subsystem-bench/src/availability/cli.rs +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -34,8 +34,8 @@ pub enum NetworkEmulation { #[allow(missing_docs)] pub struct DataAvailabilityReadOptions { #[clap(short, long, default_value_t = false)] - /// Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as - /// we don't need to re-construct from chunks. Tipically this is only faster if nodes have + /// Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as + /// we don't need to re-construct from chunks. Tipically this is only faster if nodes have /// enough bandwidth. 
pub fetch_from_backers: bool, } diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 629fb2edc4146..03a5c13aeb47d 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -14,11 +14,11 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . // -//! Display implementations and helper methods for parsing prometheus metrics +//! Display implementations and helper methods for parsing prometheus metrics //! to a format that can be displayed in the CLI. //! //! Currently histogram buckets are skipped. -use super::{LOG_TARGET, configuration::TestConfiguration}; +use super::{configuration::TestConfiguration, LOG_TARGET}; use colored::Colorize; use prometheus::{ proto::{MetricFamily, MetricType}, @@ -182,17 +182,13 @@ pub fn parse_metrics(registry: &Registry) -> MetricCollection { test_metrics.into() } - pub fn display_configuration(test_config: &TestConfiguration) { gum::info!( "{}, {}, {}, {}, {}", format!("n_validators = {}", test_config.n_validators).blue(), format!("n_cores = {}", test_config.n_cores).blue(), - format!( - "pov_size = {} - {}", - test_config.min_pov_size, test_config.max_pov_size - ) - .bright_black(), + format!("pov_size = {} - {}", test_config.min_pov_size, test_config.max_pov_size) + .bright_black(), format!("error = {}", test_config.error).bright_black(), format!("latency = {:?}", test_config.latency).bright_black(), ); diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index a666ee06ad55d..5337a13e9729d 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -95,15 +95,13 @@ impl BenchCli { format!("Sequence contains {} step(s)", num_steps).bright_purple() ); for (index, test_config) in test_sequence.into_iter().enumerate() { 
- gum::info!( - "{}", - format!("Step {}/{}", index + 1, num_steps).bright_purple(), - ); + gum::info!("{}", format!("Step {}/{}", index + 1, num_steps).bright_purple(),); display_configuration(&test_config); let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); - env.runtime().block_on(availability::benchmark_availability_read(&mut env, state)); + env.runtime() + .block_on(availability::benchmark_availability_read(&mut env, state)); } return Ok(()) }, @@ -164,7 +162,8 @@ impl BenchCli { let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); // test_config.write_to_disk(); - env.runtime().block_on(availability::benchmark_availability_read(&mut env, state)); + env.runtime() + .block_on(availability::benchmark_availability_read(&mut env, state)); Ok(()) } From eb49ea0277b89ecbd1cc14613dc0f6eb6e0bd9b7 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 15:14:26 +0200 Subject: [PATCH 32/45] update dashboard and sample Signed-off-by: Andrei Sandu --- .../examples/availability_read.yaml | 6 +- .../grafana/availability-read.json | 3418 +++++++++-------- 2 files changed, 1713 insertions(+), 1711 deletions(-) diff --git a/polkadot/node/subsystem-bench/examples/availability_read.yaml b/polkadot/node/subsystem-bench/examples/availability_read.yaml index 889309e64a2bd..311ea972141fc 100644 --- a/polkadot/node/subsystem-bench/examples/availability_read.yaml +++ b/polkadot/node/subsystem-bench/examples/availability_read.yaml @@ -1,7 +1,7 @@ TestConfiguration: # Test 1 - objective: !DataAvailabilityRead - fetch_from_backers: true + fetch_from_backers: false n_validators: 300 n_cores: 20 min_pov_size: 5120 @@ -20,7 +20,7 @@ TestConfiguration: # Test 2 - objective: !DataAvailabilityRead - fetch_from_backers: true + fetch_from_backers: false n_validators: 500 n_cores: 20 min_pov_size: 5120 @@ -39,7 +39,7 @@ TestConfiguration: # Test 3 - 
objective: !DataAvailabilityRead - fetch_from_backers: true + fetch_from_backers: false n_validators: 1000 n_cores: 20 min_pov_size: 5120 diff --git a/polkadot/node/subsystem-bench/grafana/availability-read.json b/polkadot/node/subsystem-bench/grafana/availability-read.json index 4fbbe1f58731c..31c4ad3c79523 100644 --- a/polkadot/node/subsystem-bench/grafana/availability-read.json +++ b/polkadot/node/subsystem-bench/grafana/availability-read.json @@ -1,1872 +1,1874 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "description": "Subsystem and test environment metrics", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 2, - "links": [], - "liveNow": false, - "panels": [ + "annotations": { + "list": [ { + "builtIn": 1, "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "type": "dashboard" + } + ] + }, + "description": "Subsystem and test environment metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": 60000, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "auto", + "spanNulls": 60000, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 90, - "interval": "1s", - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "editorMode": "code", - "expr": "subsystem_benchmark_n_validators{}", - "instant": false, - "legendFormat": "n_vaidators", - "range": true, - "refId": "A" - }, - { - "datasource": { 
- "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "editorMode": "code", - "expr": "subsystem_benchmark_n_cores{}", - "hide": false, - "instant": false, - "legendFormat": "n_cores", - "range": true, - "refId": "B" - } - ], - "title": "Test configuration", - "type": "timeseries" + "overrides": [] }, - { - "collapsed": false, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 90, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "id": 31, - "panels": [], - "targets": [ - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "refId": "A" - } - ], - "title": "Overview", - "type": "row" + "tooltip": { + "mode": "multi", + "sort": "none" + } }, - { - "datasource": { - "type": "prometheus", - "uid": "$data_source" + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_validators{}", + "instant": false, + "legendFormat": "n_vaidators", + "range": true, + "refId": "A" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_n_cores{}", + "hide": false, + "instant": false, + "legendFormat": "n_cores", + "range": true, + "refId": "B" + } + ], + "title": "Test configuration", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 31, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": 
"row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" }, - "unit": "percentunit" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 57, - "interval": "1s", - "options": { - "legend": { - "calcs": [ - "mean", - "min", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - 
"mode": "multi", - "sort": "desc" - } + "unit": "percentunit" }, - "pluginVersion": "10.0.2", - "repeat": "nodename", - "targets": [ - { - "datasource": { - "uid": "$data_source" - }, - "editorMode": "code", - "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[2s])) by ($cpu_group_by)", - "interval": "", - "legendFormat": "{{task_group}}", - "range": true, - "refId": "A" - } - ], - "title": "All tasks CPU usage breakdown", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "$data_source" + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 57, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "repeat": "nodename", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(rate(substrate_tasks_polling_duration_sum{}[2s])) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_group}}", + "range": true, + "refId": "A" + } + ], + "title": "All tasks CPU usage breakdown", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": 
"none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "area" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 6 - } - ] + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" }, - "unit": "s" + "thresholdsStyle": { + "mode": "area" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 93, - "interval": "1s", - "options": { - "legend": { - "calcs": [ - "mean", - "min", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 6 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } + "unit": "s" }, - "pluginVersion": "10.0.2", - "targets": [ - { - "datasource": { - "uid": "$data_source" - }, - "editorMode": "code", - "expr": "increase(substrate_tasks_polling_duration_sum{task_group=\"availability-recovery-subsystem\"}[6s])", - "interval": "", - "legendFormat": "{{task_name}}", - "range": true, - "refId": "A" - } - ], - "title": "Availability subsystem CPU usage per block", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "$data_source" + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 93, + "interval": "1s", + 
"options": { + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "increase(substrate_tasks_polling_duration_sum{task_group=\"availability-recovery\"}[6s])", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Availability subsystem CPU usage per block", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - 
"value": null - } - ] + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "s" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 30 - }, - "id": 94, - "interval": "1s", - "options": { - "legend": { - "calcs": [ - "last" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } + "unit": "s" }, - "pluginVersion": "10.0.2", - "targets": [ - { - "datasource": { - "uid": "$data_source" - }, - "editorMode": "code", - "expr": "sum(substrate_tasks_polling_duration_sum{}) by ($cpu_group_by)", - "interval": "", - "legendFormat": "{{task_name}}", - "range": true, - "refId": "A" - } - ], - "title": "Total CPU burn", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "$data_source" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 94, + "interval": "1s", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "sum(substrate_tasks_polling_duration_sum{}) by ($cpu_group_by)", + "interval": "", + "legendFormat": "{{task_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Total CPU burn", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$data_source" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "area" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "dark-red", - "value": 6000 - } - ] + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "ms" + "thresholdsStyle": { + "mode": "area" + } }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 40 - }, - "id": 95, - "interval": "1s", - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true, - "sortBy": "Last", - "sortDesc": true + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 6000 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } + "unit": "ms" }, - "pluginVersion": "10.0.2", - "targets": [ - { - "datasource": { - "uid": "$data_source" - }, 
- "editorMode": "code", - "expr": "subsystem_benchmark_block_time", - "interval": "", - "legendFormat": "Instant block time", - "range": true, - "refId": "A" - } - ], - "title": "Block time", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 95, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "uid": "$data_source" + }, + "editorMode": "code", + "expr": "subsystem_benchmark_block_time", + "interval": "", + "legendFormat": "Instant block time", + "range": true, + "refId": "A" + } + ], + "title": "All candidates in block recovery time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "hue", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 100, - "gradientMode": "hue", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 2, - "scaleDistribution": { - "log": 2, - "type": "log" - 
}, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 2, + "scaleDistribution": { + "log": 2, + "type": "log" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" }, - "unit": "binBps" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 40 - }, - "id": 89, - "interval": "1s", - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "unit": "binBps" }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "editorMode": "code", - "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_sent{}[5s]))", - "instant": false, - "legendFormat": "Received", - "range": true, - "refId": "A" + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 89, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" }, - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "editorMode": "code", - "expr": 
"sum(rate(subsystem_benchmark_network_peer_total_bytes_received{}[5s]))", - "hide": false, - "instant": false, - "legendFormat": "Sent", - "range": true, - "refId": "B" - } - ], - "title": "Emulated network throughput ", - "type": "timeseries" + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_received{}[5s]))", + "instant": false, + "legendFormat": "Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "sum(rate(subsystem_benchmark_network_peer_total_bytes_sent{}[5s]))", + "hide": false, + "instant": false, + "legendFormat": "Sent", + "range": true, + "refId": "B" + } + ], + "title": "Emulated network throughput ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" }, - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "log": 2, - "type": "log" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": 
"A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "bytes" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 15, - "w": 12, - "x": 0, - "y": 52 - }, - "id": 88, - "interval": "1s", - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "unit": "bytes" }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "editorMode": "code", - "expr": "rate(subsystem_benchmark_network_peer_total_bytes_received{}[10s])", - "instant": false, - "legendFormat": "Received by {{peer}}", - "range": true, - "refId": "A" + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 88, + "interval": "1s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" }, - { - "datasource": { - "type": "prometheus", - "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_received{}[10s])", + "instant": false, + "legendFormat": "Received by {{peer}}", + 
"range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "editorMode": "code", + "expr": "rate(subsystem_benchmark_network_peer_total_bytes_sent{}[10s])", + "hide": false, + "instant": false, + "legendFormat": "Sent by {{peer}}", + "range": true, + "refId": "B" + } + ], + "title": "Emulated peer throughput", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "editorMode": "code", - "expr": "rate(subsystem_benchmark_network_peer_total_bytes_sent{}[10s])", - "hide": false, - "instant": false, - "legendFormat": "Sent by {{peer}}", - "range": true, - "refId": "B" + "scaleDistribution": { + "type": "linear" + } } - ], - "title": "Emulated peer throughput", - "type": "timeseries" + }, + "overrides": [] }, - { - "cards": {}, + "gridPos": { + "h": 15, + "w": 12, + "x": 12, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 92, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateInferno", "exponent": 0.5, - "mode": "spectrum" + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "dataFormat": "tsbuckets", - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - 
"description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 15, - "w": 12, - "x": 12, - "y": 52 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 92, - "interval": "1s", "legend": { "show": true }, - "maxDataPoints": 1340, - "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": { - "decimals": 0 - }, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Inferno", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "decimals": 0, - "reverse": false, - "unit": "bytes" - } + "rowsFrame": { + "layout": "auto" }, - "pluginVersion": "10.1.1", - "reverseYBuckets": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(increase(subsystem_benchmark_pov_size_bucket{}[$__rate_interval])) by (le)", - "format": "heatmap", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "Recovered PoV sizes", + "showValue": "never", "tooltip": { "show": true, - "showHistogram": true - }, - "tooltipDecimals": 0, - "transformations": [], - "type": "heatmap", - "xAxis": { - "show": true + "yHistogram": true }, "yAxis": { + "axisPlacement": "left", "decimals": 0, - "format": "s", - "logBase": 1, - "show": true - }, - "yBucketBound": "auto" + "reverse": false, + "unit": "bytes" + } }, - { - "datasource": { - "type": 
"prometheus", - "uid": "${data_source}" - }, - "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. ", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic", - "seriesBy": "max" + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(subsystem_benchmark_pov_size_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Recovered PoV sizes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. 
", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "chunks/s" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 43, - "interval": "1s", - "maxDataPoints": 1340, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "unit": "chunks/s" }, - "pluginVersion": "8.2.2", - "targets": [ - { - "datasource": { - "type": 
"prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(polkadot_parachain_availability_recovery_chunk_requests_issued{}[10s]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "Chunks requested", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "Availability", - "transformations": [], - "type": "timeseries" + "overrides": [] }, - { - "collapsed": false, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 77 + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 43, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "id": 35, - "panels": [], - "targets": [ - { - "datasource": { - "type": "datasource", - "uid": "grafana" + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_chunk_requests_issued{}[10s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Chunks requested", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Availability", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 35, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Availability subystem metrics", + "type": "row" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + 
"exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "refId": "A" + "scaleDistribution": { + "type": "linear" + } } - ], - "title": "Availability subystem metrics", - "type": "row" + }, + "overrides": [] }, - { - "cards": {}, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 68, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateInferno", "exponent": 0.5, - "mode": "spectrum" + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "dataFormat": "tsbuckets", - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 78 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 68, - "interval": "1s", "legend": { "show": true }, - "maxDataPoints": 1340, - "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": { - "decimals": 0 - }, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Inferno", - "steps": 128 - }, - "exemplars": { - 
"color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "decimals": 0, - "reverse": false, - "unit": "s" - } + "rowsFrame": { + "layout": "auto" }, - "pluginVersion": "10.1.1", - "reverseYBuckets": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(increase(polkadot_parachain_availability_recovery_time_total_bucket{}[$__rate_interval])) by (le)", - "format": "heatmap", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "Time to recover a PoV", + "showValue": "never", "tooltip": { "show": true, - "showHistogram": true - }, - "tooltipDecimals": 0, - "transformations": [], - "type": "heatmap", - "xAxis": { - "show": true + "yHistogram": true }, "yAxis": { + "axisPlacement": "left", "decimals": 0, - "format": "s", - "logBase": 1, - "show": true + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_total_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Time to recover a PoV", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + 
"cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } }, - "yBucketBound": "auto" + "overrides": [] }, - { - "cards": {}, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 78 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 67, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateInferno", "exponent": 0.5, - "mode": "spectrum" + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "dataFormat": "tsbuckets", - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 78 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 67, - "interval": "1s", "legend": { "show": true }, - "maxDataPoints": 1340, - "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": { - "decimals": 0 - }, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", 
- "scheme": "Inferno", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "decimals": 0, - "reverse": false, - "unit": "s" - } + "rowsFrame": { + "layout": "auto" }, - "pluginVersion": "10.1.1", - "reverseYBuckets": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(increase(polkadot_parachain_availability_recovery_time_chunk_request_bucket{}[$__rate_interval])) by (le)", - "format": "heatmap", - "instant": false, - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Chunk request duration", + "showValue": "never", "tooltip": { "show": true, - "showHistogram": true - }, - "tooltipDecimals": 0, - "transformations": [], - "type": "heatmap", - "xAxis": { - "show": true + "yHistogram": true }, "yAxis": { + "axisPlacement": "left", "decimals": 0, - "format": "bitfields", - "logBase": 1, - "show": true - }, - "yBucketBound": "auto" + "reverse": false, + "unit": "s" + } }, - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic", - "seriesBy": "max" + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_chunk_request_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Chunk request duration", + 
"tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "bitfields", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "Bps" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 88 - }, - "id": 85, - "interval": "1s", - "maxDataPoints": 1340, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", 
- "placement": "bottom", - "showLegend": true + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "unit": "Bps" }, - "pluginVersion": "8.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 88 + }, + "id": 85, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "rate(polkadot_parachain_availability_recovery_bytes_total{}[30s])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Bytes recovered", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Recovery throughtput", + "transformations": [], + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "editorMode": "code", - "exemplar": true, - "expr": "rate(polkadot_parachain_availability_recovery_bytes_total{}[30s])", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "Bytes recovered", - "queryType": "randomWalk", - "refId": "B" + "scaleDistribution": { + "type": "linear" + } } - ], - "title": "Recovery 
throughtput", - "transformations": [], - "type": "timeseries" + }, + "overrides": [] }, - { - "cards": {}, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 88 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 84, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateInferno", "exponent": 0.5, - "mode": "spectrum" + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "dataFormat": "tsbuckets", - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 88 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 84, - "interval": "1s", "legend": { "show": true }, - "maxDataPoints": 1340, - "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": { - "decimals": 0 - }, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Inferno", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "decimals": 0, - "reverse": false, - "unit": "s" - } + "rowsFrame": { + "layout": 
"auto" }, - "pluginVersion": "10.1.1", - "reverseYBuckets": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(increase(polkadot_parachain_availability_reencode_chunks_bucket{}[$__rate_interval])) by (le)", - "format": "heatmap", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "Re-encoding chunks timing", + "showValue": "never", "tooltip": { "show": true, - "showHistogram": true - }, - "tooltipDecimals": 0, - "transformations": [], - "type": "heatmap", - "xAxis": { - "show": true + "yHistogram": true }, "yAxis": { + "axisPlacement": "left", "decimals": 0, - "format": "s", - "logBase": 1, - "show": true + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_reencode_chunks_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Re-encoding chunks timing", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateInferno", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } }, - "yBucketBound": "auto" + "overrides": [] }, - { - "cards": {}, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 98 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 83, + "interval": "1s", + "legend": { + "show": true + }, + "maxDataPoints": 1340, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": { + "decimals": 0 + }, "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateInferno", "exponent": 0.5, - "mode": "spectrum" + "fill": "#b4ff00", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Inferno", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 }, - "dataFormat": "tsbuckets", - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "scaleDistribution": { - "type": "linear" - } - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 98 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 83, - "interval": "1s", "legend": { "show": true }, - "maxDataPoints": 1340, - "options": { - "calculate": false, - "calculation": {}, - "cellGap": 2, - "cellValues": { - "decimals": 0 - }, - "color": { - "exponent": 0.5, - "fill": "#b4ff00", - "mode": "scheme", - "reverse": false, - "scale": "exponential", - "scheme": "Inferno", - "steps": 128 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "showValue": "never", - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "axisPlacement": "left", - "decimals": 0, - 
"reverse": false, - "unit": "s" - } + "rowsFrame": { + "layout": "auto" }, - "pluginVersion": "10.1.1", - "reverseYBuckets": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(increase(polkadot_parachain_availability_recovery_time_erasure_recovery_bucket{}[$__rate_interval])) by (le)", - "format": "heatmap", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "{{le}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "title": "Erasure recovery (no I/O)", + "showValue": "never", "tooltip": { "show": true, - "showHistogram": true - }, - "tooltipDecimals": 0, - "transformations": [], - "type": "heatmap", - "xAxis": { - "show": true + "yHistogram": true }, "yAxis": { + "axisPlacement": "left", "decimals": 0, - "format": "s", - "logBase": 1, - "show": true - }, - "yBucketBound": "auto" + "reverse": false, + "unit": "s" + } }, - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. 
", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic", - "seriesBy": "max" + "pluginVersion": "10.1.1", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(polkadot_parachain_availability_recovery_time_erasure_recovery_bucket{}[$__rate_interval])) by (le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "title": "Erasure recovery (no I/O)", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 0, + "transformations": [], + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "description": "Number of erasure-encoded chunks of data belonging to candidate blocks. 
", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "max" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "stepAfter", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" }, - "unit": "cps" + "thresholdsStyle": { + "mode": "off" + } }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 108 - }, - "id": 86, - "interval": "1s", - "maxDataPoints": 1340, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "unit": "cps" }, - "pluginVersion": "8.2.2", - "targets": [ - { - "datasource": { - "type": 
"prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(polkadot_parachain_availability_recovery_recoveries_finished{}[1s]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "Finished", - "queryType": "randomWalk", - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${data_source}" - }, - "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(polkadot_parachain_availability_recovery_recovieries_started{}[1s]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "Started", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Recoveries", - "transformations": [], - "type": "timeseries" + "overrides": [] }, - { - "collapsed": false, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 118 + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 108 + }, + "id": 86, + "interval": "1s", + "maxDataPoints": 1340, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "id": 2, - "panels": [], - "targets": [ - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "refId": "A" - } - ], - "title": "Approval voting", - "type": "row" - } - ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": [ - "subsystem", - "benchmark" - ], - "templating": { - "list": [ + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.2.2", + "targets": [ { - "current": { - "selected": false, - "text": "Prometheus", - "value": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" - }, - "hide": 0, - "includeAll": false, - "label": "Source of data", - "multi": false, - "name": "data_source", - "options": [], - "query": "prometheus", - "queryValue": "", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" + 
"datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recoveries_finished{}[1s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Finished", + "queryType": "randomWalk", + "refId": "B" }, { - "current": { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(polkadot_parachain_availability_recovery_recovieries_started{}[1s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Started", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Recoveries", + "transformations": [], + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 118 + }, + "id": 2, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Approval voting", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 38, + "style": "dark", + "tags": [ + "subsystem", + "benchmark" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "e56e7dd2-a992-4eec-aa96-e47b21c9020b" + }, + "hide": 0, + "includeAll": false, + "label": "Source of data", + "multi": false, + "name": "data_source", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "task_name", + "value": "task_name" + }, + "description": "Sum CPU usage by task name or task group.", + "hide": 0, + "includeAll": false, + "label": "Group CPU usage", + "multi": false, + "name": "cpu_group_by", + "options": [ + { "selected": true, "text": 
"task_name", "value": "task_name" }, - "description": "Sum CPU usage by task name or task group.", - "hide": 0, - "includeAll": false, - "label": "Group CPU usage", - "multi": false, - "name": "cpu_group_by", - "options": [ - { - "selected": true, - "text": "task_name", - "value": "task_name" - }, - { - "selected": false, - "text": "task_group", - "value": "task_group" - } - ], - "query": "task_name, task_group", - "queryValue": "", - "skipUrlSync": false, - "type": "custom" - } - ] - }, - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s" - ] - }, - "timezone": "utc", - "title": "Data Availability Read", - "uid": "asdadasd1", - "version": 56, - "weekStart": "" - } \ No newline at end of file + { + "selected": false, + "text": "task_group", + "value": "task_group" + } + ], + "query": "task_name, task_group", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "2023-11-28T13:05:32.794Z", + "to": "2023-11-28T13:06:56.173Z" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s" + ] + }, + "timezone": "utc", + "title": "Data Availability Read", + "uid": "asdadasd1", + "version": 58, + "weekStart": "" +} \ No newline at end of file From b2490560da9d8924a393a31322de2ee59e31ca72 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 15:51:16 +0200 Subject: [PATCH 33/45] remove unused Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/src/availability/cli.rs | 4 ---- polkadot/node/subsystem-bench/src/availability/mod.rs | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/cli.rs b/polkadot/node/subsystem-bench/src/availability/cli.rs index 8da4a59253c62..65df8c1552aa8 100644 --- a/polkadot/node/subsystem-bench/src/availability/cli.rs +++ b/polkadot/node/subsystem-bench/src/availability/cli.rs @@ -15,10 +15,6 @@ // along with Polkadot. If not, see . 
use serde::{Deserialize, Serialize}; -#[derive(Debug, clap::Parser, Clone)] -#[clap(rename_all = "kebab-case")] -#[allow(missing_docs)] -pub struct NetworkOptions {} #[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq)] #[value(rename_all = "kebab-case")] diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index e5543e5d39042..cbd2f8287633f 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -68,7 +68,7 @@ use sc_service::SpawnTaskHandle; mod cli; pub mod configuration; -pub use cli::{DataAvailabilityReadOptions, NetworkEmulation, NetworkOptions}; +pub use cli::{DataAvailabilityReadOptions, NetworkEmulation}; pub use configuration::AvailabilityRecoveryConfiguration; fn build_overseer( From fb34181c26cc58f84ffa687fbb0b823e8f5233fb Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Tue, 28 Nov 2023 15:58:24 +0200 Subject: [PATCH 34/45] revert unneeded changes Signed-off-by: Andrei Sandu --- .../node/subsystem-test-helpers/src/lib.rs | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/polkadot/node/subsystem-test-helpers/src/lib.rs b/polkadot/node/subsystem-test-helpers/src/lib.rs index 1c3c47150ac6a..3f92513498c41 100644 --- a/polkadot/node/subsystem-test-helpers/src/lib.rs +++ b/polkadot/node/subsystem-test-helpers/src/lib.rs @@ -187,7 +187,6 @@ pub struct TestSubsystemContext { tx: TestSubsystemSender, rx: mpsc::Receiver>, spawn: S, - name: &'static str, } #[async_trait::async_trait] @@ -224,7 +223,7 @@ where name: &'static str, s: Pin + Send>>, ) -> SubsystemResult<()> { - self.spawn.spawn(name, Some(self.name), s); + self.spawn.spawn(name, None, s); Ok(()) } @@ -233,7 +232,7 @@ where name: &'static str, s: Pin + Send>>, ) -> SubsystemResult<()> { - self.spawn.spawn_blocking(name, Some(self.name), s); + self.spawn.spawn_blocking(name, None, s); Ok(()) } @@ -279,13 +278,6 @@ impl 
TestSubsystemContextHandle { .expect("Test subsystem no longer live") } - /// Receive the next message from the subsystem. - pub async fn maybe_recv(&mut self) -> Option { - self.try_recv() - .timeout(Self::TIMEOUT) - .await - .expect("`fn recv` does not timeout") - } /// Receive the next message from the subsystem, or `None` if the channel has been closed. pub async fn try_recv(&mut self) -> Option { self.rx @@ -300,9 +292,8 @@ impl TestSubsystemContextHandle { /// of the tests. pub fn make_subsystem_context( spawner: S, - name: &'static str, ) -> (TestSubsystemContext>, TestSubsystemContextHandle) { - make_buffered_subsystem_context(spawner, 0, name) + make_buffered_subsystem_context(spawner, 0) } /// Make a test subsystem context with buffered overseer channel. Some tests (e.g. @@ -311,7 +302,6 @@ pub fn make_subsystem_context( pub fn make_buffered_subsystem_context( spawner: S, buffer_size: usize, - name: &'static str, ) -> (TestSubsystemContext>, TestSubsystemContextHandle) { let (overseer_tx, overseer_rx) = mpsc::channel(buffer_size); let (all_messages_tx, all_messages_rx) = mpsc::unbounded(); @@ -321,7 +311,6 @@ pub fn make_buffered_subsystem_context( tx: TestSubsystemSender { tx: all_messages_tx }, rx: overseer_rx, spawn: SpawnGlue(spawner), - name, }, TestSubsystemContextHandle { tx: overseer_tx, rx: all_messages_rx }, ) @@ -343,7 +332,7 @@ pub fn subsystem_test_harness( Test: Future, { let pool = TaskExecutor::new(); - let (context, handle) = make_subsystem_context(pool, "default"); + let (context, handle) = make_subsystem_context(pool); let overseer = overseer_factory(handle); let test = test_factory(context); From 3a716a54830aac41d75b9e6f411829966de0a92f Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 12:11:08 +0200 Subject: [PATCH 35/45] add missing comments and minor fixes Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 31 ++++++++++--------- .../node/subsystem-bench/src/core/display.rs | 4 --- 
.../subsystem-bench/src/core/environment.rs | 18 +++++------ .../src/core/mock/network_bridge.rs | 20 ++++++------ .../node/subsystem-bench/src/core/network.rs | 16 +++++----- 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index cbd2f8287633f..f4c39893215bb 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -105,11 +105,6 @@ fn prepare_test_inner( state: &mut TestState, dependencies: TestEnvironmentDependencies, ) -> (TestEnvironment, ProtocolConfig) { - // We need to first create the high level test state object. - // This will then be decomposed into per subsystem states. - let candidate_count = config.n_cores * config.num_blocks; - state.generate_candidates(candidate_count); - // Generate test authorities. let test_authorities = config.generate_authorities(); @@ -173,6 +168,7 @@ fn prepare_test_inner( pub struct TestState { // Full test configuration config: TestConfiguration, + // A cycle iterator on all PoV sizes used in the test. pov_sizes: Cycle>, // Generated candidate receipts to be used in the test candidates: Cycle>, @@ -181,9 +177,11 @@ pub struct TestState { // Map from generated candidate hashes to candidate index in `available_data` // and `chunks`. candidate_hashes: HashMap, - - candidate_receipts: Vec, + // Per candidate index receipts. + candidate_receipt_templates: Vec, + // Per candidate index `AvailableData` available_data: Vec, + // Per candiadte index chunks chunks: Vec>, } @@ -200,7 +198,8 @@ impl TestState { } /// Generate candidates to be used in the test. 
- pub fn generate_candidates(&mut self, count: usize) { + fn generate_candidates(&mut self) { + let count = self.config.n_cores * self.config.num_blocks; gum::info!(target: LOG_TARGET,"{}", format!("Pre-generating {} candidates.", count).bright_blue()); // Generate all candidates @@ -211,7 +210,8 @@ impl TestState { .pov_size_to_candidate .get(&pov_size) .expect("pov_size always exists; qed"); - let mut candidate_receipt = self.candidate_receipts[candidate_index].clone(); + let mut candidate_receipt = + self.candidate_receipt_templates[candidate_index].clone(); // Make it unique. candidate_receipt.descriptor.relay_parent = Hash::from_low_u64_be(index as u64); @@ -232,7 +232,7 @@ impl TestState { let mut chunks = Vec::new(); let mut available_data = Vec::new(); - let mut candidate_receipts = Vec::new(); + let mut candidate_receipt_templates = Vec::new(); let mut pov_size_to_candidate = HashMap::new(); // we use it for all candidates. @@ -266,22 +266,25 @@ impl TestState { chunks.push(new_chunks); available_data.push(new_available_data); pov_size_to_candidate.insert(pov_size, index); - candidate_receipts.push(candidate_receipt); + candidate_receipt_templates.push(candidate_receipt); } let pov_sizes = config.pov_sizes().to_vec().into_iter().cycle(); gum::info!(target: LOG_TARGET, "{}","Created test environment.".bright_blue()); - Self { + let mut _self = Self { config, available_data, - candidate_receipts, + candidate_receipt_templates, chunks, pov_size_to_candidate, pov_sizes, candidate_hashes: HashMap::new(), candidates: Vec::new().into_iter().cycle(), - } + }; + + _self.generate_candidates(); + _self } } diff --git a/polkadot/node/subsystem-bench/src/core/display.rs b/polkadot/node/subsystem-bench/src/core/display.rs index 03a5c13aeb47d..b9ff82d1c06a2 100644 --- a/polkadot/node/subsystem-bench/src/core/display.rs +++ b/polkadot/node/subsystem-bench/src/core/display.rs @@ -36,10 +36,6 @@ impl From> for MetricCollection { } impl MetricCollection { - pub fn get(&self, 
name: &str) -> Vec<&TestMetric> { - self.all().into_iter().filter(|metric| &metric.name == name).collect() - } - pub fn all(&self) -> &Vec { &self.0 } diff --git a/polkadot/node/subsystem-bench/src/core/environment.rs b/polkadot/node/subsystem-bench/src/core/environment.rs index 5c04071c442f7..247596474078e 100644 --- a/polkadot/node/subsystem-bench/src/core/environment.rs +++ b/polkadot/node/subsystem-bench/src/core/environment.rs @@ -20,6 +20,7 @@ use crate::{ }; use colored::Colorize; use core::time::Duration; +use futures::FutureExt; use polkadot_overseer::{BlockInfo, Handle as OverseerHandle}; use polkadot_node_subsystem::{messages::AllMessages, Overseer, SpawnGlue, TimeoutExt}; @@ -179,23 +180,22 @@ const MAX_TIME_OF_FLIGHT: Duration = Duration::from_millis(5000); /// ### CLI /// A subset of the Prometheus metrics are printed at the end of the test. pub struct TestEnvironment { - // Test dependencies + /// Test dependencies dependencies: TestEnvironmentDependencies, - // A runtime handle + /// A runtime handle runtime_handle: tokio::runtime::Handle, - // A handle to the lovely overseer + /// A handle to the lovely overseer overseer_handle: OverseerHandle, - // The test configuration. + /// The test configuration. config: TestConfiguration, - // A handle to the network emulator. + /// A handle to the network emulator. network: NetworkEmulator, - // Configuration/env metrics + /// Configuration/env metrics metrics: TestEnvironmentMetrics, } impl TestEnvironment { - // Create a new test environment with specified initial state and prometheus registry. - // We use prometheus metrics to collect per job task poll time and subsystem metrics. 
+ /// Create a new test environment pub fn new( dependencies: TestEnvironmentDependencies, config: TestConfiguration, @@ -207,8 +207,8 @@ impl TestEnvironment { .expect("Metrics need to be registered"); let spawn_handle = dependencies.task_manager.spawn_handle(); + spawn_handle.spawn_blocking("overseer", "overseer", overseer.run().boxed()); - spawn_handle.spawn_blocking("overseer", "overseer", overseer.run()); let registry_clone = dependencies.registry.clone(); dependencies.task_manager.spawn_handle().spawn_blocking( "prometheus", diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index c14a3895e238c..2bc8d22234b60 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -219,19 +219,18 @@ impl MockNetworkBridgeTx { // Initialize our node bandwidth limits. let mut rx_limiter = RateLimit::new(10, self.config.bandwidth); - // Get a handle to our node network emulation stats. - let our_network_stats = self.network.peer_stats(0); - // This task will handle receipt of messages on our simulated network of the node. + let our_network = self.network.clone(); + + // This task will handle node messages receipt from the simulated network. let _ = ctx .spawn_blocking( - "node0-rx", + "network-receive", async move { while let Some(action) = ingress_rx.recv().await { let size = action.size(); // account for our node receiving the data. - our_network_stats.inc_received(size); - + our_network.inc_received(size); rx_limiter.reap(size).await; action.run().await; } @@ -271,12 +270,11 @@ impl MockNetworkBridgeTx { } // A helper to determine the request payload size. 
-fn request_size(request: &Requests) -> u64 { +fn request_size(request: &Requests) -> usize { match request { - Requests::ChunkFetchingV1(outgoing_request) => - outgoing_request.payload.encoded_size() as u64, + Requests::ChunkFetchingV1(outgoing_request) => outgoing_request.payload.encoded_size(), Requests::AvailableDataFetchingV1(outgoing_request) => - outgoing_request.payload.encoded_size() as u64, - _ => panic!("received an unexpected request"), + outgoing_request.payload.encoded_size(), + _ => unimplemented!("received an unexpected request"), } } diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 40809ce36e8dd..67dc0e0f267e6 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -225,12 +225,12 @@ impl PeerEmulatorStats { pub fn inc_sent(&self, bytes: usize) { self.tx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); - self.metrics.on_peer_sent(self.peer_index, bytes as u64); + self.metrics.on_peer_sent(self.peer_index, bytes); } pub fn inc_received(&self, bytes: usize) { self.rx_bytes_total.fetch_add(bytes as u64, Ordering::Relaxed); - self.metrics.on_peer_received(self.peer_index, bytes as u64); + self.metrics.on_peer_received(self.peer_index, bytes); } pub fn sent(&self) -> u64 { @@ -346,13 +346,13 @@ impl NetworkEmulator { } // Increment bytes sent by our node (the node that contains the subsystem under test) - pub fn inc_sent(&self, bytes: u64) { + pub fn inc_sent(&self, bytes: usize) { // Our node always is peer 0. self.metrics.on_peer_sent(0, bytes); } // Increment bytes received by our node (the node that contains the subsystem under test) - pub fn inc_received(&self, bytes: u64) { + pub fn inc_received(&self, bytes: usize) { // Our node always is peer 0. self.metrics.on_peer_received(0, bytes); } @@ -398,16 +398,16 @@ impl Metrics { } /// Increment total sent for a peer. 
- pub fn on_peer_sent(&self, peer_index: usize, bytes: u64) { + pub fn on_peer_sent(&self, peer_index: usize, bytes: usize) { self.peer_total_sent .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) - .inc_by(bytes); + .inc_by(bytes as u64); } /// Increment total receioved for a peer. - pub fn on_peer_received(&self, peer_index: usize, bytes: u64) { + pub fn on_peer_received(&self, peer_index: usize, bytes: usize) { self.peer_total_received .with_label_values(vec![format!("node{}", peer_index).as_str()].as_slice()) - .inc_by(bytes); + .inc_by(bytes as u64); } } From a092b764aad74194632d70224d1c2b53bd15dd63 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 12:49:37 +0200 Subject: [PATCH 36/45] clippy Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/src/availability/mod.rs | 2 +- polkadot/node/subsystem-bench/src/core/network.rs | 2 -- polkadot/node/subsystem-bench/src/subsystem-bench.rs | 5 ----- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index f4c39893215bb..7d6865fee958e 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -120,7 +120,7 @@ fn prepare_test_inner( }; let network = NetworkEmulator::new( - config.n_validators.clone(), + config.n_validators, test_authorities.validator_authority_id, config.peer_bandwidth, dependencies.task_manager.spawn_handle(), diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 67dc0e0f267e6..3d38a8f36b190 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -96,8 +96,6 @@ impl RateLimit { #[cfg(test)] mod tests { - use super::*; - use polkadot_node_metrics::metered::CoarseDuration; use std::time::Instant; use super::RateLimit; diff --git 
a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 5337a13e9729d..0f3ae0f41417e 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -184,8 +184,3 @@ fn main() -> eyre::Result<()> { cli.launch()?; Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; -} From ca27370c275a0a5f3f10823b4e8ccf1dae3f1a36 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 13:00:12 +0200 Subject: [PATCH 37/45] zepter format features --fix Signed-off-by: Andrei Sandu --- polkadot/node/network/availability-recovery/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/availability-recovery/Cargo.toml b/polkadot/node/network/availability-recovery/Cargo.toml index 4a3f5c26e7b9b..3d77652acd03c 100644 --- a/polkadot/node/network/availability-recovery/Cargo.toml +++ b/polkadot/node/network/availability-recovery/Cargo.toml @@ -40,4 +40,4 @@ polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } polkadot-primitives-test-helpers = { path = "../../../primitives/test-helpers" } [features] -subsystem-benchmarks = [] \ No newline at end of file +subsystem-benchmarks = [] From be814e554ae2445e1835e95ee7f9780519e643c8 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 14:24:15 +0200 Subject: [PATCH 38/45] fix markdown Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/README.md | 82 +++++++++++++++++-------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index 5b58dc3a5be48..351e07b6abca6 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -3,9 +3,16 @@ Run parachain consensus stress and performance tests on your development machine. 
## Motivation -The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or `dispute-coordinator`. In the absence such a tool, we would run large test nets to load/stress test these parts of the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink. + +The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is +responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and +performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or +`dispute-coordinator`. In the absence such a tool, we would run large test nets to load/stress test these parts of +the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard +to orchestrate and is a huge development time sink. This tool aims to solve the problem by making it easy to: + - set up and run core subsystem load tests locally on your development machine - iterate and conclude faster when benchmarking new optimizations or comparing implementations - automate and keep track of performance regressions in CI runs @@ -18,17 +25,22 @@ This tool aims to solve the problem by making it easy to: The output binary will be placed in `target/testnet/subsystem-bench`. ### Test metrics + Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution. -A small subset of these collected metrics are displayed in the CLI, but for an in depth analysys of the test results, a local Grafana/Prometheus stack is needed. 
+A small subset of these collected metrics are displayed in the CLI, but for an in depth analysys of the test results, +a local Grafana/Prometheus stack is needed. ### Install Prometheus -Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your platform/OS. + +Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your +platform/OS. After succesfully installing and starting up Prometheus, we need to alter it's configuration such that it will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation -regarding the location of `prometheus.yml`. On MacOS for example the full path `/opt/homebrew/etc/prometheus.yml` +regarding the location of `prometheus.yml`. On MacOS for example the full path `/opt/homebrew/etc/prometheus.yml` prometheus.yml: + ``` global: scrape_interval: 5s @@ -44,6 +56,7 @@ scrape_configs: ``` To complete this step restart Prometheus server such that it picks up the new configuration. + ### Install and setup Grafana Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant @@ -54,7 +67,8 @@ Once you have the installation up and running, configure the local Prometheus as #### Import dashboards -Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) to import the dashboards from the repository `grafana` folder. +Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) +to import the dashboards from the repository `grafana` folder. ## How to run a test @@ -71,14 +85,15 @@ Commands: ``` -Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. 
It is tipically used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). +Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is tipically + used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). ### Standard test options ``` Options: - --network The type of network to be emulated [default: ideal] [possible values: ideal, - healthy, degraded] + --network The type of network to be emulated [default: ideal] [possible values: + ideal, healthy, degraded] --n-cores Number of cores to fetch availability for [default: 100] --n-validators Number of validators to fetch chunks from [default: 500] --min-pov-size The minimum pov size in KiB [default: 5120] @@ -96,9 +111,11 @@ Options: These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file. ### Test objectives + Each test objective can have it's specific configuration options, in contrast with the standard test options. For `data-availability-read` the recovery strategy to be used is configurable. + ``` target/testnet/subsystem-bench data-availability-read --help Benchmark availability recovery strategies @@ -106,31 +123,38 @@ Benchmark availability recovery strategies Usage: subsystem-bench data-availability-read [OPTIONS] Options: - -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU as we - don't need to re-construct from chunks. Tipically this is only faster if nodes have enough - bandwidth + -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU + as we don't need to re-construct from chunks. 
Tipically this is only faster if nodes + have enough bandwidth -h, --help Print help ``` + ### Understanding the test configuration + A single test configuration `TestConfiguration` struct applies to a single run of a certain test objective. The configuration describes the following important parameters that influence the test duration and resource usage: + - how many validators are on the emulated network (`n_validators`) - how many cores per block the subsystem will have to do work on (`n_cores`) - for how many blocks the test should run (`num_blocks`) -From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal -followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. The messages are generally test payloads pre-generated before the test run, or constructed on pre-genereated payloads. For example the `AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before the test is started. +From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal +followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. The messages are generally +test payloads pre-generated before the test run, or constructed on pre-genereated payloads. For example the +`AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before +the test is started. -### Example run +### Example run Let's run an availabilty read test which will recover availability for 10 cores with max PoV size on a 500 node validator network. 
``` target/testnet/subsystem-bench --n-cores 10 data-availability-read -[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = None +[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, + error = 0, latency = None [2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880 [2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment. [2023-11-28T09:01:59Z INFO subsystem-bench::availability] Pre-generating 10 candidates. @@ -153,32 +177,40 @@ node validator network. CPU usage per block 0.00s ``` -`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it took the subsystem to finish processing all of the messages sent in the context of the current test block. - +`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it +took the subsystem to finish processing all of the messages sent in the context of the current test block. ### Test logs -You can select log target, subtarget and verbosity just like with Polkadot node CLI, simply setting `RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. +You can select log target, subtarget and verbosity just like with Polkadot node CLI, simply setting +`RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. ### View test metrics -Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to +Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to view the test progress in real time by accessing [this link](http://localhost:3000/goto/SM5B8pNSR?orgId=1). 
-Now run `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` and view the metrics in real time and spot differences between different `n_valiator` values. +Now run +`target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` +and view the metrics in real time and spot differences between different `n_valiator` values. ## Create new test objectives -This tool is intended to make it easy to write new test objectives that focus individual subsystems, + +This tool is intended to make it easy to write new test objectives that focus individual subsystems, or even multiple subsystems (for example `approval-distribution` and `approval-voting`). A special kind of test objectives are performance regression tests for the CI pipeline. These should be sequences -of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both happy and negative scenarios (low bandwidth, network errors and low connectivity). +of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both +happy and negative scenarios (low bandwidth, network errors and low connectivity). ### Reusable test components -To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment` `TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will + +To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment`, +`TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will need to also build an `Overseer`, but that should be easy using the mockups for subsystems in`core::mock`. ### Mocking -Ideally we want to have a single mock implementation for subsystems that can be minimally configured to -be used in different tests. 
A good example is `runtime-api` which currently only responds to session information requests based on static data. It can be easily extended to service other requests. +Ideally we want to have a single mock implementation for subsystems that can be minimally configured to +be used in different tests. A good example is `runtime-api` which currently only responds to session information +requests based on static data. It can be easily extended to service other requests. From 11ce8f5121ed5d6448a77e46647a4a0ffcfad066 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 18:01:40 +0200 Subject: [PATCH 39/45] remove sleep till end of block Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/src/availability/mod.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 7d6865fee958e..77888fa6058c5 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -363,13 +363,9 @@ pub async fn benchmark_availability_read(env: &mut TestEnvironment, mut state: T let block_time = Instant::now().sub(block_start_ts).as_millis() as u64; env.metrics().set_block_time(block_time); - gum::info!("Block time {}", format!("{:?}ms", block_time).cyan()); - gum::info!(target: LOG_TARGET,"{}", format!("Sleeping till end of block ({}ms)", block_time_delta.as_millis()).bright_black()); - tokio::time::sleep(block_time_delta).await; + gum::info!("All work for block completed in {}", format!("{:?}ms", block_time).cyan()); } - env.stop().await; - let duration: u128 = start_marker.elapsed().as_millis(); let availability_bytes = availability_bytes / 1024; gum::info!("All blocks processed in {}", format!("{:?}ms", duration).cyan()); @@ -384,4 +380,5 @@ pub async fn benchmark_availability_read(env: &mut TestEnvironment, mut state: T ); gum::info!("{}", &env); + env.stop().await; } 
From 8d93abc6dd73a7cf82668b17c570235e3a4d7dbc Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 29 Nov 2023 20:04:37 +0200 Subject: [PATCH 40/45] review Signed-off-by: Andrei Sandu --- .../network/availability-recovery/src/lib.rs | 2 +- polkadot/node/subsystem-bench/README.md | 2 +- .../src/availability/configuration.rs | 24 ------------------- .../subsystem-bench/src/availability/mod.rs | 13 +--------- .../subsystem-bench/src/core/mock/av_store.rs | 2 +- .../src/core/mock/network_bridge.rs | 2 +- 6 files changed, 5 insertions(+), 40 deletions(-) delete mode 100644 polkadot/node/subsystem-bench/src/availability/configuration.rs diff --git a/polkadot/node/network/availability-recovery/src/lib.rs b/polkadot/node/network/availability-recovery/src/lib.rs index c454028b86502..d029bce04173b 100644 --- a/polkadot/node/network/availability-recovery/src/lib.rs +++ b/polkadot/node/network/availability-recovery/src/lib.rs @@ -856,7 +856,7 @@ async fn erasure_task_thread( } // In benchmarks this is a very hot loop not yielding at all. - // To update promehteus metrics for the task we need to yield. + // To update CPU metrics for the task we need to yield. #[cfg(feature = "subsystem-benchmarks")] tokio::task::yield_now().await; } diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index 351e07b6abca6..f4ea04662f9e4 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -7,7 +7,7 @@ Run parachain consensus stress and performance tests on your development machine The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or -`dispute-coordinator`. 
In the absence such a tool, we would run large test nets to load/stress test these parts of +`dispute-coordinator`. In the absence of such a tool, we would run large test nets to load/stress test these parts of the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink. diff --git a/polkadot/node/subsystem-bench/src/availability/configuration.rs b/polkadot/node/subsystem-bench/src/availability/configuration.rs deleted file mode 100644 index 1274862a8e4a1..0000000000000 --- a/polkadot/node/subsystem-bench/src/availability/configuration.rs +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . 
- -use serde::{Deserialize, Serialize}; - -/// The test input parameters -#[derive(Clone, Default, Debug, Serialize, Deserialize)] -pub struct AvailabilityRecoveryConfiguration { - /// Prefer the fast path (try fetch from backers first) - pub use_fast_path: bool, -} diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index 77888fa6058c5..ca2e800d4c896 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -14,13 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . use itertools::Itertools; -use std::{ - collections::HashMap, - iter::Cycle, - ops::Sub, - sync::Arc, - time::{Duration, Instant}, -}; +use std::{collections::HashMap, iter::Cycle, ops::Sub, sync::Arc, time::Instant}; use crate::TestEnvironment; use polkadot_node_subsystem::{Overseer, OverseerConnector, SpawnGlue}; @@ -67,9 +61,7 @@ use polkadot_primitives_test_helpers::{dummy_candidate_receipt, dummy_hash}; use sc_service::SpawnTaskHandle; mod cli; -pub mod configuration; pub use cli::{DataAvailabilityReadOptions, NetworkEmulation}; -pub use configuration::AvailabilityRecoveryConfiguration; fn build_overseer( spawn_task_handle: SpawnTaskHandle, @@ -358,9 +350,6 @@ pub async fn benchmark_availability_read(env: &mut TestEnvironment, mut state: T availability_bytes += available_data.encoded_size() as u128; } - let block_time_delta = - Duration::from_secs(6).saturating_sub(Instant::now().sub(block_start_ts)); - let block_time = Instant::now().sub(block_start_ts).as_millis() as u64; env.metrics().set_block_time(block_time); gum::info!("All work for block completed in {}", format!("{:?}ms", block_time).cyan()); diff --git a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs index 1ff7d1728af98..88747affc8c0c 100644 --- 
a/polkadot/node/subsystem-bench/src/core/mock/av_store.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/av_store.rs @@ -129,7 +129,7 @@ impl MockAvailabilityStore { let _ = tx.send(Some(chunk_size)); }, _ => { - unimplemented!("Unexpected runtime-api message") + unimplemented!("Unexpected av-store message") }, }, } diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index 2bc8d22234b60..53f4fb9631f2e 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -261,7 +261,7 @@ impl MockNetworkBridgeTx { } }, _ => { - unimplemented!("Unexpected runtime-api message") + unimplemented!("Unexpected network bridge message") }, }, } From af141eefcc198926f7da7734228e208e8411deac Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 1 Dec 2023 15:28:09 +0200 Subject: [PATCH 41/45] Emulated network improvements Signed-off-by: Andrei Sandu --- .../subsystem-bench/src/availability/mod.rs | 8 +- .../subsystem-bench/src/core/configuration.rs | 33 ++++- .../src/core/mock/network_bridge.rs | 55 +++++++- .../node/subsystem-bench/src/core/network.rs | 128 ++++++++++++++---- 4 files changed, 184 insertions(+), 40 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/availability/mod.rs b/polkadot/node/subsystem-bench/src/availability/mod.rs index ca2e800d4c896..244119735966b 100644 --- a/polkadot/node/subsystem-bench/src/availability/mod.rs +++ b/polkadot/node/subsystem-bench/src/availability/mod.rs @@ -111,13 +111,7 @@ fn prepare_test_inner( chunks: state.chunks.clone(), }; - let network = NetworkEmulator::new( - config.n_validators, - test_authorities.validator_authority_id, - config.peer_bandwidth, - dependencies.task_manager.spawn_handle(), - &dependencies.registry, - ); + let network = NetworkEmulator::new(&config, &dependencies, &test_authorities); let network_bridge_tx = 
network_bridge::MockNetworkBridgeTx::new( config.clone(), diff --git a/polkadot/node/subsystem-bench/src/core/configuration.rs b/polkadot/node/subsystem-bench/src/core/configuration.rs index 340b5c03ab84a..adb5ce80c0d4c 100644 --- a/polkadot/node/subsystem-bench/src/core/configuration.rs +++ b/polkadot/node/subsystem-bench/src/core/configuration.rs @@ -43,6 +43,21 @@ pub struct PeerLatency { pub max_latency: Duration, } +// Default PoV size in KiB. +fn default_pov_size() -> usize { + 5120 +} + +// Default bandwidth in bytes +fn default_bandwidth() -> usize { + 52428800 +} + +// Default connectivity percentage +fn default_connectivity() -> usize { + 100 +} + /// The test input parameters #[derive(Clone, Debug, Serialize, Deserialize)] pub struct TestConfiguration { @@ -53,22 +68,31 @@ pub struct TestConfiguration { /// Number of cores pub n_cores: usize, /// The min PoV size + #[serde(default = "default_pov_size")] pub min_pov_size: usize, /// The max PoV size, + #[serde(default = "default_pov_size")] pub max_pov_size: usize, /// Randomly sampled pov_sizes #[serde(skip)] pov_sizes: Vec, /// The amount of bandiwdth remote validators have. + #[serde(default = "default_bandwidth")] pub peer_bandwidth: usize, /// The amount of bandiwdth our node has. + #[serde(default = "default_bandwidth")] pub bandwidth: usize, /// Optional peer emulation latency + #[serde(default)] pub latency: Option, - /// Error probability + /// Error probability, applies to sending messages to the emulated network peers + #[serde(default)] pub error: usize, - /// Number of blocks - /// In one block `n_cores` candidates are recovered + /// Connectivity ratio, the percentage of peers we are not connected to, but ar part of + /// the topology. 
+ #[serde(default = "default_connectivity")] + pub connectivity: usize, + /// Number of blocks to run the test for pub num_blocks: usize, } @@ -166,6 +190,7 @@ impl TestConfiguration { num_blocks, min_pov_size, max_pov_size, + connectivity: 100, } } @@ -192,6 +217,7 @@ impl TestConfiguration { num_blocks, min_pov_size, max_pov_size, + connectivity: 95, } } @@ -218,6 +244,7 @@ impl TestConfiguration { num_blocks, min_pov_size, max_pov_size, + connectivity: 67, } } } diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index 53f4fb9631f2e..fa47302091834 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -16,10 +16,10 @@ //! //! A generic av store subsystem mockup suitable to be used in benchmarks. +use futures::Future; use parity_scale_codec::Encode; use polkadot_node_subsystem_types::OverseerSignal; - -use std::collections::HashMap; +use std::{collections::HashMap, pin::Pin}; use futures::FutureExt; @@ -35,6 +35,7 @@ use polkadot_node_subsystem::{ use polkadot_node_network_protocol::request_response::{ self as req_res, v1::ChunkResponse, Requests, }; +use polkadot_primitives::AuthorityDiscoveryId; use crate::core::{ configuration::{random_error, random_latency, TestConfiguration}, @@ -71,7 +72,24 @@ impl MockNetworkBridgeTx { Self { config, availabilty, network } } - pub fn respond_to_send_request( + fn not_connected_response( + &self, + authority_discovery_id: &AuthorityDiscoveryId, + future: Pin<Box<dyn Future<Output = ()> + Send>>, + ) -> NetworkAction { + // The network action will send the error after a random delay expires. + return NetworkAction::new( + authority_discovery_id.clone(), + future, + 0, + // Generate a random latency based on configuration. + random_latency(self.config.latency.as_ref()), + ) + } + /// Returns an `NetworkAction` corresponding to the peer sending the response. 
If + /// the peer is connected, the error is sent with a randomized latency as defined in + /// configuration. + fn respond_to_send_request( &mut self, request: Requests, ingress_tx: &mut tokio::sync::mpsc::UnboundedSender, @@ -86,9 +104,23 @@ impl MockNetworkBridgeTx { }; // Account our sent request bytes. self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + + // If peer is disconnected return an error to the caller + if !self.network.is_peer_connected(&authority_discovery_id) { + // We always send `NotConnected` error and we ignore `IfDisconnected` value in + // the caller. + let future = async move { + let _ = outgoing_request + .pending_response + .send(Err(RequestFailure::NotConnected)); + } + .boxed(); + return self.not_connected_response(&authority_discovery_id, future) + } + // Account for remote received request bytes. self.network - .peer_stats_by_id(authority_discovery_id.clone()) + .peer_stats_by_id(&authority_discovery_id) .inc_received(outgoing_request.payload.encoded_size()); let validator_index: usize = outgoing_request.payload.index.0 as usize; @@ -153,11 +185,24 @@ impl MockNetworkBridgeTx { req_res::Recipient::Authority(authority_discovery_id) => authority_discovery_id, _ => unimplemented!("Peer recipient not supported yet"), }; + // Account our sent request bytes. self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); + + // If peer is disconnected return an error to the caller + if !self.network.is_peer_connected(&authority_discovery_id) { + let future = async move { + let _ = outgoing_request + .pending_response + .send(Err(RequestFailure::NotConnected)); + } + .boxed(); + return self.not_connected_response(&authority_discovery_id, future) + } + // Account for remote received request bytes. 
self.network - .peer_stats_by_id(authority_discovery_id.clone()) + .peer_stats_by_id(&authority_discovery_id) .inc_received(outgoing_request.payload.encoded_size()); let available_data = diff --git a/polkadot/node/subsystem-bench/src/core/network.rs b/polkadot/node/subsystem-bench/src/core/network.rs index 3d38a8f36b190..09943becb65cc 100644 --- a/polkadot/node/subsystem-bench/src/core/network.rs +++ b/polkadot/node/subsystem-bench/src/core/network.rs @@ -13,10 +13,15 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see <http://www.gnu.org/licenses/>. -use super::*; +use super::{ + configuration::{TestAuthorities, TestConfiguration}, + environment::TestEnvironmentDependencies, + *, +}; use colored::Colorize; use polkadot_primitives::AuthorityDiscoveryId; use prometheus_endpoint::U64; +use rand::{seq::SliceRandom, thread_rng}; use sc_service::SpawnTaskHandle; use std::{ collections::HashMap, @@ -268,44 +273,97 @@ impl NetworkAction { } } +/// The state of a peer on the emulated network. +#[derive(Clone)] +enum Peer { + Connected(PeerEmulator), + Disconnected(PeerEmulator), +} + +impl Peer { + pub fn disconnect(&mut self) { + let new_self = match self { + Peer::Connected(peer) => Peer::Disconnected(peer.clone()), + _ => return, + }; + *self = new_self; + } + + pub fn is_connected(&self) -> bool { + if let Peer::Connected(_) = self { + true + } else { + false + } + } + + pub fn emulator(&mut self) -> &mut PeerEmulator { + match self { + Peer::Connected(ref mut emulator) => emulator, + Peer::Disconnected(ref mut emulator) => emulator, + } + } +} + /// Mocks the network bridge and an arbitrary number of connected peer nodes. /// Implements network latency, bandwidth and connection errors. #[derive(Clone)] pub struct NetworkEmulator { // Per peer network emulation. - peers: Vec<PeerEmulator>, + peers: Vec<Peer>, /// Per peer stats. stats: Vec<Arc<PeerEmulatorStats>>, - /// Network throughput metrics - metrics: Metrics, /// Each emulated peer is a validator. 
validator_authority_ids: HashMap<AuthorityDiscoveryId, usize>, } impl NetworkEmulator { pub fn new( - n_peers: usize, - validator_authority_ids: Vec<AuthorityDiscoveryId>, - bandwidth: usize, - spawn_task_handle: SpawnTaskHandle, - registry: &Registry, + config: &TestConfiguration, + dependencies: &TestEnvironmentDependencies, + authorities: &TestAuthorities, ) -> Self { - gum::info!(target: LOG_TARGET, "{}",format!("Initializing network emulation for {} peers.", n_peers).bright_blue()); + let n_peers = config.n_validators; + gum::info!(target: LOG_TARGET, "{}",format!("Initializing emulation for a {} peer network.", n_peers).bright_blue()); + gum::info!(target: LOG_TARGET, "{}",format!("connectivity {}%, error {}%", config.connectivity, config.error).bright_black()); - let metrics = Metrics::new(&registry).expect("Metrics always register succesfully"); + let metrics = + Metrics::new(&dependencies.registry).expect("Metrics always register succesfully"); let mut validator_authority_id_mapping = HashMap::new(); // Create a `PeerEmulator` for each peer. 
- let (stats, peers) = (0..n_peers) - .zip(validator_authority_ids.into_iter()) + let (stats, mut peers): (_, Vec<_>) = (0..n_peers) + .zip(authorities.validator_authority_id.clone().into_iter()) .map(|(peer_index, authority_id)| { validator_authority_id_mapping.insert(authority_id, peer_index); let stats = Arc::new(PeerEmulatorStats::new(peer_index, metrics.clone())); - (stats.clone(), PeerEmulator::new(bandwidth, spawn_task_handle.clone(), stats)) + ( + stats.clone(), + Peer::Connected(PeerEmulator::new( + config.peer_bandwidth, + dependencies.task_manager.spawn_handle(), + stats, + )), + ) }) .unzip(); - Self { peers, stats, metrics, validator_authority_ids: validator_authority_id_mapping } + let connected_count = config.n_validators as f64 / (100.0 / config.connectivity as f64); + + let (_connected, to_disconnect) = + peers.partial_shuffle(&mut thread_rng(), connected_count as usize); + + for peer in to_disconnect { + peer.disconnect(); + } + + gum::info!(target: LOG_TARGET, "{}",format!("Network created, connected validator count {}", connected_count).bright_black()); + + Self { peers, stats, validator_authority_ids: validator_authority_id_mapping } + } + + pub fn is_peer_connected(&self, peer: &AuthorityDiscoveryId) -> bool { + self.peer(peer).is_connected() } pub fn submit_peer_action(&mut self, peer: AuthorityDiscoveryId, action: NetworkAction) { @@ -313,21 +371,41 @@ impl NetworkEmulator { .validator_authority_ids .get(&peer) .expect("all test authorities are valid; qed"); - self.peers[*index].send(action); + + let peer = self.peers.get_mut(*index).expect("We just retrieved the index above; qed"); + + // Only actions of size 0 are allowed on disconnected peers. + // Typically this are delayed error response sends. 
+ if action.size() > 0 && !peer.is_connected() { + gum::warn!(target: LOG_TARGET, peer_index = index, "Attempted to send data from a disconnected peer, operation ignored"); + return + } + + peer.emulator().send(action); } // Returns the sent/received stats for `peer_index`. - pub fn peer_stats(&mut self, peer_index: usize) -> Arc<PeerEmulatorStats> { + pub fn peer_stats(&self, peer_index: usize) -> Arc<PeerEmulatorStats> { self.stats[peer_index].clone() } - // Returns the sent/received stats for `peer`. - pub fn peer_stats_by_id(&mut self, peer: AuthorityDiscoveryId) -> Arc<PeerEmulatorStats> { - let peer_index = self + // Helper to get peer index by `AuthorityDiscoveryId` + fn peer_index(&self, peer: &AuthorityDiscoveryId) -> usize { + *self .validator_authority_ids - .get(&peer) - .expect("all test authorities are valid; qed"); - self.stats[*peer_index].clone() + .get(peer) + .expect("all test authorities are valid; qed") + } + + // Return the Peer entry for a given `AuthorityDiscoveryId`. + fn peer(&self, peer: &AuthorityDiscoveryId) -> &Peer { + &self.peers[self.peer_index(peer)] + } + // Returns the sent/received stats for `peer`. + pub fn peer_stats_by_id(&mut self, peer: &AuthorityDiscoveryId) -> Arc<PeerEmulatorStats> { + let peer_index = self.peer_index(peer); + + self.stats[peer_index].clone() } // Returns the sent/received stats for all peers. @@ -346,13 +424,13 @@ impl NetworkEmulator { // Increment bytes sent by our node (the node that contains the subsystem under test) pub fn inc_sent(&self, bytes: usize) { // Our node always is peer 0. - self.metrics.on_peer_sent(0, bytes); + self.peer_stats(0).inc_sent(bytes); } // Increment bytes received by our node (the node that contains the subsystem under test) pub fn inc_received(&self, bytes: usize) { // Our node always is peer 0. 
- self.metrics.on_peer_received(0, bytes); + self.peer_stats(0).inc_received(bytes); } } From 29d80fa638ea4315319d7e96ec391e80d3a2350c Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Fri, 1 Dec 2023 16:21:47 +0200 Subject: [PATCH 42/45] fix comment Signed-off-by: Andrei Sandu --- polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs index fa47302091834..c8140843b3b96 100644 --- a/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs +++ b/polkadot/node/subsystem-bench/src/core/mock/network_bridge.rs @@ -105,7 +105,7 @@ impl MockNetworkBridgeTx { // Account our sent request bytes. self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); - // If peer is disconnected return an error to the caller + // If peer is disconnected return an error if !self.network.is_peer_connected(&authority_discovery_id) { // We always send `NotConnected` error and we ignore `IfDisconnected` value in // the caller. @@ -189,7 +189,7 @@ impl MockNetworkBridgeTx { // Account our sent request bytes. 
self.network.peer_stats(0).inc_sent(outgoing_request.payload.encoded_size()); - // If peer is disconnected return an error to the caller + // If peer is disconnected return an error if !self.network.is_peer_connected(&authority_discovery_id) { let future = async move { let _ = outgoing_request From 70ac38ed37a43edf45d3cee93411c0f35bd1c456 Mon Sep 17 00:00:00 2001 From: Andrei Eres Date: Mon, 4 Dec 2023 15:04:25 +0100 Subject: [PATCH 43/45] Add cpu profiling --- Cargo.lock | 2 + polkadot/node/subsystem-bench/Cargo.toml | 2 + polkadot/node/subsystem-bench/README.md | 93 ++++++++++++------- .../subsystem-bench/docker/docker-compose.yml | 35 +++++++ .../docker/prometheus/prometheus.yml | 11 +++ .../grafana/cpu-profiling.json | 70 ++++++++++++++ .../subsystem-bench/src/subsystem-bench.rs | 24 +++++ 7 files changed, 202 insertions(+), 35 deletions(-) create mode 100644 polkadot/node/subsystem-bench/docker/docker-compose.yml create mode 100644 polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml create mode 100644 polkadot/node/subsystem-bench/grafana/cpu-profiling.json diff --git a/Cargo.lock b/Cargo.lock index 2cc35a2754243..e2863aaa4b6e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13462,6 +13462,8 @@ dependencies = [ "polkadot-primitives", "polkadot-primitives-test-helpers", "prometheus", + "pyroscope", + "pyroscope_pprofrs", "rand 0.8.5", "sc-keystore", "sc-network", diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml index f775a1ff9efee..67a4eecdef425 100644 --- a/polkadot/node/subsystem-bench/Cargo.toml +++ b/polkadot/node/subsystem-bench/Cargo.toml @@ -56,6 +56,8 @@ serde = "1.0.192" serde_yaml = "0.9" paste = "1.0.14" orchestra = { version = "0.3.3", default-features = false, features=["futures_channel"] } +pyroscope = "0.5.7" +pyroscope_pprofrs = "0.2.7" [features] default = [] diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index f4ea04662f9e4..fca37da621752 
100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -1,14 +1,14 @@ # Subsystem benchmark client -Run parachain consensus stress and performance tests on your development machine. +Run parachain consensus stress and performance tests on your development machine. ## Motivation -The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is -responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and -performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or -`dispute-coordinator`. In the absence of such a tool, we would run large test nets to load/stress test these parts of -the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard +The parachain consensus node implementation spans across many modules which we call subsystems. Each subsystem is +responsible for a small part of logic of the parachain consensus pipeline, but in general the most load and +performance issues are localized in just a few core subsystems like `availability-recovery`, `approval-voting` or +`dispute-coordinator`. In the absence of such a tool, we would run large test nets to load/stress test these parts of +the system. Setting up and making sense of the amount of data produced by such a large test is very expensive, hard to orchestrate and is a huge development time sink. This tool aims to solve the problem by making it easy to: @@ -26,17 +26,26 @@ The output binary will be placed in `target/testnet/subsystem-bench`. ### Test metrics -Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution. 
-A small subset of these collected metrics are displayed in the CLI, but for an in depth analysys of the test results, +Subsystem, CPU usage and network metrics are exposed via a prometheus endpoint during the test execution. +A small subset of these collected metrics are displayed in the CLI, but for an in depth analysys of the test results, a local Grafana/Prometheus stack is needed. +### Run Prometheus, Pyroscope and Graphana in Docker + +If you are using Docker, you can skip the next steps to install Prometheus, Pyroscope and Graphana using `docker-compose` file. + +```bash +cd polkadot/node/subsystem-bench/docker +docker compose up +``` + ### Install Prometheus -Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your +Please follow the [official installation guide](https://prometheus.io/docs/prometheus/latest/installation/) for your platform/OS. After succesfully installing and starting up Prometheus, we need to alter it's configuration such that it -will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation +will scrape the benchmark prometheus endpoint `127.0.0.1:9999`. Please check the prometheus official documentation regarding the location of `prometheus.yml`. On MacOS for example the full path `/opt/homebrew/etc/prometheus.yml` prometheus.yml: @@ -57,17 +66,30 @@ scrape_configs: To complete this step restart Prometheus server such that it picks up the new configuration. -### Install and setup Grafana +### Install Pyroscope + +To collect CPU profiling data, you must be running the Pyroscope server. Follow the [installation guide](https://grafana.com/docs/pyroscope/latest/get-started/) relevant to your operating system. + +### Install Grafana Follow the [installation guide](https://grafana.com/docs/grafana/latest/setup-grafana/installation/) relevant to your operating system. 
-Once you have the installation up and running, configure the local Prometheus as a data source by following -[this guide](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/) +### Setup Grafana + +Once you have the installation up and running, configure the local Prometheus and Pyroscope (if needed) as data sources by following these guides: + +- [Prometheus](https://grafana.com/docs/grafana/latest/datasources/prometheus/configure-prometheus-data-source/) +- [Pyroscope](https://grafana.com/docs/grafana/latest/datasources/grafana-pyroscope/) + +If you are running the servers in Docker, use the following URLs: + +- Prometheus `http://prometheus:9090/` +- Pyroscope `http://pyroscope:4040/` #### Import dashboards -Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) +Follow [this guide](https://grafana.com/docs/grafana/latest/dashboards/manage-dashboards/#export-and-import-dashboards) to import the dashboards from the repository `grafana` folder. ## How to run a test @@ -86,13 +108,13 @@ Commands: ``` Note: `test-sequence` is a special test objective that wraps up an arbitrary number of test objectives. It is tipically - used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). +used to run a suite of tests defined in a `yaml` file like in this [example](examples/availability_read.yaml). 
### Standard test options - + ``` Options: - --network The type of network to be emulated [default: ideal] [possible values: + --network The type of network to be emulated [default: ideal] [possible values: ideal, healthy, degraded] --n-cores Number of cores to fetch availability for [default: 100] --n-validators Number of validators to fetch chunks from [default: 500] @@ -104,6 +126,7 @@ Options: --peer-error Simulated conection error ratio [0-100] --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] + --profile Enable CPU Profiling -h, --help Print help -V, --version Print version ``` @@ -123,8 +146,8 @@ Benchmark availability recovery strategies Usage: subsystem-bench data-availability-read [OPTIONS] Options: - -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU - as we don't need to re-construct from chunks. Tipically this is only faster if nodes + -f, --fetch-from-backers Turbo boost AD Read by fetching the full availability datafrom backers first. Saves CPU + as we don't need to re-construct from chunks. Tipically this is only faster if nodes have enough bandwidth -h, --help Print help ``` @@ -141,9 +164,9 @@ usage: - for how many blocks the test should run (`num_blocks`) From the perspective of the subsystem under test, this means that it will receive an `ActiveLeavesUpdate` signal -followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. The messages are generally -test payloads pre-generated before the test run, or constructed on pre-genereated payloads. For example the -`AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before +followed by an arbitrary amount of messages. This process repeats itself for `num_blocks`. 
The messages are generally +test payloads pre-generated before the test run, or constructed on pre-genereated payloads. For example the +`AvailabilityRecoveryMessage::RecoverAvailableData` message includes a `CandidateReceipt` which is generated before the test is started. ### Example run @@ -152,8 +175,8 @@ Let's run an availabilty read test which will recover availability for 10 cores node validator network. ``` - target/testnet/subsystem-bench --n-cores 10 data-availability-read -[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, + target/testnet/subsystem-bench --n-cores 10 data-availability-read +[2023-11-28T09:01:59Z INFO subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = None [2023-11-28T09:01:59Z INFO subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880 [2023-11-28T09:01:59Z INFO subsystem-bench::availability] Created test environment. @@ -167,8 +190,8 @@ node validator network. [2023-11-28T09:02:07Z INFO subsystem_bench::availability] All blocks processed in 6001ms [2023-11-28T09:02:07Z INFO subsystem_bench::availability] Throughput: 51200 KiB/block [2023-11-28T09:02:07Z INFO subsystem_bench::availability] Block time: 6001 ms -[2023-11-28T09:02:07Z INFO subsystem_bench::availability] - +[2023-11-28T09:02:07Z INFO subsystem_bench::availability] + Total received from network: 66 MiB Total sent to network: 58 KiB Total subsystem CPU usage 4.16s @@ -177,12 +200,12 @@ node validator network. CPU usage per block 0.00s ``` -`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it +`Block time` in the context of `data-availability-read` has a different meaning. It measures the amount of time it took the subsystem to finish processing all of the messages sent in the context of the current test block. 
### Test logs -You can select log target, subtarget and verbosity just like with Polkadot node CLI, simply setting +You can select log target, subtarget and verbosity just like with Polkadot node CLI, simply setting `RUST_LOOG="parachain=debug"` turns on debug logs for all parachain consensus subsystems in the test. ### View test metrics @@ -190,27 +213,27 @@ You can select log target, subtarget and verbosity just like with Polkadot node Assuming the Grafana/Prometheus stack installation steps completed succesfully, you should be able to view the test progress in real time by accessing [this link](http://localhost:3000/goto/SM5B8pNSR?orgId=1). -Now run -`target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` -and view the metrics in real time and spot differences between different `n_valiator` values. - +Now run +`target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` +and view the metrics in real time and spot differences between different `n_valiator` values. + ## Create new test objectives This tool is intended to make it easy to write new test objectives that focus individual subsystems, or even multiple subsystems (for example `approval-distribution` and `approval-voting`). A special kind of test objectives are performance regression tests for the CI pipeline. These should be sequences -of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both +of tests that check the performance characteristics (such as CPU usage, speed) of the subsystem under test in both happy and negative scenarios (low bandwidth, network errors and low connectivity). 
### Reusable test components -To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment`, +To faster write a new test objective you need to use some higher level wrappers and logic: `TestEnvironment`, `TestConfiguration`, `TestAuthorities`, `NetworkEmulator`. To create the `TestEnvironment` you will need to also build an `Overseer`, but that should be easy using the mockups for subsystems in`core::mock`. ### Mocking Ideally we want to have a single mock implementation for subsystems that can be minimally configured to -be used in different tests. A good example is `runtime-api` which currently only responds to session information +be used in different tests. A good example is `runtime-api` which currently only responds to session information requests based on static data. It can be easily extended to service other requests. diff --git a/polkadot/node/subsystem-bench/docker/docker-compose.yml b/polkadot/node/subsystem-bench/docker/docker-compose.yml new file mode 100644 index 0000000000000..fc5eb1f634e64 --- /dev/null +++ b/polkadot/node/subsystem-bench/docker/docker-compose.yml @@ -0,0 +1,35 @@ +services: + grafana: + image: grafana/grafana-enterprise:latest + container_name: grafana + restart: always + networks: + - subsystem-bench + ports: + - "3000:3000" + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: always + networks: + - subsystem-bench + volumes: + - ./prometheus:/etc/prometheus + extra_hosts: + - "host.docker.internal:host-gateway" + ports: + - "9090:9090" + - "9999:9999" + + pyroscope: + container_name: pyroscope + image: grafana/pyroscope:latest + restart: always + networks: + - subsystem-bench + ports: + - "4040:4040" + +networks: + subsystem-bench: diff --git a/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml b/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml new file mode 100644 index 0000000000000..0bb25cfcb36c6 --- /dev/null +++ 
b/polkadot/node/subsystem-bench/docker/prometheus/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "subsystem-bench" + scrape_interval: 0s500ms + static_configs: + - targets: ['host.docker.internal:9999'] diff --git a/polkadot/node/subsystem-bench/grafana/cpu-profiling.json b/polkadot/node/subsystem-bench/grafana/cpu-profiling.json new file mode 100644 index 0000000000000..0d53a1b936576 --- /dev/null +++ b/polkadot/node/subsystem-bench/grafana/cpu-profiling.json @@ -0,0 +1,70 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" + }, + "gridPos": { + "h": 18, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "targets": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "bc3bc04f-85f9-464b-8ae3-fbe0949063f6" + }, + "groupBy": [], + "labelSelector": "{service_name=\"subsystem-bench\"}", + "profileTypeId": "process_cpu:cpu:nanoseconds:cpu:nanoseconds", + "queryType": "profile", + "refId": "A" + } + ], + "title": "CPU Profiling", + "type": "flamegraph" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "CPU Profiling", + "uid": "c31191d5-fe2b-49e2-8b1c-1451f31d1628", + "version": 1, + "weekStart": "" + } diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 
0f3ae0f41417e..98191e3bac4bd 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -76,12 +76,31 @@ struct BenchCli { /// Maximum remote peer latency in milliseconds [0-5000]. pub peer_max_latency: Option, + #[clap(long, default_value_t = false)] + /// Enable CPU Profiling + pub profile: bool, + #[command(subcommand)] pub objective: cli::TestObjective, } impl BenchCli { fn launch(self) -> eyre::Result<()> { + use pyroscope::PyroscopeAgent; + use pyroscope_pprofrs::{pprof_backend, PprofConfig}; + + // Pyroscope must be running on port 4040 + // See https://grafana.com/docs/pyroscope/latest/get-started/#download-and-configure-pyroscope + let agent_running = if self.profile { + let agent = PyroscopeAgent::builder("http://localhost:4040", "subsystem-bench") + .backend(pprof_backend(PprofConfig::new().sample_rate(100))) + .build()?; + + Some(agent.start()?) + } else { + None + }; + let configuration = self.standard_configuration; let mut test_config = match self.objective { TestObjective::TestSequence(options) => { @@ -165,6 +184,11 @@ impl BenchCli { env.runtime() .block_on(availability::benchmark_availability_read(&mut env, state)); + if let Some(agent_running) = agent_running { + let agent_ready = agent_running.stop()?; + agent_ready.shutdown(); + } + Ok(()) } } From b9f4dd90070c9317da4ac1e913591cd30723d9d7 Mon Sep 17 00:00:00 2001 From: Andrei Eres Date: Tue, 5 Dec 2023 13:59:13 +0100 Subject: [PATCH 44/45] Update polkadot/node/subsystem-bench/README.md Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com> --- polkadot/node/subsystem-bench/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index fca37da621752..88e105991ddaa 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -32,7 +32,7 @@ a local Grafana/Prometheus 
stack is needed. ### Run Prometheus, Pyroscope and Graphana in Docker -If you are using Docker, you can skip the next steps to install Prometheus, Pyroscope and Graphana using `docker-compose` file. +If Docker is not usable, then follow the next sections to manually install Prometheus, Pyroscope and Grafana on your machine. ```bash cd polkadot/node/subsystem-bench/docker From 8736689a093cd63662fc4ef6c8fc1c81c9ce117a Mon Sep 17 00:00:00 2001 From: Andrei Eres Date: Tue, 5 Dec 2023 15:08:03 +0100 Subject: [PATCH 45/45] Update --- polkadot/node/subsystem-bench/README.md | 32 ++++++++++--------- .../subsystem-bench/src/subsystem-bench.rs | 21 +++++++----- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index 88e105991ddaa..7f4b1d0191268 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -114,21 +114,23 @@ used to run a suite of tests defined in a `yaml` file like in this [example](exa ``` Options: - --network The type of network to be emulated [default: ideal] [possible values: - ideal, healthy, degraded] - --n-cores Number of cores to fetch availability for [default: 100] - --n-validators Number of validators to fetch chunks from [default: 500] - --min-pov-size The minimum pov size in KiB [default: 5120] - --max-pov-size The maximum pov size bytes [default: 5120] - -n, --num-blocks The number of blocks the test is going to run [default: 1] - -p, --peer-bandwidth The bandwidth of simulated remote peers in KiB - -b, --bandwidth The bandwidth of our simulated node in KiB - --peer-error Simulated conection error ratio [0-100] - --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] - --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] - --profile Enable CPU Profiling - -h, --help Print help - -V, --version Print version + --network The type of network to be emulated [default: ideal]
[possible values: + ideal, healthy, degraded] + --n-cores Number of cores to fetch availability for [default: 100] + --n-validators Number of validators to fetch chunks from [default: 500] + --min-pov-size The minimum pov size in KiB [default: 5120] + --max-pov-size The maximum pov size bytes [default: 5120] + -n, --num-blocks The number of blocks the test is going to run [default: 1] + -p, --peer-bandwidth The bandwidth of simulated remote peers in KiB + -b, --bandwidth The bandwidth of our simulated node in KiB + --peer-error Simulated conection error ratio [0-100] + --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] + --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] + --profile Enable CPU Profiling with Pyroscope + --pyroscope-url Pyroscope Server URL [default: http://localhost:4040] + --pyroscope-sample-rate Pyroscope Sample Rate [default: 113] + -h, --help Print help + -V, --version Print version ``` These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file. diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 98191e3bac4bd..dc073991c7016 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -18,6 +18,8 @@ //! CI regression testing. 
use clap::Parser; use color_eyre::eyre; +use pyroscope::PyroscopeAgent; +use pyroscope_pprofrs::{pprof_backend, PprofConfig}; use colored::Colorize; use std::{path::Path, time::Duration}; @@ -77,23 +79,26 @@ struct BenchCli { pub peer_max_latency: Option, #[clap(long, default_value_t = false)] - /// Enable CPU Profiling + /// Enable CPU Profiling with Pyroscope pub profile: bool, + #[clap(long, default_value_t = String::from("http://localhost:4040"))] + /// Pyroscope Server URL + pub pyroscope_url: String, + + #[clap(long, default_value_t = 113)] + /// Pyroscope Sample Rate + pub pyroscope_sample_rate: u32, + #[command(subcommand)] pub objective: cli::TestObjective, } impl BenchCli { fn launch(self) -> eyre::Result<()> { - use pyroscope::PyroscopeAgent; - use pyroscope_pprofrs::{pprof_backend, PprofConfig}; - - // Pyroscope must be running on port 4040 - // See https://grafana.com/docs/pyroscope/latest/get-started/#download-and-configure-pyroscope let agent_running = if self.profile { - let agent = PyroscopeAgent::builder("http://localhost:4040", "subsystem-bench") - .backend(pprof_backend(PprofConfig::new().sample_rate(100))) + let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench") + .backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate))) .build()?; Some(agent.start()?)