This repository was archived by the owner on Nov 15, 2023. It is now read-only.
Merged
Changes from 1 commit
Commits
36 commits
df828ac
Indentation fix.
eskimor Mar 18, 2021
06d4d90
Prepare request-response for PoV fetching.
eskimor Mar 18, 2021
6a940eb
Drop old PoV distribution.
eskimor Mar 18, 2021
a7fc368
WIP: Fetch PoV directly from backing.
eskimor Mar 18, 2021
9847b81
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 19, 2021
e03ff75
Backing compiles.
eskimor Mar 19, 2021
a49b4d4
Runtime access and connection management for PoV distribution.
eskimor Mar 23, 2021
545e950
Get rid of seemingly dead code.
eskimor Mar 23, 2021
47d9f5f
Implement PoV fetching.
eskimor Mar 23, 2021
0a283ab
Don't send `ConnectToValidators` for empty list.
eskimor Mar 24, 2021
afd795f
Even better - no need to check over and over again.
eskimor Mar 24, 2021
1c3eec8
PoV fetching implemented.
eskimor Mar 24, 2021
fb24855
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 24, 2021
cceddce
Check PoV hash upon reception.
eskimor Mar 24, 2021
89f0bf9
Implement retry of PoV fetching in backing.
eskimor Mar 25, 2021
ab75fea
Avoid pointless validation spawning.
eskimor Mar 25, 2021
10da891
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 25, 2021
3915a57
Add jaeger span to pov requesting.
eskimor Mar 25, 2021
fa6409e
Add back tracing.
eskimor Mar 25, 2021
8b9c2d4
Review remarks.
eskimor Mar 25, 2021
2d27be5
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 25, 2021
4af7d2e
Whitespace.
eskimor Mar 25, 2021
5c09829
Whitespace again.
eskimor Mar 26, 2021
ea9bde4
Cleanup + fix tests.
eskimor Mar 27, 2021
4207eaf
Log to log target in overseer.
eskimor Mar 27, 2021
3691061
Fix more tests.
eskimor Mar 27, 2021
b1a201a
Don't fail if group cannot be found.
eskimor Mar 27, 2021
298fe9d
Simple test for PoV fetcher.
eskimor Mar 27, 2021
af9f12c
Handle missing group membership better.
eskimor Mar 27, 2021
0c30792
Add test for retry functionality.
eskimor Mar 27, 2021
eb47465
Fix flaky test.
eskimor Mar 27, 2021
071bcca
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 27, 2021
3fa5791
Spaces again.
eskimor Mar 28, 2021
82d4a11
Guide updates.
eskimor Mar 28, 2021
b58a2ab
Merge branch 'master' into rk-pov-distribution-2590
eskimor Mar 28, 2021
a0609e7
Spaces.
eskimor Mar 28, 2021
Runtime access and connection management for PoV distribution.
eskimor committed Mar 23, 2021
commit a49b4d431b2a58b525df2b67199f734f23017d42
35 changes: 26 additions & 9 deletions node/network/availability-distribution/src/error.rs
@@ -22,9 +22,11 @@ use thiserror::Error;
use futures::channel::oneshot;

use polkadot_node_subsystem_util::Error as UtilError;
use polkadot_primitives::v1::SessionIndex;
use polkadot_primitives::v1::{CompressedPoVError, SessionIndex};
use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError};

use crate::LOG_TARGET;

/// Errors of this subsystem.
#[derive(Debug, Error)]
pub enum Error {
@@ -56,24 +58,28 @@ pub enum Error {
/// Sending response failed.
#[error("Sending a request's response failed.")]
SendResponse,
}

/// Error that we should handle gracefully by logging it.
#[derive(Debug)]
pub enum NonFatalError {
/// Some request to utility functions failed.
/// This can be either `RuntimeRequestCanceled` or `RuntimeApiError`.
#[error("Utility request failed")]
UtilRequest(UtilError),

/// Runtime API subsystem is down, which means we're shutting down.
#[error("Runtime request canceled")]
RuntimeRequestCanceled(oneshot::Canceled),

/// Some request to the runtime failed.
/// For example if we prune a block we're requesting info about.
#[error("Runtime API error")]
RuntimeRequest(RuntimeApiError),

/// We tried fetching a session info which was not available.
#[error("There was no session with the given index")]
NoSuchSession(SessionIndex),

/// Decompressing PoV failed.
#[error("PoV could not be decompressed")]
PoVDecompression(CompressedPoVError),
}

pub type Result<T> = std::result::Result<T, Error>;
@@ -90,9 +96,20 @@ pub(crate) async fn recv_runtime<V>(
oneshot::Receiver<std::result::Result<V, RuntimeApiError>>,
UtilError,
>,
) -> std::result::Result<V, NonFatalError> {
r.map_err(NonFatalError::UtilRequest)?
) -> std::result::Result<V, Error> {
r.map_err(Error::UtilRequest)?
.await
.map_err(NonFatalError::RuntimeRequestCanceled)?
.map_err(NonFatalError::RuntimeRequest)
.map_err(Error::RuntimeRequestCanceled)?
.map_err(Error::RuntimeRequest)
}


/// Utility for consuming top-level errors and logging them.
///
/// We basically always want to try to continue on error. This utility function is meant to
/// consume top-level errors by simply logging them.
pub fn log_error(result: Result<()>, ctx: &'static str) {
if let Err(error) = result {
tracing::warn!(target: LOG_TARGET, error = ?error, ctx);
}
}
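
For context, a minimal sketch of how a helper like `log_error` is typically used, so that a single failing operation does not abort the subsystem's main loop. The error variant, `update_heads` and the `LOG_TARGET` value below are hypothetical stand-ins, not part of this diff:

const LOG_TARGET: &str = "example::availability-distribution";

/// Hypothetical error type, standing in for the subsystem's `Error`.
#[derive(Debug)]
enum Error {
    RuntimeRequest,
}

type Result<T> = std::result::Result<T, Error>;

/// Consume a top-level error by logging it, so the caller's loop keeps running.
fn log_error(result: Result<()>, ctx: &'static str) {
    if let Err(error) = result {
        tracing::warn!(target: LOG_TARGET, error = ?error, ctx);
    }
}

/// Hypothetical fallible operation, standing in for e.g. `update_fetching_heads`.
fn update_heads() -> Result<()> {
    Err(Error::RuntimeRequest)
}

fn main() {
    tracing_subscriber::fmt::init();
    // The error is logged and swallowed; a real caller would simply continue its loop.
    log_error(update_heads(), "update_heads");
}

The design choice the commit moves towards is to treat most failures as recoverable at the top of the loop, rather than threading optional error values through every return type.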
35 changes: 24 additions & 11 deletions node/network/availability-distribution/src/lib.rs
@@ -26,12 +26,20 @@ use polkadot_subsystem::{
/// Error and [`Result`] type for this subsystem.
mod error;
pub use error::Error;
use error::Result;
use error::{Result, log_error};

/// Runtime requests.
mod runtime;
use runtime::Runtime;

/// `Requester` taking care of requesting chunks for candidates pending availability.
mod requester;
use requester::Requester;

/// Handling requests for PoVs during backing.
mod pov_requester;
use pov_requester::PoVRequester;

/// Responding to erasure chunk requests:
mod responder;
use responder::{answer_chunk_request_log, answer_pov_request_log};
@@ -52,6 +60,8 @@ const LOG_TARGET: &'static str = "parachain::availability-distribution";
pub struct AvailabilityDistributionSubsystem {
/// Pointer to a keystore, which is required for determining this node's validator index.
keystore: SyncCryptoStorePtr,
/// Easy and efficient runtime access for this subsystem.
runtime: Runtime,
/// Prometheus metrics.
metrics: Metrics,
}
@@ -74,17 +84,20 @@
}

impl AvailabilityDistributionSubsystem {

/// Create a new instance of the availability distribution.
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self {
Self { keystore, metrics }
let runtime = Runtime::new(keystore.clone());
Self { keystore, runtime, metrics }
}

/// Start processing work as passed on from the Overseer.
async fn run<Context>(self, mut ctx: Context) -> Result<()>
async fn run<Context>(mut self, mut ctx: Context) -> Result<()>
where
Context: SubsystemContext<Message = AvailabilityDistributionMessage> + Sync + Send,
{
let mut requester = Requester::new(self.keystore.clone(), self.metrics.clone()).fuse();
let mut pov_requester = PoVRequester::new();
loop {
let action = {
let mut subsystem_next = ctx.recv().fuse();
@@ -107,14 +120,14 @@
};
match message {
FromOverseer::Signal(OverseerSignal::ActiveLeaves(update)) => {
// Update the relay chain heads we are fetching our pieces for:
if let Some(e) = requester
.get_mut()
.update_fetching_heads(&mut ctx, update)
.await?
{
tracing::debug!(target: LOG_TARGET, "Error processing ActiveLeavesUpdate: {:?}", e);
}
log_error(
pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await,
"PoVRequester::update_connected_validators"
);
log_error(
requester.get_mut().update_fetching_heads(&mut ctx, update).await,
"Error in Requester::update_fetching_heads"
);
}
FromOverseer::Signal(OverseerSignal::BlockFinalized(..)) => {}
FromOverseer::Signal(OverseerSignal::Conclude) => {
128 changes: 128 additions & 0 deletions node/network/availability-distribution/src/pov_requester/mod.rs
@@ -0,0 +1,128 @@
// Copyright 2021 Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

//! PoV requester takes care of requesting PoVs from validators of a backing group.

use futures::channel::mpsc;
use lru::LruCache;

use polkadot_node_network_protocol::{PeerId, peer_set::PeerSet};
use polkadot_primitives::v1::{AuthorityDiscoveryId, Hash, SessionIndex};
use polkadot_subsystem::{ActiveLeavesUpdate, SubsystemContext, messages::{AllMessages, NetworkBridgeMessage}};

use crate::runtime::Runtime;

/// Number of sessions we want to keep in the LRU.
const NUM_SESSIONS: usize = 2;

pub struct PoVRequester {

/// We only ever care about being connected to validators of at most two sessions.
///
/// So we keep an LRU of size 2 for managing connection requests.
connected_validators: LruCache<SessionIndex, mpsc::Receiver<(AuthorityDiscoveryId, PeerId)>>,
}

impl PoVRequester {
/// Create a new requester for PoVs.
pub fn new() -> Self {
Self {
connected_validators: LruCache::new(NUM_SESSIONS),
}
}

/// Make sure we are connected to the right set of validators.
///
/// On every `ActiveLeavesUpdate`, we check whether we are connected properly to our current
/// validator group.
pub async fn update_connected_validators<Context>(
&mut self,
ctx: &mut Context,
runtime: &mut Runtime,
update: &ActiveLeavesUpdate,
) -> super::Result<()>
where
Context: SubsystemContext,
{
let activated = update.activated.iter().map(|(h, _)| h);
let activated_sessions =
get_activated_sessions(ctx, runtime, activated).await?;

for (parent, session_index) in activated_sessions {
if self.connected_validators.contains(&session_index) {
continue
}
self.connected_validators.put(
session_index,
connect_to_relevant_validators(ctx, runtime, parent, session_index).await?
);
}
Ok(())
}

}

async fn get_activated_sessions<Context>(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator<Item = &Hash>)
-> super::Result<impl Iterator<Item = (Hash, SessionIndex)>>
where
Context: SubsystemContext,
{
let mut sessions = Vec::new();
for parent in new_heads {
sessions.push((*parent, runtime.get_session_index(ctx, *parent).await?));
}
Ok(sessions.into_iter())
}

async fn connect_to_relevant_validators<Context>(
ctx: &mut Context,
runtime: &mut Runtime,
parent: Hash,
session: SessionIndex
)
-> super::Result<mpsc::Receiver<(AuthorityDiscoveryId, PeerId)>>
where
Context: SubsystemContext,
{
let validator_ids = determine_relevant_validators(ctx, runtime, parent, session).await?;
// We don't actually care about the `PeerId`s, we just keep the receiver so we stay connected:
let (tx, rx) = mpsc::channel(0);
ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
validator_ids, peer_set: PeerSet::Validation, connected: tx
})).await;
Ok(rx)
}

async fn determine_relevant_validators<Context>(
ctx: &mut Context,
runtime: &mut Runtime,
parent: Hash,
session: SessionIndex,
)
-> super::Result<Vec<AuthorityDiscoveryId>>
where
Context: SubsystemContext,
{
let info = runtime.get_session_info_by_index(ctx, parent, session).await?;
if let Some(validator_info) = &info.validator_info {
let indices = info.session_info.validator_groups.get(validator_info.our_group.0 as usize)
.expect("Our group got retrieved from that session info, it must exist. qed.")
.clone();
Ok(indices.into_iter().map(|i| info.session_info.discovery_keys[i.0 as usize].clone()).collect())
} else {
Ok(Vec::new())
}
}
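
As the doc comment on `connected_validators` notes, staying connected only requires keeping the receiver of the `ConnectToValidators` request alive; once an entry falls out of the size-2 LRU, its receiver is dropped and the connection request is released with it. A standalone sketch of that eviction behaviour, assuming the `lru` crate API used above (where `LruCache::new` takes a plain `usize`) and using `u32` in place of `SessionIndex`:

use futures::channel::mpsc;
use lru::LruCache;

fn main() {
    // Capacity 2 mirrors NUM_SESSIONS above.
    let mut connected: LruCache<u32, mpsc::Receiver<()>> = LruCache::new(2);

    let (old_tx, old_rx) = mpsc::channel(0);
    connected.put(1, old_rx);
    connected.put(2, mpsc::channel(0).1);

    // As long as the receiver sits in the cache, the sender side sees an open channel.
    assert!(!old_tx.is_closed());

    // Caching a third session evicts session 1; its receiver is dropped and the channel
    // closes, signalling that this connection request is no longer needed.
    connected.put(3, mpsc::channel(0).1);
    assert!(old_tx.is_closed());
    assert!(!connected.contains(&1));
}

In the subsystem itself the sender end is handed to the network bridge, so, per the comment in `connect_to_relevant_validators`, keeping or dropping the receiver is what determines which sessions we stay connected for.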
35 changes: 13 additions & 22 deletions node/network/availability-distribution/src/requester/mod.rs
@@ -40,7 +40,7 @@ use polkadot_subsystem::{
};

use super::{error::recv_runtime, session_cache::SessionCache, LOG_TARGET, Metrics};
use crate::error::NonFatalError;
use crate::error::Error;

/// A task fetching a particular chunk.
mod fetch_task;
@@ -97,7 +97,7 @@ impl Requester {
&mut self,
ctx: &mut Context,
update: ActiveLeavesUpdate,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
@@ -107,30 +107,25 @@
} = update;
// Order important! We need to handle activated, prior to deactivated, otherwise we might
// cancel still needed jobs.
let err = self.start_requesting_chunks(ctx, activated.into_iter()).await?;
self.start_requesting_chunks(ctx, activated.into_iter()).await?;
self.stop_requesting_chunks(deactivated.into_iter());
Ok(err)
Ok(())
}

/// Start requesting chunks for newly imported heads.
async fn start_requesting_chunks<Context>(
&mut self,
ctx: &mut Context,
new_heads: impl Iterator<Item = (Hash, Arc<jaeger::Span>)>,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
for (leaf, _) in new_heads {
let cores = match query_occupied_cores(ctx, leaf).await {
Err(err) => return Ok(Some(err)),
Ok(cores) => cores,
};
if let Some(err) = self.add_cores(ctx, leaf, cores).await? {
return Ok(Some(err));
}
let cores = query_occupied_cores(ctx, leaf).await?;
self.add_cores(ctx, leaf, cores).await?;
}
Ok(None)
Ok(())
}

/// Stop requesting chunks for obsolete heads.
@@ -155,7 +150,7 @@
ctx: &mut Context,
leaf: Hash,
cores: impl IntoIterator<Item = OccupiedCore>,
) -> super::Result<Option<NonFatalError>>
) -> super::Result<()>
where
Context: SubsystemContext,
{
@@ -170,7 +165,7 @@
let tx = self.tx.clone();
let metrics = self.metrics.clone();

let task_cfg = match self
let task_cfg = self
.session_cache
.with_session_info(
ctx,
Expand All @@ -180,11 +175,7 @@ impl Requester {
leaf,
|info| FetchTaskConfig::new(leaf, &core, tx, metrics, info),
)
.await
{
Err(err) => return Ok(Some(err)),
Ok(task_cfg) => task_cfg,
};
.await?;

if let Some(task_cfg) = task_cfg {
e.insert(FetchTask::start(task_cfg, ctx).await?);
Expand All @@ -193,7 +184,7 @@ impl Requester {
}
}
}
Ok(None)
Ok(())
}
}

@@ -228,7 +219,7 @@ impl Stream for Requester {
async fn query_occupied_cores<Context>(
ctx: &mut Context,
relay_parent: Hash,
) -> Result<Vec<OccupiedCore>, NonFatalError>
) -> Result<Vec<OccupiedCore>, Error>
where
Context: SubsystemContext,
{
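
Taken together with the `error.rs` changes, the pattern in this file is that the fallible helpers now return the single subsystem error type, so the earlier match-and-`return Ok(Some(err))` plumbing collapses into `?`, and the decision to log or abort moves up to the caller (e.g. via `log_error`). A schematic sketch with hypothetical names, using one error type for brevity where the old code distinguished `NonFatalError`:

#[derive(Debug)]
enum Error {
    RuntimeRequest,
}

type Result<T> = std::result::Result<T, Error>;

/// Hypothetical stand-in for `query_occupied_cores`.
fn query_cores(fail: bool) -> Result<Vec<u32>> {
    if fail { Err(Error::RuntimeRequest) } else { Ok(vec![1, 2, 3]) }
}

/// Before: non-fatal errors were threaded back out-of-band as `Ok(Some(err))`.
fn start_requesting_old(fail: bool) -> Result<Option<Error>> {
    let cores = match query_cores(fail) {
        Err(err) => return Ok(Some(err)),
        Ok(cores) => cores,
    };
    println!("old path got {} cores", cores.len());
    Ok(None)
}

/// After: a single error type lets `?` propagate, and the caller decides what is fatal.
fn start_requesting_new(fail: bool) -> Result<()> {
    let cores = query_cores(fail)?;
    println!("new path got {} cores", cores.len());
    Ok(())
}

fn main() {
    assert!(matches!(start_requesting_old(true), Ok(Some(Error::RuntimeRequest))));
    assert!(start_requesting_new(true).is_err());
}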