Skip to content

Commit

Permalink
fix(network): Persist links between archival nodes (#3376)
Browse files Browse the repository at this point in the history
Each archival node will allocate some of its connections slots
for other archival nodes. When it is rebalancing, some connections
are dropped, but it won't drop any connection to other archival
node, if the number of direct archival nodes is lower than some
threshold.

Fixes: #3364

Test plan
=========
archival_node @ chain/network/tests/routing.rs should pass

This test connects to archival nodes to each other, and perform several
rounds of connection from other nodes forcing a rebalancing, and check
that connection between archival node is not dropped.
  • Loading branch information
mfornet authored Sep 26, 2020
1 parent 523eb4f commit 971f814
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 6 deletions.
20 changes: 20 additions & 0 deletions chain/network/src/peer_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,13 @@ impl PeerManagerActor {
.count()
}

fn num_archival_peers(&self) -> usize {
self.active_peers
.values()
.filter(|active_peer| active_peer.full_peer_info.chain_info.archival)
.count()
}

/// Check if it is needed to create a new outbound connection.
/// If the number of active connections is less than `ideal_connections_lo` or
/// (the number of outgoing connections is less than `minimum_outbound_peers`
Expand Down Expand Up @@ -663,6 +670,8 @@ impl PeerManagerActor {
/// find the one we connected earlier and add it to the safe set.
/// else break
fn try_stop_active_connection(&self) {
debug!(target: "network", "Trying to stop an active connection. Number of active connections: {}", self.active_peers.len());

// Build safe set
let mut safe_set = HashSet::new();

Expand All @@ -676,6 +685,17 @@ impl PeerManagerActor {
}
}

if self.config.archive
&& self.num_archival_peers()
<= self.config.archival_peer_connections_lower_bound as usize
{
for (peer, active) in self.active_peers.iter() {
if active.full_peer_info.chain_info.archival {
safe_set.insert(peer.clone());
}
}
}

// Find all recent connections
let mut recent_connections = self
.active_peers
Expand Down
2 changes: 2 additions & 0 deletions chain/network/src/test_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ impl NetworkConfig {
ideal_connections_hi: 35,
peer_recent_time_window: Duration::from_secs(600),
safe_set_size: 20,
archival_peer_connections_lower_bound: 10,
ban_window: Duration::from_secs(1),
peer_expiration_duration: Duration::from_secs(60 * 60),
max_send_peers: 512,
Expand All @@ -78,6 +79,7 @@ impl NetworkConfig {
push_info_period: Duration::from_millis(100),
blacklist: HashMap::new(),
outbound_disabled: false,
archive: false,
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions chain/network/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,9 @@ pub struct NetworkConfig {
/// Number of peers to keep while removing a connection.
/// Used to avoid disconnecting from peers we have been connected since long time.
pub safe_set_size: u32,
/// Lower bound of the number of connections to archival peers to keep
/// if we are an archival node.
pub archival_peer_connections_lower_bound: u32,
/// Duration of the ban for misbehaving peers.
pub ban_window: Duration,
/// Remove expired peers.
Expand Down Expand Up @@ -833,6 +836,8 @@ pub struct NetworkConfig {
/// are satisfied.
/// This flag should be ALWAYS FALSE. Only set to true for testing purposes.
pub outbound_disabled: bool,
/// Not clear old data, set `true` for archive nodes.
pub archive: bool,
}

impl NetworkConfig {
Expand Down
4 changes: 2 additions & 2 deletions chain/network/tests/full_network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ pub fn connect_at_max_capacity(

// Wait until all running nodes have expected connections
for node_id in 0..num_node {
runner.push_action(check_expected_connections(node_id, expected_connections));
runner.push_action(check_expected_connections(node_id, Some(expected_connections), None));
}

// Restart stopped nodes
Expand All @@ -49,7 +49,7 @@ pub fn connect_at_max_capacity(

// Wait until all nodes have expected connections
for node_id in 0..total_nodes {
runner.push_action(check_expected_connections(node_id, expected_connections));
runner.push_action(check_expected_connections(node_id, Some(expected_connections), None));
}

start_test(runner);
Expand Down
34 changes: 34 additions & 0 deletions chain/network/tests/routing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,3 +242,37 @@ fn max_num_peers_limit() {

start_test(runner);
}

/// Check that two archival nodes keep connected after network rebalance. Nodes 0 and 1 are archival nodes, others aren't.
/// Initially connect 2, 3, 4 to 0. Then connect 1 to 0, this connection should persist, even after other nodes tries
/// to connect to node 0 again.
///
/// Do four rounds where 2, 3, 4 tries to connect to 0 and check that connection between 0 and 1 was never dropped.
#[test]
fn archival_node() {
let mut runner = Runner::new(5, 5)
.max_num_peers(3)
.ideal_connections(2, 2)
.safe_set_size(0)
.set_as_archival(0)
.set_as_archival(1);

runner.push(Action::AddEdge(2, 0));
runner.push(Action::AddEdge(3, 0));
runner.push(Action::AddEdge(4, 0));
runner.push_action(check_expected_connections(0, Some(2), Some(2)));

runner.push(Action::AddEdge(1, 0));
runner.push_action(check_expected_connections(0, Some(2), Some(2)));
runner.push_action(check_direct_connection(0, 1));

for _step in 0..4 {
runner.push(Action::AddEdge(2, 0));
runner.push(Action::AddEdge(3, 0));
runner.push(Action::AddEdge(4, 0));
runner.push_action(check_expected_connections(0, Some(2), Some(2)));
runner.push_action(check_direct_connection(0, 1));
}

start_test(runner);
}
69 changes: 65 additions & 4 deletions chain/network/tests/runner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub fn setup_network_node(

let peer_manager = PeerManagerActor::create(move |ctx| {
let mut client_config = ClientConfig::test(false, 100, 200, num_validators, false);
client_config.archive = config.archive;
client_config.ttl_account_id_router = config.ttl_account_id_router;
let network_adapter = NetworkRecipient::new();
network_adapter.set_recipient(ctx.address().recipient());
Expand Down Expand Up @@ -363,6 +364,7 @@ struct TestConfig {
ideal_connections: Option<(u32, u32)>,
minimum_outbound_peers: Option<u32>,
safe_set_size: Option<u32>,
archive: bool,
}

impl TestConfig {
Expand All @@ -377,6 +379,7 @@ impl TestConfig {
ideal_connections: None,
minimum_outbound_peers: None,
safe_set_size: None,
archive: false,
}
}
}
Expand Down Expand Up @@ -425,6 +428,12 @@ impl Runner {
self
}

/// Set node `u` as archival node.
pub fn set_as_archival(mut self, u: usize) -> Self {
self.test_config[u].archive = true;
self
}

/// Specify boot nodes. By default there are no boot nodes.
pub fn use_boot_nodes(mut self, boot_nodes: Vec<usize>) -> Self {
self.apply_all(move |test_config| {
Expand Down Expand Up @@ -453,7 +462,7 @@ impl Runner {

pub fn safe_set_size(mut self, safe_set_size: u32) -> Self {
self.apply_all(move |test_config| {
test_config.minimum_outbound_peers = Some(safe_set_size);
test_config.safe_set_size = Some(safe_set_size);
});
self
}
Expand Down Expand Up @@ -539,6 +548,7 @@ impl Runner {
network_config.blacklist = blacklist;
network_config.outbound_disabled = test_config.outbound_disabled;
network_config.boot_nodes = boot_nodes;
network_config.archive = test_config.archive;

network_config.ideal_connections_lo =
test_config.ideal_connections.map_or(network_config.ideal_connections_lo, |(lo, _)| lo);
Expand Down Expand Up @@ -665,8 +675,14 @@ impl Handler<RunnerMessage> for Runner {
}
}

/// Check that `node_id` has at least `expected_connections` as active peers.
pub fn check_expected_connections(node_id: usize, expected_connections: usize) -> ActionFn {
/// Check that the number of connections of `node_id` is in the range:
/// [expected_connections_lo, expected_connections_hi]
/// Use None to denote semi-open interval
pub fn check_expected_connections(
node_id: usize,
expected_connections_lo: Option<usize>,
expected_connections_hi: Option<usize>,
) -> ActionFn {
Box::new(
move |info: SharedRunningInfo,
flag: Arc<AtomicBool>,
Expand All @@ -681,7 +697,19 @@ pub fn check_expected_connections(node_id: usize, expected_connections: usize) -
.send(GetInfo {})
.map_err(|_| ())
.and_then(move |res| {
if res.num_active_peers >= expected_connections {
let left = if let Some(expected_connections_lo) = expected_connections_lo {
expected_connections_lo <= res.num_active_peers
} else {
true
};

let right = if let Some(expected_connections_hi) = expected_connections_hi {
res.num_active_peers <= expected_connections_hi
} else {
true
};

if left && right {
flag.store(true, Ordering::Relaxed);
}
future::ok(())
Expand All @@ -692,6 +720,39 @@ pub fn check_expected_connections(node_id: usize, expected_connections: usize) -
)
}

/// Check that `node_id` has a direct connection to `target_id`.
pub fn check_direct_connection(node_id: usize, target_id: usize) -> ActionFn {
Box::new(
move |info: SharedRunningInfo,
flag: Arc<AtomicBool>,
_ctx: &mut Context<WaitOrTimeout>,
_runner| {
let info = info.read().unwrap();
let target_peer_id = info.peers_info[target_id].id.clone();

actix::spawn(
info.pm_addr
.get(node_id)
.unwrap()
.send(NetworkRequests::FetchRoutingTable)
.map_err(|_| ())
.and_then(move |res| {
if let NetworkResponses::RoutingTableInfo(routing_table) = res {
if let Some(routes) = routing_table.peer_forwarding.get(&target_peer_id)
{
if routes.contains(&target_peer_id) {
flag.store(true, Ordering::Relaxed);
}
}
}
future::ok(())
})
.map(drop),
);
},
)
}

/// Restart a node that was already stopped.
pub fn restart(node_id: usize) -> ActionFn {
Box::new(
Expand Down
14 changes: 14 additions & 0 deletions neard/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ fn default_peer_recent_time_window() -> Duration {
fn default_safe_set_size() -> u32 {
20
}
/// Lower bound of the number of connections to archival peers to keep
/// if we are an archival node.
fn default_archival_peer_connections_lower_bound() -> u32 {
10
}
/// Time to persist Accounts Id in the router without removing them in seconds.
fn default_ttl_account_id_router() -> Duration {
Duration::from_secs(TTL_ACCOUNT_ID_ROUTER)
Expand Down Expand Up @@ -215,6 +220,10 @@ pub struct Network {
/// Used to avoid disconnecting from peers we have been connected since long time.
#[serde(default = "default_safe_set_size")]
pub safe_set_size: u32,
/// Lower bound of the number of connections to archival peers to keep
/// if we are an archival node.
#[serde(default = "default_archival_peer_connections_lower_bound")]
pub archival_peer_connections_lower_bound: u32,
/// Handshake timeout.
pub handshake_timeout: Duration,
/// Duration before trying to reconnect to a peer.
Expand Down Expand Up @@ -247,6 +256,7 @@ impl Default for Network {
ideal_connections_hi: default_ideal_connections_hi(),
peer_recent_time_window: default_peer_recent_time_window(),
safe_set_size: default_safe_set_size(),
archival_peer_connections_lower_bound: default_archival_peer_connections_lower_bound(),
handshake_timeout: Duration::from_secs(20),
reconnect_delay: Duration::from_secs(60),
skip_sync_wait: false,
Expand Down Expand Up @@ -613,6 +623,9 @@ impl NearConfig {
ideal_connections_hi: config.network.ideal_connections_hi,
peer_recent_time_window: config.network.peer_recent_time_window,
safe_set_size: config.network.safe_set_size,
archival_peer_connections_lower_bound: config
.network
.archival_peer_connections_lower_bound,
ban_window: config.network.ban_window,
max_send_peers: 512,
peer_expiration_duration: Duration::from_secs(7 * 24 * 60 * 60),
Expand All @@ -624,6 +637,7 @@ impl NearConfig {
push_info_period: Duration::from_millis(100),
blacklist: blacklist_from_iter(config.network.blacklist),
outbound_disabled: false,
archive: config.archive,
},
telemetry_config: config.telemetry,
rpc_config: config.rpc,
Expand Down

0 comments on commit 971f814

Please sign in to comment.