From c2b350023ba849d1b33142592264aaa51fcb7f1e Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Fri, 11 Oct 2024 16:31:50 +0000 Subject: [PATCH] refreshes ContactInfo.outset before initializing validator (#3135) Nodes join gossip during bootstrap process with a stub contact-info which in particular has invalid TVU socket address. Once the bootstrap is done they re-join gossip a 2nd time with a fully populated contact-info, but this contact-info has an outset timestamp older than the 1st one because it was initiated earlier. In v2.0 the outset timestamp determines which contact-info overrides the other, so the v2.0 nodes refrain from updating their CRDS table with the fully initialized contact-info. The commit refreshes ContactInfo.outset before initializing the validator so that it overrides the one pushed to the gossip by the bootstrap stage. --- gossip/src/cluster_info.rs | 2 +- gossip/src/contact_info.rs | 19 ++++++++++++------- turbine/src/cluster_nodes.rs | 2 +- validator/src/main.rs | 7 +++++++ 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/gossip/src/cluster_info.rs b/gossip/src/cluster_info.rs index af597a230fba53..2706fac5bdaf39 100644 --- a/gossip/src/cluster_info.rs +++ b/gossip/src/cluster_info.rs @@ -666,7 +666,7 @@ impl ClusterInfo { *instance = NodeInstance::new(&mut thread_rng(), id, timestamp()); } *self.keypair.write().unwrap() = new_keypair; - self.my_contact_info.write().unwrap().set_pubkey(id); + self.my_contact_info.write().unwrap().hot_swap_pubkey(id); self.insert_self(); self.push_message(CrdsValue::new_signed( diff --git a/gossip/src/contact_info.rs b/gossip/src/contact_info.rs index b3ca9c94a762a5..2174a82af7274b 100644 --- a/gossip/src/contact_info.rs +++ b/gossip/src/contact_info.rs @@ -181,11 +181,7 @@ impl ContactInfo { Self { pubkey, wallclock, - outset: { - let now = SystemTime::now(); - let elapsed = now.duration_since(UNIX_EPOCH).unwrap(); - u64::try_from(elapsed.as_micros()).unwrap() - }, + outset: 
get_node_outset(), shred_version, version: solana_version::Version::default(), addrs: Vec::<IpAddr>::default(), @@ -210,8 +206,11 @@ impl ContactInfo { self.shred_version } - pub fn set_pubkey(&mut self, pubkey: Pubkey) { - self.pubkey = pubkey + pub fn hot_swap_pubkey(&mut self, pubkey: Pubkey) { + self.pubkey = pubkey; + // Need to update ContactInfo.outset so that this node's contact-info + // will override older node with the same pubkey. + self.outset = get_node_outset(); } pub fn set_wallclock(&mut self, wallclock: u64) { @@ -409,6 +408,12 @@ impl ContactInfo { } } +fn get_node_outset() -> u64 { + let now = SystemTime::now(); + let elapsed = now.duration_since(UNIX_EPOCH).unwrap(); + u64::try_from(elapsed.as_micros()).unwrap() +} + impl<'de> Deserialize<'de> for ContactInfo { fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where diff --git a/turbine/src/cluster_nodes.rs b/turbine/src/cluster_nodes.rs index d2e3da68a4b7af..743a3202cba767 100644 --- a/turbine/src/cluster_nodes.rs +++ b/turbine/src/cluster_nodes.rs @@ -547,7 +547,7 @@ pub fn make_test_cluster<R: Rng>( .collect(); nodes.shuffle(rng); let keypair = Arc::new(Keypair::new()); - nodes[0].set_pubkey(keypair.pubkey()); + nodes[0] = ContactInfo::new_localhost(&keypair.pubkey(), /*wallclock:*/ timestamp()); let this_node = nodes[0].clone(); let mut stakes: HashMap<Pubkey, u64> = nodes .iter() diff --git a/validator/src/main.rs b/validator/src/main.rs index db65c890a7542e..444c1513100fe9 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -1901,6 +1901,13 @@ pub fn main() { return; } + // Bootstrap code above pushes a contact-info with more recent timestamp to + // gossip. If the node is staked the contact-info lingers in gossip causing + // false duplicate nodes error. + // Below line refreshes the timestamp on contact-info so that it overrides + // the one pushed by bootstrap. + node.info.hot_swap_pubkey(identity_keypair.pubkey()); + let validator = Validator::new( node, identity_keypair,