diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml index 772a86729..332f56f51 100644 --- a/deploy/stage/common-values-iris-mpc.yaml +++ b/deploy/stage/common-values-iris-mpc.yaml @@ -1,4 +1,4 @@ -image: "ghcr.io/worldcoin/iris-mpc:v0.8.16" +image: "ghcr.io/worldcoin/iris-mpc:v0.8.18" environment: stage replicaCount: 1 diff --git a/deploy/stage/mpc1-stage/values-iris-mpc.yaml b/deploy/stage/mpc1-stage/values-iris-mpc.yaml index ce4f63032..362bfd016 100644 --- a/deploy/stage/mpc1-stage/values-iris-mpc.yaml +++ b/deploy/stage/mpc1-stage/values-iris-mpc.yaml @@ -49,6 +49,9 @@ env: - name: SMPC__PROCESSING_TIMEOUT_SECS value: "60" + + - name: SMPC__HEARTBEAT_INTERVAL_SECS + value: "5" - name: SMPC__PATH value: "/data/" diff --git a/deploy/stage/mpc2-stage/values-iris-mpc.yaml b/deploy/stage/mpc2-stage/values-iris-mpc.yaml index a5af01f3e..8fc9ead17 100644 --- a/deploy/stage/mpc2-stage/values-iris-mpc.yaml +++ b/deploy/stage/mpc2-stage/values-iris-mpc.yaml @@ -49,6 +49,9 @@ env: - name: SMPC__PROCESSING_TIMEOUT_SECS value: "60" + + - name: SMPC__HEARTBEAT_INTERVAL_SECS + value: "5" - name: SMPC__PATH value: "/data/" diff --git a/deploy/stage/mpc3-stage/values-iris-mpc.yaml b/deploy/stage/mpc3-stage/values-iris-mpc.yaml index 799567e0c..94ce08d90 100644 --- a/deploy/stage/mpc3-stage/values-iris-mpc.yaml +++ b/deploy/stage/mpc3-stage/values-iris-mpc.yaml @@ -50,6 +50,9 @@ env: - name: SMPC__PROCESSING_TIMEOUT_SECS value: "60" + - name: SMPC__HEARTBEAT_INTERVAL_SECS + value: "5" + - name: SMPC__PATH value: "/data/" diff --git a/iris-mpc-common/src/config/mod.rs b/iris-mpc-common/src/config/mod.rs index 4fd4cd906..f4785eb6d 100644 --- a/iris-mpc-common/src/config/mod.rs +++ b/iris-mpc-common/src/config/mod.rs @@ -60,6 +60,9 @@ pub struct Config { #[serde(default = "default_max_batch_size")] pub max_batch_size: usize, + + #[serde(default = "default_heartbeat_interval_secs")] + pub heartbeat_interval_secs: u64, } fn 
default_processing_timeout_secs() -> u64 { @@ -70,6 +73,10 @@ fn default_max_batch_size() -> usize { 64 } +fn default_heartbeat_interval_secs() -> u64 { + 30 +} + impl Config { pub fn load_config(prefix: &str) -> eyre::Result<Config> { let settings = config::Config::builder(); diff --git a/iris-mpc-gpu/src/server/heartbeat_nccl.rs b/iris-mpc-gpu/src/server/heartbeat_nccl.rs index 5a59ffc9e..df4f6d13e 100644 --- a/iris-mpc-gpu/src/server/heartbeat_nccl.rs +++ b/iris-mpc-gpu/src/server/heartbeat_nccl.rs @@ -7,13 +7,14 @@ use tokio::{ task::{spawn_blocking, JoinHandle}, time::timeout, }; -const HEARBEAT_INTERVAL: Duration = Duration::from_secs(5); pub async fn start_heartbeat( party_id: usize, main_tx: oneshot::Sender<eyre::Result<()>>, + heartbeat_interval_secs: u64, ) -> eyre::Result<()> { let (tx, mut rx) = mpsc::channel(1); + let heartbeat_interval = Duration::from_secs(heartbeat_interval_secs); let heartbeat_handle: JoinHandle<eyre::Result<()>> = spawn_blocking(move || { let device_manager = Arc::new(DeviceManager::init_with_streams()); @@ -61,7 +62,7 @@ pub async fn start_heartbeat( Ok(()) }())?; } - std::thread::sleep(HEARBEAT_INTERVAL); + std::thread::sleep(heartbeat_interval); counter += 1; } }); @@ -74,7 +75,7 @@ pub async fn start_heartbeat( match timeout(timeout_interval, rx.recv()).await { // The first heartbeat might take a while due to retries. However, after the connection // is established, we switch to the normal heartbeat interval. 
- Ok(Some(Ok(_))) => timeout_interval = 2 * HEARBEAT_INTERVAL, + Ok(Some(Ok(_))) => timeout_interval = 2 * heartbeat_interval, Ok(None) => { tracing::error!("Heartbeat: Channel closed."); break; diff --git a/iris-mpc/src/bin/server.rs b/iris-mpc/src/bin/server.rs index f4b6fc27f..b04604117 100644 --- a/iris-mpc/src/bin/server.rs +++ b/iris-mpc/src/bin/server.rs @@ -589,7 +589,11 @@ async fn server_main(config: Config) -> eyre::Result<()> { let mut background_tasks = TaskMonitor::new(); let (tx, rx) = oneshot::channel(); - let _heartbeat = background_tasks.spawn(start_heartbeat(config.party_id, tx)); + let _heartbeat = background_tasks.spawn(start_heartbeat( + config.party_id, + tx, + config.heartbeat_interval_secs, + )); background_tasks.check_tasks(); tracing::info!("Heartbeat starting...");