diff --git a/Cargo.toml b/Cargo.toml index f1bcaf64d2..a6dc38f0ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,7 +51,13 @@ c8y_mapper_ext = { path = "crates/extensions/c8y_mapper_ext" } camino = "1.1" cap = "0.1" certificate = { path = "crates/common/certificate" } -clap = { version = "4.4", features = ["cargo", "derive"] } +clap = { version = "4.4", features = [ + "cargo", + "derive", + "string", + "env", + "unstable-styles", +] } clock = { path = "crates/common/clock" } collectd_ext = { path = "crates/extensions/collectd_ext" } csv = "1.1" diff --git a/crates/core/tedge/Cargo.toml b/crates/core/tedge/Cargo.toml index aecef0879c..5268c8fc3d 100644 --- a/crates/core/tedge/Cargo.toml +++ b/crates/core/tedge/Cargo.toml @@ -19,13 +19,7 @@ c8y-remote-access-plugin = { workspace = true } camino = { workspace = true } cap = { workspace = true } certificate = { workspace = true, features = ["reqwest-blocking"] } -clap = { workspace = true, features = [ - "cargo", - "derive", - "env", - "string", - "unstable-styles", -] } +clap = { workspace = true } doku = { workspace = true } hyper = { workspace = true, default-features = false } nix = { workspace = true } diff --git a/crates/core/tedge_api/src/health.rs b/crates/core/tedge_api/src/health.rs index 53a5c3f981..db43fe1cbe 100644 --- a/crates/core/tedge_api/src/health.rs +++ b/crates/core/tedge_api/src/health.rs @@ -10,7 +10,6 @@ use mqtt_channel::Topic; use serde::Deserialize; use serde::Serialize; use serde_json::json; -use serde_json::Value as JsonValue; use std::fmt::Display; use std::process; use std::sync::Arc; @@ -97,11 +96,16 @@ impl ServiceHealthTopic { } } +/// Payload of the health status message. +/// +/// Contains only fields required for the payload to be considered a valid health status message. +/// Other components are free to require additional fields for their purposes. 
+/// +/// https://thin-edge.github.io/thin-edge.io/operate/troubleshooting/monitoring-service-health/ #[derive(Deserialize, Serialize, Debug, Default)] pub struct HealthStatus { + /// Current status of the service, synced by the mapper to the cloud pub status: Status, - pub pid: Option<u32>, - pub time: Option<JsonValue>, } #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] @@ -146,11 +150,7 @@ impl HealthStatus { Ok("0") => Status::Down, _ => Status::default(), }; - HealthStatus { - status, - pid: None, - time: None, - } + HealthStatus { status } } else { serde_json::from_slice(message.payload()).unwrap_or_default() }; diff --git a/crates/core/tedge_watchdog/src/systemd_watchdog.rs b/crates/core/tedge_watchdog/src/systemd_watchdog.rs index 78741a3966..b8eb8f67a5 100644 --- a/crates/core/tedge_watchdog/src/systemd_watchdog.rs +++ b/crates/core/tedge_watchdog/src/systemd_watchdog.rs @@ -1,4 +1,3 @@ -use crate::error::WatchdogError; use anyhow::Context; use freedesktop_entry_parser::parse_entry; use futures::channel::mpsc; @@ -8,6 +7,9 @@ use futures::StreamExt; use mqtt_channel::MqttMessage; use mqtt_channel::PubChannel; use mqtt_channel::Topic; +use serde::Deserialize; +use serde::Serialize; +use serde_json::Value as JsonValue; use std::path::PathBuf; use std::process; use std::process::Command; @@ -20,7 +22,6 @@ use tedge_api::mqtt_topics::Channel; use tedge_api::mqtt_topics::EntityTopicId; use tedge_api::mqtt_topics::MqttSchema; use tedge_api::mqtt_topics::OperationType; -use tedge_api::HealthStatus; use tedge_config::TEdgeConfigLocation; use tedge_utils::timestamp::IsoOrUnix; use time::OffsetDateTime; @@ -29,6 +30,8 @@ use tracing::error; use tracing::info; use tracing::warn; +use crate::error::WatchdogError; + const SERVICE_NAME: &str = "tedge-watchdog"; /// How many times more often do we send notify to systemd watchdog, than is necessary from the @@ -38,6 +41,16 @@ const SERVICE_NAME: &str = "tedge-watchdog"; /// a timing misalignment. 
const NOTIFY_SEND_FREQ_RATIO: u64 = 4; +/// A subset of fields of health status payload required by the watchdog. +/// +/// https://thin-edge.github.io/thin-edge.io/operate/troubleshooting/monitoring-service-health/ +#[derive(Debug, Serialize, Deserialize)] +struct HealthStatusExt { + /// Used for tracking service restarts + pub pid: Option<u32>, + pub time: Option<JsonValue>, +} + pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Error> { // Send ready notification to systemd. notify_systemd(process::id(), "--ready")?; @@ -128,7 +141,6 @@ async fn start_watchdog_for_tedge_services(tedge_config_dir: PathBuf) { let tedge_config_location = tedge_config_location.clone(); watchdog_tasks.push(tokio::spawn(async move { - // let interval = Duration::from_secs((interval / NOTIFY_SEND_FREQ_RATIO).max(1)); monitor_tedge_service( tedge_config_location, @@ -243,11 +255,11 @@ async fn monitor_tedge_service( async fn get_latest_health_status_message( request_timestamp: OffsetDateTime, messages: &mut mpsc::UnboundedReceiver<MqttMessage>, -) -> Result<HealthStatus, WatchdogError> { +) -> Result<HealthStatusExt, WatchdogError> { while let Some(message) = messages.next().await { if let Ok(message) = message.payload_str() { debug!("Health response received: {message}"); - if let Ok(health_status) = serde_json::from_str::<HealthStatus>(message) { + if let Ok(health_status) = serde_json::from_str::<HealthStatusExt>(message) { if health_status.time.is_none() { error!("Ignoring invalid health response: {health_status:?} without a `time` field in it"); continue; diff --git a/crates/extensions/c8y_mapper_ext/src/service_monitor.rs b/crates/extensions/c8y_mapper_ext/src/service_monitor.rs index 40fd855f85..46bcaa3176 100644 --- a/crates/extensions/c8y_mapper_ext/src/service_monitor.rs +++ b/crates/extensions/c8y_mapper_ext/src/service_monitor.rs @@ -32,11 +32,8 @@ pub fn convert_health_status_message( return vec![]; } - let HealthStatus { - status, - pid: _, - time: _, - } = HealthStatus::try_from_health_status_message(message, mqtt_schema).unwrap(); + let HealthStatus { status } = 
+ HealthStatus::try_from_health_status_message(message, mqtt_schema).unwrap(); let display_name = entity .other @@ -83,6 +80,15 @@ mod tests { r#"102,test_device:device:main:service:tedge-mapper-c8y,service,tedge-mapper-c8y,up"#; "service-monitoring-thin-edge-device" )] + // If there are any problems with fields other than `status`, we want to ignore them and still send status update + #[test_case( + "test_device", + "te/device/main/service/tedge-mapper-c8y/status/health", + r#"{"unrecognised_field": [42], "time": "invalid timestamp", "pid": "invalid pid", "status": "up"}"#, + "c8y/s/us", + r#"102,test_device:device:main:service:tedge-mapper-c8y,service,tedge-mapper-c8y,up"#; + "service-monitoring-thin-edge-device-optional-fields-invalid" + )] #[test_case( "test_device", "te/device/child/service/tedge-mapper-c8y/status/health", diff --git a/plugins/tedge_apt_plugin/Cargo.toml b/plugins/tedge_apt_plugin/Cargo.toml index 843392da07..429adde5a1 100644 --- a/plugins/tedge_apt_plugin/Cargo.toml +++ b/plugins/tedge_apt_plugin/Cargo.toml @@ -10,7 +10,7 @@ homepage = { workspace = true } repository = { workspace = true } [dependencies] -clap = { workspace = true, features = ["derive"] } +clap = { workspace = true } csv = { workspace = true } log = { workspace = true } regex = { workspace = true }