From 793385e0a4329bab9c44f479b196bc69e753927a Mon Sep 17 00:00:00 2001
From: Mark Johnston <markj@FreeBSD.org>
Date: Fri, 26 Apr 2024 14:12:50 -0400
Subject: [PATCH] freebsd-relayd: Fix problems with HCE/PFE/relay desync after
 reload

It has been observed that relayd will sometimes exit with an assertion
failure immediately after its config has been reloaded (i.e., SIGHUP was
sent or "relayctl reload" was used).  Usually it's because the PFE was
handling a host status update (IMSG_HOST_STATUS) and failed to find the
host in question or the check counts aren't matching.

relayd configuration reloads are implemented by the parent, which
re-reads the configuration file, builds up a new set of data structures
(hosts, tables, redirects, etc.), and sends copies to each worker.
The sequence is:
1. parent -> IMSG_CTL_RESET -> workers, causing workers to free all of
   the existing data structures.
2. parent -> IMSG_CFG_{TABLE,HOST,...} -> workers, giving state to
   workers.
3. parent -> IMSG_CFG_DONE -> workers, indicating that it's done with
   the configuration update.
4. workers -> IMSG_CFG_DONE -> parent, indicating workers are ready to
   start.
5. parent -> IMSG_CFG_START -> workers, only once all messages in step 4
   have been received by the parent.

The structures sent in step 2 each have a unique ID (modulo 32-bit
object ID rollover), so a host object ID is only valid in one
"configuration epoch".  When a reload starts, a new configuration epoch
begins, and all IDs belonging to the previous epoch become invalid.

The basic problem is that there is no global ordering of IPC messages
sent between workers and the parent.  Each process in relayd receives
messages from multiple queues, and these can be interleaved or delayed
indefinitely.  For instance, IMSG_HOST_STATUS (sent from HCE -> PFE, and
PFE -> relay) refers to a host by ID, but there is nothing ensuring that
the status update is handled in the same epoch in which it was sent.
Such mismatches result in the assertion failures mentioned above.

One solution is simply to remove checks and drop messages if they refer
to non-existent objects.  However, this might hide bugs and generally
makes the code harder to reason about.  My solution is to make the
notion of configuration epoch explicit in relayd worker state, and to
tag certain messages with the current epoch.  Then workers can decide
what to do about mismatches.  So:

1. Add a 64-bit epoch counter, incremented in step 1 of the reload
   procedure above.  That is, each worker has a notion of the current
   epoch, incremented when IMSG_CTL_RESET is received.
2. When the HCE schedules host checks, it stamps the host with the
   current epoch.  When IMSG_HOST_STATUS is set to the PFE and to
   relays, the receiver drops the message if the epoch doesn't match.
   This ensures that old host status messages are correctly ignored.
3. "script" checks involve a round-trip through the parent process which
   isn't cancelled when the HCE receives IMSG_CTL_RESET.  Thus, imbue
   script state with the current epoch as well.
4. Ensure that the PFE and HCE actually disable events when CTL_RESET is
   received.  Previously, hce_disable_events() and pfe_disable_events()
   weren't getting called at all.
5. Make sure that relayctl can't be used to schedule a check while a
   configuration reload is in progress.

This was sufficient to eliminate assertion failures in a stress test
which spams relayd with SIGHUP and relayctl poll commands.  I suspect
that my coverage of item 5 above isn't wide enough; that is, other
relayctl commands (e.g., relayctl table disable) might still cause
problems.
---
 usr.sbin/relayd/check_script.c |  7 +++++++
 usr.sbin/relayd/config.c       |  4 ++++
 usr.sbin/relayd/hce.c          | 23 +++++++++++++++--------
 usr.sbin/relayd/pfe.c          | 14 ++++++++++----
 usr.sbin/relayd/relay.c        |  2 ++
 usr.sbin/relayd/relayd.h       |  7 ++++++-
 6 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/usr.sbin/relayd/check_script.c b/usr.sbin/relayd/check_script.c
index d0cd1f6fc7ff..e162a0942ee2 100644
--- a/usr.sbin/relayd/check_script.c
+++ b/usr.sbin/relayd/check_script.c
@@ -48,6 +48,7 @@ check_script(struct relayd *env, struct host *host)
 	host->flags &= ~(F_CHECK_SENT|F_CHECK_DONE);
 
 	scr.host = host->conf.id;
+	scr.config_gen = env->sc_config_gen;
 	if ((strlcpy(scr.name, host->conf.name,sizeof(scr.name)) >=
 	    sizeof(scr.name)) ||
 	    (strlcpy(scr.path, table->conf.path, sizeof(scr.path)) >=
@@ -65,6 +66,12 @@ script_done(struct relayd *env, struct ctl_script *scr)
 {
 	struct host		*host;
 
+	if (scr->config_gen != env->sc_config_gen) {
+		log_debug("script check for %s interrupted, ignoring",
+		    scr->name);
+		return;
+	}
+
 	if ((host = host_find(env, scr->host)) == NULL)
 		fatalx("%s: invalid host id", __func__);
 
diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
index 48c35c0b1056..a6a626d70421 100644
--- a/usr.sbin/relayd/config.c
+++ b/usr.sbin/relayd/config.c
@@ -153,6 +153,8 @@ config_purge(struct relayd *env, u_int reset)
 	struct keyname		*keyname;
 	u_int			 what;
 
+	env->sc_started = 0;
+
 	what = ps->ps_what[privsep_process] & reset;
 
 	if (what & CONFIG_TABLES && env->sc_tables != NULL) {
@@ -265,6 +267,8 @@ config_getreset(struct relayd *env, struct imsg *imsg)
 
 	config_purge(env, mode);
 
+	env->sc_config_gen++;
+
 	return (0);
 }
 
diff --git a/usr.sbin/relayd/hce.c b/usr.sbin/relayd/hce.c
index a1cb0683bfed..a667ef9ea6d2 100644
--- a/usr.sbin/relayd/hce.c
+++ b/usr.sbin/relayd/hce.c
@@ -82,11 +82,10 @@ hce_setup_events(void)
 	struct timeval	 tv;
 	struct table	*table;
 
-	if (!event_initialized(&env->sc_ev)) {
+	if (!event_initialized(&env->sc_ev))
 		evtimer_set(&env->sc_ev, hce_launch_checks, env);
-		bzero(&tv, sizeof(tv));
-		evtimer_add(&env->sc_ev, &tv);
-	}
+	bzero(&tv, sizeof(tv));
+	evtimer_add(&env->sc_ev, &tv);
 
 	if (env->sc_conf.flags & F_TLS) {
 		TAILQ_FOREACH(table, env->sc_tables, entry) {
@@ -100,6 +99,8 @@ hce_setup_events(void)
 			tls_config_insecure_noverifyname(table->tls_cfg);
 		}
 	}
+
+	env->sc_started = 1;
 }
 
 void
@@ -170,6 +171,7 @@ hce_launch_checks(int fd, short event, void *arg)
 				continue;
 			bcopy(&tv, &host->cte.tv_start,
 			    sizeof(host->cte.tv_start));
+			host->config_gen = env->sc_config_gen;
 			switch (table->conf.check) {
 			case CHECK_ICMP:
 				schedule_icmp(env, host);
@@ -257,6 +259,7 @@ hce_notify_done(struct host *host, enum host_error he)
 	st.up = host->up;
 	st.check_cnt = host->check_cnt;
 	st.retry_cnt = host->retry_cnt;
+	st.config_gen = host->config_gen;
 	st.he = he;
 	host->flags |= (F_CHECK_SENT|F_CHECK_DONE);
 	msg = host_error(he);
@@ -305,6 +308,7 @@ hce_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
 	objid_t			 id;
 	struct host		*host;
 	struct table		*table;
+	u_int64_t		 config_gen;
 
 	switch (imsg->hdr.type) {
 	case IMSG_HOST_DISABLE:
@@ -342,10 +346,12 @@ hce_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
 			host->up = HOST_UNKNOWN;
 		break;
 	case IMSG_CTL_POLL:
-		evtimer_del(&env->sc_ev);
-		TAILQ_FOREACH(table, env->sc_tables, entry)
-			table->skipped = 0;
-		hce_launch_checks(-1, EV_TIMEOUT, env);
+		if (env->sc_started) {
+			evtimer_del(&env->sc_ev);
+			TAILQ_FOREACH(table, env->sc_tables, entry)
+				table->skipped = 0;
+			hce_launch_checks(-1, EV_TIMEOUT, env);
+		}
 		break;
 	default:
 		return (-1);
@@ -378,6 +384,7 @@ hce_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
 		hce_setup_events();
 		break;
 	case IMSG_CTL_RESET:
+		hce_disable_events();
 		config_getreset(env, imsg);
 		break;
 	default:
diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index d17f819ccde4..7b848c4e56dd 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -125,11 +125,12 @@ pfe_setup_events(void)
 	struct timeval	 tv;
 
 	/* Schedule statistics timer */
-	if (!event_initialized(&env->sc_statev)) {
+	if (!event_initialized(&env->sc_statev))
 		evtimer_set(&env->sc_statev, pfe_statistics, NULL);
-		bcopy(&env->sc_conf.statinterval, &tv, sizeof(tv));
-		evtimer_add(&env->sc_statev, &tv);
-	}
+	bcopy(&env->sc_conf.statinterval, &tv, sizeof(tv));
+	evtimer_add(&env->sc_statev, &tv);
+
+	env->sc_started = 1;
 }
 
 void
@@ -151,6 +152,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct imsg *imsg)
 	case IMSG_HOST_STATUS:
 		IMSG_SIZE_CHECK(imsg, &st);
 		memcpy(&st, imsg->data, sizeof(st));
+		if (st.config_gen != env->sc_config_gen) {
+			log_debug("%s: reload gen mismatch", __func__);
+			break;
+		}
 		if ((host = host_find(env, st.id)) == NULL)
 			fatalx("%s: invalid host id", __func__);
 		host->he = st.he;
@@ -286,6 +291,7 @@ pfe_dispatch_relay(int fd, struct privsep_proc *p, struct imsg *imsg)
 	struct relay		*rlay;
 	struct ctl_conn		*c;
 	struct rsession		 con, *s, *t;
+	u_int64_t		 config_gen;
 	int			 cid;
 	objid_t			 sid;
 
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index de94a2d9c3de..d51428369809 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -1976,6 +1976,8 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
 	case IMSG_HOST_STATUS:
 		IMSG_SIZE_CHECK(imsg, &st);
 		memcpy(&st, imsg->data, sizeof(st));
+		if (st.config_gen != env->sc_config_gen)
+			break;
 		if ((host = host_find(env, st.id)) == NULL)
 			fatalx("%s: invalid host id", __func__);
 		if (host->flags & F_DISABLE)
diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
index 40e31aebd869..6544c5282fce 100644
--- a/usr.sbin/relayd/relayd.h
+++ b/usr.sbin/relayd/relayd.h
@@ -162,6 +162,7 @@ struct ctl_status {
 	int		 up;
 	int		 retry_cnt;
 	u_long		 check_cnt;
+	u_int64_t	 config_gen;
 	u_int16_t	 he;
 };
 
@@ -193,6 +194,7 @@ struct ctl_relayfd {
 
 struct ctl_script {
 	objid_t		 host;
+	u_int64_t	 config_gen;
 	int		 retval;
 	struct timeval	 timeout;
 	char		 name[HOST_NAME_MAX+1];
@@ -481,6 +483,7 @@ struct host {
 	u_long			 up_cnt;
 	int			 retry_cnt;
 	int			 idx;
+	u_int64_t		 config_gen;
 	u_int16_t		 he;
 	int			 code;
 	struct ctl_tcp_event	 cte;
@@ -1059,7 +1062,7 @@ enum imsg_type {
 	IMSG_CA_PRIVDEC,
 	IMSG_SESS_PUBLISH,	/* from relay to pfe */
 	IMSG_SESS_UNPUBLISH,
-	IMSG_TLSTICKET_REKEY
+	IMSG_TLSTICKET_REKEY,
 };
 
 enum privsep_procid {
@@ -1206,6 +1209,8 @@ struct relayd {
 
 	struct privsep		*sc_ps;
 	int			 sc_reload;
+	int			 sc_started;
+	u_int64_t		 sc_config_gen;
 };
 
 #define RELAYD_OPT_VERBOSE		0x01