Skip to content

Commit

Permalink
Merge pull request #31293 from aljoscha/0dt-notice-ddl-changes
Browse files Browse the repository at this point in the history
0dt: in preflight checks, notice DDL changes and restart read-only envd
  • Loading branch information
aljoscha authored Feb 10, 2025
2 parents 40af533 + 1f69ebd commit 445cbeb
Show file tree
Hide file tree
Showing 10 changed files with 795 additions and 21 deletions.
1 change: 1 addition & 0 deletions misc/python/materialize/cli/ci_annotate_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
# For 0dt upgrades
| halting\ process:\ (unable\ to\ confirm\ leadership|fenced\ out\ old\ deployment;\ rebooting\ as\ leader|this\ deployment\ has\ been\ fenced\ out)
| zippy-materialized.* \| .* halting\ process:\ Server\ started\ with\ requested\ generation
| there\ have\ been\ DDL\ that\ we\ need\ to\ react\ to;\ rebooting\ in\ read-only\ mode
# Don't care for ssh problems
| fatal:\ userauth_pubkey
# Fences without incrementing deploy generation
Expand Down
7 changes: 7 additions & 0 deletions src/adapter-types/src/dyncfgs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ pub const WITH_0DT_DEPLOYMENT_MAX_WAIT: Config<Duration> = Config::new(
"How long to wait at most for clusters to be hydrated, when doing a zero-downtime deployment.",
);

/// Dynamic config controlling how often DDL changes are checked for during a
/// zero-downtime (0dt) deployment.
///
/// The value passed to `Config::new` below is 5 minutes (presumably the
/// default — confirm against `Config::new`).
pub const WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL: Config<Duration> = Config::new(
"with_0dt_deployment_ddl_check_interval",
// 5 minutes, expressed in seconds.
Duration::from_secs(5 * 60),
"How often to check for DDL changes during zero-downtime deployment.",
);

pub const ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT: Config<bool> = Config::new(
"enable_0dt_deployment_panic_after_timeout",
false,
Expand Down Expand Up @@ -117,6 +123,7 @@ pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet {
.add(&ALLOW_USER_SESSIONS)
.add(&ENABLE_0DT_DEPLOYMENT)
.add(&WITH_0DT_DEPLOYMENT_MAX_WAIT)
.add(&WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL)
.add(&ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT)
.add(&WITH_0DT_DEPLOYMENT_CAUGHT_UP_CHECK_INTERVAL)
.add(&ENABLE_0DT_CAUGHT_UP_CHECK)
Expand Down
13 changes: 12 additions & 1 deletion src/adapter/src/catalog/transact.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use itertools::Itertools;
use mz_adapter_types::compaction::CompactionWindow;
use mz_adapter_types::connection::ConnectionId;
use mz_adapter_types::dyncfgs::{
ENABLE_0DT_DEPLOYMENT, ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT, WITH_0DT_DEPLOYMENT_MAX_WAIT,
ENABLE_0DT_DEPLOYMENT, ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT,
WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL, WITH_0DT_DEPLOYMENT_MAX_WAIT,
};
use mz_audit_log::{
CreateOrDropClusterReplicaReasonV1, EventDetails, EventType, IdFullNameV1, IdNameV1,
Expand Down Expand Up @@ -2268,6 +2269,13 @@ impl Catalog {
Duration::parse(VarInput::Flat(&parsed_value))
.expect("parsing succeeded above");
tx.set_0dt_deployment_max_wait(with_0dt_deployment_max_wait)?;
} else if name == WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL.name() {
let with_0dt_deployment_ddl_check_interval =
Duration::parse(VarInput::Flat(&parsed_value))
.expect("parsing succeeded above");
tx.set_0dt_deployment_ddl_check_interval(
with_0dt_deployment_ddl_check_interval,
)?;
} else if name == ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT.name() {
let panic_after_timeout =
strconv::parse_bool(&parsed_value).expect("parsing succeeded above");
Expand Down Expand Up @@ -2297,6 +2305,8 @@ impl Catalog {
tx.reset_enable_0dt_deployment()?;
} else if name == WITH_0DT_DEPLOYMENT_MAX_WAIT.name() {
tx.reset_0dt_deployment_max_wait()?;
} else if name == WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL.name() {
tx.reset_0dt_deployment_ddl_check_interval()?;
}

CatalogState::add_to_audit_log(
Expand All @@ -2314,6 +2324,7 @@ impl Catalog {
tx.clear_system_configs();
tx.reset_enable_0dt_deployment()?;
tx.reset_0dt_deployment_max_wait()?;
tx.reset_0dt_deployment_ddl_check_interval()?;

CatalogState::add_to_audit_log(
&state.system_configuration,
Expand Down
9 changes: 9 additions & 0 deletions src/catalog/src/durable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,15 @@ pub trait OpenableDurableCatalogState: Debug + Send {
/// LaunchDarkly is available.
async fn get_0dt_deployment_max_wait(&mut self) -> Result<Option<Duration>, CatalogError>;

/// Get the `with_0dt_deployment_ddl_check_interval` config value of this instance.
///
/// This mirrors the `with_0dt_deployment_ddl_check_interval` "system var" so that
/// the interval can be changed through LaunchDarkly, but still be read in boot
/// before LaunchDarkly is available.
///
/// NOTE(review): `Ok(None)` presumably means that no value has been stored yet;
/// confirm against the implementations of this trait.
async fn get_0dt_deployment_ddl_check_interval(
&mut self,
) -> Result<Option<Duration>, CatalogError>;

/// Get the `enable_0dt_deployment_panic_after_timeout` config value of this
/// instance.
///
Expand Down
9 changes: 9 additions & 0 deletions src/catalog/src/durable/initialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ pub(crate) const ENABLE_0DT_DEPLOYMENT: &str = "enable_0dt_deployment";
/// NOTE: Weird prefix because we can't start with a `0`.
pub(crate) const WITH_0DT_DEPLOYMENT_MAX_WAIT: &str = "with_0dt_deployment_max_wait";

/// The key used within the "config" collection where we store a mirror of the
/// `with_0dt_deployment_ddl_check_interval` "system var" value. The value is
/// mirrored so that the interval can be changed through LaunchDarkly, but
/// still be used in boot before LaunchDarkly is available.
///
/// NOTE: Weird prefix because we can't start with a `0`.
pub(crate) const WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL: &str =
"with_0dt_deployment_ddl_check_interval";

/// The key used within the "config" collection where we store a mirror of the
/// `enable_0dt_deployment_panic_after_timeout` "system var" value. This is
/// mirrored so that we can toggle the flag with LaunchDarkly, but use it in
Expand Down
15 changes: 14 additions & 1 deletion src/catalog/src/durable/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ use crate::durable::debug::{Collection, CollectionType, DebugCatalogState, Trace
use crate::durable::error::FenceError;
use crate::durable::initialize::{
ENABLE_0DT_DEPLOYMENT, ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT, SYSTEM_CONFIG_SYNCED_KEY,
USER_VERSION_KEY, WITH_0DT_DEPLOYMENT_MAX_WAIT,
USER_VERSION_KEY, WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL, WITH_0DT_DEPLOYMENT_MAX_WAIT,
};
use crate::durable::metrics::Metrics;
use crate::durable::objects::state_update::{
Expand Down Expand Up @@ -1467,6 +1467,19 @@ impl OpenableDurableCatalogState for UnopenedPersistCatalogState {
}
}

#[mz_ore::instrument(level = "debug")]
async fn get_0dt_deployment_ddl_check_interval(
    &mut self,
) -> Result<Option<Duration>, CatalogError> {
    // Look up the mirrored config entry; the stored number is interpreted as
    // a count of milliseconds.
    let stored_millis = self
        .get_current_config(WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL)
        .await?;
    // A missing entry stays `None`; a present one becomes a `Duration`.
    Ok(stored_millis.map(Duration::from_millis))
}

#[mz_ore::instrument(level = "debug")]
async fn get_enable_0dt_deployment_panic_after_timeout(
&mut self,
Expand Down
33 changes: 32 additions & 1 deletion src/catalog/src/durable/transaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ use mz_storage_types::controller::StorageError;
use crate::builtin::BuiltinLog;
use crate::durable::initialize::{
ENABLE_0DT_DEPLOYMENT, ENABLE_0DT_DEPLOYMENT_PANIC_AFTER_TIMEOUT, SYSTEM_CONFIG_SYNCED_KEY,
WITH_0DT_DEPLOYMENT_MAX_WAIT,
WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL, WITH_0DT_DEPLOYMENT_MAX_WAIT,
};
use crate::durable::objects::serialization::proto;
use crate::durable::objects::{
Expand Down Expand Up @@ -1876,6 +1876,27 @@ impl<'a> Transaction<'a> {
)
}

/// Writes `value` into the catalog "config" collection under the
/// `with_0dt_deployment_ddl_check_interval` key, so that it matches the
/// `with_0dt_deployment_ddl_check_interval` "system var" value.
///
/// The value is mirrored so that the interval can be controlled through
/// LaunchDarkly, but still be used in boot before LaunchDarkly is available.
///
/// # Panics
///
/// Panics if the interval, expressed in whole milliseconds, does not fit
/// into the integer type accepted by `set_config`.
pub fn set_0dt_deployment_ddl_check_interval(
    &mut self,
    value: Duration,
) -> Result<(), CatalogError> {
    // The interval is stored as a count of whole milliseconds.
    let interval_millis = value
        .as_millis()
        .try_into()
        .expect("ddl check interval fits into u64");
    self.set_config(
        WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL.into(),
        Some(interval_millis),
    )
}

/// Updates the catalog `0dt_deployment_panic_after_timeout` "config" value to
/// match the `0dt_deployment_panic_after_timeout` "system var" value.
///
Expand Down Expand Up @@ -1909,6 +1930,16 @@ impl<'a> Transaction<'a> {
self.set_config(WITH_0DT_DEPLOYMENT_MAX_WAIT.into(), None)
}

/// Resets the catalog `with_0dt_deployment_ddl_check_interval` "config"
/// entry by writing `None` for its key, matching a reset of the
/// `with_0dt_deployment_ddl_check_interval` "system var".
///
/// The value is mirrored so that the interval can be controlled through
/// LaunchDarkly, but still be used in boot before LaunchDarkly is available.
pub fn reset_0dt_deployment_ddl_check_interval(&mut self) -> Result<(), CatalogError> {
    let key = WITH_0DT_DEPLOYMENT_DDL_CHECK_INTERVAL.into();
    self.set_config(key, None)
}

/// Removes the catalog `enable_0dt_deployment_panic_after_timeout` "config"
/// value to match the `enable_0dt_deployment_panic_after_timeout` "system
/// var" value.
Expand Down
Loading

0 comments on commit 445cbeb

Please sign in to comment.