Skip to content

Commit

Permalink
Merge pull request #31079 from MaterializeInc/push-yzkkkoprlllq
Browse files Browse the repository at this point in the history
loosen version upgrade checks to allow for lts version upgrades
  • Loading branch information
doy-materialize authored Jan 28, 2025
2 parents 2e376e7 + b25e51f commit 3b86bc3
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 4 deletions.
14 changes: 13 additions & 1 deletion ci/nightly/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,18 @@ steps:
composition: platform-checks
args: [--scenario=UpgradeEntireMz, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-lts-upgrade
label: "Checks LTS upgrade, whole-Mz restart"
depends_on: build-aarch64
timeout_in_minutes: 180
parallelism: 2
agents:
queue: hetzner-aarch64-16cpu-32gb
plugins:
- ./ci/plugins/mzcompose:
composition: platform-checks
args: [--scenario=UpgradeEntireMzFromLatestLTS, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-preflight-check-rollback
label: "Checks preflight-check and roll back upgrade"
depends_on: build-aarch64
Expand Down Expand Up @@ -1676,7 +1688,7 @@ steps:
plugins:
- ./ci/plugins/mzcompose:
composition: legacy-upgrade
args: ["--versions-source=docs"]
args: ["--versions-source=docs", "--lts-upgrade"]
agents:
queue: hetzner-aarch64-4cpu-8gb

Expand Down
36 changes: 35 additions & 1 deletion misc/python/materialize/checks/scenarios_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from materialize.checks.scenarios import Scenario
from materialize.mz_version import MzVersion
from materialize.mzcompose.services.materialized import LEADER_STATUS_HEALTHCHECK
from materialize.version_list import get_published_minor_mz_versions
from materialize.version_list import LTS_VERSIONS, get_published_minor_mz_versions

# late initialization
_minor_versions: list[MzVersion] | None = None
Expand Down Expand Up @@ -83,6 +83,40 @@ def start_mz_read_only(
)


class UpgradeEntireMzFromLatestLTS(Scenario):
"""Upgrade the entire Mz instance from the last LTS version without any intermediate steps. This makes sure our LTS releases for self-managed Materialize stay upgradable."""

def base_version(self) -> MzVersion:
return LTS_VERSIONS[-1]

def actions(self) -> list[Action]:
print(f"Upgrading from tag {self.base_version()}")
return [
StartMz(
self,
tag=self.base_version(),
),
Initialize(self),
Manipulate(self, phase=1),
KillMz(
capture_logs=True
), # We always use True here otherwise docker-compose will lose the pre-upgrade logs
StartMz(
self,
tag=None,
),
Manipulate(self, phase=2),
Validate(self),
# A second restart while already on the new version
KillMz(capture_logs=True),
StartMz(
self,
tag=None,
),
Validate(self),
]


class UpgradeEntireMz(Scenario):
"""Upgrade the entire Mz instance from the last released version."""

Expand Down
5 changes: 5 additions & 0 deletions misc/python/materialize/version_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

MZ_ROOT = Path(os.environ["MZ_ROOT"])

LTS_VERSIONS = [
MzVersion.parse_mz("v0.130.1"), # v25.1.0
# Put new versions at the bottom
]

# not released on Docker
INVALID_VERSIONS = {
MzVersion.parse_mz("v0.52.1"),
Expand Down
48 changes: 47 additions & 1 deletion src/persist-client/src/cfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ use crate::operators::STORAGE_SOURCE_DECODE_FUEL;
use crate::project::OPTIMIZE_IGNORED_DATA_DECODE;
use crate::read::READER_LEASE_DURATION;

const LTS_VERSIONS: &[Version] = &[
// 25.1
Version::new(0, 130, 0),
];

/// The tunable knobs for persist.
///
/// Tuning inputs:
Expand Down Expand Up @@ -609,6 +614,10 @@ impl BlobKnobs for PersistConfig {
}
}

pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
check_data_version_with_lts_versions(code_version, data_version, LTS_VERSIONS)
}

// If persist gets some encoded ProtoState from the future (e.g. two versions of
// code are running simultaneously against the same shard), it might have a
// field that the current code doesn't know about. This would be silently
Expand All @@ -633,7 +642,44 @@ impl BlobKnobs for PersistConfig {
// data we read is going to be because we fetched it using a pointer stored in
// some persist state. If we can handle the state, we can handle the blobs it
// references, too.
pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
pub(crate) fn check_data_version_with_lts_versions(
code_version: &Version,
data_version: &Version,
lts_versions: &[Version],
) -> Result<(), String> {
// Allow upgrades specifically between consecutive LTS releases.
let base_code_version = Version {
patch: 0,
..code_version.clone()
};
let base_data_version = Version {
patch: 0,
..data_version.clone()
};
if data_version >= code_version {
for window in lts_versions.windows(2) {
if base_code_version == window[0] && base_data_version <= window[1] {
return Ok(());
}
}

if let Some(last) = lts_versions.last() {
if base_code_version == *last
// kind of arbitrary, but just ensure we don't accidentally
// upgrade too far (the previous check should ensure that a
// new version won't take over from a too-old previous
// version, but we want to make sure the other side also
// doesn't get confused)
&& base_data_version
.minor
.saturating_sub(base_code_version.minor)
< 40
{
return Ok(());
}
}
}

// Allow one minor version of forward compatibility. We could avoid the
// clone with some nested comparisons of the semver fields, but this code
// isn't particularly performance sensitive and I find this impl easier to
Expand Down
146 changes: 146 additions & 0 deletions src/persist-client/src/internal/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2061,6 +2061,152 @@ mod tests {
proptest!(|(state in any_state::<u64>(0..3))| testcase(state));
}

#[mz_ore::test]
fn check_data_versions_with_lts_versions() {
#[track_caller]
fn testcase(code: &str, data: &str, lts_versions: &[Version], expected: Result<(), ()>) {
let code = Version::parse(code).unwrap();
let data = Version::parse(data).unwrap();
let actual = cfg::check_data_version_with_lts_versions(&code, &data, lts_versions)
.map_err(|_| ());
assert_eq!(actual, expected);
}

let none = [];
let one = [Version::new(0, 130, 0)];
let two = [Version::new(0, 130, 0), Version::new(0, 140, 0)];
let three = [
Version::new(0, 130, 0),
Version::new(0, 140, 0),
Version::new(0, 150, 0),
];

testcase("0.130.0", "0.128.0", &none, Ok(()));
testcase("0.130.0", "0.129.0", &none, Ok(()));
testcase("0.130.0", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.130.1", &none, Ok(()));
testcase("0.130.1", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.131.0", &none, Ok(()));
testcase("0.130.0", "0.132.0", &none, Err(()));

testcase("0.129.0", "0.127.0", &none, Ok(()));
testcase("0.129.0", "0.128.0", &none, Ok(()));
testcase("0.129.0", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.129.1", &none, Ok(()));
testcase("0.129.1", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.130.0", &none, Ok(()));
testcase("0.129.0", "0.131.0", &none, Err(()));

testcase("0.130.0", "0.128.0", &one, Ok(()));
testcase("0.130.0", "0.129.0", &one, Ok(()));
testcase("0.130.0", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.130.1", &one, Ok(()));
testcase("0.130.1", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.131.0", &one, Ok(()));
testcase("0.130.0", "0.132.0", &one, Ok(()));

testcase("0.129.0", "0.127.0", &one, Ok(()));
testcase("0.129.0", "0.128.0", &one, Ok(()));
testcase("0.129.0", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.129.1", &one, Ok(()));
testcase("0.129.1", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.130.0", &one, Ok(()));
testcase("0.129.0", "0.131.0", &one, Err(()));

testcase("0.131.0", "0.129.0", &one, Ok(()));
testcase("0.131.0", "0.130.0", &one, Ok(()));
testcase("0.131.0", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.131.1", &one, Ok(()));
testcase("0.131.1", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.132.0", &one, Ok(()));
testcase("0.131.0", "0.133.0", &one, Err(()));

testcase("0.130.0", "0.128.0", &two, Ok(()));
testcase("0.130.0", "0.129.0", &two, Ok(()));
testcase("0.130.0", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.130.1", &two, Ok(()));
testcase("0.130.1", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.131.0", &two, Ok(()));
testcase("0.130.0", "0.132.0", &two, Ok(()));
testcase("0.130.0", "0.135.0", &two, Ok(()));
testcase("0.130.0", "0.138.0", &two, Ok(()));
testcase("0.130.0", "0.139.0", &two, Ok(()));
testcase("0.130.0", "0.140.0", &two, Ok(()));
testcase("0.130.9", "0.140.0", &two, Ok(()));
testcase("0.130.0", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.9", &two, Ok(()));
testcase("0.130.0", "0.141.0", &two, Err(()));
testcase("0.129.0", "0.133.0", &two, Err(()));
testcase("0.129.0", "0.140.0", &two, Err(()));
testcase("0.131.0", "0.133.0", &two, Err(()));
testcase("0.131.0", "0.140.0", &two, Err(()));

testcase("0.130.0", "0.128.0", &three, Ok(()));
testcase("0.130.0", "0.129.0", &three, Ok(()));
testcase("0.130.0", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.130.1", &three, Ok(()));
testcase("0.130.1", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.131.0", &three, Ok(()));
testcase("0.130.0", "0.132.0", &three, Ok(()));
testcase("0.130.0", "0.135.0", &three, Ok(()));
testcase("0.130.0", "0.138.0", &three, Ok(()));
testcase("0.130.0", "0.139.0", &three, Ok(()));
testcase("0.130.0", "0.140.0", &three, Ok(()));
testcase("0.130.9", "0.140.0", &three, Ok(()));
testcase("0.130.0", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.9", &three, Ok(()));
testcase("0.130.0", "0.141.0", &three, Err(()));
testcase("0.129.0", "0.133.0", &three, Err(()));
testcase("0.129.0", "0.140.0", &three, Err(()));
testcase("0.131.0", "0.133.0", &three, Err(()));
testcase("0.131.0", "0.140.0", &three, Err(()));
testcase("0.130.0", "0.150.0", &three, Err(()));

testcase("0.140.0", "0.138.0", &three, Ok(()));
testcase("0.140.0", "0.139.0", &three, Ok(()));
testcase("0.140.0", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.140.1", &three, Ok(()));
testcase("0.140.1", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.141.0", &three, Ok(()));
testcase("0.140.0", "0.142.0", &three, Ok(()));
testcase("0.140.0", "0.145.0", &three, Ok(()));
testcase("0.140.0", "0.148.0", &three, Ok(()));
testcase("0.140.0", "0.149.0", &three, Ok(()));
testcase("0.140.0", "0.150.0", &three, Ok(()));
testcase("0.140.9", "0.150.0", &three, Ok(()));
testcase("0.140.0", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.9", &three, Ok(()));
testcase("0.140.0", "0.151.0", &three, Err(()));
testcase("0.139.0", "0.143.0", &three, Err(()));
testcase("0.139.0", "0.150.0", &three, Err(()));
testcase("0.141.0", "0.143.0", &three, Err(()));
testcase("0.141.0", "0.150.0", &three, Err(()));

testcase("0.150.0", "0.148.0", &three, Ok(()));
testcase("0.150.0", "0.149.0", &three, Ok(()));
testcase("0.150.0", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.150.1", &three, Ok(()));
testcase("0.150.1", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.151.0", &three, Ok(()));
testcase("0.150.0", "0.152.0", &three, Ok(()));
testcase("0.150.0", "0.155.0", &three, Ok(()));
testcase("0.150.0", "0.158.0", &three, Ok(()));
testcase("0.150.0", "0.159.0", &three, Ok(()));
testcase("0.150.0", "0.160.0", &three, Ok(()));
testcase("0.150.9", "0.160.0", &three, Ok(()));
testcase("0.150.0", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.9", &three, Ok(()));
testcase("0.150.0", "0.161.0", &three, Ok(()));
testcase("0.149.0", "0.153.0", &three, Err(()));
testcase("0.149.0", "0.160.0", &three, Err(()));
testcase("0.151.0", "0.153.0", &three, Err(()));
testcase("0.151.0", "0.160.0", &three, Err(()));
}

#[mz_ore::test]
fn check_data_versions() {
#[track_caller]
Expand Down
17 changes: 16 additions & 1 deletion test/legacy-upgrade/mzcompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from materialize.mzcompose.services.testdrive import Testdrive
from materialize.mzcompose.services.zookeeper import Zookeeper
from materialize.version_list import (
LTS_VERSIONS,
VersionsFromDocs,
get_all_published_mz_versions,
get_published_minor_mz_versions,
Expand Down Expand Up @@ -80,6 +81,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
help="from what source to fetch the versions",
)
parser.add_argument("--ignore-missing-version", action="store_true")
parser.add_argument("--lts-upgrade", action="store_true")
args = parser.parse_args()

parallelism_index = buildkite.get_parallelism_index()
Expand Down Expand Up @@ -131,6 +133,18 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
test_upgrade_from_version(
c, "current_source", priors=[], filter=args.filter, zero_downtime=False
)
if args.lts_upgrade:
# Direct upgrade from latest LTS version without any inbetween versions
version = LTS_VERSIONS[-1]
priors = [v for v in all_versions if v <= version]
test_upgrade_from_version(
c,
f"{version}",
priors,
filter=args.filter,
zero_downtime=False,
lts_upgrade=True,
)


def get_all_and_latest_two_minor_mz_versions(
Expand All @@ -152,6 +166,7 @@ def test_upgrade_from_version(
priors: list[MzVersion],
filter: str,
zero_downtime: bool,
lts_upgrade: bool = False,
) -> None:
print(
f"+++ Testing {'0dt upgrade' if zero_downtime else 'regular upgrade'} from Materialize {from_version} to current_source."
Expand Down Expand Up @@ -241,7 +256,7 @@ def test_upgrade_from_version(
c.kill(mz_service)
c.rm(mz_service, "testdrive")

if from_version != "current_source":
if from_version != "current_source" and not lts_upgrade:
# We can't skip in-between minor versions anymore, so go through all of them
for version in get_published_minor_mz_versions(newest_first=False):
if version <= from_version:
Expand Down

0 comments on commit 3b86bc3

Please sign in to comment.