From 6315b7b6889df3c69b480e35fd71820ca4a69a70 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 20 Sep 2024 11:05:59 +0100 Subject: [PATCH 1/5] pageserver: non-zero shards learn GC offset from shard zero --- .../src/tenant/remote_timeline_client.rs | 23 ++++ pageserver/src/tenant/timeline.rs | 116 +++++++++++++----- 2 files changed, 105 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 94f42c782782..b910a405471b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -197,6 +197,7 @@ use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; use utils::pausable_failpoint; +use utils::shard::ShardNumber; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; @@ -2231,6 +2232,28 @@ impl RemoteTimelineClient { UploadQueue::Initialized(x) => x.no_pending_work(), } } + + /// 'foreign' in the sense that it does not belong to this tenant shard. This method + /// is used during GC for other shards to get the index of shard zero. + pub(crate) async fn download_foreign_index( + &self, + shard_number: ShardNumber, + cancel: &CancellationToken, + ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> { + let foreign_shard_id = TenantShardId { + shard_number, + shard_count: self.tenant_shard_id.shard_count, + tenant_id: self.tenant_shard_id.tenant_id, + }; + download_index_part( + &self.storage_impl, + &foreign_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await + } } pub(crate) struct UploadQueueAccessor<'a> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2bc14ec3172c..a7fadb553fab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -38,6 +38,7 @@ use pageserver_api::{ shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; +use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; use tokio::{ @@ -4774,6 +4775,86 @@ impl Timeline { Ok(()) } + async fn find_gc_time_cutoff( + &self, + pitr: Duration, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + if self.shard_identity.is_shard_zero() { + // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself + let now = SystemTime::now(); + let time_range = if pitr == Duration::ZERO { + humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") + } else { + pitr + }; + + // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) + let time_cutoff = now.checked_sub(time_range).unwrap_or(now); + let timestamp = to_pg_timestamp(time_cutoff); + + let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { + LsnForTimestamp::Present(lsn) => Some(lsn), + LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. + debug!("future({})", lsn); + Some(self.get_last_record_lsn()) + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + None + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + None + } + }; + Ok(time_cutoff) + } else { + // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff + // from shard zero's index. The index doesn't explicitly tell us the time cutoff, but we may assume that + // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a + // space cutoff that we would also have respected ourselves. + match self + .remote_client + .download_foreign_index(ShardNumber(0), cancel) + .await + { + Ok((index_part, index_generation, _index_mtime)) => { + tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn()); + Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) + } + Err(DownloadError::NotFound) => { + // This is unexpected, because during timeline creations shard zero persists to remote + // storage before other shards are called, and during timeline deletion non-zeroth shards are + // deleted before the zeroth one. However, it should be harmless: if we somehow end up in this + // state, then shard zero should _eventually_ write an index when it GCs. + tracing::warn!("GC couldn't find shard zero's index for timeline"); + Ok(None) + } + Err(e) => { + // TODO: this function should return a different error type than page reconstruct error + Err(PageReconstructError::Other(anyhow::anyhow!(e))) + } + } + + // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage + // controller. Otherwise, it is possible that we see the GC cutoff go backwards while shard zero + // is going through a migration if we read the old location's index and it has GC'd ahead of the + // new location. This is legal in principle, but problematic in practice because it might result + // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards + // because they have GC'd past the branch point. + } + } + /// Find the Lsns above which layer files need to be retained on /// garbage collection. /// @@ -4816,40 +4897,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. - let time_cutoff = { - let now = SystemTime::now(); - let time_range = if pitr == Duration::ZERO { - humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") - } else { - pitr - }; - - // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case) - let time_cutoff = now.checked_sub(time_range).unwrap_or(now); - let timestamp = to_pg_timestamp(time_cutoff); - - match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? { - LsnForTimestamp::Present(lsn) => Some(lsn), - LsnForTimestamp::Future(lsn) => { - // The timestamp is in the future. That sounds impossible, - // but what it really means is that there hasn't been - // any commits since the cutoff timestamp. - // - // In this case we should use the LSN of the most recent commit, - // which is implicitly the last LSN in the log. - debug!("future({})", lsn); - Some(self.get_last_record_lsn()) - } - LsnForTimestamp::Past(lsn) => { - debug!("past({})", lsn); - None - } - LsnForTimestamp::NoData(lsn) => { - debug!("nodata({})", lsn); - None - } - } - }; + let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { From 42a9c9a5087afc704eb17fd7e9eb653e9fdb3935 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 23 Sep 2024 09:07:19 +0100 Subject: [PATCH 2/5] tests: do GC in pg_regress --- test_runner/regress/test_pg_regress.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f4698191eb9c..a0c86107f6e0 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -110,13 +110,17 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) - # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # Ensure that compaction/GC works, on a timeline containing all the diversity that postgres regression tests create. # There should have been compactions mid-test as well, this final check is in addition those. for shard, pageserver in tenant_get_shards(env, env.initial_tenant): pageserver.http_client().timeline_checkpoint( shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True ) + pageserver.http_client().timeline_gc( + shard, env.initial_timeline, None + ) + # Run the main PostgreSQL regression tests, in src/test/regress. # From 69047d067b509838bc58ead7c8f4e0b07e89a993 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Nov 2024 09:54:51 +0000 Subject: [PATCH 3/5] tests: add test_sharding_gc --- test_runner/fixtures/remote_storage.py | 16 ++-- test_runner/regress/test_sharding.py | 110 ++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 702495366169..c630ea98b447 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -77,14 +77,16 @@ def kill(self): class LocalFsStorage: root: Path - def tenant_path(self, tenant_id: TenantId) -> Path: + def tenant_path(self, tenant_id: Union[TenantId, TenantShardId]) -> Path: return self.root / "tenants" / str(tenant_id) - def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def timeline_path( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> Path: return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) def timeline_latest_generation( - self, tenant_id: TenantId, timeline_id: TimelineId + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId ) -> Optional[int]: timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id)) index_parts = [f for f in timeline_files if f.startswith("index_part")] @@ -102,7 +104,9 @@ def parse_gen(filename: str) -> Optional[int]: raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}") return generations[-1] - def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + def index_path( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> Path: latest_gen = self.timeline_latest_generation(tenant_id, timeline_id) if latest_gen is None: filename = TIMELINE_INDEX_PART_FILE_NAME @@ -126,7 +130,9 @@ def remote_layer_path( filename = f"{local_name}-{generation:08x}" return self.timeline_path(tenant_id, timeline_id) / filename - def index_content(self, tenant_id: TenantId, timeline_id: TimelineId) -> Any: + def index_content( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> Any: with self.index_path(tenant_id, timeline_id).open("r") as f: return json.load(f) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0a4a53356d94..9f0ceb296b54 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -19,7 +19,7 @@ wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -1685,3 +1685,111 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): ) assert len(top["shards"]) == n_tenants - 4 assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) + + +def test_sharding_gc( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise GC in a sharded tenant: because only shard 0 holds SLRU content, it acts as + the "leader" for GC, and other shards read its index to learn what LSN they should + GC up to. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # A short PITR horizon, so that we won't have to sleep too long in the test to wait for it to + # happen. + "pitr_interval": "1s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", + } + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_conf=TENANT_CONF + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Create a branch and write some data + workload = Workload(env, tenant_id, timeline_id) + initial_lsn = Lsn(workload.endpoint().safe_psql("SELECT pg_current_wal_lsn()")[0][0]) + log.info(f"Started at LSN: {initial_lsn}") + + workload.init() + + # Write enough data to generate multiple layers + for _i in range(10): + last_lsn = workload.write_rows(32) + + assert last_lsn > initial_lsn + + log.info(f"Wrote up to last LSN: {last_lsn}") + + # Do full image layer generation. When we subsequently wait for PITR, all historic deltas + # should be GC-able + for shard_number in range(shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_compact( + shard, timeline_id, force_image_layer_creation=True + ) + + workload.churn_rows(32) + + time.sleep(5) + + # Invoke GC on a non-zero shard and verify its GC cutoff LSN does not advance + shard_one = TenantShardId(tenant_id, 1, shard_count) + env.get_tenant_pageserver(shard_one).http_client().timeline_gc( + shard_one, timeline_id, gc_horizon=None + ) + + # Check shard 1's index - GC cutoff LSN should not have advanced + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_1_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_one, timeline_id=timeline_id + ) + shard_1_gc_cutoff_lsn = Lsn(shard_1_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 1 cutoff LSN: {shard_1_gc_cutoff_lsn}") + assert shard_1_gc_cutoff_lsn <= last_lsn + + shard_zero = TenantShardId(tenant_id, 0, shard_count) + env.get_tenant_pageserver(shard_zero).http_client().timeline_gc( + shard_zero, timeline_id, gc_horizon=None + ) + + # TODO: observe that GC LSN of shard 0 has moved forward in remote storage + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + shard_0_index = env.pageserver_remote_storage.index_content( + tenant_id=shard_zero, timeline_id=timeline_id + ) + shard_0_gc_cutoff_lsn = Lsn(shard_0_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard 0 cutoff LSN: {shard_0_gc_cutoff_lsn}") + assert shard_0_gc_cutoff_lsn >= last_lsn + + # Invoke GC on all other shards and verify their GC cutoff LSNs + for shard_number in range(1, shard_count): + shard = TenantShardId(tenant_id, shard_number, shard_count) + env.get_tenant_pageserver(shard).http_client().timeline_gc( + shard, timeline_id, gc_horizon=None + ) + + # Verify GC cutoff LSN advanced to match shard 0 + shard_index = env.pageserver_remote_storage.index_content( + tenant_id=shard, timeline_id=timeline_id + ) + shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) + log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") + assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn From c7d0d412fa36859f5b3010b8d7a1696343690d2c Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 20 Sep 2024 11:30:35 +0100 Subject: [PATCH 4/5] pageserver: do not ingest SLRUs on shard >0 --- libs/pageserver_api/src/shard.rs | 7 +++- pageserver/src/pgdatadir_mapping.rs | 56 ++++++++++++++++++----------- pageserver/src/walingest.rs | 4 +++ 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855a1..eda02c77c31e 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -175,7 +175,12 @@ impl ShardIdentity { /// /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { - if key_is_shard0(key) { + if self.count < ShardCount(2) { + // Fast path: unsharded tenant doesn't dispose of anything + return false; + } + + if key_is_shard0(key) && key.field1 != 0x01 { // Q: Why can't we dispose of shard0 content if we're not shard 0? // A1: because the WAL ingestion logic currently ingests some shard 0 // content on all shards, even though it's only read on shard 0. If we diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7c1abbf3e2d1..7de7cf5e09d4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -338,6 +338,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; @@ -360,6 +361,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn, ctx).await } @@ -372,6 +374,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_segment_size_to_key(kind, segno); let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) @@ -385,6 +388,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); // fetch directory listing let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; @@ -855,26 +859,28 @@ impl Timeline { } // Iterate SLRUs next - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactMembers, - SlruKind::MultiXactOffsets, - ] { - let slrudir_key = slru_dir_to_key(kind); - result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf)?; - let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort_unstable(); - for segno in segments { - let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn, ctx).await?; - let segsize = buf.get_u32_le(); - - result.add_range( - slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), - ); - result.add_key(segsize_key); + if self.tenant_shard_id.is_shard_zero() { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn, ctx).await?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn, ctx).await?; + let segsize = buf.get_u32_le(); + + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } } } @@ -1269,6 +1275,10 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } + self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -1302,6 +1312,9 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1343,6 +1356,9 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 38d69760f276..0b9ed791d870 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1372,6 +1372,10 @@ impl WalIngest { img: Bytes, ctx: &RequestContext, ) -> Result<()> { + if !self.shard.is_shard_zero() { + return Ok(()); + } + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; From 001341a873f7ea96214d71490dfbb0dc31220ee7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 20 Sep 2024 11:26:45 +0100 Subject: [PATCH 5/5] pageserver: don't ingest checkpoint page on shards >0 --- pageserver/src/walingest.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0b9ed791d870..d7745894ab4d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -162,7 +162,7 @@ impl WalIngest { assert!(!modification.has_dirty_data()); } - assert!(!self.checkpoint_modified); + assert!(!self.checkpoint_modified || !self.shard.is_shard_zero()); if interpreted.xid != pg_constants::INVALID_TRANSACTION_ID && self.checkpoint.update_next_xid(interpreted.xid) { @@ -277,8 +277,8 @@ impl WalIngest { .ingest_batch(interpreted.batch, &self.shard, ctx) .await?; - // If checkpoint data was updated, store the new version in the repository - if self.checkpoint_modified { + // If checkpoint data was updated, store the new version in the repository (on shard zero only) + if self.checkpoint_modified && self.shard.is_shard_zero() { let new_checkpoint_bytes = self.checkpoint.encode()?; modification.put_checkpoint(new_checkpoint_bytes)?;