diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java index d705cb227bcc6..baf08a4214b22 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngrade.java @@ -58,7 +58,7 @@ public UpgradeDowngrade( this.upgradeDowngradeHelper = upgradeDowngradeHelper; } - public boolean needsUpgradeOrDowngrade(HoodieTableVersion toWriteVersion) { + public static boolean needsUpgradeOrDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieTableVersion toWriteVersion) { HoodieTableVersion fromTableVersion = metaClient.getTableConfig().getTableVersion(); // If table version is less than SIX, then we need to upgrade to SIX first before upgrading to any other version, irrespective of autoUpgrade flag if (fromTableVersion.versionCode() < HoodieTableVersion.SIX.versionCode() && toWriteVersion.versionCode() >= HoodieTableVersion.EIGHT.versionCode()) { @@ -76,6 +76,10 @@ public boolean needsUpgradeOrDowngrade(HoodieTableVersion toWriteVersion) { return toWriteVersion.versionCode() != fromTableVersion.versionCode(); } + public boolean needsUpgradeOrDowngrade(HoodieTableVersion toWriteVersion) { + return needsUpgradeOrDowngrade(metaClient, config, toWriteVersion); + } + /** * Perform Upgrade or Downgrade steps if required and updated table version if need be. *
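Note on the refactor above: making needsUpgradeOrDowngrade static lets pre-write validation code ask whether an upgrade or downgrade is pending without first constructing an UpgradeDowngrade instance; StreamerCheckpointUtils relies on this later in this diff. A minimal caller sketch (illustrative only, not part of the patch; assumes a metaClient and writeConfig are already in scope):

```java
// Hypothetical caller: check for a pending table upgrade/downgrade before acting
// on configs whose meaning depends on the table version.
HoodieTableVersion toWriteVersion = HoodieTableVersion.EIGHT;
if (UpgradeDowngrade.needsUpgradeOrDowngrade(metaClient, writeConfig, toWriteVersion)) {
  // e.g., reject ambiguous checkpoint overrides until the upgrade completes
}
```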

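For orientation before the per-file changes: the diff below introduces a two-phase resolution for streamer checkpoint overrides. A hedged sketch of how the new pieces fit together, using only APIs added in this PR (the override value shown is a made-up example):

```java
// Phase 1 (StreamerCheckpointUtils): a config override for a Hudi incremental source on a
// version-8+ table is wrapped as an "unresolved" checkpoint rather than a plain V1/V2.
Checkpoint ckp = CheckpointUtils.buildCheckpointFromConfigOverride(
    "org.apache.hudi.utilities.sources.HoodieIncrSource",
    HoodieTableVersion.EIGHT.versionCode(),
    "resumeFromInstantCompletionTime:20240101000000");
// ckp is an UnresolvedStreamerCheckpointBasedOnCfg here.

// Phase 2 (HoodieIncrSource#translateCheckpoint): the key prefix decides the concrete version.
Checkpoint resolved = HoodieIncrSourceCheckpointValUtils
    .resolveToActualCheckpointVersion((UnresolvedStreamerCheckpointBasedOnCfg) ckp);
// resolved is a StreamerCheckpointV2 whose key is "20240101000000".
```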
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/Checkpoint.java b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/Checkpoint.java index f1693a4019698..67248ba4adcdb 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/Checkpoint.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/Checkpoint.java @@ -36,6 +36,11 @@ public abstract class Checkpoint implements Serializable { // These are extra props to be written to the commit metadata protected Map<String, String> extraProps = new HashMap<>(); + public Checkpoint setCheckpointKey(String newKey) { + checkpointKey = newKey; + return this; + } + public String getCheckpointKey() { return checkpointKey; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/CheckpointUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/CheckpointUtils.java index 2f49f3d2b1f94..15084a74ae46d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/CheckpointUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/CheckpointUtils.java @@ -49,6 +49,16 @@ public class CheckpointUtils { "org.apache.hudi.utilities.sources.MockS3EventsHoodieIncrSource", "org.apache.hudi.utilities.sources.MockGcsEventsHoodieIncrSource" ))); + + public static final Set<String> HOODIE_INCREMENTAL_SOURCES; + + static { + HashSet<String> hoodieIncSource = new HashSet<>(DATASOURCES_NOT_SUPPORTED_WITH_CKPT_V2); + hoodieIncSource.add("org.apache.hudi.utilities.sources.MockGeneralHoodieIncrSource"); + hoodieIncSource.add("org.apache.hudi.utilities.sources.HoodieIncrSource"); + HOODIE_INCREMENTAL_SOURCES = Collections.unmodifiableSet(hoodieIncSource); + } + public static Checkpoint getCheckpoint(HoodieCommitMetadata commitMetadata) { if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(STREAMER_CHECKPOINT_KEY_V2)) || !StringUtils.isNullOrEmpty(commitMetadata.getMetadata(STREAMER_CHECKPOINT_RESET_KEY_V2))) { @@ -61,6 +71,20 @@ public static Checkpoint getCheckpoint(HoodieCommitMetadata commitMetadata) { throw new HoodieException("Checkpoint is not found in the commit metadata: " + commitMetadata.getExtraMetadata()); } + public static Checkpoint buildCheckpointFromGeneralSource( + String sourceClassName, int writeTableVersion, String checkpointToResume) { + return CheckpointUtils.shouldTargetCheckpointV2(writeTableVersion, sourceClassName) + ? new StreamerCheckpointV2(checkpointToResume) : new StreamerCheckpointV1(checkpointToResume); + } + + // Whenever we create a checkpoint from the streamer config checkpoint override, we should use this function + // to build checkpoints. + public static Checkpoint buildCheckpointFromConfigOverride( + String sourceClassName, int writeTableVersion, String checkpointToResume) { + return CheckpointUtils.shouldTargetCheckpointV2(writeTableVersion, sourceClassName) + ?
new UnresolvedStreamerCheckpointBasedOnCfg(checkpointToResume) : new StreamerCheckpointV1(checkpointToResume); + } + public static boolean shouldTargetCheckpointV2(int writeTableVersion, String sourceClassName) { return writeTableVersion >= HoodieTableVersion.EIGHT.versionCode() && !DATASOURCES_NOT_SUPPORTED_WITH_CKPT_V2.contains(sourceClassName); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/HoodieIncrSourceCheckpointValUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/HoodieIncrSourceCheckpointValUtils.java new file mode 100644 index 0000000000000..829c985aeb10c --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/HoodieIncrSourceCheckpointValUtils.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.table.checkpoint; + +/** + * Utility class that parses checkpoint override values starting with resume-related prefixes and + * resolves them to concrete checkpoint versions. + */ +public class HoodieIncrSourceCheckpointValUtils { + public static final String RESET_CHECKPOINT_V2_SEPARATOR = ":"; + public static final String REQUEST_TIME_PREFIX = "resumeFromInstantRequestTime"; + public static final String COMPLETION_TIME_PREFIX = "resumeFromInstantCompletionTime"; + + /** + * For hoodie incremental source ingestion, if the target table is version 8 or higher, the checkpoint + * key set by streamer config can be in either of the following formats: + * - resumeFromInstantRequestTime:[checkpoint value based on request time] + * - resumeFromInstantCompletionTime:[checkpoint value based on completion time] + * + * The UnresolvedStreamerCheckpointBasedOnCfg class itself captures the fact that the table is version 8 + * or higher, plus that the checkpoint source is the streamer config override. + * + * When the checkpoint is consumed by individual data sources, we need to convert it to either vanilla + * checkpoint v1 (request time based) or checkpoint v2 (completion time based).
+ */ + public static Checkpoint resolveToActualCheckpointVersion(UnresolvedStreamerCheckpointBasedOnCfg checkpoint) { + String[] parts = extractKeyValues(checkpoint); + switch (parts[0]) { + case REQUEST_TIME_PREFIX: { + return new StreamerCheckpointV1(checkpoint).setCheckpointKey(parts[1]); + } + case COMPLETION_TIME_PREFIX: { + return new StreamerCheckpointV2(checkpoint).setCheckpointKey(parts[1]); + } + default: + throw new IllegalArgumentException("Unknown event ordering mode " + parts[0]); + } + } + + private static String[] extractKeyValues(UnresolvedStreamerCheckpointBasedOnCfg checkpoint) { + String checkpointKey = checkpoint.getCheckpointKey(); + String[] parts = checkpointKey.split(RESET_CHECKPOINT_V2_SEPARATOR); + if (parts.length != 2 + || ( + !parts[0].trim().equals(REQUEST_TIME_PREFIX) + && !parts[0].trim().equals(COMPLETION_TIME_PREFIX) + )) { + throw new IllegalArgumentException( + "Illegal checkpoint key override `" + checkpointKey + "`. Valid format is either `resumeFromInstantRequestTime:[checkpoint value]` or " + + "`resumeFromInstantCompletionTime:[checkpoint value]`."); + } + parts[0] = parts[0].trim(); + parts[1] = parts[1].trim(); + return parts; + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/UnresolvedStreamerCheckpointBasedOnCfg.java b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/UnresolvedStreamerCheckpointBasedOnCfg.java new file mode 100644 index 0000000000000..cd9485ae82b90 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/table/checkpoint/UnresolvedStreamerCheckpointBasedOnCfg.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.table.checkpoint; + +import org.apache.hudi.common.model.HoodieCommitMetadata; + +/** + * A special checkpoint v2 class that indicates its checkpoint key comes from streamer config checkpoint + * overrides. + * + * For the hoodie incremental source, based on the content of the checkpoint override value, it can indicate + * either a request time based or a completion time based checkpoint. So the class serves as an indicator to + * the data sources of interest that it needs to be further parsed and resolved to either checkpoint v1 or v2. + * + * For all the other data sources, it behaves exactly the same as checkpoint v2. + * + * To keep the checkpoint class design ignorant of which data source it serves, the class only indicates where + * the checkpoint key comes from.
+ */ +public class UnresolvedStreamerCheckpointBasedOnCfg extends StreamerCheckpointV2 { + public UnresolvedStreamerCheckpointBasedOnCfg(String key) { + super(key); + } + + public UnresolvedStreamerCheckpointBasedOnCfg(Checkpoint checkpoint) { + super(checkpoint); + } + + public UnresolvedStreamerCheckpointBasedOnCfg(HoodieCommitMetadata commitMetadata) { + super(commitMetadata); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestCheckpointUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestCheckpointUtils.java index 450627e473b82..a4b1be058799a 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestCheckpointUtils.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestCheckpointUtils.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -39,6 +40,7 @@ import static org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling.FAIL; import static org.apache.hudi.common.table.timeline.TimelineUtils.HollowCommitHandling.USE_TRANSITION_TIME; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.anyString; @@ -53,6 +55,9 @@ public class TestCheckpointUtils { private HoodieTableMetaClient metaClient; private HoodieActiveTimeline activeTimeline; + private static final String CHECKPOINT_TO_RESUME = "20240101000000"; + private static final String GENERAL_SOURCE = "org.apache.hudi.utilities.sources.GeneralSource"; + @BeforeEach public void setUp() { metaClient = mock(HoodieTableMetaClient.class); @@ -207,7 +212,49 @@ public void testConvertCheckpointWithUseTransitionTime() { "8, org.apache.hudi.utilities.sources.MockS3EventsHoodieIncrSource, false", "8, org.apache.hudi.utilities.sources.MockGcsEventsHoodieIncrSource, false" }) - public void testTargetCheckpointV2(int version, String sourceClassName, boolean expected) { - assertEquals(expected, CheckpointUtils.shouldTargetCheckpointV2(version, sourceClassName)); + public void testTargetCheckpointV2(int version, String sourceClassName, boolean isV2Checkpoint) { + assertEquals(isV2Checkpoint, CheckpointUtils.buildCheckpointFromGeneralSource(sourceClassName, version, "ignored") instanceof StreamerCheckpointV2); + } + + @Test + public void testBuildCheckpointFromGeneralSource() { + // Test V2 checkpoint creation (newer table version + general source) + Checkpoint checkpoint1 = CheckpointUtils.buildCheckpointFromGeneralSource( + GENERAL_SOURCE, + HoodieTableVersion.EIGHT.versionCode(), + CHECKPOINT_TO_RESUME + ); + assertInstanceOf(StreamerCheckpointV2.class, checkpoint1); + assertEquals(CHECKPOINT_TO_RESUME, checkpoint1.getCheckpointKey()); + + // Test V1 checkpoint creation (older table version) + Checkpoint checkpoint2 = CheckpointUtils.buildCheckpointFromGeneralSource( + GENERAL_SOURCE, + HoodieTableVersion.SEVEN.versionCode(), + CHECKPOINT_TO_RESUME + ); + assertInstanceOf(StreamerCheckpointV1.class, checkpoint2); + assertEquals(CHECKPOINT_TO_RESUME,
checkpoint2.getCheckpointKey()); + } + + @Test + public void testBuildCheckpointFromConfigOverride() { + // Test checkpoint from config creation (newer table version + general source) + Checkpoint checkpoint1 = CheckpointUtils.buildCheckpointFromConfigOverride( + GENERAL_SOURCE, + HoodieTableVersion.EIGHT.versionCode(), + CHECKPOINT_TO_RESUME + ); + assertInstanceOf(UnresolvedStreamerCheckpointBasedOnCfg.class, checkpoint1); + assertEquals(CHECKPOINT_TO_RESUME, checkpoint1.getCheckpointKey()); + + // Test V1 checkpoint creation (older table version) + Checkpoint checkpoint2 = CheckpointUtils.buildCheckpointFromConfigOverride( + GENERAL_SOURCE, + HoodieTableVersion.SEVEN.versionCode(), + CHECKPOINT_TO_RESUME + ); + assertInstanceOf(StreamerCheckpointV1.class, checkpoint2); + assertEquals(CHECKPOINT_TO_RESUME, checkpoint2.getCheckpointKey()); } } diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestHoodieIncrSourceCheckpointValUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestHoodieIncrSourceCheckpointValUtils.java new file mode 100644 index 0000000000000..5e9f220d1135e --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/table/checkpoint/TestHoodieIncrSourceCheckpointValUtils.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.common.table.checkpoint; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestHoodieIncrSourceCheckpointValUtils { + + @Test + public void testResolveToV1V2CheckpointWithRequestTime() { + String checkpoint = "20240301"; + UnresolvedStreamerCheckpointBasedOnCfg mockCheckpoint = mock(UnresolvedStreamerCheckpointBasedOnCfg.class); + when(mockCheckpoint.getCheckpointKey()).thenReturn("resumeFromInstantRequestTime:" + checkpoint); + + Checkpoint result = HoodieIncrSourceCheckpointValUtils.resolveToActualCheckpointVersion(mockCheckpoint); + + assertInstanceOf(StreamerCheckpointV1.class, result); + assertEquals(checkpoint, result.getCheckpointKey()); + } + + @Test + public void testResolveToV1V2CheckpointWithCompletionTime() { + String checkpoint = "20240302"; + UnresolvedStreamerCheckpointBasedOnCfg mockCheckpoint = mock(UnresolvedStreamerCheckpointBasedOnCfg.class); + when(mockCheckpoint.getCheckpointKey()).thenReturn("resumeFromInstantCompletionTime:" + checkpoint); + + Checkpoint result = HoodieIncrSourceCheckpointValUtils.resolveToActualCheckpointVersion(mockCheckpoint); + + assertInstanceOf(StreamerCheckpointV2.class, result); + assertEquals(checkpoint, result.getCheckpointKey()); + } + + @Test + public void testResolveToV1V2CheckpointWithInvalidPrefix() { + UnresolvedStreamerCheckpointBasedOnCfg mockCheckpoint = mock(UnresolvedStreamerCheckpointBasedOnCfg.class); + when(mockCheckpoint.getCheckpointKey()).thenReturn("invalidPrefix:20240303"); + + IllegalArgumentException exception = assertThrows( + IllegalArgumentException.class, + () -> HoodieIncrSourceCheckpointValUtils.resolveToActualCheckpointVersion(mockCheckpoint) + ); + assertTrue(exception.getMessage().contains("Illegal checkpoint key override")); + } + + @Test + public void testResolveToV1V2CheckpointWithMalformedInput() { + UnresolvedStreamerCheckpointBasedOnCfg mockCheckpoint = mock(UnresolvedStreamerCheckpointBasedOnCfg.class); + when(mockCheckpoint.getCheckpointKey()).thenReturn("malformedInput"); + + IllegalArgumentException exception = assertThrows( + IllegalArgumentException.class, + () -> HoodieIncrSourceCheckpointValUtils.resolveToActualCheckpointVersion(mockCheckpoint) + ); + assertTrue(exception.getMessage().contains("Illegal checkpoint key override")); + } +} \ No newline at end of file diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java index 10a02b6511a42..ce1a407eaa9d5 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java @@ -25,6 +25,7 @@ import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.checkpoint.Checkpoint; import org.apache.hudi.common.table.checkpoint.CheckpointUtils; +import org.apache.hudi.common.table.checkpoint.UnresolvedStreamerCheckpointBasedOnCfg; import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; import org.apache.hudi.common.table.log.InstantRange; 
@@ -69,6 +70,7 @@ import static org.apache.hudi.DataSourceReadOptions.QUERY_TYPE; import static org.apache.hudi.DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL; import static org.apache.hudi.DataSourceReadOptions.START_COMMIT; +import static org.apache.hudi.common.table.checkpoint.HoodieIncrSourceCheckpointValUtils.resolveToActualCheckpointVersion; import static org.apache.hudi.common.table.timeline.InstantComparison.LESSER_THAN_OR_EQUALS; import static org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps; import static org.apache.hudi.common.util.ConfigUtils.checkRequiredConfigProperties; @@ -183,8 +185,14 @@ public HoodieIncrSource( @Override protected Option<Checkpoint> translateCheckpoint(Option<Checkpoint> lastCheckpoint) { - // For Hudi incremental source, we'll let #fetchNextBatch to handle it since - // it involves heavy lifting by loading the timeline for analysis + // User might override checkpoint based on + // - instant request time: Then we will treat it as a V1 checkpoint. + // - completion time: We will treat it as a normal V2 checkpoint. + if (lastCheckpoint.isPresent() && lastCheckpoint.get() instanceof UnresolvedStreamerCheckpointBasedOnCfg) { + lastCheckpoint = Option.of(resolveToActualCheckpointVersion((UnresolvedStreamerCheckpointBasedOnCfg) lastCheckpoint.get())); + } + // For Hudi incremental source, we'll let #fetchNextBatch handle the request time to completion time + // based conversion, as it is a heavy operation. return lastCheckpoint; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java index 779981fbbef35..a6c3c0e1411da 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/Source.java @@ -41,6 +41,7 @@ import java.io.Serializable; +import static org.apache.hudi.common.table.checkpoint.CheckpointUtils.shouldTargetCheckpointV2; import static org.apache.hudi.config.HoodieWriteConfig.WRITE_TABLE_VERSION; /** @@ -99,6 +100,16 @@ protected InputBatch<T> readFromCheckpoint(Option<Checkpoint> lastCheckpoint, lo sourceLimit); } + /** + * The second phase of checkpoint resolution - checkpoint version translation. + * After the checkpoint value is decided based on the existing configurations at + * org.apache.hudi.utilities.streamer.StreamerCheckpointUtils#resolveCheckpointToResumeFrom, this method + * translates the checkpoint to the version the data source expects. + * + * For most of the data sources there is no difference between checkpoint V1 and V2; translation + * merely changes the wrapper class. + * + * See child class method overrides for special case handling. + */ @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) protected Option<Checkpoint> translateCheckpoint(Option<Checkpoint> lastCheckpoint) { if (lastCheckpoint.isEmpty()) { @@ -128,6 +139,25 @@ protected Option<Checkpoint> translateCheckpoint(Option<Checkpoint> lastCheckpoi throw new UnsupportedOperationException("Unsupported checkpoint type: " + lastCheckpoint.get()); } + public void assertCheckpointVersion(Option<Checkpoint> lastCheckpoint, Option<Checkpoint> lastCheckpointTranslated, Checkpoint checkpoint) { + if (checkpoint != null) { + boolean shouldBeV2Checkpoint = shouldTargetCheckpointV2(writeTableVersion, getClass().getName()); + String errorMessage = String.format( + "Data source should return checkpoint version V%s. The checkpoint resumed in the iteration is %s, whose translated version is %s. " + + "The checkpoint returned after the iteration %s.", + shouldBeV2Checkpoint ? "2" : "1", + lastCheckpoint.isEmpty() ?
"null" : lastCheckpointTranslated.get(), + lastCheckpointTranslated.isEmpty() ? "null" : lastCheckpointTranslated.get(), + checkpoint); + if (shouldBeV2Checkpoint && !(checkpoint instanceof StreamerCheckpointV2)) { + throw new IllegalStateException(errorMessage); + } + if (!shouldBeV2Checkpoint && !(checkpoint instanceof StreamerCheckpointV1)) { + throw new IllegalStateException(errorMessage); + } + } + } + /** * Main API called by Hoodie Streamer to fetch records. * @@ -136,7 +166,9 @@ protected Option translateCheckpoint(Option lastCheckpoi * @return */ public final InputBatch fetchNext(Option lastCheckpoint, long sourceLimit) { - InputBatch batch = readFromCheckpoint(translateCheckpoint(lastCheckpoint), sourceLimit); + Option lastCheckpointTranslated = translateCheckpoint(lastCheckpoint); + InputBatch batch = readFromCheckpoint(lastCheckpointTranslated, sourceLimit); + assertCheckpointVersion(lastCheckpoint, lastCheckpointTranslated, batch.getCheckpointForNextBatch()); // If overriddenSchemaProvider is passed in CLI, use it return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(), batch.getCheckpointForNextBatch(), overriddenSchemaProvider); diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java index 074ad8743b501..cfe01dfde68a0 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/HoodieStreamer.java @@ -392,7 +392,12 @@ public static class Config implements Serializable { /** * Resume Hudi Streamer from this checkpoint. */ - @Parameter(names = {"--checkpoint"}, description = "Resume Hudi Streamer from this checkpoint.") + @Parameter(names = {"--checkpoint"}, description = "Resume Hudi Streamer from this checkpoint. \nIf --source-class specifies a class " + + "that is a child class of org.apache.hudi.utilities.sources.HoodieIncrSource and the hoodie.table.version is 8 or higher, the " + + "format is expected to be either `resumeFromInstantRequestTime: ` or `resumeFromInstantCompletionTime: `. " + + "In table version 8 and above internally we only support completion time based commit ordering. If we use resumeFromInstantCompletionTime " + + "mode, the checkpoint will be reset to the instant with the corresponding completion time and resume. 
If we use resumeFromInstantRequestTime " + "mode, Hudi first finds the instant, fetches its completion time, and resumes from that completion time.") public String checkpoint = null; @Parameter(names = {"--initial-checkpoint-provider"}, description = "subclass of " diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java index f13e4c00bba8b..d78f49d617d48 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamSync.java @@ -47,8 +47,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.TableSchemaResolver; import org.apache.hudi.common.table.checkpoint.Checkpoint; -import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; -import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; @@ -140,7 +138,7 @@ import static org.apache.hudi.common.table.HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE; import static org.apache.hudi.common.table.HoodieTableConfig.TIMELINE_HISTORY_PATH; import static org.apache.hudi.common.table.HoodieTableConfig.URL_ENCODE_PARTITIONING; -import static org.apache.hudi.common.table.checkpoint.CheckpointUtils.shouldTargetCheckpointV2; +import static org.apache.hudi.common.table.checkpoint.CheckpointUtils.buildCheckpointFromGeneralSource; import static org.apache.hudi.common.util.ConfigUtils.getBooleanWithAltKeys; import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE; import static org.apache.hudi.config.HoodieClusteringConfig.INLINE_CLUSTERING; @@ -545,7 +543,7 @@ private Option getLastPendingCompactionInstant(Option co */ public Pair<InputBatch, Boolean> readFromSource(String instantTime, HoodieTableMetaClient metaClient) throws IOException { // Retrieve the previous round checkpoints, if any - Option<Checkpoint> checkpointToResume = StreamerCheckpointUtils.getCheckpointToResumeFrom(commitsTimelineOpt, cfg, props); + Option<Checkpoint> checkpointToResume = StreamerCheckpointUtils.resolveCheckpointToResumeFrom(commitsTimelineOpt, cfg, props, metaClient); LOG.info("Checkpoint to resume from : " + checkpointToResume); int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1; @@ -887,10 +885,8 @@ Map<String, String> extractCheckpointMetadata(InputBatch inputBatch, TypedProper } // Otherwise create new checkpoint based on version - Checkpoint checkpoint = shouldTargetCheckpointV2(versionCode, cfg.sourceClassName) - ?
new StreamerCheckpointV2((String) null) - : new StreamerCheckpointV1((String) null); - + Checkpoint checkpoint = buildCheckpointFromGeneralSource(cfg.sourceClassName, versionCode, null); + + return checkpoint.getCheckpointCommitMetadata(cfg.checkpoint, cfg.ignoreCheckpoint); } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamerCheckpointUtils.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamerCheckpointUtils.java index c2e310ec81cfd..289c769eb4067 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamerCheckpointUtils.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/streamer/StreamerCheckpointUtils.java @@ -22,10 +22,10 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.checkpoint.Checkpoint; import org.apache.hudi.common.table.checkpoint.CheckpointUtils; -import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; -import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineLayout; @@ -36,6 +36,7 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieUpgradeDowngradeException; import org.apache.hudi.utilities.config.KafkaSourceConfig; import org.apache.hudi.utilities.exception.HoodieStreamerException; @@ -44,34 +45,89 @@ import java.io.IOException; +import static org.apache.hudi.common.table.checkpoint.CheckpointUtils.HOODIE_INCREMENTAL_SOURCES; +import static org.apache.hudi.common.table.checkpoint.CheckpointUtils.buildCheckpointFromConfigOverride; import static org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2.STREAMER_CHECKPOINT_KEY_V2; import static org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2.STREAMER_CHECKPOINT_RESET_KEY_V2; import static org.apache.hudi.common.table.timeline.InstantComparison.LESSER_THAN; import static org.apache.hudi.common.table.timeline.InstantComparison.compareTimestamps; import static org.apache.hudi.common.util.ConfigUtils.removeConfigFromProps; +import static org.apache.hudi.table.upgrade.UpgradeDowngrade.needsUpgradeOrDowngrade; public class StreamerCheckpointUtils { private static final Logger LOG = LoggerFactory.getLogger(StreamerCheckpointUtils.class); - public static Option<Checkpoint> getCheckpointToResumeFrom(Option<HoodieTimeline> commitsTimelineOpt, - HoodieStreamer.Config streamerConfig, - TypedProperties props) throws IOException { + /** + * The first phase of checkpoint resolution - reads the checkpoint configs from 2 sources and resolves + * conflicts: + *

+ * - the checkpoint override set in the streamer config (--checkpoint / --ignore-checkpoint), and + * - the checkpoint tracked in the commit metadata of previous commits. + * + * The 2 sources can have conflicts, and we need to decide which config should prevail. + *

+ * For the second phase of checkpoint resolution please refer to + * {@link org.apache.hudi.utilities.sources.Source#translateCheckpoint} and child class overrides of this + * method. + */ + public static Option<Checkpoint> resolveCheckpointToResumeFrom(Option<HoodieTimeline> commitsTimelineOpt, + HoodieStreamer.Config streamerConfig, + TypedProperties props, + HoodieTableMetaClient metaClient) throws IOException { Option<Checkpoint> checkpoint = Option.empty(); + assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props); + // If we have both streamer config and commits specifying what checkpoint to use, go with the + // checkpoint resolution logic to resolve conflicting configurations. if (commitsTimelineOpt.isPresent()) { - checkpoint = getCheckpointToResumeString(commitsTimelineOpt.get(), streamerConfig, props); + checkpoint = resolveCheckpointBetweenConfigAndPrevCommit(commitsTimelineOpt.get(), streamerConfig, props); + } + // If there is only streamer config, extract the checkpoint directly. + checkpoint = useCkpFromOverrideConfigIfAny(streamerConfig, props, checkpoint); + return checkpoint; + } + + /** + * Asserts that checkpoint override options are not used during table upgrade/downgrade operations. + * This validation is necessary because a checkpoint override used during upgrade/downgrade operations + * is ambiguous as to whether it should be interpreted as request time or completion time. + * + * @param metaClient The metadata client for the Hudi table + * @param streamerConfig The configuration for the Hudi streamer + * @param props The typed properties containing configuration settings + * @throws HoodieUpgradeDowngradeException if checkpoint override options are used during upgrade/downgrade + */ + @VisibleForTesting + static void assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(HoodieTableMetaClient metaClient, HoodieStreamer.Config streamerConfig, TypedProperties props) { + boolean hasCheckpointOverride = !StringUtils.isNullOrEmpty(streamerConfig.checkpoint) + || !StringUtils.isNullOrEmpty(streamerConfig.ignoreCheckpoint); + boolean isHoodieIncSource = HOODIE_INCREMENTAL_SOURCES.contains(streamerConfig.sourceClassName); + if (hasCheckpointOverride && isHoodieIncSource) { + HoodieTableVersion writeTableVersion = HoodieTableVersion.fromVersionCode(ConfigUtils.getIntWithAltKeys(props, HoodieWriteConfig.WRITE_TABLE_VERSION)); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(streamerConfig.targetBasePath).withProps(props).build(); + if (config.autoUpgrade() && needsUpgradeOrDowngrade(metaClient, config, writeTableVersion)) { + throw new HoodieUpgradeDowngradeException( + String.format("When upgrade/downgrade is happening, please avoid setting the --checkpoint and --ignore-checkpoint options for your delta streamers." + + " Detected invalid streamer configuration:\n%s", streamerConfig)); + } + } + } + private static Option<Checkpoint> useCkpFromOverrideConfigIfAny( + HoodieStreamer.Config streamerConfig, TypedProperties props, Option<Checkpoint> checkpoint) { LOG.debug("Checkpoint from config: " + streamerConfig.checkpoint); if (!checkpoint.isPresent() && streamerConfig.checkpoint != null) { int writeTableVersion = ConfigUtils.getIntWithAltKeys(props, HoodieWriteConfig.WRITE_TABLE_VERSION); - checkpoint = Option.of(CheckpointUtils.shouldTargetCheckpointV2(writeTableVersion, streamerConfig.sourceClassName) - ?
new StreamerCheckpointV2(streamerConfig.checkpoint) : new StreamerCheckpointV1(streamerConfig.checkpoint)); + checkpoint = Option.of(buildCheckpointFromConfigOverride(streamerConfig.sourceClassName, writeTableVersion, streamerConfig.checkpoint)); } return checkpoint; } /** * Process previous commit metadata and checkpoint configs set by user to determine the checkpoint to resume from. + * The function consults various checkpoint related configurations and sets the right + * `org.apache.hudi.common.table.checkpoint.Checkpoint#checkpointKey` value in the returned object. * * @param commitsTimeline commits timeline of interest, including .commit and .deltacommit. * @@ -79,9 +135,9 @@ public static Option<Checkpoint> getCheckpointToResumeFrom(Option<HoodieTimeline - static Option<Checkpoint> getCheckpointToResumeString(HoodieTimeline commitsTimeline, - HoodieStreamer.Config streamerConfig, - TypedProperties props) throws IOException { + static Option<Checkpoint> resolveCheckpointBetweenConfigAndPrevCommit(HoodieTimeline commitsTimeline, + HoodieStreamer.Config streamerConfig, + TypedProperties props) throws IOException { Option<Checkpoint> resumeCheckpoint = Option.empty(); // has deltacommit and this is a MOR table, then we should get checkpoint from .deltacommit // if changing from mor to cow, before changing we must do a full compaction, so we can only consider .commit in such case @@ -102,15 +158,13 @@ static Option<Checkpoint> getCheckpointToResumeString(HoodieTimeline commitsTime HoodieCommitMetadata commitMetadata = commitMetadataOption.get(); Checkpoint checkpointFromCommit = CheckpointUtils.getCheckpoint(commitMetadata); LOG.debug("Checkpoint reset from metadata: " + checkpointFromCommit.getCheckpointResetKey()); - if (streamerConfig.ignoreCheckpoint != null && (StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointIgnoreKey()) - || !streamerConfig.ignoreCheckpoint.equals(checkpointFromCommit.getCheckpointIgnoreKey()))) { + if (ignoreCkpCfgPrevailsOverCkpFromPrevCommit(streamerConfig, checkpointFromCommit)) { // we ignore any existing checkpoint and start ingesting afresh resumeCheckpoint = Option.empty(); - } else if (streamerConfig.checkpoint != null && (StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointResetKey()) - || !streamerConfig.checkpoint.equals(checkpointFromCommit.getCheckpointResetKey()))) { - resumeCheckpoint = Option.of(CheckpointUtils.shouldTargetCheckpointV2(writeTableVersion, streamerConfig.sourceClassName) - ? new StreamerCheckpointV2(streamerConfig.checkpoint) : new StreamerCheckpointV1(streamerConfig.checkpoint)); - } else if (!StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointKey())) { + } else if (ckpOverrideCfgPrevailsOverCkpFromPrevCommit(streamerConfig, checkpointFromCommit)) { + resumeCheckpoint = Option.of(buildCheckpointFromConfigOverride( + streamerConfig.sourceClassName, writeTableVersion, streamerConfig.checkpoint)); + } else if (shouldUseCkpFromPrevCommit(checkpointFromCommit)) { //if previous checkpoint is an empty string, skip resume use Option.empty() resumeCheckpoint = Option.of(checkpointFromCommit); } else if (compareTimestamps(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, @@ -126,13 +180,26 @@ static Option<Checkpoint> getCheckpointToResumeString(HoodieTimeline commitsTime } } else if (streamerConfig.checkpoint != null) { // getLatestCommitMetadataWithValidCheckpointInfo(commitTimelineOpt.get()) will never return a commit metadata w/o any checkpoint key set. - resumeCheckpoint = Option.of(CheckpointUtils.shouldTargetCheckpointV2(writeTableVersion, streamerConfig.sourceClassName) - ?
new StreamerCheckpointV2(streamerConfig.checkpoint) : new StreamerCheckpointV1(streamerConfig.checkpoint)); + resumeCheckpoint = Option.of(buildCheckpointFromConfigOverride(streamerConfig.sourceClassName, writeTableVersion, streamerConfig.checkpoint)); } } return resumeCheckpoint; } + private static boolean shouldUseCkpFromPrevCommit(Checkpoint checkpointFromCommit) { + return !StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointKey()); + } + + private static boolean ckpOverrideCfgPrevailsOverCkpFromPrevCommit(HoodieStreamer.Config streamerConfig, Checkpoint checkpointFromCommit) { + return streamerConfig.checkpoint != null && (StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointResetKey()) + || !streamerConfig.checkpoint.equals(checkpointFromCommit.getCheckpointResetKey())); + } + + private static boolean ignoreCkpCfgPrevailsOverCkpFromPrevCommit(HoodieStreamer.Config streamerConfig, Checkpoint checkpointFromCommit) { + return streamerConfig.ignoreCheckpoint != null && (StringUtils.isNullOrEmpty(checkpointFromCommit.getCheckpointIgnoreKey()) + || !streamerConfig.ignoreCheckpoint.equals(checkpointFromCommit.getCheckpointIgnoreKey())); + } + public static Option<Pair<HoodieInstant, HoodieCommitMetadata>> getLatestInstantAndCommitMetadataWithValidCheckpointInfo(HoodieTimeline timeline) throws IOException { return (Option<Pair<HoodieInstant, HoodieCommitMetadata>>) timeline.getReverseOrderedInstants().map(instant -> { diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/CheckpointValidator.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/CheckpointValidator.java index 88a24cf408556..257616ca8b772 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/CheckpointValidator.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/CheckpointValidator.java @@ -21,15 +21,18 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.table.checkpoint.Checkpoint; +import org.apache.hudi.common.table.checkpoint.UnresolvedStreamerCheckpointBasedOnCfg; +import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; +import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; import org.apache.hudi.common.util.Option; import java.util.function.BiFunction; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; - /** * Helper class to validate checkpoint options in test scenarios. * Used by MockS3EventsHoodieIncrSource to validate checkpoint behavior. @@ -40,6 +43,7 @@ public class CheckpointValidator { /** * Validation keys for checkpoint testing.
*/ + public static final String VAL_CKP_INSTANCE_OF = "VAL_CKP_INSTANCE_OF_KEY"; public static final String VAL_CKP_KEY_EQ_VAL = "VAL_CKP_KEY_EQ_VAL_KEY"; public static final String VAL_CKP_RESET_KEY_EQUALS = "VAL_CKP_RESET_KEY"; public static final String VAL_CKP_RESET_KEY_IS_NULL = "VAL_CKP_RESET_KEY_IS_NULL"; @@ -47,6 +51,7 @@ public class CheckpointValidator { public static final String VAL_CKP_IGNORE_KEY_IS_NULL = "VAL_CKP_IGNORE_KEY_IS_NULL"; public static final String VAL_NON_EMPTY_CKP_ALL_MEMBERS = "VAL_NON_EMPTY_CKP_ALL_MEMBERS"; public static final String VAL_EMPTY_CKP_KEY = "VAL_EMPTY_CKP_KEY"; + public static final String VAL_NO_INGESTION_HAPPENS = "VAL_NO_INGESTION_HAPPENS_KEY"; public static final String VAL_NO_OP = "VAL_NO_OP"; /* @@ -54,25 +59,49 @@ public class CheckpointValidator { * */ private static final BiFunction<Option<Checkpoint>, TypedProperties, Void> VAL_NON_EMPTY_CKP_WITH_FIXED_VALUE = (ckpOpt, props) -> { assertFalse(ckpOpt.isEmpty()); + Checkpoint ckp = ckpOpt.get(); + assertCheckpointIsInstanceOf(props, ckp); + assertCheckpointValueEqualsTo(props, ckp); + return null; + }; + + private static void assertCheckpointValueEqualsTo(TypedProperties props, Checkpoint ckp) { if (props.containsKey(VAL_CKP_KEY_EQ_VAL)) { - assertEquals(props.getString(VAL_CKP_KEY_EQ_VAL), ckpOpt.get().getCheckpointKey()); + assertEquals(props.getString(VAL_CKP_KEY_EQ_VAL), ckp.getCheckpointKey()); } if (props.containsKey(VAL_CKP_RESET_KEY_EQUALS)) { - assertEquals(ckpOpt.get().getCheckpointResetKey(), props.getString(VAL_CKP_RESET_KEY_EQUALS)); + assertEquals(ckp.getCheckpointResetKey(), props.getString(VAL_CKP_RESET_KEY_EQUALS)); } if (props.containsKey(VAL_CKP_RESET_KEY_IS_NULL)) { - assertNull(ckpOpt.get().getCheckpointResetKey()); + assertNull(ckp.getCheckpointResetKey()); } if (props.containsKey(VAL_CKP_IGNORE_KEY_EQUALS)) { - assertEquals(ckpOpt.get().getCheckpointIgnoreKey(), props.getString(VAL_CKP_IGNORE_KEY_EQUALS)); + assertEquals(ckp.getCheckpointIgnoreKey(), props.getString(VAL_CKP_IGNORE_KEY_EQUALS)); } if (props.containsKey(VAL_CKP_IGNORE_KEY_IS_NULL)) { - assertNull(ckpOpt.get().getCheckpointIgnoreKey()); + assertNull(ckp.getCheckpointIgnoreKey()); } - return null; - }; + } + + private static void assertCheckpointIsInstanceOf(TypedProperties props, Checkpoint ckp) { + if (props.containsKey(VAL_CKP_INSTANCE_OF)) { + switch ((String) props.get(VAL_CKP_INSTANCE_OF)) { + case "org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1": + assertInstanceOf(StreamerCheckpointV1.class, ckp); + break; + case "org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2": + assertInstanceOf(StreamerCheckpointV2.class, ckp); + break; + case "org.apache.hudi.common.table.checkpoint.StreamerCheckpointFromCfgCkp": + assertInstanceOf(UnresolvedStreamerCheckpointBasedOnCfg.class, ckp); + break; + default: + throw new RuntimeException("Unknown checkpoint class to validate " + props.get(VAL_CKP_INSTANCE_OF)); + } + } + } private static final BiFunction<Option<Checkpoint>, TypedProperties, Void> VAL_EMPTY_CKP = (ckpOpt, props) -> { assertTrue(ckpOpt.isEmpty()); @@ -98,6 +127,9 @@ public static void validateCheckpointOption(Option<Checkpoint> lastCheckpoint, T case VAL_EMPTY_CKP_KEY: VAL_EMPTY_CKP.apply(lastCheckpoint, props); break; + case VAL_NO_INGESTION_HAPPENS: + throw new IllegalStateException( + String.format("Expected no ingestion happening but the source is asked to resume from checkpoint %s with props %s", lastCheckpoint, props)); default: throw new IllegalArgumentException("Unsupported validation type: " + valType); } diff
--git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DummyOperationExecutor.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DummyOperationExecutor.java index 054fb95acc98c..81b7edb0ec245 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DummyOperationExecutor.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/DummyOperationExecutor.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.table.checkpoint.Checkpoint; import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; +import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.collection.Pair; @@ -38,8 +39,11 @@ public class DummyOperationExecutor { /** * Operation keys for different test scenarios returning empty row sets with different checkpoint configurations. + * For each key OP_[DummyOperationFunctionName]_KEY, the corresponding dummy operation is implemented + * by the [DummyOperationFunctionName] member defined below. */ - public static final String OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY = "OP_EMPTY_ROW_SET_NONE_NULL_CKP1_KEY"; + public static final String OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY = "OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY"; + public static final String OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY = "OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY"; public static final String OP_EMPTY_ROW_SET_NULL_CKP_KEY = "OP_EMPTY_ROW_SET_NULL_CKP"; /** @@ -53,22 +57,24 @@ private interface OperationFunction { Pair<Option<Dataset<Row>>, Checkpoint> apply(Option<Checkpoint> checkpoint, Long limit, TypedProperties props); } - private static final OperationFunction EMPTY_ROW_SET_NONE_NULL_CKP = + private static final OperationFunction EMPTY_ROW_SET_NONE_NULL_CKP_V1 = (checkpoint, limit, props) -> { Option<Dataset<Row>> empty = Option.empty(); String returnCheckpoint = props.getString(RETURN_CHECKPOINT_KEY, CUSTOM_CHECKPOINT1); - return Pair.of( - empty, - new StreamerCheckpointV1(returnCheckpoint)); + return Pair.of(empty, new StreamerCheckpointV1(returnCheckpoint)); }; private static final OperationFunction EMPTY_ROW_SET_NULL_CKP = (checkpoint, limit, props) -> { Option<Dataset<Row>> empty = Option.empty(); - return Pair.of( - empty, - null - ); + return Pair.of(empty, null); + }; + + private static final OperationFunction EMPTY_ROW_SET_NONE_NULL_CKP_V2 = + (checkpoint, limit, props) -> { + Option<Dataset<Row>> empty = Option.empty(); + String returnCheckpoint = props.getString(RETURN_CHECKPOINT_KEY, CUSTOM_CHECKPOINT1); + return Pair.of(empty, new StreamerCheckpointV2(returnCheckpoint)); }; /** @@ -82,10 +88,12 @@ private interface OperationFunction { */ public static Pair<Option<Dataset<Row>>, Checkpoint> executeDummyOperation( Option<Checkpoint> lastCheckpoint, long sourceLimit, TypedProperties props) { - String opType = props.getString(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + String opType = props.getString(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); switch (opType) { - case OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY: - return EMPTY_ROW_SET_NONE_NULL_CKP.apply(lastCheckpoint, sourceLimit, props); + case OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY: + return EMPTY_ROW_SET_NONE_NULL_CKP_V1.apply(lastCheckpoint, sourceLimit, props); + case OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY: + return EMPTY_ROW_SET_NONE_NULL_CKP_V2.apply(lastCheckpoint, sourceLimit, props); case OP_EMPTY_ROW_SET_NULL_CKP_KEY: return EMPTY_ROW_SET_NULL_CKP.apply(lastCheckpoint,
sourceLimit, props); default: diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/MockGeneralHoodieIncrSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/MockGeneralHoodieIncrSource.java new file mode 100644 index 0000000000000..55de622b7821e --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/MockGeneralHoodieIncrSource.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.table.checkpoint.Checkpoint; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.utilities.ingestion.HoodieIngestionMetrics; +import org.apache.hudi.utilities.streamer.StreamContext; + +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * A mock implementation of HoodieIncrSource used for testing StreamSync functionality. + * This class simulates different checkpoint and data fetch scenarios to test the checkpoint handling + * and data ingestion behavior of the StreamSync class. + */ +public class MockGeneralHoodieIncrSource extends HoodieIncrSource { + + public MockGeneralHoodieIncrSource( + TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + HoodieIngestionMetrics metrics, + StreamContext streamContext) { + super(props, sparkContext, sparkSession, metrics, streamContext); + } + + + /** + * Overrides the fetchNextBatch method to simulate different test scenarios based on configuration. 
+ * + * @param lastCheckpoint Option containing the last checkpoint + * @param sourceLimit maximum number of records to fetch + * @return Pair containing Option<Dataset<Row>> and Checkpoint + */ + @Override + public Pair<Option<Dataset<Row>>, Checkpoint> fetchNextBatch(Option<Checkpoint> lastCheckpoint, long sourceLimit) { + CheckpointValidator.validateCheckpointOption(lastCheckpoint, props); + return DummyOperationExecutor.executeDummyOperation(lastCheckpoint, sourceLimit, props); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestS3GcsEventsHoodieIncrSourceE2ECkpVersion.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2E.java similarity index 80% rename from hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestS3GcsEventsHoodieIncrSourceE2ECkpVersion.java rename to hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2E.java index 7c461fa845062..830d773da40fe 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestS3GcsEventsHoodieIncrSourceE2ECkpVersion.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2E.java @@ -22,7 +22,9 @@ import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableVersion; import org.apache.hudi.common.table.checkpoint.CheckpointUtils; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.util.Option; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; @@ -53,7 +55,8 @@ import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_EMPTY_CKP_KEY; import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_INPUT_CKP; import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_NON_EMPTY_CKP_ALL_MEMBERS; -import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY; +import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY; +import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY; import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NULL_CKP_KEY; import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_FETCH_NEXT_BATCH; import static org.apache.hudi.utilities.sources.DummyOperationExecutor.RETURN_CHECKPOINT_KEY; @@ -61,9 +64,11 @@ import static org.apache.hudi.utilities.streamer.StreamSync.CHECKPOINT_IGNORE_KEY; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; @ExtendWith(MockitoExtension.class) -public class TestS3GcsEventsHoodieIncrSourceE2ECkpVersion extends S3EventsHoodieIncrSourceHarness { +public class TestHoodieIncrSourceE2E extends S3EventsHoodieIncrSourceHarness { private String toggleVersion(String version) { return "8".equals(version) ?
"6" : "8"; @@ -127,7 +132,56 @@ public void verifyLastInstantCommitMetadata(Map expectedMetadata * - Verifies commit metadata still uses checkpoint V1 format * - Verifies final checkpoint="30" */ + @ParameterizedTest + @CsvSource({ + "6, org.apache.hudi.utilities.sources.MockGeneralHoodieIncrSource", + "8, org.apache.hudi.utilities.sources.MockGeneralHoodieIncrSource" + }) + public void testSyncE2EWrongCheckpointVersionErrorOut(String tableVersion, String sourceClass) throws Exception { + // First start with no previous checkpoint and ingest till ckp 1 with table version. + // Disable auto upgrade and MDT as we want to keep things as it is. + metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), tableVersion); + TypedProperties props = setupBaseProperties(tableVersion); + // Round 1: ingest start from beginning to checkpoint 10. Return wrong version of checkpoint. + props.put(OP_FETCH_NEXT_BATCH, + tableVersion.equals("6") ? OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY : OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); + props.put(RETURN_CHECKPOINT_KEY, "10"); + // Validating the source input ckp is empty when doing the sync. + props.put(VAL_INPUT_CKP, VAL_EMPTY_CKP_KEY); + props.put("hoodie.metadata.enable", "false"); + props.put("hoodie.write.auto.upgrade", "false"); + props.put("hoodie.write.table.version", tableVersion); + HoodieDeltaStreamer ds = new HoodieDeltaStreamer(createConfig(basePath(), null, sourceClass), jsc, Option.of(props)); + Exception ex = assertThrows(java.lang.IllegalStateException.class, ds::sync); + // Contain error messages. + if (tableVersion.equals("8")) { + assertTrue(ex.getMessage().contains("Data source should return checkpoint version V2.")); + } else { + assertTrue(ex.getMessage().contains("Data source should return checkpoint version V1.")); + } + } + + /** + * Tests the end-to-end sync behavior with multiple sync iterations. + * + * Test flow: + * 1. First sync: + * - Starts with no checkpoint + * - Ingests data until checkpoint "10" + * - Verifies commit metadata contains only checkpoint="10" + * + * 2. Second sync: + * - Uses different table version + * - Continues from checkpoint "10" to "20" + * - Verifies commit metadata contains checkpoint="20" + * + * 3. Third sync: + * - Upgrades to table version 8 + * - Continues from checkpoint "20" to "30" + * - Verifies commit metadata still uses checkpoint V1 format + * - Verifies final checkpoint="30" + */ @ParameterizedTest @CsvSource({ "6, org.apache.hudi.utilities.sources.MockS3EventsHoodieIncrSource", @@ -141,7 +195,7 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), tableVersion); TypedProperties props = setupBaseProperties(tableVersion); // Round 1: ingest start from beginning to checkpoint 10. No checkpoint override. - props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); props.put(RETURN_CHECKPOINT_KEY, "10"); // Validating the source input ckp is empty when doing the sync. props.put(VAL_INPUT_CKP, VAL_EMPTY_CKP_KEY); @@ -162,7 +216,7 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin props = setupBaseProperties(toggleVersion(tableVersion)); props.put("hoodie.metadata.enable", "false"); // Dummy behavior injection to return ckp 2. 
+
+  /**
+   * Tests the end-to-end sync behavior with multiple sync iterations.
+   *
+   * Test flow:
+   * 1. First sync:
+   *    - Starts with no checkpoint
+   *    - Ingests data until checkpoint "10"
+   *    - Verifies commit metadata contains only checkpoint="10"
+   *
+   * 2. Second sync:
+   *    - Uses different table version
+   *    - Continues from checkpoint "10" to "20"
+   *    - Verifies commit metadata contains checkpoint="20"
+   *
+   * 3. Third sync:
+   *    - Upgrades to table version 8
+   *    - Continues from checkpoint "20" to "30"
+   *    - Verifies commit metadata still uses checkpoint V1 format
+   *    - Verifies final checkpoint="30"
+   */
   @ParameterizedTest
   @CsvSource({
       "6, org.apache.hudi.utilities.sources.MockS3EventsHoodieIncrSource",
@@ -141,7 +195,7 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin
     metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), tableVersion);
     TypedProperties props = setupBaseProperties(tableVersion);
     // Round 1: ingest start from beginning to checkpoint 10. No checkpoint override.
-    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY);
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY);
     props.put(RETURN_CHECKPOINT_KEY, "10");
     // Validating the source input ckp is empty when doing the sync.
     props.put(VAL_INPUT_CKP, VAL_EMPTY_CKP_KEY);
@@ -162,7 +216,7 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin
     props = setupBaseProperties(toggleVersion(tableVersion));
     props.put("hoodie.metadata.enable", "false");
     // Dummy behavior injection to return ckp 2.
-    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY);
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY);
     props.put(RETURN_CHECKPOINT_KEY, "20");
     props.put("hoodie.write.auto.upgrade", "false");
     // Validate the given checkpoint is ckp 1 when doing the sync.
@@ -181,11 +235,12 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin
     expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V1, "20");
     verifyLastInstantCommitMetadata(expectedMetadata);
 
-    // In the third round, enable MDT and auto upgrade, use table version 8
+    // In the third round, disable MDT and auto upgrade, use table write version 8. The result is
+    // that no upgrade will happen.
     props = setupBaseProperties("8");
     props.put("hoodie.metadata.enable", "false");
     // Dummy behavior injection to return ckp 1.
-    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY);
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY);
     props.put(RETURN_CHECKPOINT_KEY, "30");
     props.put("hoodie.write.auto.upgrade", "false");
     // Validate the given checkpoint is ckp 2 when doing the sync.
@@ -197,11 +252,44 @@ public void testSyncE2ENoPrevCkpThenSyncMultipleTimes(String tableVersion, Strin
     ds = new HoodieDeltaStreamer(createConfig(basePath(), null, sourceClass), jsc, Option.of(props));
     ds.sync();
 
-    // After upgrading, we still use checkpoint V1 since this is s3/Gcs incremental source.
+    // Assert no upgrade happens.
+    if (tableVersion.equals("6")) {
+      metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "6");
+      assertEquals(HoodieTableVersion.SIX, metaClient.getTableConfig().getTableVersion());
+      assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_1, metaClient.getTableConfig().getTimelineLayoutVersion().get());
+    }
+
+    // After that, we still use checkpoint V1 since this is an S3/GCS incremental source.
     expectedMetadata = new HashMap<>();
     expectedMetadata.put("schema", "");
     expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V1, "30");
     verifyLastInstantCommitMetadata(expectedMetadata);
+
+    // In the fourth round, enable auto upgrade and use table write version 8; the upgrade is successful.
+    props = setupBaseProperties("8");
+    props.put("hoodie.metadata.enable", "false");
+    // Dummy behavior injection to return ckp 40.
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY);
+    props.put(RETURN_CHECKPOINT_KEY, "40");
+    props.put("hoodie.write.auto.upgrade", "true");
+    // Validate the given checkpoint is ckp 30 when doing the sync.
+    props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS);
+    props.put(VAL_CKP_KEY_EQ_VAL, "30");
+    props.put(VAL_CKP_RESET_KEY_IS_NULL, "IGNORED");
+    props.put(VAL_CKP_IGNORE_KEY_IS_NULL, "IGNORED");
+
+    ds = new HoodieDeltaStreamer(createConfig(basePath(), null, sourceClass), jsc, Option.of(props));
+    ds.sync();
+
+    // After upgrading, we still use checkpoint V1 since this is an S3/GCS incremental source.
+    // Assert the table is upgraded.
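// ---------------------------------------------------------------------------------------------
// Editor's aside, not part of the diff: the two knobs the rounds above toggle. Values mirror
// the test setup; the behavior notes only summarize the assertions, not new semantics.
TypedProperties knobs = new TypedProperties();
knobs.put("hoodie.write.table.version", "8");    // target write version for the sync
knobs.put("hoodie.write.auto.upgrade", "false"); // round 3: table stays at version 6, layout 1
// Flipping auto upgrade to "true" (round 4) lets the same sync move the table to version 8 and
// timeline layout 2, while an S3/GCS incremental source keeps writing V1 checkpoint keys.
// ---------------------------------------------------------------------------------------------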
+ metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "8"); + assertEquals(HoodieTableVersion.EIGHT, metaClient.getTableConfig().getTableVersion()); + assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_2, metaClient.getTableConfig().getTimelineLayoutVersion().get()); + expectedMetadata = new HashMap<>(); + expectedMetadata.put("schema", ""); + expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V1, "40"); + verifyLastInstantCommitMetadata(expectedMetadata); } /** @@ -237,7 +325,7 @@ public void testSyncE2ENoPrevCkpWithCkpOverride(String tableVersion) throws Exce // set it to start at ckp "10" and in that iteration the streamer stops at ckp "30 metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), tableVersion); TypedProperties props = setupBaseProperties(tableVersion); - props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS); props.put(RETURN_CHECKPOINT_KEY, "30"); // The data source said it stops at "30". props.put(VAL_CKP_KEY_EQ_VAL, "10"); // Ensure the data source is notified we should start at "10" @@ -295,7 +383,7 @@ public void testSyncE2ENoPrevCkpWithCkpOverride(String tableVersion) throws Exce // In the next iteration, using the same config with ignore key setting does not lead to checkpoint reset. props = setupBaseProperties(tableVersion); - props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); props.put(RETURN_CHECKPOINT_KEY, "70"); // Stop at 70 after ingestion. props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS); props.put(VAL_CKP_KEY_EQ_VAL, "60"); // Ensure the data source is notified we should start at "60". @@ -312,7 +400,7 @@ public void testSyncE2ENoPrevCkpWithCkpOverride(String tableVersion) throws Exce // contain that info. cfg.ignoreCheckpoint = null; props = setupBaseProperties(tableVersion); - props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); props.put(RETURN_CHECKPOINT_KEY, "80"); // Stop at 70 after ingestion. props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS); props.put(VAL_CKP_KEY_EQ_VAL, "70"); // Ensure the data source is notified we should start at "60". @@ -398,7 +486,7 @@ public void testSyncE2EForceSkip(String tableVersion) throws Exception { metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), tableVersion); TypedProperties props = setupBaseProperties(tableVersion); props.put(CHECKPOINT_FORCE_SKIP.key(), "true"); - props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_KEY); + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY); props.put(VAL_INPUT_CKP, VAL_EMPTY_CKP_KEY); HoodieDeltaStreamer ds = new HoodieDeltaStreamer(createConfig(basePath(), null), jsc, Option.of(props)); diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2EAutoUpgrade.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2EAutoUpgrade.java new file mode 100644 index 0000000000000..22b9673a078f4 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestHoodieIncrSourceE2EAutoUpgrade.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.streamer; + +import org.apache.hudi.common.config.DFSPropertiesConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.HoodieTableVersion; +import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1; +import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.ConfigUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieUpgradeDowngradeException; +import org.apache.hudi.storage.StorageConfiguration; +import org.apache.hudi.testutils.HoodieClientTestUtils; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerTestBase; +import org.apache.hudi.utilities.sources.MockGeneralHoodieIncrSource; +import org.apache.hudi.utilities.sources.S3EventsHoodieIncrSourceHarness; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE; +import static org.apache.hudi.common.table.checkpoint.HoodieIncrSourceCheckpointValUtils.REQUEST_TIME_PREFIX; +import static org.apache.hudi.common.table.checkpoint.HoodieIncrSourceCheckpointValUtils.RESET_CHECKPOINT_V2_SEPARATOR; +import static org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1.STREAMER_CHECKPOINT_KEY_V1; +import static org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2.STREAMER_CHECKPOINT_KEY_V2; +import static org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2.STREAMER_CHECKPOINT_RESET_KEY_V2; +import static org.apache.hudi.common.testutils.HoodieTestUtils.RAW_TRIPS_TEST_NAME; +import static org.apache.hudi.config.HoodieWriteConfig.WRITE_TABLE_VERSION; +import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_CKP_IGNORE_KEY_IS_NULL; +import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_CKP_INSTANCE_OF; +import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_CKP_KEY_EQ_VAL; +import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_CKP_RESET_KEY_EQUALS; 
+import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_CKP_RESET_KEY_IS_NULL;
+import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_EMPTY_CKP_KEY;
+import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_INPUT_CKP;
+import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_NON_EMPTY_CKP_ALL_MEMBERS;
+import static org.apache.hudi.utilities.sources.CheckpointValidator.VAL_NO_INGESTION_HAPPENS;
+import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY;
+import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY;
+import static org.apache.hudi.utilities.sources.DummyOperationExecutor.OP_FETCH_NEXT_BATCH;
+import static org.apache.hudi.utilities.sources.DummyOperationExecutor.RETURN_CHECKPOINT_KEY;
+import static org.apache.hudi.utilities.sources.helpers.IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+@ExtendWith(MockitoExtension.class)
+public class TestHoodieIncrSourceE2EAutoUpgrade extends S3EventsHoodieIncrSourceHarness {
+
+  private String schemaStr;
+
+  @BeforeEach
+  void setup() throws IOException {
+    super.setUp();
+    schemaStr = SchemaTestUtil.getSimpleSchema().toString();
+  }
+
+  @Override
+  public HoodieTableMetaClient getHoodieMetaClient(StorageConfiguration<?> storageConf, String basePath, Properties props) throws IOException {
+    return HoodieTableMetaClient.newTableBuilder()
+        .setTableName(RAW_TRIPS_TEST_NAME)
+        .setTableType(COPY_ON_WRITE)
+        .setPayloadClass(HoodieAvroPayload.class)
+        .setTableVersion(ConfigUtils.getIntWithAltKeys(new TypedProperties(props), WRITE_TABLE_VERSION))
+        .setTableCreateSchema(schemaStr)
+        .fromProperties(props)
+        .initTable(storageConf.newInstance(), basePath);
+  }
+
+  private String toggleVersion(String version) {
+    return "8".equals(version) ? "6" : "8";
+  }
+
+  private HoodieDeltaStreamer.Config createConfig(String basePath, String sourceCheckpoint, String sourceClass) {
+    HoodieDeltaStreamer.Config cfg = HoodieDeltaStreamerTestBase.TestHelpers.makeConfig(
+        basePath,
+        WriteOperationType.INSERT,
+        sourceClass,
+        Collections.emptyList(),
+        sourceCheckpoint != null ? DFSPropertiesConfiguration.DEFAULT_PATH.toString() : null,
+        false,
+        false,
+        100000,
+        false,
+        null,
+        null,
+        "timestamp",
+        sourceCheckpoint);
+    cfg.propsFilePath = DFSPropertiesConfiguration.DEFAULT_PATH.toString();
+    return cfg;
+  }
+
+  private TypedProperties setupBaseProperties(String tableVersion) {
+    TypedProperties props = setProps(READ_UPTO_LATEST_COMMIT);
+    props.put(WRITE_TABLE_VERSION.key(), tableVersion);
+    return props;
+  }
+
+  public void verifyLastInstantCommitMetadata(Map<String, String> expectedMetadata) {
+    metaClient.reloadActiveTimeline();
+    Option<HoodieCommitMetadata> metadata = HoodieClientTestUtils.getCommitMetadataForInstant(
+        metaClient, metaClient.getActiveTimeline().lastInstant().get());
+    assertFalse(metadata.isEmpty());
+    assertEquals(expectedMetadata, metadata.get().getExtraMetadata());
+  }
+
+  /**
+   * Tests the end-to-end sync behavior with multiple sync iterations.
+   */
+  @Test
+  public void testSyncE2ENoPrevCkpThenSyncMultipleTimes() throws Exception {
+    String sourceClass = MockGeneralHoodieIncrSource.class.getName();
+    // ========== First start with no previous checkpoint and ingest till ckp 10 with table version 6. ============
+    // table version 6, write version 6 without checkpoint override.
+    // start from checkpoint v1 null, stop at checkpoint v1 10.
+
+    // Disable auto upgrade and MDT as we want to keep things as they are.
+    metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "6");
+    TypedProperties props = setupBaseProperties("6");
+    // Round 1: ingestion starts from the beginning and stops at checkpoint V1 10. No checkpoint override.
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V1_KEY);
+    props.put(RETURN_CHECKPOINT_KEY, "10");
+    // Validating the source input ckp is empty when doing the sync.
+    props.put(VAL_INPUT_CKP, VAL_EMPTY_CKP_KEY);
+    props.put("hoodie.metadata.enable", "false");
+    props.put("hoodie.write.auto.upgrade", "false");
+
+    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(createConfig(basePath(), null, sourceClass), jsc, Option.of(props));
+    ds.sync();
+    metaClient.reloadActiveTimeline();
+
+    Map<String, String> expectedMetadata = new HashMap<>();
+    // Confirm the resulting checkpoint is 10 and no other metadata.
+    expectedMetadata.put("schema", "");
+    expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V1, "10");
+    verifyLastInstantCommitMetadata(expectedMetadata);
+    assertEquals(HoodieTableVersion.SIX.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
+    assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_1.getVersion(), metaClient.getTableConfig().getTimelineLayoutVersion().get().getVersion());
+
+    // Save the last instant for later validation.
+    HoodieInstant instantAfterFirstRound = metaClient.getActiveTimeline().lastInstant().get();
+
+    // ========== Verify we don't allow checkpoint overrides when upgrading. ==========
+    // table version 6, write version 8 with checkpoint override.
+    // start from checkpoint v1 null, no successful ingestion.
+
+    // Then try to resume from ckp 10 and ingest till ckp V2 20 with table version 8. Expect an exception to be thrown and nothing to be ingested.
+    props = setupBaseProperties("8");
+    // In the ingestion iteration that does upgrade, we should allow MDT on at the same time.
+    props.put("hoodie.metadata.enable", "true");
+    // Enable auto upgrade.
+    props.put("hoodie.write.auto.upgrade", "true");
+    // Validate no ingestion happens.
+    props.put(VAL_INPUT_CKP, VAL_NO_INGESTION_HAPPENS);
+
+    HoodieDeltaStreamer.Config cfg = createConfig(basePath(), null, sourceClass);
+    cfg.checkpoint = "overrideWhenAutoUpgradingWouldFail";
+    ds = new HoodieDeltaStreamer(cfg, jsc, Option.of(props));
+    Exception ex = assertThrows(HoodieUpgradeDowngradeException.class, ds::sync);
+    assertTrue(ex.getMessage().contains("When upgrade/downgrade is happening, please avoid setting --checkpoint option and --ignore-checkpoint for your delta streamers."));
+    // No changes to the timeline / table config.
+    metaClient.reloadActiveTimeline();
+    assertEquals(metaClient.getActiveTimeline().lastInstant().get(), instantAfterFirstRound);
+    assertEquals(HoodieTableVersion.SIX, metaClient.getTableConfig().getTableVersion());
+    assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_1, metaClient.getTableConfig().getTimelineLayoutVersion().get());
+
+    // ========== If no checkpoint override, the upgrade goes through ==========
+    // table version 6, write version 8 without checkpoint override.
+    // start from checkpoint v1 10, stop at checkpoint v2 20.
+    props = setupBaseProperties("8");
+    props.put("hoodie.metadata.enable", "false"); // follow up with MDT enabled
+    // In this iteration, we expect to resume from the previous checkpoint "10". The data source would get a v1 checkpoint, as
+    // for all hoodie incremental sources the checkpoint translation step is a no-op.
+    props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS);
+    props.put(VAL_CKP_INSTANCE_OF, StreamerCheckpointV1.class.getName());
+    props.put(VAL_CKP_KEY_EQ_VAL, "10");
+    props.put(VAL_CKP_RESET_KEY_IS_NULL, "IGNORED");
+    props.put(VAL_CKP_IGNORE_KEY_IS_NULL, "IGNORED");
+    // Dummy behavior injection to return ckp v2 20.
+    // The source receives the previous v1 checkpoint and directly returns a v2 checkpoint.
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY);
+    props.put(RETURN_CHECKPOINT_KEY, "20");
+    props.put("hoodie.write.auto.upgrade", "true");
+
+    ds = new HoodieDeltaStreamer(createConfig(basePath(), null, sourceClass), jsc, Option.of(props));
+    ds.sync();
+    // After the sync the table is upgraded to version 8.
+    metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "8");
+    // Confirm the resulting checkpoint is 20 and, besides the schema, no other metadata.
+    expectedMetadata.clear();
+    expectedMetadata.put("schema", schemaStr);
+    expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V2, "20");
+    verifyLastInstantCommitMetadata(expectedMetadata);
+    // Table is upgraded
+    metaClient.reloadActiveTimeline();
+    assertNotEquals(metaClient.getActiveTimeline().lastInstant().get(), instantAfterFirstRound);
+    assertEquals(HoodieTableVersion.EIGHT, metaClient.getTableConfig().getTableVersion());
+    assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_2, metaClient.getTableConfig().getTimelineLayoutVersion().get());
+
+    // ========== Checkpoint override with the new format REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + instant ==========
+    props = setupBaseProperties("8");
+    props.put("hoodie.metadata.enable", "true");
+    // The user does a request time based checkpoint override.
+    cfg = createConfig(basePath(), REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + "30", sourceClass);
+    // In this iteration, we override the checkpoint instead of resuming from ckp 20. Covers 2 modes: event time based and
+    // commit time based.
+    props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS);
+    // Data source will know this checkpoint is configured by the override, and it is translated to
+    // checkpoint v1 in the translate-checkpoint step.
+    props.put(VAL_CKP_INSTANCE_OF, StreamerCheckpointV1.class.getName());
+    // The data source will only see this is a v1 checkpoint (request time based) with offset 30 to resume.
+    props.put(VAL_CKP_KEY_EQ_VAL, "30");
+    props.put(VAL_CKP_RESET_KEY_IS_NULL, "IGNORED");
+    props.put(VAL_CKP_IGNORE_KEY_IS_NULL, "IGNORED");
+    // Dummy behavior consumes the given v1 checkpoint and always returns a v2 checkpoint. We only support resetting checkpoint
+    // based on request time, but after that it would be translated to completion time and everything after that is completion
+    // time based. We are not following request time ordering anymore in version 8.
+    props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY);
+    props.put(RETURN_CHECKPOINT_KEY, "40");
+    props.put("hoodie.write.auto.upgrade", "true");
+
+    ds = new HoodieDeltaStreamer(cfg, jsc, Option.of(props));
+    ds.sync();
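// ---------------------------------------------------------------------------------------------
// Editor's aside, not part of the diff: a hedged sketch of how an override string shaped like
// REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + "30" could resolve before the source
// runs. The real logic lives in HoodieIncrSourceCheckpointValUtils; resolveOverride is a
// hypothetical helper that only mirrors what the validators observe above (the source sees a
// StreamerCheckpointV1 whose key is the bare instant "30").
static Checkpoint resolveOverride(String overrideCheckpoint) {
  String requestTimePrefix = REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR;
  if (overrideCheckpoint.startsWith(requestTimePrefix)) {
    // Request-time based reset: hand the source a V1 checkpoint carrying the raw instant time.
    return new StreamerCheckpointV1(overrideCheckpoint.substring(requestTimePrefix.length()));
  }
  // Assumed fallback for this sketch: treat anything else as completion-time based (V2).
  return new StreamerCheckpointV2(overrideCheckpoint);
}
// ---------------------------------------------------------------------------------------------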
+ metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "8"); + // Confirm the resulting commit metadata contains the reset ckp and 30 + expectedMetadata.clear(); + expectedMetadata.put("schema", schemaStr); + expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V2, "40"); // V2 means completion time based. + expectedMetadata.put(STREAMER_CHECKPOINT_RESET_KEY_V2, REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + "30"); + verifyLastInstantCommitMetadata(expectedMetadata); + // Table is upgraded + metaClient.reloadActiveTimeline(); + assertNotEquals(metaClient.getActiveTimeline().lastInstant().get(), instantAfterFirstRound); + assertEquals(HoodieTableVersion.EIGHT, metaClient.getTableConfig().getTableVersion()); + assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_2, metaClient.getTableConfig().getTimelineLayoutVersion().get()); + + // ============ Reuse the same writ config with : ========== + // table version 8, write version 8 without request time based checkpoint override. + // start from checkpoint v2 40 stop at checkpoint v2 50. + + // after resetting the checkpoint, reusing the same configure should not lead to checkpoint reset. + props = setupBaseProperties("8"); + props.put("hoodie.metadata.enable", "true"); + // If user wants to do request time based checkpoint override. + // In this iteration, we override the checkpoint instead of resuming from ckp 20. Covers 2 modes: Event time based and + // commit time based. + props.put(VAL_INPUT_CKP, VAL_NON_EMPTY_CKP_ALL_MEMBERS); + // As normal iteration does, it should use v8 ckp for version 8. + props.put(VAL_CKP_INSTANCE_OF, StreamerCheckpointV2.class.getName()); + // The data source now starts at what the previous iteration stops, no override. + props.put(VAL_CKP_KEY_EQ_VAL, "40"); + props.put(VAL_CKP_RESET_KEY_EQUALS, REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + "30"); + props.put(VAL_CKP_IGNORE_KEY_IS_NULL, "IGNORED"); + // Dummy behavior is consuming the given v1 checkpoint and always return a v2 checkpoint. We only support resetting checkpoint + // based on request time, but after that it would be translated to completion time and everything after that is completion + // time based. We are not following request time ordering anymore in version 8. + props.put(OP_FETCH_NEXT_BATCH, OP_EMPTY_ROW_SET_NONE_NULL_CKP_V2_KEY); + props.put(RETURN_CHECKPOINT_KEY, "50"); + props.put("hoodie.write.auto.upgrade", "true"); + + ds = new HoodieDeltaStreamer(cfg, jsc, Option.of(props)); + ds.sync(); + // After the sync the table is upgraded to version 8. + metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "8"); + // Confirm the checkpoint commit metadata. 
+    // After the sync the table stays at version 8.
+    metaClient = getHoodieMetaClientWithTableVersion(storageConf(), basePath(), "8");
+    // Confirm the checkpoint commit metadata.
+    expectedMetadata.clear();
+    expectedMetadata.put("schema", schemaStr);
+    expectedMetadata.put(STREAMER_CHECKPOINT_KEY_V2, "50");
+    expectedMetadata.put(STREAMER_CHECKPOINT_RESET_KEY_V2, REQUEST_TIME_PREFIX + RESET_CHECKPOINT_V2_SEPARATOR + "30");
+    verifyLastInstantCommitMetadata(expectedMetadata);
+    // Table is still version 8
+    metaClient.reloadActiveTimeline();
+    assertEquals(HoodieTableVersion.EIGHT, metaClient.getTableConfig().getTableVersion());
+    assertEquals(TimelineLayoutVersion.LAYOUT_VERSION_2, metaClient.getTableConfig().getTimelineLayoutVersion().get());
+  }
+}
\ No newline at end of file
diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamerCheckpointUtils.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamerCheckpointUtils.java
index 113f8e3f8d899..83fe492c2390a 100644
--- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamerCheckpointUtils.java
+++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/streamer/TestStreamerCheckpointUtils.java
@@ -21,17 +21,20 @@
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.HoodieTableVersion;
 import org.apache.hudi.common.table.checkpoint.Checkpoint;
 import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV1;
 import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
 import org.apache.hudi.common.table.timeline.versioning.v2.InstantComparatorV2;
 import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieUpgradeDowngradeException;
 import org.apache.hudi.testutils.SparkClientFunctionalTestHarness;
 import org.apache.hudi.utilities.exception.HoodieStreamerException;
@@ -44,6 +47,7 @@
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.apache.hudi.common.testutils.HoodieTestUtils.getDefaultStorageConf;
 import static org.apache.hudi.utilities.streamer.HoodieStreamer.CHECKPOINT_KEY;
 import static org.apache.hudi.utilities.streamer.StreamSync.CHECKPOINT_IGNORE_KEY;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -66,7 +70,7 @@ public void setUp() throws IOException {
   @Test
   public void testEmptyTimelineCase() throws IOException {
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.isEmpty());
   }
@@ -82,7 +86,7 @@ public void testIgnoreCheckpointCaseEmptyIgnoreKey() throws IOException {
     streamerConfig.ignoreCheckpoint = "ignore_checkpoint_1";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.isEmpty());
   }
@@ -98,7 +102,7 @@ public void testIgnoreCheckpointCaseIgnoreKeyMismatch() throws IOException {
     streamerConfig.ignoreCheckpoint = "ignore_checkpoint_1";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.isEmpty());
   }
@@ -114,7 +118,7 @@ public void testThrowExceptionCase() throws IOException {
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
     HoodieStreamerException exception = assertThrows(HoodieStreamerException.class, () -> {
-      StreamerCheckpointUtils.getCheckpointToResumeString(
+      StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
           metaClient.getActiveTimeline(), streamerConfig, props);
     });
     assertTrue(exception.getMessage().contains("Unable to find previous checkpoint"));
@@ -132,7 +136,7 @@ public void testNewCheckpointV2WithResetKeyCase() throws IOException {
     streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.KafkaSource";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("earliest", checkpoint.get().getCheckpointKey());
@@ -149,7 +153,7 @@ public void testNewCheckpointV1WithResetKeyCase() throws IOException {
     streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.KafkaSource";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "1");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("earliest", checkpoint.get().getCheckpointKey());
@@ -166,7 +170,7 @@ public void testReuseCheckpointCase() throws IOException {
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
 
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertEquals("earliest-0-100", checkpoint.get().getCheckpointKey());
   }
@@ -180,7 +184,7 @@ public void testNewCheckpointV2NoMetadataCase() throws IOException {
     streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.KafkaSource";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV2);
     assertEquals("earliest", checkpoint.get().getCheckpointKey());
@@ -196,7 +200,7 @@ public void testNewCheckpointV1NoMetadataCase() throws IOException {
     streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.KafkaSource";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "1");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("earliest", checkpoint.get().getCheckpointKey());
@@ -224,7 +228,7 @@ public void testIgnoreCheckpointNullKeyCase() throws IOException {
     streamerConfig.ignoreCheckpoint = "ignore_checkpoint_1";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.isEmpty());
   }
@@ -240,7 +244,7 @@ public void testNewCheckpointWithEmptyResetKey() throws IOException {
     streamerConfig.checkpoint = "new-checkpoint";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("new-checkpoint", checkpoint.get().getCheckpointKey());
@@ -257,7 +261,7 @@ public void testNewCheckpointWithDifferentResetKey() throws IOException {
     streamerConfig.checkpoint = "new-checkpoint";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("new-checkpoint", checkpoint.get().getCheckpointKey());
@@ -283,7 +287,7 @@ public void testMergeOnReadWithDeltaCommits() throws IOException {
     deltaCommitMetadata.put(HoodieStreamer.CHECKPOINT_KEY, "deltacommit-cp");
     createDeltaCommit(deltaCommitTime, deltaCommitMetadata);
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
 
     // Should use deltacommit checkpoint
@@ -302,7 +306,7 @@ public void testMergeOnReadWithoutDeltaCommits() throws IOException {
     commitMetadata.put(HoodieStreamer.CHECKPOINT_KEY, "commit-cp");
     createCommit(commitTime, commitMetadata);
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeString(
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointBetweenConfigAndPrevCommit(
         metaClient.getActiveTimeline(), streamerConfig, props);
 
     // Should use commit checkpoint
@@ -325,8 +329,8 @@ public void testCreateNewCheckpointV2WithNullTimeline() throws IOException {
     streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.KafkaSource";
     props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "2");
-    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.getCheckpointToResumeFrom(
-        Option.empty(), streamerConfig, props);
+    Option<Checkpoint> checkpoint = StreamerCheckpointUtils.resolveCheckpointToResumeFrom(
+        Option.empty(), streamerConfig, props, metaClient);
     assertTrue(checkpoint.get() instanceof StreamerCheckpointV1);
     assertEquals("test-cp", checkpoint.get().getCheckpointKey());
   }
@@ -336,8 +340,8 @@ public void testCreateNewCheckpointV1WithNullTimeline() throws IOException {
     streamerConfig.checkpoint = "test-cp";
props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "1"); - Option checkpoint = StreamerCheckpointUtils.getCheckpointToResumeFrom( - Option.empty(), streamerConfig, props); + Option checkpoint = StreamerCheckpointUtils.resolveCheckpointToResumeFrom( + Option.empty(), streamerConfig, props, metaClient); assertTrue(checkpoint.get() instanceof StreamerCheckpointV1); assertEquals("test-cp", checkpoint.get().getCheckpointKey()); } @@ -345,8 +349,8 @@ public void testCreateNewCheckpointV1WithNullTimeline() throws IOException { @Test public void testEmptyTimelineAndNullCheckpoint() throws IOException { streamerConfig.checkpoint = null; - Option checkpoint = StreamerCheckpointUtils.getCheckpointToResumeFrom( - Option.empty(), streamerConfig, props); + Option checkpoint = StreamerCheckpointUtils.resolveCheckpointToResumeFrom( + Option.empty(), streamerConfig, props, metaClient); assertTrue(checkpoint.isEmpty()); } @@ -359,8 +363,128 @@ public void testTimelineWithCheckpointOverridesConfigCheckpoint() throws IOExcep streamerConfig.checkpoint = "config-cp"; - Option checkpoint = StreamerCheckpointUtils.getCheckpointToResumeFrom( - Option.of(metaClient.getActiveTimeline()), streamerConfig, props); + Option checkpoint = StreamerCheckpointUtils.resolveCheckpointToResumeFrom( + Option.of(metaClient.getActiveTimeline()), streamerConfig, props, metaClient); assertEquals("config-cp", checkpoint.get().getCheckpointKey()); } + + @Test + public void testAssertNoCheckpointOverrideDuringUpgradeSuccess() throws IOException { + // Create metaclient with older version + metaClient = + HoodieTableMetaClient.newTableBuilder() + .setDatabaseName("dataset") + .setTableName("testTable") + .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1) + .setTableVersion(HoodieTableVersion.SIX) + .setTableType(HoodieTableType.MERGE_ON_READ) + .initTable(getDefaultStorageConf(), basePath()); + // No checkpoint override configs set + streamerConfig.checkpoint = null; + streamerConfig.ignoreCheckpoint = null; + + // Even with auto-upgrade enabled and version mismatch, should pass when no checkpoint override + props.setProperty(HoodieWriteConfig.AUTO_UPGRADE_VERSION.key(), "true"); + props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "8"); + + // Should not throw exception + StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props); + } + + @Test + public void testAssertNoCheckpointOverrideDuringUpgradeFailure() throws IOException { + metaClient = + HoodieTableMetaClient.newTableBuilder() + .setDatabaseName("dataset") + .setTableName("testTable") + .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1) + .setTableVersion(HoodieTableVersion.SIX) + .setTableType(HoodieTableType.MERGE_ON_READ) + .initTable(getDefaultStorageConf(), basePath()); + + // Set checkpoint override + streamerConfig.checkpoint = "test-cp"; + streamerConfig.targetBasePath = "dummyVal"; + streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.S3EventsHoodieIncrSource"; + // Enable auto-upgrade and set newer write version + props.setProperty(HoodieWriteConfig.AUTO_UPGRADE_VERSION.key(), "true"); + props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "8"); + // Should throw exception due to checkpoint override during upgrade + assertThrows(HoodieUpgradeDowngradeException.class, () -> { + StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props); + }); + + // If not a hoodie incremental source, 
+    streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.NotAHoodieIncrSource";
+    streamerConfig.checkpoint = "test-cp";
+    StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props);
+  }
+
+  @Test
+  public void testAssertNoCheckpointOverrideDuringUpgradeWithIgnoreCheckpoint() throws IOException {
+    // Create metaclient with older version
+    metaClient = HoodieTableMetaClient.newTableBuilder()
+        .setDatabaseName("dataset")
+        .setTableName("testTable")
+        .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1)
+        .setTableVersion(HoodieTableVersion.SIX)
+        .setTableType(HoodieTableType.MERGE_ON_READ)
+        .initTable(getDefaultStorageConf(), basePath());
+    // Set ignore checkpoint override
+    streamerConfig.ignoreCheckpoint = "ignore-cp";
+    streamerConfig.targetBasePath = "dummyVal";
+    streamerConfig.sourceClassName = "org.apache.hudi.utilities.sources.GcsEventsHoodieIncrSource";
+
+    // Enable auto-upgrade and set newer write version
+    props.setProperty(HoodieWriteConfig.AUTO_UPGRADE_VERSION.key(), "true");
+    props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "8");
+
+    // Should throw exception due to ignore checkpoint override during upgrade
+    assertThrows(HoodieUpgradeDowngradeException.class, () -> {
+      StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props);
+    });
+  }
+
+  @Test
+  public void testAssertNoCheckpointOverrideDuringUpgradeWithAutoUpgradeDisabledVersion6() throws IOException {
+    // Test case 1: Version 6 table with version 6 write config
+    metaClient = HoodieTableMetaClient.newTableBuilder()
+        .setDatabaseName("dataset")
+        .setTableName("testTable")
+        .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1)
+        .setTableVersion(HoodieTableVersion.SIX)
+        .setTableType(HoodieTableType.MERGE_ON_READ)
+        .initTable(getDefaultStorageConf(), basePath());
+
+    streamerConfig.checkpoint = "test-cp";
+    streamerConfig.targetBasePath = "dummyVal";
+
+    // Disable auto-upgrade and set matching version
+    props.setProperty(HoodieWriteConfig.AUTO_UPGRADE_VERSION.key(), "false");
+    props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "6");
+
+    // Should pass since versions match
+    StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props);
+  }
+
+  @Test
+  public void testAssertNoCheckpointOverrideDuringUpgradeWithAutoUpgradeDisabledVersion8() throws IOException {
+    metaClient = HoodieTableMetaClient.newTableBuilder()
+        .setDatabaseName("dataset")
+        .setTableName("testTable")
+        .setTimelineLayoutVersion(TimelineLayoutVersion.VERSION_1)
+        .setTableVersion(HoodieTableVersion.EIGHT)
+        .setTableType(HoodieTableType.MERGE_ON_READ)
+        .initTable(getDefaultStorageConf(), basePath());
+
+    streamerConfig.checkpoint = "test-cp";
+    streamerConfig.targetBasePath = "dummyVal";
+
+    // Disable auto-upgrade and set matching version 8
+    props.setProperty(HoodieWriteConfig.AUTO_UPGRADE_VERSION.key(), "false");
+    props.setProperty(HoodieWriteConfig.WRITE_TABLE_VERSION.key(), "8");
+
+    // Should pass since versions match
+    StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource(metaClient, streamerConfig, props);
+  }
 }
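// ---------------------------------------------------------------------------------------------
// Editor's aside, not part of the diff: a hedged reconstruction of the guard the five tests
// above pin down. The real StreamerCheckpointUtils.assertNoCheckpointOverrideDuringUpgradeForHoodieIncSource
// may be structured differently; the message is the one asserted in
// TestHoodieIncrSourceE2EAutoUpgrade, and the three conditions are inferred from the pass/fail
// matrix of these tests. assertNoCheckpointOverrideDuringUpgrade is a hypothetical helper.
static void assertNoCheckpointOverrideDuringUpgrade(boolean upgradeOrDowngradeWillHappen,
                                                    boolean isHoodieIncrementalSource,
                                                    String checkpointOverride,
                                                    String ignoreCheckpointOverride) {
  boolean hasOverride = checkpointOverride != null || ignoreCheckpointOverride != null;
  if (upgradeOrDowngradeWillHappen && isHoodieIncrementalSource && hasOverride) {
    throw new HoodieUpgradeDowngradeException(
        "When upgrade/downgrade is happening, please avoid setting --checkpoint option and "
            + "--ignore-checkpoint for your delta streamers.");
  }
}
// ---------------------------------------------------------------------------------------------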