diff --git a/commons/build.gradle.kts b/commons/build.gradle.kts index 23240446..101ef8db 100644 --- a/commons/build.gradle.kts +++ b/commons/build.gradle.kts @@ -87,6 +87,7 @@ dependencies { testImplementation(jackson.databind) testImplementation(testinglibs.mockito.core) testImplementation(testinglibs.assertj.core) + testImplementation(testinglibs.awaitility) testImplementation(testFixtures(project(":commons"))) testImplementation(testinglibs.woodstox.stax2.api) testImplementation(apache.hadoop.mapreduce.client.core) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index 7fb8cd9b..954c9151 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -22,6 +22,8 @@ import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; public class SourceCommonConfig extends CommonConfig { @@ -64,11 +66,15 @@ public String getTargetTopicPartitions() { } public ErrorsTolerance getErrorsTolerance() { - return ErrorsTolerance.forName(sourceConfigFragment.getErrorsTolerance()); + return sourceConfigFragment.getErrorsTolerance(); } public int getMaxPollRecords() { return sourceConfigFragment.getMaxPollRecords(); } + public Transformer getTransformer() { + return TransformerFactory.getTransformer(schemaRegistryFragment.getInputFormat()); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index c62431dc..58befa60 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -88,8 +88,8 @@ public int getExpectedMaxMessageBytes() { return cfg.getInt(EXPECTED_MAX_MESSAGE_BYTES); } - public String getErrorsTolerance() { - return cfg.getString(ERRORS_TOLERANCE); + public ErrorsTolerance getErrorsTolerance() { + return ErrorsTolerance.forName(cfg.getString(ERRORS_TOLERANCE)); } private static class ErrorsToleranceValidator implements ConfigDef.Validator { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java new file mode 100644 index 00000000..f55257f4 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java @@ -0,0 +1,511 @@ +/* + * Copyright 2024-2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; + +import org.apache.commons.lang3.time.StopWatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class handles extracting records from an iterator and returning them to Kafka. It uses an exponential backoff + * with jitter to reduce the number of calls to the backend when there is no data. This solution: + * + * + * + */ +public abstract class AbstractSourceTask extends SourceTask { + + public static final List NULL_RESULT = null; + + /** + * The maximum time to spend polling. This is set to 5 seconds as that is the time that is allotted to a system for + * shutdown. + */ + public static final Duration MAX_POLL_TIME = Duration.ofSeconds(5); + /** + * The boolean that indicates the connector is stopped. + */ + private final AtomicBoolean connectorStopped; + + /** + * The logger to use. Set from the class implementing AbstractSourceTask. + */ + private final Logger logger; + + /** + * The maximum number of records to put in a poll. Specified in the configuration. + */ + private int maxPollRecords; + + /** + * The Backoff implementation that executes the delay in the poll loop. + */ + private final Backoff backoff; + + private final Timer timer; + + /** + * The configuration + */ + private SourceCommonConfig config; + + private Iterator sourceRecordIterator; + + /** + * Constructor. + * + * @param logger + * the logger to use. + */ + protected AbstractSourceTask(final Logger logger) { + super(); + this.logger = logger; + connectorStopped = new AtomicBoolean(); + timer = new Timer(MAX_POLL_TIME); + backoff = new Backoff(timer.getBackoffConfig()); + } + + /** + * Gets the iterator of SourceRecords. The iterator that SourceRecords are extracted from during a poll event. When + * this iterator runs out of records it should attempt to reset and read more records from the backend on the next + * {@code hasNext()} call. In this way it should detect when new data has been added to the backend and continue + * processing. + *
+     * <p>
+     * This method should handle any backend exception that can be retried. Any runtime exceptions that are thrown when
+     * this iterator executes may cause the task to abort.
+     * </p>
+ * + * @param config + * the configuration for the Backoff. + * @return The iterator of SourceRecords. + */ + abstract protected Iterator getIterator(BackoffConfig config); + + /** + * Called by {@link #start} to allows the concrete implementation to configure itself based on properties. + * + * @param props + * the properties to use for configuration. + */ + abstract protected SourceCommonConfig configure(Map props); + + @Override + public final void start(final Map props) { + logger.debug("Starting"); + config = configure(props); + maxPollRecords = config.getMaxPollRecords(); + sourceRecordIterator = getIterator(timer.getBackoffConfig()); + } + + /** + * Try to add a SourceRecord to the results. + * + * @param results + * the result to add the record to. + * @param sourceRecordIterator + * the source record iterator. + * @return true if successful, false if the iterator is empty. + */ + private boolean tryAdd(final List results, final Iterator sourceRecordIterator) { + if (sourceRecordIterator.hasNext()) { + backoff.reset(); + final SourceRecord sourceRecord = sourceRecordIterator.next(); + if (logger.isDebugEnabled()) { + logger.debug("tryAdd() : read record {}", sourceRecord.sourceOffset()); + } + results.add(sourceRecord); + return true; + } + logger.info("No records found in tryAdd call"); + return false; + } + + /** + * Returns {@code true} if the connector is not stopped and the timer has not expired. + * + * @return {@code true} if the connector is not stopped and the timer has not expired. + */ + protected boolean stillPolling() { + final boolean result = !connectorStopped.get() && !timer.isExpired(); + logger.debug("Still polling: {}", result); + return result; + } + + @Override + public final List poll() { + logger.debug("Polling"); + if (connectorStopped.get()) { + logger.info("Stopping"); + closeResources(); + return NULL_RESULT; + } else { + timer.start(); + try { + final List result = populateList(); + if (logger.isDebugEnabled()) { + logger.debug("Poll() returning {} SourceRecords.", result == null ? null : result.size()); + } + return result; + } finally { + timer.stop(); + timer.reset(); + } + } + } + + /** + * Attempts to populate the return list. Will read as many records into the list as it can until the timer expires + * or the task is shut down. + * + * @return A list SourceRecords or {@code null} if the system hit a runtime exception. + */ + private List populateList() { + final List results = new ArrayList<>(); + try { + while (stillPolling() && results.size() < maxPollRecords) { + if (!tryAdd(results, sourceRecordIterator)) { + if (!results.isEmpty()) { + logger.debug("tryAdd() did not add to the list, returning current results."); + // if we could not get a record and the results are not empty return them + break; + } + logger.debug("Attempting {}", backoff); + backoff.cleanDelay(); + } + } + + } catch (RuntimeException e) { // NOPMD must catch runtime here. + logger.error("Error during poll(): {}", e.getMessage(), e); + if (config.getErrorsTolerance() == ErrorsTolerance.NONE) { + logger.error("Stopping Task"); + throw e; + } + } + return results.isEmpty() ? NULL_RESULT : results; + } + + @Override + public final void stop() { + logger.debug("Stopping"); + connectorStopped.set(true); + } + + /** + * Returns the running state of the task. + * + * @return {@code true} if the connector is running, {@code false} otherwise. + */ + public final boolean isRunning() { + return !connectorStopped.get(); + } + + /** + * Close any resources the source has open. 
Called by the IteratorRunnable when it is stopping. + */ + abstract protected void closeResources(); + + /** + * Calculates elapsed time and flags when expired. + */ + protected static class Timer extends StopWatch { + /** + * The length of time that the timer should run. + */ + private final long duration; + + /** + * The flag that indicates the timer has been aborted. + */ + private boolean hasAborted; + + /** + * Constructor. + * + * @param duration + * the length of time the timer should run. + */ + Timer(final Duration duration) { + super(); + this.duration = duration.toMillis(); + } + + /** + * Gets the maximum duration for this timer. + * + * @return the maximum duration for the timer. + */ + public long millisecondsRemaining() { + return super.isStarted() ? duration - super.getTime() : duration; + } + + /** + * Returns {@code true} if the timer has expired. + * + * @return {@code true} if the timer has expired. + */ + public boolean isExpired() { + return hasAborted || super.getTime() >= duration; + } + + /** + * Aborts the timer. Timer will report that it has expired until reset is called. + */ + public void abort() { + hasAborted = true; + } + + @Override + public void start() { + try { + hasAborted = false; + super.start(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void stop() { + try { + super.stop(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void reset() { + try { + hasAborted = false; + super.reset(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + /** + * Gets a Backoff Config for this timer. + * + * @return a backoff Configuration. + */ + public BackoffConfig getBackoffConfig() { + return new BackoffConfig() { + + @Override + public SupplierOfLong getSupplierOfTimeRemaining() { + return Timer.this::millisecondsRemaining; + } + + @Override + public AbortTrigger getAbortTrigger() { + return Timer.this::abort; + } + }; + } + } + + /** + * Performs a delay based on the number of successive {@link #delay()} or {@link #cleanDelay()} calls without a + * {@link #reset()}. Delay increases exponentially but never exceeds the time remaining by more than 0.512 seconds. + */ + public static class Backoff { + /** The logger to write to */ + private static final Logger LOGGER = LoggerFactory.getLogger(Backoff.class); + /** + * The maximum jitter random number. Should be a power of 2 for speed. + */ + public static final int MAX_JITTER = 1024; + + public static final int JITTER_SUBTRAHEND = MAX_JITTER / 2; + /** + * A supplier of the time remaining (in milliseconds) on the overriding timer. + */ + private final SupplierOfLong timeRemaining; + + /** + * A function to call to abort the timer. + */ + private final AbortTrigger abortTrigger; + + /** + * The maximum number of times {@link #delay()} will be called before maxWait is reached. + */ + private int maxCount; + /** + * The number of times {@link #delay()} has been called. + */ + private int waitCount; + + /** + * A random number generator to construct jitter. + */ + Random random = new Random(); + + /** + * Constructor. + * + * @param config + * The configuration for the backoff. 
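+         *            With roughly 1,000 milliseconds remaining, for example, successive {@link #delay()} calls sleep
+         *            approximately 2, 4, 8, ... 512 milliseconds (each adjusted by up to +/- 0.512 seconds of jitter)
+         *            before the remaining time is exhausted; the figures are illustrative only.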
+         */
+        public Backoff(final BackoffConfig config) {
+            this.timeRemaining = config.getSupplierOfTimeRemaining();
+            this.abortTrigger = config.getAbortTrigger();
+            reset();
+        }
+
+        /**
+         * Reset the backoff time so that delay is again at the minimum.
+         */
+        public final void reset() {
+            // if the remaining time is 0 or negative the maxCount will be infinity
+            // so make sure that it is 0 in that case.
+            final long remainingTime = timeRemaining.get();
+            maxCount = remainingTime < 1L ? 0 : (int) (Math.log10(remainingTime) / Math.log10(2));
+            waitCount = 0;
+            LOGGER.debug("Reset {}", this);
+        }
+
+        /**
+         * Handle adjustment when maxCount could not be set.
+         *
+         * @return the corrected maxCount
+         */
+        private int getMaxCount() {
+            if (maxCount == 0) {
+                reset();
+            }
+            return maxCount;
+        }
+
+        /**
+         * Calculates the delay without jitter.
+         *
+         * @return the number of milliseconds the delay will be.
+         */
+        public long estimatedDelay() {
+            long sleepTime = timeRemaining.get();
+            if (sleepTime > 0 && waitCount < maxCount) {
+                sleepTime = (long) Math.min(sleepTime, Math.pow(2, waitCount + 1));
+            }
+            return sleepTime < 0 ? 0 : sleepTime;
+        }
+
+        /**
+         * Calculates the range of jitter in milliseconds.
+         *
+         * @return the maximum jitter in milliseconds. Jitter is +/- maximum jitter.
+         */
+        public int getMaxJitter() {
+            return MAX_JITTER - JITTER_SUBTRAHEND;
+        }
+
+        private long timeWithJitter() {
+            // generate approx +/- 0.512 seconds of jitter
+            final int jitter = random.nextInt(MAX_JITTER) - JITTER_SUBTRAHEND;
+            return (long) Math.pow(2, waitCount) + jitter;
+        }
+
+        /**
+         * Delay execution based on the number of times this method has been called.
+         *
+         * @throws InterruptedException
+         *             If any thread interrupts this thread.
+         */
+        public void delay() throws InterruptedException {
+            final long sleepTime = timeRemaining.get();
+            if (sleepTime > 0 && waitCount < (maxCount == 0 ? getMaxCount() : maxCount)) {
+                waitCount++;
+                final long nextSleep = timeWithJitter();
+                // don't sleep negative time. Jitter can introduce negative time.
+                if (nextSleep > 0) {
+                    if (nextSleep >= sleepTime) {
+                        LOGGER.debug("Backoff aborting timer");
+                        abortTrigger.apply();
+                    } else {
+                        LOGGER.debug("Backoff sleeping {}", nextSleep);
+                        Thread.sleep(nextSleep);
+                    }
+                }
+            }
+        }
+
+        /**
+         * Like {@link #delay} but swallows the {@link InterruptedException}.
+         */
+        public void cleanDelay() {
+            try {
+                delay();
+            } catch (InterruptedException exception) {
+                // do nothing; return results below
+            }
+        }
+
+        @Override
+        public String toString() {
+            return String.format("Backoff %s/%s, %s milliseconds remaining.", waitCount, maxCount, timeRemaining.get());
+        }
+    }
+
+    /**
+     * A functional interface to return long values.
+     */
+    @FunctionalInterface
+    public interface SupplierOfLong {
+        long get();
+    }
+
+    /**
+     * A functional interface that will abort the timer. After being called, the timer will indicate that it is
+     * expired, until it is reset.
+     */
+    @FunctionalInterface
+    public interface AbortTrigger {
+        void apply();
+    }
+
+    /**
+     * An interface to define the Backoff configuration. Used for convenience with Timer.
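+     * <p>
+     * A minimal wiring sketch (the five second duration is illustrative only):
+     * </p>
+     *
+     * <pre>{@code
+     * Timer timer = new Timer(Duration.ofSeconds(5));
+     * Backoff backoff = new Backoff(timer.getBackoffConfig());
+     * timer.start();
+     * while (!timer.isExpired()) {
+     *     // try to read from the backend here; call backoff.reset() after a successful read
+     *     backoff.cleanDelay();
+     * }
+     * timer.stop();
+     * }</pre>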
+ */ + public interface BackoffConfig { + SupplierOfLong getSupplierOfTimeRemaining(); + AbortTrigger getAbortTrigger(); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index de770cbc..760d074d 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -37,7 +37,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AvroTransformer extends Transformer { +public class AvroTransformer extends Transformer { private final AvroData avroData; @@ -54,9 +54,9 @@ public void configureValueConverter(final Map config, final Abst } @Override - public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - return new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { private DataFileStream dataFileStream; private final DatumReader datumReader = new GenericDatumReader<>(); @@ -78,9 +78,10 @@ public void doClose() { } @Override - protected boolean doAdvance(final Consumer action) { + protected boolean doAdvance(final Consumer action) { if (dataFileStream.hasNext()) { - action.accept(dataFileStream.next()); + final GenericRecord record = dataFileStream.next(); + action.accept(avroData.toConnectData(record.getSchema(), record)); return true; } return false; @@ -88,12 +89,6 @@ protected boolean doAdvance(final Consumer action) { }; } - @Override - public SchemaAndValue getValueData(final GenericRecord record, final String topic, - final AbstractConfig sourceConfig) { - return avroData.toConnectData(record.getSchema(), record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index f571062d..232aaef2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -31,7 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ByteArrayTransformer extends Transformer { +public class ByteArrayTransformer extends Transformer { private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); private static final int MAX_BUFFER_SIZE = 4096; @@ -42,9 +42,9 @@ public void configureValueConverter(final Map config, final Abst } @Override - public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { @Override protected 
InputStream inputOpened(final InputStream input) { return input; @@ -56,7 +56,7 @@ protected void doClose() { } @Override - protected boolean doAdvance(final Consumer action) { + protected boolean doAdvance(final Consumer action) { final byte[] buffer = new byte[MAX_BUFFER_SIZE]; try { final int bytesRead = IOUtils.read(inputStream, buffer); @@ -64,9 +64,9 @@ protected boolean doAdvance(final Consumer action) { return false; } if (bytesRead < MAX_BUFFER_SIZE) { - action.accept(Arrays.copyOf(buffer, bytesRead)); + action.accept(new SchemaAndValue(null, Arrays.copyOf(buffer, bytesRead))); } else { - action.accept(buffer); + action.accept(new SchemaAndValue(null, buffer)); } return true; } catch (IOException e) { @@ -77,11 +77,6 @@ protected boolean doAdvance(final Consumer action) { }; } - @Override - public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { - return new SchemaAndValue(null, record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index 4ff0f1a2..c6aea0e8 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -34,7 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class JsonTransformer extends Transformer { +public class JsonTransformer extends Transformer { private final JsonConverter jsonConverter; @@ -52,9 +52,9 @@ public void configureValueConverter(final Map config, final Abst } @Override - public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { BufferedReader reader; @Override @@ -75,7 +75,7 @@ public void doClose() { } @Override - public boolean doAdvance(final Consumer action) { + public boolean doAdvance(final Consumer action) { String line = null; try { // remove blank and empty lines. 
@@ -87,7 +87,7 @@ public boolean doAdvance(final Consumer action) { } } line = line.trim(); - action.accept(line.getBytes(StandardCharsets.UTF_8)); + action.accept(jsonConverter.toConnectData(topic, line.getBytes(StandardCharsets.UTF_8))); return true; } catch (IOException e) { LOGGER.error("Error reading input stream: {}", e.getMessage(), e); @@ -95,13 +95,6 @@ public boolean doAdvance(final Consumer action) { } } }; - - return spliterator; - } - - @Override - public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { - return jsonConverter.toConnectData(topic, record); } @Override diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 7da61c41..2c47d510 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -43,7 +43,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ParquetTransformer extends Transformer { +public class ParquetTransformer extends Transformer { private final AvroData avroData; @@ -59,12 +59,6 @@ public void configureValueConverter(final Map config, final Abst config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); } - @Override - public SchemaAndValue getValueData(final GenericRecord record, final String topic, - final AbstractConfig sourceConfig) { - return avroData.toConnectData(record.getSchema(), record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { @@ -72,10 +66,10 @@ public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topi } @Override - public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { private ParquetReader reader; private File parquetFile; @@ -114,11 +108,11 @@ protected void doClose() { } @Override - protected boolean doAdvance(final Consumer action) { + protected boolean doAdvance(final Consumer action) { try { final GenericRecord record = reader.read(); if (record != null) { - action.accept(record); // Pass record to the stream + action.accept(avroData.toConnectData(record.getSchema(), record)); // Pass record to the stream return true; } } catch (IOException e) { @@ -127,7 +121,6 @@ protected boolean doAdvance(final Consumer action) { return false; } }; - return spliterator; } static void deleteTmpFile(final Path parquetFile) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 196d9ae3..09e8c0ca 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -30,14 +30,14 @@ import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; -public abstract class 
Transformer { +public abstract class Transformer { public abstract void configureValueConverter(Map config, AbstractConfig sourceConfig); - public final Stream getRecords(final IOSupplier inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { + public final Stream getRecords(final IOSupplier inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - final StreamSpliterator spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, + final StreamSpliterator spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, sourceConfig); return StreamSupport.stream(spliterator, false).onClose(spliterator::close).skip(skipRecords); } @@ -55,20 +55,15 @@ public final Stream getRecords(final IOSupplier inputStreamIOSup * the source configuraiton. * @return a StreamSpliterator instance. */ - protected abstract StreamSpliterator createSpliterator(IOSupplier inputStreamIOSupplier, - String topic, int topicPartition, AbstractConfig sourceConfig); - - public abstract SchemaAndValue getValueData(T record, String topic, AbstractConfig sourceConfig); + protected abstract StreamSpliterator createSpliterator(IOSupplier inputStreamIOSupplier, String topic, + int topicPartition, AbstractConfig sourceConfig); public abstract SchemaAndValue getKeyData(Object cloudStorageKey, String topic, AbstractConfig sourceConfig); /** * A Spliterator that performs various checks on the opening/closing of the input stream. - * - * @param - * the type of item created by this Spliterator. */ - protected abstract static class StreamSpliterator implements Spliterator { + protected abstract static class StreamSpliterator implements Spliterator { /** * The input stream supplier. */ @@ -109,7 +104,7 @@ protected StreamSpliterator(final Logger logger, final IOSupplier i * the Consumer to call if record is created. * @return {@code true} if a record was processed, {@code false} otherwise. */ - abstract protected boolean doAdvance(Consumer action); + abstract protected boolean doAdvance(Consumer action); /** * Method to close additional inputs if needed. 
@@ -121,6 +116,7 @@ public final void close() { try { if (inputStream != null) { inputStream.close(); + inputStream = null; // NOPMD setting null to release resources closed = true; } } catch (IOException e) { @@ -143,15 +139,16 @@ public final void close() { abstract protected InputStream inputOpened(InputStream input) throws IOException; @Override - public final boolean tryAdvance(final Consumer action) { - boolean result = false; + public final boolean tryAdvance(final Consumer action) { if (closed) { - logger.error("Attempt to advance after closed"); + return false; } + boolean result = false; try { if (inputStream == null) { try { - inputStream = inputOpened(inputStreamIOSupplier.get()); + inputStream = inputStreamIOSupplier.get(); + inputOpened(inputStream); } catch (IOException e) { logger.error("Error trying to open inputStream: {}", e.getMessage(), e); close(); @@ -169,7 +166,7 @@ public final boolean tryAdvance(final Consumer action) { } @Override - public final Spliterator trySplit() { // NOPMD returning null is reqruied by API + public final Spliterator trySplit() { // NOPMD returning null is reqruied by API return null; } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java index 43a1b0ef..57460430 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java @@ -16,48 +16,46 @@ package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; -import java.util.HashMap; import java.util.Map; import org.apache.kafka.connect.json.JsonConverter; -import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; -import io.aiven.kafka.connect.common.config.SourceCommonConfig; - import io.confluent.connect.avro.AvroData; +/** + * A factory to create Transformers. + */ public final class TransformerFactory { - + /** The cache size for systems that read Avro data */ public static final int CACHE_SIZE = 100; private TransformerFactory() { // hidden } - public static Transformer getTransformer(final SourceCommonConfig sourceConfig) { - final InputFormat inputFormatEnum = new SchemaRegistryFragment(sourceConfig).getInputFormat(); - switch (inputFormatEnum) { + + /** + * Gets a configured Transformer. + * + * @param inputFormat + * The input format for the transformer. + * @return the Transformer for the specified input format. 
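+     *
+     *            <p>
+     *            A usage sketch; {@code inputStream} and {@code sourceConfig} are placeholders, and
+     *            {@code SourceCommonConfig#getTransformer()} delegates to this method:
+     *            </p>
+     *
+     *            <pre>{@code
+     *            Transformer transformer = TransformerFactory.getTransformer(InputFormat.JSONL);
+     *            Stream<SchemaAndValue> records = transformer.getRecords(() -> inputStream, "my-topic", 0, sourceConfig, 0L);
+     *            }</pre>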
+ */ + public static Transformer getTransformer(final InputFormat inputFormat) { + switch (inputFormat) { case AVRO : return new AvroTransformer(new AvroData(CACHE_SIZE)); case PARQUET : return new ParquetTransformer(new AvroData(CACHE_SIZE)); case JSONL : final JsonConverter jsonConverter = new JsonConverter(); - configureJsonConverter(jsonConverter); + jsonConverter.configure(Map.of(SCHEMAS_ENABLE, "false"), false); return new JsonTransformer(jsonConverter); case BYTES : return new ByteArrayTransformer(); default : - throw new IllegalArgumentException( - "Unknown input format in configuration: " + sourceConfig.getString(INPUT_FORMAT_KEY)); + throw new IllegalArgumentException("Unknown input format in configuration: " + inputFormat); } } - - private static void configureJsonConverter(final JsonConverter jsonConverter) { - final Map config = new HashMap<>(); - config.put(SCHEMAS_ENABLE, "false"); - jsonConverter.configure(config, false); - } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java new file mode 100644 index 00000000..9b3a581e --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java @@ -0,0 +1,145 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.awaitility.Awaitility.await; + +import java.time.Duration; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.lang3.time.StopWatch; +import org.junit.jupiter.api.Test; + +class AbstractSourceTaskTest { + + @Test + void timerTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + timer.start(); + await().atMost(Duration.ofSeconds(2)).until(timer::isExpired); + assertThat(timer.millisecondsRemaining()).isLessThan(0); + timer.stop(); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + } + + @Test + void timerSequenceTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + // stopped state does not allow stop + assertThatExceptionOfType(IllegalStateException.class).as("stop while not running") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.reset(); // verify that an exception is not thrown. + + // started state does not allow start + timer.start(); + assertThatExceptionOfType(IllegalStateException.class).as("start while running") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + timer.start(); // restart the timer. 
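+        // the timer is running again; after the stop() below, neither stop() nor start() is legal until reset()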
+ timer.stop(); + + // stopped state does not allow stop or start + assertThatExceptionOfType(IllegalStateException.class).as("stop after stop") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + assertThatExceptionOfType(IllegalStateException.class).as("start after stop") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + + // stopped + reset does not allow stop. + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (1)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.start(); + timer.reset(); + + // started + reset does not allow stop; + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (2)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + } + + @Test + void backoffTest() throws InterruptedException { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(timer.getBackoffConfig()); + final long estimatedDelay = backoff.estimatedDelay(); + assertThat(estimatedDelay).isLessThan(500); + + // execute delay without timer running. + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + backoff.delay(); + stopWatch.stop(); + assertThat(stopWatch.getTime()).as("Result without timer running") + .isBetween(estimatedDelay - backoff.getMaxJitter(), estimatedDelay + backoff.getMaxJitter()); + + timer.start(); + for (int i = 0; i < 9; i++) { + stopWatch.reset(); + timer.reset(); + timer.start(); + stopWatch.start(); + await().atMost(Duration.ofSeconds(2)).until(() -> { + backoff.delay(); + return backoff.estimatedDelay() == 0 || timer.isExpired(); + }); + stopWatch.stop(); + timer.stop(); + final int step = i; + if (!timer.isExpired()) { + assertThat(stopWatch.getTime()).as(() -> String.format("Result with timer running at step %s", step)) + .isBetween(Duration.ofSeconds(1).toMillis() - backoff.getMaxJitter(), + Duration.ofSeconds(1).toMillis() + backoff.getMaxJitter()); + } + } + } + + @Test + void backoffIncrementalTimeTest() throws InterruptedException { + final AtomicBoolean abortTrigger = new AtomicBoolean(); + // delay increases in powers of 2. 
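+        // with 1000 ms remaining, estimatedDelay() should step through 2, 4, 8, ... 512 and then cap at the full 1000 ms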
+ final long maxDelay = 1000; // not a power of 2 + final AbstractSourceTask.BackoffConfig config = new AbstractSourceTask.BackoffConfig() { + @Override + public AbstractSourceTask.SupplierOfLong getSupplierOfTimeRemaining() { + return () -> maxDelay; + } + + @Override + public AbstractSourceTask.AbortTrigger getAbortTrigger() { + return () -> abortTrigger.set(true); + } + }; + + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(config); + long expected = 2; + while (backoff.estimatedDelay() < maxDelay) { + assertThat(backoff.estimatedDelay()).isEqualTo(expected); + backoff.delay(); + expected *= 2; + } + assertThat(backoff.estimatedDelay()).isEqualTo(maxDelay); + assertThat(abortTrigger).isFalse(); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index 50e54a28..617dd290 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -32,6 +32,9 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.confluent.connect.avro.AvroData; @@ -75,7 +78,7 @@ void testConfigureValueConverter() { void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); - final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); final List recs = records.collect(Collectors.toList()); @@ -87,11 +90,17 @@ void testReadAvroRecords() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(25); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final List expected = new ArrayList<>(); + for (int i = 0; i < 25; i++) { + expected.add("Hello, Kafka Connect S3 Source! object " + i); + } + + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); - final List recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(25); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); } @Test @@ -99,14 +108,16 @@ void testReadAvroRecordsSkipFew() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final List expected = new ArrayList<>(); + for (int i = 5; i < 20; i++) { + expected.add("Hello, Kafka Connect S3 Source! object " + i); + } + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 5); - final List recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(15); - // get first rec - assertThat(((GenericRecord) recs.get(0)).get("message").toString()) - .isEqualTo("Hello, Kafka Connect S3 Source! 
object 5"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); } @Test @@ -114,11 +125,10 @@ void testReadAvroRecordsSkipMoreRecordsThanExist() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 25); - final List recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(0); + assertThat(records).isEmpty(); } static ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index ee6b7600..80820e13 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -24,6 +24,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import org.apache.commons.io.function.IOSupplier; @@ -53,12 +55,12 @@ void testGetRecordsSingleChunk() { final InputStream inputStream = new ByteArrayInputStream(data); final IOSupplier inputStreamIOSupplier = () -> inputStream; - final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); - final List recs = records.collect(Collectors.toList()); + final List recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(1); - assertThat((byte[]) recs.get(0)).isEqualTo(data); + assertThat(recs.get(0).value()).isEqualTo(data); } @Test @@ -67,18 +69,9 @@ void testGetRecordsEmptyInputStream() { final IOSupplier inputStreamIOSupplier = () -> inputStream; - final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); assertThat(records).hasSize(0); } - - @Test - void testGetValueBytes() { - final byte[] record = { 1, 2, 3 }; - final byte[] result = (byte[]) byteArrayTransformer.getValueData(record, TEST_TOPIC, sourceCommonConfig) - .value(); - - assertThat(result).containsExactlyInAnyOrder(record); - } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index a38a2bc8..e482fd61 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -26,13 +25,13 @@ import 
java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.json.JsonConverter; import io.aiven.kafka.connect.common.config.SourceCommonConfig; @@ -77,31 +76,38 @@ void destroy() { @Test void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( - "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final IOSupplier inputStreamIOSupplier = () -> validJsonInputStream; - final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); + + final List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("value" + i); + } + + final Stream records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, sourceCommonConfig, 0); - assertThat(jsonNodes).hasSize(1); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); } @Test void testHandleValueDataWithValidJsonSkipFew() { final InputStream validJsonInputStream = new ByteArrayInputStream( getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); - final IOSupplier inputStreamIOSupplier = () -> validJsonInputStream; - final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + + final List expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("value" + i); + } + + final Stream records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, sourceCommonConfig, 25L); - final List recs = jsonNodes.collect(Collectors.toList()); - assertThat(recs).hasSize(75); - assertThat(recs).extracting(record -> ((Map) jsonTransformer.getValueData(record, "", null).value()).get("key")) - .doesNotContain("value1") - .doesNotContain("value2") - .doesNotContain("value25") - .contains("value26") - .contains("value27") - .contains("value100"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); + } @Test @@ -110,35 +116,17 @@ void testHandleValueDataWithInvalidJson() { "invalid-json".getBytes(StandardCharsets.UTF_8)); final IOSupplier inputStreamIOSupplier = () -> invalidJsonInputStream; - final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); - assertThatThrownBy(() -> jsonTransformer.getValueData(jsonNodes.findAny().get(), "", null)) - .isInstanceOf(DataException.class) - .hasMessage("Converting byte[] to Kafka Connect data failed due to serialization error: "); - } + assertThat(jsonNodes).isEmpty(); - @Test - void testSerializeJsonDataValid() throws IOException { - final InputStream validJsonInputStream = new ByteArrayInputStream( - "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final IOSupplier inputStreamIOSupplier = () -> validJsonInputStream; - final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - sourceCommonConfig, 0); - final Object serializedData = jsonTransformer - .getValueData( - jsonNodes.findFirst().orElseThrow(() -> new 
AssertionError("No records found in stream!")), - TESTTOPIC, sourceCommonConfig) - .value(); - - // Assert: Verify the serialized data - assertThat(serializedData).isInstanceOf(Map.class).extracting("key").isEqualTo("value"); } @Test void testGetRecordsWithIOException() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); - final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } @@ -146,14 +134,14 @@ void testGetRecordsWithIOException() throws IOException { @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); - final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } static String getJsonRecs(final int recordCount) { final StringBuilder jsonRecords = new StringBuilder(); - for (int i = 1; i <= recordCount; i++) { + for (int i = 0; i < recordCount; i++) { jsonRecords.append(String.format("{\"key\":\"value%d\"}", i)); if (i < recordCount) { jsonRecords.append("\n"); // NOPMD AppendCharacterWithChar diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index 154baf45..2f7a405f 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -29,14 +29,17 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.confluent.connect.avro.AvroData; -import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; @@ -63,7 +66,7 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + final Stream recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L); assertThat(recs).isEmpty(); @@ -78,15 +81,17 @@ void testGetRecordsWithValidData() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - - final List records = parquetTransformer + final List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("name" + i); + } + final List records = parquetTransformer .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L) .collect(Collectors.toList()); - assertThat(records).hasSize(100); - assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) - .contains("name1") - .contains("name2"); + 
assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("name")) + .containsExactlyElementsOf(expected); } @Test @@ -99,18 +104,18 @@ void testGetRecordsWithValidDataSkipFew() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - final List records = parquetTransformer + final List expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("name" + i); + } + + final List records = parquetTransformer .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 25L) .collect(Collectors.toList()); - assertThat(records).hasSize(75); - assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) - .doesNotContain("name1") - .doesNotContain("name2") - .doesNotContain("name24") - .contains("name25") - .contains("name26") - .contains("name99"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("name")) + .containsExactlyElementsOf(expected); } @Test @@ -124,7 +129,7 @@ void testGetRecordsWithInvalidData() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, + final Stream records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L); assertThat(records).isEmpty(); } @@ -150,7 +155,7 @@ void testIOExceptionCreatingTempFile() { .thenThrow(new IOException("Test IOException for temp file")); final IOSupplier inputStreamSupplier = mock(IOSupplier.class); - final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, null, 0L); assertThat(resultStream).isEmpty(); @@ -163,7 +168,7 @@ void testIOExceptionDuringDataCopy() throws IOException { when(inputStreamMock.read(any(byte[].class))).thenThrow(new IOException("Test IOException during copy")); final IOSupplier inputStreamSupplier = () -> inputStreamMock; - final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, null, 0L); assertThat(resultStream).isEmpty(); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java index f61dd942..73b27b01 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java @@ -17,6 +17,8 @@ package io.aiven.kafka.connect.common.source.input; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -32,11 +34,10 @@ import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.data.SchemaAndValue; import io.aiven.kafka.connect.common.config.CommonConfig; -import io.confluent.connect.avro.AvroData; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ 
-49,8 +50,8 @@ class TransformerStreamingTest { @ParameterizedTest @MethodSource("testData") - void verifyExceptionDuringIOOpen(final Transformer transformer, final byte[] testData, - final AbstractConfig config, final int expectedCount) throws IOException { + void verifyExceptionDuringIOOpen(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { final IOSupplier ioSupplier = mock(IOSupplier.class); when(ioSupplier.get()).thenThrow(new IOException("Test IOException during initialization")); final Stream objStream = transformer.getRecords(ioSupplier, "topic", 1, config, 0); @@ -59,7 +60,28 @@ void verifyExceptionDuringIOOpen(final Transformer transformer, final byte[] @ParameterizedTest @MethodSource("testData") - void verifyCloseCalledAtEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, + void verifyExceptionDuringRead(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + try (InputStream inputStream = mock(InputStream.class)) { + when(inputStream.read()).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(anyInt())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.readAllBytes()).thenThrow(new IOException("Test IOException during read")); + try (CloseTrackingStream stream = new CloseTrackingStream(inputStream)) { + final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + assertThat(objStream).isEmpty(); + assertThat(stream.closeCount).isGreaterThan(0); + } + } + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, final int expectedCount) throws IOException { final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); @@ -70,11 +92,11 @@ void verifyCloseCalledAtEnd(final Transformer transformer, final byte[] testD @ParameterizedTest @MethodSource("testData") - void verifyCloseCalledAtIteratorEnd(final Transformer transformer, final byte[] testData, + void verifyCloseCalledAtIteratorEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, final int expectedCount) throws IOException { final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); - final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); - final Iterator iter = objStream.iterator(); + final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final Iterator iter = objStream.iterator(); long count = 0L; while (iter.hasNext()) { count += 1; @@ -86,19 +108,19 @@ void verifyCloseCalledAtIteratorEnd(final Transformer transformer, final byte static Stream testData() throws IOException { final List lst = new ArrayList<>(); - final AvroData avroData = new AvroData(100); - lst.add(Arguments.of(new AvroTransformer(avroData), 
AvroTransformerTest.generateMockAvroData(100).toByteArray(), + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.AVRO), + AvroTransformerTest.generateMockAvroData(100).toByteArray(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); - lst.add(Arguments.of(new ByteArrayTransformer(), "Hello World".getBytes(StandardCharsets.UTF_8), - new CommonConfig(new ConfigDef(), new HashMap<>()) { + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.BYTES), + "Hello World".getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 1)); - lst.add(Arguments.of(new JsonTransformer(new JsonConverter()), + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.JSONL), JsonTransformerTest.getJsonRecs(100).getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); - lst.add(Arguments.of(new ParquetTransformer(avroData), ParquetTransformerTest.generateMockParquetData(), - new CommonConfig(new ConfigDef(), new HashMap<>()) { + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.PARQUET), + ParquetTransformerTest.generateMockParquetData(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); return lst.stream(); } diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 20d5a3b8..db1b4a7d 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -65,6 +65,7 @@ dependencies { compileOnly(apache.kafka.connect.api) compileOnly(apache.kafka.connect.runtime) + implementation(apache.commons.collection4) implementation(project(":commons")) implementation(project(":s3-commons")) implementation("software.amazon.awssdk:s3:$amazonS3Version") diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java new file mode 100644 index 00000000..42d10aad --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java @@ -0,0 +1,310 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.localstack.LocalStackContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.S3Object; + +@Testcontainers +class AwsIntegrationTest implements IntegrationBase { + + private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-AWS-test-"; + + @Container + public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + + private static String s3Prefix; + + private S3Client s3Client; + private String s3Endpoint; + + private BucketAccessor testBucketAccessor; + + @Override + public String getS3Prefix() { + return s3Prefix; + } + + @Override + public S3Client getS3Client() { + return s3Client; + } + + @BeforeAll + static void setUpAll() { + s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; + } + + @BeforeEach + void setupAWS() { + s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Endpoint = 
LOCALSTACK.getEndpoint().toString();
+ testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME);
+ testBucketAccessor.createBucket();
+ }
+
+ @AfterEach
+ void tearDownAWS() {
+ testBucketAccessor.removeBucket();
+ s3Client.close();
+ }
+
+ private Map getConfig(final String topics, final int maxTasks) {
+ final Map config = new HashMap<>();
+ config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID);
+ config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY);
+ config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint);
+ config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME);
+ config.put(AWS_S3_PREFIX_CONFIG, getS3Prefix());
+ config.put(TARGET_TOPIC_PARTITIONS, "0,1");
+ config.put(TARGET_TOPICS, topics);
+ config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter");
+ config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter");
+ config.put("tasks.max", String.valueOf(maxTasks));
+ return config;
+ }
+
+ /**
+ * Tests the source record iterator directly against the S3 bucket (LocalStack) using the BYTES input format.
+ *
+ * @param testInfo
+ * The test information, used to derive the topic name.
+ */
+ @Test
+ void sourceRecordIteratorBytesTest(final TestInfo testInfo) {
+ final var topicName = IntegrationBase.topicName(testInfo);
+ final Map configData = getConfig(topicName, 1);
+
+ configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue());
+
+ final String testData1 = "Hello, Kafka Connect S3 Source! object 1";
+ final String testData2 = "Hello, Kafka Connect S3 Source! object 2";
+
+ final List offsetKeys = new ArrayList<>();
+ final List expectedKeys = new ArrayList<>();
+ // write 4 objects to s3: 2 for each of the partitions 00000 and 00001
+ expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000"));
+ expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000"));
+ expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"));
+ expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001"));
+
+ // we don't expect the empty object, written below, to appear in the results.
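The writeToS3 calls above go through the IntegrationBase helper added later in this patch, which uploads raw bytes with the AWS SDK v2 and returns the offset-style key. A minimal standalone sketch of that kind of upload; the bucket name and key layout are written out literally for illustration and are not taken verbatim from the patch:

import java.nio.charset.StandardCharsets;

import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;

final class PutObjectSketch {
    // Uploads one test payload under a topic-partition-timestamp key; the bucket name and
    // method are illustrative only, not part of the patch.
    static String putTestObject(final S3Client s3Client, final String topic, final String partitionId,
            final String payload) {
        final String objectKey = topic + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt";
        final PutObjectRequest request = PutObjectRequest.builder()
                .bucket("test-bucket0")
                .key(objectKey)
                .build();
        s3Client.putObject(request, RequestBody.fromBytes(payload.getBytes(StandardCharsets.UTF_8)));
        return objectKey;
    }
}

The real helper additionally prepends the configured S3 prefix to the key and returns it prefixed with the OBJECT_KEY and SEPARATOR constants, as shown in the IntegrationBase changes further down.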
+ offsetKeys.addAll(expectedKeys); + offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + + final Iterator sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient); + + final HashSet seenKeys = new HashSet<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(expectedKeys); + } + + @Test + void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { + final var topicName = IntegrationBase.topicName(testInfo); + + final Map configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); + configData.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); + configData.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + final int numOfRecsFactor = 5000; + + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + + final Set offsetKeys = new HashSet<>(); + + offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); + offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); + + offsetKeys.add(writeToS3(topicName, outputStream3, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream4, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream5, "00002")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + 
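The stubbing just above makes the OffsetStorageReader return an empty map, so the iterator treats every object as unread. To exercise the resume path the same mocks can hand back previously stored offsets; the sketch below shows the shape of such a stub, with illustrative partition and offset keys (the real ones are built by ConnectUtils.getPartitionMap and OffsetManager.getObjectMapKey elsewhere in this patch):

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.connect.source.SourceTaskContext;
import org.apache.kafka.connect.storage.OffsetStorageReader;

final class ResumeOffsetsSketch {
    // Illustrative partition and offset shapes; key names and the separator are placeholders.
    static SourceTaskContext contextWithStoredOffsets() {
        final Map<String, Object> partition = new HashMap<>();
        partition.put("bucket", "test-bucket0");
        partition.put("topic", "test-topic");
        partition.put("topicPartition", 0);

        final Map<String, Object> offset = new HashMap<>();
        // real key is OBJECT_KEY + SEPARATOR + <object key>; 42 records already emitted
        offset.put("object_key" + "_" + "test-topic-00000-1700000000000.txt", 42L);

        final Map<Map<String, Object>, Map<String, Object>> stored = new HashMap<>();
        stored.put(partition, offset);

        final OffsetStorageReader reader = mock(OffsetStorageReader.class);
        when(reader.offsets(any())).thenReturn(stored);

        final SourceTaskContext context = mock(SourceTaskContext.class);
        when(context.offsetStorageReader()).thenReturn(reader);
        return context;
    }
}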
final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + + final Iterator sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient); + + final HashSet seenKeys = new HashSet<>(); + final Map> seenRecords = new HashMap<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + seenRecords.compute(key, (k, v) -> { + final List lst = v == null ? new ArrayList<>() : v; // NOPMD new object inside loop + lst.add(s3SourceRecord.getRecordNumber()); + return lst; + }); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(offsetKeys); + assertThat(seenRecords).hasSize(5); + final List expected = new ArrayList<>(); + for (long l = 0; l < numOfRecsFactor; l++) { + expected.add(l + 1); + } + for (final String key : offsetKeys) { + final List seen = seenRecords.get(key); + assertThat(seen).as("Count for " + key).containsExactlyInAnyOrderElementsOf(expected); + } + } + + @Test + void verifyIteratorRehydration(final TestInfo testInfo) { + // create 2 files. + final var topicName = IntegrationBase.topicName(testInfo); + final Map configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + final String testData3 = "Hello, Kafka Connect S3 Source! object 3"; + + final List expectedKeys = new ArrayList<>(); + + final List actualKeys = new ArrayList<>(); + + // write 2 objects to s3 + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + + assertThat(testBucketAccessor.listObjects()).hasSize(2); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + final Iterator iter = sourceClient.getS3ObjectIterator(null); + + assertThat(iter).hasNext(); + S3Object object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + // write 3rd object to s3 + expectedKeys.add(writeToS3(topicName, testData3.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + assertThat(testBucketAccessor.listObjects()).hasSize(3); + + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 442993bf..a8b91a19 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ 
b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -16,9 +16,12 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.net.ServerSocket; @@ -52,20 +55,90 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.Container; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.utility.DockerImageName; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; +@SuppressWarnings("PMD.ExcessiveImports") public interface IntegrationBase { String PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; String S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + String TEST_BUCKET_NAME = "test-bucket0"; + String S3_ACCESS_KEY_ID = "test-key-id0"; + String VALUE_CONVERTER_KEY = "value.converter"; + String S3_SECRET_ACCESS_KEY = "test_secret_key0"; + + static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final int noOfAvroRecs, + final Schema schema) throws IOException { + final DatumWriter datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { + dataFileWriter.create(schema, outputStream); + for (int i = messageId; i < messageId + noOfAvroRecs; i++) { + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); + dataFileWriter.append(avroRecord); + } + + dataFileWriter.flush(); + return outputStream.toByteArray(); + } + } + + S3Client getS3Client(); + + String getS3Prefix(); + + /** + * Write file to s3 with the specified key and data. + * + * @param objectKey + * the key. + * @param testDataBytes + * the data. + */ + default void writeToS3WithKey(final String objectKey, final byte[] testDataBytes) { + final PutObjectRequest request = PutObjectRequest.builder() + .bucket(IntegrationTest.TEST_BUCKET_NAME) + .key(objectKey) + .build(); + getS3Client().putObject(request, RequestBody.fromBytes(testDataBytes)); + + } + + /** + * Writes to S3 using a key of the form {@code [prefix]topicName-partitionId-systemTime.txt}. + * + * @param topicName + * the topic name to use + * @param testDataBytes + * the data. 
+ * @param partitionId + * the partition id. + * @return the key prefixed by {@link S3SourceTask#OBJECT_KEY} and + * {@link io.aiven.kafka.connect.s3.source.utils.OffsetManager#SEPARATOR} + */ + default String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { + final String objectKey = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName + + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + writeToS3WithKey(objectKey, testDataBytes); + return OBJECT_KEY + SEPARATOR + objectKey; + } default AdminClient newAdminClient(final String bootstrapServers) { final Properties adminClientConfig = new Properties(); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 5a573395..083d8627 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -27,13 +27,10 @@ import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; -import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -63,23 +60,17 @@ import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; -import org.junit.platform.commons.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.PutObjectRequest; @@ -92,13 +83,6 @@ final class IntegrationTest implements IntegrationBase { private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 500; - private static final String S3_ACCESS_KEY_ID = "test-key-id0"; - private static final String S3_SECRET_ACCESS_KEY = "test_secret_key0"; - - private static final String VALUE_CONVERTER_KEY = "value.converter"; - - private static final String TEST_BUCKET_NAME = "test-bucket0"; - private static String s3Endpoint; private static String s3Prefix; private static BucketAccessor testBucketAccessor; @@ -112,8 +96,19 @@ final class IntegrationTest implements IntegrationBase { private 
static S3Client s3Client; - @BeforeAll - static void setUpAll() throws IOException, InterruptedException { + @Override + public S3Client getS3Client() { + return s3Client; + } + + @Override + public String getS3Prefix() { + return s3Prefix; + } + + public + + @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; s3Client = IntegrationBase.createS3Client(LOCALSTACK); @@ -159,7 +154,7 @@ void tearDown() { @Test void bytesTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); - final Map connectorConfig = getConfig(CONNECTOR_NAME, topicName, 2); + final Map connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -208,15 +203,15 @@ void avroTest(final TestInfo testInfo) throws IOException { final int numOfRecsFactor = 5000; - final byte[] outputStream1 = generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); - final byte[] outputStream2 = generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream3 = generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream4 = generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream5 = generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, numOfRecsFactor, - schema); + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); final Set offsetKeys = new HashSet<>(); @@ -254,8 +249,8 @@ void parquetTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final String partition = "00000"; - final String fileName = addPrefixOrDefault("") + topicName + "-" + partition + "-" + System.currentTimeMillis() - + ".txt"; + final String fileName = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName + "-" + + partition + "-" + System.currentTimeMillis() + ".txt"; final String name = "testuser"; final Map connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET); @@ -321,36 +316,6 @@ void jsonTest(final TestInfo testInfo) { verifyOffsetPositions(Map.of(offsetKey, 500), connectRunner.getBootstrapServers()); } - private static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final int noOfAvroRecs, - final Schema schema) throws IOException { - final DatumWriter datumWriter = new GenericDatumWriter<>(schema); - try (DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { - dataFileWriter.create(schema, outputStream); - for (int i = messageId; i < messageId + noOfAvroRecs; i++) { - final GenericRecord 
avroRecord = new GenericData.Record(schema); // NOPMD - avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); - avroRecord.put("id", i); - dataFileWriter.append(avroRecord); - } - - dataFileWriter.flush(); - return outputStream.toByteArray(); - } - } - - private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { - final String objectKey = addPrefixOrDefault("") + topicName + "-" + partitionId + "-" - + System.currentTimeMillis() + ".txt"; - final PutObjectRequest request = PutObjectRequest.builder().bucket(TEST_BUCKET_NAME).key(objectKey).build(); - s3Client.putObject(request, RequestBody.fromBytes(testDataBytes)); - return OBJECT_KEY + SEPARATOR + objectKey; - } - - private static String addPrefixOrDefault(final String defaultValue) { - return StringUtils.isNotBlank(s3Prefix) ? s3Prefix : defaultValue; - } - private Map getConfig(final String connectorName, final String topics, final int maxTasks) { final Map config = new HashMap<>(basicS3ConnectorConfig()); config.put("name", connectorName); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 320fa19c..1bfc5558 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,22 +16,17 @@ package io.aiven.kafka.connect.s3.source; -import static io.aiven.kafka.connect.common.config.SourceConfigFragment.MAX_POLL_RECORDS; - -import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.Transformer; -import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -40,18 +35,17 @@ import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.aiven.kafka.connect.s3.source.utils.Version; +import org.apache.commons.collections4.IteratorUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.exception.SdkException; -import software.amazon.awssdk.services.s3.S3Client; /** * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. 
*/ -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports" }) -public class S3SourceTask extends SourceTask { - +public class S3SourceTask extends AbstractSourceTask { + /** The logger to write to */ private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); public static final String BUCKET = "bucket"; @@ -60,29 +54,23 @@ public class S3SourceTask extends SourceTask { public static final String OBJECT_KEY = "object_key"; public static final String PARTITION = "topicPartition"; - private static final long S_3_POLL_INTERVAL_MS = 10_000L; - private static final long ERROR_BACKOFF = 1000L; - - private S3SourceConfig s3SourceConfig; - private S3Client s3Client; - - private Iterator sourceRecordIterator; + /** An iterator or S3SourceRecords */ + private Iterator s3SourceRecordIterator; + /** + * The transformer that we are using TODO move this to AbstractSourceTask + */ private Transformer transformer; + /** The AWS Source client */ - private boolean taskInitialized; - - private final AtomicBoolean connectorStopped = new AtomicBoolean(); - - private final Object pollLock = new Object(); private AWSV2SourceClient awsv2SourceClient; + /** The list of failed object keys */ private final Set failedObjectKeys = new HashSet<>(); - private final Set inProcessObjectKeys = new HashSet<>(); - + /** The offset manager this task uses */ private OffsetManager offsetManager; + private S3SourceConfig s3SourceConfig; - @SuppressWarnings("PMD.UnnecessaryConstructor") public S3SourceTask() { - super(); + super(LOGGER); } @Override @@ -91,100 +79,98 @@ public String version() { } @Override - public void start(final Map props) { + protected Iterator getIterator(BackoffConfig config) { // NOPMD cognitive complexity + final Iterator inner = new Iterator<>() { + /** + * The backoff for Amazon retryable exceptions + */ + final Backoff backoff = new Backoff(config); + + @Override + public boolean hasNext() { + while (stillPolling()) { + try { + return s3SourceRecordIterator.hasNext(); + } catch (SdkException exception) { + if (exception.retryable()) { + LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", + exception); + try { + backoff.delay(); + } catch (InterruptedException e) { + LOGGER.warn("Backoff delay was interrupted. Throwing original exception: {}", + exception.getMessage()); + throw exception; + } + } else { + // TODO validate that the iterator does not lose an S3Object. Add test to + // S3ObjectIterator. 
+ throw exception; + } + } + } + return false; + } + + @Override + public SourceRecord next() { + final S3SourceRecord s3SourceRecord = s3SourceRecordIterator.next(); + offsetManager.updateAndReturnCurrentOffsets(s3SourceRecord.getPartitionMap(), + s3SourceRecord.getObjectKey(), s3SourceRecord.getRecordNumber()); + return RecordProcessor.createSourceRecord(s3SourceRecord, s3SourceConfig, awsv2SourceClient, + offsetManager); + } + }; + return IteratorUtils.filteredIterator(inner, Objects::nonNull); + } + + @Override + protected SourceCommonConfig configure(final Map props) { LOGGER.info("S3 Source task started."); - s3SourceConfig = new S3SourceConfig(props); - this.transformer = TransformerFactory.getTransformer(s3SourceConfig); + this.s3SourceConfig = new S3SourceConfig(props); + this.transformer = s3SourceConfig.getTransformer(); offsetManager = new OffsetManager(context, s3SourceConfig); awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig, failedObjectKeys); - prepareReaderFromOffsetStorageReader(); - this.taskInitialized = true; - } - - private void prepareReaderFromOffsetStorageReader() { - sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, - awsv2SourceClient); + setS3SourceRecordIterator( + new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient)); + return s3SourceConfig; } @Override - public List poll() throws InterruptedException { - LOGGER.info("Polling for new records..."); - synchronized (pollLock) { - final List results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); - - if (connectorStopped.get()) { - LOGGER.info("Connector has been stopped. Returning empty result list."); - return results; - } - - while (!connectorStopped.get()) { - try { - extractSourceRecords(results); - LOGGER.info("Number of records extracted and sent: {}", results.size()); - return results; - } catch (SdkException exception) { - if (exception.retryable()) { - LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", - exception); - pollLock.wait(ERROR_BACKOFF); - - prepareReaderFromOffsetStorageReader(); - } else { - LOGGER.warn("Non-retryable AmazonS3Exception occurred. Stopping polling.", exception); - return null; // NOPMD - } - } catch (DataException exception) { - LOGGER.warn("DataException occurred during polling. No retries will be attempted.", exception); - } catch (final Throwable t) { // NOPMD - LOGGER.error("Unexpected error encountered. Closing resources and stopping task.", t); - closeResources(); - throw t; - } - } - return results; - } + public void commit() { + LOGGER.info("Committed all records through last poll()"); } - private List extractSourceRecords(final List results) throws InterruptedException { - waitForObjects(); - if (connectorStopped.get()) { - return results; + @Override + public void commitRecord(final SourceRecord record) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Committed individual record {} committed", (Map) record.sourceOffset()); } - return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, connectorStopped, - awsv2SourceClient, offsetManager); } - private void waitForObjects() throws InterruptedException { - while (!sourceRecordIterator.hasNext() && !connectorStopped.get()) { - LOGGER.debug("Blocking until new S3 files are available."); - Thread.sleep(S_3_POLL_INTERVAL_MS); - prepareReaderFromOffsetStorageReader(); - } + /** + * Set the S3 source record iterator that this task is using. 
Protected to be overridden in testing implementation. + * + * @param iterator + * The S3SourceRecord iterator to use. + */ + protected void setS3SourceRecordIterator(final Iterator iterator) { + s3SourceRecordIterator = iterator; } @Override - public void stop() { - this.taskInitialized = false; - this.connectorStopped.set(true); - synchronized (pollLock) { - closeResources(); - } - } - - private void closeResources() { + protected void closeResources() { awsv2SourceClient.shutdown(); } // below for visibility in tests + + /** + * Get the transformer that we are using. + * + * @return the transformer that we are using. + */ public Transformer getTransformer() { return transformer; } - - public boolean isTaskInitialized() { - return taskInitialized; - } - - public AtomicBoolean getConnectorStopped() { - return new AtomicBoolean(connectorStopped.get()); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java index 44e28dfa..ed460a50 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -28,7 +28,9 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.apache.commons.io.function.IOSupplier; -import org.codehaus.plexus.util.StringUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.ResponseBytes; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.GetObjectRequest; @@ -42,6 +44,7 @@ */ public class AWSV2SourceClient { + private static final Logger LOGGER = LoggerFactory.getLogger(AWSV2SourceClient.class); public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final S3Client s3Client; @@ -50,6 +53,9 @@ public class AWSV2SourceClient { private Predicate filterPredicate = s3Object -> s3Object.size() > 0; private final Set failedObjectKeys; + private final int taskId; + private final int maxTasks; + /** * @param s3SourceConfig * configuration for Source connector @@ -57,11 +63,7 @@ public class AWSV2SourceClient { * all objectKeys which have already been tried but have been unable to process. */ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set failedObjectKeys) { - this.s3SourceConfig = s3SourceConfig; - final S3ClientFactory s3ClientFactory = new S3ClientFactory(); - this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); - this.bucketName = s3SourceConfig.getAwsS3BucketName(); - this.failedObjectKeys = new HashSet<>(failedObjectKeys); + this(new S3ClientFactory().createAmazonS3Client(s3SourceConfig), s3SourceConfig, failedObjectKeys); } /** @@ -80,42 +82,96 @@ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set this.s3Client = s3Client; this.bucketName = s3SourceConfig.getAwsS3BucketName(); this.failedObjectKeys = new HashSet<>(failedObjectKeys); + + // TODO the code below should be configured in some sort of taks assignement method/process/call. 
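The TODO above refers to the tasks.max / task.id parsing that follows it; combined with assignObjectToTask further down in this file, it spreads object keys across tasks by hashing. A self-contained sketch of that distribution rule, with illustrative names:

final class TaskAssignmentSketch {
    /**
     * Mirrors the patch's assignObjectToTask check: an object belongs to this task when the
     * non-negative hash of its key, modulo the task count, equals this task's id.
     */
    static boolean ownedByTask(final String objectKey, final int taskId, final int maxTasks) {
        return Math.floorMod(objectKey.hashCode(), maxTasks) == taskId;
    }

    public static void main(final String[] args) {
        // With 3 tasks, every key lands on exactly one of task 0, 1 or 2.
        final String key = "test-topic-00001-1700000000000.txt";
        for (int task = 0; task < 3; task++) {
            System.out.printf("task %d owns %s: %b%n", task, key, ownedByTask(key, task, 3));
        }
    }
}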
+ int maxTasks; + try { + final Object value = s3SourceConfig.originals().get("tasks.max"); + if (value == null) { + LOGGER.info("Setting tasks.max to 1"); + maxTasks = 1; + } else { + maxTasks = Integer.parseInt(value.toString()); + } + } catch (NumberFormatException e) { // NOPMD catch null pointer + LOGGER.warn("Invalid tasks.max: {}", e.getMessage()); + LOGGER.info("Setting tasks.max to 1"); + maxTasks = 1; + } + this.maxTasks = maxTasks; + int taskId; + try { + final Object value = s3SourceConfig.originals().get("task.id"); + if (value == null) { + LOGGER.info("Setting task.id to 0"); + taskId = 0; + } else { + taskId = Integer.parseInt(value.toString()) % maxTasks; + } + } catch (NumberFormatException e) { // NOPMD catch null pointer + LOGGER.warn("Invalid task.id: {}", e.getMessage()); + LOGGER.info("Setting task.id to 0"); + taskId = 0; + } + this.taskId = taskId; } - public Iterator getListOfObjectKeys(final String startToken) { + /** + * Creates a stream from which we will create an iterator. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return a Stream of S3Objects for the current state of the S3 storage. + */ + private Stream getS3ObjectStream(final String startToken) { final ListObjectsV2Request request = ListObjectsV2Request.builder() .bucket(bucketName) .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) - .prefix(optionalKey(s3SourceConfig.getAwsS3Prefix())) - .startAfter(optionalKey(startToken)) + .prefix(StringUtils.defaultIfBlank(s3SourceConfig.getAwsS3Prefix(), null)) + .startAfter(StringUtils.defaultIfBlank(startToken, null)) .build(); - final Stream s3ObjectKeyStream = Stream - .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { - // This is called every time next() is called on the iterator. - if (response.isTruncated()) { - return s3Client.listObjectsV2(ListObjectsV2Request.builder() - .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) - .continuationToken(response.nextContinuationToken()) - .build()); - } else { - return null; - } - - }) + return Stream.iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { + // This is called every time next() is called on the iterator. + if (response.isTruncated()) { + return s3Client.listObjectsV2(ListObjectsV2Request.builder() + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .continuationToken(response.nextContinuationToken()) + .build()); + } else { + return null; + } + + }) .flatMap(response -> response.contents() .stream() .filter(filterPredicate) .filter(objectSummary -> assignObjectToTask(objectSummary.key())) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))) - .map(S3Object::key); - return s3ObjectKeyStream.iterator(); + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))); } - private String optionalKey(final String key) { - if (StringUtils.isNotBlank(key)) { - return key; - } - return null; + + /** + * Creates an S3Object iterator that will return the objects from the current objects in S3 storage and then try to + * refresh on every {@code hasNext()} that returns false. This should pick up new files as they are dropped on the + * file system. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the S3Objects. 
+ */ + public Iterator getS3ObjectIterator(final String startToken) { + return new S3ObjectIterator(startToken); + } + + /** + * Gets an iterator of keys from the current S3 storage. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the keys of the current S3Objects. + */ + public Iterator getListOfObjectKeys(final String startToken) { + return getS3ObjectStream(startToken).map(S3Object::key).iterator(); } public IOSupplier getObject(final String objectKey) { @@ -133,8 +189,6 @@ public void setFilterPredicate(final Predicate predicate) { } private boolean assignObjectToTask(final String objectKey) { - final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); - final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); return taskAssignment == taskId; } @@ -143,4 +197,39 @@ public void shutdown() { s3Client.close(); } + /** + * An iterator that reads from + */ + public class S3ObjectIterator implements Iterator { + + /** The current iterator. */ + private Iterator inner; + /** The last object key that was seen. */ + private String lastSeenObjectKey; + + private S3ObjectIterator(final String initialKey) { + lastSeenObjectKey = initialKey; + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + @Override + public boolean hasNext() { + if (!inner.hasNext()) { + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + return inner.hasNext(); + } + + @Override + public S3Object next() { + final S3Object result = inner.next(); + lastSeenObjectKey = result.key(); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java index f401c4e1..6c60bb8e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -28,7 +28,7 @@ final public class ConnectUtils { private ConnectUtils() { // hidden } - static Map getPartitionMap(final String topicName, final Integer defaultPartitionId, + public static Map getPartitionMap(final String topicName, final Integer defaultPartitionId, final String bucketName) { final Map partitionMap = new HashMap<>(); partitionMap.put(BUCKET, bucketName); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 1b52d8d8..95bc4053 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Set; @@ -81,7 +82,17 @@ public long incrementAndUpdateOffsetMap(final Map partitionMap, return startOffset; } - public String getObjectMapKey(final String currentObjectKey) { + public Map updateAndReturnCurrentOffsets(final Map partitionMap, + final String currentObjectKey, final long offset) { + 
final Map offsetMap = offsets.compute(partitionMap, (k, v) -> { + final Map map = v == null ? new Hashtable<>() : v; + map.put(getObjectMapKey(currentObjectKey), offset); + return map; + }); + return new HashMap<>(offsetMap); + } + + public static String getObjectMapKey(final String currentObjectKey) { return OBJECT_KEY + SEPARATOR + currentObjectKey; } @@ -92,29 +103,6 @@ public long recordsProcessedForObjectKey(final Map partitionMap, return 0L; } - public void createNewOffsetMap(final Map partitionMap, final String objectKey, - final long offsetId) { - final Map offsetMap = getOffsetValueMap(objectKey, offsetId); - offsets.put(partitionMap, offsetMap); - } - - public Map getOffsetValueMap(final String currentObjectKey, final long offsetId) { - final Map offsetMap = new HashMap<>(); - offsetMap.put(getObjectMapKey(currentObjectKey), offsetId); - - return offsetMap; - } - - void updateCurrentOffsets(final Map partitionMap, final Map offsetValueMap) { - if (offsets.containsKey(partitionMap)) { - final Map offsetMap = new HashMap<>(offsets.get(partitionMap)); - offsetMap.putAll(offsetValueMap); - offsets.put(partitionMap, offsetMap); - } else { - offsets.put(partitionMap, offsetValueMap); - } - } - private static Set parsePartitions(final S3SourceConfig s3SourceConfig) { final String partitionString = s3SourceConfig.getTargetTopicPartitions(); return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index bdf26533..e945c256 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -16,10 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; - import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; @@ -35,33 +31,13 @@ public final class RecordProcessor { private static final Logger LOGGER = LoggerFactory.getLogger(RecordProcessor.class); private RecordProcessor() { - - } - - public static List processRecords(final Iterator sourceRecordIterator, - final List results, final S3SourceConfig s3SourceConfig, final AtomicBoolean connectorStopped, - final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { - - final int maxPollRecords = s3SourceConfig.getMaxPollRecords(); - - for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { - final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); - if (s3SourceRecord != null) { - final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, sourceClient, - offsetManager); - results.add(sourceRecord); - } - } - - return results; } - static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, final S3SourceConfig s3SourceConfig, - final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { + public static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, + final S3SourceConfig s3SourceConfig, final AWSV2SourceClient sourceClient, + final OffsetManager offsetManager) { try { - 
offsetManager.updateCurrentOffsets(s3SourceRecord.getPartitionMap(), s3SourceRecord.getOffsetMap()); - s3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(s3SourceRecord.getPartitionMap())); - return s3SourceRecord.getSourceRecord(); + return s3SourceRecord.getSourceRecord(offsetManager); } catch (DataException e) { if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java index c4be5021..05ca02ba 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -25,7 +25,7 @@ public class S3SourceRecord { private final Map partitionMap; - private Map offsetMap; + private final long recordNumber; private final String topic; private final Integer topicPartition; private final SchemaAndValue keyData; @@ -34,11 +34,11 @@ public class S3SourceRecord { private final String objectKey; - public S3SourceRecord(final Map partitionMap, final Map offsetMap, - final String topic, final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, + public S3SourceRecord(final Map partitionMap, final long recordNumber, final String topic, + final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, final SchemaAndValue valueData) { this.partitionMap = new HashMap<>(partitionMap); - this.offsetMap = new HashMap<>(offsetMap); + this.recordNumber = recordNumber; this.topic = topic; this.topicPartition = topicPartition; this.keyData = keyData; @@ -50,8 +50,8 @@ public Map getPartitionMap() { return Collections.unmodifiableMap(partitionMap); } - public Map getOffsetMap() { - return Collections.unmodifiableMap(offsetMap); + public long getRecordNumber() { + return recordNumber; } public String getTopic() { @@ -66,12 +66,18 @@ public String getObjectKey() { return objectKey; } - public void setOffsetMap(final Map offsetMap) { - this.offsetMap = new HashMap<>(offsetMap); + public SchemaAndValue getKey() { + return new SchemaAndValue(keyData.schema(), keyData.value()); } - public SourceRecord getSourceRecord() { - return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), keyData.schema(), - keyData.value(), valueData.schema(), valueData.value()); + public SchemaAndValue getValue() { + return new SchemaAndValue(valueData.schema(), valueData.value()); + } + + public SourceRecord getSourceRecord(final OffsetManager offsetManager) { + final Map offsetMap = offsetManager.updateAndReturnCurrentOffsets(getPartitionMap(), + getObjectKey(), getRecordNumber()); + return new SourceRecord(getPartitionMap(), offsetMap, topic, partition(), keyData.schema(), keyData.value(), + valueData.schema(), valueData.value()); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 26f3c03c..bded51d1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,13 +16,11 @@ package 
io.aiven.kafka.connect.s3.source.utils; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import java.util.List; import java.util.Map; +import java.util.function.Function; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -33,27 +31,20 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.apache.commons.io.function.IOSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import software.amazon.awssdk.core.exception.SdkException; +import org.apache.commons.collections4.IteratorUtils; +import software.amazon.awssdk.services.s3.model.S3Object; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). */ public final class SourceRecordIterator implements Iterator { - private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?[^/]+?)-" + "(?\\d{5})-" + "(?[a-zA-Z0-9]+)" + "\\.(?[^.]+)$"); // topic-00001.txt public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; - private String currentObjectKey; - - private Iterator objectListIterator; - private Iterator recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -63,165 +54,55 @@ public final class SourceRecordIterator implements Iterator { private final Transformer transformer; // Once we decouple the S3Object from the Source Iterator we can change this to be the SourceApiClient // At which point it will work for al our integrations. - private final AWSV2SourceClient sourceClient; // NOPMD - - public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, - final Transformer transformer, final AWSV2SourceClient sourceClient) { - this.s3SourceConfig = s3SourceConfig; - this.offsetManager = offsetManager; + private final AWSV2SourceClient sourceClient; - this.bucketName = s3SourceConfig.getAwsS3BucketName(); - this.transformer = transformer; - this.sourceClient = sourceClient; - objectListIterator = sourceClient.getListOfObjectKeys(null); - } + private String topic; + private int partitionId; - private void nextS3Object() { - if (!objectListIterator.hasNext()) { - // Start after the object Key we have just finished with. 
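The block being removed here re-listed object keys starting after the key that had just been finished; the S3ObjectIterator introduced earlier in this patch keeps the same idea but rebuilds its object stream from the last seen key whenever it runs dry, which is how newly dropped files get picked up. A minimal sketch of that kind of "list everything after key X" call with the AWS SDK v2, ignoring the continuation-token pagination that the patch handles; bucket and prefix values are illustrative:

import java.util.List;
import java.util.stream.Collectors;

import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.S3Object;

final class ListAfterKeySketch {
    // Returns the keys that sort after lastSeenKey; passing null starts from the beginning.
    static List<String> keysAfter(final S3Client s3Client, final String lastSeenKey) {
        final ListObjectsV2Request request = ListObjectsV2Request.builder()
                .bucket("test-bucket0") // illustrative bucket
                .prefix("s3-source-connector-for-apache-kafka-test-") // illustrative prefix
                .startAfter(lastSeenKey)
                .build();
        return s3Client.listObjectsV2(request)
                .contents()
                .stream()
                .map(S3Object::key)
                .collect(Collectors.toList());
    }
}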
- objectListIterator = sourceClient.getListOfObjectKeys(currentObjectKey); - if (!objectListIterator.hasNext()) { - recordIterator = Collections.emptyIterator(); - return; - } - } + private final Iterator inner; - try { - currentObjectKey = objectListIterator.next(); - if (currentObjectKey != null) { - recordIterator = createIteratorForCurrentFile(); - } - } catch (IOException e) { - throw SdkException.create(e.getMessage(), e.getCause()); - } - } + private Iterator outer; - private Iterator createIteratorForCurrentFile() throws IOException { + private final Predicate fileNamePredicate = s3Object -> { - final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); - String topicName; - int defaultPartitionId; + final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(s3Object.key()); if (fileMatcher.find()) { // TODO move this from the SourceRecordIterator so that we can decouple it from S3 and make it API agnostic + topic = fileMatcher.group(PATTERN_TOPIC_KEY); + partitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); + return true; + } + return false; + }; - final IOSupplier s3Object = sourceClient.getObject(currentObjectKey); - topicName = fileMatcher.group(PATTERN_TOPIC_KEY); - defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); - - final long defaultStartOffsetId = 1L; - - final String finalTopic = topicName; - final Map partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, - bucketName); - - return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, - partitionMap); + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, + final Transformer transformer, final AWSV2SourceClient sourceClient) { + super(); + this.s3SourceConfig = s3SourceConfig; + this.offsetManager = offsetManager; - } else { - LOGGER.error("File naming doesn't match to any topic. {}", currentObjectKey); - return Collections.emptyIterator(); - } - } + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + this.transformer = transformer; + this.sourceClient = sourceClient; - @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator getObjectIterator(final IOSupplier s3Object, final String topic, - final int topicPartition, final long startOffset, final Transformer transformer, - final Map partitionMap) { - return new Iterator<>() { - private final Iterator internalIterator = readNext().iterator(); - - private List readNext() { - - final List sourceRecords = new ArrayList<>(); - - final long numberOfRecsAlreadyProcessed = offsetManager.recordsProcessedForObjectKey(partitionMap, - currentObjectKey); - - // Optimizing without reading stream again. 
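Both the checkBytesTransformation method being removed below and the new convert() method later in this patch apply the same guard: the byte-array transformer emits a whole object as a single record, so any recorded progress means the object was already emitted and its stream should not be opened again. A standalone sketch of that guard, with illustrative type names:

final class ByteArraySkipSketch {
    /** Marker for the transformer that turns one whole object into exactly one record. */
    interface Transformer { }
    static final class ByteArrayTransformer implements Transformer { }

    /**
     * True when the object can be skipped outright: with a byte-array transformer there is only
     * one record per object, so a non-zero processed count means it was already delivered.
     */
    static boolean alreadyFullyProcessed(final Transformer transformer, final long recordsProcessed) {
        return transformer instanceof ByteArrayTransformer && recordsProcessed > 0;
    }
}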
- if (checkBytesTransformation(transformer, numberOfRecsAlreadyProcessed)) { - return sourceRecords; - } - - try (Stream recordStream = transformer.getRecords(s3Object, topic, topicPartition, - s3SourceConfig, numberOfRecsAlreadyProcessed)) { - - final Iterator recordIterator = recordStream.iterator(); - while (recordIterator.hasNext()) { - final Object record = recordIterator.next(); - - sourceRecords.add(getSourceRecord(topic, topicPartition, offsetManager, startOffset, - partitionMap, transformer.getValueData(record, topic, s3SourceConfig), - transformer.getKeyData(currentObjectKey, topic, s3SourceConfig))); - - // Break if we have reached the max records per poll - if (sourceRecords.size() >= s3SourceConfig.getMaxPollRecords()) { - break; - } - } - } - - return sourceRecords; - } - - // For bytes transformation, read whole file as 1 record - private boolean checkBytesTransformation(final Transformer transformer, - final long numberOfRecsAlreadyProcessed) { - return transformer instanceof ByteArrayTransformer - && numberOfRecsAlreadyProcessed == BYTES_TRANSFORMATION_NUM_OF_RECS; - } - - private S3SourceRecord getSourceRecord(final String topic, final int topicPartition, - final OffsetManager offsetManager, final long startOffset, final Map partitionMap, - final SchemaAndValue valueData, final SchemaAndValue keyData) { - - long currentOffset; - - if (offsetManager.getOffsets().containsKey(partitionMap)) { - LOGGER.info("***** offsetManager.getOffsets() ***** {}", offsetManager.getOffsets()); - currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap, currentObjectKey, - startOffset); - } else { - LOGGER.info("Into else block ..."); - currentOffset = startOffset; - offsetManager.createNewOffsetMap(partitionMap, currentObjectKey, currentOffset); - } - - final Map offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); - - return new S3SourceRecord(partitionMap, offsetMap, topic, topicPartition, currentObjectKey, keyData, - valueData); - } - - @Override - public boolean hasNext() { - return internalIterator.hasNext(); - } - - @Override - public S3SourceRecord next() { - return internalIterator.next(); - } - }; + // call filters out bad file names and extracts topic/partition + inner = IteratorUtils.filteredIterator(sourceClient.getS3ObjectIterator(null), + s3Object -> this.fileNamePredicate.test(s3Object)); + outer = Collections.emptyIterator(); } @Override public boolean hasNext() { - return recordIterator.hasNext() || objectListIterator.hasNext(); + while (!outer.hasNext() && inner.hasNext()) { + outer = convert(inner.next()).iterator(); + } + return outer.hasNext(); } @Override public S3SourceRecord next() { - if (!recordIterator.hasNext()) { - nextS3Object(); - } - - if (!recordIterator.hasNext()) { - // If there are still no records, return null or throw an exception - return null; // Or throw new NoSuchElementException(); - } - - return recordIterator.next(); + return outer.next(); } @Override @@ -229,4 +110,63 @@ public void remove() { throw new UnsupportedOperationException("This iterator is unmodifiable"); } + /** + * Converts the S3Object into stream of S3SourceRecords. + * + * @param s3Object + * the S3Object to read data from. + * @return a stream of S3SourceRecords created from the input stream of the S3Object. 
+ */ + private Stream<S3SourceRecord> convert(final S3Object s3Object) { + + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topic, partitionId, bucketName); + final long recordCount = offsetManager.recordsProcessedForObjectKey(partitionMap, s3Object.key()); + + // Optimizing without reading stream again. + if (transformer instanceof ByteArrayTransformer && recordCount > 0) { + return Stream.empty(); + } + + final SchemaAndValue keyData = transformer.getKeyData(s3Object.key(), topic, s3SourceConfig); + + return transformer + .getRecords(sourceClient.getObject(s3Object.key()), topic, partitionId, s3SourceConfig, recordCount) + .map(new Mapper(partitionMap, recordCount, keyData, s3Object.key())); + } + + /** + * Maps the data from the {@link Transformer} stream to an S3SourceRecord, given all the additional data required. + */ + class Mapper implements Function<SchemaAndValue, S3SourceRecord> { + /** + * The partition map + */ + private final Map<String, Object> partitionMap; + /** + * The record number for the record being created. + */ + private long recordCount; + /** + * The schema and value for the key + */ + private final SchemaAndValue keyData; + /** + * The object key from S3 + */ + private final String objectKey; + + public Mapper(final Map<String, Object> partitionMap, final long recordCount, final SchemaAndValue keyData, + final String objectKey) { + this.partitionMap = partitionMap; + this.recordCount = recordCount; + this.keyData = keyData; + this.objectKey = objectKey; + } + + @Override + public S3SourceRecord apply(final SchemaAndValue valueData) { + recordCount++; + return new S3SourceRecord(partitionMap, recordCount, topic, partitionId, objectKey, keyData, valueData); + } + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 13ac6684..944ccbfd 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -23,10 +23,14 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.lang.reflect.Field; import java.net.URI; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; @@ -37,38 +41,47 @@ import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.OffsetStorageReader; +import io.aiven.kafka.connect.common.config.SourceConfigFragment; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.InputFormat; -import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.ConnectUtils; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; -import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.findify.s3mock.S3Mock; +import 
org.apache.commons.lang3.time.StopWatch; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; import software.amazon.awssdk.core.retry.RetryMode; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Configuration; -@ExtendWith(MockitoExtension.class) final class S3SourceTaskTest { + /** + * The amount of extra time that we will allow for timing errors. + */ + private static final long TIMING_DELTA = 500; + private static final Random RANDOM = new Random(); private Map properties; - private static BucketAccessor testBucketAccessor; private static final String TEST_BUCKET = "test-bucket"; + + private static final String TOPIC = "TOPIC1"; + + private static final int PARTITION = 1; + + private static final String OBJECT_KEY = "object_key"; + // TODO S3Mock has not been maintained in 4 years // Adobe have an alternative we can move to. private static S3Mock s3Api; @@ -76,12 +89,6 @@ final class S3SourceTaskTest { private static Map commonProperties; - @Mock - private SourceTaskContext mockedSourceTaskContext; - - @Mock - private OffsetStorageReader mockedOffsetStorageReader; - @BeforeAll public static void setUpClass() throws URISyntaxException { final int s3Port = RANDOM.nextInt(10_000) + 10_000; @@ -107,9 +114,6 @@ public static void setUpClass() throws URISyntaxException { .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) .build(); - - testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET); - testBucketAccessor.createBucket(); } @AfterAll @@ -121,8 +125,6 @@ public static void tearDownClass() { public void setUp() { properties = new HashMap<>(commonProperties); s3Client.createBucket(create -> create.bucket(TEST_BUCKET).build()); - mockedSourceTaskContext = mock(SourceTaskContext.class); - mockedOffsetStorageReader = mock(OffsetStorageReader.class); } @AfterEach @@ -135,74 +137,271 @@ void testS3SourceTaskInitialization() { final S3SourceTask s3SourceTask = new S3SourceTask(); startSourceTask(s3SourceTask); - final Transformer transformer = s3SourceTask.getTransformer(); - assertThat(transformer).isInstanceOf(ByteArrayTransformer.class); + assertThat(s3SourceTask.getTransformer()).isInstanceOf(ByteArrayTransformer.class); - final boolean taskInitialized = s3SourceTask.isTaskInitialized(); - assertThat(taskInitialized).isTrue(); + assertThat(s3SourceTask.isRunning()).isTrue(); } @Test - void testPoll() throws Exception { + void testStop() { final S3SourceTask s3SourceTask = new S3SourceTask(); startSourceTask(s3SourceTask); + s3SourceTask.stop(); + + assertThat(s3SourceTask.isRunning()).isFalse(); + } - SourceRecordIterator mockSourceRecordIterator; + private static S3SourceRecord createS3SourceRecord(final String topicName, final Integer defaultPartitionId, + final String bucketName, final String objectKey, final byte[] key, final byte[] value) { + return new S3SourceRecord(ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName), 0L, + topicName, defaultPartitionId, objectKey, new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, key), + new 
SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, value)); + } + + private void startSourceTask(final S3SourceTask s3SourceTask) { + final SourceTaskContext mockedSourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader mockedOffsetStorageReader = mock(OffsetStorageReader.class); + when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + s3SourceTask.initialize(mockedSourceTaskContext); - mockSourceRecordIterator = mock(SourceRecordIterator.class); - setPrivateField(s3SourceTask, "sourceRecordIterator", mockSourceRecordIterator); - when(mockSourceRecordIterator.hasNext()).thenReturn(true).thenReturn(true).thenReturn(false); + setBasicProperties(); + s3SourceTask.start(properties); + } - final S3SourceRecord s3SourceRecordList = getAivenS3SourceRecord(); - when(mockSourceRecordIterator.next()).thenReturn(s3SourceRecordList); + private void setBasicProperties() { + properties.putIfAbsent(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + properties.putIfAbsent("name", "test_source_connector"); + properties.putIfAbsent("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent("tasks.max", "1"); + properties.putIfAbsent("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + properties.putIfAbsent(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.putIfAbsent(TARGET_TOPICS, "testtopic"); - final List sourceRecordList = s3SourceTask.poll(); - assertThat(sourceRecordList).isNotEmpty(); } @Test - void testStop() { - final S3SourceTask s3SourceTask = new S3SourceTask(); + void testPollWithNoDataReturned() { + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); + final Iterator sourceRecordIterator = Collections.emptyIterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); - s3SourceTask.stop(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() + TIMING_DELTA); + } + + private void assertEquals(final S3SourceRecord s3Record, final SourceRecord sourceRecord) { + assertThat(sourceRecord).isNotNull(); + assertThat(sourceRecord.sourcePartition()).isEqualTo(s3Record.getPartitionMap()); + final Map map = (Map) sourceRecord.sourceOffset(); + + assertThat(map.get(OffsetManager.getObjectMapKey(s3Record.getObjectKey()))) + .isEqualTo(s3Record.getRecordNumber()); + assertThat(sourceRecord.key()).isEqualTo(s3Record.getKey().value()); + assertThat(sourceRecord.value()).isEqualTo(s3Record.getValue().value()); + } + + @Test + void testPollsWithRecords() { + final List lst = createS3SourceRecords(2); + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); - final boolean taskInitialized = s3SourceTask.isTaskInitialized(); - assertThat(taskInitialized).isFalse(); - assertThat(s3SourceTask.getConnectorStopped()).isTrue(); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + 
assertEquals(lst.get(1), results.get(1)); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); } - private static S3SourceRecord getAivenS3SourceRecord() { - return new S3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, "", - new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0]), - new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0])); + private List createS3SourceRecords(final int count) { + final List lst = new ArrayList<>(); + if (count > 0) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY, + "Hello".getBytes(StandardCharsets.UTF_8), "Hello World".getBytes(StandardCharsets.UTF_8))); + for (int i = 1; i < count; i++) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY + i, + "Goodbye".getBytes(StandardCharsets.UTF_8), + String.format("Goodbye cruel World (%s)", i).getBytes(StandardCharsets.UTF_8))); + } + } + return lst; } - @SuppressWarnings("PMD.AvoidAccessibilityAlteration") - private void setPrivateField(final Object object, final String fieldName, final Object value) - throws NoSuchFieldException, IllegalAccessException { - Field field; - field = object.getClass().getDeclaredField(fieldName); - field.setAccessible(true); - field.set(object, value); + @Test + void testPollWithInterruptedIterator() { + final List lst = createS3SourceRecords(3); + + final Iterator inner1 = lst.subList(0, 2).iterator(); + final Iterator inner2 = lst.subList(2, 3).iterator(); + final Iterator sourceRecordIterator = new Iterator<>() { + Iterator inner = inner1; + @Override + public boolean hasNext() { + if (inner == null) { + inner = inner2; + return false; + } + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + final S3SourceRecord result = inner.next(); + if (!inner.hasNext()) { + inner = null; // NOPMD null assignment + } + return result; + } + }; + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + assertEquals(lst.get(1), results.get(1)); + + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); + } - private void startSourceTask(final S3SourceTask s3SourceTask) { - s3SourceTask.initialize(mockedSourceTaskContext); - when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + @Test + void testPollWithSlowProducer() { + final List lst = createS3SourceRecords(3); + + final Iterator sourceRecordIterator = new Iterator<>() { + final Iterator inner = lst.iterator(); + @Override + public boolean hasNext() { + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + try { + Thread.sleep(Duration.ofSeconds(6).toMillis()); + } catch (InterruptedException e) { + // do nothing. 
+ } + return inner.next(); + } + }; + + final List results = new ArrayList<>(); + // since the polling is returning data at or near the time limit the 3 record may be returned as follows + // Record 1 may be returned in Poll1 or Poll2 + // Record 2 may be returned in Poll2 or Poll2 + // Record 3 may be returned in Poll3 or Poll4 + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + // poll 1 + List pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(1); + // poll 2 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(2); + // poll 3 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(3); + // poll 4 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (results.size() == lst.size()) { + assertThat(pollResult).isNull(); + } else { + results.addAll(pollResult); + } + assertThat(results).hasSize(3); + } - setBasicProperties(); - s3SourceTask.start(properties); + @Test + void testPollsWithExcessRecords() { + // test that multiple polls to get all records succeeds. + properties.put(SourceConfigFragment.MAX_POLL_RECORDS, "2"); + + final List lst = createS3SourceRecords(3); + + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List results = s3SourceTask.poll(); + assertThat(results).hasSize(2); + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + stopWatch.stop(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() * 2); } - private void setBasicProperties() { - properties.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - properties.put("name", "test_source_connector"); - properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("tasks.max", "1"); - properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); - properties.put(TARGET_TOPICS, "testtopic"); + @Test + void testPollWhenConnectorStopped() { + final List lst = createS3SourceRecords(3); + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + s3SourceTask.stop(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(TIMING_DELTA); + + } + + private static class TestingS3SourceTask extends S3SourceTask { // NOPMD not a test class + + TestingS3SourceTask(final Iterator realIterator) { + super(); + super.setS3SourceRecordIterator(realIterator); + } + @Override + protected void setS3SourceRecordIterator(final Iterator 
iterator) { + // do nothing. + } } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index e02135d1..cc9db65c 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -17,19 +17,17 @@ package io.aiven.kafka.connect.s3.source.utils; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import static org.mockito.internal.verification.VerificationModeFactory.times; -import java.net.ConnectException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Supplier; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; @@ -38,7 +36,6 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -61,75 +58,46 @@ class RecordProcessorTest { @Mock private AWSV2SourceClient sourceClient; - private AtomicBoolean connectorStopped; - private Iterator sourceRecordIterator; - - @BeforeEach - void setUp() { - connectorStopped = new AtomicBoolean(false); - sourceRecordIterator = mock(Iterator.class); - } + private static final Supplier TRUE = () -> true; + private static final Supplier FALSE = () -> false; @Test - void testProcessRecordsNoRecords() { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - when(sourceRecordIterator.hasNext()).thenReturn(false); - - final List results = new ArrayList<>(); - final List processedRecords = RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(processedRecords).as("Processed records should be empty when there are no records.").isEmpty(); + void testCreateSourceRecord() { + + final SourceRecord mockSourceRecord = mock(SourceRecord.class); + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mockSourceRecord); + + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + + verify(mockRecord, times(1)).getSourceRecord(any()); + assertThat(result).isEqualTo(mockSourceRecord); + } @Test - void testProcessRecordsWithRecords() throws ConnectException { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records + void testCreateSourceRecordWithDataError() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(sourceRecordIterator.next()).thenReturn(mockRecord); - - 
final List results = new ArrayList<>(); - RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(results).hasSize(1); - verify(sourceRecordIterator, times(1)).next(); - } + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("Testing exception")); - @Test - void testProcessRecordsConnectorStopped() { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - connectorStopped.set(true); // Simulate connector stopped - - final List results = new ArrayList<>(); - final List processedRecords = RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(processedRecords).as("Processed records should be empty when connector is stopped.").isEmpty(); - verify(sourceRecordIterator, never()).next(); + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); + + assertThatExceptionOfType(ConnectException.class).as("Errors tolerance: NONE") + .isThrownBy(() -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager)); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + assertThat(result).isNull(); } @Test void testCreateSourceRecords() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenReturn(mock(SourceRecord.class)); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mock(SourceRecord.class)); final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager); @@ -140,13 +108,13 @@ void testCreateSourceRecords() { @Test void errorToleranceOnNONE() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); assertThatThrownBy( () -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) - .isInstanceOf(org.apache.kafka.connect.errors.ConnectException.class) + .isInstanceOf(ConnectException.class) .hasMessage("Data Exception caught during S3 record to source record transformation"); } @@ -154,7 +122,7 @@ void errorToleranceOnNONE() { @Test void errorToleranceOnALL() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index b701ea85..af9b679f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -31,16 +31,24 @@ import java.io.ByteArrayInputStream; import 
java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.Collections; +import java.util.Iterator; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; + import io.aiven.kafka.connect.common.source.input.AvroTransformer; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import software.amazon.awssdk.services.s3.model.S3Object; final class SourceRecordIteratorTest { @@ -67,25 +75,26 @@ void testIteratorProcessesS3Objects() throws Exception { try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Stream.of(new Object())); + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - when(mockSourceApiClient.getListOfObjectKeys(any())).thenReturn(Collections.emptyIterator()); - SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Collections.emptyIterator()); + Iterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); assertThat(iterator.hasNext()).isFalse(); - assertThat(iterator.next()).isNull(); - when(mockSourceApiClient.getListOfObjectKeys(any())) - .thenReturn(Collections.singletonList(key).listIterator()); + final S3Object obj = S3Object.builder().key(key).build(); + final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); + when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); - assertThat(iterator.hasNext()).isTrue(); + assertThat(iterator).hasNext(); assertThat(iterator.next()).isNotNull(); + assertThat(iterator).isExhausted(); } } @@ -93,15 +102,17 @@ void testIteratorProcessesS3Objects() throws Exception { void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { final String key = "topic-00001-abc123.txt"; + final S3Object s3Object = S3Object.builder().key(key).build(); - // Mock InputStream - try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { - when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); + // With ByteArrayTransformer + try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { + when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); - // With ByteArrayTransformer mockTransformer = mock(ByteArrayTransformer.class); when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Stream.of(new Object())); + 
.thenReturn(Stream.of(SchemaAndValue.NULL)); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); @@ -110,24 +121,36 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient); - assertThat(iterator.hasNext()).isTrue(); - iterator.next(); + // should skip if any records were produced by source record iterator. + final Iterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, mockSourceApiClient); + assertThat(iterator.hasNext()).isFalse(); + verify(mockSourceApiClient, never()).getObject(any()); verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + } - // With AvroTransformer + // With AvroTransformer + try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { + when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); mockTransformer = mock(AvroTransformer.class); when(mockSourceApiClient.getListOfObjectKeys(any())) .thenReturn(Collections.singletonList(key).listIterator()); + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); + when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); + + final Iterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, mockSourceApiClient); assertThat(iterator.hasNext()).isTrue(); iterator.next(); verify(mockTransformer, times(1)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); } } + } diff --git a/settings.gradle.kts b/settings.gradle.kts index 21aca87b..a4451cb5 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -6,6 +6,7 @@ val avroConverterVersion by extra("7.2.2") val avroDataVersion by extra("7.2.2") val awaitilityVersion by extra("4.2.1") val commonsTextVersion by extra("1.11.0") +val commonsCollections4Version by extra("4.4") val hadoopVersion by extra("3.4.0") val hamcrestVersion by extra("2.2") val jacksonVersion by extra("2.15.3") @@ -30,6 +31,9 @@ dependencyResolutionManagement { create("apache") { library("avro", "org.apache.avro:avro:$avroVersion") library("commons-text", "org.apache.commons:commons-text:$commonsTextVersion") + library( + "commons-collection4", + "org.apache.commons:commons-collections4:$commonsCollections4Version") library("kafka-connect-api", "org.apache.kafka:connect-api:$kafkaVersion") library("kafka-connect-json", "org.apache.kafka:connect-json:$kafkaVersion") library("kafka-connect-runtime", "org.apache.kafka:connect-runtime:$kafkaVersion")
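The refactored SourceRecordIterator in this patch boils down to a lazy filter/flat-map over the object listing: a filename predicate drops objects whose keys do not match FILE_DEFAULT_PATTERN (capturing topic and partition as a side effect), convert() turns each surviving S3Object into a Stream of S3SourceRecords through the Transformer, and hasNext() flattens those per-object streams one at a time. The sketch below shows that pattern in isolation. It is illustrative only and not part of the patch: FlatteningIterator, keep and toRecords are hypothetical stand-ins for the S3-specific pieces (the filename predicate and the Transformer-backed convert() call), and an explicit NoSuchElementException guard is added in next() that the patch leaves to the underlying iterator.

import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Stream;

import org.apache.commons.collections4.IteratorUtils;

// Lazy "filter then flat-map" iterator: the same shape as the new SourceRecordIterator.
public final class FlatteningIterator<S, R> implements Iterator<R> {

    // Source objects that survive the filter (the "inner" iterator in the patch).
    private final Iterator<S> filtered;
    // Records of the object currently being consumed (the "outer" iterator in the patch).
    private Iterator<R> current = Collections.emptyIterator();
    // Converts one source object into its records; stands in for convert().
    private final Function<S, Stream<R>> toRecords;

    public FlatteningIterator(final Iterator<S> source, final Predicate<S> keep,
            final Function<S, Stream<R>> toRecords) {
        this.filtered = IteratorUtils.filteredIterator(source, keep::test);
        this.toRecords = toRecords;
    }

    @Override
    public boolean hasNext() {
        // Advance object by object until one yields records; nothing is read until it is asked for.
        while (!current.hasNext() && filtered.hasNext()) {
            current = toRecords.apply(filtered.next()).iterator();
        }
        return current.hasNext();
    }

    @Override
    public R next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return current.next();
    }
}

Because exhaustion is now expressed by hasNext() returning false rather than next() returning null, the updated SourceRecordIteratorTest can assert isExhausted() instead of checking for a null record, and S3SourceTask simply stops draining the iterator when a poll's time or record budget runs out.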