From 833312fe2afc5495045f08aa428b08574961b3a9 Mon Sep 17 00:00:00 2001 From: John Mazanec Date: Thu, 20 Feb 2025 08:42:20 -0800 Subject: [PATCH 1/2] Support custom codecs via stored fields. Adds a segment info attribute that stores the delegate codec name as a segment info attribute so that custom codecs can be used in conjunction with the knn codec. Signed-off-by: John Mazanec --- .../codec/KNN10010Codec/KNN10010Codec.java | 7 +++- .../DerivedSourceStoredFieldsFormat.java | 34 +++++++++++++++++-- .../KNN9120Codec/KNN9120Codec.java | 7 +++- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java b/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java index 73371bb61e..acec0fa7f0 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java @@ -104,6 +104,11 @@ private StoredFieldsFormat getStoredFieldsFormat() { } return null; })); - return new DerivedSourceStoredFieldsFormat(delegate.storedFieldsFormat(), derivedSourceReadersSupplier, mapperService); + return new DerivedSourceStoredFieldsFormat( + delegate.storedFieldsFormat(), + derivedSourceReadersSupplier, + mapperService, + delegate.getName() + ); } } diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java index e60b82b2e3..109431d082 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java @@ -6,6 +6,7 @@ package org.opensearch.knn.index.codec.KNN9120Codec; import lombok.AllArgsConstructor; +import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; @@ -32,15 +33,26 @@ @AllArgsConstructor public class DerivedSourceStoredFieldsFormat extends StoredFieldsFormat { + private static final String DELEGATE_CODEC_KEY = "knn_delegate_codec"; + private final StoredFieldsFormat delegate; private final DerivedSourceReadersSupplier derivedSourceReadersSupplier; // IMPORTANT Do not rely on this for the reader, it will be null if SPI is used @Nullable private final MapperService mapperService; + // IMPORTANT Do not rely on this for the reader, it will be null if SPI is used + @Nullable + private final String delegateCodecName; @Override public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext ioContext) throws IOException { + StoredFieldsFormat delegateFromWriting = delegate; + if (segmentInfo.getAttribute(DELEGATE_CODEC_KEY) != null) { + String delegateCodecName = segmentInfo.getAttribute(DELEGATE_CODEC_KEY); + delegateFromWriting = Codec.forName(delegateCodecName).storedFieldsFormat(); + } + List derivedVectorFields = null; for (FieldInfo fieldInfo : fieldInfos) { if (DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE.equals(fieldInfo.attributes().get(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY))) { @@ -53,10 +65,10 @@ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentI } // If no fields have it enabled, we can just short-circuit and return the delegate's fieldReader if (derivedVectorFields == null || derivedVectorFields.isEmpty()) { - return delegate.fieldsReader(directory, segmentInfo, fieldInfos, ioContext); + return delegateFromWriting.fieldsReader(directory, segmentInfo, fieldInfos, ioContext); } return new DerivedSourceStoredFieldsReader( - delegate.fieldsReader(directory, segmentInfo, fieldInfos, ioContext), + delegateFromWriting.fieldsReader(directory, segmentInfo, fieldInfos, ioContext), derivedVectorFields, derivedSourceReadersSupplier, new SegmentReadState(directory, segmentInfo, fieldInfos, ioContext) @@ -65,6 +77,24 @@ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentI @Override public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo segmentInfo, IOContext ioContext) throws IOException { + // We write the delegate codec name into the segmentInfo attributes so that we can read it when loading the + // codec from SPI. + // This is similar to whats done in + // https://github.com/opensearch-project/custom-codecs/blob/2.19.0.0/src/main/java/org/opensearch/index/codec/customcodecs/Lucene912CustomStoredFieldsFormat.java#L95-L100 + String previous = segmentInfo.putAttribute(DELEGATE_CODEC_KEY, delegateCodecName); + if (previous != null && previous.equals(delegateCodecName) == false) { + throw new IllegalStateException( + "found existing value for " + + DELEGATE_CODEC_KEY + + " for segment: " + + segmentInfo.name + + " old = " + + previous + + ", new = " + + delegateCodecName + ); + } + StoredFieldsWriter delegateWriter = delegate.fieldsWriter(directory, segmentInfo, ioContext); if (mapperService != null && KNNSettings.isKNNDerivedSourceEnabled(mapperService.getIndexSettings().getSettings())) { List vectorFieldTypes = new ArrayList<>(); diff --git a/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java b/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java index 375b9fe481..ecd02705b1 100644 --- a/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java +++ b/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java @@ -100,6 +100,11 @@ private StoredFieldsFormat getStoredFieldsFormat() { } return null; })); - return new DerivedSourceStoredFieldsFormat(delegate.storedFieldsFormat(), derivedSourceReadersSupplier, mapperService); + return new DerivedSourceStoredFieldsFormat( + delegate.storedFieldsFormat(), + derivedSourceReadersSupplier, + mapperService, + delegate.getName() + ); } } From 2a83f33e635ef6cf3fb87afd84bd94c4d9e2a378 Mon Sep 17 00:00:00 2001 From: John Mazanec Date: Thu, 20 Feb 2025 09:45:56 -0800 Subject: [PATCH 2/2] Add uT in WIP state Signed-off-by: John Mazanec --- .../codec/KNN10010Codec/KNN10010Codec.java | 5 +- .../KNN9120Codec/KNN9120Codec.java | 2 +- .../DerivedSourceStoredFieldsFormat.java | 3 +- .../DerivedSourceStoredFieldsReader.java | 5 +- .../DerivedSourceStoredFieldsWriter.java | 2 +- .../DerivedSourceStoredFieldsFormatTests.java | 55 +++++++++++++++++++ .../DerivedSourceStoredFieldsWriterTests.java | 2 +- 7 files changed, 62 insertions(+), 12 deletions(-) rename src/main/java/org/opensearch/knn/index/codec/{KNN9120Codec => derivedsource}/DerivedSourceStoredFieldsFormat.java (97%) rename src/main/java/org/opensearch/knn/index/codec/{KNN9120Codec => derivedsource}/DerivedSourceStoredFieldsReader.java (94%) rename src/main/java/org/opensearch/knn/index/codec/{KNN9120Codec => derivedsource}/DerivedSourceStoredFieldsWriter.java (98%) create mode 100644 src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormatTests.java rename src/test/java/org/opensearch/knn/index/codec/{KNN9120Codec => derivedsource}/DerivedSourceStoredFieldsWriterTests.java (96%) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java b/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java index acec0fa7f0..0d7913f579 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN10010Codec/KNN10010Codec.java @@ -17,7 +17,7 @@ import org.opensearch.index.mapper.MapperService; import org.opensearch.knn.index.codec.KNN80Codec.KNN80CompoundFormat; import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesFormat; -import org.opensearch.knn.index.codec.KNN9120Codec.DerivedSourceStoredFieldsFormat; +import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldsFormat; import org.opensearch.knn.index.codec.KNN9120Codec.KNN9120PerFieldKnnVectorsFormat; import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier; @@ -26,12 +26,11 @@ /** * KNN Codec that wraps the Lucene Codec which is part of Lucene 10.0.1 */ - public class KNN10010Codec extends FilterCodec { private static final String NAME = "KNN10010Codec"; public static final Codec DEFAULT_DELEGATE = new Lucene101Codec(); - private static final PerFieldKnnVectorsFormat DEFAULT_KNN_VECTOR_FORMAT = new KNN9120PerFieldKnnVectorsFormat(Optional.empty()); + public static final PerFieldKnnVectorsFormat DEFAULT_KNN_VECTOR_FORMAT = new KNN9120PerFieldKnnVectorsFormat(Optional.empty()); private final PerFieldKnnVectorsFormat perFieldKnnVectorsFormat; private final StoredFieldsFormat storedFieldsFormat; diff --git a/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java b/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java index ecd02705b1..6e7d90de8c 100644 --- a/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java +++ b/src/main/java/org/opensearch/knn/index/codec/backward_codecs/KNN9120Codec/KNN9120Codec.java @@ -16,7 +16,7 @@ import org.opensearch.index.mapper.MapperService; import org.opensearch.knn.index.codec.KNN80Codec.KNN80CompoundFormat; import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesFormat; -import org.opensearch.knn.index.codec.KNN9120Codec.DerivedSourceStoredFieldsFormat; +import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldsFormat; import org.opensearch.knn.index.codec.KNN9120Codec.KNN9120PerFieldKnnVectorsFormat; import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier; diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormat.java similarity index 97% rename from src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java rename to src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormat.java index 109431d082..50d985ab2c 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsFormat.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormat.java @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.knn.index.codec.KNN9120Codec; +package org.opensearch.knn.index.codec.derivedsource; import lombok.AllArgsConstructor; import org.apache.lucene.codecs.Codec; @@ -20,7 +20,6 @@ import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.mapper.MapperService; import org.opensearch.knn.index.KNNSettings; -import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier; import org.opensearch.knn.index.mapper.KNNVectorFieldType; import java.io.IOException; diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsReader.java similarity index 94% rename from src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java rename to src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsReader.java index 24900eb19e..af3fbf68bf 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsReader.java @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.knn.index.codec.KNN9120Codec; +package org.opensearch.knn.index.codec.derivedsource; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.index.FieldInfo; @@ -11,9 +11,6 @@ import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.util.IOUtils; import org.opensearch.index.fieldvisitor.FieldsVisitor; -import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier; -import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldVisitor; -import org.opensearch.knn.index.codec.derivedsource.DerivedSourceVectorInjector; import java.io.IOException; import java.util.List; diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriter.java similarity index 98% rename from src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java rename to src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriter.java index 0c43f6a493..32098a546e 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriter.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriter.java @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.knn.index.codec.KNN9120Codec; +package org.opensearch.knn.index.codec.derivedsource; import lombok.RequiredArgsConstructor; import org.apache.lucene.codecs.StoredFieldsWriter; diff --git a/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormatTests.java b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormatTests.java new file mode 100644 index 0000000000..b2d29169d4 --- /dev/null +++ b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsFormatTests.java @@ -0,0 +1,55 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.derivedsource; + +import lombok.SneakyThrows; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.SerialMergeScheduler; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.opensearch.knn.KNNTestCase; +import org.opensearch.knn.index.codec.KNN10010Codec.KNN10010Codec; + +import static org.apache.lucene.codecs.lucene101.Lucene101Codec.Mode.BEST_COMPRESSION; + +public class DerivedSourceStoredFieldsFormatTests extends KNNTestCase { + + @SneakyThrows + public void testCustomCodecDelegate() { + // TODO: We need to replace this with a custom codec so that we can properly test. See + // https://github.com/opensearch-project/custom-codecs/blob/main/src/main/java/org/opensearch/index/codec/customcodecs/Lucene912CustomCodec.java#L37 + Codec codec = new KNN10010Codec(new Lucene101Codec(BEST_COMPRESSION), KNN10010Codec.DEFAULT_KNN_VECTOR_FORMAT, null); + + Directory dir = newFSDirectory(createTempDir()); + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setMergeScheduler(new SerialMergeScheduler()); + iwc.setCodec(codec); + + String fieldName = "test"; + String randomString = randomAlphaOfLengthBetween(100, 1000); + TextField basicTextField = new TextField(fieldName, randomString, Field.Store.YES); + try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc)) { + Document doc = new Document(); + doc.add(basicTextField); + writer.addDocument(doc); + } + + try (IndexReader indexReader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(indexReader); + Document doc = searcher.storedFields().document(0); + assertEquals(randomString, doc.get(fieldName)); + } + dir.close(); + } +} diff --git a/src/test/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriterTests.java b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriterTests.java similarity index 96% rename from src/test/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriterTests.java rename to src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriterTests.java index 2953539adb..ff69d7cf59 100644 --- a/src/test/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsWriterTests.java +++ b/src/test/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldsWriterTests.java @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.knn.index.codec.KNN9120Codec; +package org.opensearch.knn.index.codec.derivedsource; import lombok.SneakyThrows; import org.apache.lucene.codecs.StoredFieldsWriter;