Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support custom codecs via stored fields #2547

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import org.opensearch.index.mapper.MapperService;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80CompoundFormat;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesFormat;
import org.opensearch.knn.index.codec.KNN9120Codec.DerivedSourceStoredFieldsFormat;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldsFormat;
import org.opensearch.knn.index.codec.KNN9120Codec.KNN9120PerFieldKnnVectorsFormat;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier;

Expand All @@ -26,12 +26,11 @@
/**
* KNN Codec that wraps the Lucene codec that is part of Lucene 10.1 (Lucene101Codec)
*/

public class KNN10010Codec extends FilterCodec {

private static final String NAME = "KNN10010Codec";
public static final Codec DEFAULT_DELEGATE = new Lucene101Codec();
private static final PerFieldKnnVectorsFormat DEFAULT_KNN_VECTOR_FORMAT = new KNN9120PerFieldKnnVectorsFormat(Optional.empty());
public static final PerFieldKnnVectorsFormat DEFAULT_KNN_VECTOR_FORMAT = new KNN9120PerFieldKnnVectorsFormat(Optional.empty());

private final PerFieldKnnVectorsFormat perFieldKnnVectorsFormat;
private final StoredFieldsFormat storedFieldsFormat;
Expand Down Expand Up @@ -104,6 +103,11 @@ private StoredFieldsFormat getStoredFieldsFormat() {
}
return null;
}));
return new DerivedSourceStoredFieldsFormat(delegate.storedFieldsFormat(), derivedSourceReadersSupplier, mapperService);
return new DerivedSourceStoredFieldsFormat(
delegate.storedFieldsFormat(),
derivedSourceReadersSupplier,
mapperService,
delegate.getName()
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import org.opensearch.index.mapper.MapperService;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80CompoundFormat;
import org.opensearch.knn.index.codec.KNN80Codec.KNN80DocValuesFormat;
import org.opensearch.knn.index.codec.KNN9120Codec.DerivedSourceStoredFieldsFormat;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldsFormat;
import org.opensearch.knn.index.codec.KNN9120Codec.KNN9120PerFieldKnnVectorsFormat;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier;

Expand Down Expand Up @@ -100,6 +100,11 @@ private StoredFieldsFormat getStoredFieldsFormat() {
}
return null;
}));
return new DerivedSourceStoredFieldsFormat(delegate.storedFieldsFormat(), derivedSourceReadersSupplier, mapperService);
return new DerivedSourceStoredFieldsFormat(
delegate.storedFieldsFormat(),
derivedSourceReadersSupplier,
mapperService,
delegate.getName()
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.KNN9120Codec;
package org.opensearch.knn.index.codec.derivedsource;

import lombok.AllArgsConstructor;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
Expand All @@ -19,7 +20,6 @@
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.MapperService;
import org.opensearch.knn.index.KNNSettings;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier;
import org.opensearch.knn.index.mapper.KNNVectorFieldType;

import java.io.IOException;
Expand All @@ -32,15 +32,26 @@
@AllArgsConstructor
public class DerivedSourceStoredFieldsFormat extends StoredFieldsFormat {

private static final String DELEGATE_CODEC_KEY = "knn_delegate_codec";

private final StoredFieldsFormat delegate;
private final DerivedSourceReadersSupplier derivedSourceReadersSupplier;
// IMPORTANT: Do not rely on this for the reader; it will be null if SPI is used
@Nullable
private final MapperService mapperService;
// IMPORTANT: Do not rely on this for the reader; it will be null if SPI is used
@Nullable
private final String delegateCodecName;

@Override
public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext ioContext)
throws IOException {
StoredFieldsFormat delegateFromWriting = delegate;
if (segmentInfo.getAttribute(DELEGATE_CODEC_KEY) != null) {
String delegateCodecName = segmentInfo.getAttribute(DELEGATE_CODEC_KEY);
delegateFromWriting = Codec.forName(delegateCodecName).storedFieldsFormat();
}

List<FieldInfo> derivedVectorFields = null;
for (FieldInfo fieldInfo : fieldInfos) {
if (DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE.equals(fieldInfo.attributes().get(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY))) {
Expand All @@ -53,10 +64,10 @@ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentI
}
// If no fields have it enabled, we can just short-circuit and return the delegate's fieldReader
if (derivedVectorFields == null || derivedVectorFields.isEmpty()) {
return delegate.fieldsReader(directory, segmentInfo, fieldInfos, ioContext);
return delegateFromWriting.fieldsReader(directory, segmentInfo, fieldInfos, ioContext);
}
return new DerivedSourceStoredFieldsReader(
delegate.fieldsReader(directory, segmentInfo, fieldInfos, ioContext),
delegateFromWriting.fieldsReader(directory, segmentInfo, fieldInfos, ioContext),
derivedVectorFields,
derivedSourceReadersSupplier,
new SegmentReadState(directory, segmentInfo, fieldInfos, ioContext)
Expand All @@ -65,6 +76,24 @@ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo segmentI

@Override
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo segmentInfo, IOContext ioContext) throws IOException {
// We write the delegate codec name into the segmentInfo attributes so that we can read it when loading the
// codec from SPI.
// This is similar to what's done in
// https://github.com/opensearch-project/custom-codecs/blob/2.19.0.0/src/main/java/org/opensearch/index/codec/customcodecs/Lucene912CustomStoredFieldsFormat.java#L95-L100
String previous = segmentInfo.putAttribute(DELEGATE_CODEC_KEY, delegateCodecName);
if (previous != null && previous.equals(delegateCodecName) == false) {
throw new IllegalStateException(
"found existing value for "
+ DELEGATE_CODEC_KEY
+ " for segment: "
+ segmentInfo.name
+ " old = "
+ previous
+ ", new = "
+ delegateCodecName
);
}

StoredFieldsWriter delegateWriter = delegate.fieldsWriter(directory, segmentInfo, ioContext);
if (mapperService != null && KNNSettings.isKNNDerivedSourceEnabled(mapperService.getIndexSettings().getSettings())) {
List<String> vectorFieldTypes = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,14 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.KNN9120Codec;
package org.opensearch.knn.index.codec.derivedsource;

import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.IOUtils;
import org.opensearch.index.fieldvisitor.FieldsVisitor;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceReadersSupplier;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldVisitor;
import org.opensearch.knn.index.codec.derivedsource.DerivedSourceVectorInjector;

import java.io.IOException;
import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.KNN9120Codec;
package org.opensearch.knn.index.codec.derivedsource;

import lombok.RequiredArgsConstructor;
import org.apache.lucene.codecs.StoredFieldsWriter;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.derivedsource;

import lombok.SneakyThrows;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.opensearch.knn.KNNTestCase;
import org.opensearch.knn.index.codec.KNN10010Codec.KNN10010Codec;

import static org.apache.lucene.codecs.lucene101.Lucene101Codec.Mode.BEST_COMPRESSION;

/**
 * Round-trip test for stored fields written through {@link KNN10010Codec} when it wraps a
 * delegate codec configured differently from the default.
 */
public class DerivedSourceStoredFieldsFormatTests extends KNNTestCase {

    /**
     * Indexes a single document containing one stored text field using a {@link KNN10010Codec}
     * that wraps {@code Lucene101Codec(BEST_COMPRESSION)}, then reopens the index and asserts
     * that the stored value is read back unchanged.
     *
     * <p>The third constructor argument is {@code null}; presumably this is the mapper service,
     * so derived source handling is not exercised here (confirm against the codec constructor).
     */
    @SneakyThrows
    public void testCustomCodecDelegate() {
        // TODO: We need to replace this with a custom codec so that we can properly test. See
        // https://github.com/opensearch-project/custom-codecs/blob/main/src/main/java/org/opensearch/index/codec/customcodecs/Lucene912CustomCodec.java#L37
        Codec codec = new KNN10010Codec(new Lucene101Codec(BEST_COMPRESSION), KNN10010Codec.DEFAULT_KNN_VECTOR_FORMAT, null);

        // The directory is managed by try-with-resources so it is closed even when indexing,
        // reading, or the assertion below fails.
        try (Directory dir = newFSDirectory(createTempDir())) {
            IndexWriterConfig iwc = newIndexWriterConfig();
            iwc.setMergeScheduler(new SerialMergeScheduler());
            iwc.setCodec(codec);

            String fieldName = "test";
            String randomString = randomAlphaOfLengthBetween(100, 1000);
            TextField basicTextField = new TextField(fieldName, randomString, Field.Store.YES);

            // Closing the writer before opening the reader makes the document visible to it.
            try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc)) {
                Document doc = new Document();
                doc.add(basicTextField);
                writer.addDocument(doc);
            }

            try (IndexReader indexReader = DirectoryReader.open(dir)) {
                IndexSearcher searcher = new IndexSearcher(indexReader);
                Document doc = searcher.storedFields().document(0);
                assertEquals(randomString, doc.get(fieldName));
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.knn.index.codec.KNN9120Codec;
package org.opensearch.knn.index.codec.derivedsource;

import lombok.SneakyThrows;
import org.apache.lucene.codecs.StoredFieldsWriter;
Expand Down
Loading