Refactor and javadoc update for KNN vector writer classes #13548

Merged: 6 commits, Jul 8, 2024
lucene/CHANGES.txt: 3 changes (2 additions & 1 deletion)
@@ -258,7 +258,8 @@ New Features

 Improvements
 ---------------------
-(No changes)
 
+* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)
+
 Optimizations
 ---------------------
lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java
@@ -20,14 +20,18 @@
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
 import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.DocsWithFieldSet;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.internal.hppc.IntIntHashMap;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.VectorScorer;
 import org.apache.lucene.util.Accountable;
@@ -139,6 +143,60 @@ public int nextDoc() throws IOException {
     }
   }
 
+  /**
+   * Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method
+   * returns nothing; the output is written to the parameters.
+   *
+   * @param oldDocIds the old or current document ordinals. Must not be null.
+   * @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
+   * @param old2NewOrd int[] maps from old ord to new ord
+   * @param new2OldOrd int[] maps from new ord to old ord
+   * @param newDocsWithField set of new doc ids which have the value
+   */
+  public static void mapOldOrdToNewOrd(
+      DocsWithFieldSet oldDocIds,
+      Sorter.DocMap sortMap,
+      int[] old2NewOrd,
+      int[] new2OldOrd,
+      DocsWithFieldSet newDocsWithField)
+      throws IOException {
+    // TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
+    // maybe we can do a further refactoring
+    Objects.requireNonNull(oldDocIds);
+    Objects.requireNonNull(sortMap);
+    assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
+    assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
+    assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
+    IntIntHashMap newIdToOldOrd = new IntIntHashMap();
+    DocIdSetIterator iterator = oldDocIds.iterator();
+    int[] newDocIds = new int[oldDocIds.cardinality()];
+    int oldOrd = 0;
+    for (int oldDocId = iterator.nextDoc();
+        oldDocId != DocIdSetIterator.NO_MORE_DOCS;
+        oldDocId = iterator.nextDoc()) {
+      int newId = sortMap.oldToNew(oldDocId);
+      newIdToOldOrd.put(newId, oldOrd);
+      newDocIds[oldOrd] = newId;
+      oldOrd++;
+    }
+
+    Arrays.sort(newDocIds);
+    int newOrd = 0;
+    for (int newDocId : newDocIds) {
+      int currOldOrd = newIdToOldOrd.get(newDocId);
+      if (old2NewOrd != null) {
+        old2NewOrd[currOldOrd] = newOrd;
+      }
+      if (new2OldOrd != null) {
+        new2OldOrd[newOrd] = currOldOrd;
+      }
+      if (newDocsWithField != null) {
+        newDocsWithField.add(newDocId);
+      }
+      newOrd++;
+    }
+  }
+
   /** View over multiple vector values supporting iterator-style access via DocIdMerger. */
   public static final class MergedVectorValues {
     private MergedVectorValues() {}
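Reviewer note: a minimal usage sketch of the new helper, not part of this PR. The 5-doc segment, the reversing sort map, and the class name MapOrdExample are made up for illustration; it assumes the public Sorter.DocMap and DocsWithFieldSet APIs as of Lucene 9.x. Docs 1, 3 and 4 hold vectors, so their old ordinals are 0, 1 and 2, and the index sort reverses doc order.

import java.io.IOException;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.Sorter;

public class MapOrdExample {
  public static void main(String[] args) throws IOException {
    // Old segment: docs 1, 3 and 4 have vectors (old ords 0, 1, 2).
    DocsWithFieldSet oldDocs = new DocsWithFieldSet();
    oldDocs.add(1);
    oldDocs.add(3);
    oldDocs.add(4);

    // Toy sort map reversing a 5-doc segment: doc d maps to 4 - d.
    Sorter.DocMap reverse =
        new Sorter.DocMap() {
          @Override
          public int oldToNew(int docID) {
            return 4 - docID;
          }

          @Override
          public int newToOld(int docID) {
            return 4 - docID;
          }

          @Override
          public int size() {
            return 5;
          }
        };

    int[] old2New = new int[oldDocs.cardinality()];
    int[] new2Old = new int[oldDocs.cardinality()];
    DocsWithFieldSet newDocs = new DocsWithFieldSet();
    KnnVectorsWriter.mapOldOrdToNewOrd(oldDocs, reverse, old2New, new2Old, newDocs);
    // newDocs is now {0, 1, 3} (old docs 4, 3 and 1 after sorting), and both
    // old2New and new2Old are [2, 1, 0] since this sort is its own inverse.
  }
}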
lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java
@@ -40,7 +40,7 @@ public class OrdToDocDISIReaderConfiguration {
  * <p>Within outputMeta the format is as follows:
  *
  * <ul>
- *   <li><b>[int8]</b> if equals to -2, empty - no vectory values. If equals to -1, dense – all
+ *   <li><b>[int8]</b> if equals to -2, empty - no vector values. If equals to -1, dense – all
  *       documents have values for a field. If equals to 0, sparse – some documents missing
  *       values.
  *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput,
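Reviewer note: to make the three int8 marker values concrete, here is a simplified sketch of the decision the javadoc describes. The helper name writeDocIdMarker is hypothetical, not the actual Lucene writer code, and the real sparse path also records IndexedDISI metadata beyond this byte.

// Hypothetical helper illustrating the marker byte, based on the javadoc above.
static void writeDocIdMarker(IndexOutput meta, int docsWithValue, int maxDoc) throws IOException {
  if (docsWithValue == 0) {
    meta.writeByte((byte) -2); // empty: no document has a vector value
  } else if (docsWithValue == maxDoc) {
    meta.writeByte((byte) -1); // dense: every document has a vector value
  } else {
    meta.writeByte((byte) 0); // sparse: doc ids follow via IndexedDISI#writeBitSet
  }
}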
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java
@@ -56,8 +56,8 @@
  *   <li><b>[vlong]</b> length of this field's vectors, in bytes
  *   <li><b>[vint]</b> dimension of this field's vectors
  *   <li><b>[int]</b> the number of documents having values for this field
- *   <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
- *       0, sparse – some documents missing values.
+ *   <li><b>[int8]</b> if equals to -2, empty - no vector values. If equals to -1, dense – all
+ *       documents have values for a field. If equals to 0, sparse – some documents missing values.
  *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}
  *   <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
  *       that only in sparse case
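Reviewer note: on why OrdToDoc is only stored in the sparse case: for a dense field the i-th vector belongs to doc i, so ordinal and doc id coincide and there is nothing to store. A sketch with a hypothetical helper; DirectMonotonicReader is the reading-side counterpart of DirectMonotonicWriter.

// Hypothetical reader-side resolution of the doc id for vector ordinal `ord`.
int ordToDoc(int ord, boolean dense, DirectMonotonicReader ordToDocMap) {
  return dense ? ord : (int) ordToDocMap.get(ord); // sparse: consult the stored mapping
}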
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -44,7 +44,6 @@
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -191,27 +190,10 @@ private void writeByteVectors(FieldWriter<?> fieldData) throws IOException {

   private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+
     DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
     long vectorDataOffset =
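Reviewer note: a worked trace on hypothetical inputs showing the deleted loop and the new helper agree. Suppose docs 0, 1 and 2 all have vectors and the index sort reverses them (oldToNew: d -> 2 - d):

// Old code: docIdOffsets, indexed by new doc id, ends up as [3, 2, 1]
// (each entry is old ord + 1); scanning it in new-doc order yields
//   ordMap = [2, 1, 0] and newDocsWithField = {0, 1, 2}.
// New code: mapOldOrdToNewOrd(docsWithField, sortMap, null, ordMap, newDocsWithField)
// fills the same ordMap = [2, 1, 0] and newDocsWithField = {0, 1, 2}.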
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java
@@ -24,14 +24,11 @@
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
-import org.apache.lucene.codecs.lucene90.IndexedDISI;
 import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.MergeScheduler;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TaskExecutor;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.hnsw.HnswGraph;
 
 /**
@@ -69,11 +66,6 @@
  *   <li><b>[vlong]</b> length of this field's index data, in bytes
  *   <li><b>[vint]</b> dimension of this field's vectors
  *   <li><b>[int]</b> the number of documents having values for this field
- *   <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
- *       0, sparse – some documents missing values.
- *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}
- *   <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
- *       that only in sparse case
  *   <li><b>[vint]</b> the maximum number of connections (neighbours) that each node can have
  *   <li><b>[vint]</b> number of levels in the graph
  *   <li>Graph nodes by level. For each level
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java
@@ -196,29 +196,10 @@ private void writeField(FieldWriter<?> fieldData) throws IOException {

   private void writeSortingField(FieldWriter<?> fieldData, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
-    DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    final int[] oldOrdMap = new int[offset - 1]; // old ord to new ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        oldOrdMap[docIdOffset - 1] = ord;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+    final int[] oldOrdMap = new int[fieldData.docsWithField.cardinality()]; // old ord to new ord
+
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, oldOrdMap, ordMap, null);
     // write graph
     long vectorIndexOffset = vectorIndex.getFilePointer();
     OnHeapHnswGraph graph = fieldData.getGraph();
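Reviewer note: since this writer requests both directions, ordMap (new ord to old ord) and oldOrdMap (old ord to new ord) are inverse permutations of each other. A cheap sanity check one could add locally (a sketch, not part of this diff):

// Each new ordinal must round-trip through the old ordinal back to itself.
for (int newOrd = 0; newOrd < ordMap.length; newOrd++) {
  assert oldOrdMap[ordMap[newOrd]] == newOrd : "ord maps are not inverses";
}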
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java
@@ -399,27 +399,10 @@ private void writeQuantizedVectors(FieldWriter fieldData) throws IOException {

   private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+
     DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);
 
     // write vector values
     long vectorDataOffset = quantizedVectorData.alignFilePointer(Float.BYTES);
@@ -356,7 +356,7 @@ BufferedUpdate next() throws IOException {
       }
     }
 
-    BytesRef nextTerm() throws IOException {
+    private BytesRef nextTerm() throws IOException {

Member:

seems like an extra thing out of nowhere? It's probably ok in this commit, just struck me as a strange inclusion :)

Contributor Author:

Yeah, I just edited it when I was going over the code; thought it should be fine to leave it here :)

       if (lookAheadTermIterator != null) {
         if (bufferedUpdate.termValue == null) {
           lookAheadTermIterator.next();