From 02d9a6c6feed4a3f2618f5d1dc8fa8a908890941 Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Sun, 7 Jul 2024 12:34:24 -0700 Subject: [PATCH 1/6] refactor the old to new ordinal logic in KNN vector wrtier --- .../lucene/codecs/KnnVectorsWriter.java | 41 +++++++++++++++++++ .../lucene99/Lucene99FlatVectorsWriter.java | 24 ++--------- .../lucene99/Lucene99HnswVectorsWriter.java | 27 ++---------- .../Lucene99ScalarQuantizedVectorsWriter.java | 23 ++--------- .../lucene/index/FieldUpdatesBuffer.java | 2 +- 5 files changed, 52 insertions(+), 65 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 8ae86d4c807e..c3872357c527 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -20,14 +20,17 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.internal.hppc.IntIntHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; import org.apache.lucene.util.Accountable; @@ -139,6 +142,44 @@ public int nextDoc() throws IOException { } } + public static void mapOldOrdToNewOrd( + DocsWithFieldSet oldDocIds, + Sorter.DocMap sortMap, + int[] old2NewOrd, + int[] new2OldOrd, + DocsWithFieldSet newDocsWithField) + throws IOException { + assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null); + IntIntHashMap newIdToOldOrd = new IntIntHashMap(); + DocIdSetIterator iterator = oldDocIds.iterator(); + int[] newDocIds = new int[oldDocIds.cardinality()]; + int oldOrd = 0; + for (int oldDocId = iterator.nextDoc(); + oldDocId != DocIdSetIterator.NO_MORE_DOCS; + oldDocId = iterator.nextDoc()) { + int newId = sortMap.oldToNew(oldDocId); + newIdToOldOrd.put(newId, oldOrd); + newDocIds[oldOrd] = newId; + oldOrd++; + } + + Arrays.sort(newDocIds); + int newOrd = 0; + for (int newDocId : newDocIds) { + int currOldOrd = newIdToOldOrd.get(newDocId); + if (old2NewOrd != null) { + old2NewOrd[currOldOrd] = newOrd; + } + if (new2OldOrd != null) { + new2OldOrd[newOrd] = currOldOrd; + } + if (newDocsWithField != null) { + newDocsWithField.add(newDocId); + } + newOrd++; + } + } + /** View over multiple vector values supporting iterator-style access via DocIdMerger. */ public static final class MergedVectorValues { private MergedVectorValues() {} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 6232489c08d1..b80c9f4d7f5c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -44,7 +44,6 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorEncoding; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -191,27 +190,10 @@ private void writeByteVectors(FieldWriter fieldData) throws IOException { private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocMap sortMap) throws IOException { - final int[] docIdOffsets = new int[sortMap.size()]; - int offset = 1; // 0 means no vector for this (field, document) - DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } + final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord + DocsWithFieldSet newDocsWithField = new DocsWithFieldSet(); - final int[] ordMap = new int[offset - 1]; // new ord to old ord - int ord = 0; - int doc = 0; - for (int docIdOffset : docIdOffsets) { - if (docIdOffset != 0) { - ordMap[ord] = docIdOffset - 1; - newDocsWithField.add(doc); - ord++; - } - doc++; - } + mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField); // write vector values long vectorDataOffset = diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index 8f715993a2b0..e2171a3513fe 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -196,29 +196,10 @@ private void writeField(FieldWriter fieldData) throws IOException { private void writeSortingField(FieldWriter fieldData, Sorter.DocMap sortMap) throws IOException { - final int[] docIdOffsets = new int[sortMap.size()]; - int offset = 1; // 0 means no vector for this (field, document) - DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } - DocsWithFieldSet newDocsWithField = new DocsWithFieldSet(); - final int[] ordMap = new int[offset - 1]; // new ord to old ord - final int[] oldOrdMap = new int[offset - 1]; // old ord to new ord - int ord = 0; - int doc = 0; - for (int docIdOffset : docIdOffsets) { - if (docIdOffset != 0) { - ordMap[ord] = docIdOffset - 1; - oldOrdMap[docIdOffset - 1] = ord; - newDocsWithField.add(doc); - ord++; - } - doc++; - } + final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord + final int[] oldOrdMap = new int[fieldData.docsWithField.cardinality()]; // old ord to new ord + + mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, oldOrdMap, ordMap, null); // write graph long vectorIndexOffset = vectorIndex.getFilePointer(); OnHeapHnswGraph graph = fieldData.getGraph(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index c052ce20646d..98dfd891ebf5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -399,27 +399,10 @@ private void writeQuantizedVectors(FieldWriter fieldData) throws IOException { private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocMap sortMap) throws IOException { - final int[] docIdOffsets = new int[sortMap.size()]; - int offset = 1; // 0 means no vector for this (field, document) - DocIdSetIterator iterator = fieldData.docsWithField.iterator(); - for (int docID = iterator.nextDoc(); - docID != DocIdSetIterator.NO_MORE_DOCS; - docID = iterator.nextDoc()) { - int newDocID = sortMap.oldToNew(docID); - docIdOffsets[newDocID] = offset++; - } + final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord + DocsWithFieldSet newDocsWithField = new DocsWithFieldSet(); - final int[] ordMap = new int[offset - 1]; // new ord to old ord - int ord = 0; - int doc = 0; - for (int docIdOffset : docIdOffsets) { - if (docIdOffset != 0) { - ordMap[ord] = docIdOffset - 1; - newDocsWithField.add(doc); - ord++; - } - doc++; - } + mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField); // write vector values long vectorDataOffset = quantizedVectorData.alignFilePointer(Float.BYTES); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java b/lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java index 9739df9cd837..a4d38dcd2030 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldUpdatesBuffer.java @@ -356,7 +356,7 @@ BufferedUpdate next() throws IOException { } } - BytesRef nextTerm() throws IOException { + private BytesRef nextTerm() throws IOException { if (lookAheadTermIterator != null) { if (bufferedUpdate.termValue == null) { lookAheadTermIterator.next(); From 46b8c5adce56890420f7a1d5faab69c98ec46689 Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Sun, 7 Jul 2024 16:04:57 -0700 Subject: [PATCH 2/6] update javadoc --- .../lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java | 4 ++-- .../lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java | 8 -------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java index 78e0cf000fab..c20c708b0b2d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java @@ -56,8 +56,8 @@ *
  • [vlong] length of this field's vectors, in bytes *
  • [vint] dimension of this field's vectors *
  • [int] the number of documents having values for this field - *
  • [int8] if equals to -1, dense – all documents have values for a field. If equals to - * 0, sparse – some documents missing values. + *
  • [int8] if equals to -2, empty - no vectory values. If equals to -1, dense – all + * documents have values for a field. If equals to 0, sparse – some documents missing values. *
  • DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)} *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note * that only in sparse case diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java index 3238fd1f4ae4..117393706dbd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java @@ -24,14 +24,11 @@ import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; -import org.apache.lucene.codecs.lucene90.IndexedDISI; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TaskExecutor; -import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.hnsw.HnswGraph; /** @@ -69,11 +66,6 @@ *
  • [vlong] length of this field's index data, in bytes *
  • [vint] dimension of this field's vectors *
  • [int] the number of documents having values for this field - *
  • [int8] if equals to -1, dense – all documents have values for a field. If equals to - * 0, sparse – some documents missing values. - *
  • DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)} - *
  • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note - * that only in sparse case *
  • [vint] the maximum number of connections (neighbours) that each node can have *
  • [vint] number of levels in the graph *
  • Graph nodes by level. For each level From b0c8b2f734483014d5f84eb1b204afb4db8ad8b3 Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Sun, 7 Jul 2024 16:12:04 -0700 Subject: [PATCH 3/6] CHNAGES.txt --- lucene/CHANGES.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 320de7aaefe8..2f05f946b7db 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -258,7 +258,8 @@ New Features Improvements --------------------- -(No changes) + +* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai) Optimizations --------------------- From f8d0df107ccfce9a5aaaa95f77ed72889626ca79 Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Sun, 7 Jul 2024 16:29:49 -0700 Subject: [PATCH 4/6] javadoc --- .../org/apache/lucene/codecs/KnnVectorsWriter.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index c3872357c527..8ae3cbc5760b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -142,6 +142,14 @@ public int nextDoc() throws IOException { } } + /** + * Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method return + * nothing and output are written to parameters + * + * @param old2NewOrd int[] maps from old ord to new ord + * @param new2OldOrd int[] maps from new ord to old ord + * @param newDocsWithField set of new doc ids which has the value + */ public static void mapOldOrdToNewOrd( DocsWithFieldSet oldDocIds, Sorter.DocMap sortMap, @@ -149,6 +157,8 @@ public static void mapOldOrdToNewOrd( int[] new2OldOrd, DocsWithFieldSet newDocsWithField) throws IOException { + // TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping + // maybe we can do a further refactoring assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null); IntIntHashMap newIdToOldOrd = new IntIntHashMap(); DocIdSetIterator iterator = oldDocIds.iterator(); From 396695cbc86f79da6aab6cbad94d7397e847f17e Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Mon, 8 Jul 2024 10:58:45 -0700 Subject: [PATCH 5/6] Addressing comment --- .../java/org/apache/lucene/codecs/KnnVectorsWriter.java | 7 +++++++ .../codecs/lucene95/OrdToDocDISIReaderConfiguration.java | 2 +- .../lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index 8ae3cbc5760b..fb40a51d7c7e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocsWithFieldSet; @@ -146,6 +147,8 @@ public int nextDoc() throws IOException { * Given old doc ids and an id mapping, maps old ordinal to new ordinal. Note: this method return * nothing and output are written to parameters * + * @param oldDocIds the old or current document ordinals. Must not be null. + * @param sortMap, the document sorting map for how to make the new ordinals. Must not be null. * @param old2NewOrd int[] maps from old ord to new ord * @param new2OldOrd int[] maps from new ord to old ord * @param newDocsWithField set of new doc ids which has the value @@ -159,7 +162,11 @@ public static void mapOldOrdToNewOrd( throws IOException { // TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping // maybe we can do a further refactoring + Objects.requireNonNull(oldDocIds); + Objects.requireNonNull(sortMap); assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null); + assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality()); + assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality()); IntIntHashMap newIdToOldOrd = new IntIntHashMap(); DocIdSetIterator iterator = oldDocIds.iterator(); int[] newDocIds = new int[oldDocIds.cardinality()]; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java index a13485eed5d3..e4c921ddee22 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/OrdToDocDISIReaderConfiguration.java @@ -40,7 +40,7 @@ public class OrdToDocDISIReaderConfiguration { *

    Within outputMeta the format is as follows: * *

      - *
    • [int8] if equals to -2, empty - no vectory values. If equals to -1, dense – all + *
    • [int8] if equals to -2, empty - no vector values. If equals to -1, dense – all * documents have values for a field. If equals to 0, sparse – some documents missing * values. *
    • DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java index c20c708b0b2d..c8ef2709db66 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java @@ -56,7 +56,7 @@ *
    • [vlong] length of this field's vectors, in bytes *
    • [vint] dimension of this field's vectors *
    • [int] the number of documents having values for this field - *
    • [int8] if equals to -2, empty - no vectory values. If equals to -1, dense – all + *
    • [int8] if equals to -2, empty - no vector values. If equals to -1, dense – all * documents have values for a field. If equals to 0, sparse – some documents missing values. *
    • DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)} *
    • OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note From 34d55aae10b674d198f3860149f6f4a59f24f46e Mon Sep 17 00:00:00 2001 From: Patrick Zhai Date: Mon, 8 Jul 2024 11:40:10 -0700 Subject: [PATCH 6/6] javadoc fix --- .../src/java/org/apache/lucene/codecs/KnnVectorsWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java index fb40a51d7c7e..053ab893df18 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java @@ -148,7 +148,7 @@ public int nextDoc() throws IOException { * nothing and output are written to parameters * * @param oldDocIds the old or current document ordinals. Must not be null. - * @param sortMap, the document sorting map for how to make the new ordinals. Must not be null. + * @param sortMap the document sorting map for how to make the new ordinals. Must not be null. * @param old2NewOrd int[] maps from old ord to new ord * @param new2OldOrd int[] maps from new ord to old ord * @param newDocsWithField set of new doc ids which has the value