Skip to content

Commit

Permalink
Provide StoreReader::enumerate to simplify creation of secondary indexes
Browse files Browse the repository at this point in the history
For secondary indexes, it is often necessary to read all documents, compute
some function on them and associated the result with a document ID.

Currently, this requires something like

```rust
let reader = segment.get_store_reader(1)?;

for doc_id in segment.doc_ids_alive() {
    let doc = reader.get(doc_id)?;

    // Use doc and doc_id here ...
}
```

which can be simplified to

```rust
let reader = segment.get_store_reader(1)?;

for res in reader.enumerate() {
    let (doc_id, doc) = res?;

    // Use doc and doc_id here ...
}
```

using the method proposed here.

(I added a new method instead of modifying `StoreReader::iter` to make the
change backwards compatible, i.e. possible to include in a point release.)
  • Loading branch information
adamreichold committed Apr 15, 2024
1 parent b493743 commit ebf92f7
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/indexer/merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,7 @@ impl IndexMerger {
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
let (_, doc_bytes) = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(format!(
Expand Down Expand Up @@ -729,7 +729,7 @@ impl IndexMerger {
|| store_reader.decompressor() != store_writer.compressor().into()
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
let (_, doc_bytes) = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
Expand Down
23 changes: 17 additions & 6 deletions src/store/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,23 @@ impl StoreReader {
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.enumerate(alive_bitset)
.map(|res| res.map(|(_, doc)| doc))
}

/// A variant of [`iter`][Self::iter] which also yields document ID.
pub fn enumerate<'a: 'b, 'b, D: DocumentDeserialize>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<(DocId, D)>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
let (doc_id, mut doc_bytes) = doc_bytes_res?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
let doc = D::deserialize(deserializer).map_err(crate::TantivyError::from)?;

Ok((doc_id, doc))
})
}

Expand All @@ -254,7 +265,7 @@ impl StoreReader {
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
) -> impl Iterator<Item = crate::Result<(DocId, OwnedBytes)>> + 'b {
let last_doc_id = self
.block_checkpoints()
.last()
Expand Down Expand Up @@ -282,14 +293,14 @@ impl StoreReader {

let alive = alive_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
let res = if alive {
Some((curr_block.clone(), doc_pos))
Some((doc_id, curr_block.clone(), doc_pos))
} else {
None
};
doc_pos += 1;
res
})
.map(move |(block, doc_pos)| {
.map(move |(doc_id, block, doc_pos)| {
let block = block
.ok_or_else(|| {
DataCorruption::comment_only(
Expand All @@ -302,7 +313,7 @@ impl StoreReader {
})?;

let range = block_read_index(&block, doc_pos)?;
Ok(block.slice(range))
Ok((doc_id, block.slice(range)))
})
}

Expand Down

0 comments on commit ebf92f7

Please sign in to comment.