split Term into Term and IndexingTerm #2366

Open · wants to merge 5 commits into main
48 changes: 21 additions & 27 deletions src/core/json_utils.rs
@@ -4,6 +4,7 @@ use rustc_hash::FxHashMap;

use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
+ use crate::schema::indexing_term::IndexingTerm;
use crate::schema::{Field, Type};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
@@ -74,7 +75,7 @@ pub(crate) fn index_json_values<'a, V: Value<'a>>(
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
- term_buffer: &mut Term,
+ term_buffer: &mut IndexingTerm,
postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext,
@@ -103,7 +104,7 @@ fn index_json_object<'a, V: Value<'a>>(
doc: DocId,
json_visitor: V::ObjectIter,
text_analyzer: &mut TextAnalyzer,
- term_buffer: &mut Term,
+ term_buffer: &mut IndexingTerm,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
@@ -130,19 +131,16 @@ fn index_json_value<'a, V: Value<'a>>(
doc: DocId,
json_value: V,
text_analyzer: &mut TextAnalyzer,
- term_buffer: &mut Term,
+ term_buffer: &mut IndexingTerm,
json_path_writer: &mut JsonPathWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath,
) {
- let set_path_id = |term_buffer: &mut Term, unordered_id: u32| {
+ let set_path_id = |term_buffer: &mut IndexingTerm, unordered_id: u32| {
term_buffer.truncate_value_bytes(0);
term_buffer.append_bytes(&unordered_id.to_be_bytes());
};
- let set_type = |term_buffer: &mut Term, typ: Type| {
- term_buffer.append_bytes(&[typ.to_code()]);
- };

match json_value.as_value() {
ReferenceValue::Leaf(leaf) => match leaf {
@@ -155,7 +153,7 @@

// TODO: make sure the chain position works out.
set_path_id(term_buffer, unordered_id);
- set_type(term_buffer, Type::Str);
+ term_buffer.append_bytes(&[Type::Str.to_code()]);
let indexing_position = positions_per_path.get_position_from_id(unordered_id);
postings_writer.index_text(
doc,
@@ -211,18 +209,16 @@
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::PreTokStr(_) => {
- unimplemented!(
- "Pre-tokenized string support in dynamic fields is not yet implemented"
- )
+ unimplemented!("Pre-tokenized string support in JSON fields is not yet implemented")
}
ReferenceValueLeaf::Bytes(_) => {
unimplemented!("Bytes support in dynamic fields is not yet implemented")
unimplemented!("Bytes support in JSON fields is not yet implemented")
}
ReferenceValueLeaf::Facet(_) => {
unimplemented!("Facet support in dynamic fields is not yet implemented")
unimplemented!("Facet support in JSON fields is not yet implemented")
}
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
unimplemented!("IP address support in JSON fields is not yet implemented")
}
},
ReferenceValue::Array(elements) => {
@@ -257,14 +253,12 @@
/// Tries to infer a JSON type from a string and append it to the term.
///
/// The term must be json + JSON path.
- pub(crate) fn convert_to_fast_value_and_append_to_json_term(
- mut term: Term,
- phrase: &str,
- ) -> Option<Term> {
+ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
assert_eq!(
term.value()
- .as_json_value_bytes()
+ .as_json()
.expect("expecting a Term with a json type and json path")
+ .1
.as_serialized()
.len(),
0,
@@ -410,8 +404,8 @@ mod tests {
term.append_type_and_fast_value(-4i64);

assert_eq!(
- term.serialized_term(),
- b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
+ term.value().as_serialized(),
+ b"jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
)
}

@@ -422,8 +416,8 @@
term.append_type_and_fast_value(4u64);

assert_eq!(
- term.serialized_term(),
- b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
+ term.value().as_serialized(),
+ b"jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
)
}

@@ -433,8 +427,8 @@
let mut term = term_from_json_paths(field, ["color"].into_iter(), false);
term.append_type_and_fast_value(4.0f64);
assert_eq!(
- term.serialized_term(),
- b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
+ term.value().as_serialized(),
+ b"jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
)
}

@@ -444,8 +438,8 @@
let mut term = term_from_json_paths(field, ["color"].into_iter(), false);
term.append_type_and_fast_value(true);
assert_eq!(
- term.serialized_term(),
- b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
+ term.value().as_serialized(),
+ b"jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
)
}
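
Note: the four assertions above all encode the same layout change. The old serialized_term() covered the full term key, beginning with a 4-byte big-endian field id; the new value().as_serialized() starts directly at the JSON type code. For the i64 test, the bytes break down as:

    old: \x00\x00\x00\x01 | j    | color | \x00 | i   | \x7f\xff\xff\xff\xff\xff\xff\xfc
         field id           json   path    end    i64   -4 mapped to u64, big-endian
    new:                    j    | color | \x00 | i   | \x7f\xff\xff\xff\xff\xff\xff\xfc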

15 changes: 7 additions & 8 deletions src/indexer/segment_writer.rs
@@ -1,4 +1,3 @@
- use columnar::MonotonicallyMappableToU64;
use common::JsonPathWriter;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
@@ -15,7 +14,8 @@ use crate::postings::{
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::document::{Document, ReferenceValue, Value};
- use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
+ use crate::schema::indexing_term::IndexingTerm;
+ use crate::schema::{FieldEntry, FieldType, Schema};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, SegmentComponent, TantivyError};
@@ -70,7 +70,7 @@ pub struct SegmentWriter {
pub(crate) json_path_writer: JsonPathWriter,
pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>,
- term_buffer: Term,
+ term_buffer: IndexingTerm,
schema: Schema,
}

@@ -126,7 +126,7 @@ impl SegmentWriter {
)?,
doc_opstamps: Vec::with_capacity(1_000),
per_field_text_analyzers,
- term_buffer: Term::with_capacity(16),
+ term_buffer: IndexingTerm::new(),
schema,
})
}
@@ -195,7 +195,7 @@ impl SegmentWriter {
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
let postings_writer: &mut dyn PostingsWriter =
self.per_field_postings_writers.get_for_field_mut(field);
- term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
+ term_buffer.clear_with_field(field);

match field_entry.field_type() {
FieldType::Facet(_) => {
@@ -271,8 +271,7 @@

num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
- term_buffer
- .set_u64(date_val.truncate(DATE_TIME_PRECISION_INDEXED).to_u64());
+ term_buffer.set_date(date_val);
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
if field_entry.has_fieldnorms() {
@@ -332,7 +331,7 @@

num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
- term_buffer.set_bytes(bytes);
+ term_buffer.set_value_bytes(bytes);
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
if field_entry.has_fieldnorms() {
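
Taken together, the call sites in this file outline the new IndexingTerm surface: IndexingTerm::new(), clear_with_field(field), set_date(date), set_value_bytes(bytes), plus the append_bytes and truncate_value_bytes calls in json_utils.rs. The sketch below is a hypothetical reconstruction from those call sites, not the PR's actual src/schema/indexing_term.rs; the 4-byte field header and the byte encodings are assumptions, and unlike the old Term::clear_with_field_and_type, no type code byte is written (dropping it appears to be the point of the split).

    // Hypothetical reconstruction of IndexingTerm from its call sites.
    // Field and DateTime are stand-ins; the real encodings may differ.
    pub struct Field(pub u32);
    pub struct DateTime(pub u64); // stand-in; tantivy stores dates as a mapped u64

    const FIELD_HEADER_LEN: usize = 4; // assumed: 4-byte big-endian field id

    pub struct IndexingTerm(Vec<u8>);

    impl IndexingTerm {
        pub fn new() -> Self {
            IndexingTerm(Vec::with_capacity(16))
        }

        /// Resets the buffer so it carries only the field id header.
        /// No type byte is written, unlike Term::clear_with_field_and_type.
        pub fn clear_with_field(&mut self, field: Field) {
            self.0.clear();
            self.0.extend_from_slice(&field.0.to_be_bytes());
        }

        /// Replaces the value bytes that follow the header.
        pub fn set_value_bytes(&mut self, bytes: &[u8]) {
            self.0.truncate(FIELD_HEADER_LEN);
            self.0.extend_from_slice(bytes);
        }

        /// Sets a date value. The old call site truncated to
        /// DATE_TIME_PRECISION_INDEXED before writing; presumably the real
        /// set_date performs that normalization internally.
        pub fn set_date(&mut self, date: DateTime) {
            self.set_value_bytes(&date.0.to_be_bytes());
        }

        /// Appends raw bytes to the value (used by the JSON indexing path).
        pub fn append_bytes(&mut self, bytes: &[u8]) {
            self.0.extend_from_slice(bytes);
        }

        /// Keeps only `len` value bytes; the field header is preserved.
        pub fn truncate_value_bytes(&mut self, len: usize) {
            self.0.truncate(FIELD_HEADER_LEN + len);
        }
    }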
96 changes: 66 additions & 30 deletions src/postings/json_postings_writer.rs
@@ -8,9 +8,10 @@ use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
- use crate::schema::{Field, Type};
+ use crate::schema::indexing_term::IndexingTerm;
+ use crate::schema::{Field, Type, ValueBytes};
use crate::tokenizer::TokenStream;
- use crate::{DocId, Term};
+ use crate::DocId;

/// The `JsonPostingsWriter` is odd in that it relies on a hidden contract:
///
@@ -34,7 +35,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
&mut self,
doc: crate::DocId,
pos: u32,
- term: &crate::Term,
+ term: &IndexingTerm,
ctx: &mut IndexingContext,
) {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx);
@@ -44,7 +45,7 @@
&mut self,
doc_id: DocId,
token_stream: &mut dyn TokenStream,
- term_buffer: &mut Term,
+ term_buffer: &mut IndexingTerm,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
) {
@@ -66,42 +67,40 @@
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
- let mut term_buffer = Term::with_capacity(48);
+ let mut term_buffer = JsonTermSerializer(Vec::with_capacity(48));
let mut buffer_lender = BufferLender::default();
- term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
let mut prev_term_id = u32::MAX;
let mut term_path_len = 0; // this will be set in the first iteration
for (_field, path_id, term, addr) in term_addrs {
if prev_term_id != path_id.path_id() {
- term_buffer.truncate_value_bytes(0);
+ term_buffer.clear();
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
term_buffer.append_bytes(&[JSON_END_OF_PATH]);
- term_path_len = term_buffer.len_bytes();
+ term_path_len = term_buffer.len();
prev_term_id = path_id.path_id();
}
- term_buffer.truncate_value_bytes(term_path_len);
+ term_buffer.truncate(term_path_len);
term_buffer.append_bytes(term);
- if let Some(json_value) = term_buffer.value().as_json_value_bytes() {
- let typ = json_value.typ();
- if typ == Type::Str {
- SpecializedPostingsWriter::<Rec>::serialize_one_term(
- term_buffer.serialized_value_bytes(),
- *addr,
- doc_id_map,
- &mut buffer_lender,
- ctx,
- serializer,
- )?;
- } else {
- SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
- term_buffer.serialized_value_bytes(),
- *addr,
- doc_id_map,
- &mut buffer_lender,
- ctx,
- serializer,
- )?;
- }
+ let json_value = ValueBytes::wrap(term);
+ let typ = json_value.typ();
+ if typ == Type::Str {
+ SpecializedPostingsWriter::<Rec>::serialize_one_term(
+ term_buffer.as_bytes(),
+ *addr,
+ doc_id_map,
+ &mut buffer_lender,
+ ctx,
+ serializer,
+ )?;
+ } else {
+ SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
+ term_buffer.as_bytes(),
+ *addr,
+ doc_id_map,
+ &mut buffer_lender,
+ ctx,
+ serializer,
+ )?;
+ }
}
Ok(())
@@ -111,3 +110,40 @@
self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens()
}
}

+ struct JsonTermSerializer(Vec<u8>);
+ impl JsonTermSerializer {
+ #[inline]
+ pub fn append_path(&mut self, bytes: &[u8]) {
+ if bytes.contains(&0u8) {
+ self.0
+ .extend(bytes.iter().map(|&b| if b == 0 { b'0' } else { b }));
+ } else {
+ self.0.extend_from_slice(bytes);
+ }
+ }
+
+ /// Appends value bytes to the Term.
+ ///
+ /// This function returns the segment that has just been added.
+ #[inline]
+ pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] {
+ let len_before = self.0.len();
+ self.0.extend_from_slice(bytes);
+ &mut self.0[len_before..]
+ }
+
+ fn clear(&mut self) {
+ self.0.clear();
+ }
+ fn truncate(&mut self, len: usize) {
+ self.0.truncate(len);
+ }
+ fn len(&self) -> usize {
+ self.0.len()
+ }
+
+ fn as_bytes(&self) -> &[u8] {
+ &self.0
+ }
+ }
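
For illustration, the snippet below drives this buffer the way the serialization loop above does: write the path plus the end-of-path marker once per path, then rewind to that prefix for each term recorded under it. It assumes the JsonTermSerializer definition above is in scope in the same module, that JSON_END_OF_PATH is 0x00 (which is why append_path rewrites stray 0x00 bytes in path segments to b'0'), and that b's' is the Type::Str code. A sketch, not test code from this PR:

    const JSON_END_OF_PATH: u8 = 0;

    fn main() {
        let mut buf = JsonTermSerializer(Vec::with_capacity(48));

        // New path: emit "color" followed by the end-of-path marker...
        buf.clear();
        buf.append_path(b"color");
        buf.append_bytes(&[JSON_END_OF_PATH]);
        let path_len = buf.len();

        // ...then the recorded term bytes: type code + token.
        buf.append_bytes(b"sred"); // assumed: b's' = Type::Str, token "red"
        assert_eq!(buf.as_bytes(), b"color\x00sred");

        // Terms sharing the path only rewind to the prefix.
        buf.truncate(path_len);
        buf.append_bytes(b"sblue");
        assert_eq!(buf.as_bytes(), b"color\x00sblue");
    }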