From 705b50fffdb7d9c1255708174ede5c1e87772da4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 6 Jun 2024 16:14:10 +0900 Subject: [PATCH] low hanging fruit in optimization --- .../phrase_prefix_scorer.rs | 40 ++++++++++---- .../phrase_prefix_weight.rs | 53 ++++++++++++++++--- src/query/phrase_query/mod.rs | 2 +- src/query/phrase_query/phrase_scorer.rs | 2 +- 4 files changed, 77 insertions(+), 20 deletions(-) diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index 4e76d49299..ed3ebf3bbe 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; -use crate::query::phrase_query::{intersection_count, PhraseScorer}; +use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer}; use crate::query::Scorer; use crate::{DocId, Score}; @@ -92,14 +92,17 @@ impl Scorer for PhraseKind { } } -pub struct PhrasePrefixScorer { +pub struct PhrasePrefixScorer { phrase_scorer: PhraseKind, suffixes: Vec, suffix_offset: u32, phrase_count: u32, + suffix_position_buffer: Vec, } -impl PhrasePrefixScorer { +impl + PhrasePrefixScorer +{ // If similarity_weight is None, then scoring is disabled. pub fn new( mut term_postings: Vec<(usize, TPostings)>, @@ -107,7 +110,7 @@ impl PhrasePrefixScorer { fieldnorm_reader: FieldNormReader, suffixes: Vec, suffix_pos: usize, - ) -> PhrasePrefixScorer { + ) -> PhrasePrefixScorer { // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about let max_offset = term_postings .iter() @@ -140,6 +143,7 @@ impl PhrasePrefixScorer { suffixes, suffix_offset: (max_offset - suffix_pos) as u32, phrase_count: 0, + suffix_position_buffer: Vec::with_capacity(100), }; if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() { phrase_prefix_scorer.advance(); @@ -153,7 +157,6 @@ impl PhrasePrefixScorer { fn matches_prefix(&mut self) -> bool { let mut count = 0; - let mut positions = Vec::new(); let current_doc = self.doc(); let pos_matching = self.phrase_scorer.get_intersection(); for suffix in &mut self.suffixes { @@ -162,16 +165,27 @@ impl PhrasePrefixScorer { } let doc = suffix.seek(current_doc); if doc == current_doc { - suffix.positions_with_offset(self.suffix_offset, &mut positions); - count += intersection_count(pos_matching, &positions); + suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer); + if SCORING_ENABLED { + count += intersection_count(pos_matching, &self.suffix_position_buffer); + } else { + if intersection_exists(pos_matching, &self.suffix_position_buffer) { + return true; + } + } } } + if !SCORING_ENABLED { + return false; + } self.phrase_count = count as u32; count != 0 } } -impl DocSet for PhrasePrefixScorer { +impl DocSet + for PhrasePrefixScorer +{ fn advance(&mut self) -> DocId { loop { let doc = self.phrase_scorer.advance(); @@ -198,9 +212,15 @@ impl DocSet for PhrasePrefixScorer { } } -impl Scorer for PhrasePrefixScorer { +impl Scorer + for PhrasePrefixScorer +{ fn score(&mut self) -> Score { + if SCORING_ENABLED { + self.phrase_scorer.score() + } else { + 1.0f32 + } // TODO modify score?? - self.phrase_scorer.score() } } diff --git a/src/query/phrase_prefix_query/phrase_prefix_weight.rs b/src/query/phrase_prefix_query/phrase_prefix_weight.rs index 866c3c2c5a..47f7db1ee7 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_weight.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs @@ -42,11 +42,11 @@ impl PhrasePrefixWeight { Ok(FieldNormReader::constant(reader.max_doc(), 1)) } - pub(crate) fn phrase_scorer( + pub(crate) fn phrase_prefix_scorer( &self, reader: &SegmentReader, boost: Score, - ) -> crate::Result>> { + ) -> crate::Result>> { let similarity_weight_opt = self .similarity_weight_opt .as_ref() @@ -128,15 +128,20 @@ impl PhrasePrefixWeight { impl Weight for PhrasePrefixWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { - if let Some(scorer) = self.phrase_scorer(reader, boost)? { - Ok(Box::new(scorer)) + if self.similarity_weight_opt.is_some() { + if let Some(scorer) = self.phrase_prefix_scorer::(reader, boost)? { + return Ok(Box::new(scorer)); + } } else { - Ok(Box::new(EmptyScorer)) + if let Some(scorer) = self.phrase_prefix_scorer::(reader, boost)? { + return Ok(Box::new(scorer)); + } } + Ok(Box::new(EmptyScorer)) } fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result { - let scorer_opt = self.phrase_scorer(reader, 1.0)?; + let scorer_opt = self.phrase_prefix_scorer::(reader, 1.0)?; if scorer_opt.is_none() { return Err(does_not_match(doc)); } @@ -200,7 +205,7 @@ mod tests { .unwrap() .unwrap(); let mut phrase_scorer = phrase_weight - .phrase_scorer(searcher.segment_reader(0u32), 1.0)? + .phrase_prefix_scorer::(searcher.segment_reader(0u32), 1.0)? .unwrap(); assert_eq!(phrase_scorer.doc(), 1); assert_eq!(phrase_scorer.phrase_count(), 2); @@ -211,6 +216,38 @@ mod tests { Ok(()) } + #[test] + pub fn test_phrase_no_count() -> crate::Result<()> { + let index = create_index(&[ + "aa bb dd cc", + "aa aa bb c dd aa bb cc aa bb dc", + " aa bb cd", + ])?; + let schema = index.schema(); + let text_field = schema.get_field("text").unwrap(); + let searcher = index.reader()?.searcher(); + let phrase_query = PhrasePrefixQuery::new(vec![ + Term::from_field_text(text_field, "aa"), + Term::from_field_text(text_field, "bb"), + Term::from_field_text(text_field, "c"), + ]); + let enable_scoring = EnableScoring::enabled_from_searcher(&searcher); + let phrase_weight = phrase_query + .phrase_prefix_query_weight(enable_scoring) + .unwrap() + .unwrap(); + let mut phrase_scorer = phrase_weight + .phrase_prefix_scorer::(searcher.segment_reader(0u32), 1.0)? + .unwrap(); + assert_eq!(phrase_scorer.doc(), 1); + assert_eq!(phrase_scorer.phrase_count(), 1); + assert_eq!(phrase_scorer.advance(), 2); + assert_eq!(phrase_scorer.doc(), 2); + assert_eq!(phrase_scorer.phrase_count(), 1); + assert_eq!(phrase_scorer.advance(), TERMINATED); + Ok(()) + } + #[test] pub fn test_phrase_count_mid() -> crate::Result<()> { let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?; @@ -227,7 +264,7 @@ mod tests { .unwrap() .unwrap(); let mut phrase_scorer = phrase_weight - .phrase_scorer(searcher.segment_reader(0u32), 1.0)? + .phrase_prefix_scorer::(searcher.segment_reader(0u32), 1.0)? .unwrap(); assert_eq!(phrase_scorer.doc(), 1); assert_eq!(phrase_scorer.phrase_count(), 2); diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 7b8d3e0074..89b22cef1a 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -3,8 +3,8 @@ mod phrase_scorer; mod phrase_weight; pub use self::phrase_query::PhraseQuery; -pub(crate) use self::phrase_scorer::intersection_count; pub use self::phrase_scorer::PhraseScorer; +pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists}; pub use self::phrase_weight::PhraseWeight; #[cfg(test)] diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 147aef29b3..a8c4e4babd 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -58,7 +58,7 @@ pub struct PhraseScorer { } /// Returns true if and only if the two sorted arrays contain a common element -fn intersection_exists(left: &[u32], right: &[u32]) -> bool { +pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool { let mut left_index = 0; let mut right_index = 0; while left_index < left.len() && right_index < right.len() {