diff --git a/CMakeLists.txt b/CMakeLists.txt index 86c09bd60..3733a2249 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,9 @@ endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +if(NOT CMAKE_CXX_STANDARD EQUAL 17) + add_compile_definitions(PISA_ENABLE_CONCEPTS=1) +endif() option(PISA_BUILD_TOOLS "Build command line tools." ON) option(PISA_ENABLE_TESTING "Enable testing of the library." ON) diff --git a/include/pisa/accumulator/README.md b/include/pisa/accumulator/README.md new file mode 100644 index 000000000..af6c0de91 --- /dev/null +++ b/include/pisa/accumulator/README.md @@ -0,0 +1,12 @@ +# Score Accumulators + +Score accumulators are used to accumulate (and later aggregate) document +scores. These are handy for term-at-a-time (TAAT) query processing. Two +implementations are available: `SimpleAccumulator` and +`LazyAccumulator`. They both satisfy the `PartialScoreAccumulator` +concept (when compiled in C++20 mode). For the definition, see +`partial_score_accumulator.hpp`. + +`SimpleAccumulator` is a simple wrapper over a `std::vector`, +while `LazyAccumulator` implements some optimizations as described in +`lazy_accumulator.hpp`. diff --git a/include/pisa/accumulator/lazy_accumulator.hpp b/include/pisa/accumulator/lazy_accumulator.hpp index 5dc9dfdcc..8bd4ec66a 100644 --- a/include/pisa/accumulator/lazy_accumulator.hpp +++ b/include/pisa/accumulator/lazy_accumulator.hpp @@ -5,12 +5,22 @@ #include #include +#include "concepts.hpp" +#include "partial_score_accumulator.hpp" #include "topk_queue.hpp" namespace pisa { +/** + * Lazy accumulator fully resets the entire array only on every (1 << counter_bit_size)-th call to `reset()`. + * For example, if `counter_bit_size = 3`, then all values are set to 0 every 8th reset. To allow + * for that, the array is partitioned into blocks, each of which has a number of accumulators and + * a descriptor that encodes when the block was last in use. 
If it was used before + * the current query (according to a counter that is reset each cycle), the block is wiped out + * before accumulating another score. + */ template -struct Lazy_Accumulator { +class LazyAccumulator { using reference = float&; static_assert( @@ -46,11 +56,14 @@ struct Lazy_Accumulator { } }; - explicit Lazy_Accumulator(std::size_t size) + public: + explicit LazyAccumulator(std::size_t size) : m_size(size), m_accumulators((size + counters_in_descriptor - 1) / counters_in_descriptor) - {} + { + PISA_ASSERT_CONCEPT(PartialScoreAccumulator); + } - void init() + void reset() { if (m_counter == 0) { auto first = reinterpret_cast(&m_accumulators.front()); @@ -60,7 +73,7 @@ struct Lazy_Accumulator { } } - void accumulate(std::ptrdiff_t const document, float score) + void accumulate(std::size_t document, float score) { auto const block = document / counters_in_descriptor; auto const pos_in_block = document % counters_in_descriptor; @@ -70,7 +83,7 @@ struct Lazy_Accumulator { m_accumulators[block].accumulators[pos_in_block] += score; } - void aggregate(topk_queue& topk) + void collect(topk_queue& topk) { uint64_t docid = 0U; for (auto const& block: m_accumulators) { @@ -86,8 +99,6 @@ struct Lazy_Accumulator { } [[nodiscard]] auto size() const noexcept -> std::size_t { return m_size; } - [[nodiscard]] auto blocks() noexcept -> std::vector& { return m_accumulators; } - [[nodiscard]] auto counter() const noexcept -> int { return m_counter; } private: std::size_t m_size; diff --git a/include/pisa/accumulator/partial_score_accumulator.hpp b/include/pisa/accumulator/partial_score_accumulator.hpp new file mode 100644 index 000000000..1421e2c92 --- /dev/null +++ b/include/pisa/accumulator/partial_score_accumulator.hpp @@ -0,0 +1,60 @@ +// Copyright 2023 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// clang-format off + +#pragma once + +#ifdef PISA_ENABLE_CONCEPTS + +#include +#include +#include +#include + +#include "topk_queue.hpp" + +namespace pisa { + +/** + * Accumulator capable of accumulating partial scores. One document can be accumulated multiple + * times, and the scores will be summed. Typically used for term-at-a-time (TAAT) processing. + */ +template +concept PartialScoreAccumulator = requires(T a, std::uint32_t docid, float score) +{ + /** + * Resets the accumulator. After a reset, it is ready to be used for the next query. + */ + a.reset(); + + /** + * Accumulates a partial score for the given document. + */ + a.accumulate(docid, score); +} +&& requires(T const a, float score, pisa::topk_queue& topk) +{ + /** + * Pushes results to the top-k priority queue. + */ + a.collect(topk); + { a.size() } -> std::same_as; +}; + +}; // namespace pisa + +// clang-format on + +#endif diff --git a/include/pisa/accumulator/simple_accumulator.hpp b/include/pisa/accumulator/simple_accumulator.hpp index 8142fb4c6..604e11ceb 100644 --- a/include/pisa/accumulator/simple_accumulator.hpp +++ b/include/pisa/accumulator/simple_accumulator.hpp @@ -1,19 +1,49 @@ +/* Copyright 2023 PISA developers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once + #include #include +#include #include +#include "concepts.hpp" +#include "partial_score_accumulator.hpp" #include "topk_queue.hpp" namespace pisa { -struct Simple_Accumulator: public std::vector { - explicit Simple_Accumulator(std::ptrdiff_t size) : std::vector(size) {} - void init() { std::fill(begin(), end(), 0.0); } - void accumulate(uint32_t doc, float score) { operator[](doc) += score; } - void aggregate(topk_queue& topk) +/** + * Simple accumulator is an array of scores, where element n is the score of the n-th document. + * Each reset sets all values to 0, and accumulating is done by simply adding the given score to + * the score in the accumulator. + */ +class SimpleAccumulator: public std::vector { + public: + explicit SimpleAccumulator(std::size_t size) : std::vector(size) + { + PISA_ASSERT_CONCEPT(PartialScoreAccumulator); + } + + void reset() { std::fill(begin(), end(), 0.0); } + + void accumulate(std::uint32_t doc, float score) { operator[](doc) += score; } + + void collect(topk_queue& topk) { - uint64_t docid = 0U; + std::uint32_t docid = 0U; std::for_each(begin(), end(), [&](auto score) { if (topk.would_enter(score)) { topk.insert(score, docid); diff --git a/include/pisa/concepts.hpp b/include/pisa/concepts.hpp new file mode 100644 index 000000000..0a7b4b922 --- /dev/null +++ b/include/pisa/concepts.hpp @@ -0,0 +1,31 @@ +/* Copyright 2023 PISA developers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PISA_ENABLE_CONCEPTS + +#define PISA_REQUIRES(x) \ + requires (x) + +#define PISA_ASSERT_CONCEPT(x) \ + static_assert(x) + +#else + +#define PISA_REQUIRES(x) /**/ + +#define PISA_ASSERT_CONCEPT(x) /**/ + +#endif diff --git a/include/pisa/query/algorithm/range_taat_query.hpp b/include/pisa/query/algorithm/range_taat_query.hpp index fbda2e380..0ba7306b7 100644 --- a/include/pisa/query/algorithm/range_taat_query.hpp +++ b/include/pisa/query/algorithm/range_taat_query.hpp @@ -1,5 +1,7 @@ #pragma once +#include "accumulator/partial_score_accumulator.hpp" +#include "concepts.hpp" #include "query/queries.hpp" #include "topk_queue.hpp" @@ -10,13 +12,14 @@ struct range_taat_query { explicit range_taat_query(topk_queue& topk) : m_topk(topk) {} template + PISA_REQUIRES(PartialScoreAccumulator) void operator()(CursorRange&& cursors, uint64_t max_docid, size_t range_size, Acc&& accumulator) { if (cursors.empty()) { return; } - accumulator.init(); + accumulator.reset(); for (size_t end = range_size; end + range_size < max_docid; end += range_size) { process_range(cursors, end, accumulator); diff --git a/include/pisa/query/algorithm/ranked_or_taat_query.hpp b/include/pisa/query/algorithm/ranked_or_taat_query.hpp index 56516f0ef..d42146174 100644 --- a/include/pisa/query/algorithm/ranked_or_taat_query.hpp +++ b/include/pisa/query/algorithm/ranked_or_taat_query.hpp @@ -1,11 +1,8 @@ #pragma once +#include "accumulator/partial_score_accumulator.hpp" +#include "concepts.hpp" #include "query/queries.hpp" -#include "topk_queue.hpp" -#include "util/intrinsics.hpp" - 
-#include "accumulator/simple_accumulator.hpp" - #include "topk_queue.hpp" namespace pisa { @@ -15,12 +12,13 @@ class ranked_or_taat_query { explicit ranked_or_taat_query(topk_queue& topk) : m_topk(topk) {} template + PISA_REQUIRES(PartialScoreAccumulator) void operator()(CursorRange&& cursors, uint64_t max_docid, Acc&& accumulator) { if (cursors.empty()) { return; } - accumulator.init(); + accumulator.reset(); for (auto&& cursor: cursors) { while (cursor.docid() < max_docid) { @@ -28,7 +26,7 @@ class ranked_or_taat_query { cursor.next(); } } - accumulator.aggregate(m_topk); + accumulator.collect(m_topk); } std::vector const& topk() const { return m_topk.topk(); } diff --git a/test/test_ranked_queries.cpp b/test/test_ranked_queries.cpp index 58a6759b8..a99e633e6 100644 --- a/test/test_ranked_queries.cpp +++ b/test/test_ranked_queries.cpp @@ -4,6 +4,7 @@ #include #include "accumulator/lazy_accumulator.hpp" +#include "accumulator/simple_accumulator.hpp" #include "cursor/block_max_scored_cursor.hpp" #include "cursor/max_scored_cursor.hpp" #include "cursor/scored_cursor.hpp" @@ -99,14 +100,14 @@ class range_query_128: public range_query { TEMPLATE_TEST_CASE( "Ranked query test", "[query][ranked][integration]", - ranked_or_taat_query_acc, - ranked_or_taat_query_acc>, + ranked_or_taat_query_acc, + ranked_or_taat_query_acc>, wand_query, maxscore_query, block_max_wand_query, block_max_maxscore_query, - range_query_128>, - range_query_128>>, + range_query_128>, + range_query_128>>, range_query_128, range_query_128, range_query_128, diff --git a/tools/evaluate_queries.cpp b/tools/evaluate_queries.cpp index 12f98c1a2..b703f8d3d 100644 --- a/tools/evaluate_queries.cpp +++ b/tools/evaluate_queries.cpp @@ -15,6 +15,7 @@ #include #include "accumulator/lazy_accumulator.hpp" +#include "accumulator/simple_accumulator.hpp" #include "app.hpp" #include "cursor/block_max_scored_cursor.hpp" #include "cursor/max_scored_cursor.hpp" @@ -115,7 +116,7 @@ void evaluate_queries( return 
topk.topk(); }; } else if (query_type == "ranked_or_taat") { - query_fun = [&, accumulator = Simple_Accumulator(index.num_docs())](Query query) mutable { + query_fun = [&, accumulator = SimpleAccumulator(index.num_docs())](Query query) mutable { topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); ranked_or_taat_q( @@ -124,7 +125,7 @@ void evaluate_queries( return topk.topk(); }; } else if (query_type == "ranked_or_taat_lazy") { - query_fun = [&, accumulator = Lazy_Accumulator<4>(index.num_docs())](Query query) mutable { + query_fun = [&, accumulator = LazyAccumulator<4>(index.num_docs())](Query query) mutable { topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); ranked_or_taat_q( diff --git a/tools/queries.cpp b/tools/queries.cpp index 8eb8d545c..c9bcc3e95 100644 --- a/tools/queries.cpp +++ b/tools/queries.cpp @@ -13,6 +13,7 @@ #include #include "accumulator/lazy_accumulator.hpp" +#include "accumulator/simple_accumulator.hpp" #include "app.hpp" #include "cursor/block_max_scored_cursor.hpp" #include "cursor/cursor.hpp" @@ -257,7 +258,7 @@ void perftest( return topk.topk().size(); }; } else if (t == "ranked_or_taat" && wand_data_filename) { - Simple_Accumulator accumulator(index.num_docs()); + SimpleAccumulator accumulator(index.num_docs()); topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); query_fun = [&, ranked_or_taat_q, accumulator](Query query, Score threshold) mutable { @@ -270,7 +271,7 @@ void perftest( return topk.topk().size(); }; } else if (t == "ranked_or_taat_lazy" && wand_data_filename) { - Lazy_Accumulator<4> accumulator(index.num_docs()); + LazyAccumulator<4> accumulator(index.num_docs()); topk_queue topk(k); ranked_or_taat_query ranked_or_taat_q(topk); query_fun = [&, ranked_or_taat_q, accumulator](Query query, Score threshold) mutable {