Support triple patterns with zero variables (#1395)

So far, triple patterns in a query had to contain at least one variable. However, the SPARQL 1.1 standard also supports triple patterns with no variables, as in SELECT * WHERE { wd:Q42 wdt:P31 wd:Q5 ... }. The semantics are as follows: if the triple exists in the dataset, the triple pattern acts as the neutral element (that is, as if it weren't there), and if it does not exist in the dataset, it acts as the zero element (that is, the result of the whole graph pattern will be empty). This is now implemented; here is an example query: https://qlever.cs.uni-freiburg.de/wikidata/TkjahB . In particular, this fixes #835. As part of this fix, the code is refactored and simplified significantly, in particular: (1) The ScanSpecification class, which so far was used only in the index classes, now has a sibling ScanSpecificationAsTripleComponent, which is now also used in the class for the IndexScan operation. (2) Remove significant amounts of redundant code dating from the time when index scans with one variable were executed (and cached) at query planning time. (3) Remove the associated special case (used in testing) of an index scan without a query execution context.
ad-freiburg · Aug 3, 2024 · 0b9d26f · 0b9d26f
1 parent 8a68213
commit 0b9d26f
Show file tree

Hide file tree

Showing 33 changed files with 521 additions and 308 deletions.
diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp
@@ -154,10 +154,19 @@ ProtoResult CartesianProductJoin::computeResult(
       child.setLimit(limitIfPresent.value());
     }
     subResults.push_back(child.getResult());
+
+    const auto& table = subResults.back()->idTable();
     // Early stopping: If one of the results is empty, we can stop early.
-    if (subResults.back()->idTable().size() == 0) {
+    if (table.empty()) {
       break;
     }
+
+    // If one of the children is the neutral element (because of a triple with
+    // zero variables), we can simply ignore it here.
+    if (table.numRows() == 1 && table.numColumns() == 0) {
+      subResults.pop_back();
+      continue;
+    }
     // Example for the following calculation: If we have a LIMIT of 1000 and
     // the first child already has a result of size 100, then the second child
     // needs to evaluate only its first 10 results. The +1 is because integer
@@ -169,6 +178,10 @@ ProtoResult CartesianProductJoin::computeResult(
     }
   }
 
+  // TODO<joka921> Find a solution to cheaply handle the case, that only a
+  // single result is left. This can probably be done by using the
+  // `ProtoResult`.
+
   auto sizesView = std::views::transform(
       subResults, [](const auto& child) { return child->idTable().size(); });
   auto totalResultSize = std::accumulate(sizesView.begin(), sizesView.end(),

diff --git a/src/engine/CartesianProductJoin.h b/src/engine/CartesianProductJoin.h
@@ -2,9 +2,7 @@
 //                  Chair of Algorithms and Data Structures.
 //  Author: Johannes Kalmbach <[email protected]>
 
-#ifndef QLEVER_CARTESIANPRODUCTJOIN_H
-#define QLEVER_CARTESIANPRODUCTJOIN_H
-
+#pragma once
 #include "engine/Operation.h"
 #include "engine/QueryExecutionTree.h"
 
@@ -92,5 +90,3 @@ class CartesianProductJoin : public Operation {
                          std::span<const Id> inputColumn, size_t groupSize,
                          size_t offset);
 };
-
-#endif  // QLEVER_CARTESIANPRODUCTJOIN_H
diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp
@@ -6,6 +6,7 @@
 
 #include <absl/strings/str_join.h>
 
+#include <boost/optional.hpp>
 #include <sstream>
 #include <string>
 
@@ -26,17 +27,19 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation,
       numVariables_(static_cast<size_t>(subject_.isVariable()) +
                     static_cast<size_t>(predicate_.isVariable()) +
                     static_cast<size_t>(object_.isVariable())) {
+  // We previously had `nullptr`s here in unit tests. This is no longer
+  // necessary nor allowed.
+  AD_CONTRACT_CHECK(qec != nullptr);
   for (auto& [idx, variable] : triple.additionalScanColumns_) {
     additionalColumns_.push_back(idx);
     additionalVariables_.push_back(variable);
   }
   sizeEstimate_ = computeSizeEstimate();
 
-  // Check the following invariant: The permuted input triple must contain at
-  // least one variable, and all the variables must be at the end of the
+  // Check the following invariant: All the variables must be at the end of the
   // permuted triple. For example in the PSO permutation, either only the O, or
-  // the S and O, or all three of P, S, O can be variables, all other
-  // combinations are not supported.
+  // the S and O, or all three of P, S, O, or none of them can be variables, all
+  // other combinations are not supported.
   auto permutedTriple = getPermutedTriple();
   for (size_t i = 0; i < 3 - numVariables_; ++i) {
     AD_CONTRACT_CHECK(!permutedTriple.at(i)->isVariable());
@@ -57,7 +60,7 @@ string IndexScan::getCacheKeyImpl() const {
   auto permutationString = Permutation::toString(permutation_);
 
   if (numVariables_ == 3) {
-    os << "SCAN FOR FULL INDEX " << permutationString << " (DUMMY OPERATION)";
+    os << "SCAN FOR FULL INDEX " << permutationString;
 
   } else {
     os << "SCAN " << permutationString << " with ";
@@ -66,10 +69,9 @@ string IndexScan::getCacheKeyImpl() const {
       const auto& key = getPermutedTriple().at(idx)->toRdfLiteral();
       os << keyString << " = \"" << key << "\"";
     };
-    addKey(0);
-    if (numVariables_ == 1) {
+    for (size_t i = 0; i < 3 - numVariables_; ++i) {
+      addKey(i);
       os << ", ";
-      addKey(1);
     }
   }
   if (!additionalColumns_.empty()) {
@@ -92,16 +94,8 @@ size_t IndexScan::getResultWidth() const {
 
 // _____________________________________________________________________________
 vector<ColumnIndex> IndexScan::resultSortedOn() const {
-  switch (numVariables_) {
-    case 1:
-      return {ColumnIndex{0}};
-    case 2:
-      return {ColumnIndex{0}, ColumnIndex{1}};
-    case 3:
-      return {ColumnIndex{0}, ColumnIndex{1}, ColumnIndex{2}};
-    default:
-      AD_FAIL();
-  }
+  auto resAsView = ad_utility::integerRange(ColumnIndex{numVariables_});
+  return std::vector<ColumnIndex>{resAsView.begin(), resAsView.end()};
 }
 
 // _____________________________________________________________________________
@@ -130,12 +124,8 @@ ProtoResult IndexScan::computeResult([[maybe_unused]] bool requestLaziness) {
   using enum Permutation::Enum;
   idTable.setNumColumns(numVariables_);
   const auto& index = _executionContext->getIndex();
-  const auto permutedTriple = getPermutedTriple();
-  if (numVariables_ == 2) {
-    idTable = index.scan(*permutedTriple[0], std::nullopt, permutation_,
-                         additionalColumns(), cancellationHandle_, getLimit());
-  } else if (numVariables_ == 1) {
-    idTable = index.scan(*permutedTriple[0], *permutedTriple[1], permutation_,
+  if (numVariables_ < 3) {
+    idTable = index.scan(getScanSpecification(), permutation_,
                          additionalColumns(), cancellationHandle_, getLimit());
   } else {
     AD_CORRECTNESS_CHECK(numVariables_ == 3);
@@ -150,44 +140,19 @@ ProtoResult IndexScan::computeResult([[maybe_unused]] bool requestLaziness) {
 
 // _____________________________________________________________________________
 size_t IndexScan::computeSizeEstimate() const {
-  if (_executionContext) {
-    // Should always be in this branch. Else is only for test cases.
-
-    // We have to do a simple scan anyway so might as well do it now
-    if (numVariables_ == 1) {
-      // TODO<C++23> Use the monadic operation `std::optional::or_else`.
-      // Note: we cannot use `optional::value_or()` here, because the else
-      // case is expensive to compute, and we need it lazily evaluated.
-      if (auto size = getExecutionContext()->getQueryTreeCache().getPinnedSize(
-              getCacheKey());
-          size.has_value()) {
-        return size.value();
-      } else {
-        // This call explicitly has to read two blocks of triples from memory to
-        // obtain an exact size estimate.
-        return getIndex().getResultSizeOfScan(
-            *getPermutedTriple()[0], *getPermutedTriple().at(1), permutation_);
-      }
-    } else if (numVariables_ == 2) {
-      const TripleComponent& firstKey = *getPermutedTriple().at(0);
-      return getIndex().getCardinality(firstKey, permutation_);
-    } else {
-      // The triple consists of three variables.
-      // TODO<joka921> As soon as all implementations of a full index scan
-      // (Including the "dummy joins" in Join.cpp) consistently exclude the
-      // internal triples, this estimate should be changed to only return
-      // the number of triples in the actual knowledge graph (excluding the
-      // internal triples).
-      AD_CORRECTNESS_CHECK(numVariables_ == 3);
-      return getIndex().numTriples().normalAndInternal_();
-    }
+  AD_CORRECTNESS_CHECK(_executionContext);
+  // We have to do a simple scan anyway so might as well do it now
+  if (numVariables_ < 3) {
+    return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_);
   } else {
-    // Only for test cases. The handling of the objects is to make the
-    // strange query planner tests pass.
-    auto strLen = [](const auto& el) {
-      return (el.isString() ? el.getString() : el.toString()).size();
-    };
-    return 1000 + strLen(subject_) + strLen(object_) + strLen(predicate_);
+    // The triple consists of three variables.
+    // TODO<joka921> As soon as all implementations of a full index scan
+    // (Including the "dummy joins" in Join.cpp) consistently exclude the
+    // internal triples, this estimate should be changed to only return
+    // the number of triples in the actual knowledge graph (excluding the
+    // internal triples).
+    AD_CORRECTNESS_CHECK(numVariables_ == 3);
+    return getIndex().numTriples().normalAndInternal_();
   }
 }
 
@@ -200,29 +165,20 @@ size_t IndexScan::getCostEstimate() {
 
 // _____________________________________________________________________________
 void IndexScan::determineMultiplicities() {
-  multiplicity_.clear();
-  if (_executionContext) {
+  multiplicity_ = [this]() -> std::vector<float> {
     const auto& idx = getIndex();
-    if (numVariables_ == 1) {
+    if (numVariables_ == 0) {
+      return {};
+    } else if (numVariables_ == 1) {
       // There are no duplicate triples in RDF and two elements are fixed.
-      multiplicity_.emplace_back(1);
+      return {1.0f};
     } else if (numVariables_ == 2) {
-      const auto permutedTriple = getPermutedTriple();
-      multiplicity_ = idx.getMultiplicities(*permutedTriple[0], permutation_);
+      return idx.getMultiplicities(*getPermutedTriple()[0], permutation_);
     } else {
       AD_CORRECTNESS_CHECK(numVariables_ == 3);
-      multiplicity_ = idx.getMultiplicities(permutation_);
-    }
-  } else {
-    // This branch is only used in certain unit tests.
-    multiplicity_.emplace_back(1);
-    if (numVariables_ == 2) {
-      multiplicity_.emplace_back(1);
-    }
-    if (numVariables_ == 3) {
-      multiplicity_.emplace_back(1);
+      return idx.getMultiplicities(permutation_);
     }
-  }
+  }();
   for ([[maybe_unused]] size_t i :
        std::views::iota(multiplicity_.size(), getResultWidth())) {
     multiplicity_.emplace_back(1);
@@ -277,6 +233,12 @@ std::array<const TripleComponent* const, 3> IndexScan::getPermutedTriple()
           triple[permutation[2]]};
 }
 
+// ___________________________________________________________________________
+ScanSpecificationAsTripleComponent IndexScan::getScanSpecification() const {
+  auto permutedTriple = getPermutedTriple();
+  return {*permutedTriple[0], *permutedTriple[1], *permutedTriple[2]};
+}
+
 // ___________________________________________________________________________
 Permutation::IdTableGenerator IndexScan::getLazyScan(
     const IndexScan& s, std::vector<CompressedBlockMetadata> blocks) {
@@ -290,6 +252,10 @@ Permutation::IdTableGenerator IndexScan::getLazyScan(
     col1Id = s.getPermutedTriple()[1]->toValueId(index.getVocab()).value();
   }
 
+  // This function is currently only called by the `getLazyScanForJoin...`
+  // functions. In these cases we always have at least one variable in each of
+  // the scans, because otherwise there would be no join column.
+  AD_CORRECTNESS_CHECK(s.numVariables_ >= 1);
   // If there is a LIMIT or OFFSET clause that constrains the scan
   // (which can happen with an explicit subquery), we cannot use the prefiltered
   // blocks, as we currently have no mechanism to include limits and offsets
@@ -306,28 +272,20 @@ Permutation::IdTableGenerator IndexScan::getLazyScan(
 // ________________________________________________________________
 std::optional<Permutation::MetadataAndBlocks> IndexScan::getMetadataForScan(
     const IndexScan& s) {
-  auto permutedTriple = s.getPermutedTriple();
-  const IndexImpl& index = s.getIndex().getImpl();
-  auto numVars = s.numVariables_;
-  std::optional<Id> col0Id =
-      numVars == 3 ? std::nullopt
-                   : permutedTriple[0]->toValueId(index.getVocab());
-  std::optional<Id> col1Id =
-      numVars >= 2 ? std::nullopt
-                   : permutedTriple[1]->toValueId(index.getVocab());
-  if ((!col0Id.has_value() && numVars < 3) ||
-      (!col1Id.has_value() && numVars < 2)) {
+  const auto& index = s.getExecutionContext()->getIndex().getImpl();
+  auto scanSpec = s.getScanSpecification().toScanSpecification(index);
+  if (!scanSpec.has_value()) {
     return std::nullopt;
   }
-
   return index.getPermutation(s.permutation())
-      .getMetadataAndBlocks({col0Id, col1Id, std::nullopt});
+      .getMetadataAndBlocks(scanSpec.value());
 };
 
 // ________________________________________________________________
 std::array<Permutation::IdTableGenerator, 2>
 IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) {
   AD_CONTRACT_CHECK(s1.numVariables_ <= 3 && s2.numVariables_ <= 3);
+  AD_CONTRACT_CHECK(s1.numVariables_ >= 1 && s2.numVariables_ >= 1);
 
   // This function only works for single column joins. This means that the first
   // variable of both scans must be equal, but all other variables of the scans
@@ -376,7 +334,7 @@ IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) {
 Permutation::IdTableGenerator IndexScan::lazyScanForJoinOfColumnWithScan(
     std::span<const Id> joinColumn, const IndexScan& s) {
   AD_EXPENSIVE_CHECK(std::ranges::is_sorted(joinColumn));
-  AD_CORRECTNESS_CHECK(s.numVariables_ <= 3);
+  AD_CORRECTNESS_CHECK(s.numVariables_ <= 3 && s.numVariables_ > 0);
 
   auto metaBlocks1 = getMetadataForScan(s);
 

diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h
@@ -100,6 +100,7 @@ class IndexScan final : public Operation {
   // `permutation_`. For example if `permutation_ == PSO` then the result is
   // {&predicate_, &subject_, &object_}
   std::array<const TripleComponent* const, 3> getPermutedTriple() const;
+  ScanSpecificationAsTripleComponent getScanSpecification() const;
 
  private:
   ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override;

diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp
@@ -88,25 +88,6 @@ std::shared_ptr<const Result> Operation::getResult(
   const bool pinResult =
       _executionContext->_pinSubtrees || pinFinalResultButNotSubtrees;
 
-  // When we pin the final result but no subtrees, we need to remember the sizes
-  // of all involved index scans that have only one free variable. Note that
-  // these index scans are executed already during query planning because they
-  // have to be executed anyway, for any query plan. If we don't remember these
-  // sizes here, future queries that take the result from the cache would redo
-  // these index scans. Note that we do not need to remember the multiplicity
-  // (and distinctness) because the multiplicity for an index scan with a single
-  // free variable is always 1.
-  if (pinFinalResultButNotSubtrees) {
-    auto lock =
-        getExecutionContext()->getQueryTreeCache().pinnedSizes().wlock();
-    forAllDescendants([&lock](QueryExecutionTree* child) {
-      if (child->getRootOperation()->isIndexScanWithNumVariables(1)) {
-        (*lock)[child->getRootOperation()->getCacheKey()] =
-            child->getSizeEstimate();
-      }
-    });
-  }
-
   try {
     // In case of an exception, create the correct runtime info, no matter which
     // exception handler is called.
@@ -270,7 +251,7 @@ void Operation::updateRuntimeInformationOnSuccess(
 
 // ____________________________________________________________________________________________________________________
 void Operation::updateRuntimeInformationOnSuccess(
-    const ConcurrentLruCache ::ResultAndCacheStatus& resultAndCacheStatus,
+    const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus,
     Milliseconds duration) {
   updateRuntimeInformationOnSuccess(
       *resultAndCacheStatus._resultPointer->resultTable(),

diff --git a/src/engine/Operation.h b/src/engine/Operation.h
@@ -263,7 +263,7 @@ class Operation {
   // Create and store the complete runtime information for this operation after
   // it has either been successfully computed or read from the cache.
   virtual void updateRuntimeInformationOnSuccess(
-      const ConcurrentLruCache::ResultAndCacheStatus& resultAndCacheStatus,
+      const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus,
       Milliseconds duration) final;
 
   // Similar to the function above, but the components are specified manually.

diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h
@@ -53,37 +53,8 @@ class CacheValue {
 // Threadsafe LRU cache for (partial) query results, that
 // checks on insertion, if the result is currently being computed
 // by another query.
-using ConcurrentLruCache = ad_utility::ConcurrentCache<
+using QueryResultCache = ad_utility::ConcurrentCache<
     ad_utility::LRUCache<string, CacheValue, CacheValue::SizeGetter>>;
-using PinnedSizes =
-    ad_utility::Synchronized<ad_utility::HashMap<std::string, size_t>,
-                             std::shared_mutex>;
-class QueryResultCache : public ConcurrentLruCache {
- private:
-  PinnedSizes _pinnedSizes;
-
- public:
-  virtual ~QueryResultCache() = default;
-  void clearAll() override {
-    // The _pinnedSizes are not part of the (otherwise threadsafe) _cache
-    // and thus have to be manually locked.
-    auto lock = _pinnedSizes.wlock();
-    ConcurrentLruCache::clearAll();
-    lock->clear();
-  }
-  // Inherit the constructor.
-  using ConcurrentLruCache::ConcurrentLruCache;
-  const PinnedSizes& pinnedSizes() const { return _pinnedSizes; }
-  PinnedSizes& pinnedSizes() { return _pinnedSizes; }
-  std::optional<size_t> getPinnedSize(const std::string& key) {
-    auto rlock = _pinnedSizes.rlock();
-    if (rlock->contains(key)) {
-      return rlock->at(key);
-    } else {
-      return std::nullopt;
-    }
-  }
-};
 
 // Execution context for queries.
 // Holds references to index and engine, implements caching.