diff --git a/CHANGES.rst b/CHANGES.rst index e82b0cd5..bde692b9 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,7 +4,10 @@ Changelog 4.5 (unreleased) ---------------- -- Nothing changed yet. +- Refactor ``Products.PluginIndexes.unindex.Unindex.query_index`` + (`#56 `_) + and fix + (`#55 `_). 4.4 (2019-03-08) diff --git a/src/Products/PluginIndexes/FieldIndex/FieldIndex.py b/src/Products/PluginIndexes/FieldIndex/FieldIndex.py index 01bc835e..011472a0 100644 --- a/src/Products/PluginIndexes/FieldIndex/FieldIndex.py +++ b/src/Products/PluginIndexes/FieldIndex/FieldIndex.py @@ -21,6 +21,7 @@ class FieldIndex(UnIndex): """ meta_type = 'FieldIndex' query_options = ('query', 'range', 'not') + potentially_multivalued = False # optimization for exclude terms manage_options = ( {'label': 'Settings', 'action': 'manage_main'}, diff --git a/src/Products/PluginIndexes/interfaces.py b/src/Products/PluginIndexes/interfaces.py index 0df5d9cd..dbff414b 100644 --- a/src/Products/PluginIndexes/interfaces.py +++ b/src/Products/PluginIndexes/interfaces.py @@ -90,9 +90,13 @@ def clear(): class ILimitedResultIndex(IPluggableIndex): def _apply_index(request, resultset=None): - """Same as IPluggableIndex' _apply_index method. The additional - resultset argument contains the resultset, as already calculated by - ZCatalog's search method. + """Same as IPluggableIndex' _apply_index method. + + The additional *resultset* argument contains the resultset, + as already calculated by ZCatalog's search method. + If it is not `None` and `_apply_index` does not return + `None`, then the preliminary result must be intersected + with *resultset*. """ @@ -103,9 +107,16 @@ class IQueryIndex(IPluggableIndex): useOperator = Attribute('A string specifying the default operator.') query_options = Attribute('Supported query options for the index.') - def query_index(record, resultset=None): - """Same as _apply_index, but the query is already a pre-parsed - IndexQuery object. + def query_index(query, resultset=None): + """return the result of searching for *query*. + + *query* is a `Products.ZCatalog.query.IndexQuery` and + describes what is searched for. + + The return value is an `IISet` or `IITreeSet`. + + If *resultset* is not `None`, then the preliminary result + must be intersected with *resultset*. """ diff --git a/src/Products/PluginIndexes/tests/test_unindex.py b/src/Products/PluginIndexes/tests/test_unindex.py index 383306e3..16b0e949 100644 --- a/src/Products/PluginIndexes/tests/test_unindex.py +++ b/src/Products/PluginIndexes/tests/test_unindex.py @@ -13,7 +13,7 @@ import unittest -from BTrees.IIBTree import difference +from BTrees.IIBTree import difference, IISet from OFS.SimpleItem import SimpleItem from Testing.makerequest import makerequest @@ -193,3 +193,54 @@ class Dummy(object): query = IndexQuery({'counter': 42}, 'counter') res = index.query_index(query) self.assertListEqual(list(res), []) + + def test_resultset_intersection(self): + """see #55 + `https://github.com/zopefoundation/Products.ZCatalog/issues/55`. + """ + index = self._makeOne("rsi") + for i in (1, 2): + index.insertForwardIndexEntry(i, i) + ers = IISet() + for q in ((1,), (1, 2)): + qd = dict(rsi=dict(query=q)) + self.assertEqual(q, tuple(index._apply_index(qd)[0])) + self.assertEqual((), tuple(index._apply_index(qd, ers)[0])) + + def test_not(self): + index = self._makeOne("idx") + index.query_options = "not", "operator" # activate `not`, `operator` + apply = index._apply_index + for i, vals in enumerate(((10, 11, 12), (11, 12, 13))): + for v in vals: + index.insertForwardIndexEntry(v, i) + query = {"query": (10, 11), "not": (10,)} + req = dict(idx=query) + # or(10,11), not(10) + self.assertEqual((1, ), tuple(apply(req)[0]), "or(10,11), not(10)") + # and(10, 11), not(10) + query["operator"] = "and" + self.assertEqual((), tuple(apply(req)[0]), "and(10, 11), not(10)") + # 11, not 10 + query["query"] = 11 + self.assertEqual((1,), tuple(apply(req)[0]), "11, not(10)") + + def test_range(self): + index = self._makeOne("idx") + index.query_options = "range", "usage" # activate `range`, `usage` + apply = index._apply_index + docs = tuple(range(10)) + for i in docs: + index.insertForwardIndexEntry(i, i) + ranges = (9, None), (None, 1), (5, 6), (None, None), + for op in ("range", "usage"): + for r in ranges: + spec = (["range"] if op == "usage" else []) \ + + (["min"] if r[0] is not None else []) \ + + (["max"] if r[1] is not None else []) + query = {"query": [v for v in r if v is not None], + op: ":".join(spec)} + self.assertEqual( + docs[r[0]: (r[1] + 1 if r[1] is not None else None)], + tuple(apply(dict(idx=query))[0]), + "%s: %s" % (op, r)) diff --git a/src/Products/PluginIndexes/unindex.py b/src/Products/PluginIndexes/unindex.py index c00fcfdc..e77dd09c 100644 --- a/src/Products/PluginIndexes/unindex.py +++ b/src/Products/PluginIndexes/unindex.py @@ -60,6 +60,12 @@ class UnIndex(SimpleItem): useOperator = 'or' query_options = () + # Set to `False` in derived indexes that know that each + # document is indexed under at most 1 value + # as is e.g. the case for a `FieldIndex`. + # This makes "not/exclude" term searches more efficient + potentially_multivalued = True + def __init__(self, id, ignore_ex=None, call_methods=None, extra=None, caller=None): """Create an unindex @@ -328,18 +334,6 @@ def unindex_object(self, documentId): LOG.debug('Attempt to unindex nonexistent document' ' with id %s', documentId, exc_info=True) - def _apply_not(self, not_parm, resultset=None): - index = self._index - setlist = [] - for k in not_parm: - s = index.get(k, None) - if s is None: - continue - elif isinstance(s, int): - s = IISet((s, )) - setlist.append(s) - return multiunion(setlist) - def _convert(self, value, default=None): return value @@ -406,255 +400,163 @@ def _apply_index(self, request, resultset=None): return None return (self.query_index(record, resultset=resultset), (self.id, )) - def query_index(self, record, resultset=None): - """Search the index with the given IndexQuery object. - - If the query has a key which matches the 'id' of - the index instance, one of a few things can happen: - - - if the value is a string, turn the value into - a single-element sequence, and proceed. + def query_index(self, query, resultset=None): + """Search the index with the given `IndexQuery` object *query*. - - if the value is a sequence, return a union search. - - - If the value is a dict and contains a key of the form - '_operator' this overrides the default method - ('or') to combine search results. Valid values are 'or' - and 'and'. + If *resultset* is not `None`, then intersect the primary result + with *resultset*. """ - index = self._index - r = None - opr = None - - # not / exclude parameter - not_parm = record.get('not', None) - - operator = record.operator - + primary_result = None + # do we need this caching? It is promising only + # when the same subquery is performed at least twice + # in the same request + # On the other hand, it significantly complicates the code. cachekey = None cache = self.getRequestCache() if cache is not None: - cachekey = self.getRequestCacheKey(record) - if cachekey is not None: - cached = None - if operator == 'or': - cached = cache.get(cachekey, None) - else: - cached_setlist = cache.get(cachekey, None) - if cached_setlist is not None: - r = resultset - for s in cached_setlist: - # the result is bound by the resultset - r = intersection(r, s) - # If intersection, we can't possibly get a - # smaller result - if not r: - break - cached = r - - if cached is not None: - if isinstance(cached, int): - cached = IISet((cached, )) - - if not_parm: - not_parm = list(map(self._convert, not_parm)) - exclude = self._apply_not(not_parm, resultset) - cached = difference(cached, exclude) - - return cached - - if not record.keys and not_parm: - # convert into indexed format - not_parm = list(map(self._convert, not_parm)) - # we have only a 'not' query - record.keys = [k for k in index.keys() if k not in not_parm] - else: - # convert query arguments into indexed format - record.keys = list(map(self._convert, record.keys)) - - # Range parameter - range_parm = record.get('range', None) - if range_parm: - opr = 'range' - opr_args = [] - if range_parm.find('min') > -1: - opr_args.append('min') - if range_parm.find('max') > -1: - opr_args.append('max') - - if record.get('usage', None): - # see if any usage params are sent to field - opr = record.usage.lower().split(':') - opr, opr_args = opr[0], opr[1:] + cachekey = self.getRequestCacheKey(query) + primary_result = cache.get(cachekey) + if primary_result is None: + combiner_info = self.get_combiner_info(query) + primary_result = combiner_info["sets"] + if len(primary_result) > 1: + operator = combiner_info["operator"] + if operator == "or": + # Note: `len(resultset)` can be expensive + if resultset is not None and len(resultset) < 200: + # if *resultset* is small, intersecting first + # is more efficient then intersecting after + # the `multiunion` + # We cannot cache in this case + primary_result = [ + intersection(resultset, s) + for s in primary_result] + resultset = None # intersection already done + cachekey = None # do not cache + primary_result = multiunion(primary_result), + else: # "and" + # For intersection, sort with smallest data set first + # Note: "len" can be expensive + primary_result = sorted(primary_result, key=len) + # Note: If we have `not_sets`, the order computed + # above may not be optimal. + # We could avoid this by postponing the sorting. + # However, the case is likely rare and not + # worth any special handling + if primary_result: + # handle "excludes" + # Note: in the original, the unexcluded partial + # result was cached. This caches + # the excluded result + excludes = combiner_info["not_sets"] + if excludes: + excludes = multiunion(excludes) + primary_result = [ + difference(s, excludes) for s in primary_result] + if cachekey: # we should cache + cache[cachekey] = primary_result + r = resultset if primary_result else None + for s in primary_result: + r = intersection(r, s) + if r is None: + r = IISet() + return r - if opr == 'range': # range search - if 'min' in opr_args: - lo = min(record.keys) - else: - lo = None - if 'max' in opr_args: - hi = max(record.keys) - else: - hi = None - if hi: - setlist = index.values(lo, hi) - else: - setlist = index.values(lo) - - # If we only use one key, intersect and return immediately - if len(setlist) == 1: - result = setlist[0] - if isinstance(result, int): - result = IISet((result,)) - - if cachekey is not None: - if operator == 'or': - cache[cachekey] = result - else: - cache[cachekey] = [result] - - if not_parm: - exclude = self._apply_not(not_parm, resultset) - result = difference(result, exclude) - return result - - if operator == 'or': - tmp = [] - for s in setlist: - if isinstance(s, int): - s = IISet((s,)) - tmp.append(s) - r = multiunion(tmp) - - if cachekey is not None: - cache[cachekey] = r - else: - # For intersection, sort with smallest data set first - tmp = [] - for s in setlist: - if isinstance(s, int): - s = IISet((s,)) - tmp.append(s) - if len(tmp) > 2: - setlist = sorted(tmp, key=len) - else: - setlist = tmp - - # 'r' is not invariant of resultset. Thus, we - # have to remember 'setlist' - if cachekey is not None: - cache[cachekey] = setlist - - r = resultset - for s in setlist: - # the result is bound by the resultset - r = intersection(r, s) - # If intersection, we can't possibly get a smaller result - if not r: - break + # `ISimpleCombinerIndex` + # Such an index is characterized by the fact that its search result + # is the combination of a sequence of sets with + # a single `operator` (either `"and"` or `"or"`), optionally + # with some excluded sets specified by `not_sets`. + def get_combiner_info(self, query): + """preprocess *query* and return partial results. + + The result is a dict with keys `operator`, `sets` and `not_sets`. + + The search result is computed as "operator(*sets) - OR(*not_sets)". + We do not perform this combination here to allow for + outside optimizations (e.g. caching or the handling + of an incoming result set). + """ + keys_info = self.get_combiner_keys_info(query) + operator = keys_info["operator"] + index = self._index + result = dict(operator=operator, sets=[], not_sets=[]) - else: # not a range search - # Filter duplicates - setlist = [] - for k in record.keys: - if k is None: - # Prevent None from being looked up. None doesn't - # have a valid ordering definition compared to any - # other object. BTrees 4.0+ will throw a TypeError - # "object has default comparison". - continue + # perform the lookups + def lookup(operator, keys, result): + for k in keys: try: - s = index.get(k, None) - except TypeError: - # key is not valid for this Btree so the value is None - LOG.error( - '%(context)s: query_index tried ' - 'to look up key %(key)r from index %(index)r ' - 'but key was of the wrong type.', dict( - context=self.__class__.__name__, - key=k, - index=self.id, - ) - ) + s = index.get(k) + except TypeError: # key of wrong type is not in the index s = None - # If None, try to bail early if s is None: - if operator == 'or': - # If union, we can possibly get a bigger result - continue - # If intersection, we can't possibly get a smaller result - if cachekey is not None: - # If operator is 'and', we have to cache a list of - # IISet objects - cache[cachekey] = [IISet()] - return IISet() + if operator == "or": + continue # missing `or` term + # missing `and` term -- result empty + result[:] = [] + break elif isinstance(s, int): + # old style index s = IISet((s,)) - setlist.append(s) - - # If we only use one key return immediately - if len(setlist) == 1: - result = setlist[0] - if isinstance(result, int): - result = IISet((result,)) - - if cachekey is not None: - if operator == 'or': - cache[cachekey] = result - else: - cache[cachekey] = [result] - - if not_parm: - exclude = self._apply_not(not_parm, resultset) - result = difference(result, exclude) - return result - - if operator == 'or': - # If we already get a small result set passed in, intersecting - # the various indexes with it and doing the union later is - # faster than creating a multiunion first. - - if resultset is not None and len(resultset) < 200: - smalllist = [] - for s in setlist: - smalllist.append(intersection(resultset, s)) - r = multiunion(smalllist) - - # 'r' is not invariant of resultset. Thus, we - # have to remember the union of 'setlist'. But - # this is maybe a performance killer. So we do not cache. - # if cachekey is not None: - # cache[cachekey] = multiunion(setlist) + result.append(s) - else: - r = multiunion(setlist) - if cachekey is not None: - cache[cachekey] = r + lookup(operator, keys_info["keys"], result["sets"]) + not_keys = keys_info["not_keys"] + if not_keys and self.potentially_multivalued and result["sets"]: + lookup("or", not_keys, result["not_sets"]) + return result + + def get_combiner_keys_info(self, query): + """preprocess *query* and return the relevant keys information. + + This handles normalization (--> `_convert`) and + range searches and prepares and, or and not searches. + + The result is a dict with keys `operator`, `keys` and `not_keys`. + Note: the value for `keys` may be a generator. + + This function could be inlined into `get_combiner_info`. + It is separated out in order to facilitate + (value based) filtering (rather then index based set intersection) + (as used, e.g. in `AdvancedQuery`). + """ + index = self._index + # Normalize + normalize = self._convert + keys = [normalize(k) for k in query.keys] or None + not_keys = [normalize(k) for k in query.get("not", ())] + operator = query.operator + # check for range + opr = None + range_param = query.get("range", None) + if range_param is not None: + opr = "range" + opr_args = [tag for tag in ("min", "max") if tag in range_param] + elif query.get("usage", None): # another way to specify a range + opr = query.usage.lower().split(":") + opr, opr_args = opr[0], opr[1:] + if opr == "range": + lo = min(keys) if "min" in opr_args else None + hi = max(keys) if "max" in opr_args else None + keys = index.keys(lo, hi) # Note: `keys` handles `None` correctly + if keys is None: # no keys have been specified + if not_keys: # pure not + keys = index.keys() else: - # For intersection, sort with smallest data set first - if len(setlist) > 2: - setlist = sorted(setlist, key=len) - - # 'r' is not invariant of resultset. Thus, we - # have to remember the union of 'setlist' - if cachekey is not None: - cache[cachekey] = setlist - - r = resultset - for s in setlist: - r = intersection(r, s) - # If intersection, we can't possibly get a smaller result - if not r: + # Note: might want to turn this into `index.keys()` + # for `operator == "and"` allowing to implement searches + # for all documents indexed by this index. + keys = () + if not_keys: + if operator == "or": + keys = (k for k in keys if k not in not_keys) + else: # "and" + for k in not_keys: + if k in keys: + # empty result + keys = () break - - if isinstance(r, int): - r = IISet((r, )) - if r is None: - return IISet() - if not_parm: - exclude = self._apply_not(not_parm, resultset) - r = difference(r, exclude) - return r + return dict(operator=operator, keys=keys, not_keys=not_keys) def hasUniqueValuesFor(self, name): """has unique values for column name"""