From c8580d1009eca2d915c0c3c39e6e40fbab1606f2 Mon Sep 17 00:00:00 2001 From: dieter Date: Fri, 8 Mar 2019 14:34:54 +0100 Subject: [PATCH 1/4] refactor `PluginIndexes.unindex.UnIndex.query_index` fixing #55 and #56 --- CHANGES.rst | 5 + .../PluginIndexes/FieldIndex/FieldIndex.py | 1 + .../PluginIndexes/tests/test_unindex.py | 15 +- src/Products/PluginIndexes/unindex.py | 358 ++++++------------ 4 files changed, 132 insertions(+), 247 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c063ea44..0db384eb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,11 @@ Changelog 4.4 (unreleased) ---------------- +- Refactor ``Products.PluginIndexes.unindex.Unindex.query_index`` + (`#56 `_) + and fix + (`#55 `_). + - Specify supported Python versions using ``python_requires`` in setup.py (`Zope#481 `_) diff --git a/src/Products/PluginIndexes/FieldIndex/FieldIndex.py b/src/Products/PluginIndexes/FieldIndex/FieldIndex.py index 01bc835e..011472a0 100644 --- a/src/Products/PluginIndexes/FieldIndex/FieldIndex.py +++ b/src/Products/PluginIndexes/FieldIndex/FieldIndex.py @@ -21,6 +21,7 @@ class FieldIndex(UnIndex): """ meta_type = 'FieldIndex' query_options = ('query', 'range', 'not') + potentially_multivalued = False # optimization for exclude terms manage_options = ( {'label': 'Settings', 'action': 'manage_main'}, diff --git a/src/Products/PluginIndexes/tests/test_unindex.py b/src/Products/PluginIndexes/tests/test_unindex.py index 383306e3..925e922e 100644 --- a/src/Products/PluginIndexes/tests/test_unindex.py +++ b/src/Products/PluginIndexes/tests/test_unindex.py @@ -13,7 +13,7 @@ import unittest -from BTrees.IIBTree import difference +from BTrees.IIBTree import difference, IISet from OFS.SimpleItem import SimpleItem from Testing.makerequest import makerequest @@ -193,3 +193,16 @@ class Dummy(object): query = IndexQuery({'counter': 42}, 'counter') res = index.query_index(query) self.assertListEqual(list(res), []) + + def test_resultset_intersection(self): + """see #55 + `https://github.com/zopefoundation/Products.ZCatalog/issues/55`. + """ + index = self._makeOne("rsi") + for i in (1, 2): + index.insertForwardIndexEntry(i, i) + ers = IISet() # empty result set + for q in ((1,), (1, 2)): + qd = dict(rsi=dict(query=q)) + self.assertEqual(q, tuple(index._apply_index(qd)[0])) + self.assertEqual((), tuple(index._apply_index(qd, ers)[0])) diff --git a/src/Products/PluginIndexes/unindex.py b/src/Products/PluginIndexes/unindex.py index c00fcfdc..77635dbb 100644 --- a/src/Products/PluginIndexes/unindex.py +++ b/src/Products/PluginIndexes/unindex.py @@ -60,6 +60,12 @@ class UnIndex(SimpleItem): useOperator = 'or' query_options = () + # Set to `False` in derived indexes that know that each + # document is indexed under at most 1 value + # as is e.g. the case for a `FieldIndex`. + # This makes "not/exclude" term searches more efficient + potentially_multivalued = True + def __init__(self, id, ignore_ex=None, call_methods=None, extra=None, caller=None): """Create an unindex @@ -328,18 +334,6 @@ def unindex_object(self, documentId): LOG.debug('Attempt to unindex nonexistent document' ' with id %s', documentId, exc_info=True) - def _apply_not(self, not_parm, resultset=None): - index = self._index - setlist = [] - for k in not_parm: - s = index.get(k, None) - if s is None: - continue - elif isinstance(s, int): - s = IISet((s, )) - setlist.append(s) - return multiunion(setlist) - def _convert(self, value, default=None): return value @@ -406,255 +400,127 @@ def _apply_index(self, request, resultset=None): return None return (self.query_index(record, resultset=resultset), (self.id, )) - def query_index(self, record, resultset=None): - """Search the index with the given IndexQuery object. + def query_index(self, query, resultset=None): + """Search the index with the given `IndexQuery` object *query*. - If the query has a key which matches the 'id' of - the index instance, one of a few things can happen: + If *resultset* is not `None`, then intersect the primary result + with *resultset*. + """ + primary_result = None + # do we need this caching? It is promising only + # when the same subquery is performed at least twice + # in the same request + # On the other hand, it significantly complicates the code. + cachekey = None + cache = self.getRequestCache() + if cache is not None: + cachekey = self.getRequestCacheKey(query) + primary_result = cache.get(cachekey) + if primary_result is None: + combiner_info = self.get_combiner_info(query) + primary_result = combiner_info["sets"] + if len(primary_result) > 1: + operator = combiner_info["operator"] + if operator == "or": + # Note: `len(resultset)` can be expensive + if resultset is not None and len(resultset) < 200: + # if *resultset* is small, intersecting first + # is more efficient then intersecting after + # the `multiunion` + # We cannot cache in this case + primary_result = [ + intersection(resultset, s) + for s in primary_result] + resultset = None # intersection already done + cachekey = None # do not cache + primary_result = multiunion(primary_result), + else: # "and" + # For intersection, sort with smallest data set first + # Note: "len" can be expensive + primary_result = sorted(primary_result, key=len) + if primary_result: + # handle "excludes" + # Note: in the original, the unexcluded partial + # result was cached. This caches + # the excluded result + excludes = combiner_info["exclude_sets"] + if excludes: + excludes = multiunion(excludes) + primary_result = [ + difference(s, excludes) for s in primary_result] + if cachekey: # we should cache + cache[cachekey] = primary_result + r = resultset if primary_result else None + for s in primary_result: + r = intersection(r, s) + if r is None: + r = IISet() + return r - - if the value is a string, turn the value into - a single-element sequence, and proceed. + # `ISimpleCombinerIndex` + # Such an index is characterized by the fact that its search result + # is the combination of a sequence of sets with + # a single `operator` (either `"and"` or `"or"`), optionally + # with some excluded sets specified by `exclude_sets`. + def get_combiner_info(self, query): + """preprocess *query* and return partial results. - - if the value is a sequence, return a union search. + The result is a dict with keys `operator`, `sets` and `exclude_sets`. - - If the value is a dict and contains a key of the form - '_operator' this overrides the default method - ('or') to combine search results. Valid values are 'or' - and 'and'. + The search result is: "operator(*sets) - OR(*exclude_sets)". + We do not perform this combination here to allow for + outside optimizations. """ index = self._index - r = None + # Normalize + normalize = self._convert + keys = [normalize(k) for k in query.keys] or None + not_keys = [normalize(k) for k in query.get("not", ())] + # check for range opr = None - - # not / exclude parameter - not_parm = record.get('not', None) - - operator = record.operator - - cachekey = None - cache = self.getRequestCache() - if cache is not None: - cachekey = self.getRequestCacheKey(record) - if cachekey is not None: - cached = None - if operator == 'or': - cached = cache.get(cachekey, None) - else: - cached_setlist = cache.get(cachekey, None) - if cached_setlist is not None: - r = resultset - for s in cached_setlist: - # the result is bound by the resultset - r = intersection(r, s) - # If intersection, we can't possibly get a - # smaller result - if not r: - break - cached = r - - if cached is not None: - if isinstance(cached, int): - cached = IISet((cached, )) - - if not_parm: - not_parm = list(map(self._convert, not_parm)) - exclude = self._apply_not(not_parm, resultset) - cached = difference(cached, exclude) - - return cached - - if not record.keys and not_parm: - # convert into indexed format - not_parm = list(map(self._convert, not_parm)) - # we have only a 'not' query - record.keys = [k for k in index.keys() if k not in not_parm] - else: - # convert query arguments into indexed format - record.keys = list(map(self._convert, record.keys)) - - # Range parameter - range_parm = record.get('range', None) - if range_parm: - opr = 'range' - opr_args = [] - if range_parm.find('min') > -1: - opr_args.append('min') - if range_parm.find('max') > -1: - opr_args.append('max') - - if record.get('usage', None): - # see if any usage params are sent to field - opr = record.usage.lower().split(':') + range_param = query.get("range", None) + if range_param is not None: + opr = "range" + opr_args = [tag for tag in ("min", "max") if tag in range_param] + elif query.get("usage", None): # another way to specify a range + opr = query.usage.lower().split(":") opr, opr_args = opr[0], opr[1:] - - if opr == 'range': # range search - if 'min' in opr_args: - lo = min(record.keys) - else: - lo = None - if 'max' in opr_args: - hi = max(record.keys) - else: - hi = None - if hi: - setlist = index.values(lo, hi) - else: - setlist = index.values(lo) - - # If we only use one key, intersect and return immediately - if len(setlist) == 1: - result = setlist[0] - if isinstance(result, int): - result = IISet((result,)) - - if cachekey is not None: - if operator == 'or': - cache[cachekey] = result - else: - cache[cachekey] = [result] - - if not_parm: - exclude = self._apply_not(not_parm, resultset) - result = difference(result, exclude) - return result - - if operator == 'or': - tmp = [] - for s in setlist: - if isinstance(s, int): - s = IISet((s,)) - tmp.append(s) - r = multiunion(tmp) - - if cachekey is not None: - cache[cachekey] = r + if opr == "range": + lo = min(keys) if "min" in opr_args else None + hi = max(keys) if "max" in opr_args else None + keys = index.keys(lo, hi) # Note: `keys` handles `None` correctly + operator = query.operator + result = dict(operator=operator, sets=[], exclude_sets=[]) + if keys is None: # no keys have been specified + if not_keys: # pure not + keys = index.keys() else: - # For intersection, sort with smallest data set first - tmp = [] - for s in setlist: - if isinstance(s, int): - s = IISet((s,)) - tmp.append(s) - if len(tmp) > 2: - setlist = sorted(tmp, key=len) - else: - setlist = tmp - - # 'r' is not invariant of resultset. Thus, we - # have to remember 'setlist' - if cachekey is not None: - cache[cachekey] = setlist - - r = resultset - for s in setlist: - # the result is bound by the resultset - r = intersection(r, s) - # If intersection, we can't possibly get a smaller result - if not r: - break - - else: # not a range search - # Filter duplicates - setlist = [] - for k in record.keys: - if k is None: - # Prevent None from being looked up. None doesn't - # have a valid ordering definition compared to any - # other object. BTrees 4.0+ will throw a TypeError - # "object has default comparison". - continue + keys = () + + # perform the lookups + def lookup(operator, keys, result): + for k in keys: try: - s = index.get(k, None) + s = index.get(k) # key of wrong type is not in the index except TypeError: - # key is not valid for this Btree so the value is None - LOG.error( - '%(context)s: query_index tried ' - 'to look up key %(key)r from index %(index)r ' - 'but key was of the wrong type.', dict( - context=self.__class__.__name__, - key=k, - index=self.id, - ) - ) s = None - # If None, try to bail early if s is None: - if operator == 'or': - # If union, we can possibly get a bigger result - continue - # If intersection, we can't possibly get a smaller result - if cachekey is not None: - # If operator is 'and', we have to cache a list of - # IISet objects - cache[cachekey] = [IISet()] - return IISet() + if operator == "or": + continue # missing `or` term + # missing `and` term -- result empty + result[:] = [] + break elif isinstance(s, int): + # old style index s = IISet((s,)) - setlist.append(s) - - # If we only use one key return immediately - if len(setlist) == 1: - result = setlist[0] - if isinstance(result, int): - result = IISet((result,)) - - if cachekey is not None: - if operator == 'or': - cache[cachekey] = result - else: - cache[cachekey] = [result] - - if not_parm: - exclude = self._apply_not(not_parm, resultset) - result = difference(result, exclude) - return result - - if operator == 'or': - # If we already get a small result set passed in, intersecting - # the various indexes with it and doing the union later is - # faster than creating a multiunion first. - - if resultset is not None and len(resultset) < 200: - smalllist = [] - for s in setlist: - smalllist.append(intersection(resultset, s)) - r = multiunion(smalllist) - - # 'r' is not invariant of resultset. Thus, we - # have to remember the union of 'setlist'. But - # this is maybe a performance killer. So we do not cache. - # if cachekey is not None: - # cache[cachekey] = multiunion(setlist) - - else: - r = multiunion(setlist) - if cachekey is not None: - cache[cachekey] = r - else: - # For intersection, sort with smallest data set first - if len(setlist) > 2: - setlist = sorted(setlist, key=len) - - # 'r' is not invariant of resultset. Thus, we - # have to remember the union of 'setlist' - if cachekey is not None: - cache[cachekey] = setlist - - r = resultset - for s in setlist: - r = intersection(r, s) - # If intersection, we can't possibly get a smaller result - if not r: - break - - if isinstance(r, int): - r = IISet((r, )) - if r is None: - return IISet() - if not_parm: - exclude = self._apply_not(not_parm, resultset) - r = difference(r, exclude) - return r + result.append(s) + lookup(operator, + (k for k in keys if k not in not_keys), + result["sets"] + ) + if not_keys and self.potentially_multivalued and result["sets"]: + lookup("or", not_keys, result["exclude_sets"]) + return result def hasUniqueValuesFor(self, name): """has unique values for column name""" From 5e9e1b523b8e0c5bedc532b07d2257aff8db9af1 Mon Sep 17 00:00:00 2001 From: dieter Date: Fri, 8 Mar 2019 15:49:29 +0100 Subject: [PATCH 2/4] improve interface docstrings --- src/Products/PluginIndexes/interfaces.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/Products/PluginIndexes/interfaces.py b/src/Products/PluginIndexes/interfaces.py index 0df5d9cd..dbff414b 100644 --- a/src/Products/PluginIndexes/interfaces.py +++ b/src/Products/PluginIndexes/interfaces.py @@ -90,9 +90,13 @@ def clear(): class ILimitedResultIndex(IPluggableIndex): def _apply_index(request, resultset=None): - """Same as IPluggableIndex' _apply_index method. The additional - resultset argument contains the resultset, as already calculated by - ZCatalog's search method. + """Same as IPluggableIndex' _apply_index method. + + The additional *resultset* argument contains the resultset, + as already calculated by ZCatalog's search method. + If it is not `None` and `_apply_index` does not return + `None`, then the preliminary result must be intersected + with *resultset*. """ @@ -103,9 +107,16 @@ class IQueryIndex(IPluggableIndex): useOperator = Attribute('A string specifying the default operator.') query_options = Attribute('Supported query options for the index.') - def query_index(record, resultset=None): - """Same as _apply_index, but the query is already a pre-parsed - IndexQuery object. + def query_index(query, resultset=None): + """return the result of searching for *query*. + + *query* is a `Products.ZCatalog.query.IndexQuery` and + describes what is searched for. + + The return value is an `IISet` or `IITreeSet`. + + If *resultset* is not `None`, then the preliminary result + must be intersected with *resultset*. """ From b1b55ad662721f7bd43714f73831f2e2c808fcbf Mon Sep 17 00:00:00 2001 From: dieter Date: Fri, 8 Mar 2019 23:06:30 +0100 Subject: [PATCH 3/4] fix bug for index searches combining "and" and "not"; test "not" searches --- .../PluginIndexes/tests/test_unindex.py | 20 ++++++++++++++++++- src/Products/PluginIndexes/unindex.py | 15 +++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/Products/PluginIndexes/tests/test_unindex.py b/src/Products/PluginIndexes/tests/test_unindex.py index 925e922e..7ad888a7 100644 --- a/src/Products/PluginIndexes/tests/test_unindex.py +++ b/src/Products/PluginIndexes/tests/test_unindex.py @@ -201,8 +201,26 @@ def test_resultset_intersection(self): index = self._makeOne("rsi") for i in (1, 2): index.insertForwardIndexEntry(i, i) - ers = IISet() # empty result set + ers = IISet() for q in ((1,), (1, 2)): qd = dict(rsi=dict(query=q)) self.assertEqual(q, tuple(index._apply_index(qd)[0])) self.assertEqual((), tuple(index._apply_index(qd, ers)[0])) + + def test_not(self): + index = self._makeOne("idx") + index.query_options = "not", "operator" # activate `not`, `operator` + apply = index._apply_index + for i, vals in enumerate(((10, 11, 12), (11, 12, 13))): + for v in vals: + index.insertForwardIndexEntry(v, i) + query = {"query" : (10, 11), "not" : (10,)} + req = dict(idx=query) + # or(10,11), not(10) + self.assertEqual((1, ), tuple(apply(req)[0]), "or(10,11), not(10)") + # and(10, 11), not(10) + query["operator"] = "and" + self.assertEqual((), tuple(apply(req)[0]), "and(10, 11), not(10)") + # 11, not 10 + query["query"] = 11 + self.assertEqual((1,), tuple(apply(req)[0]), "11, not(10)") diff --git a/src/Products/PluginIndexes/unindex.py b/src/Products/PluginIndexes/unindex.py index 77635dbb..8e19c0ad 100644 --- a/src/Products/PluginIndexes/unindex.py +++ b/src/Products/PluginIndexes/unindex.py @@ -496,7 +496,15 @@ def get_combiner_info(self, query): keys = index.keys() else: keys = () - + if not_keys: + if operator == "or": + keys = (k for k in keys if k not in not_keys) + else: # "and" + for k in not_keys: + if k in keys: + # empty result + keys = () + break # perform the lookups def lookup(operator, keys, result): for k in keys: @@ -514,10 +522,7 @@ def lookup(operator, keys, result): # old style index s = IISet((s,)) result.append(s) - lookup(operator, - (k for k in keys if k not in not_keys), - result["sets"] - ) + lookup(operator, keys, result["sets"]) if not_keys and self.potentially_multivalued and result["sets"]: lookup("or", not_keys, result["exclude_sets"]) return result From 9193a1fc076ee80d4ab7ed65303e9dec015f5fe8 Mon Sep 17 00:00:00 2001 From: dieter Date: Sat, 9 Mar 2019 08:55:41 +0100 Subject: [PATCH 4/4] range query tests; refactoring (to facilitate filtering) --- .../PluginIndexes/tests/test_unindex.py | 22 ++++- src/Products/PluginIndexes/unindex.py | 87 +++++++++++++------ 2 files changed, 80 insertions(+), 29 deletions(-) diff --git a/src/Products/PluginIndexes/tests/test_unindex.py b/src/Products/PluginIndexes/tests/test_unindex.py index 7ad888a7..16b0e949 100644 --- a/src/Products/PluginIndexes/tests/test_unindex.py +++ b/src/Products/PluginIndexes/tests/test_unindex.py @@ -214,7 +214,7 @@ def test_not(self): for i, vals in enumerate(((10, 11, 12), (11, 12, 13))): for v in vals: index.insertForwardIndexEntry(v, i) - query = {"query" : (10, 11), "not" : (10,)} + query = {"query": (10, 11), "not": (10,)} req = dict(idx=query) # or(10,11), not(10) self.assertEqual((1, ), tuple(apply(req)[0]), "or(10,11), not(10)") @@ -224,3 +224,23 @@ def test_not(self): # 11, not 10 query["query"] = 11 self.assertEqual((1,), tuple(apply(req)[0]), "11, not(10)") + + def test_range(self): + index = self._makeOne("idx") + index.query_options = "range", "usage" # activate `range`, `usage` + apply = index._apply_index + docs = tuple(range(10)) + for i in docs: + index.insertForwardIndexEntry(i, i) + ranges = (9, None), (None, 1), (5, 6), (None, None), + for op in ("range", "usage"): + for r in ranges: + spec = (["range"] if op == "usage" else []) \ + + (["min"] if r[0] is not None else []) \ + + (["max"] if r[1] is not None else []) + query = {"query": [v for v in r if v is not None], + op: ":".join(spec)} + self.assertEqual( + docs[r[0]: (r[1] + 1 if r[1] is not None else None)], + tuple(apply(dict(idx=query))[0]), + "%s: %s" % (op, r)) diff --git a/src/Products/PluginIndexes/unindex.py b/src/Products/PluginIndexes/unindex.py index 8e19c0ad..e77dd09c 100644 --- a/src/Products/PluginIndexes/unindex.py +++ b/src/Products/PluginIndexes/unindex.py @@ -438,12 +438,17 @@ def query_index(self, query, resultset=None): # For intersection, sort with smallest data set first # Note: "len" can be expensive primary_result = sorted(primary_result, key=len) + # Note: If we have `not_sets`, the order computed + # above may not be optimal. + # We could avoid this by postponing the sorting. + # However, the case is likely rare and not + # worth any special handling if primary_result: # handle "excludes" # Note: in the original, the unexcluded partial # result was cached. This caches # the excluded result - excludes = combiner_info["exclude_sets"] + excludes = combiner_info["not_sets"] if excludes: excludes = multiunion(excludes) primary_result = [ @@ -461,21 +466,66 @@ def query_index(self, query, resultset=None): # Such an index is characterized by the fact that its search result # is the combination of a sequence of sets with # a single `operator` (either `"and"` or `"or"`), optionally - # with some excluded sets specified by `exclude_sets`. + # with some excluded sets specified by `not_sets`. def get_combiner_info(self, query): """preprocess *query* and return partial results. - The result is a dict with keys `operator`, `sets` and `exclude_sets`. + The result is a dict with keys `operator`, `sets` and `not_sets`. - The search result is: "operator(*sets) - OR(*exclude_sets)". + The search result is computed as "operator(*sets) - OR(*not_sets)". We do not perform this combination here to allow for - outside optimizations. + outside optimizations (e.g. caching or the handling + of an incoming result set). + """ + keys_info = self.get_combiner_keys_info(query) + operator = keys_info["operator"] + index = self._index + result = dict(operator=operator, sets=[], not_sets=[]) + + # perform the lookups + def lookup(operator, keys, result): + for k in keys: + try: + s = index.get(k) + except TypeError: # key of wrong type is not in the index + s = None + if s is None: + if operator == "or": + continue # missing `or` term + # missing `and` term -- result empty + result[:] = [] + break + elif isinstance(s, int): + # old style index + s = IISet((s,)) + result.append(s) + + lookup(operator, keys_info["keys"], result["sets"]) + not_keys = keys_info["not_keys"] + if not_keys and self.potentially_multivalued and result["sets"]: + lookup("or", not_keys, result["not_sets"]) + return result + + def get_combiner_keys_info(self, query): + """preprocess *query* and return the relevant keys information. + + This handles normalization (--> `_convert`) and + range searches and prepares and, or and not searches. + + The result is a dict with keys `operator`, `keys` and `not_keys`. + Note: the value for `keys` may be a generator. + + This function could be inlined into `get_combiner_info`. + It is separated out in order to facilitate + (value based) filtering (rather then index based set intersection) + (as used, e.g. in `AdvancedQuery`). """ index = self._index # Normalize normalize = self._convert keys = [normalize(k) for k in query.keys] or None not_keys = [normalize(k) for k in query.get("not", ())] + operator = query.operator # check for range opr = None range_param = query.get("range", None) @@ -489,12 +539,13 @@ def get_combiner_info(self, query): lo = min(keys) if "min" in opr_args else None hi = max(keys) if "max" in opr_args else None keys = index.keys(lo, hi) # Note: `keys` handles `None` correctly - operator = query.operator - result = dict(operator=operator, sets=[], exclude_sets=[]) if keys is None: # no keys have been specified if not_keys: # pure not keys = index.keys() else: + # Note: might want to turn this into `index.keys()` + # for `operator == "and"` allowing to implement searches + # for all documents indexed by this index. keys = () if not_keys: if operator == "or": @@ -505,27 +556,7 @@ def get_combiner_info(self, query): # empty result keys = () break - # perform the lookups - def lookup(operator, keys, result): - for k in keys: - try: - s = index.get(k) # key of wrong type is not in the index - except TypeError: - s = None - if s is None: - if operator == "or": - continue # missing `or` term - # missing `and` term -- result empty - result[:] = [] - break - elif isinstance(s, int): - # old style index - s = IISet((s,)) - result.append(s) - lookup(operator, keys, result["sets"]) - if not_keys and self.potentially_multivalued and result["sets"]: - lookup("or", not_keys, result["exclude_sets"]) - return result + return dict(operator=operator, keys=keys, not_keys=not_keys) def hasUniqueValuesFor(self, name): """has unique values for column name"""