-
Notifications
You must be signed in to change notification settings - Fork 314
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Term indexer improvements #303
base: master
Are you sure you want to change the base?
Changes from 9 commits
6142e43
7e6da01
9713464
ce3a345
4fa417b
b66dbc1
960c16f
c920d8f
a43ee36
2ce16d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,12 @@ | |
// never be any document regions larger than the query region. This can | ||
// significantly reduce the size of queries. | ||
// | ||
// + If the query will contain only points (rather than general regions), then | ||
// we can skip all the ancestor terms mentioned above (except last cell see | ||
// `GetIndexTerms(const S2Point& point...` for details) because there will | ||
// never be any document regions larger than the index region. This can | ||
// significantly reduce the size of index. | ||
// | ||
// + If it is more important to optimize index size rather than query speed, | ||
// the number of index terms can be reduced by creating ancestor terms only | ||
// for the *proper* ancestors of the cells in a document region, and | ||
|
@@ -129,6 +135,14 @@ string S2RegionTermIndexer::GetTerm(TermType term_type, const S2CellId id, | |
|
||
vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point, | ||
string_view prefix) { | ||
vector<string> terms; | ||
GetIndexTerms(point, prefix, &terms); | ||
return terms; | ||
} | ||
|
||
void S2RegionTermIndexer::GetIndexTerms(const S2Point& point, | ||
string_view prefix, | ||
vector<string>* terms) { | ||
// See the top of this file for an overview of the indexing strategy. | ||
// | ||
// The last cell generated by this loop is effectively the covering for | ||
|
@@ -139,12 +153,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point, | |
// max_level() != true_max_level() (see S2RegionCoverer::Options). | ||
|
||
const S2CellId id(point); | ||
vector<string> terms; | ||
for (int level = options_.min_level(); level <= options_.max_level(); | ||
level += options_.level_mod()) { | ||
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); | ||
int level = options_.min_level(); | ||
if (options_.query_contains_points_only()) { | ||
MBkkt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
level = options_.true_max_level(); | ||
} | ||
for (; level <= options_.max_level(); level += options_.level_mod()) { | ||
terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); | ||
} | ||
return terms; | ||
} | ||
|
||
vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region, | ||
|
@@ -157,6 +172,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region, | |
|
||
vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( | ||
const S2CellUnion& covering, string_view prefix) { | ||
vector<string> terms; | ||
GetIndexTermsForCanonicalCovering(covering, prefix, &terms); | ||
return terms; | ||
} | ||
|
||
void S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( | ||
const S2CellUnion& covering, string_view prefix, vector<string>* terms) { | ||
// See the top of this file for an overview of the indexing strategy. | ||
// | ||
// Cells in the covering are normally indexed as covering terms. If we are | ||
|
@@ -171,7 +193,6 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( | |
*coverer_.mutable_options() = options_; | ||
S2_CHECK(coverer_.IsCanonical(covering)); | ||
} | ||
vector<string> terms; | ||
S2CellId prev_id = S2CellId::None(); | ||
int true_max_level = options_.true_max_level(); | ||
for (S2CellId id : covering) { | ||
|
@@ -181,14 +202,19 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( | |
S2_DCHECK_GE(level, options_.min_level()); | ||
S2_DCHECK_LE(level, options_.max_level()); | ||
S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); | ||
S2_DCHECK_LE(level, options_.true_max_level()); | ||
|
||
if (level < true_max_level) { | ||
// Add a covering term for this cell. | ||
terms.push_back(GetTerm(TermType::COVERING, id, prefix)); | ||
} | ||
if (level == true_max_level || !options_.optimize_for_space()) { | ||
// Add an ancestor term for this cell at the constrained level. | ||
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); | ||
const bool is_max_level_cell = level == true_max_level; | ||
// Add a term for this cell, max_level cell ANCESTOR is optimization | ||
terms->push_back(GetTerm(is_max_level_cell ? TermType::ANCESTOR | ||
: TermType::COVERING, | ||
id, prefix)); | ||
|
||
if (options_.query_contains_points_only()) continue; | ||
|
||
if (!options_.optimize_for_space() && !is_max_level_cell) { | ||
// Add an ancestor term for this cell. | ||
terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); | ||
} | ||
// Finally, add ancestor terms for all the ancestors of this cell. | ||
while ((level -= options_.level_mod()) >= options_.min_level()) { | ||
|
@@ -197,29 +223,34 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( | |
prev_id.parent(level) == ancestor_id) { | ||
break; // We have already processed this cell and its ancestors. | ||
} | ||
terms.push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); | ||
terms->push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); | ||
} | ||
prev_id = id; | ||
} | ||
return terms; | ||
} | ||
|
||
vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Point& point, | ||
string_view prefix) { | ||
vector<string> terms; | ||
GetQueryTerms(point, prefix, &terms); | ||
return terms; | ||
} | ||
|
||
void S2RegionTermIndexer::GetQueryTerms(const S2Point& point, | ||
string_view prefix, | ||
vector<string>* terms) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please split changing the type signature into a separate PR. We can discuss it there. Does this have a measurable and meaningful performance difference for you? I would have guessed that computing the covering takes way longer than reallocating vectors. https://google.github.io/eng-practices/review/developer/small-cls.html There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't want it, it's conflicting changes. It's not convenient to make separate PRs and then resolve conflict in one to another There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For points it's obliviously better 0 allocations (only first time) instead of log(max-min/level) for each call For covering it depends, so I add such overload only for function which already have covering
In my overloads we don't really compute it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, it's better, but is it 10% better or 0.001% better? We need to weigh the complexity of the change vs. the performance improvements. How much does this improve 1. time to build the whole index (or insert an object) 2. query time? (or some other user-observable action) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
But it's not affect users who doesn't want to use it Why is it issue? Why don't you want same as for S2RegionCoverer? Could you explain please? Why for coverer it's ok, but for indexer it's too difficult? They usage are pretty same
I don't measure this part, but it depends on what we do. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Already done There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I mean we have index with points only. Every allocation cost at least 10ns (on practice more), so if we make 1 instead of N it decrease overall time at least on (N-1)*10ns Query with points only it's other part of PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How many terms do your queries have? According to your benchmarks, you save 3µs for 16 query terms. This sounds not very significant to me, so could you explain either why this is significant, or what N you care about? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's significant for me not because it's big but because it costs nothing to me, I just should use other API There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Concrete for me for query it will be same. Maybe as a compromise I can keep the new indexing API but remove the new querying API (if you think that would be better)? |
||
// See the top of this file for an overview of the indexing strategy. | ||
|
||
const S2CellId id(point); | ||
vector<string> terms; | ||
// Recall that all true_max_level() cells are indexed only as ancestor terms. | ||
int level = options_.true_max_level(); | ||
terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); | ||
if (options_.index_contains_points_only()) return terms; | ||
terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); | ||
if (options_.index_contains_points_only()) return; | ||
|
||
// Add covering terms for all the ancestor cells. | ||
for (; level >= options_.min_level(); level -= options_.level_mod()) { | ||
terms.push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); | ||
terms->push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); | ||
} | ||
return terms; | ||
} | ||
|
||
vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region, | ||
|
@@ -232,13 +263,20 @@ vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region, | |
|
||
vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( | ||
const S2CellUnion& covering, string_view prefix) { | ||
vector<string> terms; | ||
GetQueryTermsForCanonicalCovering(covering, prefix, &terms); | ||
return terms; | ||
} | ||
|
||
void S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( | ||
const S2CellUnion& covering, string_view prefix, vector<string>* terms) { | ||
// See the top of this file for an overview of the indexing strategy. | ||
|
||
S2_CHECK(!options_.query_contains_points_only()); | ||
if (google::DEBUG_MODE) { | ||
*coverer_.mutable_options() = options_; | ||
S2_CHECK(coverer_.IsCanonical(covering)); | ||
} | ||
vector<string> terms; | ||
S2CellId prev_id = S2CellId::None(); | ||
int true_max_level = options_.true_max_level(); | ||
for (S2CellId id : covering) { | ||
|
@@ -248,18 +286,19 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( | |
S2_DCHECK_GE(level, options_.min_level()); | ||
S2_DCHECK_LE(level, options_.max_level()); | ||
S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); | ||
S2_DCHECK_LE(level, options_.true_max_level()); | ||
|
||
// Cells in the covering are always queried as ancestor terms. | ||
terms.push_back(GetTerm(TermType::ANCESTOR, id, prefix)); | ||
terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); | ||
|
||
// If the index only contains points, there are no covering terms. | ||
if (options_.index_contains_points_only()) continue; | ||
|
||
// If we are optimizing for index space rather than query time, cells are | ||
// also queried as covering terms (except for true_max_level() cells, | ||
// which are indexed and queried as ancestor cells only). | ||
if (options_.optimize_for_space() && level < true_max_level) { | ||
terms.push_back(GetTerm(TermType::COVERING, id, prefix)); | ||
if (options_.optimize_for_space() && level != true_max_level) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only change what you need to. More changes just makes it harder to see what's really changing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just want to make it same for different function |
||
terms->push_back(GetTerm(TermType::COVERING, id, prefix)); | ||
} | ||
// Finally, add covering terms for all the ancestors of this cell. | ||
while ((level -= options_.level_mod()) >= options_.min_level()) { | ||
|
@@ -268,9 +307,8 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( | |
prev_id.parent(level) == ancestor_id) { | ||
break; // We have already processed this cell and its ancestors. | ||
} | ||
terms.push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); | ||
terms->push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); | ||
} | ||
prev_id = id; | ||
} | ||
return terms; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -198,8 +198,21 @@ class S2RegionTermIndexer { | |
// this flag if your index consists entirely of points.) | ||
// | ||
// DEFAULT: false | ||
bool index_contains_points_only() const { return points_only_; } | ||
void set_index_contains_points_only(bool value) { points_only_ = value; } | ||
bool index_contains_points_only() const { return index_points_only_; } | ||
void set_index_contains_points_only(bool value) { index_points_only_ = value; } | ||
|
||
// If your query will only contain points (rather than regions), be sure | ||
// to set this flag. This will generate smaller and faster index that | ||
// are specialized for the points-only case. | ||
// | ||
// With the default quality settings, this flag reduces the number of | ||
// index terms by about a factor of two. (The improvement gets smaller | ||
// as max_cells() is increased, but there is really no reason not to use | ||
// this flag if your query consist entirely of points.) | ||
// | ||
// DEFAULT: false | ||
bool query_contains_points_only() const { return query_points_only_; } | ||
void set_query_contains_points_only(bool value) { query_points_only_ = value; } | ||
|
||
// If true, the index will be optimized for space rather than for query | ||
// time. With the default quality settings, this flag reduces the number | ||
|
@@ -223,7 +236,8 @@ class S2RegionTermIndexer { | |
void set_marker_character(char ch); | ||
|
||
private: | ||
bool points_only_ = false; | ||
bool index_points_only_ = false; | ||
bool query_points_only_ = false; | ||
bool optimize_for_space_ = false; | ||
std::string marker_ = std::string(1, '$'); | ||
}; | ||
|
@@ -289,6 +303,21 @@ class S2RegionTermIndexer { | |
std::vector<std::string> GetQueryTermsForCanonicalCovering( | ||
const S2CellUnion& covering, absl::string_view prefix); | ||
|
||
// Same as above but allows to reuse same buffer for different points or use | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think its fine to have these since the old ones call the new variants. Since the terms vector isn't optional let's just make it a non-const reference though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a small inefficiency we should forget about 97% of the time. @MBkkt even said the few microseconds it saved were irrelevant to him. Why do we want to complicate the interface with this? Now we have to document what happens with the vectors. Are they cleared? appended to? The formatting is also inconsistent ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yes if the performance gain isn't worth it then we should probably just elide. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
appended to, I think function is pretty small to see it from code.
Sorry you don't have clang-format in repo, so my ide reformat it randomly, I tried to make it like was in other function, maybe in some places I missed it
I already wrote, now it's ok for me don't have such api for query, but for indexing, calling other api make my code faster (a little but measurable), so I want other api. Offtop:Btw today we discussed how speedup geo terms search: now we need to make disjunction of all query terms (it's from s2 documentation and it works now for us). Like query covering:
Like query terms:
So we can make query like:
instead of
It has few other tricks, like we should store in index for pair: term, document in posting list: is it last or not But in general it's a lot of better. Why do I write about this? Because for use such api we need to know term level. So will be nice to have overload in future like:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
reference looks better for me, but in all other places in s2 mutable reference arguments passed like pointer |
||
// single buffer for multiple points (common case is GeoJson MultiPoint) | ||
void GetIndexTerms(const S2Point& point, absl::string_view prefix, | ||
std::vector<std::string>* terms); | ||
void GetQueryTerms(const S2Point& point, absl::string_view prefix, | ||
std::vector<std::string>* terms); | ||
|
||
// Same as above but allows to reuse same buffer for different covering | ||
void GetIndexTermsForCanonicalCovering(const S2CellUnion &covering, | ||
absl::string_view prefix, | ||
std::vector<std::string> *terms); | ||
void GetQueryTermsForCanonicalCovering(const S2CellUnion &covering, | ||
absl::string_view prefix, | ||
std::vector<std::string> *terms); | ||
|
||
private: | ||
enum TermType { ANCESTOR, COVERING }; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think saying "never be any document regions smaller than the query region" makes the comment have the same sense as 1. above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you're right
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have an indexing benchmark, or are you referring to the microbenchmarks you wrote in #304? What are the results?
As you say, off-topic for this PR. Please open a discussion or an issue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jmr Today another day when I thought that will be good to have such API
But because now s2 don't have it I need to write code like this:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We already computed, it saves at least N*10 ns, where N number of inserts.
Why do you against this API so much?