Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise perfect hash to align with libgrape-lite's pthash #1851

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ This product includes software from the ClickHouse project
* Copyright 2016-2022 ClickHouse, Inc.
* https://github.com/ClickHouse/ClickHouse

This product includes software from the BBHash project
* Copyright (c) 2015 Guillaume Rizk
* https://github.com/rizkg/BBHash

This product includes software from the rax project (BSD, 2-clause)
* Copyright (c) 2017-2019, Salvatore Sanfilippo <antirez at gmail dot com>
* https://github.com/antirez/rax
1 change: 0 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ We thank the following excellent open-source projects:
- `skywalking-infra-e2e <https://github.com/apache/skywalking-infra-e2e>`_ A next-generation End-to-End Testing framework.
- `skywalking-swck <https://github.com/apache/skywalking-swck>`_ A Kubernetes operator for Apache SkyWalking.
- `wyhash <https://github.com/alainesp/wy>`_, C++ wrapper around wyhash and wyrand.
- `BBHash <https://github.com/rizkg/BBHash>`_, a fast, minimal-memory perfect hash function.
- `rax <https://github.com/antirez/rax>`_, an ANSI C radix tree implementation.
- `MurmurHash3 <https://github.com/aappleby/smhasher>`_, a fast non-cryptographic hash function.

Expand Down
7 changes: 0 additions & 7 deletions modules/basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/wyhash
PATTERN "*.hpp" # select C++ template header files
)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/BBHash
DESTINATION include/vineyard/contrib # target directory
FILES_MATCHING # install only matched files
PATTERN "*.h" # select header files
PATTERN "*.hpp" # select C++ template header files
)

# install bundled thirdparty: cityhash
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cityhash
DESTINATION include/vineyard/contrib # target directory
Expand Down
106 changes: 69 additions & 37 deletions modules/basic/ds/hashmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,7 @@ limitations under the License.
#include "client/ds/blob.h"
#include "client/ds/i_object.h"
#include "common/util/arrow.h" // IWYU pragma: keep

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#include "BBHash/BooPHF.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#include "grape/graph/perfect_hash_indexer.h"

namespace vineyard {

Expand Down Expand Up @@ -229,8 +220,6 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
public:
static_assert(std::is_pod<V>::value, "V in perfect hashmap must be POD type");

typedef boomphf::SingleHashFunctor<K> hasher_t;

explicit PerfectHashmapBuilder(Client& client)
: PerfectHashmapBaseBuilder<K, V>(client) {}

Expand All @@ -248,12 +237,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finish(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
values, shuffled_values);
});
}
Expand All @@ -266,11 +264,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V* values, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finish(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(), values,
shuffled_values);
return detail::perfect_hash::build_values(idxer_, keys->GetArray(),
values, shuffled_values);
});
return Status::OK();
}
Expand All @@ -289,12 +303,21 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements));
for (size_t i = 0; i < n_elements; ++i) {
this->builder_.add((reinterpret_cast<const K*>(keys->data()))[i]);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finish(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(
bphf_, reinterpret_cast<const K*>(keys->data()), n_elements,
return detail::perfect_hash::build_values(
idxer_, reinterpret_cast<const K*>(keys->data()), n_elements,
begin_value, shuffled_values);
});
}
Expand All @@ -307,11 +330,27 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
const V begin_value, const size_t n_elements) {
this->set_num_elements_(n_elements);
this->set_ph_keys_(keys);
RETURN_ON_ERROR(detail::boomphf::build_keys(bphf_, keys->GetArray()));
for (auto iter =
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->begin());
iter !=
detail::perfect_hash::arrow_array_iterator<K, ArrowArrayType<K>>(
keys->GetArray()->end());
iter++) {
this->builder_.add(*iter);
}

this->builder_.buildPhf();
std::unique_ptr<BlobWriter> writer;
size_t serialize_size = this->builder_.getSerializeSize();
RETURN_ON_ERROR(client.CreateBlob(serialize_size, writer));
this->builder_.finish(writer->data(), serialize_size, this->idxer_);
writer->Seal(client, buf);

return this->allocateValues(
client, n_elements, [&](V* shuffled_values) -> Status {
return detail::boomphf::build_values(bphf_, keys->GetArray(),
begin_value, shuffled_values);
return detail::perfect_hash::build_values(
idxer_, keys->GetArray(), begin_value, shuffled_values);
});
return Status::OK();
}
Expand All @@ -323,15 +362,7 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
*
*/
Status Build(Client& client) override {
// NOTE(review): this listing is a rendered diff hunk whose +/- markers were
// stripped, so the BBHash (boomphf) serialization lines and the newer
// `set_ph_(buf)` line appear back to back. Presumably only the final
// `set_ph_(buf)` call survives in the post-change header -- confirm against
// the real file before relying on this text.
//
// boomphf path: ask the serializer how many bytes `bphf_` needs and create a
// blob of exactly that size.
size_t size = detail::boomphf::bphf_serde::compute_size(bphf_);
std::unique_ptr<BlobWriter> blob_writer;
RETURN_ON_ERROR(client.CreateBlob(size, blob_writer));
// Serialize `bphf_` into the blob; the pointer returned by `ser` is required
// to land exactly at the end of the buffer, otherwise the build is aborted.
char* dst = detail::boomphf::bphf_serde::ser(blob_writer->data(), bphf_);
RETURN_ON_ASSERT(dst == blob_writer->data() + size,
"boomphf serialization error: buffer size mismatched");
// Seal the writer into an Object and store it, downcast to Blob, via set_ph_.
std::shared_ptr<Object> blob;
RETURN_ON_ERROR(blob_writer->Seal(client, blob));
this->set_ph_(std::dynamic_pointer_cast<Blob>(blob));
// pthash path: `buf` is the member that the hash-building code in this class
// passes to `writer->Seal(client, buf)`; it is stored via set_ph_ as-is.
// NOTE(review): unlike the line above, `buf` (a shared_ptr<Object>) is passed
// without a dynamic_pointer_cast<Blob> -- confirm set_ph_ accepts that type.
this->set_ph_(buf);
return Status::OK();
}

Expand Down Expand Up @@ -359,10 +390,11 @@ class PerfectHashmapBuilder : public PerfectHashmapBaseBuilder<K, V> {
return Status::OK();
}

boomphf::mphf<K, hasher_t> bphf_;
PHIdxerViewBuilder<K, uint64_t> builder_;
ImmPHIdxer<K, uint64_t> idxer_;
std::shared_ptr<Object> buf;

const int concurrency_ = std::thread::hardware_concurrency();
const double gamma_ = 2.5f;
};

} // namespace vineyard
Expand Down
Loading