From ea02fd1403807748909ea30c3035de72d405621e Mon Sep 17 00:00:00 2001 From: Jamie Broomall <88007022+jamie256@users.noreply.github.com> Date: Wed, 31 Aug 2022 16:13:34 -0700 Subject: [PATCH] Don't serialize empty or single preamble (#15) * Don't serialize or allow overriding preamble bits to empty or single format * Remove preamble check to show serialize exception without fix --- .github/workflows/build_wheels.yml | 2 +- README.md | 5 +++++ kll/include/kll_sketch.hpp | 6 +----- kll/include/kll_sketch_impl.hpp | 22 ++++++++++++---------- kll/test/kll_sketch_test.cpp | 11 +++++++++++ python/src/kll_wrapper.cpp | 6 ++++-- setup.py | 3 +-- 7 files changed, 35 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 197c982..35b3107 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -95,7 +95,7 @@ jobs: env: CIBW_PRERELEASE_PYTHONS: True # enable Python 3.11 CIBW_SKIP: "*-win32" - CIBW_BUILD: "cp37-*64 cp38-*64 cp39-*64 cp310-*64 cp311-*64" + CIBW_BUILD: "cp36-*64 cp37-*64 cp38-*64 cp39-*64 cp310-*64 cp311-*64" CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 CIBW_ARCHS_MACOS: "x86_64 universal2" CIBW_ARCHS_LINUX: ${{ matrix.config.arch }} diff --git a/README.md b/README.md index 16eedf3..34ef833 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,11 @@ DEB, STGZ, TGZ, TZ, ZIP, etc. $ cmake3 --build build/Release -t package ``` +To generate python the wheels locally, after building the c++ parts of the project described above you can run standard python commands to setup the wheel file e.g.: +``` +python setup.py bdist_wheel +``` + The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways. If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some way, then CMake's `find_package` command can be used like this: diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index 5160ae6..6e0e880 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -157,6 +157,7 @@ template using vector_d = std::vector>; namespace kll_constants { const uint16_t DEFAULT_K = 200; const uint16_t DEFAULT_PREAMBLE = 0; + const uint16_t DEFAULT_INTS_SHORT = 2; const uint16_t DEFAULT_PREAMBLE_FLOAT = 5; const uint16_t DEFAULT_PREAMBLE_DOUBLE = 6; const uint16_t DEFAULT_PREAMBLE_INT = 7; @@ -547,11 +548,6 @@ class kll_sketch { enum flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM }; - static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item - static const uint8_t PREAMBLE_INTS_FULL = 5; - - - // for deserialization class item_deleter; class items_deleter; diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index ad13dcc..d784aef 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -189,8 +189,8 @@ void kll_sketch::merge(FwdSk&& other) { throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_)); } if (get_preamble() != other.get_preamble() - && get_preamble() != PREAMBLE_INTS_SHORT - && other.get_preamble() != PREAMBLE_INTS_SHORT) { + && get_preamble() != kll_constants::DEFAULT_INTS_SHORT + && other.get_preamble() != kll_constants::DEFAULT_INTS_SHORT) { throw std::invalid_argument("incompatible preambles: " + std::to_string(get_preamble()) + " and " + std::to_string(other.get_preamble())); } @@ -224,11 +224,11 @@ uint16_t kll_sketch::get_k() const { template uint16_t kll_sketch::get_preamble() const { - if (preamble_ != kll_constants::DEFAULT_PREAMBLE) { + if (preamble_ != kll_constants::DEFAULT_PREAMBLE && preamble_ != kll_constants::DEFAULT_INTS_SHORT) { return preamble_; } const bool is_single_item = n_ == 1; - return is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : resolve_preamble_ints(); + return is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : resolve_preamble_ints(); } template @@ -411,7 +411,7 @@ size_t kll_sketch::get_max_serialized_size_bytes(uint16_t k, uint64_ template void kll_sketch::serialize(std::ostream& os) const { const bool is_single_item = n_ == 1; - const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : get_preamble()); + const uint8_t preamble_ints(is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : get_preamble()); write(os, preamble_ints); const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1); write(os, serial_version); @@ -454,7 +454,7 @@ vector_u8 kll_sketch::serialize(unsigned header_size_bytes) const vector_u8 bytes(size, 0, allocator_); uint8_t* ptr = bytes.data() + header_size_bytes; const uint8_t* end_ptr = ptr + size; - const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : get_preamble()); + const uint8_t preamble_ints(is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : get_preamble()); ptr += copy_to_mem(preamble_ints, ptr); const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1); ptr += copy_to_mem(serial_version, ptr); @@ -488,7 +488,9 @@ vector_u8 kll_sketch::serialize(unsigned header_size_bytes) const ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained()); } const size_t delta = ptr - bytes.data(); - if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size)); + if (delta != size) { + throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size)); + } return bytes; } @@ -687,7 +689,7 @@ kll_sketch::kll_sketch(uint16_t k, uint16_t preamble, uint16_t min_k std::unique_ptr max_value, bool is_level_zero_sorted): allocator_(levels.get_allocator()), k_(k), -preamble_(preamble), +preamble_(preamble == kll_constants::DEFAULT_INTS_SHORT ? kll_constants::DEFAULT_PREAMBLE : preamble), m_(DEFAULT_M), min_k_(min_k), n_(n), @@ -1016,9 +1018,9 @@ void kll_sketch::check_preamble_ints(uint8_t preamble_ints, uint8_t const bool is_empty(flags_byte & (1 << flags::IS_EMPTY)); const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); if (is_empty || is_single_item) { - if (preamble_ints != PREAMBLE_INTS_SHORT) { + if (preamble_ints != kll_constants::DEFAULT_INTS_SHORT) { throw std::invalid_argument("Possible corruption: preamble ints must be " - + std::to_string(PREAMBLE_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints)); + + std::to_string(kll_constants::DEFAULT_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints)); } } else { uint8_t expected_preamble_ints = resolve_preamble_ints(); diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp index de1e83a..b47e50c 100644 --- a/kll/test/kll_sketch_test.cpp +++ b/kll/test/kll_sketch_test.cpp @@ -711,6 +711,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") { REQUIRE(sketch1.get_preamble() == 5); sketch1.merge(single2); REQUIRE(sketch1.get_preamble() == 5); + REQUIRE(single2.get_preamble() == 2); REQUIRE(sketch1.get_min_value() == 0.0f); REQUIRE(sketch1.get_max_value() == 9.0f); @@ -718,6 +719,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") { REQUIRE(single_merge.get_max_value() == sketch1.get_max_value()); } + SECTION("double update override preamble to 2 size check") { + kll_double_sketch sketch1(200, 2, 0); + REQUIRE(sketch1.get_preamble() == 2); + sketch1.update(1); + sketch1.update(2); + std::stringstream s(std::ios::in | std::ios::out | std::ios::binary); + auto bytes = sketch1.serialize(); + REQUIRE(bytes.size() == 60); + } + SECTION("merge double override preamble") { kll_double_sketch sketch1(200, 5, 0); for (int i = 0; i < 10; i++) sketch1.update(static_cast(i)); diff --git a/python/src/kll_wrapper.cpp b/python/src/kll_wrapper.cpp index c58ae0b..f58f6cc 100644 --- a/python/src/kll_wrapper.cpp +++ b/python/src/kll_wrapper.cpp @@ -132,11 +132,13 @@ void kll_sketch_update(kll_sketch& sk, py::array_t(); + for (uint32_t i = 0; i < data.size(); ++i) { sk.update(data(i)); } + } template @@ -253,7 +255,7 @@ void bind_kll_sketch(py::module &m, const char* name) { //, const uint16_t pream "Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials") .def("serialize", &dspy::kll_sketch_serialize, "Serializes the sketch into a bytes object") .def_static("deserialize", &dspy::kll_sketch_deserialize, "Deserializes the sketch from a bytes object") - .def_static("float_to_doubles", &dspy::kll_float_to_doubles, "Float to doubles") + .def_static("float_to_doubles", &dspy::kll_float_to_doubles, "Float to doubles") ; } diff --git a/setup.py b/setup.py index ca48a15..aad778b 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ import os import sys -import sysconfig import platform import subprocess @@ -66,7 +65,7 @@ def build_extension(self, ext): setup( name='whylogs-sketching', - version='3.4.1.dev2', + version='3.4.1.dev3', author='whylogs team', author_email='support@whylabs.ai', description='sketching library of whylogs',