Skip to content

Commit

Permalink
Don't serialize empty or single preamble (#15)
Browse files Browse the repository at this point in the history
* Don't serialize or allow overriding preamble bits to empty or single format

* Remove preamble check to show serialize exception without fix
  • Loading branch information
jamie256 authored Aug 31, 2022
1 parent 67bf3eb commit ea02fd1
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
env:
CIBW_PRERELEASE_PYTHONS: True # enable Python 3.11
CIBW_SKIP: "*-win32"
CIBW_BUILD: "cp37-*64 cp38-*64 cp39-*64 cp310-*64 cp311-*64"
CIBW_BUILD: "cp36-*64 cp37-*64 cp38-*64 cp39-*64 cp310-*64 cp311-*64"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_ARCHS_MACOS: "x86_64 universal2"
CIBW_ARCHS_LINUX: ${{ matrix.config.arch }}
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ DEB, STGZ, TGZ, TZ, ZIP, etc.
$ cmake3 --build build/Release -t package
```

To generate python the wheels locally, after building the c++ parts of the project described above you can run standard python commands to setup the wheel file e.g.:
```
python setup.py bdist_wheel
```

The DataSketches project can be included in other projects' CMakeLists.txt files in one of two ways.
If DataSketches has been installed on the host (using an RPM, DEB, "make install" into /usr/local, or some
way, then CMake's `find_package` command can be used like this:
Expand Down
6 changes: 1 addition & 5 deletions kll/include/kll_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ template<typename A> using vector_d = std::vector<double, AllocD<A>>;
namespace kll_constants {
const uint16_t DEFAULT_K = 200;
const uint16_t DEFAULT_PREAMBLE = 0;
const uint16_t DEFAULT_INTS_SHORT = 2;
const uint16_t DEFAULT_PREAMBLE_FLOAT = 5;
const uint16_t DEFAULT_PREAMBLE_DOUBLE = 6;
const uint16_t DEFAULT_PREAMBLE_INT = 7;
Expand Down Expand Up @@ -547,11 +548,6 @@ class kll_sketch {

enum flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM };

static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
static const uint8_t PREAMBLE_INTS_FULL = 5;



// for deserialization
class item_deleter;
class items_deleter;
Expand Down
22 changes: 12 additions & 10 deletions kll/include/kll_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ void kll_sketch<T, C, S, A>::merge(FwdSk&& other) {
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
}
if (get_preamble() != other.get_preamble()
&& get_preamble() != PREAMBLE_INTS_SHORT
&& other.get_preamble() != PREAMBLE_INTS_SHORT) {
&& get_preamble() != kll_constants::DEFAULT_INTS_SHORT
&& other.get_preamble() != kll_constants::DEFAULT_INTS_SHORT) {
throw std::invalid_argument("incompatible preambles: " + std::to_string(get_preamble())
+ " and " + std::to_string(other.get_preamble()));
}
Expand Down Expand Up @@ -224,11 +224,11 @@ uint16_t kll_sketch<T, C, S, A>::get_k() const {

template<typename T, typename C, typename S, typename A>
uint16_t kll_sketch<T, C, S, A>::get_preamble() const {
if (preamble_ != kll_constants::DEFAULT_PREAMBLE) {
if (preamble_ != kll_constants::DEFAULT_PREAMBLE && preamble_ != kll_constants::DEFAULT_INTS_SHORT) {
return preamble_;
}
const bool is_single_item = n_ == 1;
return is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : resolve_preamble_ints();
return is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : resolve_preamble_ints();
}

template<typename T, typename C, typename S, typename A>
Expand Down Expand Up @@ -411,7 +411,7 @@ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_
template<typename T, typename C, typename S, typename A>
void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
const bool is_single_item = n_ == 1;
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : get_preamble());
const uint8_t preamble_ints(is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : get_preamble());
write(os, preamble_ints);
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
write(os, serial_version);
Expand Down Expand Up @@ -454,7 +454,7 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
vector_u8<A> bytes(size, 0, allocator_);
uint8_t* ptr = bytes.data() + header_size_bytes;
const uint8_t* end_ptr = ptr + size;
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : get_preamble());
const uint8_t preamble_ints(is_empty() || is_single_item ? kll_constants::DEFAULT_INTS_SHORT : get_preamble());
ptr += copy_to_mem(preamble_ints, ptr);
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
ptr += copy_to_mem(serial_version, ptr);
Expand Down Expand Up @@ -488,7 +488,9 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
}
const size_t delta = ptr - bytes.data();
if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
if (delta != size) {
throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
}
return bytes;
}

Expand Down Expand Up @@ -687,7 +689,7 @@ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint16_t preamble, uint16_t min_k
std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
allocator_(levels.get_allocator()),
k_(k),
preamble_(preamble),
preamble_(preamble == kll_constants::DEFAULT_INTS_SHORT ? kll_constants::DEFAULT_PREAMBLE : preamble),
m_(DEFAULT_M),
min_k_(min_k),
n_(n),
Expand Down Expand Up @@ -1016,9 +1018,9 @@ void kll_sketch<T, C, S, A>::check_preamble_ints(uint8_t preamble_ints, uint8_t
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM));
if (is_empty || is_single_item) {
if (preamble_ints != PREAMBLE_INTS_SHORT) {
if (preamble_ints != kll_constants::DEFAULT_INTS_SHORT) {
throw std::invalid_argument("Possible corruption: preamble ints must be "
+ std::to_string(PREAMBLE_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints));
+ std::to_string(kll_constants::DEFAULT_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints));
}
} else {
uint8_t expected_preamble_ints = resolve_preamble_ints();
Expand Down
11 changes: 11 additions & 0 deletions kll/test/kll_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -711,13 +711,24 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
REQUIRE(sketch1.get_preamble() == 5);
sketch1.merge(single2);
REQUIRE(sketch1.get_preamble() == 5);
REQUIRE(single2.get_preamble() == 2);

REQUIRE(sketch1.get_min_value() == 0.0f);
REQUIRE(sketch1.get_max_value() == 9.0f);
REQUIRE(single_merge.get_min_value() == sketch1.get_min_value());
REQUIRE(single_merge.get_max_value() == sketch1.get_max_value());
}

SECTION("double update override preamble to 2 size check") {
kll_double_sketch sketch1(200, 2, 0);
REQUIRE(sketch1.get_preamble() == 2);
sketch1.update(1);
sketch1.update(2);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
auto bytes = sketch1.serialize();
REQUIRE(bytes.size() == 60);
}

SECTION("merge double override preamble") {
kll_double_sketch sketch1(200, 5, 0);
for (int i = 0; i < 10; i++) sketch1.update(static_cast<float>(i));
Expand Down
6 changes: 4 additions & 2 deletions python/src/kll_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,13 @@ void kll_sketch_update(kll_sketch<T>& sk, py::array_t<T, py::array::c_style | py
throw std::invalid_argument("input data must have only one dimension. Found: "
+ std::to_string(items.ndim()));
}

auto data = items.template unchecked<1>();

for (uint32_t i = 0; i < data.size(); ++i) {
sk.update(data(i));
}

}

template<typename T>
Expand Down Expand Up @@ -253,7 +255,7 @@ void bind_kll_sketch(py::module &m, const char* name) { //, const uint16_t pream
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
.def("serialize", &dspy::kll_sketch_serialize<T>, "Serializes the sketch into a bytes object")
.def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
.def_static("float_to_doubles", &dspy::kll_float_to_doubles, "Float to doubles")
.def_static("float_to_doubles", &dspy::kll_float_to_doubles, "Float to doubles")
;
}

Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import sys
import sysconfig
import platform
import subprocess

Expand Down Expand Up @@ -66,7 +65,7 @@ def build_extension(self, ext):

setup(
name='whylogs-sketching',
version='3.4.1.dev2',
version='3.4.1.dev3',
author='whylogs team',
author_email='[email protected]',
description='sketching library of whylogs',
Expand Down

0 comments on commit ea02fd1

Please sign in to comment.