Skip to content

Commit

Permalink
normalize chunk sizes according to fastcdc algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
kszucs committed Jan 30, 2025
1 parent 1b7fb93 commit c7a0b3a
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 96 deletions.
1 change: 1 addition & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ add_parquet_test(reader-test

add_parquet_test(writer-test
SOURCES
column_chunker_test.cc
column_writer_test.cc
file_serialize_test.cc
stream_writer_test.cc)
Expand Down
178 changes: 98 additions & 80 deletions cpp/src/parquet/column_chunker.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,77 +27,74 @@ using arrow::internal::checked_cast;
namespace parquet {
namespace internal {

// Constants
const uint64_t GEAR_HASH_TABLE[] = {
0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777,
0x368f573e8b7a31b7, 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb,
0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad,
0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, 0x99b07edc1570ad0f,
0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd,
0xf19f87ab86ad7436, 0xf2129fbfbe6bc736, 0x481149575c98a4ed, 0x0000010695477bc5,
0x1fba37801a9ceacc, 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3,
0xe4accf9e6211f420, 0x2520e71f87579071, 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873,
0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, 0x86a6e5da1b09c2b1,
0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3,
0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079,
0x484f7e9c97b2e199, 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8,
0x000070940d87955a, 0x8ae69108139e626f, 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf,
0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, 0xb0b219e6977d4c47,
0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe,
0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687,
0x755a99374f4a5b07, 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8,
0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, 0xec550255e6641b44, 0x78fb94a8449c14c6,
0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, 0x14675f0b48ea4144,
0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8,
0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c,
0x228d21f6ad450890, 0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58,
0x0c10ca932b3c0deb, 0x2727fee884afed7b, 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523,
0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, 0xc2861181ddf18959,
0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083,
0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1,
0xbee1797174e22416, 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15,
0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635,
0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, 0xf67a02bd8784b54f,
0x696f11dd67e65063, 0x00002022fca814ab, 0x8cd6be912db9d852, 0x695189b6e9ae8a57,
0xee9453b50ada0c28, 0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5,
0x267310178e08a22e, 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06,
0xa315f5ebfb706d26, 0x8816c34e3301bace, 0xe9395b9cbb71fdae, 0x002ce9202e721648,
0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, 0xb8e0be4039fbc47c,
0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef,
0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489,
0x8a1872a22b01f584, 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313,
0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, 0x762ebf3759d75a5b, 0x207bfe823d693975,
0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, 0xb27b1a29fc5e7816,
0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85,
0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01,
0x77f8ae30ac277c5d, 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b,
0xf3c40afa60de1104, 0x2063127aa59167c3, 0x621de62269d1894d, 0xd188ac1de62b4726,
0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, 0xd9d6de6611b9f602,
0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e,
0x441364d354071836, 0x0082b36c75f2983e, 0xb145910316fa66f0, 0x021c069c9847caf7,
0x2910dfc75a4b5221, 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65,
0xf05086a71257941b, 0xfec3b215d351cead, 0x00ae1055e0144202, 0xf54b40846f42e454,
0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, 0x39ce4957a5e5d8d4,
0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38,
0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a,
0xc22e770f4531689d, 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e,
0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5,
0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, 0x41fce516cd88f299,
0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 0x8b8f5e8343b6ad56,
0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b,
0xda3f90178401b18e, 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9,
0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, 0xecef1f410033e78a, 0x0024c2b274ac72cb,
0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, 0xc986e3c76178739b,
0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb,
0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87,
0x84321e13b9bbc816, 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8,
0x00004f63381b10c3, 0x07d5b7816fcc4e10, 0xe5a536726a6a8155, 0x57afb23447a07fdd,
0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, 0x63c7a906c1dd187b};
// Gear lookup table for the rolling hash: 256 64-bit entries, one per possible
// byte value. Roll() consumes input one byte at a time, shifts the running hash
// left by one bit and adds the entry selected by that byte.
// NOTE(review): the entries look like arbitrary pseudo-random constants; their
// origin is not visible here, and changing any of them would presumably change
// where chunk boundaries fall -- confirm before editing.
const uint64_t GEAR_TABLE[256] = {
0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7,
0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649,
0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226,
0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14,
0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6,
0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb,
0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98,
0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99,
0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01,
0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f,
0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915,
0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2,
0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565,
0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968,
0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116,
0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2,
0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040,
0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3,
0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651,
0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc,
0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666,
0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3,
0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163,
0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6,
0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0,
0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba,
0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9,
0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55,
0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52,
0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda,
0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e,
0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a,
0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba,
0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748,
0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e,
0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69,
0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0,
0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b,
0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5,
0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362,
0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845,
0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2,
0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71,
0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f,
0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd,
0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78,
0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa,
0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2,
0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482,
0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627,
0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b,
0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe,
0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87,
0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81,
0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e,
0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea,
0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2,
0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06,
0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c,
0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042,
0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6,
0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb,
0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780,
0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1};

// Default content-defined-chunking parameters. The chunker constructor falls
// back to these whenever the corresponding argument is passed as 0. The
// *_LEN values are byte counts: Roll() compares them against a running total
// of the sizes (in bytes) of the values it has consumed.

// Default boundary mask: a position matches when (hash & mask) == 0.
// NOTE(review): the literal has 15 hex digits, so the set bits are 44..59
// rather than the top 16 bits of the word -- confirm this is intended.
const uint64_t MASK = 0xffff00000000000;
// Earlier, much smaller limits left commented out:
// const int MIN_LEN = 65536 / 8;
// const int MAX_LEN = 65536 * 2;
// Default minimum chunk length (256 KiB); Roll() reports no boundary while
// the running chunk size is below the minimum.
const int64_t MIN_LEN = 256 * 1024;
// Default average chunk length (1 MiB); Roll() switches from the larger mask
// to the smaller one once the running chunk size reaches the average.
const int64_t AVG_LEN = 1 * 1024 * 1024;
// Default maximum chunk length (2 MiB).
const int64_t MAX_LEN = 2 * 1024 * 1024;

// create a fake null array class with a GetView method returning 0 always
Expand All @@ -110,26 +107,40 @@ class FakeNullArray {
int64_t null_count() const { return 0; }
};

class GearHash {
// Builds a chunk-boundary mask: a contiguous run of set bits anchored at the
// most significant end of a 64-bit word. A hash position counts as a boundary
// when (hash & mask) == 0, so every extra set bit halves the match probability.
//
// \param avg_len         target average chunk length; floor(log2(avg_len)) is
//                        the base number of mask bits (0 and 1 both give 0 bits)
// \param bit_adjustment  signed number of bits added to (positive, stricter
//                        mask) or removed from (negative, looser mask) the base
// \return the mask; 0 when the adjusted width is <= 0 (every position matches)
//         and all ones when it is >= 64
//
// The adjustment is deliberately a signed int: callers pass both
// +normalization_level and -normalization_level, and a negative value squeezed
// into an unsigned 8-bit parameter wraps to a large positive one (-1 -> 255),
// which would turn the shifts below into undefined behaviour.
static uint64_t GetMask(uint64_t avg_len, int bit_adjustment) {
  // Integer floor(log2(avg_len)). Exact for every 64-bit input and well defined
  // for avg_len == 0, unlike a round trip through floating-point log2.
  int mask_bits = 0;
  for (uint64_t remaining = avg_len; remaining > 1; remaining >>= 1) {
    ++mask_bits;
  }

  const int effective_bits = mask_bits + bit_adjustment;
  // Shifting a 64-bit value by a negative amount or by >= 64 is undefined, so
  // the degenerate widths are answered directly instead of being computed.
  if (effective_bits <= 0) {
    return 0;
  }
  if (effective_bits >= 64) {
    return ~uint64_t{0};
  }
  return ((uint64_t{1} << effective_bits) - 1) << (64 - effective_bits);
}

class FastCDC {
public:
GearHash(const LevelInfo& level_info, uint64_t mask, uint64_t min_len, uint64_t max_len)
FastCDC(const LevelInfo& level_info, uint64_t min_len, uint64_t avg_len,
uint64_t max_len, uint8_t normalization_level = 1)
: level_info_(level_info),
mask_(mask == 0 ? MASK : mask),
min_len_(min_len == 0 ? MIN_LEN : min_len),
max_len_(max_len == 0 ? MAX_LEN : max_len) {}
avg_len_(avg_len == 0 ? AVG_LEN : avg_len),
max_len_(max_len == 0 ? MAX_LEN : max_len),
mask_s_(GetMask(avg_len_, -normalization_level)),
mask_l_(GetMask(avg_len_, +normalization_level)) {}

template <typename T>
bool Roll(const T value) {
constexpr size_t BYTE_WIDTH = sizeof(T);
chunk_size_ += BYTE_WIDTH;
uint64_t mask;
if (chunk_size_ < min_len_) {
return false;
} else if (chunk_size_ < avg_len_) {
mask = mask_l_;
} else {
mask = mask_s_;
}
auto bytes = reinterpret_cast<const uint8_t*>(&value);
bool match = false;
for (size_t i = 0; i < BYTE_WIDTH; ++i) {
hash_ = (hash_ << 1) + GEAR_HASH_TABLE[bytes[i]];
if ((hash_ & mask_) == 0) {
hash_ = (hash_ << 1) + GEAR_TABLE[bytes[i]];
if ((hash_ & mask) == 0) {
match = true;
}
}
Expand All @@ -138,13 +149,18 @@ class GearHash {

bool Roll(std::string_view value) {
chunk_size_ += value.size();
uint64_t mask;
if (chunk_size_ < min_len_) {
return false;
} else if (chunk_size_ < avg_len_) {
mask = mask_l_;
} else {
mask = mask_s_;
}
bool match = false;
for (char c : value) {
hash_ = (hash_ << 1) + GEAR_HASH_TABLE[static_cast<uint8_t>(c)];
if ((hash_ & mask_) == 0) {
hash_ = (hash_ << 1) + GEAR_TABLE[static_cast<uint8_t>(c)];
if ((hash_ & mask) == 0) {
match = true;
}
}
Expand Down Expand Up @@ -303,9 +319,11 @@ class GearHash {

private:
const internal::LevelInfo& level_info_;
uint64_t mask_ = MASK;
uint64_t min_len_;
uint64_t max_len_;
const uint64_t min_len_;
const uint64_t avg_len_;
const uint64_t max_len_;
const uint64_t mask_s_;
const uint64_t mask_l_;
uint64_t hash_ = 0;
uint64_t chunk_size_ = 0;
};
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/parquet/column_chunker_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
6 changes: 3 additions & 3 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -754,8 +754,8 @@ class ColumnWriterImpl {
fallback_(false),
definition_levels_sink_(allocator_),
repetition_levels_sink_(allocator_),
content_defined_chunker_(level_info_, properties->cdc_mask(),
properties->cdc_min_size(), properties->cdc_max_size()) {
content_defined_chunker_(level_info_, properties->cdc_min_size(),
properties->cdc_avg_size(), properties->cdc_max_size()) {
definition_levels_rle_ =
std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
repetition_levels_rle_ =
Expand Down Expand Up @@ -895,7 +895,7 @@ class ColumnWriterImpl {

std::vector<std::unique_ptr<DataPage>> data_pages_;

internal::GearHash content_defined_chunker_;
internal::FastCDC content_defined_chunker_;

private:
void InitSinks() {
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ class PARQUET_EXPORT WriterProperties {
page_checksum_enabled_(false),
size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
cdc_enabled_(false),
cdc_mask_(0),
cdc_avg_size_(0),
cdc_min_size_(0),
cdc_max_size_(0) {}

Expand Down Expand Up @@ -297,8 +297,8 @@ class PARQUET_EXPORT WriterProperties {
return this;
}

Builder* cdc_mask(uint64_t mask) {
cdc_mask_ = mask;
Builder* cdc_avg_size(uint64_t avg_size) {
cdc_avg_size_ = avg_size;
return this;
}

Expand Down Expand Up @@ -734,8 +734,8 @@ class PARQUET_EXPORT WriterProperties {
pagesize_, version_, created_by_, page_checksum_enabled_,
size_statistics_level_, std::move(file_encryption_properties_),
default_column_properties_, column_properties, data_page_version_,
store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, cdc_mask_,
cdc_min_size_, cdc_max_size_));
store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_,
cdc_avg_size_, cdc_min_size_, cdc_max_size_));
}

private:
Expand Down Expand Up @@ -766,7 +766,7 @@ class PARQUET_EXPORT WriterProperties {
std::unordered_map<std::string, bool> page_index_enabled_;

bool cdc_enabled_;
uint64_t cdc_mask_;
uint64_t cdc_avg_size_;
uint64_t cdc_min_size_;
uint64_t cdc_max_size_;
};
Expand Down Expand Up @@ -794,7 +794,7 @@ class PARQUET_EXPORT WriterProperties {
inline bool page_checksum_enabled() const { return page_checksum_enabled_; }

inline bool cdc_enabled() const { return cdc_enabled_; }
inline uint64_t cdc_mask() const { return cdc_mask_; }
inline uint64_t cdc_avg_size() const { return cdc_avg_size_; }
inline uint64_t cdc_min_size() const { return cdc_min_size_; }
inline uint64_t cdc_max_size() const { return cdc_max_size_; }

Expand Down Expand Up @@ -900,7 +900,7 @@ class PARQUET_EXPORT WriterProperties {
const ColumnProperties& default_column_properties,
const std::unordered_map<std::string, ColumnProperties>& column_properties,
ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
std::vector<SortingColumn> sorting_columns, bool cdc_enabled, uint64_t cdc_mask,
std::vector<SortingColumn> sorting_columns, bool cdc_enabled, uint64_t cdc_avg_size,
uint64_t cdc_min_size, uint64_t cdc_max_size)
: pool_(pool),
dictionary_pagesize_limit_(dictionary_pagesize_limit),
Expand All @@ -918,7 +918,7 @@ class PARQUET_EXPORT WriterProperties {
default_column_properties_(default_column_properties),
column_properties_(column_properties),
cdc_enabled_(cdc_enabled),
cdc_mask_(cdc_mask),
cdc_avg_size_(cdc_avg_size),
cdc_min_size_(cdc_min_size),
cdc_max_size_(cdc_max_size) {}

Expand All @@ -942,7 +942,7 @@ class PARQUET_EXPORT WriterProperties {
std::unordered_map<std::string, ColumnProperties> column_properties_;

bool cdc_enabled_;
uint64_t cdc_mask_;
uint64_t cdc_avg_size_;
uint64_t cdc_min_size_;
uint64_t cdc_max_size_;
};
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
Builder* disable_page_checksum()
Builder* enable_cdc()
Builder* disable_cdc()
Builder* cdc_mask(uint64_t mask)
Builder* cdc_avg_size(uint64_t avg_size)
Builder* cdc_min_size(uint64_t min_size)
Builder* cdc_max_size(uint64_t max_size)
shared_ptr[WriterProperties] build()
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2006,9 +2006,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
elif content_defined_chunking is True:
props.enable_cdc()
elif isinstance(content_defined_chunking, tuple):
mask, min_size, max_size = content_defined_chunking
min_size, avg_size, max_size = content_defined_chunking
props.enable_cdc()
props.cdc_mask(mask)
props.cdc_avg_size(avg_size)
props.cdc_min_size(min_size)
props.cdc_max_size(max_size)
else:
Expand Down

0 comments on commit c7a0b3a

Please sign in to comment.