From 0d8c2ca4e5429652f23a85e1d49d16b043203ee6 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 27 May 2024 10:08:36 -0400 Subject: [PATCH 1/3] See what happens if we don't track thrift i32 --- fastparquet/cencoding.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fastparquet/cencoding.pyx b/fastparquet/cencoding.pyx index 64f54174..44356b7b 100644 --- a/fastparquet/cencoding.pyx +++ b/fastparquet/cencoding.pyx @@ -524,9 +524,6 @@ cpdef dict read_thrift(NumpyIO data): cdef char byte, id = 0, bit cdef int32_t size cdef dict out = {} - cdef bint hasi64 = 0 - cdef bint hasi32 = 0 - cdef list i32 = None while True: byte = data.read_byte() if byte == 0: @@ -536,12 +533,8 @@ cpdef dict read_thrift(NumpyIO data): if bit == 5: out[id] = zigzag_long(read_unsigned_var_int(data)) hasi32 = True - if i32 is None: - i32 = list() - i32.append(id) elif bit == 6: out[id] = zigzag_long(read_unsigned_var_int(data)) - hasi64 = True elif bit == 7: out[id] = data.get_pointer()[0] data.seek(8, 1) @@ -565,11 +558,6 @@ cpdef dict read_thrift(NumpyIO data): out[id] = data.read_byte() else: print("Corrupted thrift data at ", data.tell(), ": ", id, bit) - if hasi32: - if hasi64: - out["i32list"] = i32 - else: - out["i32"] = 1 return out From 9ac836ea6953df4514319805103fce334ba17dc0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 27 May 2024 11:14:29 -0400 Subject: [PATCH 2/3] one more --- fastparquet/cencoding.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/fastparquet/cencoding.pyx b/fastparquet/cencoding.pyx index 44356b7b..5f79ba85 100644 --- a/fastparquet/cencoding.pyx +++ b/fastparquet/cencoding.pyx @@ -532,7 +532,6 @@ cpdef dict read_thrift(NumpyIO data): bit = byte & 0b00001111 if bit == 5: out[id] = zigzag_long(read_unsigned_var_int(data)) - hasi32 = True elif bit == 6: out[id] = zigzag_long(read_unsigned_var_int(data)) elif bit == 7: From 40ea4c077d75746dfb4534ee0a1f5b25a2c17319 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 27 May 2024 
14:49:25 -0400 Subject: [PATCH 3/3] small wins --- fastparquet/cencoding.pyx | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fastparquet/cencoding.pyx b/fastparquet/cencoding.pyx index 5f79ba85..2f7ac41c 100644 --- a/fastparquet/cencoding.pyx +++ b/fastparquet/cencoding.pyx @@ -176,7 +176,7 @@ cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj): cdef uint64_t result = 0 cdef int32_t shift = 0 cdef char byte - cdef char * inptr = file_obj.get_pointer() + cdef char * inptr = file_obj.ptr + file_obj.loc # file_obj.get_pointer() while True: byte = inptr[0] @@ -185,7 +185,7 @@ cpdef uint64_t read_unsigned_var_int(NumpyIO file_obj): if (byte & 0x80) == 0: break shift += 7 - file_obj.loc += inptr - file_obj.get_pointer() + file_obj.loc += inptr - (file_obj.ptr + file_obj.loc) return result @@ -222,7 +222,9 @@ cdef void delta_read_bitpacked(NumpyIO file_obj, uint8_t bitwidth, uint64_t mask = 0XFFFFFFFFFFFFFFFF >> (64 - bitwidth) while count > 0: if (left - right) < bitwidth: - data = data | (file_obj.read_byte() << left) + # data = data | (file_obj.read_byte() << left) + data = data | (file_obj.ptr[file_obj.loc] << left) + file_obj.loc += 1 left += 8 elif right > 8: data >>= 8 @@ -525,7 +527,10 @@ cpdef dict read_thrift(NumpyIO data): cdef int32_t size cdef dict out = {} while True: - byte = data.read_byte() + # byte = data.read_byte() + byte = data.ptr[data.loc] + data.loc += 1 + if byte == 0: break id += (byte & 0b11110000) >> 4 @@ -554,7 +559,9 @@ cpdef dict read_thrift(NumpyIO data): out[id] = zigzag_long(read_unsigned_var_int(data)) elif bit == 3: # I8 - out[id] = data.read_byte() + # out[id] = data.read_byte() + out[id] = data.ptr[data.loc] + data.loc += 1 else: print("Corrupted thrift data at ", data.tell(), ": ", id, bit) return out @@ -563,7 +570,10 @@ cpdef dict read_thrift(NumpyIO data): cdef list read_list(NumpyIO data): cdef unsigned char byte, typ cdef int32_t size, bsize, _ - byte = data.read_byte() + # byte = 
data.read_byte() + byte = data.ptr[data.loc] + data.loc += 1 + if byte >= 0xf0: # 0b11110000 size = read_unsigned_var_int(data) else: @@ -577,8 +587,8 @@ cdef list read_list(NumpyIO data): for _ in range(size): # all parquet list types contain str, not bytes bsize = read_unsigned_var_int(data) - out.append(PyUnicode_DecodeUTF8(data.get_pointer(), bsize, "ignore")) - data.seek(bsize, 1) + out.append(PyUnicode_DecodeUTF8(data.ptr + data.loc, bsize, "ignore")) + data.loc += bsize else: for _ in range(size): out.append(read_thrift(data))