-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcli.py
executable file
·63 lines (47 loc) · 1.6 KB
/
cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
import hashlib
import click
import json
import os
from os import path
from subprocess import PIPE, Popen
import pandas as pd
from pyarrow.parquet import ParquetFile
# Compression codecs passed to DataFrame.to_parquet; one output subdir per codec.
COMPRESSORS = ['snappy', 'gzip', 'brotli', 'lz4', 'zstd']
# Parquet writer engines whose outputs are compared byte-for-byte.
ENGINES = ['pyarrow', 'fastparquet']
def run(df, compression, engine):
    """Serialize *df* to Parquet and record its fingerprint on disk.

    Writes three artifacts under ``out/<engine>/<compression>/``:
      - ``empty.parquet``: *df* written with the given compression and engine
      - ``metadata.json``: Parquet metadata dict, sha256 digest, and file size
      - ``xxd.txt``: hex dump of the Parquet file (via the ``xxd`` binary)

    Parameters:
        df: DataFrame to serialize (the CLI passes a zero-row frame).
        compression: one of COMPRESSORS, forwarded to ``to_parquet``.
        engine: one of ENGINES, forwarded to ``to_parquet``.

    Returns:
        The dict that was written to ``metadata.json``.

    Raises:
        RuntimeError: if ``xxd`` exits non-zero (e.g. not installed).
    """
    out_dir = f'out/{engine}/{compression}'
    os.makedirs(out_dir, exist_ok=True)
    parquet_path = path.join(out_dir, 'empty.parquet')
    json_path = path.join(out_dir, 'metadata.json')
    xxd_path = path.join(out_dir, 'xxd.txt')
    df.to_parquet(parquet_path, compression=compression, engine=engine)
    # Compute sha256sum; buffering=0 gives file_digest the raw binary stream.
    with open(parquet_path, 'rb', buffering=0) as f:
        sha256sum = hashlib.file_digest(f, 'sha256').hexdigest()
    # Load Parquet footer metadata via pyarrow.
    metadata_dict = ParquetFile(parquet_path).metadata.to_dict()
    # On-disk size of the Parquet file.
    size = os.stat(parquet_path).st_size
    output = {
        'metadata': metadata_dict,
        'sha256sum': sha256sum,
        'size': size,
    }
    with open(json_path, 'w') as f:
        json.dump(output, f, indent=2)
    # Hex-dump the file. The original discarded stderr and ignored the exit
    # code, so a missing/failing xxd silently produced an empty dump; fail
    # loudly instead.
    proc = Popen(['xxd', parquet_path], stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(
            f'xxd failed with code {proc.returncode}: '
            f'{stderr.decode("utf-8", "replace")}'
        )
    with open(xxd_path, 'w') as f:
        f.write(stdout.decode('utf-8'))
    return output
@click.command('parquet-diff-test')
def main():
    """Generate empty-Parquet diff artifacts for every codec/engine pair."""
    # Build a one-row frame, then slice it to zero rows so the 'a' column
    # keeps its integer dtype in the empty output.
    template = pd.DataFrame([{ 'a': 111 }])
    empty = template.head(0)
    for codec in COMPRESSORS:
        for eng in ENGINES:
            run(empty, codec, eng)


if __name__ == '__main__':
    main()