Added python scripts used to generate test/fixtures data for reader
1 parent d943f39 · commit b2d4455
Showing 9 changed files with 891 additions and 0 deletions.
src/lib/parquet/resources/python/.gitignore
@@ -0,0 +1,3 @@
output
!output/.gitkeep
parquet
src/lib/parquet/resources/python/README.md
@@ -0,0 +1,27 @@
# Test Data Generators

This directory contains scripts to generate test data for the Flow PHP Parquet reader/writer.

### Prerequisites

- Python 3.x installed
- pip installed (the Python package installer)

### Installation

First, go to the `src/lib/parquet/resources/python` directory and run the following commands to install the required dependencies:

```shell
python3 -m venv parquet
source parquet/bin/activate
pip install -r requirements.txt
```

Once all dependencies are installed, run the following commands to generate the test data:

```shell
python generators/lists.py
python generators/maps.py
python generators/orders.py
python generators/primitives.py
python generators/structs.py
```
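After generating the fixtures, it is worth a quick sanity check that each file loads back with the expected schema. A minimal sketch (not part of the commit; it assumes the generators above were run from this directory, and that pyarrow was installed via requirements.txt):

```python
import pyarrow.parquet as pq

# Read each generated fixture back and print its row count and schema.
for name in ('lists', 'maps', 'orders'):
    table = pq.read_table(f'output/{name}.parquet')
    print(name, '-', table.num_rows, 'rows')
    print(table.schema)
```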
src/lib/parquet/resources/python/generators/lists.py
@@ -0,0 +1,78 @@ | ||
import pandas as pd | ||
import random | ||
import os | ||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
|
||
# Number of rows to generate | ||
n_rows = 100 | ||
|
||
# Functions to generate the data | ||
def generate_list_nested(): | ||
return [ | ||
[ | ||
[ | ||
random.randint(1, 10) for _ in range(random.randint(1, 3)) | ||
] for _ in range(random.randint(1, 3)) | ||
] for _ in range(random.randint(1, 3)) | ||
] | ||
|
||
# Columns | ||
list_col = pd.Series([[random.randint(1, 10) for _ in range(3)] for _ in range(n_rows)], dtype='object') | ||
list_nullable_col = pd.Series([[random.randint(1, 10) for _ in range(3)] if i % 2 == 0 else None for i in range(n_rows)], dtype='object') | ||
list_mixed_types_col = pd.Series([ | ||
[ | ||
{'int': i, 'string': None, 'bool': None}, | ||
{'int': None, 'string': "string_" + str(i), 'bool': None}, | ||
{'int': None, 'string': None, 'bool': bool(i % 2)}, | ||
{'int': None, 'string': None, 'bool': None} | ||
] for i in range(n_rows) | ||
], dtype='object') | ||
list_nested_col = pd.Series([generate_list_nested() for _ in range(n_rows)], dtype='object') | ||
|
||
# Creating the DataFrame with only the new column | ||
df_nested_list = pd.DataFrame({ | ||
'list': list_col, | ||
'list_nullable': list_nullable_col, | ||
'list_mixed_types': list_mixed_types_col, | ||
'list_nested': list_nested_col | ||
}) | ||
|
||
# Types | ||
list_type = pa.list_(pa.int32()) | ||
list_mixed_type = pa.list_( | ||
pa.struct([ | ||
pa.field('int', pa.int32()), | ||
pa.field('string', pa.string()), | ||
pa.field('bool', pa.bool_()) | ||
]) | ||
) | ||
list_nested_type = pa.list_(pa.list_(pa.list_(pa.int32()))) | ||
|
||
# Define the schema | ||
schema = pa.schema([ | ||
('list', list_type), | ||
('list_nullable', list_type), | ||
('list_mixed_types', list_mixed_type), | ||
('list_nested', list_nested_type), | ||
]) | ||
|
||
parquet_file = 'output/lists.parquet' | ||
# Create a PyArrow Table | ||
table = pa.Table.from_pandas(df_nested_list, schema=schema) | ||
|
||
# Check if the file exists and remove it | ||
if os.path.exists(parquet_file): | ||
os.remove(parquet_file) | ||
|
||
# Write the PyArrow Table to a Parquet file | ||
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer: | ||
writer.write_table(table) | ||
|
||
pd.set_option('display.max_columns', None) # Show all columns | ||
pd.set_option('display.max_rows', None) # Show all rows | ||
pd.set_option('display.width', None) # Auto-detect the width for displaying | ||
pd.set_option('display.max_colwidth', None) # Show complete text in each cell | ||
|
||
# Show the first few rows of the DataFrame for verification | ||
print(df_nested_list.head(10)) |
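Parquet list elements share a single physical type, so `list_mixed_types` emulates mixed values with a struct of nullable `int`/`string`/`bool` fields, at most one of which is set per element. A quick way to see the shape a reader has to handle (a sketch, not part of the commit; assumes `lists.py` above has been run):

```python
import pyarrow.parquet as pq

# Read back only the column of interest and inspect the first row:
# each element is a struct with two of its three fields left null.
table = pq.read_table('output/lists.parquet', columns=['list_mixed_types'])
print(table.column('list_mixed_types')[0])
```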
src/lib/parquet/resources/python/generators/maps.py
@@ -0,0 +1,193 @@
import pandas as pd
import random
import os
import pyarrow as pa
import pyarrow.parquet as pq
import sys

pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.width', None)         # Auto-detect the width for displaying
pd.set_option('display.max_colwidth', None)  # Show complete text in each cell

# Number of rows to generate
n_rows = 100

# Functions to generate the data
def generate_map_of_maps():
    return {
        f'outer_key_{i}': {
            f'inner_key_{j}': random.randint(1, 10)
            for j in range(random.randint(1, 3))
        }
        for i in range(random.randint(1, 3))
    }

def generate_map_complex_nested_list():
    return [
        [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}'
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    ]

def generate_map_of_lists():
    return {f'key_{i}': [random.randint(1, 10) for _ in range(random.randint(1, 3))] for i in range(random.randint(1, 3))}

def generate_map_of_complex_lists():
    return {
        f'key_{i}': [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}',
                'bool': bool(random.getrandbits(1))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_list_of_map_of_lists():
    return {
        f'key_{i}': [
            {
                f'string_{i}_{j}_{k}': [random.randint(1, 10) for _ in range(random.randint(1, 3))]
                for k in range(random.randint(1, 3))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_structs():
    map_of_structs_data = []
    for i in range(n_rows):
        # Generate a map where each value is a struct with an Int32 and a String field
        map_of_structs_value = {
            f'key_{j}': {
                'int_field': j,
                'string_field': f'string_{j}'
            } for j in range(3)
        }
        map_of_structs_data.append(map_of_structs_value)
    return map_of_structs_data

def generate_map_of_struct_of_structs(n_rows):
    map_of_struct_of_structs_data = []  # List to hold all the data
    for i in range(n_rows):
        map_of_struct_of_structs_value = {
            f'key_{j}': {
                'struct': {
                    'nested_struct': {
                        'int': random.randint(1, 100),
                        'string': f'string_{j}'
                    }
                }
            } for j in range(3)  # Create 3 key-value pairs in each map
        }
        map_of_struct_of_structs_data.append(map_of_struct_of_structs_value)
    return map_of_struct_of_structs_data

# Columns
map_col = [{"key_" + str(i): i} for i in range(n_rows)]
map_nullable_col = pd.Series([{"key_" + str(i): i} if i % 2 == 0 else None for i in range(n_rows)], dtype='object')
map_of_maps_col = pd.Series([generate_map_of_maps() for _ in range(n_rows)], dtype='object')
map_of_lists_col = pd.Series([generate_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_complex_lists_col = pd.Series([generate_map_of_complex_lists() for _ in range(n_rows)], dtype='object')
map_of_list_of_map_of_lists_col = pd.Series([generate_map_of_list_of_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_structs_col = generate_map_of_structs()
map_of_struct_of_structs_col = generate_map_of_struct_of_structs(n_rows)

# Create the DataFrame with all map columns
df_nested_list = pd.DataFrame({
    'map': map_col,
    'map_nullable': map_nullable_col,
    'map_of_maps': map_of_maps_col,
    'map_of_lists': map_of_lists_col,
    'map_of_complex_lists': map_of_complex_lists_col,
    'map_of_list_of_map_of_lists': map_of_list_of_map_of_lists_col,
    'map_of_structs': map_of_structs_col,
    'map_of_struct_of_structs': map_of_struct_of_structs_col
})

# Types
map_type = pa.map_(pa.string(), pa.int32())
map_of_maps_type = pa.map_(
    pa.string(),
    pa.map_(
        pa.string(),
        pa.int32()
    )
)
map_of_lists_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_complex_lists_element_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string()),
    pa.field('bool', pa.bool_())
])
map_of_complex_lists_type = pa.map_(pa.string(), pa.list_(map_of_complex_lists_element_type))

map_of_list_of_map_of_lists_inner_list_map_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_list_of_map_of_lists_inner_list_type = pa.list_(map_of_list_of_map_of_lists_inner_list_map_type)
map_of_list_of_map_of_lists_type = pa.map_(pa.string(), map_of_list_of_map_of_lists_inner_list_type)

map_of_structs_struct = pa.struct([
    pa.field('int_field', pa.int32()),
    pa.field('string_field', pa.string())
])

# Schema for the map of structs
map_of_structs_type = pa.map_(pa.string(), map_of_structs_struct)

# Define the schema for the innermost struct of `map_of_struct_of_structs`
map_of_struct_of_structs_struct_struct_struct_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string())
])

# Define the schema for the intermediate struct `struct`
map_of_struct_of_structs_struct_struct_type = pa.struct([
    pa.field('nested_struct', map_of_struct_of_structs_struct_struct_struct_type)
])

# Define the schema for the outer struct which includes the 'struct' key
map_of_struct_of_structs_struct_type = pa.struct([
    pa.field('struct', map_of_struct_of_structs_struct_struct_type)
])

# Define the schema for the map `map_of_struct_of_structs`
map_of_struct_of_structs_type = pa.map_(
    pa.field('key', pa.string(), nullable=False),  # Map keys must be non-nullable
    pa.field('value', map_of_struct_of_structs_struct_type)
)

# Define the schema
schema = pa.schema([
    ('map', map_type),
    ('map_nullable', map_type),
    ('map_of_maps', map_of_maps_type),
    ('map_of_lists', map_of_lists_type),
    ('map_of_complex_lists', map_of_complex_lists_type),
    ('map_of_list_of_map_of_lists', map_of_list_of_map_of_lists_type),
    ('map_of_structs', map_of_structs_type),
    ('map_of_struct_of_structs', map_of_struct_of_structs_type),
])

parquet_file = 'output/maps.parquet'
# Create a PyArrow Table
table = pa.Table.from_pandas(df_nested_list, schema=schema)

# Check if the file exists and remove it
if os.path.exists(parquet_file):
    os.remove(parquet_file)

# Write the PyArrow Table to a Parquet file
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer:
    writer.write_table(table)

# Show the first row of the DataFrame for verification
print(df_nested_list.head(1))
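One detail reader tests should expect: pyarrow does not round-trip map cells back to Python dicts. Converting the table to pandas yields each map as a list of `(key, value)` tuples. A minimal sketch (not part of the commit; assumes `maps.py` above has been run):

```python
import pyarrow.parquet as pq

table = pq.read_table('output/maps.parquet', columns=['map'])
df = table.to_pandas()
# A cell written as {'key_0': 0} comes back as [('key_0', 0)].
print(df['map'].iloc[0])
```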
src/lib/parquet/resources/python/generators/orders.py
@@ -0,0 +1,84 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from faker import Faker
import uuid
from datetime import datetime
import random

# Initialize Faker
fake = Faker()

# Number of rows you want in your Parquet file
num_rows = 100000

# Generate data
order_ids = [str(uuid.uuid4()) for _ in range(num_rows)]
total_prices = [round(random.uniform(50.0, 200.0), 2) for _ in range(num_rows)]
discounts = [round(random.uniform(0.0, 50.0), 2) for _ in range(num_rows)]
created_at = [datetime.now() for _ in range(num_rows)]
updated_at = [datetime.now() for _ in range(num_rows)]

customers = [{'customer_id': str(uuid.uuid4()), 'first_name': fake.first_name(), 'last_name': fake.last_name(), 'email': fake.email()} for _ in range(num_rows)]

addresses = [{'address_id': str(uuid.uuid4()), 'street': fake.street_address(), 'city': fake.city(), 'state': fake.state(), 'zip': fake.zipcode(), 'country': fake.country()} for _ in range(num_rows)]

order_lines = [[{'order_line_id': str(uuid.uuid4()), 'product_id': str(uuid.uuid4()), 'quantity': random.randint(1, 10), 'price': round(random.uniform(1.0, 50.0), 2)} for _ in range(random.randint(1, 5))] for _ in range(num_rows)]

notes = [[{'note_id': str(uuid.uuid4()), 'note_text': fake.text()} for _ in range(random.randint(1, 3))] for _ in range(num_rows)]

# Create a DataFrame
df = pd.DataFrame({
    'order_id': order_ids,
    'total_price': total_prices,
    'discount': discounts,
    'created_at': created_at,
    'updated_at': updated_at,
    'customer': customers,
    'address': addresses,
    'order_lines': order_lines,
    'notes': notes
})

# Define schema
schema = pa.schema([
    ('order_id', pa.string()),
    ('total_price', pa.float32()),
    ('discount', pa.float32()),
    ('created_at', pa.timestamp('ns')),
    ('updated_at', pa.timestamp('ns')),
    ('customer', pa.struct([
        ('customer_id', pa.string()),
        ('first_name', pa.string()),
        ('last_name', pa.string()),
        ('email', pa.string())
    ])),
    ('address', pa.struct([
        ('address_id', pa.string()),
        ('street', pa.string()),
        ('city', pa.string()),
        ('state', pa.string()),
        ('zip', pa.string()),
        ('country', pa.string())
    ])),
    ('order_lines', pa.list_(
        pa.struct([
            ('order_line_id', pa.string()),
            ('product_id', pa.string()),
            ('quantity', pa.int32()),
            ('price', pa.float32())
        ])
    )),
    ('notes', pa.list_(
        pa.struct([
            ('note_id', pa.string()),
            ('note_text', pa.string())
        ])
    ))
])

# Convert DataFrame to PyArrow Table
table = pa.table(df, schema=schema)

# Write out as a Parquet file with gzip compression
pq.write_table(table, 'output/orders.parquet', compression='gzip')
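At 100,000 rows this is the largest fixture, which makes it the natural one for checking row-group layout and the compression codec actually recorded in the file. A short inspection sketch (not part of the commit; assumes `orders.py` above has been run):

```python
import pyarrow.parquet as pq

pf = pq.ParquetFile('output/orders.parquet')
meta = pf.metadata
print(meta.num_rows, 'rows in', meta.num_row_groups, 'row group(s)')
# Compression is recorded per column chunk; check the first chunk.
print(meta.row_group(0).column(0).compression)
```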