Added python scripts used to generate test/fixtures data for reader
norberttech committed Oct 13, 2023
1 parent d943f39 commit b2d4455
Showing 9 changed files with 891 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/lib/parquet/resources/python/.gitignore
@@ -0,0 +1,3 @@
output
!output/.gitkeep
parquet
27 changes: 27 additions & 0 deletions src/lib/parquet/resources/python/README.md
@@ -0,0 +1,27 @@
# Test Data Generators

This directory contains scripts that generate test data for the Flow PHP Parquet reader/writer.

### Prerequisites
- Python 3.x installed
- pip installed (the Python package installer)

### Installation

First, go to the `src/lib/parquet/resources/python` directory and run the following commands to create a virtual environment and install the required dependencies:

```shell
python3 -m venv parquet
source parquet/bin/activate
pip install -r requirements.txt
```
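
This creates a local virtual environment in the `parquet` directory; both it and the generated `output` directory are excluded from version control by the `.gitignore` above.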

Once all dependencies are installed, run the following commands to generate the test data:

```shell
python generators/lists.py
python generators/maps.py
python generators/orders.py
python generators/primitives.py
python generators/structs.py
```
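
The generated files are written to the `output/` directory. As a quick sanity check (a minimal sketch, assuming `pyarrow` was installed from `requirements.txt`), you can read a fixture back and inspect its schema:

```python
import pyarrow.parquet as pq

# Read one of the generated fixtures and print its schema and row count
table = pq.read_table('output/lists.parquet')
print(table.schema)
print(table.num_rows)
```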
78 changes: 78 additions & 0 deletions src/lib/parquet/resources/python/generators/lists.py
@@ -0,0 +1,78 @@
import pandas as pd
import random
import os
import pyarrow as pa
import pyarrow.parquet as pq

# Number of rows to generate
n_rows = 100

# Functions to generate the data
def generate_list_nested():
    return [
        [
            [
                random.randint(1, 10) for _ in range(random.randint(1, 3))
            ] for _ in range(random.randint(1, 3))
        ] for _ in range(random.randint(1, 3))
    ]

# Columns
list_col = pd.Series([[random.randint(1, 10) for _ in range(3)] for _ in range(n_rows)], dtype='object')
list_nullable_col = pd.Series([[random.randint(1, 10) for _ in range(3)] if i % 2 == 0 else None for i in range(n_rows)], dtype='object')
list_mixed_types_col = pd.Series([
    [
        {'int': i, 'string': None, 'bool': None},
        {'int': None, 'string': "string_" + str(i), 'bool': None},
        {'int': None, 'string': None, 'bool': bool(i % 2)},
        {'int': None, 'string': None, 'bool': None}
    ] for i in range(n_rows)
], dtype='object')
list_nested_col = pd.Series([generate_list_nested() for _ in range(n_rows)], dtype='object')

# Creating the DataFrame with all four list columns
df_nested_list = pd.DataFrame({
    'list': list_col,
    'list_nullable': list_nullable_col,
    'list_mixed_types': list_mixed_types_col,
    'list_nested': list_nested_col
})

# Types
list_type = pa.list_(pa.int32())
list_mixed_type = pa.list_(
    pa.struct([
        pa.field('int', pa.int32()),
        pa.field('string', pa.string()),
        pa.field('bool', pa.bool_())
    ])
)
list_nested_type = pa.list_(pa.list_(pa.list_(pa.int32())))

# Define the schema
schema = pa.schema([
    ('list', list_type),
    ('list_nullable', list_type),
    ('list_mixed_types', list_mixed_type),
    ('list_nested', list_nested_type),
])

parquet_file = 'output/lists.parquet'
# Create a PyArrow Table
table = pa.Table.from_pandas(df_nested_list, schema=schema)

# Check if the file exists and remove it
if os.path.exists(parquet_file):
    os.remove(parquet_file)

# Write the PyArrow Table to a Parquet file
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer:
    writer.write_table(table)

pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_rows', None) # Show all rows
pd.set_option('display.width', None) # Auto-detect the width for displaying
pd.set_option('display.max_colwidth', None) # Show complete text in each cell

# Show the first few rows of the DataFrame for verification
print(df_nested_list.head(10))
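
As a quick check of the deepest structure, a minimal sketch (assuming `generators/lists.py` has been run, so `output/lists.parquet` exists) that reads back only the triply nested `list_nested` column:

```python
import pyarrow.parquet as pq

# Column projection: read back only the triply nested column
nested = pq.read_table('output/lists.parquet', columns=['list_nested'])
print(nested.schema)  # list_nested: list<item: list<item: list<item: int32>>>
print(nested.column('list_nested')[0])
```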
193 changes: 193 additions & 0 deletions src/lib/parquet/resources/python/generators/maps.py
@@ -0,0 +1,193 @@
import pandas as pd
import random
import os
import pyarrow as pa
import pyarrow.parquet as pq

pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_rows', None) # Show all rows
pd.set_option('display.width', None) # Auto-detect the width for displaying
pd.set_option('display.max_colwidth', None) # Show complete text in each cell

# Number of rows to generate
n_rows = 100

# Functions to generate the data
def generate_map_of_maps():
    return {
        f'outer_key_{i}': {
            f'inner_key_{j}': random.randint(1, 10)
            for j in range(random.randint(1, 3))
        }
        for i in range(random.randint(1, 3))
    }

def generate_map_complex_nested_list():
    return [
        [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}'
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    ]

def generate_map_of_lists():
    return {f'key_{i}': [random.randint(1, 10) for _ in range(random.randint(1, 3))] for i in range(random.randint(1, 3))}

def generate_map_of_complex_lists():
    return {
        f'key_{i}': [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}',
                'bool': bool(random.getrandbits(1))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_list_of_map_of_lists():
    return {
        f'key_{i}': [
            {
                f'string_{i}_{j}_{k}': [random.randint(1, 10) for _ in range(random.randint(1, 3))]
                for k in range(random.randint(1, 3))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_structs():
    map_of_structs_data = []
    for i in range(n_rows):
        # Generating a map where each value is a struct with an Int32 and a String field
        map_of_structs_value = {
            f'key_{j}': {
                'int_field': j,
                'string_field': f'string_{j}'
            } for j in range(3)
        }
        map_of_structs_data.append(map_of_structs_value)
    return map_of_structs_data

def generate_map_of_struct_of_structs(n_rows):
    map_of_struct_of_structs_data = []  # List to hold all the data
    for i in range(n_rows):
        map_of_struct_of_structs_value = {
            f'key_{j}': {
                'struct': {
                    'nested_struct': {
                        'int': random.randint(1, 100),
                        'string': f'string_{j}'
                    }
                }
            } for j in range(3)  # Creating 3 key-value pairs in each map
        }
        map_of_struct_of_structs_data.append(map_of_struct_of_structs_value)
    return map_of_struct_of_structs_data

# Columns
map_col = [{"key_" + str(i): i} for i in range(n_rows)]
map_nullable_col = pd.Series([{"key_" + str(i): i} if i % 2 == 0 else None for i in range(n_rows)], dtype='object')
map_of_maps_col = pd.Series([generate_map_of_maps() for _ in range(n_rows)], dtype='object')
map_of_lists_col = pd.Series([generate_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_complex_lists_col = pd.Series([generate_map_of_complex_lists() for _ in range(n_rows)], dtype='object')
map_of_list_of_map_of_lists_col = pd.Series([generate_map_of_list_of_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_structs_col = generate_map_of_structs()
map_of_struct_of_structs_col = generate_map_of_struct_of_structs(n_rows)

# Creating the DataFrame with all the map columns
df_nested_list = pd.DataFrame({
    'map': map_col,
    'map_nullable': map_nullable_col,
    'map_of_maps': map_of_maps_col,
    'map_of_lists': map_of_lists_col,
    'map_of_complex_lists': map_of_complex_lists_col,
    'map_of_list_of_map_of_lists': map_of_list_of_map_of_lists_col,
    'map_of_structs': map_of_structs_col,
    'map_of_struct_of_structs': map_of_struct_of_structs_col
})

# Types
map_type = pa.map_(pa.string(), pa.int32())
map_of_maps_type = pa.map_(
    pa.string(),
    pa.map_(
        pa.string(),
        pa.int32()
    )
)
map_of_lists_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_complex_lists_element_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string()),
    pa.field('bool', pa.bool_())
])
map_of_complex_lists_type = pa.map_(pa.string(), pa.list_(map_of_complex_lists_element_type))

map_of_list_of_map_of_lists_inner_list_map_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_list_of_map_of_lists_inner_list_type = pa.list_(map_of_list_of_map_of_lists_inner_list_map_type)
map_of_list_of_map_of_lists_type = pa.map_(pa.string(), map_of_list_of_map_of_lists_inner_list_type)

map_of_structs_struct = pa.struct([
    pa.field('int_field', pa.int32()),
    pa.field('string_field', pa.string())
])

# Schema for the map of structs
map_of_structs_type = pa.map_(pa.string(), map_of_structs_struct)

# Schema for the map of struct of structs
map_of_struct_of_structs_struct_struct_struct_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string())
])

# Define the schema for the intermediate struct `struct`
map_of_struct_of_structs_struct_struct_type = pa.struct([
    pa.field('nested_struct', map_of_struct_of_structs_struct_struct_struct_type)
])

# Define the schema for the outer struct which includes the 'struct' key
map_of_struct_of_structs_struct_type = pa.struct([
    pa.field('struct', map_of_struct_of_structs_struct_struct_type)
])

# Define the schema for the map `map_of_struct_of_structs`
map_of_struct_of_structs_type = pa.map_(
    pa.field('key', pa.string(), nullable=False),  # Map keys must be non-nullable
    pa.field('value', map_of_struct_of_structs_struct_type)
)

# Define the schema
schema = pa.schema([
    ('map', map_type),
    ('map_nullable', map_type),
    ('map_of_maps', map_of_maps_type),
    ('map_of_lists', map_of_lists_type),
    ('map_of_complex_lists', map_of_complex_lists_type),
    ('map_of_list_of_map_of_lists', map_of_list_of_map_of_lists_type),
    ('map_of_structs', map_of_structs_type),
    ('map_of_struct_of_structs', map_of_struct_of_structs_type),
])

parquet_file = 'output/maps.parquet'
# Create a PyArrow Table
table = pa.Table.from_pandas(df_nested_list, schema=schema)

# Check if the file exists and remove it
if os.path.exists(parquet_file):
    os.remove(parquet_file)

# Write the PyArrow Table to a Parquet file
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer:
    writer.write_table(table)

# Show the first few rows of the DataFrame for verification
print(df_nested_list.head(1))
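
One detail worth knowing when verifying these fixtures: when pyarrow converts an Arrow map column back to pandas, each cell comes out as a list of (key, value) tuples rather than a Python dict. A minimal sketch (assuming `output/maps.parquet` has been generated by the script above):

```python
import pyarrow.parquet as pq

# Map columns round-trip through pandas as lists of (key, value) tuples
table = pq.read_table('output/maps.parquet', columns=['map'])
print(table.to_pandas()['map'].iloc[0])  # e.g. [('key_0', 0)]
```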
84 changes: 84 additions & 0 deletions src/lib/parquet/resources/python/generators/orders.py
@@ -0,0 +1,84 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from faker import Faker
import uuid
from datetime import datetime
import random

# Initialize Faker
fake = Faker()

# Number of rows you want in your Parquet file
num_rows = 100000

# Generate data
order_ids = [str(uuid.uuid4()) for _ in range(num_rows)]
total_prices = [round(random.uniform(50.0, 200.0), 2) for _ in range(num_rows)]
discounts = [round(random.uniform(0.0, 50.0), 2) for _ in range(num_rows)]
created_at = [datetime.now() for _ in range(num_rows)]
updated_at = [datetime.now() for _ in range(num_rows)]

customers = [{'customer_id': str(uuid.uuid4()), 'first_name': fake.first_name(), 'last_name': fake.last_name(), 'email': fake.email()} for _ in range(num_rows)]

addresses = [{'address_id': str(uuid.uuid4()), 'street': fake.street_address(), 'city': fake.city(), 'state': fake.state(), 'zip': fake.zipcode(), 'country': fake.country()} for _ in range(num_rows)]

order_lines = [[{'order_line_id': str(uuid.uuid4()), 'product_id': str(uuid.uuid4()), 'quantity': random.randint(1, 10), 'price': round(random.uniform(1.0, 50.0), 2)} for _ in range(random.randint(1, 5))] for _ in range(num_rows)]

notes = [[{'note_id': str(uuid.uuid4()), 'note_text': fake.text()} for _ in range(random.randint(1, 3))] for _ in range(num_rows)]

# Create a DataFrame
df = pd.DataFrame({
    'order_id': order_ids,
    'total_price': total_prices,
    'discount': discounts,
    'created_at': created_at,
    'updated_at': updated_at,
    'customer': customers,
    'address': addresses,
    'order_lines': order_lines,
    'notes': notes
})

# Define schema
schema = pa.schema([
    ('order_id', pa.string()),
    ('total_price', pa.float32()),
    ('discount', pa.float32()),
    ('created_at', pa.timestamp('ns')),
    ('updated_at', pa.timestamp('ns')),
    ('customer', pa.struct([
        ('customer_id', pa.string()),
        ('first_name', pa.string()),
        ('last_name', pa.string()),
        ('email', pa.string())
    ])),
    ('address', pa.struct([
        ('address_id', pa.string()),
        ('street', pa.string()),
        ('city', pa.string()),
        ('state', pa.string()),
        ('zip', pa.string()),
        ('country', pa.string())
    ])),
    ('order_lines', pa.list_(
        pa.struct([
            ('order_line_id', pa.string()),
            ('product_id', pa.string()),
            ('quantity', pa.int32()),
            ('price', pa.float32())
        ])
    )),
    ('notes', pa.list_(
        pa.struct([
            ('note_id', pa.string()),
            ('note_text', pa.string())
        ])
    ))
])

# Convert DataFrame to PyArrow Table
table = pa.table(df, schema=schema)

# Write out as a Parquet file with GZIP compression
pq.write_table(table, 'output/orders.parquet', compression='gzip')
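
To confirm the result without loading 100,000 rows of data, the Parquet footer metadata can be inspected directly; a minimal sketch (assuming `output/orders.parquet` exists):

```python
import pyarrow.parquet as pq

# Inspect footer metadata without reading any data pages
metadata = pq.ParquetFile('output/orders.parquet').metadata
print(metadata.num_rows)                            # 100000
print(metadata.row_group(0).column(0).compression)  # GZIP
```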