Added python scripts used to generate test/fixtures data for reader
1 parent d943f39 · commit b2d4455
Showing 9 changed files with 891 additions and 0 deletions.
src/lib/parquet/resources/python/.gitignore
@@ -0,0 +1,3 @@
output
!output/.gitkeep
parquet
src/lib/parquet/resources/python/README.md
@@ -0,0 +1,27 @@
# Test Data Generators

This directory contains scripts to generate test data for the Flow PHP Parquet reader/writer.

### Prerequisites

- Python 3.x installed
- pip installed (the Python package installer)

### Installation

First, go to the `src/lib/parquet/resources/python` directory and run the following commands to install the required dependencies:

```shell
python3 -m venv parquet
source parquet/bin/activate
pip install -r requirements.txt
```

Once all dependencies are installed, run the following commands to generate the test data:

```shell
python generators/lists.py
python generators/maps.py
python generators/orders.py
python generators/primitives.py
python generators/structs.py
```
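After generating the fixtures, it is worth a quick sanity check that each file loads back with the expected schema. A minimal sketch (not part of the commit; it assumes the generators above were run from this directory, and that pyarrow was installed via requirements.txt):

```python
import pyarrow.parquet as pq

# Read each generated fixture back and print its row count and schema.
for name in ('lists', 'maps', 'orders'):
    table = pq.read_table(f'output/{name}.parquet')
    print(name, '-', table.num_rows, 'rows')
    print(table.schema)
```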
src/lib/parquet/resources/python/generators/lists.py
@@ -0,0 +1,78 @@ | ||
import pandas as pd | ||
import random | ||
import os | ||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
|
||
# Number of rows to generate | ||
n_rows = 100 | ||
|
||
# Functions to generate the data | ||
def generate_list_nested(): | ||
return [ | ||
[ | ||
[ | ||
random.randint(1, 10) for _ in range(random.randint(1, 3)) | ||
] for _ in range(random.randint(1, 3)) | ||
] for _ in range(random.randint(1, 3)) | ||
] | ||
|
||
# Columns | ||
list_col = pd.Series([[random.randint(1, 10) for _ in range(3)] for _ in range(n_rows)], dtype='object') | ||
list_nullable_col = pd.Series([[random.randint(1, 10) for _ in range(3)] if i % 2 == 0 else None for i in range(n_rows)], dtype='object') | ||
list_mixed_types_col = pd.Series([ | ||
[ | ||
{'int': i, 'string': None, 'bool': None}, | ||
{'int': None, 'string': "string_" + str(i), 'bool': None}, | ||
{'int': None, 'string': None, 'bool': bool(i % 2)}, | ||
{'int': None, 'string': None, 'bool': None} | ||
] for i in range(n_rows) | ||
], dtype='object') | ||
list_nested_col = pd.Series([generate_list_nested() for _ in range(n_rows)], dtype='object') | ||
|
||
# Creating the DataFrame with only the new column | ||
df_nested_list = pd.DataFrame({ | ||
'list': list_col, | ||
'list_nullable': list_nullable_col, | ||
'list_mixed_types': list_mixed_types_col, | ||
'list_nested': list_nested_col | ||
}) | ||
|
||
# Types | ||
list_type = pa.list_(pa.int32()) | ||
list_mixed_type = pa.list_( | ||
pa.struct([ | ||
pa.field('int', pa.int32()), | ||
pa.field('string', pa.string()), | ||
pa.field('bool', pa.bool_()) | ||
]) | ||
) | ||
list_nested_type = pa.list_(pa.list_(pa.list_(pa.int32()))) | ||
|
||
# Define the schema | ||
schema = pa.schema([ | ||
('list', list_type), | ||
('list_nullable', list_type), | ||
('list_mixed_types', list_mixed_type), | ||
('list_nested', list_nested_type), | ||
]) | ||
|
||
parquet_file = 'output/lists.parquet' | ||
# Create a PyArrow Table | ||
table = pa.Table.from_pandas(df_nested_list, schema=schema) | ||
|
||
# Check if the file exists and remove it | ||
if os.path.exists(parquet_file): | ||
os.remove(parquet_file) | ||
|
||
# Write the PyArrow Table to a Parquet file | ||
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer: | ||
writer.write_table(table) | ||
|
||
pd.set_option('display.max_columns', None) # Show all columns | ||
pd.set_option('display.max_rows', None) # Show all rows | ||
pd.set_option('display.width', None) # Auto-detect the width for displaying | ||
pd.set_option('display.max_colwidth', None) # Show complete text in each cell | ||
|
||
# Show the first few rows of the DataFrame for verification | ||
print(df_nested_list.head(10)) |
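Parquet list elements share a single physical type, so `list_mixed_types` emulates mixed values with a struct of nullable `int`/`string`/`bool` fields, at most one of which is set per element. A quick way to see the shape a reader has to handle (a sketch, not part of the commit; assumes `lists.py` above has been run):

```python
import pyarrow.parquet as pq

# Read back only the column of interest and inspect the first row:
# each element is a struct with two of its three fields left null.
table = pq.read_table('output/lists.parquet', columns=['list_mixed_types'])
print(table.column('list_mixed_types')[0])
```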
src/lib/parquet/resources/python/generators/maps.py
@@ -0,0 +1,193 @@
import pandas as pd
import random
import os
import pyarrow as pa
import pyarrow.parquet as pq
import sys

pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.width', None)         # Auto-detect the width for displaying
pd.set_option('display.max_colwidth', None)  # Show complete text in each cell

# Number of rows to generate
n_rows = 100

# Functions to generate the data
def generate_map_of_maps():
    return {
        f'outer_key_{i}': {
            f'inner_key_{j}': random.randint(1, 10)
            for j in range(random.randint(1, 3))
        }
        for i in range(random.randint(1, 3))
    }

def generate_map_complex_nested_list():
    return [
        [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}'
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    ]

def generate_map_of_lists():
    return {f'key_{i}': [random.randint(1, 10) for _ in range(random.randint(1, 3))] for i in range(random.randint(1, 3))}

def generate_map_of_complex_lists():
    return {
        f'key_{i}': [
            {
                'int': random.randint(1, 10),
                'string': f'string_{i}_{j}',
                'bool': bool(random.getrandbits(1))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_list_of_map_of_lists():
    return {
        f'key_{i}': [
            {
                f'string_{i}_{j}_{k}': [random.randint(1, 10) for _ in range(random.randint(1, 3))]
                for k in range(random.randint(1, 3))
            }
            for j in range(random.randint(1, 3))
        ]
        for i in range(random.randint(1, 3))
    }

def generate_map_of_structs():
    map_of_structs_data = []
    for i in range(n_rows):
        # Generate a map where each value is a struct with an Int32 and a String field
        map_of_structs_value = {
            f'key_{j}': {
                'int_field': j,
                'string_field': f'string_{j}'
            } for j in range(3)
        }
        map_of_structs_data.append(map_of_structs_value)
    return map_of_structs_data

def generate_map_of_struct_of_structs(n_rows):
    map_of_struct_of_structs_data = []  # List to hold all the data
    for i in range(n_rows):
        map_of_struct_of_structs_value = {
            f'key_{j}': {
                'struct': {
                    'nested_struct': {
                        'int': random.randint(1, 100),
                        'string': f'string_{j}'
                    }
                }
            } for j in range(3)  # Create 3 key-value pairs in each map
        }
        map_of_struct_of_structs_data.append(map_of_struct_of_structs_value)
    return map_of_struct_of_structs_data

# Columns
map_col = [{"key_" + str(i): i} for i in range(n_rows)]
map_nullable_col = pd.Series([{"key_" + str(i): i} if i % 2 == 0 else None for i in range(n_rows)], dtype='object')
map_of_maps_col = pd.Series([generate_map_of_maps() for _ in range(n_rows)], dtype='object')
map_of_lists_col = pd.Series([generate_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_complex_lists_col = pd.Series([generate_map_of_complex_lists() for _ in range(n_rows)], dtype='object')
map_of_list_of_map_of_lists_col = pd.Series([generate_map_of_list_of_map_of_lists() for _ in range(n_rows)], dtype='object')
map_of_structs_col = generate_map_of_structs()
map_of_struct_of_structs_col = generate_map_of_struct_of_structs(n_rows)

# Create the DataFrame with all map columns
df_nested_list = pd.DataFrame({
    'map': map_col,
    'map_nullable': map_nullable_col,
    'map_of_maps': map_of_maps_col,
    'map_of_lists': map_of_lists_col,
    'map_of_complex_lists': map_of_complex_lists_col,
    'map_of_list_of_map_of_lists': map_of_list_of_map_of_lists_col,
    'map_of_structs': map_of_structs_col,
    'map_of_struct_of_structs': map_of_struct_of_structs_col
})

# Types
map_type = pa.map_(pa.string(), pa.int32())
map_of_maps_type = pa.map_(
    pa.string(),
    pa.map_(
        pa.string(),
        pa.int32()
    )
)
map_of_lists_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_complex_lists_element_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string()),
    pa.field('bool', pa.bool_())
])
map_of_complex_lists_type = pa.map_(pa.string(), pa.list_(map_of_complex_lists_element_type))

map_of_list_of_map_of_lists_inner_list_map_type = pa.map_(pa.string(), pa.list_(pa.int32()))
map_of_list_of_map_of_lists_inner_list_type = pa.list_(map_of_list_of_map_of_lists_inner_list_map_type)
map_of_list_of_map_of_lists_type = pa.map_(pa.string(), map_of_list_of_map_of_lists_inner_list_type)

map_of_structs_struct = pa.struct([
    pa.field('int_field', pa.int32()),
    pa.field('string_field', pa.string())
])

# Schema for the map of structs
map_of_structs_type = pa.map_(pa.string(), map_of_structs_struct)

# Define the schema for the innermost struct of `map_of_struct_of_structs`
map_of_struct_of_structs_struct_struct_struct_type = pa.struct([
    pa.field('int', pa.int32()),
    pa.field('string', pa.string())
])

# Define the schema for the intermediate struct `struct`
map_of_struct_of_structs_struct_struct_type = pa.struct([
    pa.field('nested_struct', map_of_struct_of_structs_struct_struct_struct_type)
])

# Define the schema for the outer struct which includes the 'struct' key
map_of_struct_of_structs_struct_type = pa.struct([
    pa.field('struct', map_of_struct_of_structs_struct_struct_type)
])

# Define the schema for the map `map_of_struct_of_structs`
map_of_struct_of_structs_type = pa.map_(
    pa.field('key', pa.string(), nullable=False),  # Map keys must be non-nullable
    pa.field('value', map_of_struct_of_structs_struct_type)
)

# Define the schema
schema = pa.schema([
    ('map', map_type),
    ('map_nullable', map_type),
    ('map_of_maps', map_of_maps_type),
    ('map_of_lists', map_of_lists_type),
    ('map_of_complex_lists', map_of_complex_lists_type),
    ('map_of_list_of_map_of_lists', map_of_list_of_map_of_lists_type),
    ('map_of_structs', map_of_structs_type),
    ('map_of_struct_of_structs', map_of_struct_of_structs_type),
])

parquet_file = 'output/maps.parquet'
# Create a PyArrow Table
table = pa.Table.from_pandas(df_nested_list, schema=schema)

# Check if the file exists and remove it
if os.path.exists(parquet_file):
    os.remove(parquet_file)

# Write the PyArrow Table to a Parquet file
with pq.ParquetWriter(parquet_file, schema, compression='GZIP') as writer:
    writer.write_table(table)

# Show the first row of the DataFrame for verification
print(df_nested_list.head(1))
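One detail reader tests should expect: pyarrow does not round-trip map cells back to Python dicts. Converting the table to pandas yields each map as a list of `(key, value)` tuples. A minimal sketch (not part of the commit; assumes `maps.py` above has been run):

```python
import pyarrow.parquet as pq

table = pq.read_table('output/maps.parquet', columns=['map'])
df = table.to_pandas()
# A cell written as {'key_0': 0} comes back as [('key_0', 0)].
print(df['map'].iloc[0])
```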
src/lib/parquet/resources/python/generators/orders.py
@@ -0,0 +1,84 @@
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from faker import Faker
import uuid
from datetime import datetime
import random

# Initialize Faker
fake = Faker()

# Number of rows you want in your Parquet file
num_rows = 100000

# Generate data
order_ids = [str(uuid.uuid4()) for _ in range(num_rows)]
total_prices = [round(random.uniform(50.0, 200.0), 2) for _ in range(num_rows)]
discounts = [round(random.uniform(0.0, 50.0), 2) for _ in range(num_rows)]
created_at = [datetime.now() for _ in range(num_rows)]
updated_at = [datetime.now() for _ in range(num_rows)]

customers = [{'customer_id': str(uuid.uuid4()), 'first_name': fake.first_name(), 'last_name': fake.last_name(), 'email': fake.email()} for _ in range(num_rows)]

addresses = [{'address_id': str(uuid.uuid4()), 'street': fake.street_address(), 'city': fake.city(), 'state': fake.state(), 'zip': fake.zipcode(), 'country': fake.country()} for _ in range(num_rows)]

order_lines = [[{'order_line_id': str(uuid.uuid4()), 'product_id': str(uuid.uuid4()), 'quantity': random.randint(1, 10), 'price': round(random.uniform(1.0, 50.0), 2)} for _ in range(random.randint(1, 5))] for _ in range(num_rows)]

notes = [[{'note_id': str(uuid.uuid4()), 'note_text': fake.text()} for _ in range(random.randint(1, 3))] for _ in range(num_rows)]

# Create a DataFrame
df = pd.DataFrame({
    'order_id': order_ids,
    'total_price': total_prices,
    'discount': discounts,
    'created_at': created_at,
    'updated_at': updated_at,
    'customer': customers,
    'address': addresses,
    'order_lines': order_lines,
    'notes': notes
})

# Define schema
schema = pa.schema([
    ('order_id', pa.string()),
    ('total_price', pa.float32()),
    ('discount', pa.float32()),
    ('created_at', pa.timestamp('ns')),
    ('updated_at', pa.timestamp('ns')),
    ('customer', pa.struct([
        ('customer_id', pa.string()),
        ('first_name', pa.string()),
        ('last_name', pa.string()),
        ('email', pa.string())
    ])),
    ('address', pa.struct([
        ('address_id', pa.string()),
        ('street', pa.string()),
        ('city', pa.string()),
        ('state', pa.string()),
        ('zip', pa.string()),
        ('country', pa.string())
    ])),
    ('order_lines', pa.list_(
        pa.struct([
            ('order_line_id', pa.string()),
            ('product_id', pa.string()),
            ('quantity', pa.int32()),
            ('price', pa.float32())
        ])
    )),
    ('notes', pa.list_(
        pa.struct([
            ('note_id', pa.string()),
            ('note_text', pa.string())
        ])
    ))
])

# Convert DataFrame to PyArrow Table
table = pa.table(df, schema=schema)

# Write out as a Parquet file with gzip compression
pq.write_table(table, 'output/orders.parquet', compression='gzip')
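At 100,000 rows this is the largest fixture, which makes it the natural one for checking row-group layout and the compression codec actually recorded in the file. A short inspection sketch (not part of the commit; assumes `orders.py` above has been run):

```python
import pyarrow.parquet as pq

pf = pq.ParquetFile('output/orders.parquet')
meta = pf.metadata
print(meta.num_rows, 'rows in', meta.num_row_groups, 'row group(s)')
# Compression is recorded per column chunk; check the first chunk.
print(meta.row_group(0).column(0).compression)
```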