Skip to content

Commit

Permalink
Update ingestion scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
zacdezgeo committed Jul 22, 2024
1 parent 7b6f354 commit ae46ef7
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
11 changes: 8 additions & 3 deletions postgres/chunk_parquet.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import pandas as pd
import os

import pandas as pd

# Split one large parquet file into fixed-size row chunks, written as
# numbered files (space2stats_part_0.parquet, space2stats_part_1.parquet, ...)
# inside chunk_dir.
chunk_dir = "parquet_chunks"
chunk_size = 100000  # Number of rows per chunk

df = pd.read_parquet('space2stats_updated.parquet')

# exist_ok=True replaces the exists()/mkdir() pair: no check-then-create race,
# and re-running the script against an existing directory is not an error.
os.makedirs(chunk_dir, exist_ok=True)

for i in range(0, len(df), chunk_size):
    # iloc slicing clamps at the end of the frame, so the last chunk may be
    # shorter than chunk_size.
    chunk = df.iloc[i:i + chunk_size]
    chunk.to_parquet(os.path.join(chunk_dir, f'space2stats_part_{i // chunk_size}.parquet'))

print("Parquet file split into smaller chunks.")
3 changes: 2 additions & 1 deletion postgres/load_parquet_chunks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ do
ogr2ogr -f "PostgreSQL" \
PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \
"$PARQUET_FILE" \
-nln $TABLE_NAME
-nln $TABLE_NAME \
-lco SPATIAL_INDEX=NONE

TABLE_EXISTS="t"
fi
Expand Down

0 comments on commit ae46ef7

Please sign in to comment.