Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/deploy #25

Merged
merged 14 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r space2stats_api/requirements.txt
pip install pre-commit
pip install -r space2stats_api/src/requirements.txt
pip install pre-commit pytest

- name: Set PYTHONPATH
run: echo "PYTHONPATH=$(pwd)/space2stats_api" >> $GITHUB_ENV
Expand Down
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,8 @@ db.env
# data
*.parquet
*.duckdb
.pgdata
.pgdata
space2stats_api/space2stats_env
*.env
cdk.out
lambda_layer
127 changes: 38 additions & 89 deletions notebooks/space2stats_api_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -21,18 +21,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL = \"http://localhost:8000\"\n",
"BASE_URL = \"https://space2stats.ds.io\"\n",
"FIELDS_ENDPOINT = f\"{BASE_URL}/fields\"\n",
"SUMMARY_ENDPOINT = f\"{BASE_URL}/summary\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand All @@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -115,15 +115,15 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Define the Request Payload\n",
"request_payload = {\n",
" \"aoi\": aoi,\n",
" \"spatial_join_method\": \"centroid\",\n",
" \"fields\": [\"sum_pop_2020\", \"sum_pop_f_2020\", \"sum_pop_m_2020\"], \n",
" \"fields\": [\"sum_pop_2020\"], \n",
" \"geometry\": \"point\"\n",
"}\n",
"\n",
Expand All @@ -138,7 +138,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand All @@ -165,135 +165,98 @@
" <th>hex_id</th>\n",
" <th>geometry</th>\n",
" <th>sum_pop_2020</th>\n",
" <th>sum_pop_f_2020</th>\n",
" <th>sum_pop_m_2020</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>866a4a48fffffff</td>\n",
" <td>POINT (36.31771 2.23633)</td>\n",
" <td>POINT (35.76352 2.99589)</td>\n",
" <td>399.860905</td>\n",
" <td>189.675539</td>\n",
" <td>210.185366</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>866a4a497ffffff</td>\n",
" <td>POINT (40.18159 0.05763)</td>\n",
" <td>POINT (40.58048 -3.79365)</td>\n",
" <td>582.555159</td>\n",
" <td>276.337255</td>\n",
" <td>306.217904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>866a4a49fffffff</td>\n",
" <td>POINT (38.59096 0.13944)</td>\n",
" <td>POINT (41.10421 3.37873)</td>\n",
" <td>749.911237</td>\n",
" <td>355.723245</td>\n",
" <td>394.187992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>866a4a4d7ffffff</td>\n",
" <td>POINT (35.07124 0.80971)</td>\n",
" <td>POINT (37.26153 3.74581)</td>\n",
" <td>863.888290</td>\n",
" <td>418.309236</td>\n",
" <td>445.579054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>866a5820fffffff</td>\n",
" <td>POINT (37.4356 3.35699)</td>\n",
" <td>POINT (40.01148 1.53124)</td>\n",
" <td>525.085147</td>\n",
" <td>249.076134</td>\n",
" <td>276.009012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16212</th>\n",
" <td>867b5dd77ffffff</td>\n",
" <td>POINT (39.15438 -1.51437)</td>\n",
" <td>POINT (34.94474 1.24558)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16213</th>\n",
" <td>867b5dd87ffffff</td>\n",
" <td>POINT (35.80252 0.90823)</td>\n",
" <td>POINT (40.95343 -1.83280)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16214</th>\n",
" <td>867b5dd8fffffff</td>\n",
" <td>POINT (37.93845 0.83454)</td>\n",
" <td>POINT (35.20290 -0.29666)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16215</th>\n",
" <td>867b5dd9fffffff</td>\n",
" <td>POINT (38.65824 -2.60028)</td>\n",
" <td>POINT (41.28333 -1.08552)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16216</th>\n",
" <td>867b5ddafffffff</td>\n",
" <td>POINT (36.6641 2.37083)</td>\n",
" <td>POINT (36.63048 1.35038)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16217 rows × 5 columns</p>\n",
"<p>16217 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" hex_id geometry sum_pop_2020 \\\n",
"0 866a4a48fffffff POINT (36.31771 2.23633) 399.860905 \n",
"1 866a4a497ffffff POINT (40.18159 0.05763) 582.555159 \n",
"2 866a4a49fffffff POINT (38.59096 0.13944) 749.911237 \n",
"3 866a4a4d7ffffff POINT (35.07124 0.80971) 863.888290 \n",
"4 866a5820fffffff POINT (37.4356 3.35699) 525.085147 \n",
"... ... ... ... \n",
"16212 867b5dd77ffffff POINT (39.15438 -1.51437) -36.000000 \n",
"16213 867b5dd87ffffff POINT (35.80252 0.90823) -36.000000 \n",
"16214 867b5dd8fffffff POINT (37.93845 0.83454) -36.000000 \n",
"16215 867b5dd9fffffff POINT (38.65824 -2.60028) -36.000000 \n",
"16216 867b5ddafffffff POINT (36.6641 2.37083) -36.000000 \n",
"\n",
" sum_pop_f_2020 sum_pop_m_2020 \n",
"0 189.675539 210.185366 \n",
"1 276.337255 306.217904 \n",
"2 355.723245 394.187992 \n",
"3 418.309236 445.579054 \n",
"4 249.076134 276.009012 \n",
"... ... ... \n",
"16212 -18.000000 -18.000000 \n",
"16213 -18.000000 -18.000000 \n",
"16214 -18.000000 -18.000000 \n",
"16215 -18.000000 -18.000000 \n",
"16216 -18.000000 -18.000000 \n",
" hex_id geometry sum_pop_2020\n",
"0 866a4a48fffffff POINT (35.76352 2.99589) 399.860905\n",
"1 866a4a497ffffff POINT (40.58048 -3.79365) 582.555159\n",
"2 866a4a49fffffff POINT (41.10421 3.37873) 749.911237\n",
"3 866a4a4d7ffffff POINT (37.26153 3.74581) 863.888290\n",
"4 866a5820fffffff POINT (40.01148 1.53124) 525.085147\n",
"... ... ... ...\n",
"16212 867b5dd77ffffff POINT (34.94474 1.24558) -36.000000\n",
"16213 867b5dd87ffffff POINT (40.95343 -1.83280) -36.000000\n",
"16214 867b5dd8fffffff POINT (35.20290 -0.29666) -36.000000\n",
"16215 867b5dd9fffffff POINT (41.28333 -1.08552) -36.000000\n",
"16216 867b5ddafffffff POINT (36.63048 1.35038) -36.000000\n",
"\n",
"[16217 rows x 5 columns]"
"[16217 rows x 3 columns]"
]
},
"execution_count": 17,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -306,24 +269,24 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9d4a524b5bf4d1a950f1cb8cc8d5b54",
"model_id": "00bbfea95ae440d3a73ebb161e3142ab",
"version_major": 2,
"version_minor": 1
"version_minor": 0
},
"text/plain": [
"Map(layers=[ScatterplotLayer(get_fill_color=<pyarrow.lib.FixedSizeListArray object at 0x13966dde0>\n",
"Map(layers=[ScatterplotLayer(get_fill_color=<pyarrow.lib.FixedSizeListArray object at 0x1631ef160>\n",
"[\n",
" [\n",
" 2…"
]
},
"execution_count": 18,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -338,20 +301,6 @@
"m = Map(layer)\n",
"m\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
7 changes: 7 additions & 0 deletions postgres/deploy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## Deployment Notes

- Create database instance
- Update the configuration in `db.env` (the load scripts require `DB_HOST`, `DB_PORT`, `DB_NAME`, `DB_USER` and `DB_PASSWORD`)
- Ingest the parquet file with `load_to_prod.sh` (may require `chmod +x load_to_prod.sh`); the script reads `space2stats_updated.parquet` and `db.env` from the current directory
- Create an index on `hex_id`: `CREATE INDEX idx_hex_id ON space2stats (hex_id);` (critical for the performance of our queries)
- Test with the [example notebook](../notebooks/space2stats_api_demo.ipynb)
16 changes: 10 additions & 6 deletions postgres/load_nyc_sample.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/bin/bash

# Database connection details
DB_HOST="localhost"
DB_PORT="5439"
DB_NAME="postgis"
DB_USER="username"
DB_PASSWORD="password"
# Load environment variables from db.env file.
#
# Exports every KEY=VALUE assignment found in the file given as $1.
# Blank lines and lines whose first non-blank character is '#' are skipped,
# an optional leading "export " is accepted, and lines that are not
# assignments are ignored. Values are taken literally (no word splitting,
# globbing or command substitution), so they may contain spaces or '#'.
# For unquoted values a trailing " # comment" is removed; one pair of
# matching quotes around the whole value is stripped.
load_env_file() {
  local env_file=$1
  local line key value
  # Regexes live in variables so the shell does not try to parse them.
  local -r skip_re='^[[:space:]]*(#.*)?$'
  local -r assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$'

  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'} # tolerate CRLF line endings
    if [[ "$line" =~ $skip_re ]]; then
      continue
    fi
    if [[ "$line" =~ $assign_re ]]; then
      key=${BASH_REMATCH[2]}
      value=${BASH_REMATCH[3]}
      if [[ "$value" != \"* && "$value" != \'* ]]; then
        # Unquoted value: drop an inline comment introduced by whitespace + '#'.
        value=${value%%[[:space:]]\#*}
      fi
      # Trim trailing whitespace.
      value=${value%"${value##*[![:space:]]}"}
      if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
        value=${value:1:${#value}-2}
      fi
      export "$key=$value"
    fi
  done < "$env_file"
}

if [[ -f db.env ]]; then
  load_env_file db.env
fi

# Check if required environment variables are set, and report which are not.
missing_vars=()
for required_var in DB_HOST DB_PORT DB_NAME DB_USER DB_PASSWORD; do
  # ${!name:-} reads the variable named by $required_var, empty if unset.
  if [[ -z "${!required_var:-}" ]]; then
    missing_vars+=("$required_var")
  fi
done
if (( ${#missing_vars[@]} > 0 )); then
  echo "One or more required environment variables are missing: ${missing_vars[*]}" >&2
  exit 1
fi

# Path to the sample Parquet file
PARQUET_FILE="nyc_sample.parquet"
Expand Down
17 changes: 11 additions & 6 deletions postgres/load_parquet_chunks.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/bin/bash

# Database connection details
DB_HOST="localhost"
DB_PORT="5439"
DB_NAME="postgis"
DB_USER="username"
DB_PASSWORD="password"

# Load environment variables from db.env file.
#
# Exports every KEY=VALUE assignment found in the file given as $1.
# Blank lines and lines whose first non-blank character is '#' are skipped,
# an optional leading "export " is accepted, and lines that are not
# assignments are ignored. Values are taken literally (no word splitting,
# globbing or command substitution), so they may contain spaces or '#'.
# For unquoted values a trailing " # comment" is removed; one pair of
# matching quotes around the whole value is stripped.
load_env_file() {
  local env_file=$1
  local line key value
  # Regexes live in variables so the shell does not try to parse them.
  local -r skip_re='^[[:space:]]*(#.*)?$'
  local -r assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$'

  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'} # tolerate CRLF line endings
    if [[ "$line" =~ $skip_re ]]; then
      continue
    fi
    if [[ "$line" =~ $assign_re ]]; then
      key=${BASH_REMATCH[2]}
      value=${BASH_REMATCH[3]}
      if [[ "$value" != \"* && "$value" != \'* ]]; then
        # Unquoted value: drop an inline comment introduced by whitespace + '#'.
        value=${value%%[[:space:]]\#*}
      fi
      # Trim trailing whitespace.
      value=${value%"${value##*[![:space:]]}"}
      if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
        value=${value:1:${#value}-2}
      fi
      export "$key=$value"
    fi
  done < "$env_file"
}

if [[ -f db.env ]]; then
  load_env_file db.env
fi

# Check if required environment variables are set, and report which are not.
missing_vars=()
for required_var in DB_HOST DB_PORT DB_NAME DB_USER DB_PASSWORD; do
  # ${!name:-} reads the variable named by $required_var, empty if unset.
  if [[ -z "${!required_var:-}" ]]; then
    missing_vars+=("$required_var")
  fi
done
if (( ${#missing_vars[@]} > 0 )); then
  echo "One or more required environment variables are missing: ${missing_vars[*]}" >&2
  exit 1
fi

# Directory containing the Parquet chunks
CHUNKS_DIR="parquet_chunks"
Expand Down
30 changes: 30 additions & 0 deletions postgres/load_to_prod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Append the Space2Stats parquet file to a PostgreSQL table using ogr2ogr.
#
# Usage: ./load_to_prod.sh
#   Run from the directory that holds db.env and space2stats_updated.parquet.
#
# Required variables (read from db.env when that file exists, otherwise taken
# from the calling environment):
#   DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD

set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Export every KEY=VALUE assignment found in the file given as $1.
#
# Blank lines and lines whose first non-blank character is '#' are skipped,
# an optional leading "export " is accepted, and lines that are not
# assignments are ignored. Values are taken literally (no word splitting,
# globbing or command substitution), so they may contain spaces or '#'.
# For unquoted values a trailing " # comment" is removed; one pair of
# matching quotes around the whole value is stripped.
load_env_file() {
  local env_file=$1
  local line key value
  # Regexes live in variables so the shell does not try to parse them.
  local -r skip_re='^[[:space:]]*(#.*)?$'
  local -r assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$'

  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'} # tolerate CRLF line endings
    if [[ "$line" =~ $skip_re ]]; then
      continue
    fi
    if [[ "$line" =~ $assign_re ]]; then
      key=${BASH_REMATCH[2]}
      value=${BASH_REMATCH[3]}
      if [[ "$value" != \"* && "$value" != \'* ]]; then
        # Unquoted value: drop an inline comment introduced by whitespace + '#'.
        value=${value%%[[:space:]]\#*}
      fi
      # Trim trailing whitespace.
      value=${value%"${value##*[![:space:]]}"}
      if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
        value=${value:1:${#value}-2}
      fi
      export "$key=$value"
    fi
  done < "$env_file"
}

# Load environment variables from db.env file
if [[ -f db.env ]]; then
  load_env_file db.env
fi

# Check if required environment variables are set, and report which are not.
missing_vars=()
for required_var in DB_HOST DB_PORT DB_NAME DB_USER DB_PASSWORD; do
  # ${!name:-} reads the variable named by $required_var, empty if unset
  # (the default keeps "set -u" from aborting before we can report it).
  if [[ -z "${!required_var:-}" ]]; then
    missing_vars+=("$required_var")
  fi
done
if (( ${#missing_vars[@]} > 0 )); then
  die "One or more required environment variables are missing: ${missing_vars[*]}"
fi

# Name of the target table
readonly TABLE_NAME="space2stats"
# Parquet file to ingest, resolved relative to the current directory
readonly PARQUET_FILE="space2stats_updated.parquet"

# Fail early with a clear message instead of a late ogr2ogr error.
[[ -f "$PARQUET_FILE" ]] || die "Parquet file not found: $PARQUET_FILE"
command -v ogr2ogr > /dev/null 2>&1 \
  || die "ogr2ogr (GDAL) is required but was not found on PATH."

echo "Starting"

# The password is handed over through PGPASSWORD (read by libpq) rather than
# the connection string, so it does not show up in the process list and may
# contain spaces or quotes.
PGPASSWORD="$DB_PASSWORD" ogr2ogr -progress -f "PostgreSQL" \
  PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER" \
  "$PARQUET_FILE" \
  -nln "$TABLE_NAME" \
  -append \
  -lco SPATIAL_INDEX=NONE

Loading
Loading