Improve query management in BigQuery integration
mkuthan committed Nov 22, 2024
1 parent 22762bd commit 55e5f7b
Showing 3 changed files with 26 additions and 5 deletions.
7 changes: 7 additions & 0 deletions .streamlit/config.toml
@@ -6,6 +6,13 @@ gatherUsageStats = false
# Show only options set externally (e.g. through st.set_page_config)
toolbarMode = "minimal"

[logger]
# Info is the default logging level, kept here for reference
level = "info"

# Add logging level and logger name to make logs more complete
messageFormat = "%(asctime)s %(levelname) -7s %(name)s: %(message)s"

[runner]
# Raise an exception when unserializable data is added to Session State.
enforceSerializableSessionState = true
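The `messageFormat` value above uses the standard Python `logging` format fields, so its effect can be previewed outside Streamlit. A minimal standalone sketch (the logger name is illustrative, not taken from the project):

```python
import logging

# Demo of the messageFormat configured in .streamlit/config.toml;
# Streamlit applies an equivalent format to its own loggers.
fmt = "%(asctime)s %(levelname) -7s %(name)s: %(message)s"

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(fmt))

# Illustrative logger name
logger = logging.getLogger("example.infrastructure.big_query")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

logger.info("query submitted")
# e.g.: 2024-11-22 12:00:00,000 INFO    example.infrastructure.big_query: query submitted
```

The ` -7s` width padding keeps the level column aligned, which makes multi-level logs easier to scan.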
3 changes: 1 addition & 2 deletions README.md
@@ -28,15 +28,14 @@ This project demonstrates how to leverage the built-in power of Streamlit using
* πŸ—‚οΈ BigQuery integration using the New York Taxi public dataset
* πŸ”’ Authentication skeleton, easily replaceable with OAuth
* πŸ”— Application state sharing via URL
* πŸ’Ύ Dataframe export to CSV and XLS
* πŸ’Ύ Dataframe export buttons to CSV and XLS

### TODO

* 🐳 Create Docker image
* πŸ§ͺ Implement BigQuery integration tests
* πŸ“ˆ Add more visualizations for integrated public dataset
* πŸ” Integration with external OAuth provider, see [roadmap](https://roadmap.streamlit.app/)
* πŸ“‹ Better table with sorting and filtering
* πŸ“ Add request logging
* πŸ”„ Redirect to the original page after login
* βš–οΈ Describe load balancer strategies, for example: sticky session
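The dataframe-export feature from the list above typically boils down to serializing the dataframe and handing the bytes to a Streamlit download button. A hedged sketch with illustrative data (the `zone`/`trips` columns are not from the project):

```python
import pandas as pd

# Illustrative data; the real app queries the New York Taxi public dataset.
df = pd.DataFrame({"zone": ["Midtown", "JFK Airport"], "trips": [120, 45]})

# CSV payload for a Streamlit download button, wired up roughly as:
# st.download_button("Download CSV", csv_bytes, file_name="trips.csv", mime="text/csv")
csv_bytes = df.to_csv(index=False).encode("utf-8")
```

XLS export works the same way, swapping `to_csv` for `to_excel` with an in-memory buffer and the matching MIME type.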
21 changes: 18 additions & 3 deletions example/infrastructure/big_query.py
@@ -1,8 +1,16 @@
import pandas as pd
from google.cloud import bigquery

# Don't allow unbounded results to avoid OOM errors
_MAX_RESULTS = 100_000

# Limit the time a query can run to avoid long waits
_JOB_TIMEOUT_MS = 60_000

# Define labels for better query management
_JOB_LABELS = {"application": "example-streamlit"}


# TODO: add error handling, define timeouts, etc.
def query(q: str, params: dict = None) -> pd.DataFrame:
client = __get_client()

@@ -11,10 +19,17 @@ def query(q: str, params: dict = None) -> pd.DataFrame:
# TODO: add support for other types
bigquery.ScalarQueryParameter(name, "STRING", value)
for name, value in (params or {}).items()
]
],
job_timeout_ms=_JOB_TIMEOUT_MS,
labels=_JOB_LABELS,
)

# The default timeout is None for a good reason; see the sources for details.
# The default retry and job_retry policies look good, so we don't change them.
results = client.query(q, job_config=job_config)
return results.to_dataframe()

# Use regular Job instead of Storage API to avoid costs
return results.to_dataframe(max_results=_MAX_RESULTS, create_bqstorage_client=False)


def __get_client() -> bigquery.Client:
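The remaining TODO in the diff ("add support for other types") could be addressed by inferring the BigQuery scalar type from each Python value. A hedged sketch; `infer_param_type` and `_PARAM_TYPES` are hypothetical names, not part of the module:

```python
# Map Python types to BigQuery scalar parameter types; a sketch for the
# "add support for other types" TODO, not the module's actual behavior.
_PARAM_TYPES = {bool: "BOOL", int: "INT64", float: "FLOAT64", str: "STRING"}


def infer_param_type(value) -> str:
    # bool is listed before int because bool is a subclass of int
    for py_type, bq_type in _PARAM_TYPES.items():
        if isinstance(value, py_type):
            return bq_type
    return "STRING"  # fall back to the module's current default
```

With something like this, the comprehension that builds `ScalarQueryParameter` objects could pass `infer_param_type(value)` instead of the hard-coded `"STRING"`.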
