Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add timeout options to cli/module #57

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ entire dataset for the specified type will be downloaded
* `--type`/`-t` (required): The Overture map data type to be downloaded. Examples of types are `building`
for building footprints, `place` for POI places data, etc. Run `overturemaps download --help` for the
complete list of allowed types
* `--connect_timeout` (optional): Socket connection timeout, in seconds. If omitted, the AWS SDK default value is used (typically 1 second).
* `--request_timeout` (optional): Socket read timeout on Windows and macOS, in seconds. If omitted, the AWS SDK default value is used (typically 3 seconds). This option is ignored on non-Windows, non-macOS systems.

This downloads data directly from Overture's S3 bucket without interacting with any other servers.
By including bounding box extents on each row in the Overture distribution, the underlying Parquet
Expand Down
6 changes: 4 additions & 2 deletions overturemaps/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,13 @@ def cli():
type=click.Choice(get_all_overture_types()),
required=True,
)
def download(bbox, output_format, output, type_):
@click.option("--connect_timeout", required=False, type=int)
@click.option("--request_timeout", required=False, type=int)
def download(bbox, output_format, output, type_, connect_timeout, request_timeout):
if output is None:
output = sys.stdout

reader = record_batch_reader(type_, bbox)
reader = record_batch_reader(type_, bbox, connect_timeout, request_timeout)
if reader is None:
return

Expand Down
29 changes: 24 additions & 5 deletions overturemaps/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@
import pyarrow.fs as fs

# Allows for optional import of additional dependencies
try:
try:
import geopandas as gpd
from geopandas import GeoDataFrame

HAS_GEOPANDAS = True
except ImportError:
HAS_GEOPANDAS = False
GeoDataFrame = None

def record_batch_reader(overture_type, bbox=None) -> Optional[pa.RecordBatchReader]:

def record_batch_reader(
overture_type, bbox=None, connect_timeout=None, request_timeout=None
) -> Optional[pa.RecordBatchReader]:
"""
Return a pyarrow RecordBatchReader for the desired bounding box and s3 path
"""
Expand All @@ -32,7 +36,13 @@ def record_batch_reader(overture_type, bbox=None) -> Optional[pa.RecordBatchRead
filter = None

dataset = ds.dataset(
path, filesystem=fs.S3FileSystem(anonymous=True, region="us-west-2")
path,
filesystem=fs.S3FileSystem(
anonymous=True,
region="us-west-2",
connect_timeout=connect_timeout,
request_timeout=request_timeout,
),
)
batches = dataset.to_batches(filter=filter)

Expand All @@ -48,14 +58,22 @@ def record_batch_reader(overture_type, bbox=None) -> Optional[pa.RecordBatchRead
reader = pa.RecordBatchReader.from_batches(geoarrow_schema, non_empty_batches)
return reader

def geodataframe(overture_type: str, bbox: (float, float, float, float) = None) -> GeoDataFrame:

def geodataframe(
overture_type: str,
bbox: (float, float, float, float) = None,
connect_timeout: int = None,
request_timeout: int = None,
) -> GeoDataFrame:
"""
Loads geoparquet for specified type into a geopandas dataframe

Parameters
----------
overture_type: type to load
bbox: optional bounding box for data fetch (xmin, ymin, xmax, ymax)
connect_timeout: optional connection timeout in seconds
request_timeout: optional request timeout in seconds

Returns
-------
Expand All @@ -65,9 +83,10 @@ def geodataframe(overture_type: str, bbox: (float, float, float, float) = None)
if not HAS_GEOPANDAS:
raise ImportError("geopandas is required to use this function")

reader = record_batch_reader(overture_type, bbox)
reader = record_batch_reader(overture_type, bbox, connect_timeout, request_timeout)
return gpd.GeoDataFrame.from_arrow(reader)


def geoarrow_schema_adapter(schema: pa.Schema) -> pa.Schema:
"""
Convert a geoarrow-compatible schema to a proper geoarrow schema
Expand Down