Skip to content

Commit

Permalink
Merge branch 'main' into pre-commit-ci-update-config
Browse files Browse the repository at this point in the history
  • Loading branch information
RaczeQ authored Sep 23, 2024
2 parents 34faec8 + fea4cfd commit 763c944
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 48 deletions.
31 changes: 30 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.10.0] - 2024-09-23

### Changed

- **BREAKING** Changed the minimum required number of points in a polygon from 3 to 4
- Added removal of repeated points in linestrings

### Fixed

- Removed support for yanked polars version `1.7.0`

## [0.9.4] - 2024-09-11

### Changed

- Excluded DuckDB `1.1.0` version from dependencies

## [0.9.3] - 2024-09-10

### Removed

- `geoarrow-rust-core` from dependencies

## [0.9.2] - 2024-08-28

### Changed
Expand Down Expand Up @@ -350,7 +373,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Created QuackOSM repository
- Implemented PbfFileReader

[Unreleased]: https://github.com/kraina-ai/quackosm/compare/0.9.2...HEAD
[Unreleased]: https://github.com/kraina-ai/quackosm/compare/0.10.0...HEAD

[0.10.0]: https://github.com/kraina-ai/quackosm/compare/0.9.4...0.10.0

[0.9.4]: https://github.com/kraina-ai/quackosm/compare/0.9.3...0.9.4

[0.9.3]: https://github.com/kraina-ai/quackosm/compare/0.9.2...0.9.3

[0.9.2]: https://github.com/kraina-ai/quackosm/compare/0.9.1...0.9.2

Expand Down
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,20 +70,18 @@ QuackOSM supports **Python >= 3.9**

Required:

- `duckdb (>=0.10.2)`: For all DuckDB operations on PBF files
- `duckdb (>=0.10.2, <1.1.0)`: For all DuckDB operations on PBF files

- `pyarrow (>=16.0.0)`: For parquet files wrangling

- `geoarrow-pyarrow (>=0.1.2)`: For GeoParquet IO operations
- `geoarrow-pyarrow (>=0.1.2)`: For GeoParquet IO operations and transforming Arrow data to Shapely objects

- `geoarrow-pandas (>=0.1.1)`: For GeoParquet integration with GeoPandas

- `geopandas (>=0.6)`: For returning GeoDataFrames and reading Geo files

- `shapely (>=2.0)`: For parsing WKT and GeoJSON strings and fixing geometries

- `geoarrow-rust-core (>=0.2)`: For transforming Arrow data to Shapely objects

- `polars (>=0.19.4)`: For faster OSM ways grouping operation

- `typeguard (>=3.0)`: For internal validation of types
Expand Down
17 changes: 1 addition & 16 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 4 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
[project]
name = "QuackOSM"
version = "0.9.2"
version = "0.10.0"
description = "An open-source tool for reading OpenStreetMap PBF files using DuckDB"
authors = [{ name = "Kamil Raczycki", email = "[email protected]" }]
dependencies = [
"geopandas>=0.6",
"shapely>=2",
"pyarrow>=16.0.0",
"duckdb>=0.10.2",
"duckdb>=0.10.2,<1.1.0",
"geoarrow-pyarrow>=0.1.2",
"geoarrow-pandas>=0.1.1",
"typeguard>=3.0.0",
Expand All @@ -16,9 +16,8 @@ dependencies = [
"tqdm>=4.42.0",
"beautifulsoup4",
"requests",
"polars>=0.19.4",
"polars>=0.19.4,!=1.7.0",
"rich>=12.0.0",
"geoarrow-rust-core>=0.2.0",
"geopy>=2.0.0",
"numpy>=1.26.0",
]
Expand Down Expand Up @@ -171,7 +170,7 @@ close-quotes-on-newline = true
wrap-one-line = true

[tool.bumpver]
current_version = "0.9.2"
current_version = "0.10.0"
version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]"
commit_message = "chore(CI/CD): bump version {old_version} -> {new_version}"
commit = true
Expand Down
2 changes: 1 addition & 1 deletion quackosm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from quackosm.pbf_file_reader import PbfFileReader

__app_name__ = "QuackOSM"
__version__ = "0.9.2"
__version__ = "0.10.0"

__all__ = [
"PbfFileReader",
Expand Down
12 changes: 8 additions & 4 deletions quackosm/_intersection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from pathlib import Path
from typing import Optional

import geoarrow.pyarrow as ga
import pyarrow as pa
from geoarrow.rust.core import PointArray
from shapely import STRtree
from shapely.geometry.base import BaseGeometry

Expand All @@ -15,11 +15,15 @@ def _intersect_nodes(
table: pa.Table,
geometry_filter: BaseGeometry,
) -> pa.Table: # pragma: no cover
points_array = PointArray.from_xy(
x=table["lon"].combine_chunks(), y=table["lat"].combine_chunks()
points_array = ga.to_geopandas(
ga.point().from_geobuffers(
None,
x=table["lon"].to_numpy(),
y=table["lat"].to_numpy(),
)
)

tree = STRtree(points_array.to_shapely())
tree = STRtree(points_array)

intersecting_ids_array = table["id"].take(tree.query(geometry_filter, predicate="intersects"))

Expand Down
16 changes: 10 additions & 6 deletions quackosm/pbf_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,9 @@ def _get_filtered_ways_with_proper_geometry(
-- if first and last nodes are the same
ST_Equals(linestring[1]::POINT_2D, linestring[-1]::POINT_2D)
-- if linestring has at least 4 points after removing repeated points
AND len(linestring) >= 3
AND ST_NPoints(ST_RemoveRepeatedPoints(
linestring::struct(x DECIMAL(10, 7), y DECIMAL(10, 7))[]::LINESTRING_2D
)) >= 4
-- if the element doesn't have any tags leave it as a Linestring
AND raw_tags IS NOT NULL
-- if the element is specifically tagged 'area':'no' -> LineString
Expand Down Expand Up @@ -2106,7 +2108,7 @@ def _get_filtered_relations_with_geometry(
GROUP BY id, ref_role
) x
JOIN any_outer_refs aor ON aor.id = x.id
WHERE ST_NPoints(geom) >= 4
WHERE ST_NPoints(ST_RemoveRepeatedPoints(geom)) >= 4
),
valid_relations AS (
SELECT id, is_valid
Expand All @@ -2132,7 +2134,7 @@ def _get_filtered_relations_with_geometry(
)
relation_inner_parts = self.connection.sql(
f"""
SELECT id, geometry_id, ST_MakePolygon(geometry) geometry
SELECT id, geometry_id, ST_MakePolygon(ST_RemoveRepeatedPoints(geometry)) geometry
FROM ({valid_relation_parts_parquet.sql_query()})
WHERE ref_role = 'inner'
"""
Expand All @@ -2144,7 +2146,7 @@ def _get_filtered_relations_with_geometry(
)
relation_outer_parts = self.connection.sql(
f"""
SELECT id, geometry_id, ST_MakePolygon(geometry) geometry
SELECT id, geometry_id, ST_MakePolygon(ST_RemoveRepeatedPoints(geometry)) geometry
FROM ({valid_relation_parts_parquet.sql_query()})
WHERE ref_role = 'outer'
"""
Expand Down Expand Up @@ -2714,13 +2716,15 @@ def _set_up_duckdb_connection(
connection.sql(
"""
CREATE OR REPLACE MACRO linestring_to_linestring_geometry(ls) AS
ls::struct(x DECIMAL(10, 7), y DECIMAL(10, 7))[]::LINESTRING_2D::GEOMETRY;
ST_RemoveRepeatedPoints(
ls::struct(x DECIMAL(10, 7), y DECIMAL(10, 7))[]::LINESTRING_2D
)::GEOMETRY;
"""
)
connection.sql(
"""
CREATE OR REPLACE MACRO linestring_to_polygon_geometry(ls) AS
[ls::struct(x DECIMAL(10, 7), y DECIMAL(10, 7))[]]::POLYGON_2D::GEOMETRY;
ST_MakePolygon(linestring_to_linestring_geometry(ls));
"""
)

Expand Down
13 changes: 9 additions & 4 deletions tests/base/test_intersection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from pathlib import Path

import duckdb
import geoarrow.pyarrow as ga
import pyarrow as pa
import pyarrow.parquet as pq
from geoarrow.rust.core import PointArray

from quackosm import geocode_to_geometry
from quackosm._intersection import intersect_nodes_with_geometry
Expand Down Expand Up @@ -38,9 +38,14 @@ def test_nodes_intersection() -> None:
"""
)
nodes_points = pq.ParquetDataset(nodes_destination).read()
points_array = PointArray.from_xy(
x=nodes_points["lon"].combine_chunks(), y=nodes_points["lat"].combine_chunks()
).to_shapely()
points_array = ga.to_geopandas(
ga.point().from_geobuffers(
None,
x=nodes_points["lon"].to_numpy(),
y=nodes_points["lat"].to_numpy(),
)
)

intersecting_points_mask = geom_filter.intersects(points_array)

intersecting_ids_array = (
Expand Down
65 changes: 58 additions & 7 deletions tests/base/test_pbf_file_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pytest
from parametrization import Parametrization as P
from pytest_mock import MockerFixture
from shapely import from_wkt, hausdorff_distance
from shapely import from_wkt, get_coordinates, hausdorff_distance
from shapely.geometry import (
GeometryCollection,
LinearRing,
Expand Down Expand Up @@ -647,7 +647,7 @@ def check_if_relation_in_osm_is_valid_based_on_geometry(pbf_file: str, relation_
GROUP BY unnested_relations.id, unnested_relations.ref_role
) x
JOIN any_outer_refs aor ON aor.id = x.id
WHERE ST_NPoints(geom) >= 4
WHERE ST_NPoints(ST_RemoveRepeatedPoints(geom)) >= 4
),
valid_relations AS (
SELECT id, is_valid
Expand Down Expand Up @@ -827,6 +827,21 @@ def test_gdal_parity(extract_name: str) -> None:

invalid_geometries_df = joined_df

invalid_geometries_df["duckdb_geometry_type"] = invalid_geometries_df.apply(
lambda x: x.duckdb_geometry.geom_type,
axis=1,
)
invalid_geometries_df["gdal_geometry_type"] = invalid_geometries_df.apply(
lambda x: x.gdal_geometry.geom_type,
axis=1,
)
invalid_geometries_df["duckdb_geometry_num_points"] = invalid_geometries_df[
"duckdb_geometry"
].apply(lambda x: len(get_coordinates(x)))
invalid_geometries_df["gdal_geometry_num_points"] = invalid_geometries_df[
"gdal_geometry"
].apply(lambda x: len(get_coordinates(x)))

# Check if both geometries are closed or open
invalid_geometries_df["duckdb_is_closed"] = invalid_geometries_df["duckdb_geometry"].apply(
lambda x: x.is_closed
Expand Down Expand Up @@ -954,17 +969,37 @@ def test_gdal_parity(extract_name: str) -> None:
invalid_geometries_df["hausdorff_distance_value"] < 1e-10
)

# Check if geometries are the same type and close, but have a different number
# of points (e.g. where the DuckDB version is simplified): compares
# duckdb_geometry_num_points with gdal_geometry_num_points
invalid_geometries_df.loc[
invalid_geometries_df["geometry_close_hausdorff_distance"],
"is_duckdb_geometry_the_same_type_but_different_number_of_points",
] = invalid_geometries_df.loc[invalid_geometries_df["geometry_close_hausdorff_distance"]].apply(
lambda x: x.duckdb_geometry_type == x.gdal_geometry_type
and x.duckdb_geometry_num_points != x.gdal_geometry_num_points,
axis=1,
)
invalid_geometries_df = invalid_geometries_df.loc[
~(
invalid_geometries_df["geometry_close_hausdorff_distance"]
& invalid_geometries_df[
"is_duckdb_geometry_the_same_type_but_different_number_of_points"
]
)
]

# Check if GDAL geometry is a linestring while DuckDB geometry is a polygon
invalid_geometries_df.loc[
invalid_geometries_df["geometry_close_hausdorff_distance"],
"is_duckdb_polygon_and_gdal_linestring",
] = invalid_geometries_df.loc[invalid_geometries_df["geometry_close_hausdorff_distance"]].apply(
lambda x: x.duckdb_geometry.geom_type
lambda x: x.duckdb_geometry_type
in (
"Polygon",
"MultiPolygon",
)
and x.gdal_geometry.geom_type in ("LineString", "MultiLineString"),
and x.gdal_geometry_type in ("LineString", "MultiLineString"),
axis=1,
)

Expand Down Expand Up @@ -1008,12 +1043,12 @@ def test_gdal_parity(extract_name: str) -> None:
invalid_geometries_df["geometry_close_hausdorff_distance"],
"is_duckdb_linestring_and_gdal_polygon",
] = invalid_geometries_df.loc[invalid_geometries_df["geometry_close_hausdorff_distance"]].apply(
lambda x: x.duckdb_geometry.geom_type
lambda x: x.duckdb_geometry_type
in (
"LineString",
"MultiLineString",
)
and x.gdal_geometry.geom_type in ("Polygon", "MultiPolygon"),
and x.gdal_geometry_type in ("Polygon", "MultiPolygon"),
axis=1,
)

Expand Down Expand Up @@ -1048,11 +1083,27 @@ def test_gdal_parity(extract_name: str) -> None:
)
)

# Check if DuckDB geometry should be a linestring and not a polygon
# based on minimal number of points
invalid_geometries_df.loc[
invalid_geometries_df["geometry_close_hausdorff_distance"]
& invalid_geometries_df["is_duckdb_linestring_and_gdal_polygon"],
"has_less_than_4_points",
] = invalid_geometries_df.loc[
invalid_geometries_df["geometry_close_hausdorff_distance"]
& invalid_geometries_df["is_duckdb_linestring_and_gdal_polygon"]
].apply(
lambda x: x.duckdb_geometry_num_points < 4, axis=1
)

invalid_geometries_df = invalid_geometries_df.loc[
~(
invalid_geometries_df["geometry_close_hausdorff_distance"]
& invalid_geometries_df["is_duckdb_linestring_and_gdal_polygon"]
& invalid_geometries_df["is_not_in_filter_tag_value"]
& (
invalid_geometries_df["is_not_in_filter_tag_value"]
| invalid_geometries_df["has_less_than_4_points"]
)
)
]
if invalid_geometries_df.empty:
Expand Down

0 comments on commit 763c944

Please sign in to comment.