Merge pull request #1283 from cal-itp/fix-shn-segments
update shared_utils and segment_speed_utils / fix SHN segments
tiffanychu90 authored Nov 7, 2024
2 parents 16c44dd + fc6cd14 commit 6adecac
Showing 27 changed files with 467 additions and 386 deletions.
2 changes: 1 addition & 1 deletion _shared_utils/setup.py
@@ -4,7 +4,7 @@
setup(
    name="shared_utils",
    packages=find_packages(),
-    version="2.8",
+    version="3.0",
    description="Shared utility functions for data analyses",
    author="Cal-ITP",
    license="Apache",
4 changes: 4 additions & 0 deletions _shared_utils/shared_utils/__init__.py
@@ -2,22 +2,26 @@
    arcgis_query,
    catalog_utils,
    dask_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    schedule_rt_utils,
    time_helpers,
)

__all__ = [
    "arcgis_query",
    "catalog_utils",
    "dask_utils",
    "geo_utils",
    "gtfs_utils_v2",
    "portfolio_utils",
    "publish_utils",
    "rt_dates",
    "rt_utils",
    "schedule_rt_utils",
    "time_helpers",
]
177 changes: 177 additions & 0 deletions _shared_utils/shared_utils/geo_utils.py
@@ -0,0 +1,177 @@
"""
Geospatial utility functions
"""
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
from calitp_data_analysis import geography_utils
from scipy.spatial import KDTree
from shared_utils import rt_utils

# Could we use distance to filter for nearest neighbor?
# It can make the length of results more unpredictable...maybe we stick to
# k_neighbors and keep the nearest k, so that we can at least be
# more consistent with the arrays returned
# Approximate ground distance per decimal degree of arc,
# derived from Earth's radius of ~6,371 km (~3,959 mi)
geo_const_meters = 6_371_000 * np.pi / 180
geo_const_miles = 3_959 * np.pi / 180

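These constants convert an angular distance in decimal degrees (what a KDTree query returns for lat/lon coordinates) into an approximate ground distance. A quick worked illustration, not part of the commit:

import numpy as np

geo_const_meters = 6_371_000 * np.pi / 180

# ~0.01 degrees of arc is roughly 1.1 km on the ground
print(0.01 * geo_const_meters)  # ~1111.9 meters
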

def nearest_snap(line: shapely.LineString, point: shapely.Point, k_neighbors: int = 1) -> np.ndarray:
    """
    Based off of this function, but we want to return
    the index value rather than the point:
    https://github.com/UTEL-UIUC/gtfs_segments/blob/main/gtfs_segments/geom_utils.py
    """
    line = np.asarray(line.coords)
    point = np.asarray(point.coords)
    tree = KDTree(line)

    # np_dist is the array of distances of the result (don't return it);
    # np_inds is the array of indices of the result
    _, np_inds = tree.query(
        point,
        workers=-1,
        k=k_neighbors,
    )

    return np_inds.squeeze()

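A minimal usage sketch (not part of the commit; assumes shared_utils is importable): snapping a point to a simple line returns the index of the nearest line vertex.

import shapely
from shared_utils import geo_utils

line = shapely.LineString([(0, 0), (1, 0), (2, 0), (3, 0)])
point = shapely.Point(1.9, 0.4)

print(geo_utils.nearest_snap(line, point, k_neighbors=1))  # 2, since vertex (2, 0) is closest
print(geo_utils.nearest_snap(line, point, k_neighbors=2))  # the 2 nearest vertex indices: [2 1]
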

def vp_as_gdf(vp: pd.DataFrame, crs: str = "EPSG:3310") -> gpd.GeoDataFrame:
    """
    Turn vp into a gdf and project it to EPSG:3310 (or the CRS given).
    """
    vp_gdf = (
        geography_utils.create_point_geometry(vp, longitude_col="x", latitude_col="y", crs=geography_utils.WGS84)
        .to_crs(crs)
        .drop(columns=["x", "y"])
    )

    return vp_gdf

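A hedged usage sketch with made-up coordinates; the x/y column names are what the function expects, and trip_instance_key is just an illustrative identifier column:

import pandas as pd
from shared_utils import geo_utils

vp = pd.DataFrame({"trip_instance_key": ["a", "a"], "x": [-118.24, -118.25], "y": [34.05, 34.06]})
vp_gdf = geo_utils.vp_as_gdf(vp)

print(vp_gdf.crs)             # EPSG:3310
print(vp_gdf.columns.tolist())  # x/y are dropped in favor of a point geometry column
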

def add_arrowized_geometry(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add a column where the segment geometry is arrowized.
    """
    segment_geom = gpd.GeoSeries(gdf.geometry)
    CRS = gdf.crs.to_epsg()

    # TODO: parallel_offset is deprecated in shapely 2.0; offset_curve is its replacement
    geom_parallel = gpd.GeoSeries([rt_utils.try_parallel(i) for i in segment_geom], crs=CRS)
    # geom_parallel = gpd.GeoSeries(
    #     [i.offset_curve(30) for i in segment_geom],
    #     crs=CRS
    # )

    geom_arrowized = rt_utils.arrowize_segment(geom_parallel, buffer_distance=20)

    gdf = gdf.assign(geometry_arrowized=geom_arrowized)

    return gdf

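A sketch of the expected call pattern (not part of the commit; assumes segment geometries in a projected CRS, and takes the behavior of rt_utils.try_parallel / rt_utils.arrowize_segment on faith from the imports above):

import geopandas as gpd
import shapely
from shared_utils import geo_utils

segments = gpd.GeoDataFrame(geometry=[shapely.LineString([(0, 0), (100, 0)])], crs="EPSG:3310")
arrowized = geo_utils.add_arrowized_geometry(segments)

print(arrowized.columns.tolist())  # ['geometry', 'geometry_arrowized']
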

def get_direction_vector(start: shapely.geometry.Point, end: shapely.geometry.Point) -> tuple:
    """
    Given 2 points (in a projected CRS...not WGS84), return a
    tuple that shows (delta_x, delta_y).
    https://www.varsitytutors.com/precalculus-help/find-a-direction-vector-when-given-two-points
    https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
    """
    return ((end.x - start.x), (end.y - start.y))


def distill_array_into_direction_vector(array: np.ndarray) -> tuple:
    """
    Given an array of n items, take its start and end.
    From the start/end, we can turn 2 coordinate points into 1
    direction vector, a tuple of (delta_x, delta_y).
    """
    origin = array[0]
    destination = array[-1]
    return get_direction_vector(origin, destination)


def get_vector_norm(vector: tuple) -> float:
    """
    Get the length of the vector via the Pythagorean Theorem:
    sum the squares of the components, then take the square root.
    Dividing a vector by this norm yields the unit (normalized) vector.
    """
    return np.sqrt(vector[0] ** 2 + vector[1] ** 2)


def get_normalized_vector(vector: tuple) -> tuple:
    """
    Normalize the vector: divide each component by the vector's norm.
    https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector
    """
    norm = get_vector_norm(vector)

    return (vector[0] / norm, vector[1] / norm)


def dot_product(vec1: tuple, vec2: tuple) -> float:
    """
    Take the dot product: multiply the x components,
    multiply the y components, and sum them up.
    """
    return vec1[0] * vec2[0] + vec1[1] * vec2[1]

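Chained together, these helpers compare the headings of two coordinate arrays: the dot product of two normalized direction vectors is the cosine of the angle between them (1 = same direction, 0 = perpendicular, -1 = opposite). A sketch, not part of the commit:

import shapely
from shared_utils import geo_utils

seg1 = [shapely.Point(0, 0), shapely.Point(5, 0), shapely.Point(10, 0)]  # heads east
seg2 = [shapely.Point(0, 0), shapely.Point(0, 3), shapely.Point(0, 6)]   # heads north

vec1 = geo_utils.get_normalized_vector(geo_utils.distill_array_into_direction_vector(seg1))
vec2 = geo_utils.get_normalized_vector(geo_utils.distill_array_into_direction_vector(seg2))

print(geo_utils.dot_product(vec1, vec2))  # 0.0 -- perpendicular headings
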

def segmentize_by_indices(line_geometry: shapely.LineString, start_idx: int, end_idx: int) -> shapely.LineString:
    """
    Cut a line according to index values.
    Similar in spirit to shapely.ops.substring, which cuts a line
    at specified distances; here we don't have specified distances,
    but we want to customize where to segment the line.
    """
    all_coords = shapely.get_coordinates(line_geometry)

    # all_coords is an (N, 2) array, so check against the number of
    # rows; include the end_idx coordinate whenever it exists
    if end_idx + 1 > all_coords.shape[0]:
        subset_coords = all_coords[start_idx:end_idx]
    else:
        subset_coords = all_coords[start_idx : end_idx + 1]

    if len(subset_coords) < 2:
        return shapely.LineString()
    else:
        return shapely.LineString([shapely.Point(i) for i in subset_coords])

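Paired with nearest_snap, this can cut a shape between two snapped vehicle positions. A sketch with made-up coordinates, not part of the commit:

import shapely
from shared_utils import geo_utils

shape = shapely.LineString([(0, 0), (1, 0), (2, 0), (3, 0), (4, 0)])

start_idx = int(geo_utils.nearest_snap(shape, shapely.Point(0.9, 0.1)))  # 1
end_idx = int(geo_utils.nearest_snap(shape, shapely.Point(3.1, -0.1)))   # 3

print(geo_utils.segmentize_by_indices(shape, start_idx, end_idx))  # LINESTRING (1 0, 2 0, 3 0)
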

def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
    """
    Use the current postmile as the starting geometry / segment
    beginning and the subsequent postmile (based on odometer)
    as the ending geometry / segment end.
    Each segment goes from the current postmile to the next one.
    """
    # Grab the subsequent point geometry.
    # We can drop rows where the last point is missing within
    # a group: if we have 3 points, we can draw 2 lines.
    gdf = gdf.assign(end_geometry=(gdf.groupby(group_cols, group_keys=False).geometry.shift(-1))).dropna(
        subset="end_geometry"
    )

    # Construct a linestring from the 2 point coordinates
    gdf = (
        gdf.assign(
            line_geometry=gdf.apply(lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1).set_crs(
                geography_utils.WGS84
            )
        )
        .drop(columns=["geometry", "end_geometry"])
        .rename(columns={"line_geometry": "geometry"})
    )

    return gdf
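
A small input/output sketch (the route/odometer columns are illustrative; points are assumed sorted by odometer within each group):

import geopandas as gpd
import shapely
from shared_utils import geo_utils

postmiles = gpd.GeoDataFrame(
    {"route": ["SR1"] * 3, "odometer": [0.0, 1.0, 2.0]},
    geometry=[shapely.Point(0, 0), shapely.Point(0, 1), shapely.Point(0, 2)],
    crs="EPSG:4326",
)

segments = geo_utils.draw_line_between_points(postmiles, group_cols=["route"])
print(len(segments))  # 2 -- three postmile points yield two segments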
92 changes: 87 additions & 5 deletions _shared_utils/shared_utils/schedule_rt_utils.py
@@ -13,6 +13,14 @@
from siuba import *

PACIFIC_TIMEZONE = "US/Pacific"
RENAME_DISTRICT_DICT = {
    "Marysville / Sacramento": "Marysville",  # D3
    "Bay Area / Oakland": "Oakland",  # D4
    "San Luis Obispo / Santa Barbara": "San Luis Obispo",  # D5
    "Fresno / Bakersfield": "Fresno",  # D6
    "San Bernardino / Riverside": "San Bernardino",  # D8
    "Orange County": "Irvine",  # D12
}


def localize_timestamp_col(df: dd.DataFrame, timestamp_col: Union[str, list]) -> dd.DataFrame:
@@ -84,7 +92,10 @@ def filter_dim_gtfs_datasets(
    custom_filtering: dict = None,
    get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
-    """ """
+    """
+    Filter mart_transit_database.dim_gtfs_dataset table
+    and keep only the valid rows that passed data quality checks.
+    """
    if "key" not in keep_cols:
        raise KeyError("Include key in keep_cols list")

@@ -164,9 +175,73 @@ def get_organization_id(
    return df2


def filter_dim_county_geography(
    date: str,
    keep_cols: list[str] = ["caltrans_district"],
) -> pd.DataFrame:
    """
    Merge mart_transit_database.dim_county_geography with
    mart_transit_database.bridge_organizations_x_headquarters_county_geography.
    Both tables are at organization-county-feed_period grain.
    dim_county_geography holds additional geography columns like
    MSA, FIPS, etc.
    Use this merge to get caltrans_district.
    Organizations belong to counties, and counties are assigned to districts.
    """
    bridge_orgs_county_geog = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> gtfs_utils_v2.subset_cols([_.organization_name, _.county_geography_key, _._valid_from, _._valid_to])
        >> collect()
    )

    keep_cols2 = list(set(keep_cols + ["county_geography_key", "caltrans_district_name"]))

    dim_county_geography = (
        tbls.mart_transit_database.dim_county_geography()
        >> rename(county_geography_key=_.key)
        >> gtfs_utils_v2.subset_cols(keep_cols2)
        >> collect()
    )

    # Several caltrans_district values in mart_transit_database
    # now contain slashes.
    # Use a dict to standardize these against how previous versions were named
    dim_county_geography = dim_county_geography.assign(
        caltrans_district_name=dim_county_geography.apply(
            lambda x: RENAME_DISTRICT_DICT[x.caltrans_district_name]
            if x.caltrans_district_name in RENAME_DISTRICT_DICT.keys()
            else x.caltrans_district_name,
            axis=1,
        )
    )

    bridge_orgs_county_geog = localize_timestamp_col(bridge_orgs_county_geog, ["_valid_from", "_valid_to"])

    bridge_orgs_county_geog2 = bridge_orgs_county_geog >> filter(
        _._valid_from_local <= pd.to_datetime(date), _._valid_to_local >= pd.to_datetime(date)
    )

    # Merge organization-county with caltrans_district info.
    # This appears to be a 1:1 merge: we checked whether an organization
    # can belong to multiple districts, and that doesn't appear to happen.
    df = pd.merge(bridge_orgs_county_geog2, dim_county_geography, on="county_geography_key", how="inner")

    df2 = (
        df.assign(caltrans_district=df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name)[
            ["organization_name"] + keep_cols
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return df2

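The caltrans_district string assembled above zero-pads the district number and appends the standardized district name. A worked illustration with pandas (sample rows, not warehouse output):

import pandas as pd

df = pd.DataFrame({"caltrans_district": [3, 12], "caltrans_district_name": ["Marysville", "Irvine"]})
labels = df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name

print(labels.tolist())  # ['03 - Marysville', '12 - Irvine']
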

def filter_dim_organizations(
    date: str,
-    keep_cols: list[str] = ["source_record_id", "caltrans_district"],
+    keep_cols: list[str] = ["source_record_id"],
    custom_filtering: dict = None,
    get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
@@ -201,7 +276,8 @@ def sample_gtfs_dataset_key_to_organization_crosswalk(
        "base64_url",
        "uri",
    ],
-    dim_organization_cols: list[str] = ["source_record_id", "name", "caltrans_district"],
+    dim_organization_cols: list[str] = ["source_record_id", "name"],
+    dim_county_geography_cols: list[str] = ["caltrans_district"],
) -> pd.DataFrame:
    """
    Get crosswalk from gtfs_dataset_key to certain quartet data identifiers
@@ -243,11 +319,17 @@

    feeds_with_org_id = get_organization_id(feeds_with_quartet_info, date, merge_cols=merge_cols)

-    # (4) Merge in dim_orgs to get caltrans_district
+    # (4) Merge in dim_orgs to get organization info - everything except caltrans_district is found here
    ORG_RENAME_DICT = {"source_record_id": "organization_source_record_id", "name": "organization_name"}
    orgs = filter_dim_organizations(date, keep_cols=dim_organization_cols, get_df=True).rename(columns=ORG_RENAME_DICT)

-    feeds_with_district = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")
+    feeds_with_org_info = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")

    # (5) Merge in dim_county_geography to get caltrans_district
    # https://github.com/cal-itp/data-analyses/issues/1282
    district = filter_dim_county_geography(date, dim_county_geography_cols)

    feeds_with_district = pd.merge(feeds_with_org_info, district, on="organization_name")

    return feeds_with_district
