Merge pull request #1283 from cal-itp/fix-shn-segments
update shared_utils and segment_speed_utils / fix SHN segments
tiffanychu90 authored Nov 7, 2024
2 parents 16c44dd + fc6cd14 commit 6adecac
Showing 27 changed files with 467 additions and 386 deletions.
2 changes: 1 addition & 1 deletion _shared_utils/setup.py
@@ -4,7 +4,7 @@
setup(
    name="shared_utils",
    packages=find_packages(),
-    version="2.8",
+    version="3.0",
    description="Shared utility functions for data analyses",
    author="Cal-ITP",
    license="Apache",
4 changes: 4 additions & 0 deletions _shared_utils/shared_utils/__init__.py
@@ -2,22 +2,26 @@
    arcgis_query,
    catalog_utils,
    dask_utils,
    geo_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    schedule_rt_utils,
    time_helpers,
)

__all__ = [
    "arcgis_query",
    "catalog_utils",
    "dask_utils",
    "geo_utils",
    "gtfs_utils_v2",
    "portfolio_utils",
    "publish_utils",
    "rt_dates",
    "rt_utils",
    "schedule_rt_utils",
    "time_helpers",
]
177 changes: 177 additions & 0 deletions _shared_utils/shared_utils/geo_utils.py
@@ -0,0 +1,177 @@
"""
Geospatial utility functions
"""
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely
from calitp_data_analysis import geography_utils
from scipy.spatial import KDTree
from shared_utils import rt_utils

# Could we use distance to filter for nearest neighbor?
# It can make the length of results more unpredictable...maybe we stick to
# k_neighbors and keep the nearest k, so that we can at least be
# more consistent with the arrays returned
# Approximate ground distance per decimal degree of arc,
# derived from Earth's radius of ~6,371 km (~3,959 mi)
geo_const_meters = 6_371_000 * np.pi / 180
geo_const_miles = 3_959 * np.pi / 180

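These constants convert an angular distance in decimal degrees (what a KDTree query returns for lat/lon coordinates) into an approximate ground distance. A quick worked illustration, not part of the commit:

import numpy as np

geo_const_meters = 6_371_000 * np.pi / 180

# ~0.01 degrees of arc is roughly 1.1 km on the ground
print(0.01 * geo_const_meters)  # ~1111.9 meters
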

def nearest_snap(line: shapely.LineString, point: shapely.Point, k_neighbors: int = 1) -> np.ndarray:
    """
    Based off of this function, but we want to return
    the index value rather than the point:
    https://github.com/UTEL-UIUC/gtfs_segments/blob/main/gtfs_segments/geom_utils.py
    """
    line = np.asarray(line.coords)
    point = np.asarray(point.coords)
    tree = KDTree(line)

    # np_dist is the array of distances of the result (don't return it);
    # np_inds is the array of indices of the result
    _, np_inds = tree.query(
        point,
        workers=-1,
        k=k_neighbors,
    )

    return np_inds.squeeze()

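A minimal usage sketch (not part of the commit; assumes shared_utils is importable): snapping a point to a simple line returns the index of the nearest line vertex.

import shapely
from shared_utils import geo_utils

line = shapely.LineString([(0, 0), (1, 0), (2, 0), (3, 0)])
point = shapely.Point(1.9, 0.4)

print(geo_utils.nearest_snap(line, point, k_neighbors=1))  # 2, since vertex (2, 0) is closest
print(geo_utils.nearest_snap(line, point, k_neighbors=2))  # the 2 nearest vertex indices: [2 1]
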

def vp_as_gdf(vp: pd.DataFrame, crs: str = "EPSG:3310") -> gpd.GeoDataFrame:
    """
    Turn vp into a gdf and project it to EPSG:3310 (or the CRS given).
    """
    vp_gdf = (
        geography_utils.create_point_geometry(vp, longitude_col="x", latitude_col="y", crs=geography_utils.WGS84)
        .to_crs(crs)
        .drop(columns=["x", "y"])
    )

    return vp_gdf

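A hedged usage sketch with made-up coordinates; the x/y column names are what the function expects, and trip_instance_key is just an illustrative identifier column:

import pandas as pd
from shared_utils import geo_utils

vp = pd.DataFrame({"trip_instance_key": ["a", "a"], "x": [-118.24, -118.25], "y": [34.05, 34.06]})
vp_gdf = geo_utils.vp_as_gdf(vp)

print(vp_gdf.crs)             # EPSG:3310
print(vp_gdf.columns.tolist())  # x/y are dropped in favor of a point geometry column
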

def add_arrowized_geometry(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add a column where the segment geometry is arrowized.
    """
    segment_geom = gpd.GeoSeries(gdf.geometry)
    CRS = gdf.crs.to_epsg()

    # TODO: parallel_offset is deprecated in shapely 2.0; offset_curve is its replacement
    geom_parallel = gpd.GeoSeries([rt_utils.try_parallel(i) for i in segment_geom], crs=CRS)
    # geom_parallel = gpd.GeoSeries(
    #     [i.offset_curve(30) for i in segment_geom],
    #     crs=CRS
    # )

    geom_arrowized = rt_utils.arrowize_segment(geom_parallel, buffer_distance=20)

    gdf = gdf.assign(geometry_arrowized=geom_arrowized)

    return gdf

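A sketch of the expected call pattern (not part of the commit; assumes segment geometries in a projected CRS, and takes the behavior of rt_utils.try_parallel / rt_utils.arrowize_segment on faith from the imports above):

import geopandas as gpd
import shapely
from shared_utils import geo_utils

segments = gpd.GeoDataFrame(geometry=[shapely.LineString([(0, 0), (100, 0)])], crs="EPSG:3310")
arrowized = geo_utils.add_arrowized_geometry(segments)

print(arrowized.columns.tolist())  # ['geometry', 'geometry_arrowized']
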

def get_direction_vector(start: shapely.geometry.Point, end: shapely.geometry.Point) -> tuple:
    """
    Given 2 points (in a projected CRS...not WGS84), return a
    tuple that shows (delta_x, delta_y).
    https://www.varsitytutors.com/precalculus-help/find-a-direction-vector-when-given-two-points
    https://stackoverflow.com/questions/17332759/finding-vectors-with-2-points
    """
    return ((end.x - start.x), (end.y - start.y))


def distill_array_into_direction_vector(array: np.ndarray) -> tuple:
    """
    Given an array of n items, take its start and end.
    From the start/end, we can turn 2 coordinate points into 1
    direction vector, a tuple of (delta_x, delta_y).
    """
    origin = array[0]
    destination = array[-1]
    return get_direction_vector(origin, destination)


def get_vector_norm(vector: tuple) -> float:
    """
    Get the length of the vector via the Pythagorean Theorem:
    sum the squares of the components, then take the square root.
    Dividing a vector by this norm yields the unit (normalized) vector.
    """
    return np.sqrt(vector[0] ** 2 + vector[1] ** 2)


def get_normalized_vector(vector: tuple) -> tuple:
    """
    Normalize the vector: divide each component by the vector's norm.
    https://stackoverflow.com/questions/21030391/how-to-normalize-a-numpy-array-to-a-unit-vector
    """
    norm = get_vector_norm(vector)

    return (vector[0] / norm, vector[1] / norm)


def dot_product(vec1: tuple, vec2: tuple) -> float:
    """
    Take the dot product: multiply the x components,
    multiply the y components, and sum them up.
    """
    return vec1[0] * vec2[0] + vec1[1] * vec2[1]

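Chained together, these helpers compare the headings of two coordinate arrays: the dot product of two normalized direction vectors is the cosine of the angle between them (1 = same direction, 0 = perpendicular, -1 = opposite). A sketch, not part of the commit:

import shapely
from shared_utils import geo_utils

seg1 = [shapely.Point(0, 0), shapely.Point(5, 0), shapely.Point(10, 0)]  # heads east
seg2 = [shapely.Point(0, 0), shapely.Point(0, 3), shapely.Point(0, 6)]   # heads north

vec1 = geo_utils.get_normalized_vector(geo_utils.distill_array_into_direction_vector(seg1))
vec2 = geo_utils.get_normalized_vector(geo_utils.distill_array_into_direction_vector(seg2))

print(geo_utils.dot_product(vec1, vec2))  # 0.0 -- perpendicular headings
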

def segmentize_by_indices(line_geometry: shapely.LineString, start_idx: int, end_idx: int) -> shapely.LineString:
    """
    Cut a line according to index values.
    Similar in spirit to shapely.ops.substring, which cuts a line
    at specified distances; here we don't have specified distances,
    but we want to customize where to segment the line.
    """
    all_coords = shapely.get_coordinates(line_geometry)

    # all_coords is an (N, 2) array, so check against the number of
    # rows; include the end_idx coordinate whenever it exists
    if end_idx + 1 > all_coords.shape[0]:
        subset_coords = all_coords[start_idx:end_idx]
    else:
        subset_coords = all_coords[start_idx : end_idx + 1]

    if len(subset_coords) < 2:
        return shapely.LineString()
    else:
        return shapely.LineString([shapely.Point(i) for i in subset_coords])

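Paired with nearest_snap, this can cut a shape between two snapped vehicle positions. A sketch with made-up coordinates, not part of the commit:

import shapely
from shared_utils import geo_utils

shape = shapely.LineString([(0, 0), (1, 0), (2, 0), (3, 0), (4, 0)])

start_idx = int(geo_utils.nearest_snap(shape, shapely.Point(0.9, 0.1)))  # 1
end_idx = int(geo_utils.nearest_snap(shape, shapely.Point(3.1, -0.1)))   # 3

print(geo_utils.segmentize_by_indices(shape, start_idx, end_idx))  # LINESTRING (1 0, 2 0, 3 0)
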

def draw_line_between_points(gdf: gpd.GeoDataFrame, group_cols: list) -> gpd.GeoDataFrame:
    """
    Use the current postmile as the starting geometry / segment
    beginning and the subsequent postmile (based on odometer)
    as the ending geometry / segment end.
    Each segment goes from the current postmile to the next one.
    """
    # Grab the subsequent point geometry.
    # We can drop rows where the last point is missing within
    # a group: if we have 3 points, we can draw 2 lines.
    gdf = gdf.assign(end_geometry=(gdf.groupby(group_cols, group_keys=False).geometry.shift(-1))).dropna(
        subset="end_geometry"
    )

    # Construct a linestring from the 2 point coordinates
    gdf = (
        gdf.assign(
            line_geometry=gdf.apply(lambda x: shapely.LineString([x.geometry, x.end_geometry]), axis=1).set_crs(
                geography_utils.WGS84
            )
        )
        .drop(columns=["geometry", "end_geometry"])
        .rename(columns={"line_geometry": "geometry"})
    )

    return gdf
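
A small input/output sketch (the route/odometer columns are illustrative; points are assumed sorted by odometer within each group):

import geopandas as gpd
import shapely
from shared_utils import geo_utils

postmiles = gpd.GeoDataFrame(
    {"route": ["SR1"] * 3, "odometer": [0.0, 1.0, 2.0]},
    geometry=[shapely.Point(0, 0), shapely.Point(0, 1), shapely.Point(0, 2)],
    crs="EPSG:4326",
)

segments = geo_utils.draw_line_between_points(postmiles, group_cols=["route"])
print(len(segments))  # 2 -- three postmile points yield two segments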
92 changes: 87 additions & 5 deletions _shared_utils/shared_utils/schedule_rt_utils.py
@@ -13,6 +13,14 @@
from siuba import *

PACIFIC_TIMEZONE = "US/Pacific"
RENAME_DISTRICT_DICT = {
    "Marysville / Sacramento": "Marysville",  # D3
    "Bay Area / Oakland": "Oakland",  # D4
    "San Luis Obispo / Santa Barbara": "San Luis Obispo",  # D5
    "Fresno / Bakersfield": "Fresno",  # D6
    "San Bernardino / Riverside": "San Bernardino",  # D8
    "Orange County": "Irvine",  # D12
}


def localize_timestamp_col(df: dd.DataFrame, timestamp_col: Union[str, list]) -> dd.DataFrame:
@@ -84,7 +92,10 @@ def filter_dim_gtfs_datasets(
    custom_filtering: dict = None,
    get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
-    """ """
+    """
+    Filter mart_transit_database.dim_gtfs_dataset table
+    and keep only the valid rows that passed data quality checks.
+    """
    if "key" not in keep_cols:
        raise KeyError("Include key in keep_cols list")

@@ -164,9 +175,73 @@ def get_organization_id(
    return df2


def filter_dim_county_geography(
    date: str,
    keep_cols: list[str] = ["caltrans_district"],
) -> pd.DataFrame:
    """
    Merge mart_transit_database.dim_county_geography with
    mart_transit_database.bridge_organizations_x_headquarters_county_geography.
    Both tables are at organization-county-feed_period grain.
    dim_county_geography holds additional geography columns like
    MSA, FIPS, etc.
    Use this merge to get caltrans_district.
    Organizations belong to counties, and counties are assigned to districts.
    """
    bridge_orgs_county_geog = (
        tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
        >> gtfs_utils_v2.subset_cols([_.organization_name, _.county_geography_key, _._valid_from, _._valid_to])
        >> collect()
    )

    keep_cols2 = list(set(keep_cols + ["county_geography_key", "caltrans_district_name"]))

    dim_county_geography = (
        tbls.mart_transit_database.dim_county_geography()
        >> rename(county_geography_key=_.key)
        >> gtfs_utils_v2.subset_cols(keep_cols2)
        >> collect()
    )

    # Several caltrans_district values in mart_transit_database
    # now contain slashes.
    # Use a dict to standardize these against how previous versions were named
    dim_county_geography = dim_county_geography.assign(
        caltrans_district_name=dim_county_geography.apply(
            lambda x: RENAME_DISTRICT_DICT[x.caltrans_district_name]
            if x.caltrans_district_name in RENAME_DISTRICT_DICT.keys()
            else x.caltrans_district_name,
            axis=1,
        )
    )

    bridge_orgs_county_geog = localize_timestamp_col(bridge_orgs_county_geog, ["_valid_from", "_valid_to"])

    bridge_orgs_county_geog2 = bridge_orgs_county_geog >> filter(
        _._valid_from_local <= pd.to_datetime(date), _._valid_to_local >= pd.to_datetime(date)
    )

    # Merge organization-county with caltrans_district info.
    # This appears to be a 1:1 merge: we checked whether an organization
    # can belong to multiple districts, and that doesn't appear to happen.
    df = pd.merge(bridge_orgs_county_geog2, dim_county_geography, on="county_geography_key", how="inner")

    df2 = (
        df.assign(caltrans_district=df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name)[
            ["organization_name"] + keep_cols
        ]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return df2

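The caltrans_district string assembled above zero-pads the district number and appends the standardized district name. A worked illustration with pandas (sample rows, not warehouse output):

import pandas as pd

df = pd.DataFrame({"caltrans_district": [3, 12], "caltrans_district_name": ["Marysville", "Irvine"]})
labels = df.caltrans_district.astype(str).str.zfill(2) + " - " + df.caltrans_district_name

print(labels.tolist())  # ['03 - Marysville', '12 - Irvine']
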

def filter_dim_organizations(
    date: str,
-    keep_cols: list[str] = ["source_record_id", "caltrans_district"],
+    keep_cols: list[str] = ["source_record_id"],
    custom_filtering: dict = None,
    get_df: bool = True,
) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
@@ -201,7 +276,8 @@ def sample_gtfs_dataset_key_to_organization_crosswalk(
        "base64_url",
        "uri",
    ],
-    dim_organization_cols: list[str] = ["source_record_id", "name", "caltrans_district"],
+    dim_organization_cols: list[str] = ["source_record_id", "name"],
+    dim_county_geography_cols: list[str] = ["caltrans_district"],
) -> pd.DataFrame:
    """
    Get crosswalk from gtfs_dataset_key to certain quartet data identifiers
@@ -243,11 +319,17 @@

    feeds_with_org_id = get_organization_id(feeds_with_quartet_info, date, merge_cols=merge_cols)

-    # (4) Merge in dim_orgs to get caltrans_district
+    # (4) Merge in dim_orgs to get organization info - everything except caltrans_district is found here
    ORG_RENAME_DICT = {"source_record_id": "organization_source_record_id", "name": "organization_name"}
    orgs = filter_dim_organizations(date, keep_cols=dim_organization_cols, get_df=True).rename(columns=ORG_RENAME_DICT)

-    feeds_with_district = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")
+    feeds_with_org_info = pd.merge(feeds_with_org_id, orgs, on="organization_source_record_id")

    # (5) Merge in dim_county_geography to get caltrans_district
    # https://github.com/cal-itp/data-analyses/issues/1282
    district = filter_dim_county_geography(date, dim_county_geography_cols)

    feeds_with_district = pd.merge(feeds_with_org_info, district, on="organization_name")

    return feeds_with_district
