From fc6cd14f515465720315c4b4068db0a5a9ac7f1b Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 7 Nov 2024 22:28:31 +0000 Subject: [PATCH] (remove) dim_county_geog notebook --- .../shared_utils/add_dim_county_geog.ipynb | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 _shared_utils/shared_utils/add_dim_county_geog.ipynb diff --git a/_shared_utils/shared_utils/add_dim_county_geog.ipynb b/_shared_utils/shared_utils/add_dim_county_geog.ipynb deleted file mode 100644 index 7ac3e628d..000000000 --- a/_shared_utils/shared_utils/add_dim_county_geog.ipynb +++ /dev/null @@ -1,215 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "4ee66ea4-c26c-460f-81ef-f98b3ab249c7", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "from shared_utils import schedule_rt_utils\n", - "from segment_speed_utils import helpers" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a0263896-0115-4d80-8b9e-730a7b5a5368", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nfrom calitp_data_analysis.tables import tbls\\nfrom siuba import *\\n\\nbridge_orgs_county_geog = (\\n tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()\\n >> collect()\\n)\\n\\nbridge_orgs_county_geog.to_parquet(\"bridge_orgs_county_geog.parquet\")\\n\\ndim_county_geography = (\\n tbls.mart_transit_database.dim_county_geography()\\n >> collect()\\n)\\n\\ndim_county_geography.to_parquet(\"dim_county_geography.parquet\")\\n\\ndim_organizations = (\\n tbls.mart_transit_database.dim_organizations()\\n >> collect()\\n)\\n\\ndim_organizations.to_parquet(\"dim_organizations.parquet\")\\n\\nbridge_orgs_county_geog = pd.read_parquet(\"bridge_orgs_county_geog.parquet\")\\ndim_county_geography = pd.read_parquet(\"dim_county_geography.parquet\")\\ndim_organizations = pd.read_parquet(\"dim_organizations.parquet\")\\n'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''\n", - "from calitp_data_analysis.tables import tbls\n", - "from siuba import *\n", - "\n", - "bridge_orgs_county_geog = (\n", - " tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()\n", - " >> collect()\n", - ")\n", - "\n", - "bridge_orgs_county_geog.to_parquet(\"bridge_orgs_county_geog.parquet\")\n", - "\n", - "dim_county_geography = (\n", - " tbls.mart_transit_database.dim_county_geography()\n", - " >> collect()\n", - ")\n", - "\n", - "dim_county_geography.to_parquet(\"dim_county_geography.parquet\")\n", - "\n", - "dim_organizations = (\n", - " tbls.mart_transit_database.dim_organizations()\n", - " >> collect()\n", - ")\n", - "\n", - "dim_organizations.to_parquet(\"dim_organizations.parquet\")\n", - "\n", - "bridge_orgs_county_geog = pd.read_parquet(\"bridge_orgs_county_geog.parquet\")\n", - "dim_county_geography = pd.read_parquet(\"dim_county_geography.parquet\")\n", - "dim_organizations = pd.read_parquet(\"dim_organizations.parquet\")\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1aa7fa60-b28f-4530-8e8c-6ba29987286f", - "metadata": {}, - "outputs": [], - "source": [ - "def create_gtfs_dataset_key_to_organization_crosswalk(\n", - " analysis_date: str\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " For every operator that appears in schedule data, \n", - " create a crosswalk that links to organization_source_record_id.\n", - " For all our downstream outputs, at various aggregations,\n", - " we need to attach these over and over again.\n", - " \"\"\"\n", - " df = helpers.import_scheduled_trips(\n", - " analysis_date,\n", - " columns = [\"gtfs_dataset_key\", \"name\"],\n", - " get_pandas = True\n", - " ).rename(columns = {\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})\n", - " # rename columns because we must use simply gtfs_dataset_key in schedule_rt_utils function\n", - " \n", - " # Get base64_url, organization_source_record_id and organization_name\n", - " crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(\n", - " df,\n", - " analysis_date,\n", - " quartet_data = \"schedule\",\n", - " dim_gtfs_dataset_cols = [\"key\", \"source_record_id\", \"base64_url\"],\n", - " dim_organization_cols = [\"source_record_id\", \"name\", \n", - " \"itp_id\",\n", - " \"ntd_id_2022\"],\n", - " dim_county_geography_cols = [\"caltrans_district\"]\n", - " )\n", - "\n", - " df_with_org = pd.merge(\n", - " df.rename(columns = {\"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\"}),\n", - " crosswalk,\n", - " on = \"schedule_gtfs_dataset_key\",\n", - " how = \"inner\"\n", - " )\n", - " \n", - " return df_with_org" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b1801fb-7900-465f-8f00-98b57a68ef23", - "metadata": {}, - "outputs": [], - "source": [ - "date = \"2024-10-16\"\n", - "\n", - "new_crosswalk = create_gtfs_dataset_key_to_organization_crosswalk(date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba5d0edd-ee38-45ca-b3a2-7df251084127", - "metadata": {}, - "outputs": [], - "source": [ - "GCS = \"gs://calitp-analytics-data/data-analyses/\"\n", - "SCHED_GCS = f\"{GCS}gtfs_schedule/\"\n", - "CROSSWALK_FILE = \"crosswalk/gtfs_key_organization\"\n", - "\n", - "crosswalk = pd.read_parquet(f\"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7251b855-d7db-45e8-8d56-f9cf2d4b48d1", - "metadata": {}, - "outputs": [], - "source": [ - "new_crosswalk.shape, crosswalk.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18e85072-1833-4a39-9293-9d46a8f3e5d5", - "metadata": {}, - "outputs": [], - "source": [ - "cols_in_common = [c for c in crosswalk.columns if c in new_crosswalk.columns]\n", - "new_crosswalk2 = new_crosswalk[cols_in_common].rename(columns = {\"caltrans_district\": \"caltrans_district2\"})\n", - "crosswalk2 = crosswalk[cols_in_common]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7343c8d9-55fc-4d5f-aad7-f7d4f7b8a02d", - "metadata": {}, - "outputs": [], - "source": [ - "merge_cols = [c for c in cols_in_common if c != \"caltrans_district\"]\n", - "df = pd.merge(\n", - " crosswalk2,\n", - " new_crosswalk2,\n", - " on = merge_cols,\n", - " how = \"outer\",\n", - " indicator=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6f01eaa-014b-4770-9119-482c11a277aa", - "metadata": {}, - "outputs": [], - "source": [ - "df[df.caltrans_district != df.caltrans_district2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e4681e-3822-4665-a9ba-9783a1b00bfa", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}