From fc6cd14f515465720315c4b4068db0a5a9ac7f1b Mon Sep 17 00:00:00 2001
From: tiffanychu90 <tiffany.ku@dot.ca.gov>
Date: Thu, 7 Nov 2024 22:28:31 +0000
Subject: [PATCH] (remove) dim_county_geog notebook

---
 .../shared_utils/add_dim_county_geog.ipynb    | 215 ------------------
 1 file changed, 215 deletions(-)
 delete mode 100644 _shared_utils/shared_utils/add_dim_county_geog.ipynb

diff --git a/_shared_utils/shared_utils/add_dim_county_geog.ipynb b/_shared_utils/shared_utils/add_dim_county_geog.ipynb
deleted file mode 100644
index 7ac3e628d..000000000
--- a/_shared_utils/shared_utils/add_dim_county_geog.ipynb
+++ /dev/null
@@ -1,215 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "4ee66ea4-c26c-460f-81ef-f98b3ab249c7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "from shared_utils import schedule_rt_utils\n",
-    "from segment_speed_utils import helpers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "a0263896-0115-4d80-8b9e-730a7b5a5368",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'\\nfrom calitp_data_analysis.tables import tbls\\nfrom siuba import *\\n\\nbridge_orgs_county_geog = (\\n    tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()\\n    >> collect()\\n)\\n\\nbridge_orgs_county_geog.to_parquet(\"bridge_orgs_county_geog.parquet\")\\n\\ndim_county_geography = (\\n    tbls.mart_transit_database.dim_county_geography()\\n    >> collect()\\n)\\n\\ndim_county_geography.to_parquet(\"dim_county_geography.parquet\")\\n\\ndim_organizations = (\\n    tbls.mart_transit_database.dim_organizations()\\n    >> collect()\\n)\\n\\ndim_organizations.to_parquet(\"dim_organizations.parquet\")\\n\\nbridge_orgs_county_geog = pd.read_parquet(\"bridge_orgs_county_geog.parquet\")\\ndim_county_geography = pd.read_parquet(\"dim_county_geography.parquet\")\\ndim_organizations = pd.read_parquet(\"dim_organizations.parquet\")\\n'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "'''\n",
-    "from calitp_data_analysis.tables import tbls\n",
-    "from siuba import *\n",
-    "\n",
-    "bridge_orgs_county_geog = (\n",
-    "    tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()\n",
-    "    >> collect()\n",
-    ")\n",
-    "\n",
-    "bridge_orgs_county_geog.to_parquet(\"bridge_orgs_county_geog.parquet\")\n",
-    "\n",
-    "dim_county_geography = (\n",
-    "    tbls.mart_transit_database.dim_county_geography()\n",
-    "    >> collect()\n",
-    ")\n",
-    "\n",
-    "dim_county_geography.to_parquet(\"dim_county_geography.parquet\")\n",
-    "\n",
-    "dim_organizations = (\n",
-    "    tbls.mart_transit_database.dim_organizations()\n",
-    "    >> collect()\n",
-    ")\n",
-    "\n",
-    "dim_organizations.to_parquet(\"dim_organizations.parquet\")\n",
-    "\n",
-    "bridge_orgs_county_geog = pd.read_parquet(\"bridge_orgs_county_geog.parquet\")\n",
-    "dim_county_geography = pd.read_parquet(\"dim_county_geography.parquet\")\n",
-    "dim_organizations = pd.read_parquet(\"dim_organizations.parquet\")\n",
-    "'''"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "1aa7fa60-b28f-4530-8e8c-6ba29987286f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def create_gtfs_dataset_key_to_organization_crosswalk(\n",
-    "    analysis_date: str\n",
-    ") -> pd.DataFrame:\n",
-    "    \"\"\"\n",
-    "    For every operator that appears in schedule data, \n",
-    "    create a crosswalk that links to organization_source_record_id.\n",
-    "    For all our downstream outputs, at various aggregations,\n",
-    "    we need to attach these over and over again.\n",
-    "    \"\"\"\n",
-    "    df = helpers.import_scheduled_trips(\n",
-    "        analysis_date,\n",
-    "        columns = [\"gtfs_dataset_key\", \"name\"],\n",
-    "        get_pandas = True\n",
-    "    ).rename(columns = {\"schedule_gtfs_dataset_key\": \"gtfs_dataset_key\"})\n",
-    "    # rename columns because we must use simply gtfs_dataset_key in schedule_rt_utils function\n",
-    "    \n",
-    "    # Get base64_url, organization_source_record_id and organization_name\n",
-    "    crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(\n",
-    "        df,\n",
-    "        analysis_date,\n",
-    "        quartet_data = \"schedule\",\n",
-    "        dim_gtfs_dataset_cols = [\"key\", \"source_record_id\", \"base64_url\"],\n",
-    "        dim_organization_cols = [\"source_record_id\", \"name\", \n",
-    "                                 \"itp_id\",\n",
-    "                                  \"ntd_id_2022\"],\n",
-    "        dim_county_geography_cols = [\"caltrans_district\"]\n",
-    "    )\n",
-    "\n",
-    "    df_with_org = pd.merge(\n",
-    "        df.rename(columns = {\"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\"}),\n",
-    "        crosswalk,\n",
-    "        on = \"schedule_gtfs_dataset_key\",\n",
-    "        how = \"inner\"\n",
-    "    )\n",
-    "    \n",
-    "    return df_with_org"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b1801fb-7900-465f-8f00-98b57a68ef23",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "date = \"2024-10-16\"\n",
-    "\n",
-    "new_crosswalk = create_gtfs_dataset_key_to_organization_crosswalk(date)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ba5d0edd-ee38-45ca-b3a2-7df251084127",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "GCS = \"gs://calitp-analytics-data/data-analyses/\"\n",
-    "SCHED_GCS = f\"{GCS}gtfs_schedule/\"\n",
-    "CROSSWALK_FILE = \"crosswalk/gtfs_key_organization\"\n",
-    "\n",
-    "crosswalk = pd.read_parquet(f\"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7251b855-d7db-45e8-8d56-f9cf2d4b48d1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "new_crosswalk.shape, crosswalk.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "18e85072-1833-4a39-9293-9d46a8f3e5d5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cols_in_common = [c for c in crosswalk.columns if c in new_crosswalk.columns]\n",
-    "new_crosswalk2 = new_crosswalk[cols_in_common].rename(columns = {\"caltrans_district\": \"caltrans_district2\"})\n",
-    "crosswalk2 = crosswalk[cols_in_common]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7343c8d9-55fc-4d5f-aad7-f7d4f7b8a02d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "merge_cols = [c for c in cols_in_common if c != \"caltrans_district\"]\n",
-    "df = pd.merge(\n",
-    "    crosswalk2,\n",
-    "    new_crosswalk2,\n",
-    "    on = merge_cols,\n",
-    "    how = \"outer\",\n",
-    "    indicator=True\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d6f01eaa-014b-4770-9119-482c11a277aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[df.caltrans_district != df.caltrans_district2]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "75e4681e-3822-4665-a9ba-9783a1b00bfa",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}