Merge pull request #1224 from cal-itp/pems-daytype
Match PEMS stations with SHN postmiles
Showing 12 changed files with 1,803 additions and 17 deletions.
@@ -0,0 +1,156 @@
""" | ||
Query beyond the 2,000 rows ESRI gives. | ||
https://gis.stackexchange.com/questions/266897/how-to-get-around-the-1000-objectids-limit-on-arcgis-server | ||
""" | ||
import urllib.parse | ||
|
||
import geopandas as gpd | ||
import numpy as np | ||
import pandas as pd | ||
import requests | ||
|
||
|
||
def query_arcgis_feature_server(url_feature_server=""): | ||
""" | ||
This function downloads all of the features available on a given ArcGIS | ||
feature server. The function is written to bypass the limitations imposed | ||
by the online service, such as only returning up to 1,000 or 2,000 featues | ||
at a time. | ||
Parameters | ||
---------- | ||
url_feature_server : string | ||
Sting containing the URL of the service API you want to query. It should | ||
end in a forward slash and look something like this: | ||
'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/' | ||
Returns | ||
------- | ||
geodata_final : gpd.GeoDataFrame | ||
This is a GeoDataFrame that contains all of the features from the | ||
Feature Server. After calling this function, the `geodata_final` object | ||
can be used to store the data on disk in several different formats | ||
including, but not limited to, Shapefile (.shp), GeoJSON (.geojson), | ||
GeoPackage (.gpkg), or PostGIS. | ||
See https://geopandas.org/en/stable/docs/user_guide/io.html#writing-spatial-data | ||
for more details. | ||
""" | ||
if url_feature_server == "": | ||
geodata_final = gpd.GeoDataFrame() | ||
return geodata_final | ||
|
||
# Fixing last character in case the URL provided didn't end in a | ||
# forward slash | ||
if url_feature_server[-1] != "/": | ||
url_feature_server = url_feature_server + "/" | ||
|
||
# Getting the layer definitions. This contains important info such as the | ||
# name of the column used as feature_ids/object_ids, among other things. | ||
layer_def = requests.get(url_feature_server + "?f=pjson").json() | ||
|
||
# The `objectIdField` is the column name used for the | ||
# feature_ids/object_ids | ||
fid_colname = layer_def["objectIdField"] | ||
|
||
# The `maxRecordCount` tells us the maximum number of records this REST | ||
# API service can return at once. The code below is written such that we | ||
# perform multiple calls to the API, each one being short enough never to | ||
# go beyond this limit. | ||
record_count_max = layer_def["maxRecordCount"] | ||
|
||
# Part of the URL that specifically requests only the object IDs | ||
url_query_get_ids = f"query?f=geojson&returnIdsOnly=true" f"&where={fid_colname}+is+not+null" | ||
|
||
url_comb = url_feature_server + url_query_get_ids | ||
|
||
# Getting all the object IDs | ||
service_request = requests.get(url_comb) | ||
all_objectids = np.sort(service_request.json()["properties"]["objectIds"]) | ||
|
||
# This variable will store all the parts of the multiple queries. These | ||
# parts will, at the end, be concatenated into one large GeoDataFrame. | ||
geodata_parts = [] | ||
|
||
# This part of the query is fixed and never actually changes | ||
url_query_fixed = "query?f=geojson&outFields=*&where=" | ||
|
||
# Identifying the largest query size allowed per request. This will dictate | ||
# how many queries will need to be made. We start the search at | ||
# the max record count, but that generates errors sometimes - the query | ||
# might time out because it's too big. If the test query times out, we try | ||
# shrink the query size until the test query goes through without | ||
# generating a time-out error. | ||
block_size = min(record_count_max, len(all_objectids)) | ||
worked = False | ||
while not worked: | ||
# Moving the "cursors" to their appropriate locations | ||
id_start = all_objectids[0] | ||
id_end = all_objectids[block_size - 1] | ||
|
||
readable_query_string = f"{fid_colname}>={id_start} " f"and {fid_colname}<={id_end}" | ||
|
||
url_query_variable = urllib.parse.quote(readable_query_string) | ||
|
||
url_comb = url_feature_server + url_query_fixed + url_query_variable | ||
|
||
url_get = requests.get(url_comb) | ||
|
||
if "error" in url_get.json(): | ||
block_size = int(block_size / 2) + 1 | ||
else: | ||
geodata_part = gpd.read_file(url_get.text) | ||
|
||
geodata_parts.append(geodata_part.copy()) | ||
worked = True | ||
|
||
# Performing the actual query to the API multiple times. This skips the | ||
# first few rows/features in the data because those rows were already | ||
# captured in the query performed in the code chunk above. | ||
for i in range(block_size, len(all_objectids), block_size): | ||
# Moving the "cursors" to their appropriate locations and finding the | ||
# limits of each block | ||
sub_list = all_objectids[i : i + block_size] | ||
id_start = sub_list[0] | ||
id_end = sub_list[-1] | ||
|
||
readable_query_string = f"{fid_colname}>={id_start} " f"and {fid_colname}<={id_end}" | ||
|
||
# Encoding from readable text to URL | ||
url_query_variable = urllib.parse.quote(readable_query_string) | ||
|
||
# Constructing the full request URL | ||
url_comb = url_feature_server + url_query_fixed + url_query_variable | ||
|
||
# Actually performing the query and storing its results in a | ||
# GeoDataFrame | ||
geodata_part = gpd.read_file(url_comb, driver="GeoJSON") | ||
|
||
# Appending the result to `geodata_parts` | ||
if geodata_part.shape[0] > 0: | ||
geodata_parts.append(geodata_part) | ||
|
||
# Concatenating all of the query parts into one large GeoDataFrame | ||
geodata_final = pd.concat(geodata_parts, ignore_index=True).sort_values(by=fid_colname).reset_index(drop=True) | ||
|
||
# Checking if any object ID is missing | ||
ids_queried = set(geodata_final[fid_colname]) | ||
for i, this_id in enumerate(all_objectids): | ||
if this_id not in ids_queried: | ||
print("WARNING! The following ObjectID is missing from the final " f"GeoDataFrame: ObjectID={this_id}") | ||
pass | ||
|
||
# Checking if any object ID is included twice | ||
geodata_temp = geodata_final[[fid_colname]].copy() | ||
geodata_temp["temp"] = 1 | ||
geodata_temp = geodata_temp.groupby(fid_colname).agg({"temp": "sum"}).reset_index() | ||
geodata_temp = geodata_temp.loc[geodata_temp["temp"] > 1].copy() | ||
for i, this_id in enumerate(geodata_temp[fid_colname].values): | ||
n_times = geodata_temp["temp"].values[i] | ||
print( | ||
"WARNING! The following ObjectID is included multiple times in" | ||
f"the final GeoDataFrame: ObjectID={this_id}\tOccurrences={n_times}" | ||
) | ||
|
||
return geodata_final |
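
For reference, a minimal usage sketch of the helper above. The feature-server URL is the illustrative one from the docstring, and the output filename is hypothetical; any of the write formats mentioned in the docstring (Shapefile, GeoJSON, GeoPackage, PostGIS) would work similarly via geopandas.

    # Hypothetical usage example: download every feature from the example
    # USA_Counties layer and write it to a GeoPackage on disk.
    counties = query_arcgis_feature_server(
        "https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/"
    )
    print(f"Downloaded {len(counties)} features")
    counties.to_file("usa_counties.gpkg", driver="GPKG")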