Skip to content

Commit

Permalink
Feature/issue 258 - Granules with large numbers of features cannot be…
Browse files Browse the repository at this point in the history
… loaded (#259)

* change assemble attrs function to avoid for loop

* change how attributes are concatenated during shp unpack to avoid slow looping

* remove unused import

* Update API test data with less precise data coordinates

* remove logging every item in batch writer

* lint

---------

Co-authored-by: Nikki <[email protected]>
  • Loading branch information
torimcd and nikki-t authored Oct 31, 2024
1 parent 57c7d6f commit c15774e
Show file tree
Hide file tree
Showing 8 changed files with 1,141 additions and 1,144 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated
### Removed
### Fixed
- Issue 258 - Granules with very large feature counts cannot be added to hydrocron
- Issue 235 - Track ingest table can be populated with granules that aren't loaded into Hydrocron
- Issue 248 - Track ingest operations need to query UAT for granule files if track ingest is running in SIT or UAT
### Security
Expand Down
21 changes: 9 additions & 12 deletions hydrocron/db/io/swot_shp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Unpacks SWOT Shapefiles
"""
import os.path
import json
import tempfile
from datetime import datetime, timezone
from importlib import resources
Expand Down Expand Up @@ -109,7 +108,7 @@ def handle_null_geometries(geodf):
geodf_no_nulls : geopandas.GeoDataFrame
the geodataframe with null geometries handled
"""

logging.info('Starting handle null geometries')
geodf['geometry'].fillna(
value=Polygon(constants.SWOT_PRIOR_LAKE_FILL_GEOMETRY_COORDS),
inplace=True)
Expand All @@ -131,6 +130,8 @@ def convert_polygon_to_centerpoint(geodf_polygon):
geodf_centerpoint : geopandas.GeoDataFrame
the geodataframe with point feature types and calculated centerpoint geometries
"""
logging.info('Starting convert polygon to centerpoint')

geodf_centerpoint = geodf_polygon
geodf_centerpoint['geometry'] = geodf_polygon['geometry'].centroid

Expand All @@ -152,6 +153,7 @@ def parse_metadata_from_shpxml(xml_elem):
metadata_attrs : dict
a dictionary of metadata attributes to add to record
"""
logging.info('Starting parse metadata from shpfile')
# get SWORD version
for globs in xml_elem.findall('global_attributes'):
prior_db_files = globs.find('xref_prior_river_db_files').text
Expand Down Expand Up @@ -191,18 +193,13 @@ def assemble_attributes(geodf, attributes):
attributes : dict
A dictionary of attributes to concatenate
"""
logging.info('Starting assemble attributes')

items = []
# rework to use dataframe instead of file as string
for _index, row in geodf.iterrows():

shp_attrs = json.loads(
row.to_json(default_handler=str))

item_attrs = shp_attrs | attributes

item_attrs = {key: str(item_attrs[key]) for key in item_attrs.keys()}
items.append(item_attrs)
geodf = geodf.astype(str)
geodf = geodf.assign(**attributes)
items = geodf.to_dict('records')

return items

Expand All @@ -222,7 +219,7 @@ def parse_from_filename(filename):
filename_attrs : dict
A dictionary of attributes from the filename
"""

logging.info('Starting parse attributes from filename')
filename_components = filename.split("_")

collection = ""
Expand Down
4 changes: 4 additions & 0 deletions hydrocron/db/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,12 @@ def load_data(dynamo_resource, table_name, items):
logging.info("Item %s: %s", feature_id, items[i][feature_id])
hydrocron_table.batch_fill_table(items)

logging.info("Finished loading %s items", len(items))

else:
logging.info("Adding %s items to table individually", feature_name)
for item_attrs in items:
logging.info("Item %s: %s", feature_id, item_attrs[feature_id])
hydrocron_table.add_data(**item_attrs)

logging.info("Finished loading %s items", len(items))
5 changes: 0 additions & 5 deletions hydrocron/db/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,6 @@ def batch_fill_table(self, items):
try:
with table.batch_writer() as writer:
for item in items:
logger.info(
"Item %s size: %s",
item[self.partition_key_name],
str(sys.getsizeof(item))
)
if sys.getsizeof(item) < 300000:
writer.put_item(Item=item)
else:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/api_query_results_csv.csv

Large diffs are not rendered by default.

Loading

0 comments on commit c15774e

Please sign in to comment.