Skip to content

Commit

Permalink
Feature/issue 258 - Granules with large numbers of features cannot be…
Browse files Browse the repository at this point in the history
… loaded (#259)

* change assemble attrs function to avoid for loop

* change how attributes are concatenated during shp unpack to avoid slow looping

* remove unused import

* Update API test data with less precise data coordinates

* remove logging every item in batch writer

* lint

---------

Co-authored-by: Nikki <[email protected]>
  • Loading branch information
torimcd and nikki-t authored Oct 31, 2024
1 parent 57c7d6f commit c15774e
Show file tree
Hide file tree
Showing 8 changed files with 1,141 additions and 1,144 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Deprecated
### Removed
### Fixed
- Issue 258 - Granules with very large feature counts cannot be added to hydrocron
- Issue 235 - Track ingest table can be populated with granules that aren't loaded into Hydrocron
- Issue 248 - Track ingest operations need to query UAT for granule files if track ingest is running in SIT or UAT
### Security
Expand Down
21 changes: 9 additions & 12 deletions hydrocron/db/io/swot_shp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Unpacks SWOT Shapefiles
"""
import os.path
import json
import tempfile
from datetime import datetime, timezone
from importlib import resources
Expand Down Expand Up @@ -109,7 +108,7 @@ def handle_null_geometries(geodf):
geodf_no_nulls : geopandas.GeoDataFrame
the geodataframe with null geometries handled
"""

logging.info('Starting handle null geometries')
geodf['geometry'].fillna(
value=Polygon(constants.SWOT_PRIOR_LAKE_FILL_GEOMETRY_COORDS),
inplace=True)
Expand All @@ -131,6 +130,8 @@ def convert_polygon_to_centerpoint(geodf_polygon):
geodf_centerpoint : geopandas.GeoDataFrame
the geodataframe with point feature types and calculated centerpoint geometries
"""
logging.info('Starting convert polygon to centerpoint')

geodf_centerpoint = geodf_polygon
geodf_centerpoint['geometry'] = geodf_polygon['geometry'].centroid

Expand All @@ -152,6 +153,7 @@ def parse_metadata_from_shpxml(xml_elem):
metadata_attrs : dict
a dictionary of metadata attributes to add to record
"""
logging.info('Starting parse metadata from shpfile')
# get SWORD version
for globs in xml_elem.findall('global_attributes'):
prior_db_files = globs.find('xref_prior_river_db_files').text
Expand Down Expand Up @@ -191,18 +193,13 @@ def assemble_attributes(geodf, attributes):
attributes : dict
A dictionary of attributes to concatenate
"""
logging.info('Starting assemble attributes')

items = []
# rework to use dataframe instead of file as string
for _index, row in geodf.iterrows():

shp_attrs = json.loads(
row.to_json(default_handler=str))

item_attrs = shp_attrs | attributes

item_attrs = {key: str(item_attrs[key]) for key in item_attrs.keys()}
items.append(item_attrs)
geodf = geodf.astype(str)
geodf = geodf.assign(**attributes)
items = geodf.to_dict('records')

return items

Expand All @@ -222,7 +219,7 @@ def parse_from_filename(filename):
filename_attrs : dict
A dictionary of attributes from the filename
"""

logging.info('Starting parse attributes from filename')
filename_components = filename.split("_")

collection = ""
Expand Down
4 changes: 4 additions & 0 deletions hydrocron/db/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,12 @@ def load_data(dynamo_resource, table_name, items):
logging.info("Item %s: %s", feature_id, items[i][feature_id])
hydrocron_table.batch_fill_table(items)

logging.info("Finished loading %s items", len(items))

else:
logging.info("Adding %s items to table individually", feature_name)
for item_attrs in items:
logging.info("Item %s: %s", feature_id, item_attrs[feature_id])
hydrocron_table.add_data(**item_attrs)

logging.info("Finished loading %s items", len(items))
5 changes: 0 additions & 5 deletions hydrocron/db/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,6 @@ def batch_fill_table(self, items):
try:
with table.batch_writer() as writer:
for item in items:
logger.info(
"Item %s size: %s",
item[self.partition_key_name],
str(sys.getsizeof(item))
)
if sys.getsizeof(item) < 300000:
writer.put_item(Item=item)
else:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/api_query_results_csv.csv

Large diffs are not rendered by default.

Loading

0 comments on commit c15774e

Please sign in to comment.