Add schema hinting to new sheets functionality (C4-1088) #280

Merged Oct 31, 2023 (68 commits)

Commits
4c84f0b
First cut at tools for parsing workbooks.
netsettler Aug 14, 2023
7b73a67
Refactor to separate some functionality into a separate sevice class.
netsettler Aug 14, 2023
3d4573f
Add a csv file for testing.
netsettler Aug 14, 2023
f4e5cfa
Add some negative testing.
netsettler Aug 14, 2023
e9d2465
Update lock file.
netsettler Aug 14, 2023
6e9060f
Document new sheets_utils module.
netsettler Aug 14, 2023
df12c91
Issue a beta for this functionality.
netsettler Aug 15, 2023
6a39c8a
Fix documentation for sheet_utils.
netsettler Aug 15, 2023
eedb5c6
Add some declarations. Small refactors to improve modularity.
netsettler Aug 16, 2023
a6b68fe
Rearrange some methods for presentational reasons.
netsettler Aug 16, 2023
3ff63a9
First cut at useful functionality.
netsettler Aug 17, 2023
39bd2e0
Some name changes to make things more abstract. workbook becomes read…
netsettler Aug 17, 2023
77b72f6
Rename sheetname to tabname throughout, to be more clear that this is…
netsettler Aug 17, 2023
ba8c55c
Add some doc strings. Rename load_table_set to just load. Arrange for…
netsettler Aug 17, 2023
50488cb
Add load_items function. Fix some test names. Update changelog.
netsettler Aug 17, 2023
807e525
Experimental bug fix from Will to hopefully make get_schema_names work.
netsettler Aug 17, 2023
2a8e81a
update changelog
netsettler Aug 17, 2023
718054a
Update dcicutils/sheet_utils.py
netsettler Aug 17, 2023
682c95a
Merge branch 'master' into kmp_sheet_utils
netsettler Aug 17, 2023
582f002
Merge branch 'kmp_sheet_utils' into kmp_sheet_utils_refactor_for_csv
netsettler Aug 17, 2023
56d1459
Add some comments in response to Doug's code review.
netsettler Aug 17, 2023
2facf9e
Support TSV files.
netsettler Aug 17, 2023
bcc4e63
Add changelog info about tsv files.
netsettler Aug 17, 2023
9de282e
Add a missing data file.
netsettler Aug 17, 2023
8d6495f
First stable cut at schema hinting. Doesn't find schemas automaticall…
netsettler Aug 23, 2023
3a103ee
Merge branch 'master' into kmp_sheet_utils
netsettler Aug 23, 2023
56f702a
Mark chardet as an acceptable license for use.
netsettler Aug 24, 2023
08d428e
Merge branch 'kmp_sheet_utils' into kmp_sheet_utils_refactor_for_csv
netsettler Aug 24, 2023
42ad579
Merge branch 'kmp_sheet_utils_refactor_for_csv' into kmp_sheet_utils_…
netsettler Aug 24, 2023
60ada3f
Backport some small fixes and cosmetics from the schemas branch.
netsettler Aug 24, 2023
690a833
Cosmetic fix.
netsettler Aug 24, 2023
946b998
Add some missing newlines in data files.
netsettler Aug 24, 2023
36e7de0
Support for coping with .tsv files where trailing whitespace is 'help…
netsettler Aug 24, 2023
a51fb27
PEP8
netsettler Aug 24, 2023
6f097a6
Document our choice of why is_uuid is defined here as it is.
netsettler Aug 24, 2023
477c7a2
PEP8
netsettler Aug 24, 2023
09b4c43
Fix error handling to be clearer.
netsettler Aug 24, 2023
f3bd815
Fix CHANGELOG to reflect recent renamings.
netsettler Aug 24, 2023
7627f6f
Fix a type hint and some PEP8.
netsettler Aug 24, 2023
98cd37c
Implement a cut at escaping for tsv files.
netsettler Aug 24, 2023
3852e56
Add a test case for all of the pieces of parsing and schema hinting p…
netsettler Aug 25, 2023
660df9c
Small cosmetic changes and some additional support for upcoming work.
netsettler Aug 25, 2023
34d528b
Fix a unit test to conform to new google account name.
netsettler Aug 25, 2023
1c34ad0
Fix typo in comment (dcicutils/misc_utils.py)
netsettler Aug 25, 2023
41fad79
Add some doc strings and comments.
netsettler Aug 28, 2023
6e8ce2c
Rename tabname to tab_name throughout the sheet_utils interfaces.
netsettler Aug 30, 2023
04eb58c
Add support for reading inserts dirs, .json, .jsonl (two formats), an…
netsettler Aug 31, 2023
ce9f9bc
Bump beta version.
netsettler Aug 31, 2023
0ea5b62
Add yaml formats.
netsettler Aug 31, 2023
bcc1128
Add class AbstractItemManager. Rename InsertsItemManager to InsertsDi…
netsettler Sep 1, 2023
7de093a
Rename ._parser() to ._parse_json_data(). Factor type checks out of .…
netsettler Sep 1, 2023
b01e34b
Rename _parse_json_data, _load_json_data, and _check_json_data, respe…
netsettler Sep 1, 2023
0ae48ee
WIP. Testing good.
netsettler Sep 1, 2023
1e2c5a9
WIP. Tests passing.
netsettler Sep 2, 2023
b8a4c39
Rearrange the way escaping= works so both csv an tsv files can using …
netsettler Sep 2, 2023
a2fe079
Separate registration of regular table set managers from registration…
netsettler Sep 2, 2023
91ddce0
Stub in checking of required headers.
netsettler Sep 2, 2023
142a20b
Bump beta version.
netsettler Sep 5, 2023
e09af07
PEP8
netsettler Sep 5, 2023
70762c6
Merge branch 'kmp_schemas_from_vapp' into kmp_sheet_utils_with_vapp
netsettler Sep 7, 2023
7d2ecaa
Fix a bug in newly proposed ff_utils.get_schemas with vapp.
netsettler Sep 7, 2023
5e46273
Extend VirtualApp to amke it easier to test by adding an AbstractVirt…
netsettler Sep 7, 2023
53de60a
Implement portal_vapp= in sheet_utils.
netsettler Sep 7, 2023
630720f
Simplifications per Will's code review.
netsettler Sep 7, 2023
5a07b69
Merge utils 7.10.0 from master.
netsettler Sep 7, 2023
295adfe
Merge pull request #282 from 4dn-dcic/kmp_sheet_utils_with_vapp
netsettler Sep 7, 2023
486adce
Merge branch 'master' into kmp_sheet_utils_schema_hinting
netsettler Sep 9, 2023
54c51aa
Add support for zipped files.
netsettler Sep 12, 2023
28 changes: 28 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,33 @@ Change Log
----------


7.12.0
======

* New module ``sheet_utils`` for loading workbooks.

* Important things of interest:

* Class ``ItemManager`` for loading Item-style data
from any ``.xlsx``, ``.csv`` or ``.tsv`` files.

* Function ``load_items`` that does the same as ``ItemManager.load``.

* Various lower-level implementation classes such as:

* Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* New functionality in ``misc_utils``:

* New function ``is_uuid`` (migrated from Fourfront)
* New function ``pad_to``
* New class ``JsonLinesReader``


7.11.0
======

@@ -16,6 +43,7 @@ Change Log
* Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` returning webtest.response.TestResponse
which has a ``json`` object property rather than a function.


7.10.0
======

96 changes: 95 additions & 1 deletion dcicutils/misc_utils.py
@@ -9,6 +9,7 @@
import inspect
import math
import io
import json
import os
import logging
import pytz
@@ -191,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp):
pass


class VirtualApp:
class AbstractVirtualApp:
pass


class VirtualApp(AbstractVirtualApp):
"""
Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
simulating a number of conditions, including permissions.
@@ -1352,6 +1357,25 @@ def capitalize1(s):
return s[:1].upper() + s[1:]


"""
Python's UUID ignores all dashes, whereas Postgres is more strict
http://www.postgresql.org/docs/9.2/static/datatype-uuid.html
See also http://www.postgresql.org/docs/9.2/static/datatype-uuid.html
And, anyway, this pattern is what our portals have been doing
for quite a while, so it's the most stable choice for us now.
"""

Review thread attached to the docstring above:

dmichaels-harvard (Contributor) commented Aug 24, 2023:

Maybe I'm just misreading, but somehow this comment confused me - I thought you were saying you want more strict checking (i.e. only confirm as uuid if it has the dashes) but you want less strict (dashes optional). And interestingly, I did not realize curly braces are sometimes OK around UUIDs; weird that they're allowed to not match, though, but that looks like what we actually accept in portal URLs anyway.

But I think this is not quite right; as it is, for example, it says that this is a valid UUID: 08AF90EB-C847-43A7-8B3E-2E64EBAC4683-xyzzy

I think you want the regex to begin/end with ^$, i.e.: r'^(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?$'

Oh, interesting, I see from tests, and from trying with portal URLs, that some trailing stuff is tolerated - so maybe this was intentional - hmm - (I'd like to track down our code that actually parses this).

netsettler (Collaborator, Author) replied:

This is used already in schema validation just as it is. I'm going to leave this precisely how it is because it is what the portals are defined to do, and the whole point was to borrow the definition they were using for other uses and not to have multiple definitions. If someone reports a bug in what the portals do and we decide to fix it, my code will tolerate the fix since it's operating on the same data.

netsettler (Collaborator, Author) replied:

I had already updated the comment when I saw this. I agree it looked confused. Thanks for confirming that. :)

uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?')
netsettler marked this conversation as resolved.


def is_uuid(instance):
"""
Predicate returns true for any group of 32 hex characters with optional hyphens every four characters.
We insist on lowercase to make matching faster. See other notes on this design choice above.
"""
return bool(uuid_re.match(instance))
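The anchoring behavior discussed in the review thread above is easy to check directly. This is a minimal self-contained sketch reproducing the same pattern (so it runs without dcicutils installed):

```python
import re

# Same pattern as uuid_re above: optional braces, then eight groups of
# four hex digits, each group optionally followed by a hyphen.
uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?')

def is_uuid(instance):
    # re.match anchors only at the start of the string, so trailing
    # characters after a valid UUID are tolerated, as noted in the review.
    return bool(uuid_re.match(instance))

print(is_uuid('08AF90EB-C847-43A7-8B3E-2E64EBAC4683'))        # True
print(is_uuid('08af90ebc84743a78b3e2e64ebac4683'))            # True (no hyphens)
print(is_uuid('{08AF90EB-C847-43A7-8B3E-2E64EBAC4683}'))      # True (braces)
print(is_uuid('08AF90EB-C847-43A7-8B3E-2E64EBAC4683-xyzzy'))  # True (trailing text tolerated)
print(is_uuid('not-a-uuid'))                                  # False
```

Switching to `re.fullmatch` (or adding `^...$` anchors) would reject the trailing-text case; as the thread explains, the looser behavior was kept deliberately to match what the portals already do.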


def string_list(s):
"""
Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls.
@@ -2313,3 +2337,73 @@ def parse_in_radix(text: str, *, radix: int):
except Exception:
pass
raise ValueError(f"Unable to parse: {text!r}")


def pad_to(target_size: int, data: list, *, padding=None):
"""
Pads a list out to a given target size using the given padding value; lists already at or above the target size are returned unchanged.
e.g., pad_to(3, [1, 2]) will return [1, 2, None]
"""
actual_size = len(data)
if actual_size < target_size:
data = data + [padding] * (target_size - actual_size)
return data
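A quick illustration of the behavior, using a self-contained copy of the function above so it can run without dcicutils:

```python
def pad_to(target_size, data, *, padding=None):
    # Pad data out to target_size with the given padding value.
    # Lists already at or above target_size are returned unchanged
    # (pad_to never truncates).
    actual_size = len(data)
    if actual_size < target_size:
        data = data + [padding] * (target_size - actual_size)
    return data

print(pad_to(3, [1, 2]))             # [1, 2, None]
print(pad_to(3, [1, 2, 3, 4]))       # [1, 2, 3, 4]
print(pad_to(4, ['a'], padding=''))  # ['a', '', '', '']
```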


class JsonLinesReader:

def __init__(self, fp, padded=False, padding=None):
"""
Given an fp (the conventional name for a "file pointer", the thing a call to io.open returns),
this creates an object that can be used to iterate across the lines in the JSON lines file
that the fp is reading from.

There are two possible input formats.

For files that contain a series of dictionaries, such as:
{"something": 1, "else": "a"}
{"something": 2, "else": "b"}
...etc
this will just yield those dictionaries one-by-one when iterated over.

The same set of dictionaries will also be yielded by a file containing:
["something", "else"]
[1, "a"]
[2, "b"]
...etc
where the first line is treated as a list of headers and each subsequent list
is zipped with those headers to form a dictionary.

NOTES:

* In the second case, shorter lists on subsequent lines yield only partial dictionaries,
unless padded=True, in which case they are padded to full length with the given padding value.
* In the second case, longer lists on subsequent lines will quietly drop any extra elements.
"""

self.fp = fp
netsettler marked this conversation as resolved.
self.padded: bool = padded
self.padding = padding
self.headers = None # Might change after we see first line

def __iter__(self):
first_line = True
n_headers = 0
for raw_line in self.fp:
line = json.loads(raw_line)
if first_line:
first_line = False
if isinstance(line, list):
self.headers = line
n_headers = len(line)
continue
# If the line is longer than the headers, zip will quietly drop the extras,
# letting the user put comments beyond the table. If it is shorter and
# padded=True, extend it to full length with the padding value.
if self.headers:
if not isinstance(line, list):
raise Exception("If the first line is a list, all lines must be.")
if self.padded and len(line) < n_headers:
line = pad_to(n_headers, line, padding=self.padding)
yield dict(zip(self.headers, line))
elif isinstance(line, dict):
yield line
else:
raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}")
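The two accepted formats can be exercised with in-memory streams. This is a minimal self-contained sketch of the same iteration logic (a generator function rather than the class, and omitting the mixed-format error checks, so it runs without dcicutils):

```python
import io
import json

def read_json_lines(fp, padded=False, padding=None):
    # Sketch of JsonLinesReader's iteration: if the first line is a list,
    # it is taken as a header row and later lists are zipped with it;
    # otherwise each line is expected to be a dictionary and yielded as-is.
    headers = None
    first_line = True
    for raw_line in fp:
        line = json.loads(raw_line)
        if first_line:
            first_line = False
            if isinstance(line, list):
                headers = line  # header row of the list-per-line format
                continue
        if headers is not None:
            if padded and len(line) < len(headers):
                line = line + [padding] * (len(headers) - len(line))
            yield dict(zip(headers, line))  # zip drops any extra elements
        else:
            yield line  # dict-per-line format

dict_style = io.StringIO('{"something": 1, "else": "a"}\n{"something": 2, "else": "b"}\n')
list_style = io.StringIO('["something", "else"]\n[1, "a"]\n[2]\n')

print(list(read_json_lines(dict_style)))
# [{'something': 1, 'else': 'a'}, {'something': 2, 'else': 'b'}]
print(list(read_json_lines(list_style, padded=True)))
# [{'something': 1, 'else': 'a'}, {'something': 2, 'else': None}]
```

Note that the real class also raises an exception when a file mixes the two formats, which this sketch does not attempt to reproduce.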