diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f073f4a79..35ab4eac2 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,16 @@ Change Log
 ----------
 
 
+7.10.0
+======
+
+* New module ``sheet_utils`` for loading workbooks.
+
+  * class ``WorkbookManager`` for loading raw data
+
+  * class ``ItemManager`` for loading item data
+
+
 7.9.0
 =====
 
diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py
index 855fa5c80..db18fd7df 100644
--- a/dcicutils/license_utils.py
+++ b/dcicutils/license_utils.py
@@ -810,6 +810,12 @@ class C4InfrastructureLicenseChecker(LicenseChecker):
             'pytest-timeout',  # MIT Licensed
         ],
 
+        # Linking = With Restrictions, Private Use = Yes
+        # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses
+        'GNU Lesser General Public License v2 or later (LGPLv2+)': [
+            'chardet'  # used at runtime during server operation (ingestion), but not modified or distributed
+        ],
+
         # Linking = With Restrictions, Private Use = Yes
         # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses
         'GNU Lesser General Public License v3 or later (LGPLv3+)': [
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
new file mode 100644
index 000000000..8125f27d3
--- /dev/null
+++ b/dcicutils/sheet_utils.py
@@ -0,0 +1,218 @@
+import copy
+
+from dcicutils.common import AnyJsonData
+from openpyxl import load_workbook
+from openpyxl.worksheet.worksheet import Worksheet
+from openpyxl.workbook.workbook import Workbook
+from typing import Any, Dict, List, Optional, Union
+
+
+Header = str
+Headers = List[str]
+ParsedHeader = List[Union[str, int]]
+ParsedHeaders = List[ParsedHeader]
+SheetCellValue = Union[int, float, str]
+
+
+class ItemTools:
+    """
+    Implements operations on table-related data without presupposing a specific representation of the table.
+    Because it does not presuppose the source of the data or where it will be written, it can be used for
+    data obtained from .json, .csv, .tsv, and .xlsx files.
+
+    For the purpose of this class:
+
+    * a 'header' is a string representing the top of a column.
+
+    * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that
+      "a.b.c" is represented as ["a", "b", "c"] and "x.y#0" is represented as ["x", "y", 0], with each
+      numeric token represented as an int instead of a string.
+
+    * a 'headers' object is just a list of strings, each of which is a 'header'.
+
+    * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'.
+      e.g., the headers ["a.b.c", "x.y#0"] are represented as the parsed headers [["a", "b", "c"], ["x", "y", 0]].
+
+    """
+
+    @classmethod
+    def parse_sheet_header(cls, header: Header) -> ParsedHeader:
+        result = []
+        token = ""
+        for i in range(len(header)):
+            ch = header[i]
+            if ch == '.' or ch == '#':
+                if token:
+                    result.append(int(token) if token.isdigit() else token)
+                token = ""
+            else:
+                token += ch
+        if token:
+            result.append(int(token) if token.isdigit() else token)
+        return result
+
+    @classmethod
+    def parse_sheet_headers(cls, headers: Headers):
+        return [cls.parse_sheet_header(header)
+                for header in headers]
+
+    @classmethod
+    def compute_patch_prototype(cls, parsed_headers: ParsedHeaders):
+        prototype = {}
+        for parsed_header in parsed_headers:
+            parsed_header0 = parsed_header[0]
+            if isinstance(parsed_header0, int):
+                raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}")
+            cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header)
+        return prototype
+
+    @classmethod
+    def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader):
+        [key0, *more_keys] = keys
+        key1 = more_keys[0] if more_keys else None
+        if isinstance(key1, int):
+            placeholder = []
+        elif isinstance(key1, str):
+            placeholder = {}
+        else:
+            placeholder = None
+        if isinstance(key0, int):
+            n = len(parent)
+            if key0 == n:
+                parent.append(placeholder)
+            elif key0 > n:
+                raise Exception("Numeric items must occur sequentially.")
+        elif isinstance(key0, str):
+            if key0 not in parent:
+                parent[key0] = placeholder
+        if key1 is not None:
+            cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys)
+        return parent
+
+    @classmethod
+    def parse_value(cls, value: SheetCellValue) -> AnyJsonData:
+        if isinstance(value, str):
+            lvalue = value.lower()
+            # TODO: We could consult a schema to make this less heuristic, but this may do for now
+            if lvalue == 'true':
+                return True
+            elif lvalue == 'false':
+                return False
+            elif lvalue == 'null' or lvalue == '':
+                return None
+            elif '|' in value:
+                return [cls.parse_value(subvalue) for subvalue in value.split('|')]
+            else:
+                ch0 = value[0]
+                if ch0 == '+' or ch0 == '-' or ch0.isdigit():
+                    try:
+                        return int(value)
+                    except Exception:
+                        pass
+                    try:
+                        return float(value)
+                    except Exception:
+                        pass
+                return value
+        else:  # presumably a number (int or float)
+            return value
+
+    @classmethod
+    def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False):
+        if (value is None or value == '') and not force:
+            return
+        [key, *more_path] = path
+        if not more_path:
+            datum[key] = value
+        else:
+            cls.set_path_value(datum[key], more_path, value)
+
+
+class WorkbookManager:
+
+    @classmethod
+    def load_workbook(cls, filename: str):
+        wb = cls(filename)
+        return wb.load_content()
+
+    def __init__(self, filename: str):
+        self.filename: str = filename
+        self.workbook: Optional[Workbook] = None
+        self.headers_by_sheetname: Dict[str, List[str]] = {}
+        self.content_by_sheetname: Dict[str, List[Any]] = {}
+
+    def sheet_headers(self, sheetname: str) -> List[str]:
+        return self.headers_by_sheetname[sheetname]
+
+    def sheet_content(self, sheetname: str) -> List[Any]:
+        return self.content_by_sheetname[sheetname]
+
+    @classmethod
+    def _all_rows(cls, sheet: Worksheet):
+        row_max = sheet.max_row
+        for row in range(2, row_max + 1):
+            yield row
+
+    @classmethod
+    def _all_cols(cls, sheet: Worksheet):
+        col_max = sheet.max_column
+        for col in range(1, col_max + 1):
+            yield col
+
+    def _load_headers(self, sheet: Worksheet):
+        headers: List[str] = [str(sheet.cell(row=1, column=col).value)
+                              for col in self._all_cols(sheet)]
+        self.headers_by_sheetname[sheet.title] = headers
+
+    def _load_row(self, *, sheet: Worksheet, row: int):
+        headers = self.sheet_headers(sheet.title)
+        row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value
+                                    for col in self._all_cols(sheet)}
+        return row_dict
+
+    def load_content(self):
+        workbook: Workbook = load_workbook(self.filename)
+        self.workbook = workbook
+        for sheetname in workbook.sheetnames:
+            sheet: Worksheet = workbook[sheetname]
+            self._load_headers(sheet)
+            content = []
+            for row in self._all_rows(sheet):
+                row_dict = self._load_row(sheet=sheet, row=row)
+                content.append(row_dict)
+            self.content_by_sheetname[sheetname] = content
+        return self.content_by_sheetname
+
+
+class ItemManager(ItemTools, WorkbookManager):
+
+    def __init__(self, filename: str):
+        super().__init__(filename=filename)
+        self.patch_prototypes_by_sheetname: Dict[str, Dict] = {}
+        self.parsed_headers_by_sheetname: Dict[str, List[List[Union[int, str]]]] = {}
+
+    def sheet_patch_prototype(self, sheetname: str) -> Dict:
+        return self.patch_prototypes_by_sheetname[sheetname]
+
+    def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]:
+        return self.parsed_headers_by_sheetname[sheetname]
+
+    def _load_headers(self, sheet: Worksheet):
+        super()._load_headers(sheet)
+        self._compile_sheet_headers(sheet.title)
+
+    def _compile_sheet_headers(self, sheetname: str):
+        headers = self.headers_by_sheetname[sheetname]
+        parsed_headers = self.parse_sheet_headers(headers)
+        self.parsed_headers_by_sheetname[sheetname] = parsed_headers
+        prototype = self.compute_patch_prototype(parsed_headers)
+        self.patch_prototypes_by_sheetname[sheetname] = prototype
+
+    def _load_row(self, *, sheet: Worksheet, row: int):
+        parsed_headers = self.sheet_parsed_headers(sheet.title)
+        patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title))
+        for col in self._all_cols(sheet):
+            value = sheet.cell(row=row, column=col).value
+            parsed_value = self.parse_value(value)
+            self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value)
+        return patch_item
diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst
index f15307d0e..8481da6a7 100644
--- a/docs/source/dcicutils.rst
+++ b/docs/source/dcicutils.rst
@@ -281,6 +281,13 @@ secrets_utils
     :members:
 
 
+sheet_utils
+^^^^^^^^^^^
+
+.. automodule:: dcicutils.sheet_utils
+    :members:
+
+
 snapshot_utils
 ^^^^^^^^^^^^^^
 
diff --git a/poetry.lock b/poetry.lock
index d7e77523c..480148ea1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -752,6 +752,18 @@ develop = ["black", "coverage", "jinja2", "mock", "pytest", "pytest-cov", "pyyam
 docs = ["sphinx (<1.7)", "sphinx-rtd-theme"]
 requests = ["requests (>=2.4.0,<3.0.0)"]
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.1.2"
@@ -911,6 +923,21 @@ files = [
 [package.dependencies]
 psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""}
 
+[[package]]
+name = "openpyxl"
+version = "3.1.2"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
+    {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "opensearch-py"
 version = "2.3.0"
@@ -1594,4 +1621,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.7,<3.10"
-content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e"
+content-hash = "9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b"
diff --git a/pyproject.toml b/pyproject.toml
index 70f90b624..ec5adce92 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "7.9.0"
+version = "7.9.0.1b2"  # to become "7.10.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
@@ -37,6 +37,7 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = ">=3.7,<3.10"
+
 boto3 = "^1.17.39"
 botocore = "^1.20.39"
 # The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version.
@@ -45,20 +46,21 @@ elasticsearch = "7.13.4"
 aws-requests-auth = ">=0.4.2,<1"
 docker = "^4.4.4"
 gitpython = "^3.1.2"
+openpyxl = "^3.1.2"
+opensearch-py = "^2.0.1"
+pyOpenSSL = "^23.1.1"
+PyJWT = "^2.6.0"
 pytz = ">=2020.4"
 PyYAML = ">=5.1,<5.5"
+redis = "^4.5.1"
 requests = "^2.21.0"
 rfc3986 = "^1.4.0"
 structlog = "^19.2.0"
 toml = ">=0.10.1,<1"
+tqdm = "^4.65.0"
 typing-extensions = ">=3.8"  # Fourfront uses 3.8
 urllib3 = "^1.26.6"
 webtest = "^2.0.34"
-opensearch-py = "^2.0.1"
-redis = "^4.5.1"
-pyOpenSSL = "^23.1.1"
-PyJWT = "^2.6.0"
-tqdm = "^4.65.0"
 
 
 [tool.poetry.dev-dependencies]
diff --git a/test/data_files/sample_items.xlsx b/test/data_files/sample_items.xlsx
new file mode 100644
index 000000000..19ca2acc8
Binary files /dev/null and b/test/data_files/sample_items.xlsx differ
diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv
new file mode 100644
index 000000000..b1d3ec2da
--- /dev/null
+++ b/test/data_files/sample_items_sheet2.csv
@@ -0,0 +1,3 @@
+name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age
+bill,23,mary,58,fred,63,sam,22,arthur,19
+joe,9,estrella,35,anthony,34,anders,9,,
\ No newline at end of file
diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py
new file mode 100644
index 000000000..40286d2e3
--- /dev/null
+++ b/test/test_sheet_utils.py
@@ -0,0 +1,197 @@
+import os
+import pytest
+
+from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager
+from .conftest_settings import TEST_DIR
+
+
+def test_item_tools_parse_sheet_header():
+    assert ItemTools.parse_sheet_header('.a') == ['a']
+    assert ItemTools.parse_sheet_header('a') == ['a']
+    assert ItemTools.parse_sheet_header('#0') == [0]
+    assert ItemTools.parse_sheet_header('0') == [0]
+    assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar']
+    assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0]
+    assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z']
+
+    # We don't error-check this, but it shouldn't matter
+    assert ItemTools.parse_sheet_header('#abc') == ['abc']
+    assert ItemTools.parse_sheet_header('.123') == [123]
+    assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def']
+
+
+def test_item_tools_parse_sheet_headers():
+    input = ['a.b', 'a.c', 'a.d#1', 'a.d#2']
+    expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]]
+    assert ItemTools.parse_sheet_headers(input) == expected
+
+
+@pytest.mark.parametrize('parsed_headers,expected_prototype', [
+    (['a'],
+     {'a': None}),
+    (['a', 'b'],
+     {'a': None, 'b': None}),
+    (['a.b', 'a.c', 'a.d#0', 'a.d#1'],
+     {'a': {'b': None, 'c': None, 'd': [None, None]}}),
+    (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'],
+     {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}),
+    (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'],
+     {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}),
+])
+def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype):
+    parsed_headers = ItemTools.parse_sheet_headers(parsed_headers)
+    assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype
+
+
+@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']])
+def test_item_tools_compute_patch_prototype_errors(headers):
+
+    parsed_headers = ItemTools.parse_sheet_headers(headers)
+    with pytest.raises(ValueError) as exc:
+        ItemTools.compute_patch_prototype(parsed_headers)
+    assert str(exc.value) == "A header cannot begin with a numeric ref: 0"
+
+
+def test_item_tools_parse_value():
+
+    for x in [37, 19.3, True, False, None, 'simple text']:
+        assert ItemTools.parse_value(x) == x
+
+    assert ItemTools.parse_value('3') == 3
+    assert ItemTools.parse_value('+3') == 3
+    assert ItemTools.parse_value('-3') == -3
+
+    assert ItemTools.parse_value('3.5') == 3.5
+    assert ItemTools.parse_value('+3.5') == 3.5
+    assert ItemTools.parse_value('-3.5') == -3.5
+
+    assert ItemTools.parse_value('3.5e1') == 35.0
+    assert ItemTools.parse_value('+3.5e1') == 35.0
+    assert ItemTools.parse_value('-3.5e1') == -35.0
+
+    assert ItemTools.parse_value('') is None
+
+    assert ItemTools.parse_value('null') is None
+    assert ItemTools.parse_value('Null') is None
+    assert ItemTools.parse_value('NULL') is None
+
+    assert ItemTools.parse_value('true') is True
+    assert ItemTools.parse_value('True') is True
+    assert ItemTools.parse_value('TRUE') is True
+
+    assert ItemTools.parse_value('false') is False
+    assert ItemTools.parse_value('False') is False
+    assert ItemTools.parse_value('FALSE') is False
+
+    assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma']
+    assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5]
+
+
+def test_item_tools_set_path_value():
+
+    x = {'foo': 1, 'bar': 2}
+    ItemTools.set_path_value(x, ['foo'], 3)
+    assert x == {'foo': 3, 'bar': 2}
+
+    x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}}
+    ItemTools.set_path_value(x, ['foo', 1], 17)
+    assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}}
+
+    x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}}
+    ItemTools.set_path_value(x, ['bar', 'x'], 'something')
+    assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}}
+
+
+SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx')
+
+SAMPLE_XLSX_FILE_RAW_CONTENT = {
+    "Sheet1": [
+        {"x": 1, "y.a": 1, "y.z": 1},
+        {"x": 1, "y.a": 2, "y.z": 3},
+        {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"},
+    ],
+    "Sheet2": [
+        {
+            "name": "bill", "age": 23,
+            "mother.name": "mary", "mother.age": 58,
+            "father.name": "fred", "father.age": 63,
+            "friends#0.name": "sam", "friends#0.age": 22,
+            "friends#1.name": "arthur", "friends#1.age": 19,
+        },
+        {
+            "name": "joe", "age": 9,
+            "mother.name": "estrella", "mother.age": 35,
+            "father.name": "anthony", "father.age": 34,
+            "friends#0.name": "anders", "friends#0.age": 9,
+            "friends#1.name": None, "friends#1.age": None,
+        },
+    ]
+}
+
+SAMPLE_XLSX_FILE_ITEM_CONTENT = {
+    "Sheet1": [
+        {"x": 1, "y": {"a": 1, "z": 1}},
+        {"x": 1, "y": {"a": 2, "z": 3}},
+        {"x": "alpha", "y": {"a": "beta", "z": ["gamma", "delta"]}},
+    ],
+    "Sheet2": [
+        {
+            "name": "bill", "age": 23,
+            "mother": {"name": "mary", "age": 58},
+            "father": {"name": "fred", "age": 63},
+            "friends": [
+                {"name": "sam", "age": 22},
+                {"name": "arthur", "age": 19},
+            ]
+        },
+        {
+            "name": "joe", "age": 9,
+            "mother": {"name": "estrella", "age": 35},
+            "father": {"name": "anthony", "age": 34},
+            "friends": [
+                {"name": "anders", "age": 9},
+                {"name": None, "age": None}
+            ]
+        },
+    ],
+}
+
+SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv')
+
+SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']
+
+SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']
+
+
+def test_workbook_manager_load_content():
+
+    wt = WorkbookManager(SAMPLE_XLSX_FILE)
+    assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT
+
+
+def test_workbook_manager_load_workbook():
+
+    assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT
+
+
+def test_workbook_manager_load_csv():
+
+    with pytest.raises(Exception):
+        WorkbookManager.load_workbook(SAMPLE_CSV_FILE)
+
+
+def test_item_manager_load_content():
+
+    it = ItemManager(SAMPLE_XLSX_FILE)
+    assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT
+
+
+def test_item_manager_load_workbook():
+
+    assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT
+
+
+def test_item_manager_load_csv():
+
+    with pytest.raises(Exception):
+        ItemManager.load_workbook(SAMPLE_CSV_FILE)
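A rough usage sketch for the new loaders, based only on the classes added in ``dcicutils/sheet_utils.py``
and the fixtures added under ``test/data_files`` (the path below is the sample workbook from this patch, and
the expected values mirror the constants in ``test/test_sheet_utils.py``)::

    from dcicutils.sheet_utils import ItemManager, WorkbookManager

    # Raw view: one dict per row, keyed by the literal column headers (e.g. "mother.name").
    raw = WorkbookManager.load_workbook("test/data_files/sample_items.xlsx")
    assert raw["Sheet2"][0]["mother.name"] == "mary"

    # Item view: headers like "mother.name" and "friends#0.age" become nested dicts/lists, and
    # cell strings such as "true", "null", or "gamma|delta" are coerced by ItemTools.parse_value.
    items = ItemManager.load_workbook("test/data_files/sample_items.xlsx")
    assert items["Sheet2"][0]["mother"] == {"name": "mary", "age": 58}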