diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 345c0a1d1..270ae0a39 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,33 @@ Change Log ---------- +7.12.0 +====== + +* New module ``sheet_utils`` for loading workbooks. + + * Important things of interest: + + * Class ``ItemManager`` for loading Item-style data + from any ``.xlsx``, ``.csv`` or ``.tsv`` files. + + * Function ``load_items`` that does the same as ``ItemManager.load``. + + * Various lower-level implementation classes such as: + + * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. + + * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. + +* New functionality in ``misc_utils``: + + * New function ``is_uuid`` (migrated from Fourfront) + * New function ``pad_to`` + * New class ``JsonLinesReader`` + + 7.11.0 ====== @@ -16,6 +43,7 @@ Change Log * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` returning webtest.response.TestResponse which has a ``json`` object property rather than a function. + 7.10.0 ====== diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index cc18f4b19..8ebd991a4 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -9,6 +9,7 @@ import inspect import math import io +import json import os import logging import pytz @@ -191,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp): pass -class VirtualApp: +class AbstractVirtualApp: + pass + + +class VirtualApp(AbstractVirtualApp): """ Wrapper class for TestApp, to allow custom control over submitting Encoded requests, simulating a number of conditions, including permissions. @@ -1352,6 +1357,25 @@ def capitalize1(s): return s[:1].upper() + s[1:] +""" +Python's UUID ignores all dashes, whereas Postgres is more strict +http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +See also http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +And, anyway, this pattern is what our portals have been doing +for quite a while, so it's the most stable choice for us now. +""" + +uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') + + +def is_uuid(instance): + """ + Predicate returns true for any group of 32 hex characters with optional hyphens every four characters. + We insist on lowercase to make matching faster. See other notes on this design choice above. + """ + return bool(uuid_re.match(instance)) + + def string_list(s): """ Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls. @@ -2313,3 +2337,73 @@ def parse_in_radix(text: str, *, radix: int): except Exception: pass raise ValueError(f"Unable to parse: {text!r}") + + +def pad_to(target_size: int, data: list, *, padding=None): + """ + This will pad to a given target size, a list of a potentially different actual size, using given padding. + e.g., pad_to(3, [1, 2]) will return [1, 2, None] + """ + actual_size = len(data) + if actual_size < target_size: + data = data + [padding] * (target_size - actual_size) + return data + + +class JsonLinesReader: + + def __init__(self, fp, padded=False, padding=None): + """ + Given an fp (the conventional name for a "file pointer", the thing a call to io.open returns, + this creates an object that can be used to iterate across the lines in the JSON lines file + that the fp is reading from. + + There are two possible formats that this will return. 
+ + For files that contain a series of dictionaries, such as: + {"something": 1, "else": "a"} + {"something": 2, "else": "b"} + ...etc + this will just return thos those dictionaries one-by-one when iterated over. + + The same set of dictionaries will also be yielded by a file containing: + ["something", "else"] + [1, "a"] + [2, "b"] + ...etc + this will just return thos those dictionaries one-by-one when iterated over. + + NOTES: + + * In the second case, shorter lists on subsequent lines return only partial dictionaries. + * In the second case, longer lists on subsequent lines will quietly drop any extra elements. + """ + + self.fp = fp + self.padded: bool = padded + self.padding = padding + self.headers = None # Might change after we see first line + + def __iter__(self): + first_line = True + n_headers = 0 + for raw_line in self.fp: + line = json.loads(raw_line) + if first_line: + first_line = False + if isinstance(line, list): + self.headers = line + n_headers = len(line) + continue + # If length of line is more than we expect, ignore it. Let user put comments beyond our table + # But if length of line is less than we expect, extend the line with None + if self.headers: + if not isinstance(line, list): + raise Exception("If the first line is a list, all lines must be.") + if self.padded and len(line) < n_headers: + line = pad_to(n_headers, line, padding=self.padding) + yield dict(zip(self.headers, line)) + elif isinstance(line, dict): + yield line + else: + raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}") diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py new file mode 100644 index 000000000..5a311f7c0 --- /dev/null +++ b/dcicutils/sheet_utils.py @@ -0,0 +1,1131 @@ +import chardet +import contextlib +import copy +import csv +import glob +import io +import json +import openpyxl +import os +import re +import subprocess +import uuid +import yaml + +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.workbook.workbook import Workbook +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from .common import AnyJsonData +from .env_utils import public_env_name, EnvUtils +from .ff_utils import get_schema +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are +from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix +from .task_utils import pmap + + +Header = str +Headers = List[str] +ParsedHeader = List[Union[str, int]] +ParsedHeaders = List[ParsedHeader] +SheetCellValue = Union[int, float, str] +SheetRow = List[SheetCellValue] +CsvReader = type(csv.reader(TemporaryFile())) +SheetData = List[dict] +TabbedSheetData = Dict[str, SheetData] +Regexp = type(re.compile("sample")) + + +class LoadFailure(Exception): + """ + In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail, + but some errors are so confusing or so problematic that we need to just fail the load right away. + """ + pass + + +class LoadArgumentsError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with the given arguments. + """ + pass + + +class LoadTableError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with some table's syntax, for example headers that don't make sense. 
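    One concrete case, raised later by ItemTools.compute_patch_prototype, is a header that begins
    with a numeric reference (e.g., "#0.name"), which cannot be mapped onto an item prototype.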
+ """ + pass + + +@contextlib.contextmanager +def deferred_problems(): + problems = [] + + def note_problems(problem): + problems.append(problem) + + yield note_problems + + if problems: + for problem in problems: + PRINT(f"Problem: {problem}") + raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + + +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = [f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise LoadArgumentsError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + +def prefer_number(value: SheetCellValue): + if isinstance(value, str): # the given value might be an int or float, in which case just fall through + if not value: + return None + value = value + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + # If we couldn't parse it as an int or float, fall through to returning the original value + pass + return value + + +def expand_string_escape_sequences(text: str) -> str: + s = io.StringIO() + escaping = False + for ch in text: + if escaping: + if ch == 'r': + s.write('\r') + elif ch == 't': + s.write('\t') + elif ch == 'n': + s.write('\n') + elif ch == '\\': + s.write('\\') + else: + # Rather than err, just leave other sequences as-is. + s.write(f"\\{ch}") + escaping = False + elif ch == '\\': + escaping = True + else: + s.write(ch) + return s.getvalue() + + +def open_unicode_text_input_file_respecting_byte_order_mark(filename): + """ + Opens a file for text input, respecting a byte-order mark (BOM). + """ + with io.open(filename, 'rb') as fp: + leading_bytes = fp.read(4 * 8) # 4 bytes is all we need + bom_info = chardet.detect(leading_bytes, should_rename_legacy=True) + detected_encoding = bom_info and bom_info.get('encoding') # tread lightly + use_encoding = 'utf-8' if detected_encoding == 'ascii' else detected_encoding + return io.open(filename, 'r', encoding=use_encoding) + + +class TypeHint: + def apply_hint(self, value): + return value + + def __str__(self): + return f"<{self.__class__.__name__}>" + + def __repr__(self): + return self.__str__() + + +class BoolHint(TypeHint): + + def apply_hint(self, value): + if isinstance(value, str) and value: + if 'true'.startswith(value.lower()): + return True + elif 'false'.startswith(value.lower()): + return False + return super().apply_hint(value) + + +class EnumHint(TypeHint): + + def __str__(self): + return f"" + + def __init__(self, value_map): + self.value_map = value_map + + def apply_hint(self, value): + if isinstance(value, str): + if value in self.value_map: + result = self.value_map[value] + return result + else: + lvalue = value.lower() + found = [] + for lkey, key in self.value_map.items(): + if lkey.startswith(lvalue): + found.append(lkey) + if len(found) == 1: + [only_found] = found + result = self.value_map[only_found] + return result + return super().apply_hint(value) + + +OptionalTypeHints = List[Optional[TypeHint]] + + +class ItemTools: + """ + Implements operations on table-related data without pre-supposing the specific representation of the table. 
+ It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because + it does not presuppose the source of the data nor where it will be written to. + + For the purpose of this class: + + * a 'header' is a string representing the top of a column. + + * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that + "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing + each numeric token as an int instead of a string. + + * a 'headers' object is just a list of strings, each of which is a 'header'. + + * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. + e..g., the headers ["a.b.c", "x.y#0"] is represented as parsed hearders [["a", "b", "c"], ["x", "y", 0]]. + + """ + + @classmethod + def parse_sheet_header(cls, header: Header) -> ParsedHeader: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + @classmethod + def parse_sheet_headers(cls, headers: Headers): + return [cls.parse_sheet_header(header) + for header in headers] + + @classmethod + def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): + prototype = {} + for parsed_header in parsed_headers: + parsed_header0 = parsed_header[0] + if isinstance(parsed_header0, int): + raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") + cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) + return prototype + + @classmethod + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): + [key0, *more_keys] = keys + key1 = more_keys[0] if more_keys else None + if isinstance(key1, int): + placeholder = [] + elif isinstance(key1, str): + placeholder = {} + else: + placeholder = None + if isinstance(key0, int): + n = len(parent) + if key0 == n: + parent.append(placeholder) + elif key0 > n: + raise LoadTableError("Numeric items must occur sequentially.") + elif isinstance(key0, str): + if key0 not in parent: + parent[key0] = placeholder + if key1 is not None: + cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) + return parent + + INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default + + @classmethod + def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: + # TODO: Remodularize this for easier testing and more Schema-driven effect + # Doug asks that this be broken up into different mechanisms, more modular and separately testable. + # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. 
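        # Illustrative expectations, following the branches below (the inputs are hypothetical):
        #   parse_item_value("true")  -> True
        #   parse_item_value("")      -> None
        #   parse_item_value("|")     -> []
        #   parse_item_value("a|b")   -> ["a", "b"]
        #   parse_item_value("3")     -> 3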
+ if isinstance(value, str): + lvalue = value.lower() + # TODO: We could consult a schema to make this less heuristic, but this may do for now + if lvalue == 'true': + return True + elif lvalue == 'false': + return False + elif lvalue == 'null' or lvalue == '': + return None + elif '|' in value: + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] + elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): + # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid + return cls.get_instaguid(value, context=context) + else: + # Doug points out that the schema might not agree, might want a string representation of a number. + # At this semantic layer, this might be a bad choice. + return prefer_number(value) + else: # presumably a number (int or float) + return value + + @classmethod + def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): + if context is None: + return guid_placeholder + else: + referent = context.get(guid_placeholder) + if not referent: + context[guid_placeholder] = referent = str(uuid.uuid4()) + return referent + + @classmethod + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + @classmethod + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): + + def finder(subheader, subschema): + if not parsed_header: + return None + else: + [key1, *other_headers] = subheader + if isinstance(key1, str) and isinstance(subschema, dict): + if subschema.get('type') == 'object': + def1 = subschema.get('properties', {}).get(key1) + if not other_headers: + if def1 is not None: + t = def1.get('type') + if t == 'string': + enum = def1.get('enum') + if enum: + mapping = {e.lower(): e for e in enum} + return EnumHint(mapping) + elif t == 'boolean': + return BoolHint() + else: + pass # fall through to asking super() + else: + pass # fall through to asking super() + else: + return finder(subheader=other_headers, subschema=def1) + + return finder(subheader=parsed_header, subschema=schema) + + @classmethod + def infer_tab_name(cls, filename): + return os.path.basename(filename).split('.')[0] + + +# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# +# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. +# I am less certain but open to discussion. Among other things, as implemented now, +# the __init__ method here needs to run and the documentation says that ABC's won't appear +# in the method resolution order. -kmp 17-Aug-2023 +# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 +class AbstractTableSetManager: + """ + The TableSetManager is the spanning class of anything that wants to be able to load a table set, + regardless of what it wants to load it from. 
To do this, it must support a load method + that takes a filename and returns the file content in the form: + { + "Sheet1": [ + {...representation of row1 as some kind of dict...}, + {...representation of row2 as some kind of dict...} + ], + "Sheet2": [...], + ..., + } + It also needs some implementation of the .tab_names property. + Note that at this level of abstraction, we take no position on what form of representation is used + for the rows, as long as it is JSON data of some kind. It might be + {"col1": "val1", "col2": "val2", ...} + or it might be something more structured like + {"something": "val1", {"something_else": ["val2"]}} + Additionally, the values stored might be altered as well. In particular, the most likely alteration + is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations + happen is not constrained by this class. + """ + + ALLOWED_FILE_EXTENSIONS: List[str] = [] + + def __init__(self, filename: str, **kwargs): + self.filename: str = filename + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) + + # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) + @classmethod + def load(cls, filename: str, **kwargs) -> TabbedSheetData: + """ + Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. + For more information, see documentation of AbstractTableSetManager. + """ + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA + + @property + def tab_names(self) -> List[str]: + raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA + + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA + + +class BasicTableSetManager(AbstractTableSetManager): + """ + A BasicTableManager provides some structure that most kinds of parsers will need. + In particular, everything will likely need some way of storing headers and some way of storing content + of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case + of this where there's only one set of headers and only one block of content. + """ + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + self.headers_by_tab_name: Dict[str, Headers] = {} + self.content_by_tab_name: Dict[str, SheetData] = {} + self.reader_agent: Any = self._get_reader_agent() + + def tab_headers(self, tab_name: str) -> Headers: + return self.headers_by_tab_name[tab_name] + + def tab_content(self, tab_name: str) -> List[AnyJsonData]: + return self.content_by_tab_name[tab_name] + + @classmethod + def _create_tab_processor_state(cls, tab_name: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. 
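        For instance, ItemManagerMixin (later in this module) returns a per-tab SheetState holding
        the parsed headers and type hints, and that object is passed back to each _process_row call.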
+ """ + ignored(tab_name) # subclasses might need this, but we don't + return None + + def _get_reader_agent(self) -> Any: + """This function is responsible for opening the workbook and returning a workbook object.""" + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA + + +class SemanticTableSetManager(BasicTableSetManager): + """ + This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing. + Those may be: + * Excel workbook readers (.xlsx) + * Comma-separated file readers (.csv) + * Tab-separarated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright + refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt) + There are two levels to each of these: a class that is not semantically interpreted, + and a class that is semantically interpreted as an "item". + + This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing + were already done (in part so that they can be used to test the results of other formats): + * Json files + * Yaml files + * Inserts directories + * JsonLines files + """ + + @classmethod + def load(cls, filename: str, **kwargs) -> AnyJsonData: + if cls.ALLOWED_FILE_EXTENSIONS: + if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): + raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" + f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") + + table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs) + return table_set_manager.load_content() + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + """ + Given a tab_name and a state (returned by _sheet_loader_state), return a generator for a set of row values. + """ + raise NotImplementedError(f"._rows_for_tab_name(...) is not implemented for {self.__class__.__name__}.") # noQA + + def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + """ + This needs to take a state and whatever represents a row and + must return a list of objects representing column values. + What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. + """ + raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") # noQA + + def load_content(self) -> AnyJsonData: + for tab_name in self.tab_names: + sheet_content = [] + state = self._create_tab_processor_state(tab_name) + for row_data in self._raw_row_generator_for_tab_name(tab_name): + processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data) + sheet_content.append(processed_row_data) + self.content_by_tab_name[tab_name] = sheet_content + return self.content_by_tab_name + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) + + +class AbstractItemManager(AbstractTableSetManager): + + pass + + +class TableSetManagerRegistry: + + def __init__(self): + self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {} + self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = [] + + def register(self, regexp: Optional[str] = None): + def _wrapped_register(class_to_register: Type[AbstractTableSetManager]): + if regexp: + self.regexp_mappings.append((re.compile(regexp), class_to_register)) + for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: + existing = self.manager_table.get(ext) + if existing: + raise Exception(f"Tried to define {class_to_register} to extension {ext}," + f" but {existing} already claimed that.") + self.manager_table[ext] = class_to_register + return class_to_register + return _wrapped_register + + register1 = register + + def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]: + base: str = os.path.basename(filename) + suffix_parts = base.split('.')[1:] + if suffix_parts: + for i in range(0, len(suffix_parts)): + suffix = f".{'.'.join(suffix_parts[i:])}" + found: Optional[Type[AbstractTableSetManager]] = self.manager_table.get(suffix) + if found: + return found + else: + special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename) + if special_case: + return special_case + raise LoadArgumentsError(f"Unknown file type: {filename}") + + def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]: + for pattern, manager_class in self.regexp_mappings: + if pattern.match(filename): + return manager_class + return None + + +TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry() +ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() + + +@TABLE_SET_MANAGER_REGISTRY.register() +class XlsxManager(SemanticTableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
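
    A rough usage sketch (the workbook name, sheet names, and values here are illustrative
    assumptions, not anything fixed by this class):

        XlsxManager.load("workbook.xlsx")
        # => {"Sheet1": [{"name": "foo", "size": 10}, ...], "Sheet2": [...]}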
+ """ + + ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + + @classmethod + def _all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in range(2, row_max + 1): + yield row + + @classmethod + def _all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + @property + def tab_names(self) -> List[str]: + return self.reader_agent.sheetnames + + def _get_reader_agent(self) -> Workbook: + return openpyxl.load_workbook(self.filename) + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tab_name] + return (self._get_raw_row_content_tuple(sheet, row) + for row in self._all_rows(sheet)) + + def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: + return [sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)] + + def _create_tab_processor_state(self, tab_name: str) -> Headers: + sheet = self.reader_agent[tab_name] + headers: Headers = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] + self.headers_by_tab_name[sheet.title] = headers + return headers + + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class SchemaAutoloadMixin(AbstractTableSetManager): + + SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. + CACHE_SCHEMAS = True # Controls whether we're doing caching at all + AUTOLOAD_SCHEMAS_DEFAULT = True + + def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): + # This setup must be in place before the class initialization is done (via the super call). + self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas + if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") + self.portal_env: Optional[str] = portal_env + self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp + super().__init__(filename=filename, **kwargs) + + def fetch_relevant_schemas(self, schema_names: List[str]): + # The schema_names argument is not normally given, but it is there for easier testing + def fetch_schema(schema_name): + schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + return schema_name, schema + if self.autoload_schemas and (self.portal_env or self.portal_vapp): + autoloaded = {tab_name: schema + for tab_name, schema in pmap(fetch_schema, schema_names)} + return autoloaded + else: + return {} + + @classmethod + def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None): + def just_fetch_it(): + return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) + if cls.CACHE_SCHEMAS: + schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) + if schema is None: + cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() + return schema + else: + return just_fetch_it() + + @classmethod + def clear_schema_cache(cls): + for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + cls.SCHEMA_CACHE.pop(key, None) + + +class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. + """ + + def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} + self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} + self._schemas = schemas + self._instaguid_context_table: Dict[str, str] = {} + + @property + def schemas(self): + schemas = self._schemas + if schemas is None: + self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names) + return schemas + + def sheet_patch_prototype(self, tab_name: str) -> Dict: + return self.patch_prototypes_by_tab_name[tab_name] + + def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: + return self.parsed_headers_by_tab_name[tab_name] + + def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: + return self.type_hints_by_tab_name[tab_name] + + class SheetState: + + def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + self.parsed_headers = parsed_headers + self.type_hints = type_hints + + def _compile_type_hints(self, tab_name: str): + parsed_headers = self.sheet_parsed_headers(tab_name) + schema = self.schemas.get(tab_name) + with deferred_problems() as note_problem: + for required_header in self._schema_required_headers(schema): + if required_header not in parsed_headers: + note_problem("Missing required header") + type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None + for parsed_header in parsed_headers] + self.type_hints_by_tab_name[tab_name] = type_hints + + @classmethod + def _schema_required_headers(cls, schema): + ignored(schema) + return [] # TODO: Make this compute a list of required headers (in parsed header form) + + def _compile_sheet_headers(self, tab_name: str): + headers = self.headers_by_tab_name[tab_name] + 
parsed_headers = ItemTools.parse_sheet_headers(headers) + self.parsed_headers_by_tab_name[tab_name] = parsed_headers + prototype = ItemTools.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_tab_name[tab_name] = prototype + + def _create_tab_processor_state(self, tab_name: str) -> SheetState: + super()._create_tab_processor_state(tab_name) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. + self._compile_sheet_headers(tab_name) + self._compile_type_hints(tab_name) + return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), + type_hints=self.sheet_type_hints(tab_name)) + + def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: + parsed_headers = state.parsed_headers + type_hints = state.type_hints + patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + type_hint = type_hints[i] + if type_hint: + parsed_value = type_hint.apply_hint(parsed_value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) + return patch_item + + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value, context=self._instaguid_context_table) + + +@ITEM_MANAGER_REGISTRY.register() +class XlsxItemManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. + """ + pass + + +class SingleTableMixin(AbstractTableSetManager): + + def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): + self._tab_name = tab_name or ItemTools.infer_tab_name(filename) + super().__init__(filename=filename, **kwargs) + + @property + def tab_names(self) -> List[str]: + return [self._tab_name] + + +class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really appropriate here + + ALLOWED_FILE_EXTENSIONS = [] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + raise NotImplementedError(f"._parse_inserts_dataa(...) 
is not implemented for {self.__class__.__name__}.") # noQA + + def _load_inserts_data(self, filename: str) -> TabbedSheetData: + data: AnyJsonData = self._parse_inserts_data(filename) + tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data) + if (not isinstance(tabbed_inserts, dict) + or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys()) + or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in tabbed_inserts.values())): + raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).") + tabbed_inserts: TabbedSheetData # we've just checked that + return tabbed_inserts + + @classmethod + def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData: + ignored(filename) + return data + + @property + def tab_names(self) -> List[str]: + return list(self.content_by_tab_name.keys()) + + def _get_reader_agent(self) -> Any: + return self + + def load_content(self) -> Dict[str, AnyJsonData]: + data = self._load_inserts_data(self.filename) + for tab_name, tab_content in data.items(): + self.content_by_tab_name[tab_name] = tab_content + if not tab_content: + self.headers_by_tab_name[tab_name] = [] + else: + self.headers_by_tab_name[tab_name] = list(tab_content[0].keys()) + return self.content_by_tab_name + + +class SimpleInsertsMixin(SingleTableMixin): + + def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData: + if (not isinstance(data, list) + or not all(isinstance(item, dict) for item in data)): + raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).") + return {self._tab_name: data} + + +class JsonInsertsMixin: + + @classmethod + def _parse_inserts_data(cls, filename: str) -> AnyJsonData: + return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".json"] + + +class YamlInsertsMixin: + + def _parse_inserts_data(self, filename) -> AnyJsonData: + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] + + def _parse_inserts_data(self, filename) -> AnyJsonData: + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".yaml"] + + +class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here + """ + This class is used for inserts directories and other JSON-like data that will be literally used as an Item + without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness + but instead assumed to have been checked by other means. + """ + + AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. 
+ + def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, + **kwargs): + ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. + if schemas not in [None, {}]: + raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") + if autoload_schemas not in [None, False]: + raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") + super().__init__(filename=filename, **kwargs) + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".jsonl"] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryManager(InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + if not os.path.isdir(filename): + raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") + tab_files = glob.glob(os.path.join(filename, "*.json")) + data = {} + for tab_file in tab_files: + tab_content = json.load(open_unicode_text_input_file_respecting_byte_order_mark(tab_file)) + # Here we don't use os.path.splitext because we want to split on the first dot. + # e.g., for foo.bar.baz, return just foo + # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes + # for things unrelated to the tab name. + tab_name = os.path.basename(tab_file).split('.')[0] + data[tab_name] = tab_content + return data + + +@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class CsvManager(SingleTableMixin, SemanticTableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. 
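
    A rough usage sketch, using one of the sample files added under test/data_files/
    (the single tab name is inferred from the filename):

        CsvManager.load("test/data_files/sample_items2.csv")
        # => {"sample_items2": [{"name": "john", "sex": "M", "member": "false"}, ...]}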
+ """ + + ALLOWED_FILE_EXTENSIONS = ['.csv'] + + def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.escaping: bool = escaping or False + + def _get_reader_agent(self) -> CsvReader: + return self._get_reader_agent_for_filename(self.filename) + + @classmethod + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + PAD_TRAILING_TABS = True + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + headers = self.tab_headers(tab_name) + n_headers = len(headers) + for row_data in self.reader_agent: + if self.PAD_TRAILING_TABS: + row_data = pad_to(n_headers, row_data, padding='') + yield row_data + + def _create_tab_processor_state(self, tab_name: str) -> Headers: + headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name) + if headers is None: + self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__() + return headers + + @classmethod + def _escape_cell_text(cls, cell_text): + if '\\' in cell_text: + return expand_string_escape_sequences(cell_text) + else: + return cell_text + + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) + if self.escaping: + return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text)) + for i, cell_text in enumerate(row_data)} + else: + return {headers[i]: self.parse_cell_value(cell_text) + for i, cell_text in enumerate(row_data)} + + +@ITEM_MANAGER_REGISTRY.register() +class CsvItemManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TsvManager(CsvManager): + """ + TSV files are just CSV files with tabs instead of commas as separators. + (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) + """ + ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt'] + + @classmethod + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + + +@ITEM_MANAGER_REGISTRY.register() +class TsvItemManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + +def _do_shell_command(command, cwd=None): + # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023 + subprocess.check_output(command, cwd=cwd) + + +@contextlib.contextmanager +def maybe_unpack(filename): # Maybe move to another module + """ + If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not). + """ + unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip'] + ext = None + for unpackable in unpackables: + if filename.endswith(unpackable): + ext = unpackable + break + if not ext: + yield filename + return + if not os.path.exists(filename): + # We don't bother to raise this error if we're not planning to do any unpacking. + # The caller can decide if/when such errors are needed in that case. + # But if we are going to have to move bits around, they'll need to actually be there. 
+ # -kmp 12-Sep-2023 + raise ValueError(f"The file {filename!r} does not exist.") + target_base_part = remove_suffix(ext, os.path.basename(filename), required=True) + target_ext = '.tar.gz' if ext == '.tgz' else ext + with TemporaryDirectory() as temp_dir: + temp_base = os.path.join(temp_dir, target_base_part) + temp_filename = temp_base + target_ext + _do_shell_command(['cp', filename, temp_filename]) + if temp_filename.endswith('.gz'): + _do_shell_command(['gunzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.gz', temp_filename) + elif temp_filename.endswith(".zip"): + _do_shell_command(['unzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.zip', temp_filename) + if temp_filename.endswith(".tar"): + _do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir) + tar_file = temp_filename + temp_filename = remove_suffix(".tar", temp_filename, required=True) + if not os.path.isdir(temp_filename): + raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}") + # print(f"Unpacked {filename} to {temp_filename}") + yield temp_filename + + +class TableSetManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager: + reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename) + if issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options + """ + with maybe_unpack(filename) as filename: + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + **kwargs) + return manager.load_content() + + +class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: + reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) + if not issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent_class: Type[AbstractItemManager] + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options, loads the items associated with that filename. + + :param filename: The name of the file to load. + :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), + the tab name to associate with the data. + :param escaping: Whether to perform escape processing on backslashes. 
+ :param schemas: A set of schemas to use instead of trying to load them. + :param autoload_schemas: Whether to try autoloading schemas. + :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). + :param portal_vapp: A vapp to use (usually if calling from within a portal). + """ + + with maybe_unpack(filename) as filename: + + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + schemas=schemas, autoload_schemas=autoload_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + **kwargs) + return manager.load_content() + + +load_table_set = TableSetManager.load +load_items = ItemManager.load diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index f15307d0e..8481da6a7 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -281,6 +281,13 @@ secrets_utils :members: +sheet_utils +^^^^^^^^^^^ + +.. automodule:: dcicutils.sheet_utils + :members: + + snapshot_utils ^^^^^^^^^^^^^^ diff --git a/poetry.lock b/poetry.lock index d7e77523c..95670b506 100644 --- a/poetry.lock +++ b/poetry.lock @@ -489,6 +489,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -752,6 +764,18 @@ develop = ["black", "coverage", "jinja2", "mock", "pytest", "pytest-cov", "pyyam docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.1.2" @@ -911,6 +935,21 @@ files = [ [package.dependencies] psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""} +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opensearch-py" version = "2.3.0" @@ -1594,4 +1633,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "eb629a04469e24b917d9525dd06dac72f2014cc9ede879946909929f5c09b9fd" diff --git a/pyproject.toml b/pyproject.toml index 65dba0353..846624504 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ 
[tool.poetry] name = "dcicutils" -version = "7.11.0" +version = "7.11.0.1b9" # to become "7.12.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -37,28 +37,31 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.7,<3.10" + boto3 = "^1.17.39" botocore = "^1.20.39" # The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version. # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" +chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" +openpyxl = "^3.1.2" +opensearch-py = "^2.0.1" +pyOpenSSL = "^23.1.1" +PyJWT = "^2.6.0" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" +redis = "^4.5.1" requests = "^2.21.0" rfc3986 = "^1.4.0" structlog = "^19.2.0" toml = ">=0.10.1,<1" +tqdm = "^4.65.0" typing-extensions = ">=3.8" # Fourfront uses 3.8 urllib3 = "^1.26.6" webtest = "^2.0.34" -opensearch-py = "^2.0.1" -redis = "^4.5.1" -pyOpenSSL = "^23.1.1" -PyJWT = "^2.6.0" -tqdm = "^4.65.0" [tool.poetry.dev-dependencies] diff --git a/test/data_files/escaping-false.json b/test/data_files/escaping-false.json new file mode 100644 index 000000000..84ab06993 --- /dev/null +++ b/test/data_files/escaping-false.json @@ -0,0 +1,67 @@ +{ + "escaping": [ + { + "name": "backslash", + "unquoted": "\\\\", + "doublequoted": "\\\\", + "singlequoted": "'\\\\'", + "overflow": null + }, + { + "name": "formfeed", + "unquoted": "\\f", + "doublequoted": "\\f", + "singlequoted": "'\\f'", + "overflow": null + }, + { + "name": "newline", + "unquoted": "\\n", + "doublequoted": "\\n", + "singlequoted": "'\\n'", + "overflow": null + }, + { + "name": "return", + "unquoted": "\\r", + "doublequoted": "\\r", + "singlequoted": "'\\r'", + "overflow": null + }, + { + "name": "tab", + "unquoted": "\\t", + "doublequoted": "\\t", + "singlequoted": "'\\t'", + "overflow": null + }, + { + "name": "misc", + "unquoted": "\\m", + "doublequoted": "\\m", + "singlequoted": "'\\m'", + "overflow": null + }, + { + "name": "quote1", + "unquoted": "N/A", + "doublequoted": "x,,z", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "quotelong", + "unquoted": "N/A", + "doublequoted": "x,,z,N/A\nquotlongcontinued,", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "comma", + "unquoted": "N/A", + "doublequoted": ",", + "singlequoted": "'", + "overflow": "'" + } + ] +} diff --git a/test/data_files/escaping-true.json b/test/data_files/escaping-true.json new file mode 100644 index 000000000..5d6c837a6 --- /dev/null +++ b/test/data_files/escaping-true.json @@ -0,0 +1,67 @@ +{ + "escaping": [ + { + "name": "backslash", + "unquoted": "\\", + "doublequoted": "\\", + "singlequoted": "'\\'", + "overflow": null + }, + { + "name": "formfeed", + "unquoted": "\\f", + "doublequoted": "\\f", + "singlequoted": "'\\f'", + "overflow": null + }, + { + "name": "newline", + "unquoted": "\n", + "doublequoted": "\n", + "singlequoted": "'\n'", + "overflow": null + }, + { + "name": "return", + "unquoted": "\r", + "doublequoted": "\r", + "singlequoted": "'\r'", + "overflow": null + }, + { + "name": "tab", + "unquoted": "\t", + "doublequoted": "\t", + "singlequoted": "'\t'", + "overflow": null + }, + { + "name": "misc", + "unquoted": "\\m", + "doublequoted": "\\m", + "singlequoted": "'\\m'", + "overflow": null + }, + { + "name": "quote1", + "unquoted": "N/A", + "doublequoted": "x,,z", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": 
"quotelong", + "unquoted": "N/A", + "doublequoted": "x,,z,N/A\nquotlongcontinued,", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "comma", + "unquoted": "N/A", + "doublequoted": ",", + "singlequoted": "'", + "overflow": "'" + } + ] +} diff --git a/test/data_files/escaping.csv b/test/data_files/escaping.csv new file mode 100644 index 000000000..ec04defbd --- /dev/null +++ b/test/data_files/escaping.csv @@ -0,0 +1,11 @@ +name,unquoted,doublequoted,singlequoted,overflow +backslash,\\,"\\",'\\' +formfeed,\f,"\f",'\f' +newline,\n,"\n",'\n' +return,\r,"\r",'\r' +tab,\t,"\t",'\t' +misc,\m,"\m",'\m' +quote1,N/A,"x,,z",N/A +quotelong,N/A,"x,,z,N/A +quotlongcontinued,",N/A +comma,N/A,",",',' diff --git a/test/data_files/sample_items.tabs.json b/test/data_files/sample_items.tabs.json new file mode 100644 index 000000000..f972245f0 --- /dev/null +++ b/test/data_files/sample_items.tabs.json @@ -0,0 +1,74 @@ +{ + "Sheet1": [ + { + "x": 1, + "y": { + "a": 1, + "z": 1 + } + }, + { + "x": 1, + "y": { + "a": 2, + "z": 3 + } + }, + { + "x": "alpha", + "y": { + "a": "beta", + "z": [ + "gamma", + "delta" + ] + } + } + ], + "Sheet2": [ + { + "name": "bill", + "age": 23, + "mother": { + "name": "mary", + "age": 58 + }, + "father": { + "name": "fred", + "age": 63 + }, + "friends": [ + { + "name": "sam", + "age": 22 + }, + { + "name": "arthur", + "age": 19 + } + ] + }, + { + "name": "joe", + "age": 9, + "mother": { + "name": "estrella", + "age": 35 + }, + "father": { + "name": "anthony", + "age": 34 + }, + "friends": [ + { + "name": "anders", + "age": 9 + }, + { + "name": null, + "age": null + } + ] + } + ] +} diff --git a/test/data_files/sample_items.tabs.yaml b/test/data_files/sample_items.tabs.yaml new file mode 100644 index 000000000..f98d9259b --- /dev/null +++ b/test/data_files/sample_items.tabs.yaml @@ -0,0 +1,42 @@ +Sheet1: +- x: 1 + y: + a: 1 + z: 1 +- x: 1 + y: + a: 2 + z: 3 +- x: alpha + y: + a: beta + z: + - gamma + - delta +Sheet2: +- age: 23 + father: + age: 63 + name: fred + friends: + - age: 22 + name: sam + - age: 19 + name: arthur + mother: + age: 58 + name: mary + name: bill +- age: 9 + father: + age: 34 + name: anthony + friends: + - age: 9 + name: anders + - age: null + name: null + mother: + age: 35 + name: estrella + name: joe diff --git a/test/data_files/sample_items.xlsx b/test/data_files/sample_items.xlsx new file mode 100644 index 000000000..19ca2acc8 Binary files /dev/null and b/test/data_files/sample_items.xlsx differ diff --git a/test/data_files/sample_items2.csv b/test/data_files/sample_items2.csv new file mode 100644 index 000000000..2e32bf426 --- /dev/null +++ b/test/data_files/sample_items2.csv @@ -0,0 +1,5 @@ +name,sex,member +john,M,false +juan,male,true +igor,unknown, +mary,Female,t diff --git a/test/data_files/sample_items2.json b/test/data_files/sample_items2.json new file mode 100644 index 000000000..7e084f908 --- /dev/null +++ b/test/data_files/sample_items2.json @@ -0,0 +1,6 @@ +[ + {"name": "john", "sex": "Male", "member": false}, + {"name": "juan", "sex": "Male", "member": true}, + {"name": "igor", "sex": "unknown", "member": null}, + {"name": "mary", "sex": "Female", "member": true} +] diff --git a/test/data_files/sample_items3.csv b/test/data_files/sample_items3.csv new file mode 100644 index 000000000..ee2d61b61 --- /dev/null +++ b/test/data_files/sample_items3.csv @@ -0,0 +1,5 @@ +name,sex,uuid,father,mother,parents,children +John,Male,#john,#igor,#mary,, +Juan,Male,#juan,,,#igor|#mary, +Igor,Male,#igor,,,,#john| +Mary,Female,#mary,,,,#john| diff 
--git a/test/data_files/sample_items_for_real_schemas.csv b/test/data_files/sample_items_for_real_schemas.csv new file mode 100644 index 000000000..29af47792 --- /dev/null +++ b/test/data_files/sample_items_for_real_schemas.csv @@ -0,0 +1,3 @@ +accession,fragment_size_selection_method +foo,spri +bar,blue diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv new file mode 100644 index 000000000..95567c42a --- /dev/null +++ b/test/data_files/sample_items_sheet2.csv @@ -0,0 +1,3 @@ +name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age +bill,23,mary,58,fred,63,sam,22,arthur,19 +joe,9,estrella,35,anthony,34,anders,9,, diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv new file mode 100644 index 000000000..e862bf36d --- /dev/null +++ b/test/data_files/sample_items_sheet2.tsv @@ -0,0 +1,3 @@ +name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age +bill 23 mary 58 fred 63 sam 22 arthur 19 +joe 9 estrella 35 anthony 34 anders 9 diff --git a/test/data_files/sample_items_sheet2a.jsonl b/test/data_files/sample_items_sheet2a.jsonl new file mode 100644 index 000000000..a0e96e83e --- /dev/null +++ b/test/data_files/sample_items_sheet2a.jsonl @@ -0,0 +1,3 @@ +["name", "age", "mother.name", "mother.age", "father.name", "father.age", "friends#0.name", "friends#0.age", "friends#1.name", "friends#1.age"] +["bill", 23, "mary", 58, "fred", 63, "sam", 22, "arthur", 19] +["joe", 9, "estrella", 35, "anthony", 34, "anders", 9] diff --git a/test/data_files/sample_items_sheet2b.jsonl b/test/data_files/sample_items_sheet2b.jsonl new file mode 100644 index 000000000..c044bfe18 --- /dev/null +++ b/test/data_files/sample_items_sheet2b.jsonl @@ -0,0 +1,2 @@ +{"name": "bill", "age": 23, "mother.name": "mary", "mother.age": 58, "father.name": "fred", "father.age": 63, "friends#0.name": "sam", "friends#0.age": 22, "friends#1.name": "arthur", "friends#1.age": 19} +{"name": "joe", "age": 9, "mother.name": "estrella", "mother.age": 35, "father.name": "anthony", "father.age": 34, "friends#0.name": "anders", "friends#0.age": 9} diff --git a/test/data_files/sample_items_sheet2b1.jsonl b/test/data_files/sample_items_sheet2b1.jsonl new file mode 100644 index 000000000..8f5c3345b --- /dev/null +++ b/test/data_files/sample_items_sheet2b1.jsonl @@ -0,0 +1,3 @@ +["name", "age", "mother", "father", "friends"] +["bill", 23, {"name": "mary", "age": 58}, {"name": "fred", "age": 63}, [{"name": "sam", "age": 22}, {"name": "arthur", "age": 19}]] +["joe", 9, {"name": "estrella", "age": 35}, {"name": "anthony", "age": 34}, [{"name": "anders", "age": 9}]] diff --git a/test/data_files/sample_items_sheet2b2.jsonl b/test/data_files/sample_items_sheet2b2.jsonl new file mode 100644 index 000000000..1ef8d9f11 --- /dev/null +++ b/test/data_files/sample_items_sheet2b2.jsonl @@ -0,0 +1,2 @@ +{"name": "bill", "age": 23, "mother": {"name": "mary", "age": 58}, "father": {"name": "fred", "age": 63}, "friends": [{"name": "sam", "age": 22}, {"name": "arthur", "age": 19}]} +{"name": "joe", "age": 9, "mother": {"name": "estrella", "age": 35}, "father": {"name": "anthony", "age": 34}, "friends": [{"name": "anders", "age": 9}]} \ No newline at end of file diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index a07c6d234..0017bd16e 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -17,7 +17,7 @@ from dcicutils.misc_utils import ( 
PRINT, ignored, ignorable, filtered_warnings, get_setting_from_context, TestApp, VirtualApp, VirtualAppError, _VirtualAppHelper, # noqa - yes, this is a protected member, but we still want to test it - Retry, apply_dict_overrides, utc_today_str, RateManager, environ_bool, str_to_bool, + Retry, apply_dict_overrides, utc_today_str, RateManager, environ_bool, str_to_bool, is_uuid, LockoutManager, check_true, remove_prefix, remove_suffix, full_class_name, full_object_name, constantly, keyword_as_title, file_contents, CachedField, camel_case_to_snake_case, snake_case_to_camel_case, make_counter, CustomizableProperty, UncustomizedInstance, getattr_customized, copy_json, url_path_join, @@ -30,12 +30,13 @@ classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject, obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, - MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, + MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, pad_to, JsonLinesReader, ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, raises_regexp, MockId, MockLog, input_series, ) +from typing import Any, Dict, List from unittest import mock @@ -1094,7 +1095,7 @@ def test_lockout_manager(): protected_action = "simulated action" - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1197,7 +1198,7 @@ def test_rate_manager(): # PyCharm thinks this is not used. -kmp 26-Jul-2020 # r = RateManager(interval_seconds=60, safety_seconds=1, allowed_attempts=4) - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1885,7 +1886,7 @@ def test_cached_field_mocked(self): assert field.get() == val5 assert field.get() == val5 - dt.sleep(self.DEFAULT_TIMEOUT) # Fast forward to where we're going to refill again + dt.sleep(self.DEFAULT_TIMEOUT) # Fast-forward to where we're going to refill again val6 = field.get() assert val6 != val5 @@ -2007,6 +2008,33 @@ def test_capitalize1(token, expected): assert capitalize1(token) == expected +def test_is_uuid(): + + good_uuid = str(uuid.uuid4()) + bad_uuid = '123-456-789' + + assert not is_uuid("12345678abcd678123456781234") # wrong length. 
expecting 32 digits + assert not is_uuid("12-3456781234abcd1234567812345678") # hyphens only allowed at multiple of four boundaries + assert not is_uuid("12-3456781234abcd1234567-812345678") # ditto + + assert is_uuid("123456781234abcd1234567812345678") + assert is_uuid("12345678abcd56781234ABCD12345678") + assert is_uuid("1234-5678abcd56781234ABCD12345678") + assert is_uuid("12345678abcd-56781234ABCD1234-5678") + assert is_uuid("1234-5678-abcd56781234ABCD-12345678") + assert is_uuid("1234-5678-abcd-56781234ABCD12345678") + assert is_uuid("1234-5678-abcd-5678-1234-ABCD-1234-5678") + assert is_uuid("1234-5678-abcd-5678-1234-ABCD-1234-5678-") # we don't really want this, but we tolerate it + + assert is_uuid("{12345678abcd56781234ABCD12345678}") # braces are optionally allowed + assert is_uuid("{1234-5678-abcd5678-1234-ABCD-1234-5678}") # ditto + assert is_uuid("1234-5678-abcd5678-1234-ABCD-1234-5678}") # ditto + assert is_uuid("{1234-5678-abcd5678-1234-ABCD-1234-5678-}") # balanced braces trailing hyphen tolerated + + assert is_uuid(good_uuid) is True + assert is_uuid(bad_uuid) is False + + def test_string_list(): assert string_list('') == [] @@ -2050,7 +2078,7 @@ def test_copy_json(obj): def test_copy_json_side_effects(): - obj = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} + obj: Dict[str, Any] = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} obj_copy = copy_json(obj) obj['foo'][1] = 20 obj['bar'][0]['y'] = 500 # NoQA - PyCharm wrongly fears there are type errors in this line, that it will fail. @@ -2904,7 +2932,7 @@ class SubClock(Clock): assert str(exc.value) == ("The subclasses= argument to classproperty_cached.reset must not be False" " because classproperty_cached does not use per-subclass caches.") - # This will clear SubClock cache, bu that's shared with the Clock cache, so both will clear. + # This will clear SubClock cache, but that's shared with the Clock cache, so both will clear. 
assert classproperty_cached.reset(instance_class=SubClock, attribute_name='sample') is True c_t5 = Clock.sample # This should recompute Clock.sample cache, which is shared by SubCLock @@ -3258,7 +3286,7 @@ def test_deduplicate_list(): xlen = len(x) assert sorted(deduplicate_list(x)) == ['a', 'b', 'c'] - assert len(x) == xlen # make sure there was no side-effect to the original list + assert len(x) == xlen # make sure there was no side effect to the original list y = ['a'] y0 = deduplicate_list(y) @@ -3468,3 +3496,85 @@ def test_map_chunked(): res = map_chunked(lambda x: ''.join(x), "abcdefghij", chunk_size=4, reduce=lambda x: '.'.join(x)) assert res == 'abcd.efgh.ij' + + +def test_pad_to(): + + assert pad_to(5, []) == [None, None, None, None, None] + assert pad_to(5, [], padding='foo') == ['foo', 'foo', 'foo', 'foo', 'foo'] + + assert pad_to(5, ['x']) == ['x', None, None, None, None] + assert pad_to(5, ['x'], padding='foo') == ['x', 'foo', 'foo', 'foo', 'foo'] + + six_elements = ['a', 'b', 'c', 'd', 'e', 'f'] + + assert pad_to(5, six_elements) == six_elements + assert pad_to(5, six_elements, padding='foo') + + +def test_json_lines_reader_dicts(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = {"foo": 3, "bar": 4} + + item1_str = json.dumps(item1) + item2_str = json.dumps(item2) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + assert [line for line in JsonLinesReader(fp)] == [item1, item2] + + +def test_json_lines_reader_lists(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = {"foo": 3, "bar": 4} + + headers: List[str] = list(item1.keys()) + + item1_str = json.dumps([item1[header] for header in headers]) + item2_str = json.dumps([item2[header] for header in headers]) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: + + print(json.dumps(headers), file=fp) + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + parsed = [line for line in JsonLinesReader(fp)] + expected = [item1, item2] + assert parsed == expected diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py new file mode 100644 index 000000000..ed312bf21 --- /dev/null +++ b/test/test_sheet_utils.py @@ -0,0 +1,935 @@ +import contextlib +import json +import os +import pytest + +from collections import namedtuple +from dcicutils import sheet_utils as sheet_utils_module, ff_utils as ff_utils_module +from dcicutils.common import AnyJsonData +from dcicutils.env_utils import EnvUtils, public_env_name +from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp +from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse +from dcicutils.sheet_utils import ( + # High-level interfaces + ItemManager, load_items, TABLE_SET_MANAGER_REGISTRY, ITEM_MANAGER_REGISTRY, + # Low-level implementation + BasicTableSetManager, SchemaAutoloadMixin, + ItemTools, XlsxManager, 
XlsxItemManager, + CsvManager, CsvItemManager, TsvManager, TsvItemManager, + # TypeHint, EnumHint, + BoolHint, + # Error handling + LoadFailure, LoadArgumentsError, LoadTableError, + # Utilities + prefer_number, unwanted_kwargs, expand_string_escape_sequences, +) +from typing import Dict, Optional +from unittest import mock +from .conftest_settings import TEST_DIR +from .helpers import using_fresh_ff_state_for_testing + + +TEST_SHEET_1 = 'Sheet1' + + +def test_load_failure(): + + sample_message = "This is a test." + + load_failure_object = LoadFailure(sample_message) + assert isinstance(load_failure_object, LoadFailure) + assert str(load_failure_object) == sample_message + + +def test_load_argument_error(): + + sample_message = "This is a test." + + load_failure_object = LoadArgumentsError(sample_message) + assert isinstance(load_failure_object, LoadArgumentsError) + assert str(load_failure_object) == sample_message + + +def test_load_table_error(): + + sample_message = "This is a test." + + load_failure_object = LoadTableError(sample_message) + assert isinstance(load_failure_object, LoadTableError) + assert str(load_failure_object) == sample_message + + +def test_prefer_number(): + + assert prefer_number('') is None + assert prefer_number('123') == 123 + assert prefer_number('3.14') == 3.14 + assert prefer_number('abc') == 'abc' + assert prefer_number('123i') == '123i' + assert prefer_number('123e') == '123e' + assert prefer_number('123e0') == 123.0 + assert prefer_number('123e1') == 1230.0 + assert prefer_number('123e+1') == 1230.0 + assert prefer_number('123e-1') == 12.3 + + +def test_expand_string_escape_sequences(): + + assert expand_string_escape_sequences("foo") == "foo" + assert expand_string_escape_sequences("foo\\tbar") == "foo\tbar" + assert expand_string_escape_sequences("\\r\\t\\n\\\\") == "\r\t\n\\" + assert expand_string_escape_sequences("foo\\fbar") == "foo\\fbar" + + +def test_unwanted_kwargs_without_error(): + unwanted_kwargs(context="Function foo", kwargs={}) + unwanted_kwargs(context="Function foo", kwargs={}, context_plural=True, detailed=True) + + +tst_args = "context,context_plural,detailed,kwargs,message" + +TstArgs = namedtuple("TstArgs1", tst_args, defaults=(None,) * len(tst_args.split(','))) + + +@pytest.mark.parametrize(tst_args, [ + TstArgs(context="Function foo", context_plural=False, detailed=False, kwargs={'a': 1}, + message="Function foo doesn't use keyword argument a."), + TstArgs(context="Function foo", context_plural=False, detailed=False, kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a and b."), + TstArgs(context="Functions like foo", context_plural=True, detailed=False, kwargs={'a': 1}, + message="Functions like foo don't use keyword argument a."), + TstArgs(context="Functions like foo", context_plural=True, detailed=False, kwargs={'a': 1, 'b': 2}, + message="Functions like foo don't use keyword arguments a and b."), + # Don't need to do all the cases again + TstArgs(context="Function foo", kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a and b."), # noQA - PyCharm can't see defaults + TstArgs(context="Function foo", detailed=True, kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a=1 and b=2."), # noQA PyCharm can't see defaults +]) +def test_unwanted_kwargs_with_error(context, context_plural, detailed, kwargs, message): + + with pytest.raises(LoadArgumentsError) as exc: + unwanted_kwargs(context=context, kwargs=kwargs, context_plural=context_plural, 
detailed=detailed) + assert str(exc.value) == message + + +def test_back_table_set_create_state(): + + assert BasicTableSetManager._create_tab_processor_state('some-tab') is None + + +def test_item_tools_parse_sheet_header(): + assert ItemTools.parse_sheet_header('.a') == ['a'] + assert ItemTools.parse_sheet_header('a') == ['a'] + assert ItemTools.parse_sheet_header('#0') == [0] + assert ItemTools.parse_sheet_header('0') == [0] + assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar'] + assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] + + # We don't error-check this, but it shouldn't matter + assert ItemTools.parse_sheet_header('#abc') == ['abc'] + assert ItemTools.parse_sheet_header('.123') == [123] + assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + + +def test_item_tools_parse_sheet_headers(): + input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] + expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] + assert ItemTools.parse_sheet_headers(input) == expected + + +def test_item_tools_infer_tab_name(): + + assert ItemTools.infer_tab_name('some/dir/some') == 'some' + assert ItemTools.infer_tab_name('some/dir/some.file') == 'some' + assert ItemTools.infer_tab_name('some/dir/some.file.name') == 'some' + + +@pytest.mark.parametrize('parsed_headers,expected_prototype', [ + (['a'], + {'a': None}), + (['a', 'b'], + {'a': None, 'b': None}), + (['a.b', 'a.c', 'a.d#0', 'a.d#1'], + {'a': {'b': None, 'c': None, 'd': [None, None]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), +]) +def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemTools.parse_sheet_headers(parsed_headers) + assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype + + +@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) +def test_item_tools_compute_patch_prototype_errors(headers): + + parsed_headers = ItemTools.parse_sheet_headers(headers) + with pytest.raises(LoadTableError) as exc: + ItemTools.compute_patch_prototype(parsed_headers) + assert str(exc.value) == "A header cannot begin with a numeric ref: 0" + + +def test_item_tools_parse_item_value_basic(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemTools.parse_item_value(x) == x + + assert ItemTools.parse_item_value('3') == 3 + assert ItemTools.parse_item_value('+3') == 3 + assert ItemTools.parse_item_value('-3') == -3 + + assert ItemTools.parse_item_value('3.5') == 3.5 + assert ItemTools.parse_item_value('+3.5') == 3.5 + assert ItemTools.parse_item_value('-3.5') == -3.5 + + assert ItemTools.parse_item_value('3.5e1') == 35.0 + assert ItemTools.parse_item_value('+3.5e1') == 35.0 + assert ItemTools.parse_item_value('-3.5e1') == -35.0 + + assert ItemTools.parse_item_value('') is None + + assert ItemTools.parse_item_value('null') is None + assert ItemTools.parse_item_value('Null') is None + assert ItemTools.parse_item_value('NULL') is None + + assert ItemTools.parse_item_value('true') is True + assert ItemTools.parse_item_value('True') is True + assert ItemTools.parse_item_value('TRUE') is True + + assert ItemTools.parse_item_value('false') is False + assert ItemTools.parse_item_value('False') is False + 
assert ItemTools.parse_item_value('FALSE') is False + + assert ItemTools.parse_item_value('|') == [] # special case: lone '|' means empty + assert ItemTools.parse_item_value('alpha|') == ['alpha'] # special case: trailing '|' means singleton + assert ItemTools.parse_item_value('|alpha|') == [None, 'alpha'] + assert ItemTools.parse_item_value('|alpha') == [None, 'alpha'] + assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_item_tools_parse_item_value_guids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + sample_simple_field_input = "#foo" + + parsed = ItemTools.parse_item_value(sample_simple_field_input) + assert parsed == sample_simple_field_input + + context = {} + parsed = ItemTools.parse_item_value(sample_simple_field_input, context=context) + if instaguids_enabled: + assert is_uuid(parsed) + assert parsed == context[sample_simple_field_input] + else: + assert parsed == sample_simple_field_input + assert context == {} + + sample_compound_field_input = '#foo|#bar' + sample_compound_field_list = ['#foo', '#bar'] + + parsed = ItemTools.parse_item_value(sample_compound_field_input) + assert parsed == sample_compound_field_list + + context = {} + parsed = ItemTools.parse_item_value(sample_compound_field_input, context=context) + assert isinstance(parsed, list) + if instaguids_enabled: + assert all(is_uuid(x) for x in parsed) + assert '#foo' in context and '#bar' in context + else: + assert parsed == sample_compound_field_list + assert context == {} + + +def test_item_tools_set_path_value(): + + x = {'foo': 1, 'bar': 2} + ItemTools.set_path_value(x, ['foo'], 3) + assert x == {'foo': 3, 'bar': 2} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemTools.set_path_value(x, ['foo', 1], 17) + assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemTools.set_path_value(x, ['bar', 'x'], 'something') + assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} + + +def test_item_tools_find_type_hint(): + + assert ItemTools.find_type_hint(None, 'anything') is None + + assert ItemTools.find_type_hint(['foo', 'bar'], None) is None + assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None + assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None + + actual = ItemTools.find_type_hint(['foo', 'bar'], {"type": "object"}) + assert actual is None + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "boolean" + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert actual is None + + actual = ItemTools.find_type_hint(['foo'], schema) + assert isinstance(actual, BoolHint) + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "object", + "properties": { + "bar": { + "type": "boolean" + } + } + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert isinstance(actual, BoolHint) + + actual = ItemTools.find_type_hint(['foo'], schema) + assert actual is None + + +def test_table_set_manager_registry_manager_for_filename(): + + assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvManager + + with pytest.raises(Exception) as exc: + TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") + assert 
str(exc.value) == "Unknown file type: xyz/foo.something.missing" + + assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager + + with pytest.raises(Exception) as exc: + ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") + assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" + + +SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') + +SAMPLE_XLSX_FILE_RAW_CONTENT = { + "Sheet1": [ + {"x": 1, "y.a": 1, "y.z": 1}, + {"x": 1, "y.a": 2, "y.z": 3}, + {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother.name": "mary", "mother.age": 58, + "father.name": "fred", "father.age": 63, + "friends#0.name": "sam", "friends#0.age": 22, + "friends#1.name": "arthur", "friends#1.age": 19, + }, + { + "name": "joe", "age": 9, + "mother.name": "estrella", "mother.age": 35, + "father.name": "anthony", "father.age": 34, + "friends#0.name": "anders", "friends#0.age": 9, + "friends#1.name": None, "friends#1.age": None, + }, + ] +} + +SAMPLE_XLSX_FILE_ITEM_CONTENT = { + "Sheet1": [ + {"x": 1, "y": {"a": 1, "z": 1}}, + {"x": 1, "y": {"a": 2, "z": 3}}, + {"x": "alpha", "y": {"a": "beta", "z": ["gamma", "delta"]}}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother": {"name": "mary", "age": 58}, + "father": {"name": "fred", "age": 63}, + "friends": [ + {"name": "sam", "age": 22}, + {"name": "arthur", "age": 19}, + ] + }, + { + "name": "joe", "age": 9, + "mother": {"name": "estrella", "age": 35}, + "father": {"name": "anthony", "age": 34}, + "friends": [ + {"name": "anders", "age": 9}, + {"name": None, "age": None} + ] + }, + ], +} + +SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') + +SAMPLE_CSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE) + +SAMPLE_CSV_FILE_RAW_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} + +SAMPLE_CSV_FILE_ITEM_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + +SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') + +SAMPLE_TSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_TSV_FILE) + +SAMPLE_TSV_FILE_RAW_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} + +SAMPLE_TSV_FILE_ITEM_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + +SAMPLE_JSON_TABS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.json') + +SAMPLE_JSON_TABS_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT + +SAMPLE_YAML_TABS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.yaml') + +SAMPLE_YAML_TABS_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_xlsx_manager_load_content(): + + wt = XlsxManager(SAMPLE_XLSX_FILE) + assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT + + +def test_xlsx_manager_load(): + + assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + + +def test_xlsx_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + XlsxManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxManager' + ' expects only .xlsx filenames:') + + +def test_xlsx_item_manager_load_content(): + + it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_xlsx_item_manager_load(): + + assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def 
test_xlsx_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + XlsxItemManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' + ' expects only .xlsx filenames:') + + +def test_csv_manager_load_content(): + + wt = CsvManager(SAMPLE_CSV_FILE) + assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT + + +def test_csv_manager_load(): + + assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + + +def test_csv_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + CsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass CsvManager' + ' expects only .csv filenames:') + + +def test_csv_item_manager_load_content(): + + it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_csv_item_manager_load(): + + assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_csv_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' + ' expects only .csv filenames:') + + +def test_csv_escaping(): + + actual = CsvManager.load("test/data_files/escaping.csv", escaping=False) + expected = json.load(open("test/data_files/escaping-false.json")) + assert actual == expected + + actual = CsvManager.load("test/data_files/escaping.csv", escaping=True) + expected = json.load(open("test/data_files/escaping-true.json")) + assert actual == expected + + +def test_tsv_manager_load_content(): + + wt = TsvManager(SAMPLE_TSV_FILE) + assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load(): + + assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + TsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass TsvManager' + ' expects only .tsv or .tsv.txt filenames:') + + +def test_tsv_item_manager_load_content(): + + it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_tsv_item_manager_load(): + + assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_tsv_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' + ' expects only .tsv or .tsv.txt filenames:') + + +def test_item_manager_load(): + + assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT + + loaded = ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) + print("loaded=", json.dumps(loaded, indent=2)) + expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT + print("expected=", json.dumps(expected, indent=2)) + assert loaded == expected + + with pytest.raises(LoadArgumentsError) as exc: + ItemManager.load("something.else") + assert str(exc.value) == "Unknown file type: something.else" + + +def 
test_load_items(): + + assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(LoadArgumentsError) as exc: + load_items("something.else") + assert str(exc.value) == "Unknown file type: something.else" + + +SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') + +SAMPLE_CSV_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE2) + +SAMPLE_CSV_FILE2_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "member": {"type": "boolean"} + } + } +} + +SAMPLE_CSV_FILE2_CONTENT = { + SAMPLE_CSV_FILE2_SHEET_NAME: [ + {"name": "john", "sex": "M", "member": "false"}, + {"name": "juan", "sex": "male", "member": "true"}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + +SAMPLE_CSV_FILE2_ITEM_CONTENT = { + SAMPLE_CSV_FILE2_SHEET_NAME: [ + {"name": "john", "sex": "M", "member": False}, + {"name": "juan", "sex": "male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + +SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED = { + "Person": [ + {"name": "john", "sex": "Male", "member": False}, + {"name": "juan", "sex": "Male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": True} + ] +} + + +SAMPLE_JSON_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.json') + +SAMPLE_JSON_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_JSON_FILE2) + + +SAMPLE_CSV_FILE3_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "children": {"type": "array", "items": {"type": "string"}}, + "parents": {"type": "array", "items": {"type": "string"}}, + "mother": {"type": "string"}, + "father": {"type": "string"}, + } + } +} + +SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED = { + "Person": [ + { + "name": "John", + "uuid": "#john", + "sex": "Male", + "father": "#igor", + "mother": "#mary", + "parents": None, + "children": None, + }, + { + "name": "Juan", + "uuid": "#juan", + "sex": "Male", + "father": None, + "mother": None, + "parents": ["#igor", "#mary"], + "children": None, + }, + { + "name": "Igor", + "uuid": "#igor", + "sex": "Male", + "father": None, + "mother": None, + "parents": None, + "children": ["#john"], + }, + { + "name": "Mary", + "uuid": "#mary", + "sex": "Female", + "father": None, + "mother": None, + "parents": None, + "children": ["#john"], + }, + ] +} + +SAMPLE_CSV_FILE3 = os.path.join(TEST_DIR, 'data_files/sample_items3.csv') + + +def matches_template(json1: AnyJsonData, json2: AnyJsonData, *, previous_matches: Dict[str, str] = None) -> bool: + if previous_matches is None: + previous_matches = {} + if isinstance(json1, dict) and isinstance(json2, dict): + keys1 = set(json1.keys()) + keys2 = set(json2.keys()) + if keys1 != keys2: + print(f"Keys don't match: {keys1} vs {keys2}") + return False + return all(matches_template(json1[key], json2[key], previous_matches=previous_matches) for key in keys1) + elif isinstance(json1, list) and isinstance(json2, list): + n1 = len(json1) + n2 = len(json2) + if n1 != n2: + print(f"Length doesn't match: {n1} vs {n2}") + return False + return all(matches_template(json1[i], json2[i], 
previous_matches=previous_matches) for i in range(n1)) + elif isinstance(json1, str) and isinstance(json2, str) and is_uuid(json1) and json2.startswith("#"): + previously_matched = previous_matches.get(json2) + if previously_matched: + result = json1 == previously_matched + if not result: + print(f"Instaguid mismatch: {json1} vs {json2}") + return result + else: + # Remember the match + previous_matches[json2] = json1 + return True + else: # any other atomic items can be just directly compared + result = json1 == json2 + if not result: + print(f"Unequal: {json1} vs {json2}") + return result + + +def test_load_items_with_schema(): + + print("Case 1") + expected = SAMPLE_CSV_FILE2_CONTENT + actual = CsvManager.load(SAMPLE_CSV_FILE2) + assert actual == expected + + print("Case 2") + expected = SAMPLE_CSV_FILE2_ITEM_CONTENT + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS) + assert actual == expected + + print("Case 3") + expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + assert actual == expected + + +def test_sample_items_csv_vs_json(): + + csv_content = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + + json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person") + + assert csv_content == json_content + + +def test_sample_items_json_vs_yaml(): + + tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) + tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) + assert tabs_data_from_json == tabs_data_from_yaml + + +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_load_items_with_schema_and_instaguids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED + print("expected=", json.dumps(expected, indent=2)) + actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') + print("actual=", json.dumps(actual, indent=2)) + if instaguids_enabled: + assert matches_template(actual, expected) + else: + assert actual == expected # no substitution performed + + +class SchemaAutoloaderForTesting(SchemaAutoloadMixin): + + def __init__(self, **kwargs): + super().__init__(filename='ignored.file.name', **kwargs) + + +@contextlib.contextmanager +def schema_autoloader_for_testing(**kwargs) -> SchemaAutoloadMixin: + autoloader: Optional[SchemaAutoloadMixin] = None + success = False + try: + autoloader: SchemaAutoloadMixin = SchemaAutoloaderForTesting(**kwargs) + assert autoloader.SCHEMA_CACHE == {}, "The schema cache is not clean." 
+ yield autoloader + success = True + finally: + if autoloader is not None: + autoloader.clear_schema_cache() + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + if not success: + raise + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_caching(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' # it should have defaulted even if we didn't supply it + + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + + sample_schema_name = 'foo' + sample_schema = {'mock_schema_for': 'foo'} + + with mock.patch.object(sheet_utils_module, "get_schema") as mock_get_schema: + mock_get_schema.return_value = sample_schema + assert autoloader.fetch_schema(sample_schema_name, portal_env=autoloader.portal_env) == sample_schema + + schema_cache_with_sample_schema = {sample_schema_name: sample_schema} + assert SchemaAutoloadMixin.SCHEMA_CACHE == schema_cache_with_sample_schema + assert autoloader.SCHEMA_CACHE == schema_cache_with_sample_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_schema(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' + + user_schema = autoloader.fetch_schema('user', portal_env=autoloader.portal_env) + + assert user_schema['$id'] == '/profiles/user.json' + assert user_schema['title'] == 'User' + assert 'properties' in user_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('autoload_schemas', [True, False]) +@pytest.mark.parametrize('cache_schemas', [True, False]) +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_schemas, portal_env): + + with printed_output() as printed: + with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): + with schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: + + assert autoloader.portal_env == ('data' if autoload_schemas or portal_env else None) + + if autoload_schemas: + + schemas = autoloader.fetch_relevant_schemas(['User', 'Lab']) + assert isinstance(schemas, dict) + assert len(schemas) == 2 + assert set(schemas.keys()) == {'User', 'Lab'} + + else: + + assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} + + if portal_env == 'data' or not autoload_schemas: + assert printed.lines == [] + else: + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." 
+                ]
+
+
+SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_for_real_schemas.csv')
+
+
+@using_fresh_ff_state_for_testing()
+@pytest.mark.integrated
+def test_workbook_with_schemas():
+
+    print()  # start on a fresh line
+
+    SchemaAutoloadMixin.clear_schema_cache()
+
+    actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content()
+    expected_data = {
+        "ExperimentSeq": [
+            {
+                "accession": "foo",
+                "fragment_size_selection_method": "spri"
+            },
+            {
+                "accession": "bar",
+                "fragment_size_selection_method": "blue"
+            }
+        ]
+    }
+    assert actual_data == expected_data
+
+    actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE,
+                              tab_name='ExperimentSeq', autoload_schemas=True)
+    expected_items = {
+        "ExperimentSeq": [
+            {
+                "accession": "foo",
+                "fragment_size_selection_method": "SPRI beads"
+            },
+            {
+                "accession": "bar",
+                "fragment_size_selection_method": "BluePippin"
+            }
+        ]
+    }
+    assert actual_items == expected_items
+
+
+@using_fresh_ff_state_for_testing()
+@pytest.mark.integrated
+def test_workbook_with_schemas_and_portal_vapp():
+
+    print()  # start on a fresh line
+
+    SchemaAutoloadMixin.clear_schema_cache()
+
+    portal_env = public_env_name(EnvUtils.PRD_ENV_NAME)
+
+    experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env)
+
+    expected_items = {
+        "ExperimentSeq": [
+            {
+                "accession": "foo",
+                "fragment_size_selection_method": "SPRI beads"
+            },
+            {
+                "accession": "bar",
+                "fragment_size_selection_method": "BluePippin"
+            }
+        ]
+    }
+
+    class MockVapp(NamedObject, AbstractVirtualApp):
+
+        def __init__(self, name):
+            super().__init__(name=name)
+            self.call_count = 0
+
+        def get(self, path_url):
+            assert path_url.startswith('/profiles/ExperimentSeq.json?')
+            self.call_count += 1
+            response = MockResponse(200, json=experiment_seq_schema)
+            return response
+
+    portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]')
+
+    old_count = portal_vapp.call_count
+
+    with mock.patch.object(ff_utils_module, "get_authentication_with_server",
+                           mock_not_called("get_authentication_with_server")):
+        with mock.patch.object(ff_utils_module, "get_metadata",
+                               mock_not_called("get_metadata")):
+            actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE,
+                                      tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp)
+
+    assert portal_vapp.call_count == old_count + 1
+    assert actual_items == expected_items