From 4c84f0bc6dfdc25857ddf98cd3ff4aef79c21743 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 07:21:14 -0400 Subject: [PATCH 01/11] First cut at tools for parsing workbooks. --- CHANGELOG.rst | 10 ++ dcicutils/sheet_utils.py | 188 ++++++++++++++++++++++++++++++ pyproject.toml | 12 +- test/data_files/sample_items.xlsx | Bin 0 -> 9901 bytes test/test_sheet_utils.py | 179 ++++++++++++++++++++++++++++ 5 files changed, 384 insertions(+), 5 deletions(-) create mode 100644 dcicutils/sheet_utils.py create mode 100644 test/data_files/sample_items.xlsx create mode 100644 test/test_sheet_utils.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fe2b0b147..61f334d68 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,16 @@ Change Log ---------- +7.8.0 +===== + +* New module ``sheet_utils`` for loading workbooks. + + * class ``WorkbookManager`` for loading raw data + + * class ``ItemManager`` for loading item data + + 7.7.2 ===== diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py new file mode 100644 index 000000000..c23129ffe --- /dev/null +++ b/dcicutils/sheet_utils.py @@ -0,0 +1,188 @@ +import copy + +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.workbook.workbook import Workbook +from typing import Any, Dict, List, Optional, Union + + +class WorkbookManager: + + @classmethod + def load_workbook(cls, filename: str): + wb = cls(filename) + return wb.load_content() + + def __init__(self, filename: str): + self.filename: str = filename + self.workbook: Optional[Workbook] = None + self.headers_by_sheetname: Dict[List[str]] = {} + self.content_by_sheetname: Dict[List[Any]] = {} + + def sheet_headers(self, sheet: Worksheet) -> List[str]: + return self.headers_by_sheetname[sheet.title] + + def sheet_content(self, sheet: Worksheet) -> List[Any]: + return self.content_by_sheetname[sheet.title] + + @classmethod + def all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in 
range(2, row_max + 1): + yield row + + @classmethod + def all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + def load_headers(self, sheet: Worksheet): + headers: List[str] = [str(sheet.cell(row=1, column=col).value) + for col in self.all_cols(sheet)] + self.headers_by_sheetname[sheet.title] = headers + + def load_content(self): + workbook: Workbook = load_workbook(self.filename) + self.workbook = workbook + for sheetname in workbook.sheetnames: + sheet: Worksheet = workbook[sheetname] + self.load_headers(sheet) + content = [] + for row in self.all_rows(sheet): + row_dict = self.load_row(sheet=sheet, row=row) + content.append(row_dict) + self.content_by_sheetname[sheetname] = content + return self.content_by_sheetname + + def load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet) + row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value + for col in self.all_cols(sheet)} + return row_dict + + +class ItemManager(WorkbookManager): + + def __init__(self, filename: str): + super().__init__(filename=filename) + self.patch_prototypes_by_sheetname: Dict[Dict] = {} + self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + + def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: + return self.patch_prototypes_by_sheetname[sheet.title] + + def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheet.title] + + def load_headers(self, sheet: Worksheet): + super().load_headers(sheet) + self.compile_sheet_headers(sheet) + + def compile_sheet_headers(self, sheet: Worksheet): + headers = self.headers_by_sheetname[sheet.title] + parsed_headers = self.parse_sheet_headers(headers) + self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + prototype = self.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_sheetname[sheet.title] = prototype + + 
@classmethod + def compute_patch_prototype(cls, parsed_headers): + prototype = {} + for parsed_header in parsed_headers: + parsed_header0 = parsed_header[0] + if isinstance(parsed_header0, int): + raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}") + cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) + return prototype + + @classmethod + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[Union[int, str]]): + [key0, *more_keys] = keys + key1 = more_keys[0] if more_keys else None + if isinstance(key1, int): + placeholder = [] + elif isinstance(key1, str): + placeholder = {} + else: + placeholder = None + if isinstance(key0, int): + n = len(parent) + if key0 == n: + parent.append(placeholder) + elif key0 > n: + raise Exception("Numeric items must occur sequentially.") + elif isinstance(key0, str): + if key0 not in parent: + parent[key0] = placeholder + if key1 is not None: + cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) + return parent + + @classmethod + def parse_sheet_headers(cls, headers): + return [cls.parse_sheet_header(header) + for header in headers] + + @classmethod + def parse_sheet_header(cls, header) -> List[Union[int, str]]: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' 
or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + def load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = self.sheet_parsed_headers(sheet) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) + for col in self.all_cols(sheet): + value = sheet.cell(row=row, column=col).value + parsed_value = self.parse_value(value) + self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + return patch_item + + @classmethod + def set_path_value(cls, datum, path, value, force=False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + @classmethod + def parse_value(cls, value): + if isinstance(value, str): + lvalue = value.lower() + # TODO: We could consult a schema to make this less heuristic, but this may do for now + if lvalue == 'true': + return True + elif lvalue == 'false': + return False + elif lvalue == 'null' or lvalue == '': + return None + elif '|' in value: + return [cls.parse_value(subvalue) for subvalue in value.split('|')] + else: + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + return value + else: # probably a number + return value diff --git a/pyproject.toml b/pyproject.toml index 7c56d6b7e..647c13fe0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.7,<3.10" + boto3 = "^1.17.39" botocore = "^1.20.39" # The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version. 
@@ -45,20 +46,21 @@ elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" docker = "^4.4.4" gitpython = "^3.1.2" +openpyxl = "^3.1.2" +opensearch-py = "^2.0.1" +pyOpenSSL = "^23.1.1" +PyJWT = "^2.6.0" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" +redis = "^4.5.1" requests = "^2.21.0" rfc3986 = "^1.4.0" structlog = "^19.2.0" toml = ">=0.10.1,<1" +tqdm = "^4.65.0" typing-extensions = ">=3.8" # Fourfront uses 3.8 urllib3 = "^1.26.6" webtest = "^2.0.34" -opensearch-py = "^2.0.1" -redis = "^4.5.1" -pyOpenSSL = "^23.1.1" -PyJWT = "^2.6.0" -tqdm = "^4.65.0" [tool.poetry.dev-dependencies] diff --git a/test/data_files/sample_items.xlsx b/test/data_files/sample_items.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..19ca2acc8ddc54c0e84ee2f3fc84f9dd2e92f8e5 GIT binary patch literal 9901 zcmeHNRajifwr)ajch|-xxLfex9^7f%-GT;(0FAo@cXtVH2^vCh4;mx{x7*2_xigcQ z`*@$u)Wfc?zujw9?Y;l?FDqp^=%?5KSO7c#0H6S1;;$^5K>+~xFaQ7+0RD-txP!eb z$lldZ-OCZ=qW{d(&Xzp?=@YtKz!S*-{~iCuZ=gSM!oHUkdGI*l0#&0*jcAQk0V|~M zftaVD7H*hO95_uCi4xqLLu8a*g!=mVLZJ%L?X91aDcaD%pgCi>We1~V_ihBT-gcwY znQDhp&Y3_w1$}YSIjU?e6!zxS`K)6bLI8n<0F$Hw_EBMfS*VopIyCP}+Vy9;jXB;F zQ4J(6IKJI;d(bvX;M+_c6gEQPUU_C_kdnCr2|3QB#f$f3tC?!tyIXP4v9#pFChU$j%G`vu4>p*6<5r6OAHrX@yKCk%#` z_V4t&8(mx$jM*QixY*#Vh{D7c05*A4hNj#&xxq2eI44UvRjl`8xX+%?UZhFOdeOOe z#xj(*l;p{ctOBKGP9)seP>st!d`T-A|av z@SjUA+Cvcu=aWC2PQw{?F|nAh@*TFNIKL*;P_yK-sxirS=BM;BG_!qoCXvyNb??ip zm^rLU$%%QzJ|R6soqO)D)4+K;mgUvYgV0w#d@>RgMOLx`@l$^{64}xs;~U5uk%k5U z2m$a=p0+H%*@=gPvyF*^gUt^g_PfnMK^z%G%YXOi0DrLVXGMN}9MTm$)#(I&P#)#? 
zp%3Y7T7+Y*#SBxhH7ynKx=l|!;;t!3nS4vddB*nM{Fd+g1_{t@g`TdC_-Wfh#&{#5 z!KISsC3$!wN82STPV|eK%4aOx6Q6AqmCV|oQdwe16_d_1JcCcIJ)Q^en@*|o&B|sX9KuXAc!(10+ zb!UL9{RGAAFwTSC~0Sd*$)uJx(f9mjhU_GX*jxLJ%3NJHwv{DmC4EYvRb z49Kys_HSw#IX{S8IKO<{NBSq^s5pNah=rgA8UX-6gFpg;oIhixLc`ACEid|m-}F6P zzlXKFClZ6UC_ul^U88XR;z(^fK&=AsCE7|=<95|UUTnc$u~_dizkiS;VQ9iLn=QrA z-K|haZHjH>HGdbYTMRcJi`E(T{%UgC`hgCuj9OeI59dci%VPhu`-jEtw7O zzdrTVzcR*xwOK;Yqt%^d!t2-G`3{@e8fybZVa5FEB-_4>4M9~hh9lJQUa!NR&t>VniF|4-kyMaR0Uq}PiiU-C z_-VL6UvzBcI@*ao9;M|@YUQ#F<4A$w%x0G{wXHi##7JHpkb5;iJz-AZvQoYc@*XK0 z7d`HmQ50Ag8BW+@uy`tS9Pu^BUl_1P8>AA{tZI4Uo1`aBa5T(;4I^tv?0q4db+c7F zA{jtts5UXiPTL!bqrlPk;qYTP6AN2Mzd{H65Y{3`w>Wz*inF4(HtBw4uK053Ms|z+ zu>E%=G{r^o+}eoCOOJ2)i_`1>UtRcp#o?&)IH$Zb++x{RV^P>q&H$vk5Yna|n1Jz3V&pI4=10g@w&_V5NDSFtfSmxhZ{{&v^dm$t&QiTNoP=2#r7gsM^kjoFYFVL{FS!6|j5dQ3s zeseVHzR85pG&cQFd3P|EKa+?maAMux@ZBtrwCBy?$g~# zhoNX%Vwd@vWS<(u(#Q?w|D29e8)Mv1_Yj9oh1y3SpU5ak&SZ8iLJm&fX+n@?CfdJB zM>@I?!j-DiTXIO`c4ZE*##6A2XtT67`Iu%3Ub&@=-71`VId_EU7*S6VE|%dJc2LKyEz-lcG)Cu!I~FU7O| z2#rI80HBhn@K7{^Q84PoOjqD{L>jw*z0C)VPBvGvfstLn9@dvVoeV^p+ojmIj7l)& zBxE zTZ&P@xx6FdCiRczHtHt8bvvgPCVyjT7$;mPZE%e2e=(==Zi|?FvTeK9=^U;%K-+Bx z_q74kPPM^mrW)?9#DMhz;-%|N>U#ZsghofwDg8dus@0sh*_Ew`XJLNnL=XR^k#UES zWaT=+gIDt%tKl+^6Dv|my^-?=uu(RXrYL)>e$~qX!RWq4!?$p2D0AKZ)!(q6edi66*v1PEUixRf=RjGhS zBaAr_aS~G}Yo6amdYwj|YD!nWL5%4n$FzM^eagomI_?j9?eLFy?!iOvQiKBl2FZWq z;Xi$mt0l+|#PajX_5;<2+8?5bc(A+iZbXn>JnpzQW2slxCTtUysm!txi5r^^RbFzi zgFA?@o>23htI`z|rieKRB~CwwM(O+>i$K+|M>a+K;WMyun<~RrR>vJBO8TL*xcD;L z&-={vbhj4z=xYhvyks&NzC{My0r=4PmUrg7fvYspLoa%PIk-c_B*81GxQvCWmH0sLl|hh+ zvd=4a3g971lB~Z^X@}A}Tf7)bG~KFZSuJ&5?Pgz{Sa`^;>JbUW!Z+h=HUZKNpXp0F zOiVw^YwG(Qyk-{Msh2bo+pyZ=_PLMjt@1{`T2n6H_Ij+R$>D=_dgVjJrA&B=W=P;UNiqbwog+!FdMq|w@=bpgk$clQm}3UaV$CzyUh zB@U(hP*qJ{i;`zsC;Et87!dG8l}ct~WB^%d627h_ijHrsf023y9@8Ndlg6k;ogPJ2 zZ-y9@9h_2cq7dLWyMcvx@bDpJVL0mmG}^KQYiH-|uy+ z`P#cktThPJA=qs*-$r)YO7$kKTB;Ui$u!y>--CaxEPd?c#4%fbIpM4|qCH|V-sb3; 
zYz5$Qha*>WLv=o;2J{6b5;LNRw`W?dyH-4!q`X}`e0xraeM7z`;`N5L?5g`pY-WXg&49*`eX#n4mu?S^Sw2%aROORk<09#bGTUNz z_V1CVsbZm$@Q$QEyD?>YE%${#?3_@v8xeUE^_En>pBpUaygS_*h0hT$mSP5G(0}~N-w09n(q#jT%urRT@r?-jx8FZi> ziNJbX{EUL88wpv4tQ(mQWr7|rg0+~0RXhdk)7h)%o zb;Lo-?=mCh&(%FSXmU*p!rT~fawaQp70*0lV`daBS}WSpOOar}*PirxG2$t*MY}pe z$19Z|5Bzr3J+xC`>E>tfjpfaWRNU}Y3{l0f4J=+8%%^EhxoS-*@2BMHNI7jIvfAO` zcy3>;Np^0Bi7tqSniD7PnBo*Sq;*7BA&#d>c_(m`ra>v2uyByax~k2$IeA&TS(SB4 z7BB;&UF&05r&)vW)|mu8B;RTewxxxAWLIi6X(e*z*pETOu#u#sZ3(2IQJd@+u)l7! zhHqv7a@t&D?c(>%dBU~=KkDM;tF^cj$Yk4Jc^D@X;MIjmB=YMeD2~Fi#i#lt_A?tY z`zN7rP7&wZF=_^&3+rsAKW8M%a6)$#81`iE0a}GFP|Xwf zc9O-!c0%*-@L&{ZX|XWEy@?2j3U=QAz!sQMgA>1f?Xh7rYB|>tIn9^&2FM?!?4F-$ zmfg!0QT2XGiIEInX+FIt@8FqWY8muJr_wSH+3ctbtkoBbq5+2ekFE)OVFFty<=@0V z)-e>?-zzdZSJ^ykMa7);lI>Xk_$mszhm|?!J-dQ-hhkr3eZ9(J0|yg4veHNHD?2wS zUZK+}T9M(b_d0IRnGR&Lu5Lz`I%dt`JOTy19sPNWRd@Fq+Ck-0@$I-Ph>tyD^}x8O5vUq0SuKK(UDBjC8Q^bNNoj?@Sg;{&$10KXi=Vo1aYif|+TkR6KBw2xyW}^GtdvZoo zqiy-MI{e=H(AQ6U_73kfHeA$2(X-`wwyRM5o|1afs@`8oac!I=pPv5VyB zp~p!#Wau7YSHM!c0d2c1tz}(}#$}G4O?;SWq%Ap(N1vfLV14=&OKNoJaEmV?DFZVy zjLv|Af;~h+#iGRLwDQdDcB20G6+wxZJ6@VDAbz@C0^(OXeh&@2gzYZO0kki6^+a`# zZ!_6wbtab-685-##V!i2KRS1JPv}A2 zwZSKtd)+~er7C>w5dsMA!k!5X+Px7*65Ib$BDw)2y(dV?8E^5}P6Qd1B> zV*|T6tC(+&^yOg%?4##Vh*(n1BcH+{_8TGhI`KY7ki}`p%zOJ7Esu;bY~O!*nq&|I z`(Qs3DnJ+Goz&=XTD&G*nz!z=?Vq~I6Sw$jbDuZ_RdM9t3^DV0lnH$T{LCFBf_i-X z8DJ%X^$~kn(O*|%UI2=&PYVm6>{u{O-W4+_gnRomtDCf`*Eo#$LJQPaM9iU2uYv2$ z+k`%U3wq35HS?&tEos2QC2gypRgaW&DzCaBwYAC_fPG*?lb z#t>~9geiU$&IUL7G7gYotu#BUe0b~O(a7hjLcP~sB9gvOB0i;yt9bh<8n$FUo+;Dc zVGWY6{AZJO^e^O<&J~zsC*Cu@9F|dFtkqzj58#3j{BFt)9th8(peXtCkFwU^^Q4S``X*e+t#t^vv3~=$ zi>0wM$V|=E*~;GHXS(DI?vd?hg@8KfSb1ZZpmb#`*KXh$DbL)Z?_1YHXC7&D zgA4AYl#|#2lR;x*(GheL?>P_oAMbBVWc+h{ZjODLLe;TZ5WbF8NFq z8**DSQc3ymrt!;Lhcw%btD4^(oYGcm5r=XVwI1%K#Vz{a7g-eqOowNMj^|L|Vqfn5 z;0L^P+)D^QEb9N{2X$2$f5N;(%lK-ZtlAD$txGo9R&wbmi)c;fc!FDp#0Kn7yfaq_ z;Q9G8yK90md_2u*yL%7g{6y`~W5!U1DyO$hk~55qej(JyuRm1P7`NGWCAf(`77~y; 
zbs&#!i{lE)tG(Ca%Yw1u)@Rn7XTR1HE>7R#I}ewuC(bk8hgrt!;?Cc0)Pu0_wHw$Y;0TPd*-sT4 zh?_`h-jJR0rHXKDI(|#-H^v-LlkFe(ddp({b|U@+QhE5DOcHcagILieo5k7yd6K8` z3lj8b%P^vt-5<4r+I*0xCk6cPpe&v(WztVCWfaY7=+a;N`2}6ZJK?5kp0i%;zXRg> zUbmH@HRssNFAFXJqh1ElYhTvW3fSi0`>d8qgl2(_h3e`jnub+#zQ$SQe2oVQZ({Tk zlS^v>u;Jwb*m{xParH93`|wnp(S@BM8NabFK}9j?VncxXMISc1M5B8eN^PcS=({Bl z=C`aT>}FispJPfouLwlB-Q5U;yAeAE5FAM23QSrg=l)_bTm;&J3BbKu$VeH zgMOG4q}BCr*&gySqQNS1eXQ6at1yq^BR(nSB>duwfuq^Pi1u_{AzSW9+%hux%MYDa z?X#oUzVhR(hU2YNx`!G#uY%mBcZuU=f+#1K;7gnN7AjIsi6zH5n6*HhTe|>M=E$+A z=($89B&AOujXIy=!GPTp=lB-94Ar7ZRs&7!F^T!Z3u_~<9UFA@I3C8E0*YmvLvTE4 zQY~6w2G5^MNEfU@*FDr|z@Zb1!yq{cG)d<24DgNf1&V#8^l$FJ8PtNQ+g_OFfBrd0XUB*-0Z2*acQ8H2`-j{ias zg1!HY%mh(~MONgHW9SPq;((JYk$*c)E{{-^q8{cEz89erY%as7=CH!QWY^G}Uwy%@=fo~aWJK)uF z@v-Je9vv+A6egruZq;7A(^-B!sRfC3GEGFU|U?wwS+NfY3HEb5VU`DbFh^ zS&3iMd8I5X5SC{TUSCwaWNYWS5c&6m#h{o1jiCcnl-tz!`z+iR%y|~_!yd#b(S|i% zsEejrf_)lv>HC=>%`%L%MEOuB-b9zJjm!MBE@dZ-j~xO}O=ot*8_Ow7(wNn!fdS_;h01O{gLjGya@cEJ3Dl?R=>G!*QbGV&G}QB{XYN00Uu?#zbg3av4h_Qf6Q5sCI55?;a9=GwqpM**b6Ds|KE+-U-kUjnEON1 z9pu!~FKxPCg@5e~{2|w?MzecNnXrLqd$9?>aU4PZ`SI_*14gesC6ae^}pZ-<+ jucZ2Caaf8!iT{UUmE~X{82NF|1qm<+2?eHrKYslW&fXYe literal 0 HcmV?d00001 diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py new file mode 100644 index 000000000..714ade0eb --- /dev/null +++ b/test/test_sheet_utils.py @@ -0,0 +1,179 @@ +import os +import pytest + +from dcicutils.sheet_utils import WorkbookManager, ItemManager +from .conftest_settings import TEST_DIR + + +def test_item_manager_parse_sheet_header(): + assert ItemManager.parse_sheet_header('.a') == ['a'] + assert ItemManager.parse_sheet_header('a') == ['a'] + assert ItemManager.parse_sheet_header('#0') == [0] + assert ItemManager.parse_sheet_header('0') == [0] + assert ItemManager.parse_sheet_header('foo.bar') == ['foo', 'bar'] + assert ItemManager.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemManager.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] + + # We don't 
error-check this, but it shouldn't matter + assert ItemManager.parse_sheet_header('#abc') == ['abc'] + assert ItemManager.parse_sheet_header('.123') == [123] + assert ItemManager.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + + +def test_item_manager_parse_sheet_headers(): + input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] + expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] + assert ItemManager.parse_sheet_headers(input) == expected + + +@pytest.mark.parametrize('parsed_headers,expected_prototype', [ + (['a'], + {'a': None}), + (['a', 'b'], + {'a': None, 'b': None}), + (['a.b', 'a.c', 'a.d#0', 'a.d#1'], + {'a': {'b': None, 'c': None, 'd': [None, None]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), +]) +def test_item_manager_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemManager.parse_sheet_headers(parsed_headers) + assert ItemManager.compute_patch_prototype(parsed_headers) == expected_prototype + + +@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) +def test_item_manager_compute_patch_prototype_errors(headers): + + parsed_headers = ItemManager.parse_sheet_headers(headers) + with pytest.raises(ValueError) as exc: + ItemManager.compute_patch_prototype(parsed_headers) + assert str(exc.value) == "A header cannot begin with a numeric ref: 0" + + +def test_item_manager_set_path_value(): + + x = {'foo': 1, 'bar': 2} + ItemManager.set_path_value(x, ['foo'], 3) + assert x == {'foo': 3, 'bar': 2} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemManager.set_path_value(x, ['foo', 1], 17) + assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemManager.set_path_value(x, 
['bar', 'x'], 'something') + assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} + + +SAMPLE_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') + +SAMPLE_FILE_RAW_CONTENT = { + "Sheet1": [ + {"x": 1, "y.a": 1, "y.z": 1}, + {"x": 1, "y.a": 2, "y.z": 3}, + {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother.name": "mary", "mother.age": 58, + "father.name": "fred", "father.age": 63, + "friends#0.name": "sam", "friends#0.age": 22, + "friends#1.name": "arthur", "friends#1.age": 19, + }, + { + "name": "joe", "age": 9, + "mother.name": "estrella", "mother.age": 35, + "father.name": "anthony", "father.age": 34, + "friends#0.name": "anders", "friends#0.age": 9, + "friends#1.name": None, "friends#1.age": None, + }, + ] +} + +SAMPLE_FILE_ITEM_CONTENT = { + "Sheet1": [ + {"x": 1, "y": {"a": 1, "z": 1}}, + {"x": 1, "y": {"a": 2, "z": 3}}, + {"x": "alpha", "y": {"a": "beta", "z": ["gamma", "delta"]}}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother": {"name": "mary", "age": 58}, + "father": {"name": "fred", "age": 63}, + "friends": [ + {"name": "sam", "age": 22}, + {"name": "arthur", "age": 19}, + ] + }, + { + "name": "joe", "age": 9, + "mother": {"name": "estrella", "age": 35}, + "father": {"name": "anthony", "age": 34}, + "friends": [ + {"name": "anders", "age": 9}, + {"name": None, "age": None} + ] + }, + ], +} + + +def test_workbook_manager_load_content(): + + wt = WorkbookManager(SAMPLE_FILE) + assert wt.load_content() == SAMPLE_FILE_RAW_CONTENT + + +def test_workbook_manager_load_workbook(): + + assert WorkbookManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_RAW_CONTENT + + +def test_item_manager_parse_value(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemManager.parse_value(x) == x + + assert ItemManager.parse_value('3') == 3 + assert ItemManager.parse_value('+3') == 3 + assert ItemManager.parse_value('-3') == -3 + + assert 
ItemManager.parse_value('3.5') == 3.5 + assert ItemManager.parse_value('+3.5') == 3.5 + assert ItemManager.parse_value('-3.5') == -3.5 + + assert ItemManager.parse_value('3.5e1') == 35.0 + assert ItemManager.parse_value('+3.5e1') == 35.0 + assert ItemManager.parse_value('-3.5e1') == -35.0 + + assert ItemManager.parse_value('') is None + + assert ItemManager.parse_value('null') is None + assert ItemManager.parse_value('Null') is None + assert ItemManager.parse_value('NULL') is None + + assert ItemManager.parse_value('true') is True + assert ItemManager.parse_value('True') is True + assert ItemManager.parse_value('TRUE') is True + + assert ItemManager.parse_value('false') is False + assert ItemManager.parse_value('False') is False + assert ItemManager.parse_value('FALSE') is False + + assert ItemManager.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemManager.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +def test_item_manager_load_content(): + + it = ItemManager(SAMPLE_FILE) + assert it.load_content() == SAMPLE_FILE_ITEM_CONTENT + + +def test_item_manager_load_workbook(): + + assert ItemManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_ITEM_CONTENT From 7b73a67313ebf52eaebc3f119d42d232d35bc530 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:30:01 -0400 Subject: [PATCH 02/11] Refactor to separate some functionality into a separate service class.
--- dcicutils/sheet_utils.py | 67 +++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index c23129ffe..9633999d8 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -62,29 +62,7 @@ def load_row(self, *, sheet: Worksheet, row: int): return row_dict -class ItemManager(WorkbookManager): - - def __init__(self, filename: str): - super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[Dict] = {} - self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} - - def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: - return self.patch_prototypes_by_sheetname[sheet.title] - - def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheet.title] - - def load_headers(self, sheet: Worksheet): - super().load_headers(sheet) - self.compile_sheet_headers(sheet) - - def compile_sheet_headers(self, sheet: Worksheet): - headers = self.headers_by_sheetname[sheet.title] - parsed_headers = self.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheet.title] = parsed_headers - prototype = self.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheet.title] = prototype +class ItemTools: @classmethod def compute_patch_prototype(cls, parsed_headers): @@ -140,15 +118,6 @@ def parse_sheet_header(cls, header) -> List[Union[int, str]]: result.append(int(token) if token.isdigit() else token) return result - def load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) - for col in self.all_cols(sheet): - value = sheet.cell(row=row, column=col).value - parsed_value = self.parse_value(value) - self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) - return patch_item - @classmethod def set_path_value(cls, datum, 
path, value, force=False): if (value is None or value == '') and not force: @@ -186,3 +155,37 @@ def parse_value(cls, value): return value else: # probably a number return value + + +class ItemManager(ItemTools, WorkbookManager): + + def __init__(self, filename: str): + super().__init__(filename=filename) + self.patch_prototypes_by_sheetname: Dict[Dict] = {} + self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + + def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: + return self.patch_prototypes_by_sheetname[sheet.title] + + def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheet.title] + + def load_headers(self, sheet: Worksheet): + super().load_headers(sheet) + self.compile_sheet_headers(sheet) + + def compile_sheet_headers(self, sheet: Worksheet): + headers = self.headers_by_sheetname[sheet.title] + parsed_headers = self.parse_sheet_headers(headers) + self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + prototype = self.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_sheetname[sheet.title] = prototype + + def load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = self.sheet_parsed_headers(sheet) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) + for col in self.all_cols(sheet): + value = sheet.cell(row=row, column=col).value + parsed_value = self.parse_value(value) + self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + return patch_item From 3d4573fc089694d9ab34145b0481f9414f57e363 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:32:34 -0400 Subject: [PATCH 03/11] Add a csv file for testing. 
--- test/data_files/sample_items_sheet2.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/data_files/sample_items_sheet2.csv diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv new file mode 100644 index 000000000..b1d3ec2da --- /dev/null +++ b/test/data_files/sample_items_sheet2.csv @@ -0,0 +1,3 @@ +name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age +bill,23,mary,58,fred,63,sam,22,arthur,19 +joe,9,estrella,35,anthony,34,anders,9,, \ No newline at end of file From f4e5cfa4f605168a453e7c601574c38be9854e97 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:38:17 -0400 Subject: [PATCH 04/11] Add some negative testing. --- test/test_sheet_utils.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 714ade0eb..32dffb25b 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -67,9 +67,9 @@ def test_item_manager_set_path_value(): assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} -SAMPLE_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') +SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') -SAMPLE_FILE_RAW_CONTENT = { +SAMPLE_XLSX_FILE_RAW_CONTENT = { "Sheet1": [ {"x": 1, "y.a": 1, "y.z": 1}, {"x": 1, "y.a": 2, "y.z": 3}, @@ -93,7 +93,7 @@ def test_item_manager_set_path_value(): ] } -SAMPLE_FILE_ITEM_CONTENT = { +SAMPLE_XLSX_FILE_ITEM_CONTENT = { "Sheet1": [ {"x": 1, "y": {"a": 1, "z": 1}}, {"x": 1, "y": {"a": 2, "z": 3}}, @@ -121,16 +121,28 @@ def test_item_manager_set_path_value(): ], } +SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') + +SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2'] + +SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2'] + def test_workbook_manager_load_content(): - 
wt = WorkbookManager(SAMPLE_FILE) - assert wt.load_content() == SAMPLE_FILE_RAW_CONTENT + wt = WorkbookManager(SAMPLE_XLSX_FILE) + assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT def test_workbook_manager_load_workbook(): - assert WorkbookManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_RAW_CONTENT + assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + + +def test_workbook_manager_load_csv(): + + with pytest.raises(Exception): + WorkbookManager.load_workbook(SAMPLE_CSV_FILE) def test_item_manager_parse_value(): @@ -170,10 +182,16 @@ def test_item_manager_parse_value(): def test_item_manager_load_content(): - it = ItemManager(SAMPLE_FILE) - assert it.load_content() == SAMPLE_FILE_ITEM_CONTENT + it = ItemManager(SAMPLE_XLSX_FILE) + assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_item_manager_load_workbook(): - assert ItemManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_ITEM_CONTENT + assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_item_manager_load_csv(): + + with pytest.raises(Exception): + ItemManager.load_workbook(SAMPLE_CSV_FILE) From e9d2465f7e8b5b73c140dc3704df32556440d4c4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:43:24 -0400 Subject: [PATCH 05/11] Update lock file. 
--- poetry.lock | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index d7e77523c..480148ea1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -752,6 +752,18 @@ develop = ["black", "coverage", "jinja2", "mock", "pytest", "pytest-cov", "pyyam docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.1.2" @@ -911,6 +923,21 @@ files = [ [package.dependencies] psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""} +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opensearch-py" version = "2.3.0" @@ -1594,4 +1621,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b" From 6e9060f670a309bef7de997d7f5ae7e33a8b9272 Mon Sep 17 00:00:00 2001 From: 
Kent Pitman Date: Mon, 14 Aug 2023 10:28:30 -0400 Subject: [PATCH 06/11] Document new sheets_utils module. --- docs/source/dcicutils.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 7fdaba7ea..cf8654a96 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -281,6 +281,13 @@ secrets_utils :members: +sheets_utils +^^^^^^^^^^^^ + +.. automodule:: dcicutils.sheets_utils + :members: + + snapshot_utils ^^^^^^^^^^^^^^ From df12c91bba8a9dc12816ae8b5555460fd2662fab Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 15 Aug 2023 16:30:01 -0400 Subject: [PATCH 07/11] Issue a beta for this functionality. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 647c13fe0..8fd8826a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.7.2" +version = "7.7.2.1b0" # to become "7.8.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 6a39c8a6dcafa584a22e311224e347522fac8b84 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 15 Aug 2023 16:31:41 -0400 Subject: [PATCH 08/11] Fix documentation for sheet_utils. --- docs/source/dcicutils.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index cf8654a96..f0f07c49d 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -281,10 +281,10 @@ secrets_utils :members: -sheets_utils -^^^^^^^^^^^^ +sheet_utils +^^^^^^^^^^^ -.. automodule:: dcicutils.sheets_utils +.. automodule:: dcicutils.sheet_utils :members: From eedb5c68307fe1644d3396ef2d9de77eee6b5439 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 16 Aug 2023 10:25:12 -0400 Subject: [PATCH 09/11] Add some declarations. Small refactors to improve modularity. 
--- dcicutils/sheet_utils.py | 107 ++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 40 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9633999d8..db310aeb2 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,11 +1,19 @@ import copy +from dcicutils.common import AnyJsonData from openpyxl import load_workbook from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from typing import Any, Dict, List, Optional, Union +Header = str +Headers = List[str] +ParsedHeader = List[Union[str, int]] +ParsedHeaders = List[ParsedHeader] +SheetCellValue = Union[int, float, str] + + class WorkbookManager: @classmethod @@ -16,30 +24,30 @@ def load_workbook(cls, filename: str): def __init__(self, filename: str): self.filename: str = filename self.workbook: Optional[Workbook] = None - self.headers_by_sheetname: Dict[List[str]] = {} - self.content_by_sheetname: Dict[List[Any]] = {} + self.headers_by_sheetname: Dict[str, List[str]] = {} + self.content_by_sheetname: Dict[str, List[Any]] = {} - def sheet_headers(self, sheet: Worksheet) -> List[str]: - return self.headers_by_sheetname[sheet.title] + def sheet_headers(self, sheetname: str) -> List[str]: + return self.headers_by_sheetname[sheetname] - def sheet_content(self, sheet: Worksheet) -> List[Any]: - return self.content_by_sheetname[sheet.title] + def sheet_content(self, sheetname: str) -> List[Any]: + return self.content_by_sheetname[sheetname] @classmethod - def all_rows(cls, sheet: Worksheet): + def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row for row in range(2, row_max + 1): yield row @classmethod - def all_cols(cls, sheet: Worksheet): + def _all_cols(cls, sheet: Worksheet): col_max = sheet.max_column for col in range(1, col_max + 1): yield col - def load_headers(self, sheet: Worksheet): + def _load_headers(self, sheet: Worksheet): headers: List[str] = [str(sheet.cell(row=1, column=col).value) 
- for col in self.all_cols(sheet)] + for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers def load_content(self): @@ -47,25 +55,44 @@ def load_content(self): self.workbook = workbook for sheetname in workbook.sheetnames: sheet: Worksheet = workbook[sheetname] - self.load_headers(sheet) + self._load_headers(sheet) content = [] - for row in self.all_rows(sheet): - row_dict = self.load_row(sheet=sheet, row=row) + for row in self._all_rows(sheet): + row_dict = self._load_row(sheet=sheet, row=row) content.append(row_dict) self.content_by_sheetname[sheetname] = content return self.content_by_sheetname - def load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet) + def _load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet.title) row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self.all_cols(sheet)} + for col in self._all_cols(sheet)} return row_dict class ItemTools: + """ + Implements operations on table-related data without pre-supposing the specific representation of the table. + It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because + it does not presuppose the source of the data nor where it will be written to. + + For the purpose of this class: + + * a 'header' is a string representing the top of a column. + + * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that + "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing + each numeric token as an int instead of a string. + + * a 'headers' object is just a list of strings, each of which is a 'header'. + + * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. + e.g., the headers ["a.b.c", "x.y#0"] are represented as parsed headers [["a", "b", "c"], ["x", "y", 0]]. 
+ + """ @classmethod - def compute_patch_prototype(cls, parsed_headers): + def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} for parsed_header in parsed_headers: parsed_header0 = parsed_header[0] @@ -75,7 +102,7 @@ def compute_patch_prototype(cls, parsed_headers): return prototype @classmethod - def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[Union[int, str]]): + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): [key0, *more_keys] = keys key1 = more_keys[0] if more_keys else None if isinstance(key1, int): @@ -98,12 +125,12 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[U return parent @classmethod - def parse_sheet_headers(cls, headers): + def parse_sheet_headers(cls, headers: Headers): return [cls.parse_sheet_header(header) for header in headers] @classmethod - def parse_sheet_header(cls, header) -> List[Union[int, str]]: + def parse_sheet_header(cls, header: Header) -> ParsedHeader: result = [] token = "" for i in range(len(header)): @@ -119,7 +146,7 @@ def parse_sheet_header(cls, header) -> List[Union[int, str]]: return result @classmethod - def set_path_value(cls, datum, path, value, force=False): + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): if (value is None or value == '') and not force: return [key, *more_path] = path @@ -129,7 +156,7 @@ def set_path_value(cls, datum, path, value, force=False): cls.set_path_value(datum[key], more_path, value) @classmethod - def parse_value(cls, value): + def parse_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -153,7 +180,7 @@ def parse_value(cls, value): except Exception: pass return value - else: # probably a number + else: # presumably a number (int or float) return value @@ -161,30 
+188,30 @@ class ItemManager(ItemTools, WorkbookManager): def __init__(self, filename: str): super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[Dict] = {} - self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + self.patch_prototypes_by_sheetname: Dict[str, Dict] = {} + self.parsed_headers_by_sheetname: Dict[str, List[List[Union[int, str]]]] = {} - def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: - return self.patch_prototypes_by_sheetname[sheet.title] + def sheet_patch_prototype(self, sheetname: str) -> Dict: + return self.patch_prototypes_by_sheetname[sheetname] - def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheet.title] + def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheetname] - def load_headers(self, sheet: Worksheet): - super().load_headers(sheet) - self.compile_sheet_headers(sheet) + def _load_headers(self, sheet: Worksheet): + super()._load_headers(sheet) + self._compile_sheet_headers(sheet.title) - def compile_sheet_headers(self, sheet: Worksheet): - headers = self.headers_by_sheetname[sheet.title] + def _compile_sheet_headers(self, sheetname: str): + headers = self.headers_by_sheetname[sheetname] parsed_headers = self.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + self.parsed_headers_by_sheetname[sheetname] = parsed_headers prototype = self.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheet.title] = prototype + self.patch_prototypes_by_sheetname[sheetname] = prototype - def load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) - for col in self.all_cols(sheet): + def _load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = 
self.sheet_parsed_headers(sheet.title) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title)) + for col in self._all_cols(sheet): value = sheet.cell(row=row, column=col).value parsed_value = self.parse_value(value) self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) From a6b68feeb2219eb9f79635a7441656b09f38dc32 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 16 Aug 2023 13:33:06 -0400 Subject: [PATCH 10/11] Rearrange some methods for presentational reasons. --- dcicutils/sheet_utils.py | 174 +++++++++++++++++++-------------------- test/test_sheet_utils.py | 118 +++++++++++++------------- 2 files changed, 146 insertions(+), 146 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index db310aeb2..8125f27d3 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -14,62 +14,6 @@ SheetCellValue = Union[int, float, str] -class WorkbookManager: - - @classmethod - def load_workbook(cls, filename: str): - wb = cls(filename) - return wb.load_content() - - def __init__(self, filename: str): - self.filename: str = filename - self.workbook: Optional[Workbook] = None - self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[Any]] = {} - - def sheet_headers(self, sheetname: str) -> List[str]: - return self.headers_by_sheetname[sheetname] - - def sheet_content(self, sheetname: str) -> List[Any]: - return self.content_by_sheetname[sheetname] - - @classmethod - def _all_rows(cls, sheet: Worksheet): - row_max = sheet.max_row - for row in range(2, row_max + 1): - yield row - - @classmethod - def _all_cols(cls, sheet: Worksheet): - col_max = sheet.max_column - for col in range(1, col_max + 1): - yield col - - def _load_headers(self, sheet: Worksheet): - headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self._all_cols(sheet)] - self.headers_by_sheetname[sheet.title] = headers - - def load_content(self): - workbook: Workbook = 
load_workbook(self.filename) - self.workbook = workbook - for sheetname in workbook.sheetnames: - sheet: Worksheet = workbook[sheetname] - self._load_headers(sheet) - content = [] - for row in self._all_rows(sheet): - row_dict = self._load_row(sheet=sheet, row=row) - content.append(row_dict) - self.content_by_sheetname[sheetname] = content - return self.content_by_sheetname - - def _load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet.title) - row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)} - return row_dict - - class ItemTools: """ Implements operations on table-related data without pre-supposing the specific representation of the table. @@ -91,6 +35,27 @@ class ItemTools: """ + @classmethod + def parse_sheet_header(cls, header: Header) -> ParsedHeader: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + @classmethod + def parse_sheet_headers(cls, headers: Headers): + return [cls.parse_sheet_header(header) + for header in headers] + @classmethod def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} @@ -124,37 +89,6 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) return parent - @classmethod - def parse_sheet_headers(cls, headers: Headers): - return [cls.parse_sheet_header(header) - for header in headers] - - @classmethod - def parse_sheet_header(cls, header: Header) -> ParsedHeader: - result = [] - token = "" - for i in range(len(header)): - ch = header[i] - if ch == '.' 
or ch == '#': - if token: - result.append(int(token) if token.isdigit() else token) - token = "" - else: - token += ch - if token: - result.append(int(token) if token.isdigit() else token) - return result - - @classmethod - def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): - if (value is None or value == '') and not force: - return - [key, *more_path] = path - if not more_path: - datum[key] = value - else: - cls.set_path_value(datum[key], more_path, value) - @classmethod def parse_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): @@ -183,6 +117,72 @@ def parse_value(cls, value: SheetCellValue) -> AnyJsonData: else: # presumably a number (int or float) return value + @classmethod + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + +class WorkbookManager: + + @classmethod + def load_workbook(cls, filename: str): + wb = cls(filename) + return wb.load_content() + + def __init__(self, filename: str): + self.filename: str = filename + self.workbook: Optional[Workbook] = None + self.headers_by_sheetname: Dict[str, List[str]] = {} + self.content_by_sheetname: Dict[str, List[Any]] = {} + + def sheet_headers(self, sheetname: str) -> List[str]: + return self.headers_by_sheetname[sheetname] + + def sheet_content(self, sheetname: str) -> List[Any]: + return self.content_by_sheetname[sheetname] + + @classmethod + def _all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in range(2, row_max + 1): + yield row + + @classmethod + def _all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + def _load_headers(self, sheet: Worksheet): + headers: List[str] = [str(sheet.cell(row=1, 
column=col).value) + for col in self._all_cols(sheet)] + self.headers_by_sheetname[sheet.title] = headers + + def _load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet.title) + row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)} + return row_dict + + def load_content(self): + workbook: Workbook = load_workbook(self.filename) + self.workbook = workbook + for sheetname in workbook.sheetnames: + sheet: Worksheet = workbook[sheetname] + self._load_headers(sheet) + content = [] + for row in self._all_rows(sheet): + row_dict = self._load_row(sheet=sheet, row=row) + content.append(row_dict) + self.content_by_sheetname[sheetname] = content + return self.content_by_sheetname + class ItemManager(ItemTools, WorkbookManager): diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 32dffb25b..40286d2e3 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,29 +1,29 @@ import os import pytest -from dcicutils.sheet_utils import WorkbookManager, ItemManager +from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager from .conftest_settings import TEST_DIR -def test_item_manager_parse_sheet_header(): - assert ItemManager.parse_sheet_header('.a') == ['a'] - assert ItemManager.parse_sheet_header('a') == ['a'] - assert ItemManager.parse_sheet_header('#0') == [0] - assert ItemManager.parse_sheet_header('0') == [0] - assert ItemManager.parse_sheet_header('foo.bar') == ['foo', 'bar'] - assert ItemManager.parse_sheet_header('a.b#0') == ['a', 'b', 0] - assert ItemManager.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] +def test_item_tools_parse_sheet_header(): + assert ItemTools.parse_sheet_header('.a') == ['a'] + assert ItemTools.parse_sheet_header('a') == ['a'] + assert ItemTools.parse_sheet_header('#0') == [0] + assert ItemTools.parse_sheet_header('0') == [0] + assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar'] + 
assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] # We don't error-check this, but it shouldn't matter - assert ItemManager.parse_sheet_header('#abc') == ['abc'] - assert ItemManager.parse_sheet_header('.123') == [123] - assert ItemManager.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + assert ItemTools.parse_sheet_header('#abc') == ['abc'] + assert ItemTools.parse_sheet_header('.123') == [123] + assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] -def test_item_manager_parse_sheet_headers(): +def test_item_tools_parse_sheet_headers(): input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] - assert ItemManager.parse_sheet_headers(input) == expected + assert ItemTools.parse_sheet_headers(input) == expected @pytest.mark.parametrize('parsed_headers,expected_prototype', [ @@ -38,32 +38,67 @@ def test_item_manager_parse_sheet_headers(): (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), ]) -def test_item_manager_compute_patch_prototype(parsed_headers, expected_prototype): - parsed_headers = ItemManager.parse_sheet_headers(parsed_headers) - assert ItemManager.compute_patch_prototype(parsed_headers) == expected_prototype +def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemTools.parse_sheet_headers(parsed_headers) + assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype @pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) -def test_item_manager_compute_patch_prototype_errors(headers): +def test_item_tools_compute_patch_prototype_errors(headers): - parsed_headers = ItemManager.parse_sheet_headers(headers) + parsed_headers = ItemTools.parse_sheet_headers(headers) with 
pytest.raises(ValueError) as exc: - ItemManager.compute_patch_prototype(parsed_headers) + ItemTools.compute_patch_prototype(parsed_headers) assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_manager_set_path_value(): +def test_item_tools_parse_value(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemTools.parse_value(x) == x + + assert ItemTools.parse_value('3') == 3 + assert ItemTools.parse_value('+3') == 3 + assert ItemTools.parse_value('-3') == -3 + + assert ItemTools.parse_value('3.5') == 3.5 + assert ItemTools.parse_value('+3.5') == 3.5 + assert ItemTools.parse_value('-3.5') == -3.5 + + assert ItemTools.parse_value('3.5e1') == 35.0 + assert ItemTools.parse_value('+3.5e1') == 35.0 + assert ItemTools.parse_value('-3.5e1') == -35.0 + + assert ItemTools.parse_value('') is None + + assert ItemTools.parse_value('null') is None + assert ItemTools.parse_value('Null') is None + assert ItemTools.parse_value('NULL') is None + + assert ItemTools.parse_value('true') is True + assert ItemTools.parse_value('True') is True + assert ItemTools.parse_value('TRUE') is True + + assert ItemTools.parse_value('false') is False + assert ItemTools.parse_value('False') is False + assert ItemTools.parse_value('FALSE') is False + + assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +def test_item_tools_set_path_value(): x = {'foo': 1, 'bar': 2} - ItemManager.set_path_value(x, ['foo'], 3) + ItemTools.set_path_value(x, ['foo'], 3) assert x == {'foo': 3, 'bar': 2} x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemManager.set_path_value(x, ['foo', 1], 17) + ItemTools.set_path_value(x, ['foo', 1], 17) assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemManager.set_path_value(x, ['bar', 'x'], 'something') + 
ItemTools.set_path_value(x, ['bar', 'x'], 'something') assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} @@ -145,41 +180,6 @@ def test_workbook_manager_load_csv(): WorkbookManager.load_workbook(SAMPLE_CSV_FILE) -def test_item_manager_parse_value(): - - for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemManager.parse_value(x) == x - - assert ItemManager.parse_value('3') == 3 - assert ItemManager.parse_value('+3') == 3 - assert ItemManager.parse_value('-3') == -3 - - assert ItemManager.parse_value('3.5') == 3.5 - assert ItemManager.parse_value('+3.5') == 3.5 - assert ItemManager.parse_value('-3.5') == -3.5 - - assert ItemManager.parse_value('3.5e1') == 35.0 - assert ItemManager.parse_value('+3.5e1') == 35.0 - assert ItemManager.parse_value('-3.5e1') == -35.0 - - assert ItemManager.parse_value('') is None - - assert ItemManager.parse_value('null') is None - assert ItemManager.parse_value('Null') is None - assert ItemManager.parse_value('NULL') is None - - assert ItemManager.parse_value('true') is True - assert ItemManager.parse_value('True') is True - assert ItemManager.parse_value('TRUE') is True - - assert ItemManager.parse_value('false') is False - assert ItemManager.parse_value('False') is False - assert ItemManager.parse_value('FALSE') is False - - assert ItemManager.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemManager.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] - - def test_item_manager_load_content(): it = ItemManager(SAMPLE_XLSX_FILE) From 56f702aaa381fe96f456f2a8e5558c0f41d40027 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:04:34 -0400 Subject: [PATCH 11/11] Mark chardet as an acceptable license for use. 
--- dcicutils/license_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 855fa5c80..db18fd7df 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -810,6 +810,12 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'pytest-timeout', # MIT Licensed ], + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ + 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed + ], + # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'GNU Lesser General Public License v3 or later (LGPLv3+)': [