From 2534909f748d4c4481cddeffd152115c6fc386d0 Mon Sep 17 00:00:00 2001 From: Tom Weber Date: Thu, 21 Nov 2019 15:25:48 -0500 Subject: [PATCH] performance improvement for DISK method off sd2df --- saspy/sasiohttp.py | 51 +++++++++++++++++------------ saspy/sasioiom.py | 72 +++++++++++++++++++++++++---------------- saspy/sasiostdio.py | 79 ++++++++++++++++++++++++++++----------------- 3 files changed, 125 insertions(+), 77 deletions(-) diff --git a/saspy/sasiohttp.py b/saspy/sasiohttp.py index feff4eaa..d7ee6f1c 100644 --- a/saspy/sasiohttp.py +++ b/saspy/sasiohttp.py @@ -1209,19 +1209,23 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict ={}, **kwa varcat = l2[2].split("\n", nvars) del varcat[nvars] - code = "data work.saspy_ds2df / view=work.saspy_ds2df; set "+tabname+self._sb._dsopts(dsopts)+";\n format " + code = "data work.saspy_ds2df / view=work.saspy_ds2df; set "+tabname+self._sb._dsopts(dsopts)+";\n" for i in range(nvars): if vartype[i] == 'FLOAT': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += "'"+varlist[i]+"'n E8601DA10. " + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += "'"+varlist[i]+"'n E8601TM15.6 " + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += "'"+varlist[i]+"'n E8601DT26.6 " + code += 'E8601DT26.6' else: - code += "'"+varlist[i]+"'n best32. " + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' code += ";run;\n" ll = self.submit(code, "text") @@ -1505,25 +1509,36 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict ={}, code = "filename _tomodsx '"+self._sb.workpath+"_tomodsx' lrecl=1 recfm=f encoding=binary;\n" code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n" - code += "file _tomodsx lrecl=1 recfm=f encoding=binary; put " for i in range(nvars): - code += "'"+varlist[i]+"'n " if vartype[i] == 'FLOAT': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += 'E8601DA10. ' + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += 'E8601TM15.6 ' + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += 'E8601DT26.6 ' + code += 'E8601DT26.6' else: - code += 'best32. ' - if (i < (len(varlist)-1)): - code += cdelim+' ' + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' + + code += "file _tomodsx lrecl=1 recfm=f encoding=binary;\n" + + last = len(varlist)-1 + for i in range(nvars): + code += "put '"+varlist[i]+"'n " + if i != last: + code += cdelim+'; ' else: - code += rdelim - code += "; run;\n" + code += rdelim+'; ' + if i % 10 == 0: + code +='\n' + code += "run;" + ll = self.submit(code, "text") ll = self.download(tmpcsv, self._sb.workpath+"_tomodsx") @@ -1540,11 +1555,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict ={}, else: dts[varlist[i]] = 'str' - miss = [' .', - ' .', - ' .', - ' .', - ' '] + miss = ['.', ' '] df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist, sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs) diff --git a/saspy/sasioiom.py b/saspy/sasioiom.py index 3cc90420..c28ff9ee 100644 --- a/saspy/sasioiom.py +++ b/saspy/sasioiom.py @@ -1579,25 +1579,36 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x" cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x " - code = "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file "+self._tomods1.decode()+" dlm="+cdelim+" termstr=NL; put " - + code = "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n" for i in range(nvars): - code += "'"+varlist[i]+"'n " if vartype[i] == 'N': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += 'E8601DA10. '+cdelim + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += 'E8601TM15.6 '+cdelim + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += 'E8601DT26.6 '+cdelim + code += 'E8601DT26.6' else: - code += 'best32. '+cdelim - if not (i < (len(varlist)-1)): - code += rdelim - code += ";\n run;" + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' + + code += "file "+self._tomods1.decode()+" lrecl=1 recfm=f encoding=binary;\n" + last = len(varlist)-1 + for i in range(nvars): + code += "put '"+varlist[i]+"'n " + if i != last: + code += cdelim+'; ' + else: + code += rdelim+'; ' + if i % 10 == 0: + code +='\n' + code += "run;" ll = self._asubmit(code, 'text') self.stdin[0].send(b'\n'+logcodei.encode()+b'\n'+b'tom says EOL='+logcodeb+b'\n') @@ -1648,12 +1659,12 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro first = False datar += data - data = datar.rpartition(colsep.encode()+rowsep.encode()+b'\n') + data = datar.rpartition(rowsep.encode()) datap = data[0]+data[1] datar = data[2] datap = datap.decode(self.sascfg.encoding, errors='replace') - for i in datap.split(sep=colsep+rowsep+'\n'): + for i in datap.split(sep=rowsep): if i != '': r.append(tuple(i.split(sep=colsep))) @@ -2044,25 +2055,36 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x " code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n" - code += "file "+outname+" lrecl=1 recfm=f encoding=binary; put " for i in range(nvars): - code += "'"+varlist[i]+"'n " if vartype[i] == 'N': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += 'E8601DA10. ' + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += 'E8601TM15.6 ' + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += 'E8601DT26.6 ' + code += 'E8601DT26.6' else: - code += 'best32. ' - if (i < (len(varlist)-1)): - code += cdelim+' ' + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' + + code += "file "+outname+" lrecl=1 recfm=f encoding=binary;\n" + + last = len(varlist)-1 + for i in range(nvars): + code += "put '"+varlist[i]+"'n " + if i != last: + code += cdelim+'; ' else: - code += rdelim - code += "; run;\n" + code += rdelim+'; ' + if i % 10 == 0: + code +='\n' + code += "run;" + ll = self._asubmit(code, "text") self.stdin[0].send(b'\n'+logcodei.encode()+b'\n'+b'tom says EOL='+logcodeo.encode()) @@ -2169,11 +2191,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None else: dts[varlist[i]] = 'str' - miss = [' .', - ' .', - ' .', - ' .', - ' '] + miss = ['.', ' '] df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist, sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs) diff --git a/saspy/sasiostdio.py b/saspy/sasiostdio.py index 465f1fdf..a3dbfe9b 100644 --- a/saspy/sasiostdio.py +++ b/saspy/sasiostdio.py @@ -1555,27 +1555,39 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro host = '' rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x" - cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x " + cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x" - code = "" - code += "filename sock socket '"+host+":"+str(port)+"' lrecl="+str(self.sascfg.lrecl)+" recfm=v termstr=LF;\n" - code += " data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file sock dlm="+cdelim+"; put " + code = "filename sock socket '"+host+":"+str(port)+"' lrecl=1 recfm=f encoding=binary;\n" + code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n" for i in range(nvars): - code += "'"+varlist[i]+"'n " if vartype[i] == 'N': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += 'E8601DA10. '+cdelim + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += 'E8601TM15.6 '+cdelim + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += 'E8601DT26.6 '+cdelim + code += 'E8601DT26.6' else: - code += 'best32. '+cdelim - if not (i < (len(varlist)-1)): - code += rdelim - code += "; run;\n" + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' + + code += "file sock; " + + last = len(varlist)-1 + for i in range(nvars): + code += "put '"+varlist[i]+"'n " + if i != last: + code += cdelim+'; ' + else: + code += rdelim+'; ' + if i % 10 == 0: + code +='\n' + code += "run;" sock.listen(1) self._asubmit(code, 'text') @@ -1598,12 +1610,12 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro else: break - data = datar.rpartition(colsep.encode()+rowsep.encode()+b'\n') + data = datar.rpartition(rowsep.encode()) datap = data[0]+data[1] datar = data[2] datap = datap.decode(self.sascfg.encoding, errors='replace') - for i in datap.split(sep=colsep+rowsep+'\n'): + for i in datap.split(sep=rowsep): if i != '': r.append(tuple(i.split(sep=colsep))) @@ -1639,6 +1651,7 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro newsock[0].close() sock.close() + ll = self.submit("", 'text') if len(r) > 0 or df is None: tdf = pd.DataFrame.from_records(r, columns=varlist) @@ -1950,27 +1963,37 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None code = "filename sock '"+tmpcsv +"' lrecl=1 recfm=f encoding=binary;\n" rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x" - cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x " + cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x" - code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file sock; put " + code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n" for i in range(nvars): - code += "'"+varlist[i]+"'n " if vartype[i] == 'N': + code += "format '"+varlist[i]+"'n " if varcat[i] in self._sb.sas_date_fmts: - code += 'E8601DA10. ' + code += 'E8601DA10.' else: if varcat[i] in self._sb.sas_time_fmts: - code += 'E8601TM15.6 ' + code += 'E8601TM15.6' else: if varcat[i] in self._sb.sas_datetime_fmts: - code += 'E8601DT26.6 ' + code += 'E8601DT26.6' else: - code += 'best32. ' - if (i < (len(varlist)-1)): - code += cdelim+' ' + code += 'best32.' + code += '; ' + if i % 10 == 0: + code +='\n' + + last = len(varlist)-1 + code += "file sock; " + for i in range(nvars): + code += "put '"+varlist[i]+"'n " + if i != last: + code += cdelim+'; ' else: - code += rdelim - code += "; run;\n" + code += rdelim+'; ' + if i % 10 == 0: + code +='\n' + code += "run;" if self.sascfg.ssh: csv = open(tmpcsv, mode='w') @@ -2027,11 +2050,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None else: dts[varlist[i]] = 'str' - miss = [' .', - ' .', - ' .', - ' .', - ' '] + miss = ['.', ' '] df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist, sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs)