Skip to content

Commit

Permalink
performance improvement for DISK method of sd2df
Browse files Browse the repository at this point in the history
  • Loading branch information
tomweber-sas committed Nov 21, 2019
1 parent a000bf7 commit 2534909
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 77 deletions.
51 changes: 31 additions & 20 deletions saspy/sasiohttp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,19 +1209,23 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict ={}, **kwa
varcat = l2[2].split("\n", nvars)
del varcat[nvars]

code = "data work.saspy_ds2df / view=work.saspy_ds2df; set "+tabname+self._sb._dsopts(dsopts)+";\n format "
code = "data work.saspy_ds2df / view=work.saspy_ds2df; set "+tabname+self._sb._dsopts(dsopts)+";\n"
for i in range(nvars):
if vartype[i] == 'FLOAT':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += "'"+varlist[i]+"'n E8601DA10. "
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += "'"+varlist[i]+"'n E8601TM15.6 "
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += "'"+varlist[i]+"'n E8601DT26.6 "
code += 'E8601DT26.6'
else:
code += "'"+varlist[i]+"'n best32. "
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'
code += ";run;\n"
ll = self.submit(code, "text")

Expand Down Expand Up @@ -1505,25 +1509,36 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict ={},

code = "filename _tomodsx '"+self._sb.workpath+"_tomodsx' lrecl=1 recfm=f encoding=binary;\n"
code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n"
code += "file _tomodsx lrecl=1 recfm=f encoding=binary; put "
for i in range(nvars):
code += "'"+varlist[i]+"'n "
if vartype[i] == 'FLOAT':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += 'E8601DA10. '
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += 'E8601TM15.6 '
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += 'E8601DT26.6 '
code += 'E8601DT26.6'
else:
code += 'best32. '
if (i < (len(varlist)-1)):
code += cdelim+' '
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'

code += "file _tomodsx lrecl=1 recfm=f encoding=binary;\n"

last = len(varlist)-1
for i in range(nvars):
code += "put '"+varlist[i]+"'n "
if i != last:
code += cdelim+'; '
else:
code += rdelim
code += "; run;\n"
code += rdelim+'; '
if i % 10 == 0:
code +='\n'
code += "run;"

ll = self.submit(code, "text")

ll = self.download(tmpcsv, self._sb.workpath+"_tomodsx")
Expand All @@ -1540,11 +1555,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict ={},
else:
dts[varlist[i]] = 'str'

miss = [' .',
' .',
' .',
' .',
' ']
miss = ['.', ' ']

df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist,
sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs)
Expand Down
72 changes: 45 additions & 27 deletions saspy/sasioiom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1579,25 +1579,36 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro
rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x"
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x "

code = "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file "+self._tomods1.decode()+" dlm="+cdelim+" termstr=NL; put "

code = "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n"
for i in range(nvars):
code += "'"+varlist[i]+"'n "
if vartype[i] == 'N':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += 'E8601DA10. '+cdelim
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += 'E8601TM15.6 '+cdelim
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += 'E8601DT26.6 '+cdelim
code += 'E8601DT26.6'
else:
code += 'best32. '+cdelim
if not (i < (len(varlist)-1)):
code += rdelim
code += ";\n run;"
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'

code += "file "+self._tomods1.decode()+" lrecl=1 recfm=f encoding=binary;\n"

last = len(varlist)-1
for i in range(nvars):
code += "put '"+varlist[i]+"'n "
if i != last:
code += cdelim+'; '
else:
code += rdelim+'; '
if i % 10 == 0:
code +='\n'
code += "run;"
ll = self._asubmit(code, 'text')

self.stdin[0].send(b'\n'+logcodei.encode()+b'\n'+b'tom says EOL='+logcodeb+b'\n')
Expand Down Expand Up @@ -1648,12 +1659,12 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro
first = False

datar += data
data = datar.rpartition(colsep.encode()+rowsep.encode()+b'\n')
data = datar.rpartition(rowsep.encode())
datap = data[0]+data[1]
datar = data[2]

datap = datap.decode(self.sascfg.encoding, errors='replace')
for i in datap.split(sep=colsep+rowsep+'\n'):
for i in datap.split(sep=rowsep):
if i != '':
r.append(tuple(i.split(sep=colsep)))

Expand Down Expand Up @@ -2044,25 +2055,36 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x "

code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n"
code += "file "+outname+" lrecl=1 recfm=f encoding=binary; put "
for i in range(nvars):
code += "'"+varlist[i]+"'n "
if vartype[i] == 'N':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += 'E8601DA10. '
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += 'E8601TM15.6 '
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += 'E8601DT26.6 '
code += 'E8601DT26.6'
else:
code += 'best32. '
if (i < (len(varlist)-1)):
code += cdelim+' '
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'

code += "file "+outname+" lrecl=1 recfm=f encoding=binary;\n"

last = len(varlist)-1
for i in range(nvars):
code += "put '"+varlist[i]+"'n "
if i != last:
code += cdelim+'; '
else:
code += rdelim
code += "; run;\n"
code += rdelim+'; '
if i % 10 == 0:
code +='\n'
code += "run;"

ll = self._asubmit(code, "text")

self.stdin[0].send(b'\n'+logcodei.encode()+b'\n'+b'tom says EOL='+logcodeo.encode())
Expand Down Expand Up @@ -2169,11 +2191,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None
else:
dts[varlist[i]] = 'str'

miss = [' .',
' .',
' .',
' .',
' ']
miss = ['.', ' ']

df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist,
sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs)
Expand Down
79 changes: 49 additions & 30 deletions saspy/sasiostdio.py
Original file line number Diff line number Diff line change
Expand Up @@ -1555,27 +1555,39 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro
host = ''

rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x"
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x "
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x"

code = ""
code += "filename sock socket '"+host+":"+str(port)+"' lrecl="+str(self.sascfg.lrecl)+" recfm=v termstr=LF;\n"
code += " data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file sock dlm="+cdelim+"; put "
code = "filename sock socket '"+host+":"+str(port)+"' lrecl=1 recfm=f encoding=binary;\n"
code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n"
for i in range(nvars):
code += "'"+varlist[i]+"'n "
if vartype[i] == 'N':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += 'E8601DA10. '+cdelim
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += 'E8601TM15.6 '+cdelim
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += 'E8601DT26.6 '+cdelim
code += 'E8601DT26.6'
else:
code += 'best32. '+cdelim
if not (i < (len(varlist)-1)):
code += rdelim
code += "; run;\n"
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'

code += "file sock; "

last = len(varlist)-1
for i in range(nvars):
code += "put '"+varlist[i]+"'n "
if i != last:
code += cdelim+'; '
else:
code += rdelim+'; '
if i % 10 == 0:
code +='\n'
code += "run;"

sock.listen(1)
self._asubmit(code, 'text')
Expand All @@ -1598,12 +1610,12 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro
else:
break

data = datar.rpartition(colsep.encode()+rowsep.encode()+b'\n')
data = datar.rpartition(rowsep.encode())
datap = data[0]+data[1]
datar = data[2]

datap = datap.decode(self.sascfg.encoding, errors='replace')
for i in datap.split(sep=colsep+rowsep+'\n'):
for i in datap.split(sep=rowsep):
if i != '':
r.append(tuple(i.split(sep=colsep)))

Expand Down Expand Up @@ -1639,6 +1651,7 @@ def sasdata2dataframe(self, table: str, libref: str ='', dsopts: dict = None, ro
newsock[0].close()
sock.close()

ll = self.submit("", 'text')
if len(r) > 0 or df is None:
tdf = pd.DataFrame.from_records(r, columns=varlist)

Expand Down Expand Up @@ -1950,27 +1963,37 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None
code = "filename sock '"+tmpcsv +"' lrecl=1 recfm=f encoding=binary;\n"

rdelim = "'"+'%02x' % ord(rowsep.encode(self.sascfg.encoding))+"'x"
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x "
cdelim = "'"+'%02x' % ord(colsep.encode(self.sascfg.encoding))+"'x"

code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n file sock; put "
code += "data _null_; set "+tabname+self._sb._dsopts(dsopts)+";\n"
for i in range(nvars):
code += "'"+varlist[i]+"'n "
if vartype[i] == 'N':
code += "format '"+varlist[i]+"'n "
if varcat[i] in self._sb.sas_date_fmts:
code += 'E8601DA10. '
code += 'E8601DA10.'
else:
if varcat[i] in self._sb.sas_time_fmts:
code += 'E8601TM15.6 '
code += 'E8601TM15.6'
else:
if varcat[i] in self._sb.sas_datetime_fmts:
code += 'E8601DT26.6 '
code += 'E8601DT26.6'
else:
code += 'best32. '
if (i < (len(varlist)-1)):
code += cdelim+' '
code += 'best32.'
code += '; '
if i % 10 == 0:
code +='\n'

last = len(varlist)-1
code += "file sock; "
for i in range(nvars):
code += "put '"+varlist[i]+"'n "
if i != last:
code += cdelim+'; '
else:
code += rdelim
code += "; run;\n"
code += rdelim+'; '
if i % 10 == 0:
code +='\n'
code += "run;"

if self.sascfg.ssh:
csv = open(tmpcsv, mode='w')
Expand Down Expand Up @@ -2027,11 +2050,7 @@ def sasdata2dataframeDISK(self, table: str, libref: str ='', dsopts: dict = None
else:
dts[varlist[i]] = 'str'

miss = [' .',
' .',
' .',
' .',
' ']
miss = ['.', ' ']

df = pd.read_csv(tmpcsv, index_col=False, engine='c', header=None, names=varlist,
sep=colsep, lineterminator=rowsep, dtype=dts, na_values=miss, **kwargs)
Expand Down

0 comments on commit 2534909

Please sign in to comment.