-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataprofiler.py
134 lines (124 loc) · 3.66 KB
/
dataprofiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import pandas as pd
import csv
import argparse
import time
df = ''
headers = ''
totalRecs = 0
datafile = ''
outfile = 'stats.csv'
datevarfile = ''
datefmtmask = '%Y%m%d'
strMode = False
datevarlist = []
# load the actual data file
# return a dataframe
def loadData(filename):
try:
print('Loading data file...' + filename)
if strMode:
return pd.read_csv(filename, dtype=str)
else:
return pd.read_csv(filename)
except Exception as e:
raise
else:
pass
finally:
pass
# load the list of specified date columns from a csv
def loadDateColumnVars(datefile):
l = []
try:
print('Loading date variables file...' + datefile)
v = pd.read_csv(datefile)
l = list(v)
except Exception as e:
pass
else:
pass
finally:
return l
# check for valid dates
# dlist - list of dates to check
# returns a count of the number of invalid dates
def checkDates(dlist):
invalids = 0
for dt in dlist:
if dt != dt:
pass
else:
try:
if strMode == False: # dates read in as numeric
strDt = str(int(dt))
else:
strDt = dt
time.strptime(strDt, datefmtmask) # make this configurable
#print (dt)
except Exception as e:
#print ('invalid date')
invalids = invalids + 1
continue
#print('number of invalid dates ' + str(invalids))
return invalids
def process():
df = loadData(datafile) #'c:/Users/midavis/Downloads/Breast-DCIS-Cases-Treasure-No-MRNs.csv'
totalRecs = len(df)
#print (df)
headers = list(df) # get a list of headers
#print (headers)
#print (len(headers))
print('Writing data stats file...' + outfile)
with open(outfile, 'w') as csvfile:
try:
fieldnames = ['column', 'count', 'percentage', 'top_value', 'top_freq', 'unique_values', 'datatype', 'invalid_dates']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel', lineterminator='\n')
writer.writeheader()
for col in headers:
v = df[col].describe()
dtype = df[col].dtype
percent = (v[0] / totalRecs) * 100
invalidDt = 0
# if we have dates identified then do a check
try:
idx = datevarlist.index(col)
dl = list(df[col])
invalidDt = checkDates(dl)
except Exception as e:
invalidDt = '-'
if dtype == 'object':
try:
writer.writerow({'column': col, 'count': v[0], 'percentage': '{:3.1f}%'.format(percent),
'top_value': v[2], 'top_freq': v[3], 'unique_values': v[1], 'datatype': dtype, 'invalid_dates': invalidDt})
except Exception as ew:
print(ew)
pass
else:
writer.writerow({'column': col, 'count': v[0], 'percentage': '{:3.1f}%'.format(percent),
'top_value': v[6], 'top_freq': '-', 'unique_values': '-', 'datatype': dtype, 'invalid_dates': invalidDt})
except Exception as ex:
print(ex)
pass
finally:
csvfile.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("datafile", help="data filename")
parser.add_argument("-s", "--string", action="store_true", help="open file in string mode")
parser.add_argument("-o", "--outfile", help="output file; default will be: stats.csv")
parser.add_argument("-d", "--datevars", help="date variable file for checking dates")
parser.add_argument("-f", "--datefmtmask", help="date formart mask for checking dates")
args = parser.parse_args()
if args.datafile:
datafile = args.datafile
if args.string:
strMode = True
if args.outfile:
outfile = args.outfile
if args.datevars:
datevarfile = args.datevars
datevarlist = loadDateColumnVars(datevarfile)
#print (datevarlist)
if args.datefmtmask:
datefmtmask = args.datefmtmask
process()