-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmysqlcsv.py
141 lines (117 loc) · 4.04 KB
/
mysqlcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
'''
Created on Jun 14, 2016
@author: dvya2
'''
import os
import re
import sys
import csv
import time
import argparse
import collections
import mysql
import warnings
import mysql.connector
def get_type(s):
"""Find type for this string
"""
number_formats = (
(int, 'integer'),
(float, 'double'),
)
for cast, number_type in number_formats:
try:
cast(s)
except ValueError:
pass
else:
return number_type
# check for timestamp
dt_formats = (
('%Y-%m-%d %H:%M:%S', 'datetime'),
('%Y-%m-%d %H:%M:%S.%f', 'datetime'),
('%Y-%m-%d', 'date'),
('%H:%M:%S', 'time'),
)
for dt_format, dt_type in dt_formats:
try:
time.strptime(s, dt_format)
except ValueError:
pass
else:
return dt_type
# doesn't match any other types so assume text
if len(s) > 255:
return 'text'
else:
return 'varchar(255)'
def most_common(l):
"""Return most common value from list
"""
return max(l, key=l.count)
# in get_col_types I have changed max_rows to 1 so I can control column type def with faked data row
# that contains properly-formatted values
# potential todo: add functionality to choose between majority type (current functionality) or
# default to varchar if ever a wild string appears
def get_col_types(input_file, max_rows=1):
"""Find the type for each CSV column
"""
csv_types = collections.defaultdict(list)
reader = csv.reader(open(input_file))
# test the first few rows for their data types
for row_i, row in enumerate(reader):
if row_i == 0:
header = row
else:
for col_i, s in enumerate(row):
data_type = get_type(s)
csv_types[header[col_i]].append(data_type)
if row_i == max_rows:
break
# take the most common data type for each row
return [most_common(csv_types[col]) for col in header]
def get_schema(table, header, col_types):
"""Generate the schema for this table from given types and columns
"""
schema_sql = """CREATE TABLE IF NOT EXISTS %s (
id int NOT NULL AUTO_INCREMENT,""" % table
for col_name, col_type in zip(header, col_types):
schema_sql += '\n%s %s,' % (col_name, col_type)
schema_sql += """\nPRIMARY KEY (id)
) DEFAULT CHARSET=utf8;"""
return schema_sql
def get_insert(table, header):
"""Generate the SQL for inserting rows
"""
field_names = ', '.join(header)
field_markers = ', '.join('%s' for col in header)
return 'INSERT INTO %s (%s) VALUES (%s);' % \
(table, field_names, field_markers)
def safe_col(s):
return re.sub('\W+', '_', s.lower()).strip('_')
def main(input_file, user, password, host, table, database):
insert_sql= ''
print "Importing `%s' into MySQL database `%s.%s'" % (input_file, database, table)
db = mysql.connector.connect(user=user, password=password,host=host,database=database)
cursor = db.cursor()
# create database and if doesn't exist
# define table
print 'Analyzing column types ...'
col_types = get_col_types(input_file)
print col_types
header = None
for row in csv.reader(open(input_file)):
if header == None:
header = [safe_col(col) for col in row]
schema_sql = get_schema(table, header, col_types)
print(schema_sql)
# create table
cursor.execute('DROP TABLE IF EXISTS %s;' % table)
cursor.execute(schema_sql)
# SQL string for inserting data
sql="LOAD DATA LOCAL INFILE '" + input_file + "' INTO TABLE " + table + " FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\\r\\n' IGNORE 1 LINES"
cursor.execute(sql)
# commit rows to database
print 'Committing rows to database ...'
db.commit()
print 'Done!'