-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_cleaner.py
68 lines (59 loc) · 2.04 KB
/
count_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from csv import reader, writer
# Normalise the per-row feature counts in finalmergeok.csv and write the
# result to Base_Features_count.csv.
#
# For every data row, the columns from index 11 onward are parsed as integers
# and the row minimum is subtracted from each of them.  The minimum itself is
# stored in a new column inserted at index 10, labelled 'Pages scraped' in the
# header.  Rows are dropped when
#   * a count column is not an integer or there are no count columns at all
#     (the row is echoed to stdout first), or
#   * the row minimum is 0.
# Rows whose minimum is greater than 6 are echoed to stdout but still kept.
# NOTE(review): the threshold 6 looks like a manual sanity check -- confirm.
with open('finalmergeok.csv', newline='') as ip, open('Base_Features_count.csv', 'w', newline='') as op:
    rows_in = reader(ip)

    header = next(rows_in, None)
    if header is None:
        # Fail with a readable message instead of an IndexError on the header.
        raise SystemExit("finalmergeok.csv is empty: no header row to extend")
    header.insert(10, 'Pages scraped')

    # Collect the surviving rows in a new list rather than deleting from the
    # full list with list.remove(), which rescans the list for every drop.
    kept_rows = [header]
    for count, row in enumerate(rows_in, start=1):
        print("processing row " + str(count))
        try:
            # int() rejects non-numeric cells; min() rejects an empty
            # sequence.  Both raise ValueError, the only failure expected here.
            features_int = [int(cell) for cell in row[11:]]
            smol = min(features_int)
        except ValueError:
            print(row)
            continue

        if smol > 6:
            print(row)
        if smol == 0:
            continue

        row[11:] = [value - smol for value in features_int]
        # Insert after rewriting the counts so the slice indices above still
        # refer to the original column layout.
        row.insert(10, smol)
        kept_rows.append(row)

    csv_writer = writer(op)
    csv_writer.writerows(kept_rows)
print("All is well")
# Disabled experiment, kept for reference only: the triple-quoted string below
# is a bare string expression and is never executed.
# Its text reads training_features_count.csv, converts the columns from index
# 11 onward to int, caps each value at 20 with min(x, 20), and writes the rows
# to training_features_count_cap.csv.  The author's note inside the string
# records that capping gave no significant accuracy gain, so it was left out.
"""
# cap max count to 20. No significant increase in accuracy. decided not to include this.
with open('training_features_count.csv', newline='') as ip, open('training_features_count_cap.csv', 'w', newline='') as op:
item = list(reader(ip))
body = item[1:]
for row in body:
features = row[11:]
features = list(map(lambda x: int(x), features))
features = list(map(lambda x: min(x, 20), features))
row[11:] = features
csv_writer = writer(op)
csv_writer.writerows(item)
print("all is well")
"""
# Disabled variant of the cleaner, kept for reference only: the triple-quoted
# string below is a bare string expression and is never executed.
# Its text reads Base_Features.csv, treats the columns from index 10 onward as
# integer counts, subtracts the row minimum from each, and removes rows whose
# counts fail to parse.  It does not add a 'Pages scraped' column and does not
# drop rows whose minimum is 0.
# NOTE(review): it targets the same output file, Base_Features_count.csv, as
# the live script in this file -- re-enabling it would overwrite that output.
"""
with open('Base_Features.csv', newline='') as ip, open('Base_Features_count.csv', 'w', newline='') as op:
item = list(reader(ip))
body = item[1:]
for row in body:
features = row[10:]
try:
features_int = list(map(lambda x: int(x), features))
smol = min(features_int)
if smol > 6:
print(row)
features = list(map(lambda x: x - smol, features_int))
row[10:] = features
except:
print(row)
item.remove(row)
csv_writer = writer(op)
csv_writer.writerows(item)
"""