-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwrangle.py
117 lines (98 loc) · 4.57 KB
/
wrangle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
For digesting mailing list dump data
Aidan McLaughlin - Jan 21, 2017
"""
import re
import os
import json
from app.models import reset_db
# Directory, located next to this module, from which the mailing list dumps
# are read.
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data/')

# Three-letter month abbreviation -> lowercase full month name. Used to build
# the names of the per-month JSON files.
month_map = {
    "Jan": "january",
    "Feb": "february",
    "Mar": "march",
    "Apr": "april",
    "May": "may",
    "Jun": "june",
    "Jul": "july",
    "Aug": "august",
    "Sep": "september",
    "Oct": "october",
    "Nov": "november",
    "Dec": "december",
}
# All patterns are raw strings (so "\s", "\w", "\(" are regex escapes rather
# than invalid Python string escapes) and are compiled once at import time
# instead of once per message.

# Separator line between messages: "From", a dotted word, " at ", rest of line.
_MESSAGE_SEPARATOR = re.compile(r"From\s\w+\.\w+\sat\s.*")
# Captures the part of the Message-ID before the "@".
_MESSAGE_ID = re.compile(r"Message-ID:\s<(.*)@.*>")
# The whole Message-ID header; everything before it is treated as metadata,
# everything after it as the message text.
_MESSAGE_ID_LINE = re.compile(r"Message-ID:\s<.*>")
# From line is the following:
#   From: dredfern.olin at gmail.com (Derek Redfern)
# Groups 1 and 2 are the username and email server, group 3 is the name.
_AUTHOR = re.compile(r"From:\s(.*)\sat\s(.*)\s\((.*)\)")
# Subject line plus the following line, so that a folded subject is captured.
_SUBJECT = re.compile(r"Subject:\s(.*\n*.*)")
# Headers that may have been swallowed into the subject capture.
_SUBJECT_END = re.compile(r"In-Reply-To:|References:")
_DATE = re.compile(r"Date:\s(.*)")
_IN_REPLY_TO = re.compile(r"In-Reply-To:\s<(.*)@.*>")
# Reply attribution line ("On <date> at <time> AM/PM ... <...>").
# [AP] rather than [A|P]: inside a character class "|" is a literal pipe.
_QUOTED_REPLY = re.compile(
    r"On\s\w+,\s\w+\s\w+,\s\w+\sat\s\w+:\w+\s[AP]M\s.*<.*>")
# Marker line of the form "--- next part ---".
_NEXT_PART = re.compile(r"-+\snext\spart\s-+")


def _parse_message(raw_message):
    """Build the field dictionary for one raw message.

    Returns None when the message has no Message-ID header, because that
    header is what separates the metadata from the message text.
    """
    chunks = _MESSAGE_ID_LINE.split(raw_message)
    if len(chunks) < 2:
        return None
    metadata, text = chunks[0], chunks[1]

    # Keys are inserted in a fixed order so the dumped JSON stays stable.
    data = {}

    message_id = _MESSAGE_ID.search(raw_message)
    data["id"] = message_id.group(1) if message_id else False

    author = _AUTHOR.search(metadata)
    data["author_email"] = (
        author.group(1) + "@" + author.group(2) if author else False)
    data["author_name"] = author.group(3) if author else False

    subject = _SUBJECT.search(metadata)
    if subject:
        # Drop any header captured after the subject and unfold the
        # continuation line ("\n\t") into a single space.
        subject_text = _SUBJECT_END.split(subject.group(1))[0]
        data["subject"] = subject_text.replace("\n\t", " ").strip()
    else:
        # A missing subject is reported as False, like every other field,
        # instead of raising TypeError.
        data["subject"] = False

    date = _DATE.search(metadata)
    data["date"] = date.group(1) if date else False

    # Get ID of message being replied to if applicable
    replying_to = _IN_REPLY_TO.search(metadata)
    data["replying_to"] = replying_to.group(1) if replying_to else False

    # Keep only the text before the quoted reply and before the
    # "next part" marker.
    body = _QUOTED_REPLY.split(text)[0]
    data["text"] = _NEXT_PART.split(body)[0]
    return data


def parse(fname):
    """
    Given a filename in the data folder, creates the following dictionary
    for each email:
    {
        "id": id assigned by carpediem mail server,
        "text": raw email text (includes newlines, etc.),
        "subject": email subject,
        "date": send date,
        "author_name": sender name,
        "author_email": sender email,
        "replying_to": if a reply, the ID of the email being replied to
    }

    Any field that cannot be found in a message is set to False. Messages
    without a Message-ID header are skipped with a printed notice.

    :param fname: name of a dump file inside DATA_DIR
    :return: list of dictionaries, one per parsed message, in file order
    """
    with open(os.path.join(DATA_DIR, fname), 'r') as dump:
        text_dump = dump.read()

    clean_messages = []
    # Break apart messages by From line
    for raw_message in _MESSAGE_SEPARATOR.split(text_dump):
        if not raw_message:
            continue
        data = _parse_message(raw_message)
        if data is None:
            print("Skipping message without a Message-ID header")
            continue
        clean_messages.append(data)
    return clean_messages
def update_jsons(emails, date):
    """ Takes in emails in the form of a list of dictionaries, each with the format
    {
        "id": id assigned by carpediem mail server,
        "text": raw email text (includes newlines, etc.),
        "subject": email subject,
        "date": send date,
        "author_name": sender name,
        "author_email": sender email,
        "replying_to": if a reply, the ID of the email being replied to
    }
    Then finds the correct JSON to put them in and updates that JSON

    The target file is parsed_data/<name>.json next to this module, where
    <name> is `date` split on whitespace, with its first token translated
    through month_map and the tokens joined by "-" (e.g. "Jan 2017" becomes
    "january-2017"). Emails already stored in that file are kept, after the
    new ones. The caller's `emails` list is not modified.

    :param emails: list of email dictionaries to store
    :param date: whitespace-separated string whose first token is a
        three-letter month abbreviation present in month_map
    :raises KeyError: if the first token of `date` is not in month_map
    :raises IndexError: if `date` is empty
    """
    date_parts = date.split()
    date_parts[0] = month_map[date_parts[0]]
    json_path = os.path.join(
        os.path.dirname(__file__), "parsed_data/", "-".join(date_parts) + ".json")

    # Work on a copy so the caller's list is not extended in place.
    all_emails = list(emails)
    if os.path.isfile(json_path):
        with open(json_path) as existing:
            all_emails += json.load(existing)

    with open(json_path, 'w') as updated:
        json.dump(all_emails, updated)

    reset_db()  # Should change this to add_emails; just need to figure out how to selectively reset the database
def data_to_jsons():
    """Parse every dump found in DATA_DIR and write the result as JSON.

    Each dump's name is printed as it is processed. The output goes to
    parsed_data/<stem>.json next to this module, where <stem> is the part of
    the dump's file name before its first ".".
    """
    output_dir = os.path.join(os.path.dirname(__file__), "parsed_data/")
    for dump_name in os.listdir(DATA_DIR):
        print(dump_name)
        # Parse before opening the output so a parse failure leaves any
        # existing JSON file untouched.
        parsed_emails = list(parse(dump_name))
        stem = dump_name.split('.')[0]
        with open(os.path.join(output_dir, stem + ".json"), "w") as out:
            json.dump(parsed_emails, out)
# When run as a script, convert every data dump to JSON.
if __name__ == "__main__":
    data_to_jsons()