-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_tweet.py
46 lines (34 loc) · 1.12 KB
/
process_tweet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    """Clean, tokenize, de-noise, and stem a tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweet_stem: a list of stemmed words from the processed tweet
    """
    # cleaning: drop the leading retweet marker and any URLs
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # keep hashtag text, drop only the '#' symbol
    tweet = re.sub(r'#', '', tweet)
    # NOTE: do NOT strip '@' here. The tokenizer below is configured with
    # strip_handles=True, which removes whole @handles; deleting the '@'
    # first would turn handles into ordinary words and defeat that option
    # (this was a bug in the previous version).

    # tokenization: lowercase, drop @handles, squash elongated chars (loool -> lool)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # stop-word and punctuation removal; sets give O(1) membership tests
    stopwords_english = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tweet_processed = [word for word in tokens
                       if word not in stopwords_english
                       and word not in punctuation]

    # stemming
    stemmer = PorterStemmer()
    tweet_stem = [stemmer.stem(word) for word in tweet_processed]
    return tweet_stem