preprocess_dataset.py
from tqdm import tqdm
import pandas as pd
import re
import demoji
from nltk.stem.porter import PorterStemmer
from vncorenlp import VnCoreNLP

# Load the raw review dataset.
dataset = './dataset/data_final_problem2.csv'
df = pd.read_csv(dataset)

# Load the Vietnamese stopword list (one underscore-joined stopword per line).
with open("./dataset/vietnamese-stopwords-dash.txt", "r", encoding="utf-8") as txt_file:
    content_list = txt_file.read().split("\n")

stemmer = PorterStemmer()

# Word segmentation via the VnCoreNLP toolkit (requires Java and the jar file);
# the py_vncorenlp package can be used as an alternative wrapper.
vnp = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg")
# Map numeric polarity scores to sentiment labels.
SENTI_DICT = {
    1: "very_negative",
    2: "negative",
    3: "normal",
    4: "positive",
    5: "very_positive"
}
def remove_url(text):
    # Strip URLs from the review text.
    text = re.sub(r"http\S+", "", text)
    return text

def handle_emoji(string):
    # Replace each emoji with its textual description (keeping only the text before any colon).
    emojis = demoji.findall(string)
    for emoji in emojis:
        string = string.replace(emoji, " " + emojis[emoji].split(":")[0])
    return string

def remove_stopwords(tokens):
    # Drop tokens that appear in the Vietnamese stopword list.
    return [word for word in tokens if word not in content_list]

def stemming(tokens):
    # Optional Porter stemming (only meaningful for English tokens).
    return [stemmer.stem(word) for word in tokens]

def word_tokenizer(text):
    # Word-segment the text with VnCoreNLP, flatten the sentences, and rejoin into one string.
    tokens = vnp.tokenize(text)
    tokens = [t for ts in tokens for t in ts]
    return " ".join(tokens)
def preprocessing(text):
    # Full cleaning pipeline for a single review.
    text = remove_url(text)
    text = handle_emoji(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)            # remove punctuation
    text = re.sub(r'\bks\b', 'khách_sạn', text)    # expand common abbreviations
    text = re.sub(r'\bko\b', 'không', text)
    text = word_tokenizer(text)
    tokens = remove_stopwords(text.split())
    return " ".join(tokens)
def process_labels_csv(data_frame):
    # Build an aspect#sentiment label string for each review from the six aspect
    # columns (a non-zero cell holds the polarity score 1-5).
    labels_lst = []
    for i in range(len(data_frame)):
        sample_labels = []
        for col in data_frame.columns.values[1:7]:
            if data_frame[col][i] != 0:
                polarity = data_frame[col][i]
                label = f"{col}#{SENTI_DICT[polarity]}"
                sample_labels.append(f"{{{label}}}")
        labels_lst.append(", ".join(sample_labels))
    data_frame["label"] = labels_lst
    return data_frame
def process():
    # Reviews with no labelled aspect could optionally be dropped here, e.g.:
    # df.drop(df[(df.giai_tri == 0) & (df.luu_tru == 0) & (df.nha_hang == 0) &
    #            (df.an_uong == 0) & (df.di_chuyen == 0) & (df.mua_sam == 0)].index)
    df1 = df.copy()
    tqdm.pandas()
    df1['clean_review'] = df1['Review'].progress_map(preprocessing)
    process_labels_csv(df1)
    df1.to_csv('./dataset/clean_data_final_problem2.csv', encoding='utf-8-sig')

if __name__ == "__main__":
    process()
    print("Everything is done!")