-
Notifications
You must be signed in to change notification settings - Fork 15
/
translate_data.py
101 lines (79 loc) · 3.05 KB
/
translate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import time
import argparse
from tqdm import tqdm
from googletrans import Translator as GoogleTranslator
# Location of the original English GoEmotions TSV files.
ORIG_DATA_DIR = os.path.join("goemotions", "data")
# Output directory for the translated (Korean) TSV files.
DATA_DIR = "data"
# Dataset split filenames, shared between input and output directories.
TRAIN_FILE = "train.tsv"
DEV_FILE = "dev.tsv"
TEST_FILE = "test.tsv"
TEXT_MAX_LENGTH = 5000 # google translate allows maximum size of 5000 for one request
GOOGLE_TIME_TO_SLEEP = 1.5  # seconds to pause between requests to avoid rate limiting
def make_chunks(sentence_lst, max_length=None):
    """
    Pack sentences into chunks of at most ``max_length`` UTF-8 bytes.

    Sentences are joined with "\r\n" so the translated result can be
    split back into individual sentences later. Each chunk is made as
    long as possible while staying within the request size limit.

    Args:
        sentence_lst: list of sentence strings.
        max_length: maximum chunk size in UTF-8 bytes; defaults to
            TEXT_MAX_LENGTH (the Google Translate per-request limit).

    Returns:
        List of non-empty chunk strings. A single sentence longer than
        ``max_length`` is emitted as its own (oversized) chunk.
    """
    if max_length is None:
        max_length = TEXT_MAX_LENGTH
    input_chunk_lst = []
    chunk = ""
    for sentence in sentence_lst:
        sentence = sentence.strip()
        # Zero-width space breaks the translation request, so remove it.
        # https://www.reddit.com/r/OutOfTheLoop/comments/9abjhm/what_does_x200b_mean/
        sentence = sentence.replace("\u200b", "")
        sentence = sentence + "\r\n"
        if len((chunk.rstrip() + sentence).encode('utf-8')) > max_length:
            # Fix: only flush non-empty chunks. The original appended an
            # empty string when the first sentence alone exceeded the limit.
            if chunk:
                input_chunk_lst.append(chunk.rstrip())
            chunk = sentence
        else:
            chunk = chunk + sentence
    # Fix: empty input now yields [] instead of [""].
    if chunk:
        input_chunk_lst.append(chunk.rstrip())
    return input_chunk_lst
def get_sentence_lst(file_path):
    """
    Read a TSV file of (sentence, label) rows.

    Args:
        file_path: path to a TSV file with the sentence in the first
            column and the label string in the second.

    Returns:
        Tuple ``(sentence_lst, label_lst)`` of parallel lists.

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
    """
    sentence_lst = []
    label_lst = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Fix: skip blank lines (e.g. a trailing newline at EOF),
            # which previously crashed on items[1] with IndexError.
            if not line:
                continue
            items = line.split("\t")
            sentence = items[0].strip()
            label = items[1]
            sentence_lst.append(sentence)
            label_lst.append(label)
    return sentence_lst, label_lst
def google_translate(sentence_lst):
    """
    Translate English sentences to Korean via the Google Translate API.

    Sentences are packed into large chunks (see make_chunks) so many
    sentences are translated per request, then split back apart on the
    "\r\n" separator that make_chunks inserted between them.

    Args:
        sentence_lst: list of English sentence strings.

    Returns:
        List of Korean sentence strings, parallel to the input.
    """
    translator = GoogleTranslator()
    translated_sentence_lst = []
    for en_chunk in tqdm(make_chunks(sentence_lst)):
        ko_text = translator.translate(en_chunk, src='en', dest='ko').text
        ko_sentences = ko_text.split("\r\n")
        # Drop the empty trailing piece produced by a final separator.
        if ko_sentences[-1] == "":
            ko_sentences = ko_sentences[:-1]
        # Throttle requests so Google does not rate-limit or block us.
        time.sleep(GOOGLE_TIME_TO_SLEEP)
        translated_sentence_lst.extend(ko_sentences)
    return translated_sentence_lst
def make_translate_data(orig_file_path, translated_file_path):
    """
    Translate one TSV data file from English to Korean.

    Reads (sentence, label) rows from ``orig_file_path``, translates the
    sentences with google_translate, and writes (translated_sentence,
    label) rows to ``translated_file_path``.

    Args:
        orig_file_path: path to the source English TSV file.
        translated_file_path: path where the Korean TSV file is written.

    Raises:
        RuntimeError: if the translated sentence count does not match
            the label count (the chunk round-trip lost/gained sentences).
    """
    sentence_lst, label_lst = get_sentence_lst(orig_file_path)
    translate_sentence_lst = google_translate(sentence_lst)
    # Explicit check instead of `assert`, which is stripped under `python -O`;
    # a silent count mismatch would misalign every sentence/label pair.
    if len(translate_sentence_lst) != len(label_lst):
        raise RuntimeError(
            "Sentence/label count mismatch after translation: {} vs {}".format(
                len(translate_sentence_lst), len(label_lst)
            )
        )
    with open(translated_file_path, "w", encoding="utf-8") as f:
        for (translated_sent, label) in zip(translate_sentence_lst, label_lst):
            f.write("{}\t{}\n".format(translated_sent, label))
    print("Translating {} done".format(orig_file_path))
if __name__ == "__main__":
    # Ensure the output directory exists before writing translations.
    os.makedirs(DATA_DIR, exist_ok=True)
    # Translate every dataset split from the GoEmotions data directory.
    for split_file in (TRAIN_FILE, DEV_FILE, TEST_FILE):
        make_translate_data(
            os.path.join(ORIG_DATA_DIR, split_file),
            os.path.join(DATA_DIR, split_file)
        )