-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocess.py
39 lines (27 loc) · 1.21 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
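Save paired source/target text data to CSV files.

Each split is read from line-aligned source and target files, randomly
sampled, transliterated to ASCII with unidecode, and written as a CSV with
'source' and 'target' columns.

NOTE(review): the original note for this script describes the format as
"adding the target text in front of the source text, because this is how the
T5 model was trained for the summarization task". This script only writes the
two separate columns; confirm that the concatenation happens downstream.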
"""
import unidecode
import pandas as pd
# Training split: rows to sample, line-delimited inputs, and CSV destination.
TRAIN_SAMPLE = 300_000
TRAIN_SRC = "/data/train.source"
TRAIN_TARGET = "/data/train.target"
TRAIN_SAVE = "/dataCSV/train300k.csv"

# Validation split: rows to sample, line-delimited inputs, and CSV destination.
VALID_SAMPLE = 5_000
VALID_SRC = "/data/validation.source"
VALID_TARGET = "/data/validation.target"
VALID_SAVE = "/dataCSV/val5k.csv"
def save_to_csv(SRC, TARGET, SAMPLE, SAVE):
    """Sample aligned source/target lines and write them to a CSV file.

    Line ``i`` of ``SRC`` is paired with line ``i`` of ``TARGET``. ``SAMPLE``
    pairs are drawn at random, both texts are passed through
    ``unidecode.unidecode``, and the result is written to ``SAVE`` with the
    columns ``source`` and ``target`` (no index column).

    Args:
        SRC: Path to the UTF-8 text file holding one source text per line.
        TARGET: Path to the UTF-8 text file holding one target text per line.
        SAMPLE: Number of pairs to draw with ``DataFrame.sample``.
        SAVE: Path of the CSV file to write.

    Raises:
        ValueError: If the two files have different line counts, or (raised
            by pandas) if ``SAMPLE`` is larger than the number of pairs.
        OSError: If an input file cannot be opened.
    """
    source_lines = _read_lines(SRC)
    target_lines = _read_lines(TARGET)
    if len(source_lines) != len(target_lines):
        raise ValueError(
            f"{SRC} has {len(source_lines)} lines but {TARGET} has "
            f"{len(target_lines)}; the files must be line-aligned"
        )

    data = pd.DataFrame({'source': source_lines, 'target': target_lines})
    data = data.sample(SAMPLE)  # randomly select data

    # Transliterate only the sampled rows, not the whole corpus.
    for column in ('source', 'target'):
        data[column] = data[column].apply(lambda text: unidecode.unidecode(text))

    data.to_csv(SAVE, index=False)


def _read_lines(path):
    """Return every line of the UTF-8 text file at *path*, newline removed.

    The file is read directly rather than through ``pd.read_csv`` with
    ``delimiter="\\n"``: recent pandas versions reject that delimiter, and CSV
    parsing would interpret quote characters, turn tokens such as ``NA`` into
    NaN, and drop blank lines, any of which can break the line-for-line
    pairing between the source and target files.
    """
    with open(path, encoding='utf-8') as handle:
        return [line.rstrip('\n') for line in handle]
# Build the training CSV first, then the validation CSV.
save_to_csv(
    SRC=TRAIN_SRC,
    TARGET=TRAIN_TARGET,
    SAMPLE=TRAIN_SAMPLE,
    SAVE=TRAIN_SAVE,
)
save_to_csv(
    SRC=VALID_SRC,
    TARGET=VALID_TARGET,
    SAMPLE=VALID_SAMPLE,
    SAVE=VALID_SAVE,
)