-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsplit.sh
48 lines (34 loc) · 1.05 KB
/
split.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# dataset="coha.txt.raw.norm coca.txt.raw.norm arxiv.txt.raw.norm"
#coca 0 29 1990 - 2019
#coha 0 199 1810 2009
#arxiv 0 352 2007.4 - 2020.4
# nyt 1987- 2007
# nyt_yao 1986 - 2015
# dataset="input.txt" -101 1810 1990
#python normalized.py
# Corpora to split. Each one is a line-oriented text file, resolved relative
# to the current working directory.
datasets=(
  nyt_yao_tiny.txt.norm
  nyt.txt.norm
  nyt_yao.txt
  coha.txt.raw.token
  coca.txt.raw.token
  arxiv.txt.raw.token
  repubblica.txt.norm
)
# datasets=(coha.txt)

# Fraction of each dataset written to <dataset>.train; the remaining lines are
# held out in <dataset>.test.
train_ratio="9/10"
# Fraction of the held-out lines written to <dataset>.dev; the remaining lines
# stay in <dataset>.test.
dev_ratio="1/2"

# Path of the shuffle temp file currently in use. The EXIT trap removes it so
# an interrupted run does not leave a stray copy of a dataset behind.
split_tmp=""
cleanup() {
  [[ -z "$split_tmp" ]] || rm -f -- "$split_tmp"
}
trap cleanup EXIT

#######################################
# Shuffle the lines of a file and split them into a head part and a tail part.
# Globals:
#   split_tmp (written) - temp file holding the shuffled copy
# Arguments:
#   $1 - input file (must be a regular file)
#   $2 - ratio "N/D" with 0 <= N <= D and D > 0; the head part receives
#        floor(lines * N / D) lines, the tail part receives the rest
#   $3 - output file for the head part
#   $4 - output file for the tail part (may be the same path as $1, because
#        the outputs are produced from the shuffled temp copy)
# Outputs:
#   total line count, then head line count, one per line on stdout;
#   diagnostics on stderr
# Returns:
#   0 on success, 1 if the input is missing, the ratio is invalid, or any
#   step fails (no output file is touched before the shuffle has succeeded)
#######################################
split_by_ratio() {
  local input=$1 ratio=$2 head_out=$3 tail_out=$4
  local ratio_re='^[[:space:]]*([0-9]+)[[:space:]]*/[[:space:]]*([0-9]+)[[:space:]]*$'
  local numer denom length top status

  if [[ ! -f "$input" ]]; then
    printf 'split: %s: no such file, skipping\n' "$input" >&2
    return 1
  fi

  if [[ ! "$ratio" =~ $ratio_re ]]; then
    printf 'split: invalid ratio "%s" (expected N/D)\n' "$ratio" >&2
    return 1
  fi
  # Force base 10 so a leading zero is not read as octal.
  numer=$(( 10#${BASH_REMATCH[1]} ))
  denom=$(( 10#${BASH_REMATCH[2]} ))
  if (( denom == 0 || numer > denom )); then
    printf 'split: ratio "%s" must satisfy 0 <= N <= D, D > 0\n' "$ratio" >&2
    return 1
  fi

  # Keep the temp file next to the input: datasets may be too large for /tmp.
  if ! split_tmp=$(mktemp "${input}.shuf.XXXXXX"); then
    printf 'split: cannot create temp file for %s\n' "$input" >&2
    split_tmp=""
    return 1
  fi

  if ! shuf < "$input" > "$split_tmp"; then
    printf 'split: failed to shuffle %s\n' "$input" >&2
    rm -f -- "$split_tmp"
    split_tmp=""
    return 1
  fi

  # $(( ... )) also strips the padding some wc implementations emit.
  length=$(( $(wc -l < "$split_tmp") ))
  echo "$length"
  top=$(( length * numer / denom ))
  echo "$top"

  # Write the tail only if the head succeeded, so a failed head never
  # truncates an input that is also the tail destination.
  if head -n "$top" < "$split_tmp" > "$head_out" \
    && tail -n "$(( length - top ))" < "$split_tmp" > "$tail_out"; then
    status=0
  else
    printf 'split: failed to write parts of %s\n' "$input" >&2
    status=1
  fi

  rm -f -- "$split_tmp"
  split_tmp=""
  return "$status"
}

failures=0
held_out=()

# Pass 1: train / held-out split of every dataset.
for dataset in "${datasets[@]}"; do
  if split_by_ratio "$dataset" "$train_ratio" \
    "${dataset}.train" "${dataset}.test"; then
    held_out+=("$dataset")
  else
    failures=$(( failures + 1 ))
  fi
done

# Pass 2: split the held-out part into dev / test. Only datasets whose first
# split succeeded are processed, so a stale .test file is never re-split.
for dataset in "${held_out[@]}"; do
  split_by_ratio "${dataset}.test" "$dev_ratio" \
    "${dataset}.dev" "${dataset}.test" \
    || failures=$(( failures + 1 ))
done

# Last command: makes the script's exit status non-zero if any split failed.
(( failures == 0 ))