forked from dbpedia/fact-extractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_verbs.sh
executable file
·122 lines (116 loc) · 2.8 KB
/
extract_verbs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
# Download the latest Wikipedia dump for a language, extract its text and
# collect the verbs found in it.
#
# Call this script with the language you want to work with.
# The accepted argument can be a language or a language code, e.g., 'en' OR 'english':
#   sh extract_verbs.sh english

# Abort as soon as any command fails.
set -e
# Every relative path used below is resolved from the parent of the directory
# the script is started in.
cd ..
# Exactly one argument (the language) is required.
# The POSIX `[` test is used instead of `[[` so that the documented
# `sh extract_verbs.sh` invocation also works where sh is not bash.
if [ "$#" -ne 1 ]; then
  echo "Usage: sh $(basename "$0") <LANGUAGE>" >&2
  exit 1
fi
# Lowercase the argument so that e.g. 'EN' or 'English' are accepted.
# The argument is quoted and printed with printf so that whitespace or glob
# characters in it are never expanded by the shell.
LANGUAGE="$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')"
# Map either the language name or its code to both forms:
#   LANGUAGE - full name, used to pick the TreeTagger command
#   LANGCODE - two-letter code, used in the Wikipedia dump file name
case "$LANGUAGE" in
  bulgarian | bg)  LANGUAGE="bulgarian";  LANGCODE="bg" ;;
  chinese | zh)    LANGUAGE="chinese";    LANGCODE="zh" ;;
  dutch | nl)      LANGUAGE="dutch";      LANGCODE="nl" ;;
  english | en)    LANGUAGE="english";    LANGCODE="en" ;;
  estonian | et)   LANGUAGE="estonian";   LANGCODE="et" ;;
  finnish | fi)    LANGUAGE="finnish";    LANGCODE="fi" ;;
  french | fr)     LANGUAGE="french";     LANGCODE="fr" ;;
  galician | gl)   LANGUAGE="galician";   LANGCODE="gl" ;;
  german | de)     LANGUAGE="german";     LANGCODE="de" ;;
  italian | it)    LANGUAGE="italian";    LANGCODE="it" ;;
  latin | la)      LANGUAGE="latin";      LANGCODE="la" ;;
  mongolian | mn)  LANGUAGE="mongolian";  LANGCODE="mn" ;;
  polish | pl)     LANGUAGE="polish";     LANGCODE="pl" ;;
  portuguese | pt) LANGUAGE="portuguese"; LANGCODE="pt" ;;
  russian | ru)    LANGUAGE="russian";    LANGCODE="ru" ;;
  slovak | sk)     LANGUAGE="slovak";     LANGCODE="sk" ;;
  spanish | es)    LANGUAGE="spanish";    LANGCODE="es" ;;
  swahili | sw)    LANGUAGE="swahili";    LANGCODE="sw" ;;
  *)
    # Anything else is rejected before any download starts.
    echo "Invalid or not supported language for now! QUITTING ..." >&2
    exit 1
    ;;
esac
# Form Wikipedia dump URL
URL="http://download.wikimedia.org/${LANGCODE}wiki/latest/${LANGCODE}wiki-latest-pages-articles.xml.bz2"
echo "Downloading dump from: $URL"
# NOTE(review): when a file with the same name is already present, wget keeps
# it and stores the new download with a numeric suffix (.1), so a later step
# that opens the unsuffixed name would read the old file - confirm whether
# re-runs should overwrite or resume instead.
wget "$URL"
# Extract text
# Make sure the output directory for the extracted text exists.
# mkdir -p succeeds silently when the directory is already there.
mkdir -p extracted
# Decompress the downloaded dump to stdout and pipe it into WikiExtractor,
# which is given `extracted` as its output (-o) location.
dump_file="${LANGCODE}wiki-latest-pages-articles.xml.bz2"
bzcat "$dump_file" \
  | scripts/lib/WikiExtractor.py -o extracted
# Split extraction by article
# Make sure the directory that receives the per-article files exists.
# mkdir -p succeeds silently when the directory is already there.
mkdir -p corpus
# Concatenate every extracted file and cut the stream at each closing </doc> line:
#   --suppress-matched  leave the matching </doc> line out of the pieces
#   -z                  do not keep empty pieces
#   -f 'corpus/doc_'    prefix of the generated file names
# '{*}' repeats the pattern as often as it matches; it is quoted so that the
# shell can never treat it as a file name pattern.
cat extracted/*/* \
  | csplit --suppress-matched -z -f 'corpus/doc_' - '/</doc>/' '{*}'
# Build a single big file
# `{} +` passes many files to each cat invocation instead of starting one cat
# per file; the files are still written in the order find visits them.
find extracted -type f -exec cat {} + > all-extracted.txt
# Extract verbs with TreeTagger
# N.B. treetagger segfaults with the single big file, run it over each extracted file instead
#cat all-extracted.txt | treetagger/cmd/tree-tagger-italian | grep VER | sort -u > verbi.txt
# Start from an empty result file: leftovers of an earlier run must not be
# mixed in, and the file must exist for sort even when no verb is found.
: > verbs
# The language and the file name are handed to the inner shell as positional
# parameters ($1, $2) rather than pasted into the command string, so file names
# containing quotes or other shell metacharacters are processed safely.
find extracted -type f -exec bash -c \
  'treetagger/cmd/tree-tagger-"$1" < "$2" | grep VER >> verbs' \
  bash "$LANGUAGE" {} \;
# Deduplicate the collected lines.
sort -u verbs > unique-sorted-verbs
# Extract vocabulary: run the bag-of-words script on the single concatenated
# text file (all-extracted.txt).
python scripts/bag_of_words.py all-extracted.txt
# POS tagging + chunker with TextPro
# NOTE(review): the language given to TextPro is hard-coded (-l ita) whatever
# language was selected, and the input path ~/srl/training/<LANGCODE>wiki/gold
# is not created anywhere in the visible part of this script - confirm that
# both are intended.
perl textpro.pl -verbose -html -l ita -c token+sentence+pos+chunk -o . ~/srl/training/"$LANGCODE"wiki/gold