-
Notifications
You must be signed in to change notification settings - Fork 277
/
Copy pathphrasal_segmentation.sh
executable file
·98 lines (77 loc) · 3.07 KB
/
phrasal_segmentation.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash
#
# phrasal_segmentation.sh - segment a text file into phrases with a trained
# model.
#
# Stages: optional compilation, tokenization of the input, part-of-speech
# tags (when POS_TAGGING_MODE asks for them), ./bin/segphrase_segment, and a
# final tokenizer pass that writes ${MODEL}/segmentation.txt.
#
# Settings are read from the environment; an unset or empty variable falls
# back to the default shown:
#   MODELS_DIR        default_models if that directory exists, else models
#   MODEL             ${MODELS_DIR}/DBLP
#   DATA_DIR          default_data if that directory exists, else data
#   TEXT_TO_SEG       ${DATA_DIR}/EN/DBLP.5K.txt
#   HIGHLIGHT_MULTI   0.5
#   HIGHLIGHT_SINGLE  0.8
#   POS_TAGGING_MODE  1   (1: run tools/treetagger/pos_tag.sh,
#                          2: copy ${DATA_DIR}/<language>/pos_tags.txt,
#                          other integers: segment without --pos_tag)
#   THREAD            10
#   COMPILE           1   (1: run compile.sh first)
#
# All tool and tmp/ paths are relative to the current directory.

set -u

# Print an error on stderr and abort the run.
die() {
  printf 'phrasal_segmentation.sh: %s\n' "$*" >&2
  exit 1
}

# Print a green "===<title>===" stage header.
banner() {
  printf '%s===%s===%s\n' "$green" "$1" "$reset"
}

# As in "auto_phrase.sh", make the default model and data directories depend
# on whether or not we're running from a Docker container.
# NOTE: no space after ":-" -- a space there becomes part of the value.
if [ -d "default_models" ]; then
  MODELS_DIR=${MODELS_DIR:-default_models}
else
  MODELS_DIR=${MODELS_DIR:-models}
fi
MODEL=${MODEL:-${MODELS_DIR}/DBLP}
if [ -d "default_data" ]; then
  DATA_DIR=${DATA_DIR:-default_data}
else
  DATA_DIR=${DATA_DIR:-data}
fi
TEXT_TO_SEG=${TEXT_TO_SEG:-${DATA_DIR}/EN/DBLP.5K.txt}
HIGHLIGHT_MULTI=${HIGHLIGHT_MULTI:-0.5}
HIGHLIGHT_SINGLE=${HIGHLIGHT_SINGLE:-0.8}
SEGMENTATION_MODEL=${MODEL}/segmentation.model
TOKEN_MAPPING=${MODEL}/token_mapping.txt
POS_TAGGING_MODE=${POS_TAGGING_MODE:-1}
THREAD=${THREAD:-10}

# Colours are cosmetic: fall back to no colour when tput cannot be used
# (for example when there is no usable terminal description).
green=$(tput setaf 2 2>/dev/null) || green=""
reset=$(tput sgr0 2>/dev/null) || reset=""

banner "Compilation"
COMPILE=${COMPILE:-1}
if [ "$COMPILE" -eq 1 ]; then
  bash compile.sh || die "compile.sh failed"
fi
mkdir -p tmp "$MODEL" || die "cannot create tmp/ or ${MODEL}"
### END Compilation###

banner "Tokenization"
# Kept as an array so the '*' in the classpath reaches java literally instead
# of being exposed to pathname expansion.
TOKENIZER=(-cp ".:tools/tokenizer/lib/*:tools/tokenizer/resources/:tools/tokenizer/build/" Tokenizer)
TOKENIZED_TEXT_TO_SEG=tmp/tokenized_text_to_seg.txt
RAW=tmp/raw_tokenized_text_to_seg.txt
POS_TAGS=tmp/pos_tags_tokenized_text_to_seg.txt

[[ -r "$TEXT_TO_SEG" ]] || die "input text is not readable: ${TEXT_TO_SEG}"

printf 'Current step: Tokenizing input file...\033[0K\r'
tokenize_args=(
  -m direct_test
  -i "$TEXT_TO_SEG"
  -o "$TOKENIZED_TEXT_TO_SEG"
  -t "$TOKEN_MAPPING"
  -c N
  -thread "$THREAD"
)
if [ "$POS_TAGGING_MODE" -eq 2 ]; then
  # Mode 2 supplies its own tag file; a single space is passed as the
  # delimiter set (presumably so tokens line up with those tags -- confirm
  # against the Tokenizer source).
  tokenize_args+=(-delimiters " ")
fi
time java "${TOKENIZER[@]}" "${tokenize_args[@]}" || die "tokenization failed"

# The language is read from the model directory after tokenization.
language_file=${MODEL}/language.txt
if [[ -r "$language_file" ]]; then
  LANGUAGE=$(<"$language_file")
  LANGUAGE=${LANGUAGE//[[:space:]]/}
else
  LANGUAGE=""
  printf 'warning: %s is missing or unreadable\n' "$language_file" >&2
fi
printf 'Detected Language: %s\033[0K\n' "$LANGUAGE"
### END Tokenization ###

# Modes 1 and 2 both make the segmenter run with --pos_tag.
use_pos_tags=0
if [ "$POS_TAGGING_MODE" -eq 1 ] || [ "$POS_TAGGING_MODE" -eq 2 ]; then
  use_pos_tags=1
fi

case "$LANGUAGE" in
  JA | CN | OTHER)
    # No tagging step is run here for these languages.
    ;;
  *)
    if ((use_pos_tags)) && [[ -z "$LANGUAGE" ]]; then
      die "POS_TAGGING_MODE=${POS_TAGGING_MODE} needs a language, but ${language_file} gave none"
    fi
    if [ "$POS_TAGGING_MODE" -eq 1 ]; then
      banner "Part-Of-Speech Tagging"
      # pos_tag.sh receives its inputs through the environment.
      export THREAD LANGUAGE RAW
      # Its exit status is not checked (its conventions are not visible
      # from this script); the mv below fails if no tag file was produced.
      bash ./tools/treetagger/pos_tag.sh
      mv tmp/pos_tags.txt "$POS_TAGS" ||
        die "POS tagging did not produce tmp/pos_tags.txt"
    elif [ "$POS_TAGGING_MODE" -eq 2 ]; then
      banner "Loading Part-Of-Speech Tagged file"
      cp "${DATA_DIR}/${LANGUAGE}/pos_tags.txt" "$POS_TAGS" ||
        die "cannot copy ${DATA_DIR}/${LANGUAGE}/pos_tags.txt"
    fi
    ;;
esac
### END Part-Of-Speech Tagging ###

banner "Phrasal Segmentation"
segment_args=(
  --thread "$THREAD"
  --model "$SEGMENTATION_MODEL"
  --highlight-multi "$HIGHLIGHT_MULTI"
  --highlight-single "$HIGHLIGHT_SINGLE"
)
if ((use_pos_tags)); then
  segment_args=(--pos_tag "${segment_args[@]}")
fi
time ./bin/segphrase_segment "${segment_args[@]}" ||
  die "segphrase_segment failed"
### END Segphrasing ###

banner "Generating Output"
time java "${TOKENIZER[@]}" \
  -m segmentation \
  -i "$TEXT_TO_SEG" \
  -segmented tmp/tokenized_segmented_sentences.txt \
  -o "${MODEL}/segmentation.txt" \
  -tokenized_raw "$RAW" \
  -tokenized_id "$TOKENIZED_TEXT_TO_SEG" \
  -c N ||
  die "generating ${MODEL}/segmentation.txt failed"
### END Generating Output for Checking Quality ###