-
Notifications
You must be signed in to change notification settings - Fork 5
/
train_dblp.sh
executable file
·84 lines (72 loc) · 2.98 KB
/
train_dblp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env bash
# Trains the full pipeline on the DBLP / AMiner paper dump: preprocessing,
# SegPhrase training, EM training-data generation, model initialisation and
# silhouetting. Every path is relative to the current directory.
#
# bash is requested explicitly: the script was written against bash
# (`echo -e`, `==` inside `[ ]`), and a strict POSIX /bin/sh such as dash
# mishandles both.

# Raw corpus to train on.
INPUT=data/AMiner-Paper.txt

# Exported so that child processes started by this script can read them.
export OMP_NUM_THREADS=4
export NUM_KEYPHRASES=40000
export MIN_PHRASE_SUPPORT=10

# Build before doing anything else and stop if the build fails; later stages
# run compiled binaries (presumably produced by this build - confirm).
if ! make; then
  printf '%s\n' 'make failed; aborting' >&2
  exit 1
fi

# ANSI colour escapes used by the progress messages (green / reset).
Green='\033[0;32m'
NC='\033[0m'
# Fetch the default corpus on first use. Only the stock AMiner path is
# downloaded automatically; any other INPUT is left alone.
# `=` with a quoted operand is the portable string test (`==` is a bashism
# and an unquoted empty $INPUT breaks `[`).
if [ "$INPUT" = "data/AMiner-Paper.txt" ] && [ ! -e data/AMiner-Paper.txt ]; then
  printf '%b\n' "${Green}Downloading dataset${NC}"
  mkdir -p data
  # Stop on a failed download instead of handing gzip a truncated archive.
  if ! wget -O data/AMiner-Paper.txt.gz http://jialu.info/datasets/AMiner-Paper.txt.gz; then
    rm -f data/AMiner-Paper.txt.gz
    printf '%s\n' 'dataset download failed; aborting' >&2
    exit 1
  fi
  # Options go before the operand; a trailing -f is only accepted by
  # implementations that permute their arguments.
  gzip -d -f data/AMiner-Paper.txt.gz
fi
# Interpreter exported as PYTHON for child processes.
# Assigned first and exported afterwards: tilde expansion is guaranteed in a
# plain assignment, but not every /bin/sh expands `~` in an `export` argument.
PYTHON=~/python-env/bin/python
export PYTHON

# Interpreter used for the Python steps of this script: pypy when it is on
# PATH, otherwise plain python. Both output streams of the lookup are
# discarded so a missing pypy does not print a "not found" message.
if command -v pypy >/dev/null 2>&1; then
  export PYPY=pypy
else
  export PYPY=python
fi
# Output of the preprocessing step; exported so child processes can read it.
export RAW_TEXT=tmp/dblp/pure_text.txt

# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Preprocessing DBLP Input File${NC}"
mkdir -p tmp/dblp
# Convert the raw corpus ($INPUT) into the file named by $RAW_TEXT.
"${PYPY}" src/preprocessing/dblp/prepare_data.py -input "$INPUT" -output "$RAW_TEXT"
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Training SegPhrase${NC}"
./domain_keyphrase_extraction.sh

# Collect the SegPhrase results under tmp/dblp (two of them are renamed).
# Later stages read these files, so a failed move ends the run here instead
# of surfacing as a missing-file error further down.
mv SegPhrase/results/vectors.bin tmp/dblp/vectors.bin || exit 1
mv SegPhrase/results/salient.csv tmp/dblp/keyphrases.csv || exit 1
mv SegPhrase/results/segmentation.model tmp/dblp/segmentation.model || exit 1
mv SegPhrase/results/w2w_nn.txt tmp/dblp/ann.txt || exit 1
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Identifying Phrases in Input File${NC}"
# Positional arguments: segmentation model, keyphrase list, $NUM_KEYPHRASES,
# input text, output file, and a trailing 1.
# NOTE(review): the meaning of the final "1" is not visible here - confirm
# against the segphrase_parser usage text.
./SegPhrase/bin/segphrase_parser tmp/dblp/segmentation.model \
  tmp/dblp/keyphrases.csv "$NUM_KEYPHRASES" "$RAW_TEXT" ./tmp/dblp/segmented_text.txt 1
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Generating EM Training Data${NC}"

# Detect phrases that are not nouns; the resulting list is handed to the
# next command through its -noise option.
"${PYPY}" src/tools/detect_noise.py -input tmp/dblp/keyphrases.csv -output tmp/dblp/noise.txt

# Reads the segmented text, stopword list, word vectors and noise list;
# writes the files given to -vocab, -em and -merged.
"${PYPY}" src/initialization/generate_em_training_data.py \
  -input tmp/dblp/segmented_text.txt \
  -stopwords src/tools/stopwords.txt \
  -word2vec tmp/dblp/vectors.bin \
  -noise tmp/dblp/noise.txt \
  -window 100 \
  -merge 1 \
  -vocab tmp/dblp/vocabulary.txt \
  -em tmp/dblp/training_data.txt \
  -merged tmp/dblp/synonym.txt
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Generating Candidate Content Units${NC}"

# Consumes the EM training data, synonym and vocabulary files from the
# previous stage plus the SegPhrase vectors and neighbour list (ann.txt);
# the result goes to the file given to -candi.
"${PYPY}" src/initialization/generate_candidate_content_units.py \
  -em tmp/dblp/training_data.txt \
  -merged tmp/dblp/synonym.txt \
  -vocab tmp/dblp/vocabulary.txt \
  -word2vec tmp/dblp/vectors.bin \
  -ann tmp/dblp/ann.txt \
  -child 50 \
  -candi tmp/dblp/candidate_content_units.txt
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Initializing Bayesian Network Structure${NC}"

# Produces tmp/dblp/model.init, the starting model handed to the
# silhouetting binary through its -model option.
"${PYPY}" src/initialization/initialize_model.py \
  -em tmp/dblp/training_data.txt \
  -candi tmp/dblp/candidate_content_units.txt \
  -vocab tmp/dblp/vocabulary.txt \
  -word2vec tmp/dblp/vectors.bin \
  -model tmp/dblp/model.init
# printf '%b' expands the colour escapes in every shell; `echo -e` prints a
# literal "-e" under shells such as dash.
printf '%b\n' "${Green}Domain Keyphrase Silhouetting${NC}"

# Final stage: starts from model.init and writes the file given to -output.
# -batch and -thread reuse the NUM_KEYPHRASES / OMP_NUM_THREADS settings.
# The last argument line carries no trailing backslash, so the command can
# never swallow whatever line follows it.
./build/silhouetting -model tmp/dblp/model.init \
  -em tmp/dblp/training_data.txt \
  -ratio 0.5 -iter 3 -batch "${NUM_KEYPHRASES}" \
  -thread "${OMP_NUM_THREADS}" \
  -output tmp/dblp/model.now \
  -vocab tmp/dblp/vocabulary.txt \
  -candi tmp/dblp/candidate_content_units.txt \
  -infer_time 50 -infer_min 10 -infer_max 100 \
  -infer_iter 500 -infer_link 1e-6 -infer_print 3000
# Disabled extra flag: --v=3 (presumably a logging verbosity level - confirm).