-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Mike Speriosu
committed
Apr 26, 2013
1 parent
7fedf52
commit 9a05280
Showing
7 changed files
with
218 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/bash
#
# Converts every *.xml file in an input directory into a plain-text file of
# the same base name (with a .txt extension) in an output directory.
#
# Arguments:
#   $1 - input directory containing *.xml files (trailing slash tolerated)
#   $2 - output directory; created if it does not exist
#
# For each input file, only lines containing '<milestone unit="sentence"' are
# kept. The sed stages then:
#   1. rewrite  tgn,ID">TEXT  as  >tgn,ID-TEXT-]]
#   2. join a two-word TEXT ("W1 W2") with a hyphen
#   3. strip every remaining <...> tag

if (( $# < 2 )); then
  printf 'Usage: %s <input-xml-dir> <output-dir>\n' "${0##*/}" >&2
  exit 1
fi

indir=${1%/}
outdir=${2%/}

mkdir -p -- "$outdir" || exit 1

for f in "$indir"/*.xml; do
  # When nothing matches, the glob stays literal; skip it instead of
  # running grep on a non-existent "*.xml" path.
  [[ -e "$f" ]] || continue

  filename=${f##*/}       # strip the directory part
  filename=${filename%.*} # strip the extension

  grep -- '<milestone unit="sentence"' "$f" \
    | sed -re 's/tgn,([^"]+)">([^<]+)/>tgn,\1-\2-]]/' \
    | sed -re 's/tgn,([^"]+)-(\w+) (\w+)-]]/tgn,\1-\2-\3-]]/' \
    | sed 's/<[^<>]*>//g' > "$outdir/$filename.txt"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash
#
# Downloads allCountries.zip into $FIELDSPRING_DIR/data/gazetteers unless a
# file of that name is already present there.
#
# Environment:
#   FIELDSPRING_DIR - Fieldspring's installation directory (required)
#
# Exits non-zero if FIELDSPRING_DIR is unset or the download fails.

if [[ -z "$FIELDSPRING_DIR" ]]; then
  echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." >&2
  exit 1
fi

gazdir=$FIELDSPRING_DIR/data/gazetteers

if [[ ! -e "$gazdir/allCountries.zip" ]]; then
  # Run in a subshell so the caller's working directory is never changed,
  # and only start the download once the cd has actually succeeded.
  (
    cd -- "$gazdir" || exit 1
    wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/allCountries.zip
  ) || exit 1
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash
#
# Converts the original Cwar XML corpus into dev and test corpora in
# Fieldspring format.
#
# Arguments:
#   $1 - directory holding the original Cwar XML (trailing slash tolerated)
#   $2 - path passed to ConvertCwarToGoldCorpus as its KML argument
#   $3 - path passed to ConvertCwarToGoldCorpus as its gazetteer argument
#        (NOTE(review): "gaz" is presumed to mean gazetteer; confirm)
#   $4 - output directory; receives dev/cwar-dev.xml and test/cwar-test.xml
#
# Environment:
#   FIELDSPRING_DIR - Fieldspring's installation directory (required)
#
# Intermediate directories cwarplaintgn, cwarplaintgndev and cwarplaintgntest
# are created in the current working directory and removed at the end.

if [[ -z "$FIELDSPRING_DIR" ]]; then
  echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." >&2
  exit 1
fi

# Without all four arguments the output paths would collapse to /dev and
# /test, so refuse to run.
if (( $# < 4 )); then
  printf 'Usage: %s <orig-cwar-xml-dir> <path-to-kml> <path-to-gaz> <output-dir>\n' "${0##*/}" >&2
  exit 1
fi

origcwarxmldir=${1%/}
pathtokml=$2
pathtogaz=$3
cwarxmloutdir=${4%/}

echo "Converting original Cwar corpus to plain format..."
cwarxml2txttgn.sh "$origcwarxmldir" cwarplaintgn
echo "Splitting corpus into dev and test sets..."
fieldspring --memory 2g run opennlp.fieldspring.tr.app.SplitDevTest cwarplaintgn

# Creates the output directory and both split subdirectories as needed.
mkdir -p -- "$cwarxmloutdir/dev" "$cwarxmloutdir/test"

echo "Converting dev corpus to Fieldspring format..."
fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus \
  cwarplaintgndev "$pathtokml" "$pathtogaz" > "$cwarxmloutdir/dev/cwar-dev.xml"
echo "Converting test corpus to Fieldspring format..."
fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus \
  cwarplaintgntest "$pathtokml" "$pathtogaz" > "$cwarxmloutdir/test/cwar-test.xml"

echo "Deleting temporary files..."
rm -rf cwarplaintgn cwarplaintgndev cwarplaintgntest
echo "Done."
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/bin/bash
#
# Configuration for a resolver evaluation run.
#
# Arguments:
#   $1 - corpus name: tr or cwar
#   $2 - split: dev or test
#   $3 - topidmethod: gt or ner
#   $4 - path to the corpus in XML format (trailing slash tolerated)

corpusname=$1  # tr or cwar
split=$2       # dev or test
topidmethod=$3 # gt or ner
modelsdir=wistr-models-$corpusname$split/

#######################################
# Sets the corpus-dependent pieces of the file names used below.
# Globals:   sercorpusprefix, sercorpussuffix, logfileprefix (written)
# Arguments: $1 - corpus name; "cwar" selects the cwar names, anything
#                 else selects the tr names
#######################################
set_corpus_names() {
  # The value must be expanded and compared: testing the bare word
  # "corpusname" against "cwar" is always false, so cwar was never selected.
  if [[ "$1" == "cwar" ]]; then
    sercorpusprefix=cwar
    sercorpussuffix="-20spd"
    logfileprefix=cwar
  else
    sercorpusprefix=trf
    sercorpussuffix=""
    logfileprefix=trconll
  fi
}
set_corpus_names "$corpusname"

sercorpusfile=${sercorpusprefix}${split}-${topidmethod}-g1dpc${sercorpussuffix}.ser.gz
corpusdir=${4%/}/$split/ # fourth argument is path to corpus in XML format
logfile=enwiki-${logfileprefix}${split}-100.log

mem=8g
|
||
#######################################
# Prints one "&"-separated results row for a labelled run.
#
# Looks up the label line in temp-results.txt, takes it plus the 35 lines
# that follow, and extracts the value after the last ": " on the lines of
# interest.
#
# Globals:   topidmethod (read)
# Arguments: $1 - label line that precedes the run's output in
#                 temp-results.txt (e.g. \oracle, \spider1)
# Outputs:   "<label> & P & R & F" when topidmethod is "ner", otherwise
#            "<label> & mean km & median km & F"
#######################################
printres() {
  local label=$1
  local section first second third

  # -F matches the label as a fixed string. Labels start with a backslash,
  # and as a regex "\s" in "\spider1" means "whitespace", so the label line
  # itself would never match.
  section=$(grep -F -A35 -- "$label" temp-results.txt)

  if [[ "$topidmethod" == "ner" ]]; then
    first=$(grep "P: " <<<"$section" | sed -e 's/^.*: //')
    second=$(grep "R: " <<<"$section" | sed -e 's/^.*: //')
    third=$(grep "F: " <<<"$section" | sed -e 's/^.*: //')
  else
    first=$(grep "Mean error distance (km): " <<<"$section" | sed -e 's/^.*: //')
    second=$(grep "Median error distance (km): " <<<"$section" | sed -e 's/^.*: //')
    # The third column is read from the "F: " line in this mode as well.
    third=$(grep "F: " <<<"$section" | sed -e 's/^.*: //')
  fi

  printf '%s & %s & %s & %s\n' "$label" "$first" "$second" "$third"
}
|
||
#function getmean { | ||
# echo `grep -A25 "$1" temp-results.txt | grep "Mean error distance (km): " | sed -e 's/^.*: //'` | ||
#} | ||
|
||
# Start every run from an empty results file; the experiments below append.
rm -f temp-results.txt

#######################################
# Runs one resolver configuration and prints its results row.
# Appends the label and then the resolver's stdout to temp-results.txt, so
# that printres can locate the output by its label.
# Globals:   mem, corpusdir, sercorpusfile (read)
# Arguments: $1  - label written ahead of the output (e.g. \oracle)
#            $2… - extra options passed to "fieldspring ... resolve"
#######################################
run_experiment() {
  local label=$1
  shift
  # printf writes the backslash-leading label verbatim.
  printf '%s\n' "$label" >> temp-results.txt
  fieldspring --memory "$mem" resolve -i "$corpusdir" -sci "$sercorpusfile" -cf tr "$@" >> temp-results.txt
  printres "$label"
}

# Good to go
run_experiment '\oracle' -r random -oracle

# Good to go
for i in 1 2 3; do
  run_experiment '\rand'"$i" -r random
done

# Good to go
run_experiment '\population' -r pop

# Good to go
for i in 1 2 3; do
  run_experiment '\spider'"$i" -r wmd -it 10
done

# Disabled experiments, kept for reference.
# NOTE(review): as written, these fieldspring commands do not append their
# output to temp-results.txt, so printres would find only the label line if
# they were re-enabled unchanged.

#echo "\tripdl" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pdg
#printres "\tripdl"

#echo "\wistr" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r maxent
#printres "\wistr"

#echo '--- (Necessary for next step) ---';
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pme
#echo '---';

#echo "\wistr+\spider" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf
#printres "\wistr+\spider"

#echo "\trawl" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob
#printres "\trawl"

#echo "\trawl+\spider" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf
#printres "\trawl+\spider"
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters