From 9a05280520c5d54fa8cfa674ca2e4e74ad56e7b4 Mon Sep 17 00:00:00 2001 From: Mike Speriosu Date: Thu, 25 Apr 2013 21:54:11 -0500 Subject: [PATCH] Added several scripts. --- CIAWFBFixer.scala | 37 ++++-- bin/cwarxml2txttgn.sh | 15 +++ bin/download-geonames.sh | 17 +++ bin/prepare-cwar.sh | 37 ++++++ bin/runexps.sh | 105 ++++++++++++++++++ data/gazetteers/getGeoNames.sh | 3 - .../tr/eval/SignatureEvaluator.java | 29 +++-- 7 files changed, 218 insertions(+), 25 deletions(-) create mode 100755 bin/cwarxml2txttgn.sh create mode 100644 bin/download-geonames.sh create mode 100755 bin/prepare-cwar.sh create mode 100755 bin/runexps.sh delete mode 100755 data/gazetteers/getGeoNames.sh diff --git a/CIAWFBFixer.scala b/CIAWFBFixer.scala index 916f677..1ef345f 100644 --- a/CIAWFBFixer.scala +++ b/CIAWFBFixer.scala @@ -44,7 +44,10 @@ object CIAWFBFixer extends App { //countriesToCoords.foreach(p => println(p._1 + " " + p._2._1 + "," + p._2._2)) - val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")(-0)(.*humanPath=\")([^\"]+)(.*)$""".r + val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")([^\"]+)(.*)$""".r + //val line2RE = """^(.*long=\")([^\"]+)(.*lat=\")([^\"]+)(.*)$""".r + //val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")([^\"]+)(.*humanPath=\")([^\"]+)(.*)$""".r + val countryNameRE = """^.*humanPath=\"([^\"]+).*$""".r val inDir = new File(if(args(1).endsWith("/")) args(1).dropRight(1) else args(1)) val outDir = new File(if(args(2).endsWith("/")) args(2).dropRight(1) else args(2)) @@ -52,20 +55,34 @@ object CIAWFBFixer extends App { val out = new BufferedWriter(new FileWriter(outDir+"/"+file.getName)) - for(line <- scala.io.Source.fromFile(file).getLines) { - if(line.contains("CIAWFB") && line.contains("long=\"-0\"")) { - val lineRE(beg, lat0, mid, lon0, humpath, countryName, end) = line + /*var beg = "" + var lat0 = "" + var mid = "" + var lon0 = "" + var end = ""*/ - var lon = 0.0 + for(line <- scala.io.Source.fromFile(file).getLines) { + if(line.contains("CIAWFB") && lineRE.findFirstIn(line) != None) {// && line.contains("long=\"-0\"")) { + //line match { + /*case lineRE => */val lineRE(beg, lat0, mid, lon0, end) = line//; beg = begr; lat0 = lat0r; mid = midr; lon0 = lon0r; end = endr; + //case line2RE => val line2RE(begr, lon0r, midr, lat0r, endr) = line; beg = begr; lat0 = lat0r; mid = midr; lon0 = lon0r; end = endr; + //} + val countryNameRE(countryName) = line + //val countryName = "hi" + + //println(line) + + var lat = lat0//.toDouble + var lon = lon0//.toDouble if(countriesToCoords.contains(countryName.toLowerCase)) { - lon = countriesToCoords(countryName.toLowerCase)._2 + lat = countriesToCoords(countryName.toLowerCase)._1.toString + lon = countriesToCoords(countryName.toLowerCase)._2.toString } - var lat = lat0 - if(countryName.toLowerCase.equals("vatican")) - lat = countriesToCoords(countryName.toLowerCase)._1.toString + //if(countryName.toLowerCase.equals("vatican")) + // lat = countriesToCoords(countryName.toLowerCase)._1 - out.write(beg+lat+mid+lon+humpath+countryName+end+"\n") + out.write(beg+lat+mid+lon+end+"\n") //println(beg+lat+mid+lon+humpath+countryName+end+"\n") } else diff --git a/bin/cwarxml2txttgn.sh b/bin/cwarxml2txttgn.sh new file mode 100755 index 0000000..ceec4c5 --- /dev/null +++ b/bin/cwarxml2txttgn.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +indir=${1%/} +outdir=${2%/} + +if [ ! -e $outdir ]; then + mkdir $outdir +fi + +for f in $indir/*.xml +do + filename=$(basename $f) + filename=${filename%.*} + grep '([^<]+)/>tgn,\1-\2-]]/' | sed -re 's/tgn,([^"]+)-(\w+) (\w+)-]]/tgn,\1-\2-\3-]]/' | sed 's/<[^<>]*>//g' > $outdir/$filename.txt +done diff --git a/bin/download-geonames.sh b/bin/download-geonames.sh new file mode 100644 index 0000000..94cde31 --- /dev/null +++ b/bin/download-geonames.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +if [ -z $FIELDSPRING_DIR ]; then + echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." + exit +fi + +origwd=`pwd` + +if [ ! -e $FIELDSPRING_DIR/data/gazetteers/allCountries.zip ]; then + cd $FIELDSPRING_DIR/data/gazetteers + wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/allCountries.zip +fi + + + +cd $origwd diff --git a/bin/prepare-cwar.sh b/bin/prepare-cwar.sh new file mode 100755 index 0000000..8b7568c --- /dev/null +++ b/bin/prepare-cwar.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ -z $FIELDSPRING_DIR ]; then + echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." + exit +fi + +origcwarxmldir=${1%/} +pathtokml=$2 +pathtogaz=$3 +cwarxmloutdir=${4%/} + +echo "Converting original Cwar corpus to plain format..." +cwarxml2txttgn.sh $origcwarxmldir cwarplaintgn +echo "Splitting corpus into dev and test sets..." +fieldspring --memory 2g run opennlp.fieldspring.tr.app.SplitDevTest cwarplaintgn +if [ ! -e $cwarxmloutdir ]; then + mkdir $cwarxmloutdir +fi +if [ ! -e $cwarxmloutdir/dev ]; then + mkdir $cwarxmloutdir/dev +fi +if [ ! -e $cwarxmloutdir/test ]; then + mkdir $cwarxmloutdir/test +fi + +echo "Converting dev corpus to Fieldspring format..." +fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgndev $pathtokml $pathtogaz > $cwarxmloutdir/dev/cwar-dev.xml +echo "Converting test corpus to Fieldspring format..." +fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgntest $pathtokml $pathtogaz > $cwarxmloutdir/test/cwar-test.xml + +echo "Deleting temporary files..." +rm -rf cwarplaintgn +rm -rf cwarplaintgndev +rm -rf cwarplaintgntest +echo "Done." + diff --git a/bin/runexps.sh b/bin/runexps.sh new file mode 100755 index 0000000..bc4dc3c --- /dev/null +++ b/bin/runexps.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +corpusname=$1; # tr or cwar +split=$2; # dev or test +topidmethod=$3; # gt or ner +modelsdir=wistr-models-$corpusname$split/; +if [ corpusname == "cwar" ]; then + sercorpusprefix=cwar +else + sercorpusprefix=trf +fi +if [ corpusname == "cwar" ]; then + sercorpussuffix="-20spd" +else + sercorpussuffix="" +fi +sercorpusfile=$sercorpusprefix$split-$topidmethod-g1dpc$sercorpussuffix.ser.gz; +corpusdir=${4%/}/$split/; # fourth argument is path to corpus in XML format +if [ corpusname == "cwar" ]; then + logfileprefix=cwar +else + logfileprefix=trconll +fi +logfile=enwiki-$logfileprefix$split-100.log; + +mem=8g; + +function printres { + + if [ $topidmethod == "ner" ]; then + + precision=`grep -A35 "$1" temp-results.txt | grep "P: " | sed -e 's/^.*: //'` + recall=`grep -A35 "$1" temp-results.txt | grep "R: " | sed -e 's/^.*: //'` + fscore=`grep -A35 "$1" temp-results.txt | grep "F: " | sed -e 's/^.*: //'` + + echo $1 "&" $precision "&" $recall "&" $fscore + + else + + mean=`grep -A35 "$1" temp-results.txt | grep "Mean error distance (km): " | sed -e 's/^.*: //'` + median=`grep -A35 "$1" temp-results.txt | grep "Median error distance (km): " | sed -e 's/^.*: //'` + accuracy=`grep -A35 "$1" temp-results.txt | grep "F: " | sed -e 's/^.*: //'` + + echo $1 "&" $mean "&" $median "&" $accuracy + + fi +} + +#function getmean { +# echo `grep -A25 "$1" temp-results.txt | grep "Mean error distance (km): " | sed -e 's/^.*: //'` +#} + +if [ -e temp-results.txt ]; then + rm temp-results.txt +fi + +# Good to go +echo "\oracle" >> temp-results.txt +fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -r random -oracle >> temp-results.txt +printres "\oracle" + +# Good to go +for i in 1 2 3 +do + echo "\rand"$i >> temp-results.txt + fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -r random >> temp-results.txt + printres "\rand"$i +done + +# Good to go +echo "\population" >> temp-results.txt +fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -r pop >> temp-results.txt +printres "\population" + +# Good to go +for i in 1 2 3 +do + echo "\spider"$i >> temp-results.txt + fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -r wmd -it 10 >> temp-results.txt + printres "\spider"$i +done + +#echo "\tripdl" >> temp-results.txt +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pdg +#printres "\tripdl" + +#echo "\wistr" >> temp-results.txt +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r maxent +#printres "\wistr" + +#echo '--- (Necessary for next step) ---'; +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pme +#echo '---'; + +#echo "\wistr+\spider" >> temp-results.txt +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf +#printres "\wistr+\spider" + +#echo "\trawl" >> temp-results.txt +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob +#printres "\trawl" + +#echo "\trawl+\spider" >> temp-results.txt +#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf +#printres "\trawl+\spider" diff --git a/data/gazetteers/getGeoNames.sh b/data/gazetteers/getGeoNames.sh deleted file mode 100755 index f19e68a..0000000 --- a/data/gazetteers/getGeoNames.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -wget http://download.geonames.org/export/dump/allCountries.zip diff --git a/src/main/java/opennlp/fieldspring/tr/eval/SignatureEvaluator.java b/src/main/java/opennlp/fieldspring/tr/eval/SignatureEvaluator.java index 089c255..4f61bc8 100644 --- a/src/main/java/opennlp/fieldspring/tr/eval/SignatureEvaluator.java +++ b/src/main/java/opennlp/fieldspring/tr/eval/SignatureEvaluator.java @@ -112,8 +112,13 @@ public Report evaluate(Corpus pred, boolean useSelected) { if(doOracleEval) { if(predCandidates.get(context).size() > 0) { Location closestMatch = getClosestMatch(goldLoc, predCandidates.get(context)); - dreport.addDistance(goldLoc.distanceInKm(closestMatch)); + double dist = goldLoc.distanceInKm(closestMatch); + dreport.addDistance(dist); report.incrementTP(); + String key = goldLoc.getName().toLowerCase(); + if(!errors.containsKey(key)) + errors.put(key, new ArrayList()); + errors.get(key).add(dist); } } else { @@ -145,19 +150,19 @@ public Report evaluate(Corpus pred, boolean useSelected) { } try { - BufferedWriter errOut = new BufferedWriter(new FileWriter("errors.txt")); + BufferedWriter errOut = new BufferedWriter(new FileWriter("errors.txt")); - for(String toponym : errors.keySet()) { - List errorList = errors.get(toponym); - double sum = 0.0; - for(double error : errorList) { - sum += error; + for(String toponym : errors.keySet()) { + List errorList = errors.get(toponym); + double sum = 0.0; + for(double error : errorList) { + sum += error; + } + errOut.write(toponym+" & "+errorList.size()+" & "+(sum/errorList.size())+" & "+sum+"\\\\\n"); } - errOut.write(toponym+" & "+errorList.size()+" & "+(sum/errorList.size())+" & "+sum+"\\\\\n"); - } - - errOut.close(); - + + errOut.close(); + } catch(Exception e) { e.printStackTrace(); System.exit(1);