Skip to content

Commit

Permalink
Added several scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
Mike Speriosu committed Apr 26, 2013
1 parent 7fedf52 commit 9a05280
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 25 deletions.
37 changes: 27 additions & 10 deletions CIAWFBFixer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -44,28 +44,45 @@ object CIAWFBFixer extends App {

//countriesToCoords.foreach(p => println(p._1 + " " + p._2._1 + "," + p._2._2))

val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")(-0)(.*humanPath=\")([^\"]+)(.*)$""".r
val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")([^\"]+)(.*)$""".r
//val line2RE = """^(.*long=\")([^\"]+)(.*lat=\")([^\"]+)(.*)$""".r
//val lineRE = """^(.*lat=\")([^\"]+)(.*long=\")([^\"]+)(.*humanPath=\")([^\"]+)(.*)$""".r
val countryNameRE = """^.*humanPath=\"([^\"]+).*$""".r

val inDir = new File(if(args(1).endsWith("/")) args(1).dropRight(1) else args(1))
val outDir = new File(if(args(2).endsWith("/")) args(2).dropRight(1) else args(2))
for(file <- inDir.listFiles.filter(_.getName.endsWith(".xml"))) {

val out = new BufferedWriter(new FileWriter(outDir+"/"+file.getName))

for(line <- scala.io.Source.fromFile(file).getLines) {
if(line.contains("CIAWFB") && line.contains("long=\"-0\"")) {
val lineRE(beg, lat0, mid, lon0, humpath, countryName, end) = line
/*var beg = ""
var lat0 = ""
var mid = ""
var lon0 = ""
var end = ""*/

var lon = 0.0
for(line <- scala.io.Source.fromFile(file).getLines) {
if(line.contains("CIAWFB") && lineRE.findFirstIn(line) != None) {// && line.contains("long=\"-0\"")) {
//line match {
/*case lineRE => */val lineRE(beg, lat0, mid, lon0, end) = line//; beg = begr; lat0 = lat0r; mid = midr; lon0 = lon0r; end = endr;
//case line2RE => val line2RE(begr, lon0r, midr, lat0r, endr) = line; beg = begr; lat0 = lat0r; mid = midr; lon0 = lon0r; end = endr;
//}
val countryNameRE(countryName) = line
//val countryName = "hi"

//println(line)

var lat = lat0//.toDouble
var lon = lon0//.toDouble
if(countriesToCoords.contains(countryName.toLowerCase)) {
lon = countriesToCoords(countryName.toLowerCase)._2
lat = countriesToCoords(countryName.toLowerCase)._1.toString
lon = countriesToCoords(countryName.toLowerCase)._2.toString
}

var lat = lat0
if(countryName.toLowerCase.equals("vatican"))
lat = countriesToCoords(countryName.toLowerCase)._1.toString
//if(countryName.toLowerCase.equals("vatican"))
// lat = countriesToCoords(countryName.toLowerCase)._1

out.write(beg+lat+mid+lon+humpath+countryName+end+"\n")
out.write(beg+lat+mid+lon+end+"\n")
//println(beg+lat+mid+lon+humpath+countryName+end+"\n")
}
else
Expand Down
15 changes: 15 additions & 0 deletions bin/cwarxml2txttgn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Usage: cwarxml2txttgn.sh INDIR OUTDIR
#
# For every *.xml file in INDIR, writes OUTDIR/<basename>.txt containing the
# lines that hold a <milestone unit="sentence" element, with all markup
# removed and the first tgn-annotated span on each line rewritten as
# "tgn,<id>-<text>-]]".

# Strip one trailing slash so the paths built below never contain "//".
indir=${1%/}
outdir=${2%/}

if [ ! -e "$outdir" ]; then
    mkdir "$outdir"
fi

for f in "$indir"/*.xml
do
    # When nothing matches, bash leaves the glob pattern in place as a literal
    # string; skip it instead of running grep on a file that does not exist.
    [ -e "$f" ] || continue

    filename=$(basename "$f")
    filename=${filename%.*}

    # 1st sed: turn  tgn,ID">TEXT  into  >tgn,ID-TEXT-]]  (first match per line),
    #          which closes the tag early so the annotation survives tag removal.
    # 2nd sed: when the annotated text is two words, join them with a hyphen.
    # 3rd sed: delete every remaining <...> tag.
    grep '<milestone unit="sentence"' "$f" | sed -re 's/tgn,([^"]+)">([^<]+)/>tgn,\1-\2-]]/' | sed -re 's/tgn,([^"]+)-(\w+) (\w+)-]]/tgn,\1-\2-\3-]]/' | sed 's/<[^<>]*>//g' > "$outdir/$filename.txt"
done
17 changes: 17 additions & 0 deletions bin/download-geonames.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Downloads allCountries.zip into $FIELDSPRING_DIR/data/gazetteers unless a
# file of that name is already there. Exits non-zero when FIELDSPRING_DIR is
# unset or when the download (or entering the target directory) fails.

if [ -z "$FIELDSPRING_DIR" ]; then
    echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." >&2
    exit 1
fi

gazdir="$FIELDSPRING_DIR/data/gazetteers"

if [ ! -e "$gazdir/allCountries.zip" ]; then
    # Run in a subshell so the working directory of this script is never
    # changed, and chain with && so wget cannot run in the wrong directory
    # when the cd fails.
    (cd "$gazdir" && wget http://web.corral.tacc.utexas.edu/utcompling/fieldspring-data/allCountries.zip) || exit 1
fi
37 changes: 37 additions & 0 deletions bin/prepare-cwar.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Usage: prepare-cwar.sh ORIG_CWAR_XML_DIR PATH_TO_KML PATH_TO_GAZ CWAR_XML_OUT_DIR
#
# Converts the original Cwar XML corpus to plain text, splits it into dev and
# test sets, and writes the Fieldspring-format corpora to
#   CWAR_XML_OUT_DIR/dev/cwar-dev.xml
#   CWAR_XML_OUT_DIR/test/cwar-test.xml
# Intermediate directories (cwarplaintgn*) are created in the current working
# directory and removed at the end.

if [ -z "$FIELDSPRING_DIR" ]; then
    echo "You must set the environment variable FIELDSPRING_DIR to point to Fieldspring's installation directory." >&2
    exit 1
fi

# Without all four arguments the output paths below would collapse to
# /dev/cwar-dev.xml and /test/cwar-test.xml, so refuse to continue.
if [ $# -lt 4 ]; then
    echo "Usage: $0 <orig-cwar-xml-dir> <path-to-kml> <path-to-gazetteer> <cwar-xml-out-dir>" >&2
    exit 1
fi

# Strip one trailing slash from the directory arguments.
origcwarxmldir=${1%/}
pathtokml=$2
pathtogaz=$3
cwarxmloutdir=${4%/}

echo "Converting original Cwar corpus to plain format..."
cwarxml2txttgn.sh "$origcwarxmldir" cwarplaintgn
echo "Splitting corpus into dev and test sets..."
# NOTE(review): presumably SplitDevTest produces cwarplaintgndev and
# cwarplaintgntest next to its input, since those are read below -- confirm.
fieldspring --memory 2g run opennlp.fieldspring.tr.app.SplitDevTest cwarplaintgn

# -p creates missing parents and is a no-op for directories that already exist.
mkdir -p "$cwarxmloutdir/dev" "$cwarxmloutdir/test"

echo "Converting dev corpus to Fieldspring format..."
fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgndev "$pathtokml" "$pathtogaz" > "$cwarxmloutdir/dev/cwar-dev.xml"
echo "Converting test corpus to Fieldspring format..."
fieldspring --memory 8g run opennlp.fieldspring.tr.app.ConvertCwarToGoldCorpus cwarplaintgntest "$pathtokml" "$pathtogaz" > "$cwarxmloutdir/test/cwar-test.xml"

echo "Deleting temporary files..."
rm -rf cwarplaintgn cwarplaintgndev cwarplaintgntest
echo "Done."

105 changes: 105 additions & 0 deletions bin/runexps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/bin/bash
#
# Usage: runexps.sh CORPUSNAME SPLIT TOPIDMETHOD CORPUS_XML_DIR
#
# Derives the file and directory names used by the experiments below from the
# command-line arguments.

corpusname=$1; # tr or cwar
split=$2; # dev or test
topidmethod=$3; # gt or ner
modelsdir=wistr-models-$corpusname$split/;

# The comparison must use the *value* of corpusname; the bare word
# "corpusname" is never equal to "cwar", which would force the else branch
# for every corpus.
if [ "$corpusname" == "cwar" ]; then
    sercorpusprefix=cwar
    sercorpussuffix="-20spd"
    logfileprefix=cwar
else
    sercorpusprefix=trf
    sercorpussuffix=""
    logfileprefix=trconll
fi

sercorpusfile=$sercorpusprefix$split-$topidmethod-g1dpc$sercorpussuffix.ser.gz;
corpusdir=${4%/}/$split/; # fourth argument is path to corpus in XML format
logfile=enwiki-$logfileprefix$split-100.log;

mem=8g;

# printres LABEL
#
# Prints one "LABEL & v1 & v2 & v3" row for the experiment whose marker line
# LABEL was written to temp-results.txt. The values are taken from the marker
# line and the 35 lines following it:
#   topidmethod == "ner": the "P: ", "R: " and "F: " lines
#   otherwise           : the mean and median error distance lines and "F: "
# Reads the global topidmethod; sets the globals precision/recall/fscore or
# mean/median/accuracy.
function printres {

    # -F makes grep treat the label as a fixed string. The labels start with a
    # backslash, and as a regex "\s" (as in "\spider1") is GNU grep's
    # whitespace class, so that marker line would never be found.
    local section
    section=$(grep -F -A35 -- "$1" temp-results.txt)

    if [ "$topidmethod" == "ner" ]; then

        precision=$(printf '%s\n' "$section" | grep "P: " | sed -e 's/^.*: //')
        recall=$(printf '%s\n' "$section" | grep "R: " | sed -e 's/^.*: //')
        fscore=$(printf '%s\n' "$section" | grep "F: " | sed -e 's/^.*: //')

        # Values are left unquoted so that several matching lines are joined
        # with single spaces on one row.
        echo "$1" "&" $precision "&" $recall "&" $fscore

    else

        mean=$(printf '%s\n' "$section" | grep "Mean error distance (km): " | sed -e 's/^.*: //')
        median=$(printf '%s\n' "$section" | grep "Median error distance (km): " | sed -e 's/^.*: //')
        # NOTE(review): the accuracy column is read from the "F: " line, same
        # as fscore above -- confirm this is the intended source.
        accuracy=$(printf '%s\n' "$section" | grep "F: " | sed -e 's/^.*: //')

        echo "$1" "&" $mean "&" $median "&" $accuracy

    fi
}

#function getmean {
# echo `grep -A25 "$1" temp-results.txt | grep "Mean error distance (km): " | sed -e 's/^.*: //'`
#}

# Start from a clean results file; every experiment below appends to it.
[ ! -e temp-results.txt ] || rm temp-results.txt

# run_and_report LABEL RESOLVER_ARGS...
#
# Appends the marker line LABEL and the output of one "fieldspring resolve"
# run (with RESOLVER_ARGS added after the common options) to temp-results.txt,
# then prints the summary row for LABEL.
function run_and_report {
    local label=$1
    shift
    echo "$label" >> temp-results.txt
    fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr "$@" >> temp-results.txt
    printres "$label"
}

# Good to go
run_and_report "\oracle" -r random -oracle

# Good to go
for trial in 1 2 3
do
    run_and_report "\rand$trial" -r random
done

# Good to go
run_and_report "\population" -r pop

# Good to go
for trial in 1 2 3
do
    run_and_report "\spider$trial" -r wmd -it 10
done

#echo "\tripdl" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pdg
#printres "\tripdl"

#echo "\wistr" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r maxent
#printres "\wistr"

#echo '--- (Necessary for next step) ---';
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob -pme
#echo '---';

#echo "\wistr+\spider" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf
#printres "\wistr+\spider"

#echo "\trawl" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r prob
#printres "\trawl"

#echo "\trawl+\spider" >> temp-results.txt
#fieldspring --memory $mem resolve -i $corpusdir -sci $sercorpusfile -cf tr -im $modelsdir -l $logfile -r wmd -it 10 -rwf
#printres "\trawl+\spider"
3 changes: 0 additions & 3 deletions data/gazetteers/getGeoNames.sh

This file was deleted.

29 changes: 17 additions & 12 deletions src/main/java/opennlp/fieldspring/tr/eval/SignatureEvaluator.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,13 @@ public Report evaluate(Corpus<Token> pred, boolean useSelected) {
if(doOracleEval) {
if(predCandidates.get(context).size() > 0) {
Location closestMatch = getClosestMatch(goldLoc, predCandidates.get(context));
dreport.addDistance(goldLoc.distanceInKm(closestMatch));
double dist = goldLoc.distanceInKm(closestMatch);
dreport.addDistance(dist);
report.incrementTP();
String key = goldLoc.getName().toLowerCase();
if(!errors.containsKey(key))
errors.put(key, new ArrayList<Double>());
errors.get(key).add(dist);
}
}
else {
Expand Down Expand Up @@ -145,19 +150,19 @@ public Report evaluate(Corpus<Token> pred, boolean useSelected) {
}

try {
BufferedWriter errOut = new BufferedWriter(new FileWriter("errors.txt"));
BufferedWriter errOut = new BufferedWriter(new FileWriter("errors.txt"));

for(String toponym : errors.keySet()) {
List<Double> errorList = errors.get(toponym);
double sum = 0.0;
for(double error : errorList) {
sum += error;
for(String toponym : errors.keySet()) {
List<Double> errorList = errors.get(toponym);
double sum = 0.0;
for(double error : errorList) {
sum += error;
}
errOut.write(toponym+" & "+errorList.size()+" & "+(sum/errorList.size())+" & "+sum+"\\\\\n");
}
errOut.write(toponym+" & "+errorList.size()+" & "+(sum/errorList.size())+" & "+sum+"\\\\\n");
}

errOut.close();


errOut.close();

} catch(Exception e) {
e.printStackTrace();
System.exit(1);
Expand Down

0 comments on commit 9a05280

Please sign in to comment.