diff --git a/karma-cleaning/pom.xml b/karma-cleaning/pom.xml
deleted file mode 100644
index 8ef7ad11c..000000000
--- a/karma-cleaning/pom.xml
+++ /dev/null
@@ -1,140 +0,0 @@
-
- 4.0.0
-
-
- edu.isi
- webkarma
- 0.0.1-SNAPSHOT
-
-
- karma-cleaning
-
-
- 0.9
-
-
-
-
- edu.isi
- karma-util
- ${project.version}
-
-
- junit
- junit
- 4.11
-
-
- javax.mail
- mail
- 1.4
-
-
- org.apache.commons
- commons-math3
- 3.0
-
-
- org.python
- jython-standalone
-
-
- log4j
- log4j
- 1.2.16
-
-
- org.slf4j
- slf4j-api
- 1.6.4
-
-
- org.slf4j
- slf4j-log4j12
- 1.6.4
-
-
- xml-apis
- xml-apis
- 1.0.b2
-
-
-
- de.micromata.jak
- JavaAPIforKml
- 2.2.0
-
-
- com.hp.hpl.jena
- arq
- 2.8.8
-
-
- com.hp.hpl.jena
- jena
- 2.6.4
-
-
- com.hp.hpl.jena
- iri
- 0.8
-
-
- com.hp.hpl.jena
- tdb
- 0.8.10
-
-
- net.sf.opencsv
- opencsv
- 2.3
-
-
- org.antlr
- antlr
- 3.2
-
-
- org.jdom
- jdom
- 1.1.2
-
-
- org.apache.poi
- poi
- 3.8-beta5
-
-
- org.apache.poi
- poi-ooxml
- 3.8-beta5
-
-
- org.apache.commons
- commons-lang3
- 3.1
-
-
- tw.edu.ntu.csie
- libsvm
- 3.17
-
-
- org.perf4j
- perf4j
- 0.9.16
-
-
- commons-lang
- commons-lang
- 2.3
-
-
- org.apache.commons
- commons-math
- 2.2
-
-
-
-
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Checker.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Checker.java
deleted file mode 100644
index d65583c81..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Checker.java
+++ /dev/null
@@ -1,54 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-import java.util.ArrayList;
-
-import edu.isi.karma.cleaning.features.RecordClassifier;
-import edu.isi.karma.cleaning.features.RecordFeatureSet;
-
-/*
- * check whether the transformed results are correct
- */
-public class Checker {
- RecordClassifier clf;
- public Checker()
- {
- RecordFeatureSet rfs1 = new RecordFeatureSet();
- clf = new RecordClassifier(rfs1);
-
- }
- public String binds(String[] exp)
- {
- String res = "";
- if(exp.length == 2)
- {
- res = String.format("bef:%s aft:%s", exp[0],exp[1]);
- }
- else
- {
- res = "NOEXP";
- }
- return res;
- }
- public void train(ArrayList postive, ArrayList negative)
- {
- clf.init();
- for(String[] pos:postive)
- {
- String tmp = binds(pos);
- clf.addTrainingData(tmp, "1");
- }
- for(String[] neg:negative)
- {
- String tmp = binds(neg);
- clf.addTrainingData(tmp, "-1");
- }
- clf.learnClassifer();
- }
-
- public String test(String[] record)
- {
- String line = binds(record);
- String label = clf.getLabel(line);
- return label;
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/FormatFunc.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/FormatFunc.java
deleted file mode 100644
index ec620ff21..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/FormatFunc.java
+++ /dev/null
@@ -1,95 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import edu.isi.karma.cleaning.UtilTools;
-
-/*
- * unseen formats detections:
- * distance to the class center and find the largest distance as the threshold
- *
- * boundary formats:
- * difference of distances to two classes are below 5%
- *
- * record = [id, org, tar, label]
- *
- */
-
-public class FormatFunc implements VerificationFunc {
- private int funid = 1;
- private HashMap cmeans = new HashMap();
- private HashMap mean_var = new HashMap();
- private double[] dmetric= null;
- public FormatFunc(ArrayList records, double[] dmetric)
- {
- dmetric = UtilTools.initArray(dmetric, 1.0);
- this.dmetric = dmetric;
- getMeanandDists(records, dmetric);
-
- }
- //identify the mean vector of each cluster
- private void getMeanandDists(ArrayList records, double[] dmetric)
- {
- HashMap> tmp = new HashMap>();
- for(TransRecord rec:records)
- {
- if(tmp.containsKey(rec.label))
- {
- tmp.get(rec.label).add(rec);
- }
- else
- {
- ArrayList x = new ArrayList();
- x.add(rec);
- tmp.put(rec.label, x);
- }
- }
- // find the means
- for(Map.Entry> stringArrayListEntry : tmp.entrySet())
- {
- ArrayList tdata = stringArrayListEntry.getValue();
- if(!tdata.isEmpty() || tdata.get(0).features.length > 0)
- {
- ArrayList tcl = new ArrayList();
- for(int i =0; i< tdata.size(); i++)
- {
- tcl.add(tdata.get(i).features);
- }
- double[] tmean = UtilTools.sum(tcl);
- tmean = UtilTools.produce(1.0/tdata.size(), tmean);
- cmeans.put(stringArrayListEntry.getKey(), tmean);
- // find the max distances
- // strictly bigger or smaller than [mean-3*delta, mean+3*delta]
- double d_mean = 0;
- double d_mu = 0;
- for(int i =0; i< tdata.size(); i++)
- {
- d_mean += UtilTools.distance(tdata.get(i).features, tmean, dmetric);
- }
- d_mean = d_mean*1.0/tdata.size();
- for(int i =0; i< tdata.size(); i++)
- {
- d_mu += Math.pow(UtilTools.distance(tdata.get(i).features, tmean, dmetric)-d_mean, 2);
- }
- d_mu = Math.sqrt(d_mu/tdata.size());
- double[] x = {d_mean,d_mu};
- mean_var.put(stringArrayListEntry.getKey(), x);
- }
- }
-
- //Prober.printFeatureandWeight(tmp, cmeans, dmetric);
- }
-
- public String verify(TransRecord record) {
- double dist = UtilTools.distance(record.features, cmeans.get(record.label), dmetric);
- //difference STRICTLY bigger than 2 standard deviations [68, 95, 99.7] rule
- if(Math.abs(dist - mean_var.get(record.label)[0]) > 2.0*mean_var.get(record.label)[1])
- {
- return String.valueOf(this.funid);
- }
- return "0";
- }
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/HypoTester.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/HypoTester.java
deleted file mode 100644
index 89597bcf9..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/HypoTester.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-/*
- * Test whether different correctnesses can pass the hypotest
- */
-public class HypoTester {
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/OutlierDetector.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/OutlierDetector.java
deleted file mode 100644
index 43af4e9b9..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/OutlierDetector.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-
-import libsvm.svm_parameter;
-import edu.isi.karma.cleaning.features.RecordClassifier;
-import edu.isi.karma.cleaning.features.RecordFeatureSet;
-
-public class OutlierDetector {
- RecordClassifier clf;
- public OutlierDetector()
- {
- RecordFeatureSet rfs1 = new RecordFeatureSet();
- clf = new RecordClassifier(rfs1, svm_parameter.ONE_CLASS);
- }
-
- public void train(ArrayList tdata)
- {
- for(String line:tdata)
- {
- clf.addTrainingData(line, "1");
- }
- clf.learnClassifer();
- }
- public String getLabel(String input)
- {
- String label = clf.getLabel(input);
- return label;
- }
- public static void main(String[] args)
- {
- OutlierDetector outDet = new OutlierDetector();
- String[] dat = {"A", "AA","B", "BB"};
- String[] tst = {"B", "b", "AAAAAAAAAAAA","."};
- ArrayList data = new ArrayList(Arrays.asList(dat));
- outDet.train(data);
- for(String l:tst)
- {
- String out = outDet.getLabel(l);
- System.out.println(l+": "+out);
- }
- }
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Recommander.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Recommander.java
deleted file mode 100644
index 3c213597b..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/Recommander.java
+++ /dev/null
@@ -1,26 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-import java.util.ArrayList;
-
-/* recommend
- * 1 the outlier
- 2 the points on the boundary
- in the test dataset.
-*/
-public class Recommander {
- public Recommander()
- {
-
- }
- //
- public ArrayList getOutliers()
- {
- ArrayList res = new ArrayList();
- return res;
- }
- public ArrayList getBoundaryPoints()
- {
- ArrayList res = new ArrayList();
- return res;
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/TransRecord.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/TransRecord.java
deleted file mode 100644
index 0ac2ebfaf..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/TransRecord.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-public class TransRecord {
- public String Id = "";
- public String org = "";
- public String tar = "";
- public String label = "";
- public String correct = "f";
- public double[] features = null;
- public TransRecord(String Id, String org, String tar, String lab, double[] feats)
- {
- this.Id = Id;
- this.org = org;
- this.tar = tar;
- this.label = lab;
- this.features = feats;
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/VerificationFunc.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/VerificationFunc.java
deleted file mode 100644
index f4857c94b..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/VerificationFunc.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-
-public interface VerificationFunc {
- //label a record
- //label 0 correct, >=1 doubious
- public String verify(TransRecord record);
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/ViewFunc.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/ViewFunc.java
deleted file mode 100644
index 45d058d3c..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/Correctness/ViewFunc.java
+++ /dev/null
@@ -1,120 +0,0 @@
-package edu.isi.karma.cleaning.Correctness;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Vector;
-
-import edu.isi.karma.cleaning.GrammarTreeNode;
-import edu.isi.karma.cleaning.Partition;
-import edu.isi.karma.cleaning.ProgSynthesis;
-import edu.isi.karma.cleaning.ProgramRule;
-import edu.isi.karma.cleaning.Template;
-import edu.isi.karma.cleaning.Traces;
-
-public class ViewFunc implements VerificationFunc {
- private HashMap data = new HashMap();
- private String contextId;
- public ViewFunc(ArrayList records, ProgSynthesis ps, ProgramRule pr, String contextId)
- {
- this.contextId = contextId;
- Vector pars = ps.myprog.partitions;
- HashMap> resHashMap = cluster(records);
- for(Partition p: pars)
- {
- handlePartition(resHashMap.get(p.label), p.trace, pr.getStringRule(p.label));
- }
- }
- public HashMap> cluster(ArrayList reds)
- {
- HashMap> res = new HashMap>();
- for(TransRecord r: reds)
- {
- if(res.containsKey(r.label))
- {
- res.get(r.label).add(r);
- }
- else {
- ArrayList line = new ArrayList();
- line.add(r);
- res.put(r.label, line);
- }
- }
- return res;
- }
- public void handlePartition(ArrayList records, Traces trace, String prog) {
-
- if (identifyIncorrRecord(records, prog)) {
- return;
- } else {
- identifyRecord(records, trace);
- }
- }
-
- // detect the records which prog failed on
- public boolean identifyIncorrRecord(ArrayList records,
- String prog) {
- ProgramRule pr = new ProgramRule(prog, contextId);
- boolean res = true;
- for (TransRecord r : records) {
- String orgString = r.org;
- String tar = pr.transform(orgString);
- if (tar.compareTo(r.tar) != 0) {
- res = false;
- data.put(r.org, "1");
- }
- }
- return res;
- }
-
- public void identifyRecord(ArrayList records, Traces trace) {
- ArrayList all = new ArrayList();
- Collection tlines = trace.traceline.values();
- all.addAll(tlines);
- ArrayList llines = new ArrayList();
- for (Integer k : trace.loopline.keySet()) {
- llines.addAll(trace.loopline.get(k).values());
- }
- all.addAll(llines);
- ArrayList rdata = new ArrayList();
- for (TransRecord tr : records) {
- rdata.add(tr.org);
- }
- int cnt = 0;
- HashMap>> views = new HashMap>>();
- for (Template t : all) {
- Vector bd = t.body;
- ArrayList> equviViews = new ArrayList>();
- for (GrammarTreeNode gt : bd) {
- String rule = "";
- HashSet vs = new HashSet();
- ArrayList lviews = new ArrayList();
- do {
- rule = gt.toProgram();
- ProgramRule pr = new ProgramRule(rule,contextId);
- String ares = "";
- for (String s : rdata) {
- ares += pr.transform(s);
- }
- if (!vs.contains(ares)) {
- lviews.add(rule);
- }
- } while (rule.compareTo("null") != 0);
- equviViews.add(lviews);
- }
- cnt++;
- views.put(String.valueOf(cnt), equviViews);
- }
-
-
- }
-
- public String verify(TransRecord record) {
- if (data.containsKey(record.org)) {
- return data.get(record.org);
- }
- return "0";
- }
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/DataPreProcessor.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/DataPreProcessor.java
deleted file mode 100644
index 5c3db7a2c..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/DataPreProcessor.java
+++ /dev/null
@@ -1,169 +0,0 @@
-package edu.isi.karma.cleaning;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Vector;
-
-import edu.isi.karma.cleaning.features.Feature;
-import edu.isi.karma.cleaning.features.RecordFeatureSet;
-//identify candidate tokens
-//vectorize string to feature vectors
-//decorrelate features
-//rescale features
-
-public class DataPreProcessor {
- public Collection data;
- HashMap data2Vector = new HashMap();
- RecordFeatureSet rfs = new RecordFeatureSet();
-
- public DataPreProcessor(Collection data) {
- this.data = data;
- }
- public HashMap getStandardData()
- {
- return data2Vector;
- }
- public void run() {
- Vector toks = buildDict(data);
- // vectorize String
- String[] vocb = toks.toArray(new String[toks.size()]);
- rfs.addVocabulary(vocb);
- HashMap xHashMap = vectorize(data);
- resacle(xHashMap);
- // deCorrelat features
- ArrayList toremove = deCorrelate(xHashMap);
- // update the featureSet and Vectorize the data again
- rfs.removeFeatures(toremove);
- // resvectorize the data
- xHashMap = vectorize(data);
- // rescale the data
- // rescale each feature
- resacle(xHashMap);
- this.data2Vector = xHashMap;
- }
- public void resacle(HashMap xHashMap)
- {
- double[] maxvals = new double[rfs.getFeatureNames().size()];
- maxvals = UtilTools.initArray(maxvals, -1);
- double[] minvals = new double[rfs.getFeatureNames().size()];
- minvals = UtilTools.initArray(minvals, Double.MAX_VALUE);
- for (double[] tmp : xHashMap.values()) {
- for (int i = 0; i < tmp.length; i++) {
- if (tmp[i] > maxvals[i]) {
- maxvals[i] = tmp[i];
- }
- if (tmp[i] < minvals[i]) {
- minvals[i] = tmp[i];
- }
- }
- }
- for (Entry stringEntry : xHashMap.entrySet()) {
- double[] value = stringEntry.getValue();
- for (int i = 0; i < value.length; i++) {
- if (maxvals[i] > minvals[i]) {
- double tmpval = (value[i] - minvals[i])/(maxvals[i] - minvals[i]);
- value[i] = Math.pow(tmpval, 0.5);
- } else {
- value[i] = 0;
- }
- }
- xHashMap.put(stringEntry.getKey(), value);
- }
- }
- public HashMap vectorize(Collection data) {
- HashMap res = new HashMap();
- for (String line : data) {
- if (!res.containsKey(line)) {
- double[] row = getFeatureArray(line);
- res.put(line, row);
- }
- }
- return res;
- }
-
- public ArrayList deCorrelate(HashMap data) {
- ArrayList toRemove = new ArrayList();
- HashSet signs = new HashSet();
- // build singature for each feature
- for (int i = 0; i < rfs.getFeatureNames().size(); i++) {
- String sg = "";
- for (Entry stringEntry : data.entrySet()) {
- sg += stringEntry.getValue()[i]+"\n";
- }
- if (signs.contains(sg)) {
- toRemove.add(i);
- } else {
- signs.add(sg);
- }
- }
- return toRemove;
-
- }
-
- public double[] getFeatureArray(String s) {
- Collection cfeat = rfs.computeFeatures(s, "");
- Feature[] x = cfeat.toArray(new Feature[cfeat.size()]);
- double[] res = new double[x.length];
- for (int i = 0; i < x.length; i++) {
- res[i] = x[i].getScore();
- }
- return res;
- }
-
- public static Vector buildDict(Collection data) {
- HashMap mapHashSet = new HashMap();
- for (String pair : data) {
- String s1 = pair;
- if (s1.contains("<_START>")) {
- s1 = s1.replace("<_START>", "");
- }
- if (s1.contains("<_END>")) {
- s1 = s1.replace("<_END>", "");
- }
- Ruler r = new Ruler();
- r.setNewInput(s1);
- Vector v = r.vec;
- HashSet curRow = new HashSet();
- for (TNode t : v) {
- String k = t.text;
- k = k.replaceAll("[0-9]+", "DIGITs");
- // filter punctuation
- if (k.trim().length() == 1 && !Character.isLetterOrDigit(k.charAt(0))) {
- continue;
- }
- if (k.trim().length() == 0)
- continue;
- // only consider K once in one row
- if (curRow.contains(k)) {
- continue;
- } else {
- curRow.add(k);
- }
- if (mapHashSet.containsKey(k)) {
- mapHashSet.put(k, mapHashSet.get(k) + 1);
- } else {
- mapHashSet.put(k, 1);
- }
- }
- }
- // prune infrequent terms
- int thresdhold = (int) (data.size() * 0.005);
- Iterator> iter = mapHashSet.entrySet()
- .iterator();
- while (iter.hasNext()) {
- Entry e = iter.next();
- if (e.getValue() < thresdhold) {
- iter.remove();
- }
- }
- Vector res = new Vector();
- res.addAll(mapHashSet.keySet());
- return res;
- }
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/EmailNotification.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/EmailNotification.java
deleted file mode 100644
index 8289f1dd0..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/EmailNotification.java
+++ /dev/null
@@ -1,75 +0,0 @@
-package edu.isi.karma.cleaning;
-
-import java.util.Properties;
-
-import javax.activation.DataHandler;
-import javax.activation.DataSource;
-import javax.activation.FileDataSource;
-import javax.mail.BodyPart;
-import javax.mail.Message;
-import javax.mail.MessagingException;
-import javax.mail.Multipart;
-import javax.mail.PasswordAuthentication;
-import javax.mail.Session;
-import javax.mail.Transport;
-import javax.mail.internet.InternetAddress;
-import javax.mail.internet.MimeBodyPart;
-import javax.mail.internet.MimeMessage;
-import javax.mail.internet.MimeMultipart;
-
-public class EmailNotification {
-
- public void notify(boolean real, String title)
- {
- if(real)
- {
- final String username = "wb_4365@163.com";
- final String password = "225015";
-
- Properties props = new Properties();
- props.put("mail.smtp.auth", "true");
- // props.put("mail.smtp.starttls.enable", "true");
- props.put("mail.smtp.host", "smtp.163.com");
- props.put("mail.smtp.port", "25");
-
- Session session = Session.getInstance(props,
- new javax.mail.Authenticator() {
- protected PasswordAuthentication getPasswordAuthentication() {
- return new PasswordAuthentication(username, password);
- }
- });
-
- try {
-
- Message message = new MimeMessage(session);
- message.setFrom(new InternetAddress("wb_4365@163.com"));
- message.setRecipients(Message.RecipientType.TO,
- InternetAddress.parse("bowu365@gmail.com"));
- message.setSubject("Experiment Done: "+title);
- // Create the message part
- BodyPart messageBodyPart = new MimeBodyPart();
- // Fill the message
- messageBodyPart.setText("This is message body");
- Multipart multipart = new MimeMultipart();
- // Set text message part
- multipart.addBodyPart(messageBodyPart);
- // Part two is attachment
- messageBodyPart = new MimeBodyPart();
- String filename = "./log/mylog.txt";
- DataSource source = new FileDataSource(filename);
- messageBodyPart.setDataHandler(new DataHandler(source));
- messageBodyPart.setFileName(filename);
- multipart.addBodyPart(messageBodyPart);
- message.setContent(multipart);
- Transport.send(message);
- System.out.println("Done");
-
- } catch (MessagingException e) {
- throw new RuntimeException(e);
- }
- }
- }
- public static void main(String[] args) {
-
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleCluster.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleCluster.java
deleted file mode 100644
index 849386455..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleCluster.java
+++ /dev/null
@@ -1,684 +0,0 @@
-package edu.isi.karma.cleaning;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Vector;
-
-
-import edu.isi.karma.cleaning.features.Feature;
-
-public class ExampleCluster {
- public HashMap legalParitions = new HashMap();
- Vector examples = new Vector();
- HashSet exampleInputs = new HashSet();
- Vector> constraints = new Vector>();
- public ProgSynthesis pSynthesis; // data
- HashMap> uorgclusters = new HashMap>();
- HashMap string2Vector = new HashMap();
- int unlabelDataAmount = 10;
- double assignThreshold = 0.1;
- public int featuresize = 0;
- public int failedCnt = 0;
- public double[] weights = {};
- String contextId;
- public static enum method {
- CP, CPIC, SP, SPIC, DP, DPIC
- };
-
- public static method option = method.DPIC;
-
- /*
- * example are (id, (input, output)) constrain: true->must link constraints,
- * false->cannot link constraints
- */
- public ExampleCluster(String contextId) {
- this.contextId = contextId;
- }
-
- public ExampleCluster(ProgSynthesis pSynthesis, Vector examples,
- HashMap sData) {
- this.pSynthesis = pSynthesis;
- this.examples = examples;
- this.string2Vector = sData;
- this.featuresize = pSynthesis.featureSet.getFeatureNames().size();
- weights = new double[featuresize];
- for (int i = 0; i < weights.length; i++) {
- weights[i] = 1;
- }
- for (Partition p : examples) {
- for (Vector nodes : p.orgNodes) {
- exampleInputs.add(UtilTools.print(nodes));
- }
- }
- init();
- }
-
- public void updateWeights(double[] weight) {
- if (weight != null)
- this.weights = weight;
- }
-
- // accepting constraints from previous iterations
- public void updateConstraints(Vector> cnts) {
- if (option == method.CPIC || option == method.DPIC
- || option == method.SPIC) {
- HashMap> xHashMap1 = new HashMap>();
- for (Vector group : cnts) {
- xHashMap1.put(constraintKey(group), group);
- }
- HashSet xHashSet2 = new HashSet();
- for (Vector group : constraints) {
- xHashSet2.add(constraintKey(group));
- }
- for (Map.Entry> stringVectorEntry : xHashMap1.entrySet()) {
- if (!xHashSet2.contains(stringVectorEntry.getKey())) {
- constraints.add(stringVectorEntry.getValue());
- }
- }
- // update islegal ds
- for (Vector group : constraints) {
- ArrayList g = new ArrayList();
- for (String[] p : group) {
- String line = String.format("%s, %s\n", p[0], p[1]);
- g.add(line);
- }
- String res = "";
- res = UtilTools.createkey(new ArrayList(group));
- legalParitions.put(res, false);
- }
- }
- }
-
- public Vector> getConstraints() {
- return constraints;
- }
-
- public String constraintKey(Vector group) {
- ArrayList xArrayList = new ArrayList();
- for (String[] e : group) {
- xArrayList.add(Arrays.toString(e));
- }
- Collections.sort(xArrayList);
- return xArrayList.toString();
- }
-
- public void init() {
- if (option == method.DP || option == method.DPIC) {
- this.unlabelDataAmount = 10;
- } else {
- this.unlabelDataAmount = 0;
- }
- }
-
- public void diagnose() {
- System.out.println("" + this.pSynthesis.featureSet.getFeatureNames());
- System.out.println("" + Arrays.toString(weights));
- }
-
- // adaptive partition program learning
- public Vector adaptive_cluster_weightEuclidean(
- Vector pars) {
- // single example
- if (pars.size() == 1) {
- ProgramAdaptator pAdapter = new ProgramAdaptator(contextId);
- ArrayList exps = UtilTools
- .extractExamplesinPartition(pars);
- pAdapter.adapt(pSynthesis.msGer.exp2Space,
- pSynthesis.msGer.exp2program, exps);
- return pars;
- }
- while (true) {
- // find partitions with the smallest distance
- double mindist = Double.MAX_VALUE;
- int x_ind = -1;
- int y_ind = -1;
- /* print the partitioning information* */
- // ProgTracker.printPartition(pars);
- // ProgTracker.printConstraints(constraints);
- /***/
- for (int i = 0; i < pars.size(); i++) {
- for (int j = i + 1; j < pars.size(); j++) {
- String key = getStringKey(pars.get(i), pars.get(j));
- boolean good = true;
- for (Map.Entry stringBooleanEntry : legalParitions.entrySet()) {
- if (!stringBooleanEntry.getValue() && key.indexOf(stringBooleanEntry.getKey()) != -1) {
- good = false;
- break;
- }
- }
- if (!good) {
- legalParitions.put(key, false);
- continue;
- }
- double par_dist = getDistance(pars.get(i), pars.get(j),
- pars);
- if (par_dist < mindist) {
- mindist = par_dist;
- x_ind = i;
- y_ind = j;
- }
- }
- }
- if (x_ind == -1 || y_ind == -1) {
- break;
- }
- /* print the partitioning information* */
- // ProgTracker.printPartitions(pars.get(x_ind), pars.get(y_ind));
- /***/
- Partition z = pars.get(x_ind).mergewith(pars.get(y_ind));
-
- if (adaptive_isLegalPartition(z)) {
- legalParitions.put(z.getHashKey(), true);
- // update the partition vector
- pars = UpdatePartitions(x_ind, y_ind, pars);
- continue;
- } else {
- legalParitions.put(
- getStringKey(pars.get(x_ind), pars.get(y_ind)), false);
- // update the constraints
- Vector clique = new Vector();
- for (int k = 0; k < pars.get(x_ind).orgNodes.size(); k++) {
- String org = UtilTools.print(pars.get(x_ind).orgNodes
- .get(k));
- String tar = UtilTools.print(pars.get(x_ind).tarNodes
- .get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- for (int k = 0; k < pars.get(y_ind).orgNodes.size(); k++) {
- String org = UtilTools.print(pars.get(y_ind).orgNodes
- .get(k));
- String tar = UtilTools.print(pars.get(y_ind).tarNodes
- .get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- constraints.add(clique);
- // update the distance metrics
- updateDistanceMetric(pars);
- }
- }
- if (pars.size() > 1)
- updateDistanceMetric(pars); // tune the final weight given the
- // current
- // clusters
- // assign ulabled data to each partition
- for (Partition p : pars) {
- p.orgUnlabeledData.clear();
- }
- if (pars.size() >= 2) {
- assignUnlabeledData(pars);
- }
- //this.diagnose();
- return pars;
- }
-
- // use distorted distance function to cluster partitions
- public Vector cluster_weigthEuclidean(Vector pars) {
- // if (this.constraints.size() > 0)
- // updateDistanceMetric(pars);
-
- while (true) {
- // find partitions with the smallest distance
- double mindist = Double.MAX_VALUE;
- int x_ind = -1;
- int y_ind = -1;
- /* print the partitioning information* */
- // ProgTracker.printPartition(pars);
- // ProgTracker.printConstraints(constraints);
- /***/
- for (int i = 0; i < pars.size(); i++) {
- for (int j = i + 1; j < pars.size(); j++) {
- String key = getStringKey(pars.get(i), pars.get(j));
- boolean good = true;
- for (Map.Entry stringBooleanEntry : legalParitions.entrySet()) {
- if (!stringBooleanEntry.getValue() && key.indexOf(stringBooleanEntry.getKey()) != -1) {
- good = false;
- break;
- }
- }
- if (!good) {
- legalParitions.put(key, false);
- continue;
- }
- // double par_dist = getDistance(pars.get(i), pars.get(j));
- // double par_dist = getCompScore(pars.get(i), pars.get(j),
- // pars);// sumit heuristic
- double par_dist = getDistance(pars.get(i), pars.get(j),
- pars);
- if (par_dist < mindist) {
- mindist = par_dist;
- x_ind = i;
- y_ind = j;
- }
- }
- }
- if (x_ind == -1 || y_ind == -1) {
- break;
- }
- /* print the partitioning information* */
- // ProgTracker.printPartitions(pars.get(x_ind), pars.get(y_ind));
- /***/
- Partition z = pars.get(x_ind).mergewith(pars.get(y_ind));
- if (isLegalPartition(z)) {
- legalParitions.put(z.getHashKey(), true);
- // update the partition vector
- pars = UpdatePartitions(x_ind, y_ind, pars);
- continue;
- } else {
- legalParitions.put(
- getStringKey(pars.get(x_ind), pars.get(y_ind)), false);
- // update the constraints
- Vector clique = new Vector();
- for (int k = 0; k < pars.get(x_ind).orgNodes.size(); k++) {
- String org = UtilTools.print(pars.get(x_ind).orgNodes
- .get(k));
- String tar = UtilTools.print(pars.get(x_ind).tarNodes
- .get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- for (int k = 0; k < pars.get(y_ind).orgNodes.size(); k++) {
- String org = UtilTools.print(pars.get(y_ind).orgNodes
- .get(k));
- String tar = UtilTools.print(pars.get(y_ind).tarNodes
- .get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- constraints.add(clique);
- // update the distance metrics
- updateDistanceMetric(pars);
- }
- }
- if (pars.size() > 1)
- updateDistanceMetric(pars); // tune the final weight given the
- // current
- // clusters
- // assign ulabled data to each partition
- for (Partition p : pars) {
- p.orgUnlabeledData.clear();
- }
- if (pars.size() >= 2) {
- assignUnlabeledData(pars);
- }
- this.diagnose();
- return pars;
- }
-
- public void assignUnlabeledData(Vector pars) {
- HashMap dists = new HashMap();
- // find the distance between partitions
- for (int i = 0; i < pars.size(); i++) {
- for (int j = i + 1; j < pars.size(); j++) {
- String d = getStringKey(pars.get(i), pars.get(j));
- if (!dists.containsKey(d)) {
- dists.put(d, getDistance(pars.get(i), pars.get(j)));
- }
- }
- }
- HashMap> testResult = new HashMap>();
-
- for (String val : string2Vector.keySet()) {
- Partition p_index = null;
- Partition p_index_2nd = null;
- double min_val = Double.MAX_VALUE;
- double min_val_2nd = Double.MAX_VALUE;
- // find the two shortest distances.
- for (Partition p : pars) {
- double dist = getDistance(val, p);
- // /
- // /System.out.println(String.format("%s, %s: %f", val, p.label,
- // dist));
- // /
- if (dist < min_val) {
- min_val_2nd = min_val;
- p_index_2nd = p_index;
- min_val = dist;
- p_index = p;
-
- } else if (dist >= min_val && dist < min_val_2nd) {
- min_val_2nd = dist;
- p_index_2nd = p;
- }
-
- }
- double var_dist = min_val_2nd - min_val;
- String tKey = getStringKey(p_index_2nd, p_index);
- double pDist = dists.get(tKey);
- if (var_dist > pDist * assignThreshold) {
- if (!testResult.containsKey(p_index)) {
- HashMap cluster = new HashMap();
- cluster.put(val, min_val);
- testResult.put(p_index, cluster);
- } else {
- testResult.get(p_index).put(val, min_val);
- }
- }
- /*
- * if (p_index.orgUnlabeledData.size() < 10) {
- * p_index.orgUnlabeledData.add(val); }
- */
- }
- for (Map.Entry> partitionHashMapEntry : testResult.entrySet()) {
- Map, ?> dicttmp = UtilTools.sortByComparator(partitionHashMapEntry.getValue());
- /** print unlabeled data **/
- // System.out.println("Partition: " + key.label);
- // ProgTracker.printUnlabeledData(dicttmp);
- /****/
- int cnt = 0;
- for (Object xkey : dicttmp.keySet()) {
- if (cnt < unlabelDataAmount * partitionHashMapEntry.getKey().orgNodes.size()) {
- if (exampleInputs.contains((String) xkey))// exclude
- // examples from
- // unlabeled
- // data
- continue;
- partitionHashMapEntry.getKey().orgUnlabeledData.add((String) xkey);
- cnt++;
- } else {
- break;
- }
- }
- }
- }
-
- // current return all the examples.
- // monitor whether all constraints are positive.
- public ArrayList findMaximalSeperated() {
- ArrayList dists = new ArrayList();
- for (Partition e : examples) {
- for (Vector elem : e.orgNodes) {
- String line = UtilTools.print(elem);
- dists.add(line);
- }
- }
- return dists;
- }
-
- public ArrayList> convertStringSetToContrainMatrix(
- ArrayList strings) {
- ArrayList> res = new ArrayList>();
- for (int i = 0; i < strings.size(); i++) {
- for (int j = i + 1; j < strings.size(); j++) {
- String s1 = strings.get(i);
- String s2 = strings.get(j);
- double[] s1_vec = getFeatureArray(s1);
- double[] s2_vec = getFeatureArray(s2);
- ArrayList xArrayList = new ArrayList();
- for (int k = 0; k < s1_vec.length; k++) {
- xArrayList.add(s2_vec[k] - s1_vec[k]); // does sign matter?
- }
- res.add(xArrayList);
- }
- }
- return res;
- }
-
- // input with current contraints
- public void updateDistanceMetric(Vector pars) {
- if (option == method.DP || option == method.DPIC) {
- GradientDecendOptimizer gdo = new GradientDecendOptimizer();
- // calculate example array and individual groups
- ArrayList centers = new ArrayList();
- ArrayList> individuals = new ArrayList>();
- for (Partition p : pars) {
- ArrayList list = new ArrayList();
- double[] center = new double[featuresize];
- center = UtilTools.initArray(center, 0);
- for (Vector org : p.orgNodes) {
- String res = UtilTools.print(org);
- double[] vec = string2Vector.get(res);
- list.add(vec);
- center = addArray(center, vec);
- }
- individuals.add(list);
- center = UtilTools.produce(1.0 / p.orgNodes.size(), center);
- centers.add(center);
- }
- // calculate instance array
- ArrayList instances = new ArrayList();
- for (Map.Entry stringEntry : string2Vector.entrySet()) {
- double[] elem = stringEntry.getValue();
- instances.add(elem);
- }
- // calculate constraint array
- ArrayList> constraintgroup = new ArrayList>();
- for (Vector consts : constraints) {
- ArrayList group = new ArrayList();
- for (String[] exp : consts) {
- double[] e = string2Vector.get(exp[0]);
- group.add(e);
- }
- constraintgroup.add(group);
- }
- double[] w = gdo.doOptimize(centers, instances, constraintgroup,
- individuals, this.weights);
- this.weights = w;
- } else {
- this.weights = new double[featuresize];
- weights = UtilTools.initArray(this.weights, 1);
- }
- }
-
- public double[] getFeatureArray(String s) {
- Collection cfeat = pSynthesis.featureSet
- .computeFeatures(s, "");
- Feature[] x = cfeat.toArray(new Feature[cfeat.size()]);
- double[] res = new double[x.length];
- for (int i = 0; i < x.length; i++) {
- res[i] = x[i].getScore();
- }
- return res;
- }
-
- public double getDistance(String a, Partition b) {
- // find a string
- double[] x = string2Vector.get(a);
- double mindist = Double.MAX_VALUE;
- for (Vector orgs : b.orgNodes) {
- double[] e = string2Vector.get(UtilTools.print(orgs));
- double d = getDistance(x, e);
- if (d < mindist)
- mindist = d;
- }
- return mindist;
- }
-
- public double getDistance(Partition a, Partition b, Vector pars) {
- double res = 0.0;
- if (option == method.DPIC || option == method.DP) {
- res = getDistance(a, b); // use the closest points' distance as the
- // cluster distance
- } else if (option == method.SP || option == method.SPIC) {
- res = getCompScore(a, b, pars);
- } else {
- double[] x = getPartitionVector(a);
- double[] y = getPartitionVector(b);
- res = UtilTools.distance(x, y);
- }
- return res;
- }
-
- /*
- * public double getDistance(Partition a, Partition b) { double[] x =
- * getPartitionVector(a); double[] y = getPartitionVector(b); return
- * getDistance(x, y); }
- */
-
- public double getDistance(Partition a, Partition b) {
- double mindist = Double.MAX_VALUE;
- for (Vector orgs : a.orgNodes) {
- String e = UtilTools.print(orgs);
- double d = getDistance(e, b);
- if (d < mindist)
- mindist = d;
- }
- return mindist;
- }
-
- public double getCompScore(Partition a, Partition b, Vector pars) {
- String key = getStringKey(a, b);
- for (Map.Entry stringBooleanEntry : legalParitions.entrySet()) {
- if (key.indexOf(stringBooleanEntry.getKey()) != -1 && !stringBooleanEntry.getValue()) {
- return Double.MAX_VALUE;
- }
- }
- Partition p = a.mergewith(b);
- if (!isLegalPartition(p)) {
- legalParitions.put(key, false);
- // update the constraints
- Vector clique = new Vector();
- for (int k = 0; k < a.orgNodes.size(); k++) {
- String org = UtilTools.print(a.orgNodes.get(k));
- String tar = UtilTools.print(a.tarNodes.get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- for (int k = 0; k < b.orgNodes.size(); k++) {
- String org = UtilTools.print(b.orgNodes.get(k));
- String tar = UtilTools.print(b.tarNodes.get(k));
- String[] pair = { org, tar };
- clique.add(pair);
- }
- constraints.add(clique);
- return Double.MAX_VALUE;
- }
- double validCnt = 1e-3; // avoid overflowing
- for (int x = 0; x < pars.size(); x++) {
- if (pars.get(x) == a || pars.get(x) == b) {
- continue;
- }
- Partition q = p.mergewith(pars.get(x));
- if (isLegalPartition(q)) {
- validCnt++;
- }
- }
- return 1.0 / validCnt;
- }
-
- public double getDistance(double[] a, double[] b) {
- double sum = 0.0;
- for (int i = 0; i < a.length; i++) {
- sum += Math.pow(a[i] - b[i], 2) * weights[i];
- }
-
- return Math.sqrt(sum);
- }
- //check b can be copied from a
- public boolean iscovered(String a, String b)
- {
- String[] elems = b.split("\\*");
- boolean covered = true;
- for(String e: elems)
- {
- if(a.indexOf(e)== -1)
- {
- covered = false;
- break;
- }
- }
- return covered;
- }
- public boolean adaptive_isLegalPartition(Partition p) {
- if (p == null) {
- failedCnt++;
- return false;
- }
-
- String key = p.getHashKey();
- if (legalParitions.containsKey(key)) {
- return legalParitions.get(key);
- }
- // test whether its subset fails
- for (Map.Entry stringBooleanEntry : legalParitions.entrySet()) {
- if (!stringBooleanEntry.getValue() && iscovered(key, stringBooleanEntry.getKey())) {
- return false;
- }
- }
- ProgramAdaptator pAdapter = new ProgramAdaptator(contextId);
- ArrayList nPs = new ArrayList();
- nPs.add(p);
- ArrayList examps = UtilTools.extractExamplesinPartition(nPs);
- String fprogram = pAdapter.adapt(pSynthesis.msGer.exp2Space,pSynthesis.msGer.exp2program, examps);
- if (fprogram.indexOf("null")!= -1) {
- failedCnt++;
- legalParitions.put(key, false);
- return false;
- } else {
- legalParitions.put(key, true);
- return true;
- }
- }
-
- public boolean isLegalPartition(Partition p) {
- if (p == null) {
- failedCnt++;
- return false;
- }
- String key = p.getHashKey();
- if (legalParitions.containsKey(key)) {
- return legalParitions.get(key);
- }
- // test whether its subset fails
- for (Map.Entry stringBooleanEntry : legalParitions.entrySet()) {
- if (!stringBooleanEntry.getValue() && key.indexOf(stringBooleanEntry.getKey()) != -1) {
- return false;
- }
- }
- Vector xPar = new Vector();
- xPar.add(p);
- Collection cpr = pSynthesis.producePrograms(xPar);
- if (cpr == null || cpr.isEmpty()) {
- failedCnt++;
- legalParitions.put(key, false);
- return false;
- } else {
- legalParitions.put(key, true);
- return true;
- }
- }
-
- public Vector UpdatePartitions(int i, int j,
- Vector pars) {
- Partition p = pars.get(i).mergewith(pars.get(j));
- Vector res = new Vector();
- res.addAll(pars);
- res.set(i, p);
- res.remove(j);
- return res;
- }
-
- public String getStringKey(Partition a, Partition b) {
- ArrayList pars = new ArrayList();
- pars.add(a);
- pars.add(b);
- String res = Partition.getStringKey(pars);
- return res;
- }
-
- // get a vector that can represent a partition
- public double[] getPartitionVector(Partition p) {
-
- ArrayList vecs = new ArrayList();
- for (Vector orgs : p.orgNodes) {
- vecs.add(string2Vector.get(UtilTools.print(orgs)));
- }
- double[] vec = UtilTools.sum(vecs);
- vec = UtilTools.produce(1.0 / p.orgNodes.size(), vec);
- return vec;
- }
-
- public double[] addArray(double[] a, double[] b) {
- double[] x = new double[a.length];
- for (int i = 0; i < a.length; i++) {
- x[i] = a[i] + b[i];
- }
- return x;
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleSelection.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleSelection.java
deleted file mode 100644
index f6c31638d..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleSelection.java
+++ /dev/null
@@ -1,382 +0,0 @@
-/*******************************************************************************
- * Copyright 2012 University of Southern California
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * This code was developed by the Information Integration Group as part
- * of the Karma project at the Information Sciences Institute of the
- * University of Southern California. For more information, publications,
- * and related projects, please see: http://www.isi.edu/integration
- ******************************************************************************/
-
-package edu.isi.karma.cleaning;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Vector;
-
-import edu.isi.karma.cleaning.QuestionableRecord.OutlierDetector;
-
-public class ExampleSelection {
- public HashMap> org = new HashMap>();
- public HashMap> tran = new HashMap>();
- public HashMap raw = new HashMap();
- public boolean isDetectingQuestionableRecord = false;
- public OutlierDetector out;
- // testdata rowid:{tar, tarcolor}
- public HashMap> testdata = new HashMap>();
- public int way = 7;
- public HashSet dictionary = new HashSet();
-
- public ExampleSelection() {
- this.out = new OutlierDetector();
- }
-
- public String Choose() {
- String ID = "";
- switch (way) {
- case 1:
- ID = this.way1();
- break;
- case 2:
- ID = this.way2();
- break;
- case 3:
- ID = this.way3();
- break;
- case 4:
- ID = this.way4();
- break;
- case 6:
- ID = this.way6();
- break;
- case 7:
- ID = this.way7();
- break;
- case 8:
- ID = this.way8();
- break;
- case 9:
- ID = this.way9();
- break;
- default:
- ID = "";
- }
- return ID;
- }
-
- public Vector getOrgTarPair(HashMap exps) {
- Vector result = new Vector();
- for (Map.Entry stringEntry : exps.entrySet()) {
- String[] record = stringEntry.getValue();
- String[] tmp = { record[0], record[1] };
- result.add(tmp);
- }
- return result;
- }
-
- // exps: rowId: {org, tar, tarcode,classlabel}
- // example: partition id: [{raw,tarcode}]
- public void inite(HashMap exps,
- HashMap> examples) {
- // inite the class center vector
-
- if (way >= 6) {
- if (firsttime) {
- out = new OutlierDetector();
- out.buildDict(this.getOrgTarPair(exps));
- dictionary = out.dict;
- }
- out.buildMeanVector(examples, dictionary);
- }
- Ruler ruler = new Ruler();
- for (Map.Entry stringEntry : exps.entrySet()) {
- String e = stringEntry.getValue()[0];
- ruler.setNewInput(e);
- org.put(stringEntry.getKey(), ruler.vec);
- if (way >= 6) {
- String raw = stringEntry.getValue()[0];
- String[] pair = { raw, stringEntry.getValue()[2] };
- if (testdata.containsKey(stringEntry.getValue()[3])) {
- HashMap xelem = testdata.get(stringEntry.getValue()[3]);
- if (!xelem.containsKey(stringEntry.getKey())) {
- xelem.put(stringEntry.getKey(), pair);
- }
- } else {
- HashMap vstr = new HashMap();
- vstr.put(stringEntry.getKey(), pair);
- testdata.put(stringEntry.getValue()[3], vstr);
- }
- }
- }
-
- this.raw = exps;
- }
-
- // choose the most ambiguous
- public String way1() {
- String ID = "";
- int maximum = -1;
- for (String key : org.keySet()) {
- int s = this.ambiguityScore(org.get(key));
- if (s > maximum) {
- ID = key;
- maximum = s;
- }
- }
- return ID;
- }
-
- // return the least ambiguous
- public String way2() {
- String ID = "";
- int minimum = Integer.MAX_VALUE;
- for (String key : org.keySet()) {
- int s = this.ambiguityScore(org.get(key));
- if (s < minimum) {
- ID = key;
- minimum = s;
- }
- }
- return ID;
- }
-
- // return the first incorrect one, simulated ideal user
- public String way3() {
- String ID = "";
- int minimum = Integer.MAX_VALUE;
- for (String key : raw.keySet()) {
- int s = Integer.valueOf(key);
- if (s < minimum) {
- ID = key;
- minimum = s;
- }
- }
- return ID;
- }
-
- public int ambiguityScore(Vector vec) {
- HashMap d = new HashMap();
- int score = 0;
- for (int i = 0; i < vec.size(); i++) {
- if (d.containsKey(vec.get(i).text))
- continue;
- for (int j = 0; j < vec.size(); j++) {
- if (vec.get(j).text.compareTo(vec.get(i).text) == 0 && i != j
- && vec.get(j).text.compareTo(" ") != 0) {
- score++;
- }
- }
- if (!d.containsKey(vec.get(i).text)) {
- d.put(vec.get(i).text, score);
- }
- }
- return score;
- }
-
- // only try to find the wrong ones
- public static boolean firsttime = true;
-
- public String way4() {
- if (firsttime) {
- firsttime = false;
- return raw.keySet().iterator().next();
- }
- for (Map.Entry stringEntry : raw.entrySet()) {
-
- if (stringEntry.getValue()[2].indexOf("_FATAL_ERROR_") != -1) {
- return stringEntry.getKey();
- }
- }
- return this.way2();
- }
-
- public String way6() {
- int max = 2; // only the one with _FATAL_ERROR_ inside
- if (firsttime) {
- firsttime = false;
- return this.way2();
- }
- Vector examples = new Vector();
- for (Map.Entry stringEntry : raw.entrySet()) {
- int cnt = 0;
- String[] tmp = stringEntry.getValue()[2]
- .split("((?<=_\\d_FATAL_ERROR_)|(?=_\\d_FATAL_ERROR_))");
-
- for (String tmpstring : tmp) {
- int errnum = 0;
- if (tmpstring.indexOf("_FATAL_ERROR_") == -1) {
- continue;
- }
- errnum = Integer.valueOf(tmpstring.substring(1, 2));
- cnt += errnum;
- }
- if (cnt > max) {
- max = cnt;
- examples.clear();
- examples.add(stringEntry.getKey());
- }
- if (cnt == max && max > 1) {
- examples.add(stringEntry.getKey());
- }
- }
- // if now _FATAL_ERROR_ detected use outlier detection
- if (examples.size() == 0) {
- String row = "";
- row = way8();
- return row;
- } else { // select the most ambigious among all the record with same
- // number of FATALERROR
- String idString = "";
- int min = 10000;
- for (String key : examples) {
- int s = this.ambiguityScore(org.get(key));
- if (s < min) {
- min = s;
- idString = key;
- }
- }
- return idString;
- }
- }
-
- public String way7() {
- // this.printdata();
- int max = 2; // only the one with _FATAL_ERROR_ inside
- if (firsttime) {
- firsttime = false;
- return this.way2();
- }
- Vector examples = new Vector();
- for (Map.Entry stringEntry : raw.entrySet()) {
- int cnt = 0;
- String[] tmp = stringEntry.getValue()[2]
- .split("((?<=_\\d_FATAL_ERROR_)|(?=_\\d_FATAL_ERROR_))");
- for (String tmpstring : tmp) {
- int errnum = 0;
- if (tmpstring.indexOf("_FATAL_ERROR_") == -1) {
- continue;
- }
- errnum = Integer.valueOf(tmpstring.substring(1, 2));
- cnt += errnum;
- }
- if (cnt > max) {
- max = cnt;
- examples.clear();
- examples.add(stringEntry.getKey());
- }
- if (cnt == max && max > 1) {
- examples.add(stringEntry.getKey());
- }
- }
- // if no _FATAL_ERROR_ detected use outlier detection
- if (examples.size() == 0) {
- isDetectingQuestionableRecord = true;
- String row = "";
- double tmax = -1;
- for (Map.Entry> stringHashMapEntry : this.testdata.entrySet()) {
- String trowid = out.getOutliers(stringHashMapEntry.getValue(),
- out.rVectors.get(stringHashMapEntry.getKey()), tmax, dictionary);
- tmax = out.currentMax;
- if (trowid.length() > 0) {
- row = trowid;
- }
- }
- return row;
- } else { // select the most ambigious among all the record with same
- // number of FATALERROR
- isDetectingQuestionableRecord = false;
- String idString = "";
- int min = 10000;
- for (String key : examples) {
- int s = this.ambiguityScore(org.get(key));
- if (s < min) {
- min = s;
- idString = key;
- }
- }
- return idString;
- }
- }
-
- // shortest result
- // exps: rowId: {org, tar, tarcode,classlabel}
- public String way8() {
- if (firsttime) {
- firsttime = false;
- return this.way3();
- }
- String idString = "";
- int shortest = 10000;
- for (String rowid : raw.keySet()) {
- String xrow = raw.get(rowid)[1];
- if (xrow.indexOf("_FATAL_ERROR_") != -1) {
- xrow = raw.get(rowid)[0];
- }
- if (xrow.length() < shortest) {
- shortest = xrow.length();
- idString = rowid;
- }
- }
- return idString;
- }
-
- // longest result
- public String way9() {
- if (firsttime) {
- firsttime = false;
- return this.way3();
- }
- String idString = "";
- int longest = -1;
- for (String rowid : raw.keySet()) {
- String xrow = raw.get(rowid)[1];
- if (xrow.indexOf("_FATAL_ERROR_") != -1) {
- xrow = raw.get(rowid)[0];
- }
- if (xrow.length() > longest) {
- longest = xrow.length();
- idString = rowid;
- }
- }
- return idString;
- }
-
- public void clear() {
- this.raw.clear();
- org.clear();
- tran.clear();
- this.testdata.clear();
- }
-
- public void printdata() {
- String s1 = "";
- String s2 = "";
- for (Map.Entry> stringHashMapEntry : this.testdata.entrySet()) {
- HashMap r = stringHashMapEntry.getValue();
- s1 += "partition " + stringHashMapEntry.getKey() + "\n";
- for (String[] elem : r.values()) {
- s1 += Arrays.toString(elem) + "\n";
- }
- }
- System.out.println("" + s1);
- for (String[] v : this.raw.values()) {
- s2 += Arrays.toString(v) + "\n";
- }
- System.out.println(s2);
-
- }
-
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleTraces.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleTraces.java
deleted file mode 100644
index 6e8936d16..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/ExampleTraces.java
+++ /dev/null
@@ -1,86 +0,0 @@
-package edu.isi.karma.cleaning;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Vector;
-/*
- *store the traces for all the examples
- * */
-/*
- *store the traces for all the examples
- * */
-public class ExampleTraces {
- public HashMap expTraces = new HashMap();
- String contextId;
- public ExampleTraces(String contextId)
- {
- this.contextId = contextId;
- }
- public Traces createTrace(String[] example)
- {
- Vector orgNodes = new Vector();
- Vector tarNodes = new Vector();
- Ruler ruler = new Ruler();
- ruler.setNewInput("<_START>"+example[0]+"<_END>");
- orgNodes = ruler.vec;
- ruler.setNewInput(example[1]);
- tarNodes = ruler.vec;
- Traces t = new Traces(orgNodes, tarNodes, contextId);
- this.addTrace(example, t);
- return t;
- }
- public void addTrace(String[] example, Traces t)
- {
- String key = String.format("%s|%s", example[0],example[1]);
- expTraces.put(key, t);
- }
- public Traces getTrace(String[] example)
- {
- String key = String.format("%s|%s", example[0],example[1]);
- return expTraces.get(key);
- }
- public Vector> getCurrentSegments(String[] example)
- {
- Vector> res = new Vector>();
- Traces t = this.getTrace(example);
- Collection x = t.traceline.values();
- for(Template tmp:x)
- {
- Vector line = new Vector();
- for(GrammarTreeNode node:tmp.body)
- {
- line.add((Segment)node);
- }
- res.add(line);
- }
- return res;
- }
- public String getSegmentValue(Segment s)
- {
- if(s.isConstSegment())
- {
- return UtilTools.print(s.constNodes);
- }
- else
- {
- return s.tarString;
- }
- }
- public Vector getSegmentPos(Segment s)
- {
- if(s.isConstSegment())
- {
- return null;
- }
- else
- {
- Vector poses = new Vector();
- for(Section sec:s.section)
- {
- int[] x = {sec.pair[0].absPosition.get(0),sec.pair[1].absPosition.get(0)};
- poses.add(x);
- }
- return poses;
- }
- }
-}
diff --git a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/GradientDecendOptimizer.java b/karma-cleaning/src/main/java/edu/isi/karma/cleaning/GradientDecendOptimizer.java
deleted file mode 100644
index 135ef8e3c..000000000
--- a/karma-cleaning/src/main/java/edu/isi/karma/cleaning/GradientDecendOptimizer.java
+++ /dev/null
@@ -1,377 +0,0 @@
-package edu.isi.karma.cleaning;
-
-import java.util.ArrayList;
-import java.util.Collections;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class GradientDecendOptimizer {
- public double c_coef = 1;
- public double relativeCoef = 1;
- public static double ratio =0.4;
- public double stepsize = 0.1;
- public double maximalIternumber = 50;
- public double wIternumber = 100;
- public boolean coef_initialized = false;
- public double manualCoef = 1;
-
- Logger ulogger = LoggerFactory.getLogger(GradientDecendOptimizer.class);
-
- public GradientDecendOptimizer() {
- }
-
- // used to calculate the value of objective function.
- public double objectiveFunction(ArrayList r,
- ArrayList s, ArrayList t_star,
- double[] w) {
- double res = 0.0;
- for (double[] x : r) {
- res += UtilTools.product(x, w);
- }
- double ml = 0.0;
-
- for (double[] e : s) {
- ml += Math.pow(UtilTools.product(e, w), 0.5);
- }
- double tmp = 0.0;
- for (double[] x : t_star) {
- tmp += Math.pow(UtilTools.product(x, w), 0.5);
- }
- return res + c_coef * Math.log(ml) - c_coef
- * Math.log(tmp);
- }
-
- // compute the gradient according to the formula in slides
- public double[] getGradient(ArrayList r, ArrayList s,
- ArrayList t_star, double[] w_old) {
-
- int featuresize = w_old.length;
- double[] r_sum = UtilTools.sum(r);
- double[] res = new double[featuresize];
- // calculate the first component
- double t_scalar_sqrt_sum = 0;
- for (double[] e : t_star) {
- t_scalar_sqrt_sum += Math.sqrt(UtilTools.product(e, w_old));
- }
- // System.out.println("scalar_sqrt_sum: "+scalar_sqrt_sum);
- t_scalar_sqrt_sum = 1.0 / t_scalar_sqrt_sum;
- // calculate the coefficent for each t_i
- double[] vects = new double[featuresize];
- for (int i = 0; i < vects.length; i++) {
- vects[i] = 0;
- }
- for (double[] e : t_star) {
- double cof = Math.pow(UtilTools.product(e, w_old) + (1e-4), -0.5);
- double[] x = UtilTools.produce(cof, e);
- for (int j = 0; j < x.length; j++) {
- vects[j] += x[j];
- }
- }
- // calculate the coefficent for each t_i
- double s_scalar_sqrt_sum = 0;
- for (double[] e : s) {
- s_scalar_sqrt_sum += Math.sqrt(UtilTools.product(e, w_old));
- }
- // System.out.println("scalar_sqrt_sum: "+scalar_sqrt_sum);
- s_scalar_sqrt_sum = 1.0 / (s_scalar_sqrt_sum+1e-6);
- double[] svects = new double[featuresize];
- for (int i = 0; i < svects.length; i++) {
- svects[i] = 0;
- }
- for (double[] e : s) {
- double cof = Math.pow(UtilTools.product(e, w_old) + (1e-4), -0.5);
- //System.out.println("coef: "+cof);
- //System.out.println("e: "+Arrays.toString(e));
- double[] x = UtilTools.produce(cof, e);
- //System.out.println("x: "+Arrays.toString(x));
- for (int j = 0; j < x.length; j++) {
- svects[j] += x[j];
- }
- }
- //
- // System.out.println("scalar_sum: "+scalar_sqrt_sum);
- // System.out.println("vects: "+Arrays.toString(vects));
-
- if (!coef_initialized) {
- this.manualCoef = selectcoef(r_sum, vects);
- coef_initialized = true;
- this.c_coef = 2.0 * manualCoef / t_scalar_sqrt_sum;
- }
- // computer the gradient using subcomponents
- for (int i = 0; i < res.length; i++) {
- res[i] = r_sum[i] + this.relativeCoef * manualCoef * svects[i]- manualCoef * vects[i];
- //res[i] = this.relativeCoef * manualCoef *s_scalar_sqrt_sum*svects[i]- manualCoef *t_scalar_sqrt_sum*vects[i];;
- //res[i] = this.relativeCoef * manualCoef *svects[i]- manualCoef *vects[i];
- }
-
- return res;
- }
-
- public double selectcoef(double[] r_rum, double[] tvec) {
- ArrayList ratios = new ArrayList();
- for (int i = 0; i < r_rum.length; i++) {
- if (tvec[i] != 0 && r_rum[i] != 0) {
- double x = r_rum[i] * 1.0 / tvec[i];
- ratios.add(x);
- }
- }
- if (ratios.isEmpty())
- return 1;
- Collections.sort(ratios, Collections.reverseOrder());
- // System.out.println("ratios: "+ratios);
- Double result = 0.0;
- /*
- * double smallest = ratios.get(ratios.size() - 1); for (Double right :
- * ratios) { if (right / smallest <= 10) { result = right; } }
- */
- result = ratios.get(0);
- return result;
- }
- //approximate the largrange multiplier
- public double[] normalizeW(double[] w)
- {
- double sum = 0.0;
- double goal = w.length;
- double[] nw = new double[w.length];
- for(int i = 0; i < w.length; i++)
- {
- sum += w[i];
- }
- for(int i = 0; i < w.length; i++)
- {
- nw[i] =w[i]*goal*1.0/sum;
- }
- return nw;
- }
- // as the objective function is convex. Could check the trend each step
- public double[] doOptimize(ArrayList examples,
- ArrayList instances,
- ArrayList> constraints,
- ArrayList> individualExps, double[] w) {
-
- double[] w_0 = new double[examples.get(0).length];
- double[] oldGradient = new double[examples.get(0).length];
- if (w == null)
- // init it to 1
- {
- for (int i = 0; i < w_0.length; i++) {
- w_0[i] = 1;
- oldGradient[i] = Double.MAX_VALUE;
- }
- } else {
- w_0 = w;
- }
- int cntor = 0;
- while (cntor < maximalIternumber) {
- ArrayList r = compute_r(examples, instances, w_0);
- //ArrayList r = new ArrayList();
- ArrayList t = compute_t(constraints, w_0);
- ArrayList