diff --git a/src/main/groovy/komenti/App.groovy b/src/main/groovy/komenti/App.groovy index 4711471..9b15905 100644 --- a/src/main/groovy/komenti/App.groovy +++ b/src/main/groovy/komenti/App.groovy @@ -25,6 +25,7 @@ class App { _ longOpt: 'label-extension', 'Run a named label extension, e.g. cmo', args: 1 _ longOpt: 'direct', 'Receive only direct super/subclasses from the DL query. Default is false.', type: Boolean _ longOpt: 'class-mode', 'Return only one label per matching IRI.', type: Boolean + _ longOpt: 'field', 'Return only this metadata field', args: 1 // annotation options t longOpt: 'text', 'A file or directory of files to annotate.', args: 1 @@ -33,6 +34,7 @@ class App { _ longOpt: 'per-line', 'Process each line of each file seperately (useful for field-based data e.g. downloaded with get_metadata)', type: Boolean _ longOpt: 'disable-modifiers', 'Don\'t evaluate negation and uncertainty. The reason for this is: it takes a lot of time!', type: Boolean _ longOpt: 'family-modifier', 'Evaluate sentences for whether or not they mention a family member.', type: Boolean + _ longOpt: 'sentiment', 'Get sentiment score for annotations', type: Boolean _ longOpt: 'allergy-modifier', 'Evaluate sentences for whether or not they mention an allergy', type: Boolean _ longOpt: 'exclude', 'A list of phrases, which when matched in a sentence, will cause that sentence not to be annotated. One phrase per line.', args: 1 _ longOpt: 'write-pdfs-to-dir', 'If set, write the converted PDF text into the given directory.', args: 1 diff --git a/src/main/groovy/komenti/Komenti.groovy b/src/main/groovy/komenti/Komenti.groovy index c1474d0..06ee646 100644 --- a/src/main/groovy/komenti/Komenti.groovy +++ b/src/main/groovy/komenti/Komenti.groovy @@ -128,12 +128,13 @@ public class Komenti { static def get_metadata(o) { def outDir = getOutDir(o) - def files = [:] + ConcurrentHashMap files = [:] def vocab = Vocabulary.loadFile(o.l) def komentisto = new Komentisto(vocab, o['disable-modifiers'], o['family-modifier'], + o['sentiment'], o['allergy-modifier'], false, o['exclude'], @@ -155,17 +156,22 @@ public class Komenti { if(!o['decompose-entities']) { entityLabels = [] } - classLabels.each { iri, l -> + def i = 0 + GParsPool.withPool(o['threads'] ?: 1) { p -> + classLabels.eachParallel{ iri, l -> + println "(${++i}/${classLabels.size()})" KomentLib.AOSemanticQuery("<$iri>", l.o, false, "equivalent", { classes -> // we want the actual class, not just semantically equivalent ones. although tbh it might be better to get the metadata from those too. it has to be semantically equivalent to this class, after all def c = classes.find { it.class == iri } - def metadata = KomentLib.AOExtractMetadata(c, entityLabels) + def metadata = KomentLib.AOExtractMetadata(c, entityLabels, o['field']) if(o['lemmatise']) { // we do it per line here, since it's a field based document metadata = metadata.split('\n').collect { komentisto.lemmatise(it) }.join('\n') } - files[l.l[0]] = metadata + // TODO this is bad and lazy + files[iri.tokenize('/').last()] = metadata }) } + } println "Writing metadata files for ${files.size()} classes." files.each { n, c -> @@ -196,6 +202,7 @@ public class Komenti { def komentisto = new Komentisto(vocab, o['disable-modifiers'], o['family-modifier'], + o['sentiment'], o['allergy-modifier'], o['extract-triples'], o['exclude'], @@ -373,7 +380,7 @@ public class Komenti { if(o['id-list-only']) { writeOutput(aids.join('\n'), o, "Saved pmcids to $o.out!") } else { - def komentisto = new Komentisto(false, true, o['family-modifier'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1) + def komentisto = new Komentisto(false, true, o['family-modifier'], o['sentiment'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1) def abstracts = [] aids.each { pmcid -> KomentLib.PMCGetAbstracts(pmcid, { a -> diff --git a/src/main/groovy/komenti/klib/KomentLib.groovy b/src/main/groovy/komenti/klib/KomentLib.groovy index 9ca6a16..7662947 100644 --- a/src/main/groovy/komenti/klib/KomentLib.groovy +++ b/src/main/groovy/komenti/klib/KomentLib.groovy @@ -29,7 +29,7 @@ class KomentLib { } else { println "Error code: ${err.getStatusCode()}" } - System.exit(1) + //System.exit(1) } static def AOQueryNames(query, cb) { @@ -39,6 +39,7 @@ class KomentLib { cb(json.result) } } catch(e) { + println "Error query: $query" AOAPIErrorHandler(e) } } @@ -115,32 +116,44 @@ class KomentLib { } // metadata to text - static def AOExtractMetadata(c, dLabels) { + static def AOExtractMetadata(c, dLabels, field) { def out = '' c.each { k, v -> - if(k == 'SubClassOf') { return; } - if(k.length() > 30) { return; } // try to remove some of the bugprops - if(v instanceof Collection) { - out += "$k:\n" - v.unique(false).each { - out += " $it\n" - - def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 } - if(dec) { - dec.each { d -> - out += " (decomposed) ${it.replace(d, '')}\n" - out += " (decomposed): ${d}\n" + if(field) { + if(k == field) { + if(v instanceof Collection) { + v.unique(false).each { + out += "$it\n" } + } else { + out += "$v\n" } - } + } } else { - out += "$k: $v\n" + if(k == 'SubClassOf') { return; } + if(k.length() > 30) { return; } // try to remove some of the bugprops + if(v instanceof Collection) { + out += "$k:\n" + v.unique(false).each { + out += " $it\n" + + def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 } + if(dec) { + dec.each { d -> + out += " (decomposed) ${it.replace(d, '')}\n" + out += " (decomposed): ${d}\n" + } + } + } + } else { + out += "$k: $v\n" - def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 } - if(dec) { - dec.each { d -> - out += "$k (decomposed): ${v.replace(d, '')}\n" - out += "$k (decomposed): ${d}\n" + def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 } + if(dec) { + dec.each { d -> + out += "$k (decomposed): ${v.replace(d, '')}\n" + out += "$k (decomposed): ${d}\n" + } } } } diff --git a/src/main/groovy/komenti/klib/Komentisto.groovy b/src/main/groovy/komenti/klib/Komentisto.groovy index 4f105eb..241b83f 100644 --- a/src/main/groovy/komenti/klib/Komentisto.groovy +++ b/src/main/groovy/komenti/klib/Komentisto.groovy @@ -6,6 +6,8 @@ import edu.stanford.nlp.semgraph.* import edu.stanford.nlp.ie.util.RelationTriple import edu.stanford.nlp.util.* import edu.stanford.nlp.naturalli.* +import edu.stanford.nlp.sentiment.* +import edu.stanford.nlp.neural.rnn.* public class Komentisto { def REP_TOKEN = 'biscuit' @@ -23,8 +25,9 @@ public class Komentisto { def enableIE def vocabulary def threads + def sentiment - def Komentisto(vocabulary, disableModifiers, familyModifier, allergyModifier, enableIE, excludeFile, threads) { + def Komentisto(vocabulary, disableModifiers, familyModifier, sentiment, allergyModifier, enableIE, excludeFile, threads) { this.vocabulary = vocabulary uncertainTerms = UNC_WORDS_FILE.getText().split('\n') @@ -39,6 +42,7 @@ public class Komentisto { this.allergyModifier = allergyModifier this.enableIE = enableIE this.threads = threads + this.sentiment = sentiment initialiseCoreNLP() } @@ -54,6 +58,7 @@ public class Komentisto { aList.removeAll(["ner", "regexner", "entitymentions"]) } if(enableIE) { aList += ["depparse", "natlog", "openie"] } + if(sentiment) { aList += [ "parse", "sentiment" ] } println aList props.put("annotators", aList.join(', ')) @@ -84,7 +89,7 @@ public class Komentisto { def aDocument = new edu.stanford.nlp.pipeline.Annotation(text.toLowerCase()) // TODO I think we may be able to use the 'Annotator.Requirement' class to determine what needs to be run - [ "tokenize", "ssplit", "ner", "regexner", "entitymentions" ].each { + [ "tokenize", "ssplit", "ner", "regexner", "entitymentions", "parse", "sentiment" ].each { coreNLP.getExistingAnnotator(it).annotate(aDocument) } @@ -114,6 +119,20 @@ public class Komentisto { if(!disableModifiers) { def tags = evaluateSentenceConcept(sentence, ner) // add all tags that returned true a.tags = tags.findAll { it.getValue() }.collect { it.getKey() } + + // Thanks for helping me figure this out!! <3 https://github.com/Ruthwik/Sentiment-Analysis/ + if(sentiment) { + def tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); + def sm = RNNCoreAnnotations.getPredictions(tree); + def sentimentType = sentence.get(SentimentCoreAnnotations.SentimentClass.class); + + a.tags << "SentimentClass:$sentimentType" + a.tags << "S:VP:${(double)Math.round(sm.get(4) * 100d)}" + a.tags << "S:P:${(double)Math.round(sm.get(3) * 100d)}" + a.tags << "S:NEUT:${(double)Math.round(sm.get(2) * 100d)}" + a.tags << "S:N:${(double)Math.round(sm.get(1) * 100d)}" + a.tags << "S:VN:${(double)Math.round(sm.get(0) * 100d)}" + } } results << a @@ -216,9 +235,7 @@ public class Komentisto { if(allergyModifier) { out.allergy = text =~ ALLERGY_PATTERN - } - - out + } } def lemmatise(text) {