
sentiment and metadata
reality committed Oct 3, 2021
1 parent 8536458 commit f37b5da
Showing 4 changed files with 70 additions and 31 deletions.
2 changes: 2 additions & 0 deletions src/main/groovy/komenti/App.groovy
@@ -25,6 +25,7 @@ class App {
_ longOpt: 'label-extension', 'Run a named label extension, e.g. cmo', args: 1
_ longOpt: 'direct', 'Receive only direct super/subclasses from the DL query. Default is false.', type: Boolean
_ longOpt: 'class-mode', 'Return only one label per matching IRI.', type: Boolean
_ longOpt: 'field', 'Return only this metadata field', args: 1

// annotation options
t longOpt: 'text', 'A file or directory of files to annotate.', args: 1
@@ -33,6 +34,7 @@
_ longOpt: 'per-line', 'Process each line of each file separately (useful for field-based data, e.g. downloaded with get_metadata)', type: Boolean
_ longOpt: 'disable-modifiers', 'Don\'t evaluate negation and uncertainty. The reason for this is: it takes a lot of time!', type: Boolean
_ longOpt: 'family-modifier', 'Evaluate sentences for whether or not they mention a family member.', type: Boolean
_ longOpt: 'sentiment', 'Get sentiment score for annotations', type: Boolean
_ longOpt: 'allergy-modifier', 'Evaluate sentences for whether or not they mention an allergy', type: Boolean
_ longOpt: 'exclude', 'A list of phrases, which when matched in a sentence, will cause that sentence not to be annotated. One phrase per line.', args: 1
_ longOpt: 'write-pdfs-to-dir', 'If set, write the converted PDF text into the given directory.', args: 1
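
For context, here is a minimal sketch (not part of the commit) of how the two new options above — the boolean `--sentiment` flag and the single-argument `--field` option — are declared and read with Groovy's CliBuilder, the pattern App.groovy uses. Everything outside the two option definitions is hypothetical, and the import assumes Groovy 2.5+ (older releases use groovy.util.CliBuilder).

```groovy
// Hypothetical standalone sketch of the CliBuilder pattern used in App.groovy.
import groovy.cli.commons.CliBuilder   // groovy.util.CliBuilder on Groovy < 2.5

def cli = new CliBuilder(usage: 'komenti <command> [options]')
cli._(longOpt: 'field', 'Return only this metadata field', args: 1)
cli._(longOpt: 'sentiment', 'Get sentiment score for annotations', type: Boolean)

def o = cli.parse(['--sentiment', '--field', 'label'])
assert o['sentiment']            // boolean flag, later checked as o['sentiment']
assert o['field'] == 'label'     // single-argument option, passed through to AOExtractMetadata
```
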
17 changes: 12 additions & 5 deletions src/main/groovy/komenti/Komenti.groovy
@@ -128,12 +128,13 @@ public class Komenti {

static def get_metadata(o) {
def outDir = getOutDir(o)
def files = [:]
ConcurrentHashMap files = [:]

def vocab = Vocabulary.loadFile(o.l)
def komentisto = new Komentisto(vocab,
o['disable-modifiers'],
o['family-modifier'],
o['sentiment'],
o['allergy-modifier'],
false,
o['exclude'],
@@ -155,17 +156,22 @@

if(!o['decompose-entities']) { entityLabels = [] }

classLabels.each { iri, l ->
def i = 0
GParsPool.withPool(o['threads'] ?: 1) { p ->
classLabels.eachParallel{ iri, l ->
println "(${++i}/${classLabels.size()})"
KomentLib.AOSemanticQuery("<$iri>", l.o, false, "equivalent", { classes ->
// we want the actual class, not just semantically equivalent ones. although tbh it might be better to get the metadata from those too. it has to be semantically equivalent to this class, after all
def c = classes.find { it.class == iri }
def metadata = KomentLib.AOExtractMetadata(c, entityLabels)
def metadata = KomentLib.AOExtractMetadata(c, entityLabels, o['field'])
if(o['lemmatise']) { // we do it per line here, since it's a field based document
metadata = metadata.split('\n').collect { komentisto.lemmatise(it) }.join('\n')
}
files[l.l[0]] = metadata
// TODO this is bad and lazy
files[iri.tokenize('/').last()] = metadata
})
}
}

println "Writing metadata files for ${files.size()} classes."
files.each { n, c ->
@@ -196,6 +202,7 @@ public class Komenti {
def komentisto = new Komentisto(vocab,
o['disable-modifiers'],
o['family-modifier'],
o['sentiment'],
o['allergy-modifier'],
o['extract-triples'],
o['exclude'],
@@ -373,7 +380,7 @@ public class Komenti {
if(o['id-list-only']) {
writeOutput(aids.join('\n'), o, "Saved pmcids to $o.out!")
} else {
def komentisto = new Komentisto(false, true, o['family-modifier'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1)
def komentisto = new Komentisto(false, true, o['family-modifier'], o['sentiment'], o['allergy-modifier'], false, o['exclude'], o['threads'] ?: 1)
def abstracts = []
aids.each { pmcid ->
KomentLib.PMCGetAbstracts(pmcid, { a ->
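
The reworked get_metadata loop above replaces the serial `classLabels.each` with GParsPool's `eachParallel`, collecting results into a ConcurrentHashMap so concurrent writes are safe. A minimal, self-contained sketch of that pattern follows; the IRIs and metadata strings are made up, and the GPars version in the @Grab is an assumption, not taken from the project's build.

```groovy
// Hypothetical sketch of the GParsPool pattern used in get_metadata; not project code.
@Grab('org.codehaus.gpars:gpars:1.2.1')   // assumed version
import groovyx.gpars.GParsPool
import java.util.concurrent.ConcurrentHashMap

def files = new ConcurrentHashMap()       // thread-safe, matching the ConcurrentHashMap change above
def iris = [
    'http://purl.obolibrary.org/obo/EXAMPLE_0000001',   // made-up IRIs
    'http://purl.obolibrary.org/obo/EXAMPLE_0000002'
]

GParsPool.withPool(4) {                   // pool size; Komenti passes o['threads'] ?: 1
  iris.eachParallel { iri ->
    def metadata = "metadata for $iri"    // stand-in for AOSemanticQuery + AOExtractMetadata
    files[iri.tokenize('/').last()] = metadata
  }
}

println "Writing metadata files for ${files.size()} classes."
```

One thing to note about the real loop: the progress counter (`++i`) is a shared variable incremented from several threads without synchronisation, so the printed counts can interleave or repeat; an AtomicInteger would make the progress output exact.
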
55 changes: 34 additions & 21 deletions src/main/groovy/komenti/klib/KomentLib.groovy
@@ -29,7 +29,7 @@ class KomentLib {
} else {
println "Error code: ${err.getStatusCode()}"
}
System.exit(1)
//System.exit(1)
}

static def AOQueryNames(query, cb) {
@@ -39,6 +39,7 @@
cb(json.result)
}
} catch(e) {
println "Error query: $query"
AOAPIErrorHandler(e)
}
}
@@ -115,32 +116,44 @@
}

// metadata to text
static def AOExtractMetadata(c, dLabels) {
static def AOExtractMetadata(c, dLabels, field) {
def out = ''
c.each { k, v ->
if(k == 'SubClassOf') { return; }
if(k.length() > 30) { return; } // try to remove some of the bugprops
if(v instanceof Collection) {
out += "$k:\n"
v.unique(false).each {
out += " $it\n"

def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += " (decomposed) ${it.replace(d, '')}\n"
out += " (decomposed): ${d}\n"
if(field) {
if(k == field) {
if(v instanceof Collection) {
v.unique(false).each {
out += "$it\n"
}
} else {
out += "$v\n"
}
}
}
} else {
out += "$k: $v\n"
if(k == 'SubClassOf') { return; }
if(k.length() > 30) { return; } // try to remove some of the bugprops
if(v instanceof Collection) {
out += "$k:\n"
v.unique(false).each {
out += " $it\n"

def dec = dLabels.findAll { l -> "$it".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += " (decomposed) ${it.replace(d, '')}\n"
out += " (decomposed): ${d}\n"
}
}
}
} else {
out += "$k: $v\n"

def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += "$k (decomposed): ${v.replace(d, '')}\n"
out += "$k (decomposed): ${d}\n"
def dec = dLabels.findAll { l -> "$v".indexOf(l) != -1 }
if(dec) {
dec.each { d ->
out += "$k (decomposed): ${v.replace(d, '')}\n"
out += "$k (decomposed): ${d}\n"
}
}
}
}
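
AOExtractMetadata now takes a third `field` argument: when it is set, only that metadata field is emitted, and the SubClassOf/long-key filtering and label decomposition are skipped; when it is null or false, the original behaviour runs. Below is a minimal sketch of just the field-filter branch with hypothetical data and a hypothetical method name — it is an illustration of the logic, not the library code itself.

```groovy
// Hypothetical standalone version of the new field filter; the real method also
// handles label decomposition and skips SubClassOf and over-long keys.
def extractField(Map metadata, String field) {
  def out = ''
  metadata.each { k, v ->
    if(k != field) { return }                 // keep only the requested field
    if(v instanceof Collection) {
      v.unique(false).each { out += "$it\n" } // de-duplicate repeated values
    } else {
      out += "$v\n"
    }
  }
  out
}

assert extractField([label: ['heart disease', 'heart disease'], comment: 'ignored'], 'label') == 'heart disease\n'
```
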
27 changes: 22 additions & 5 deletions src/main/groovy/komenti/klib/Komentisto.groovy
@@ -6,6 +6,8 @@ import edu.stanford.nlp.semgraph.*
import edu.stanford.nlp.ie.util.RelationTriple
import edu.stanford.nlp.util.*
import edu.stanford.nlp.naturalli.*
import edu.stanford.nlp.sentiment.*
import edu.stanford.nlp.neural.rnn.*

public class Komentisto {
def REP_TOKEN = 'biscuit'
@@ -23,8 +25,9 @@
def enableIE
def vocabulary
def threads
def sentiment

def Komentisto(vocabulary, disableModifiers, familyModifier, allergyModifier, enableIE, excludeFile, threads) {
def Komentisto(vocabulary, disableModifiers, familyModifier, sentiment, allergyModifier, enableIE, excludeFile, threads) {
this.vocabulary = vocabulary

uncertainTerms = UNC_WORDS_FILE.getText().split('\n')
@@ -39,6 +42,7 @@
this.allergyModifier = allergyModifier
this.enableIE = enableIE
this.threads = threads
this.sentiment = sentiment

initialiseCoreNLP()
}
@@ -54,6 +58,7 @@
aList.removeAll(["ner", "regexner", "entitymentions"])
}
if(enableIE) { aList += ["depparse", "natlog", "openie"] }
if(sentiment) { aList += [ "parse", "sentiment" ] }
println aList
props.put("annotators", aList.join(', '))

@@ -84,7 +89,7 @@
def aDocument = new edu.stanford.nlp.pipeline.Annotation(text.toLowerCase())

// TODO I think we may be able to use the 'Annotator.Requirement' class to determine what needs to be run
[ "tokenize", "ssplit", "ner", "regexner", "entitymentions" ].each {
[ "tokenize", "ssplit", "ner", "regexner", "entitymentions", "parse", "sentiment" ].each {
coreNLP.getExistingAnnotator(it).annotate(aDocument)
}

@@ -114,6 +119,20 @@
if(!disableModifiers) {
def tags = evaluateSentenceConcept(sentence, ner) // add all tags that returned true
a.tags = tags.findAll { it.getValue() }.collect { it.getKey() }

// Thanks for helping me figure this out!! <3 https://github.com/Ruthwik/Sentiment-Analysis/
if(sentiment) {
def tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
def sm = RNNCoreAnnotations.getPredictions(tree);
def sentimentType = sentence.get(SentimentCoreAnnotations.SentimentClass.class);

a.tags << "SentimentClass:$sentimentType"
a.tags << "S:VP:${(double)Math.round(sm.get(4) * 100d)}"
a.tags << "S:P:${(double)Math.round(sm.get(3) * 100d)}"
a.tags << "S:NEUT:${(double)Math.round(sm.get(2) * 100d)}"
a.tags << "S:N:${(double)Math.round(sm.get(1) * 100d)}"
a.tags << "S:VN:${(double)Math.round(sm.get(0) * 100d)}"
}
}

results << a
@@ -216,9 +235,7 @@

if(allergyModifier) {
out.allergy = text =~ ALLERGY_PATTERN
}

out
}
}

def lemmatise(text) {
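
The sentiment tags added in Komentisto come from Stanford CoreNLP's sentiment annotator: the pipeline gains `parse` and `sentiment`, each sentence then carries a sentiment-annotated tree, and RNNCoreAnnotations.getPredictions returns the five class probabilities (index 0 = very negative through 4 = very positive), which the code rounds to percentages. A minimal standalone sketch of that flow is below; the CoreNLP version in the @Grab lines is an assumption and the example sentence is made up.

```groovy
// Hypothetical standalone sketch of the CoreNLP sentiment flow used in Komentisto.
@Grab('edu.stanford.nlp:stanford-corenlp:3.9.2')                       // assumed version
@Grab(group='edu.stanford.nlp', module='stanford-corenlp', version='3.9.2', classifier='models')
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import edu.stanford.nlp.pipeline.Annotation
import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations

def props = new Properties()
props.put('annotators', 'tokenize, ssplit, parse, sentiment')          // sentiment requires parse

def pipeline = new StanfordCoreNLP(props)
def doc = new Annotation('the patient denies any chest pain and feels much better today')
pipeline.annotate(doc)

doc.get(CoreAnnotations.SentencesAnnotation.class).each { sentence ->
  def tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class)
  def sm = RNNCoreAnnotations.getPredictions(tree)                     // SimpleMatrix of 5 class probabilities
  def sentimentType = sentence.get(SentimentCoreAnnotations.SentimentClass.class)

  println "SentimentClass:$sentimentType"
  println "S:VP:${(double) Math.round(sm.get(4) * 100d)}"              // very positive
  println "S:VN:${(double) Math.round(sm.get(0) * 100d)}"              // very negative
}
```

Running the constituency parser on every sentence is expensive, which is presumably why the feature sits behind the `--sentiment` flag rather than being on by default.
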
