instructions for linguistic annotation
rsennrich committed Jun 25, 2018
1 parent 5c767b5 commit 7fa3297
Showing 2 changed files with 137 additions and 1 deletion.
27 changes: 26 additions & 1 deletion factored_sample/README.md
@@ -31,4 +31,29 @@ Given a model, preprocessed text can be translated thusly:
Finally, you may want to post-process the translation output, namely merge BPE segments,
detruecase and detokenize:

./postprocess-test.sh < data/newstest2013.output > data/newstest2013.postprocessed
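`postprocess-test.sh` performs exactly these steps; as a rough sketch (not the script's literal contents, and assuming a Moses checkout at /path/to/mosesdecoder), the pipeline amounts to:

sed -r 's/(@@ )|(@@ ?$)//g' < data/newstest2013.output \
    | /path/to/mosesdecoder/scripts/recaser/detruecase.perl \
    | /path/to/mosesdecoder/scripts/tokenizer/detokenizer.perl -l en \
    > data/newstest2013.postprocessed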


INSTRUCTIONS FOR LINGUISTIC ANNOTATION
--------------------------------------

If you want to work with raw text, here are the commands we used for creating the CoNLL-formatted text.
All commands were run after tokenization (with the Moses tokenizer), and after running the Moses script `scripts/tokenizer/deescape-special-chars.perl` to undo the escaping of special characters:
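
For example, assuming a Moses checkout at /path/to/mosesdecoder and English input:

/path/to/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en < corpus.raw > corpus.tok
/path/to/mosesdecoder/scripts/tokenizer/deescape-special-chars.perl < corpus.tok > corpus.tok.deescaped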

For English:

Download Stanford CoreNLP, then run:

preprocess/stanford-conll-wrapper.py --corenlp /path/to/stanford-corenlp-3.5.0.jar --corenlp-models /path/to/stanford-corenlp-3.5.0-models.jar < input_file > output_file

3.5.0 is the version used in the published experiments. Newer versions of CoreNLP also support CoNLL output directly, but the number of columns is smaller, so conll_to_factors.py needs to be adjusted accordingly:

java -cp /path/to/stanford-corenlp-3.9.1.jar:/path/to/stanford-corenlp-3.9.1-models.jar edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators "tokenize, ssplit, pos, depparse, lemma" -ssplit.eolonly true -tokenize.whitespace true -outputFormat conll < input_file > output_file 2> /dev/null
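
For reference, the wrapper script below prints eight tab-separated columns per token (position, word, lemma, POS, POS, a placeholder '-', head position, dependency label). A made-up example:

1	The	the	DT	DT	-	2	det
2	cats	cat	NNS	NNS	-	3	nsubj
3	sleep	sleep	VBP	VBP	-	0	root
4	.	.	.	.	-	3	punct

The direct CoNLL output of newer CoreNLP versions has fewer columns (roughly position, word, lemma, POS, NER, head, and dependency label), hence the adjustment to conll_to_factors.py.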

For German:

Download ParZu and execute `install.sh`: https://github.com/rsennrich/ParZu

To parse tokenized, one-line-per-sentence German text, run:

ParZu/parzu --input tokenized_lines -p 8 < input_file > output_file
111 changes: 111 additions & 0 deletions preprocess/stanford-conll-wrapper.py
@@ -0,0 +1,111 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# wrapper to parse a text file with Stanford CoreNLP and print it in CoNLL format
# Input must be tokenized, one sentence per line.

# requirements:
# - Stanford CoreNLP
# - English models for CoreNLP

from __future__ import print_function, unicode_literals
import sys
import codecs
import argparse

from subprocess import Popen, PIPE

def parse_args():

    parser = argparse.ArgumentParser()
    parser.add_argument('--java', type=str, help="path to JAVA runtime binary", default='java')
    parser.add_argument('--corenlp', type=str, required=True, help="path to stanford-corenlp-{version}.jar file")
    parser.add_argument('--corenlp-models', type=str, required=True, help="path to stanford-corenlp-{version}-models.jar")

    return parser.parse_args()

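# launch CoreNLP as a subprocess; it reads the tokenized input from stdin and
# writes its plain-text analyses to stdout, which get_sentences() parses below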
def process_stanford(infile, java, corenlp, corenlp_models):

    stanford = Popen([java,
                      '-cp', corenlp + ':' + corenlp_models,
                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
                      '-ssplit.eolonly', 'true',
                      '-tokenize.whitespace', 'true',
                      '-numThreads', '8',
                      '-textFile', '-',
                      '-outFile', '-'],
                     stdin=infile, stdout=PIPE, stderr=open('/dev/null', 'w'))
    return stanford.stdout


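# parse CoreNLP's plain-text output with a small state machine; `expect` says
# what the next line should contain:
#   0 - the next "Sentence #" header
#   1 - the raw token sequence of the sentence
#   2 - the token annotations, e.g. "[Text=cats ... PartOfSpeech=NNS Lemma=cat] [...]"
#   3 - dependency lines such as "nsubj(sleep-3, cats-2)"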
def get_sentences(instream):
    sentence = []
    expect = 0

    for line in instream:
        if expect == 0 and line.startswith('Sentence #'):
            if sentence:
                yield sentence
                sentence = []
            expect = 1

        elif line == '\n':
            expect = 0

        elif expect == 3:
            try:
                rel, remainder = line.split('(')
            except:
                sys.stderr.write(line + '\n')
                raise
            head, dep = remainder.split()
            head_int = int(head.split('-')[-1][:-1])
            dep_int = int(dep.split('-')[-1][:-1])
            sentence[dep_int-1]['head'] = head_int
            sentence[dep_int-1]['label'] = rel

        elif expect == 2:
            linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [')
            if len(linesplit) != len(sentence):
                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
                sys.stderr.write(' '.join(w['word'] for w in sentence) + '\n')
                for i in range(len(sentence)):
                    sentence[i]['pos'] = '-'
                    sentence[i]['lemma'] = '-'
                    sentence[i]['head'] = 0
                    sentence[i]['label'] = '-'
                expect = 0
                continue
            for i, w in enumerate(linesplit):
                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
            expect = 3

        elif expect == 1:
            for w in line.split():
                sentence.append({'word': w})
            expect = 2

    if sentence:
        yield sentence

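# print one sentence as tab-separated CoNLL-style columns:
# position, word, lemma, POS, POS, placeholder '-', head position, dependency label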
def write(sentence):
    for i, w in enumerate(sentence):
        sys.stdout.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label']))

if __name__ == '__main__':
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    args = parse_args()

    stanford = process_stanford(sys.stdin, args.java, args.corenlp, args.corenlp_models)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence)
        sys.stdout.write('\n')
