instructions for linguistic annotation
rsennrich committed Jun 25, 2018
1 parent 5c767b5 commit 7fa3297
Showing 2 changed files with 137 additions and 1 deletion.
27 changes: 26 additions & 1 deletion factored_sample/README.md
@@ -31,4 +31,29 @@ Given a model, preprocessed text can be translated thusly:
Finally, you may want to post-process the translation output, namely merge BPE segments,
detruecase and detokenize:

./postprocess-test.sh < data/newstest2013.output > data/newstest2013.postprocessed
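`postprocess-test.sh` performs exactly these steps; as a rough sketch (not the script's literal contents, and assuming a Moses checkout at /path/to/mosesdecoder), the pipeline amounts to:

sed -r 's/(@@ )|(@@ ?$)//g' < data/newstest2013.output \
    | /path/to/mosesdecoder/scripts/recaser/detruecase.perl \
    | /path/to/mosesdecoder/scripts/tokenizer/detokenizer.perl -l en \
    > data/newstest2013.postprocessed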


INSTRUCTIONS FOR LINGUISTIC ANNOTATION
--------------------------------------

If you want to work with raw text, here are the commands we used for creating the CoNLL-formatted text.
All commands were run after tokenization (with the Moses tokenizer), and after running the Moses script `scripts/tokenizer/deescape-special-chars.perl` to undo the escaping of special characters:
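
For example, assuming a Moses checkout at /path/to/mosesdecoder and English input:

/path/to/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en < corpus.raw > corpus.tok
/path/to/mosesdecoder/scripts/tokenizer/deescape-special-chars.perl < corpus.tok > corpus.tok.deescaped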

For English:

Download Stanford CoreNLP, then run:

preprocess/stanford-conll-wrapper.py --corenlp /path/to/stanford-corenlp-3.5.0.jar --corenlp-models /path/to/stanford-corenlp-3.5.0-models.jar < input_file > output_file

3.5.0 is the version used in the published experiments. Newer versions of CoreNLP also support CoNLL output directly, but the number of columns is smaller, so conll_to_factors.py needs to be adjusted accordingly:

java -cp /path/to/stanford-corenlp-3.9.1.jar:/path/to/stanford-corenlp-3.9.1-models.jar edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators "tokenize, ssplit, pos, depparse, lemma" -ssplit.eolonly true -tokenize.whitespace true -outputFormat conll < input_file > output_file 2> /dev/null
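
For reference, the wrapper script below prints eight tab-separated columns per token (position, word, lemma, POS, POS, a placeholder '-', head position, dependency label). A made-up example:

1	The	the	DT	DT	-	2	det
2	cats	cat	NNS	NNS	-	3	nsubj
3	sleep	sleep	VBP	VBP	-	0	root
4	.	.	.	.	-	3	punct

The direct CoNLL output of newer CoreNLP versions has fewer columns (roughly position, word, lemma, POS, NER, head, and dependency label), hence the adjustment to conll_to_factors.py.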

For German:

Download ParZu and execute `install.sh`: https://github.com/rsennrich/ParZu

To parse tokenized, one-line-per-sentence German text, run:

ParZu/parzu --input tokenized_lines -p 8 < input_file > output_file
111 changes: 111 additions & 0 deletions preprocess/stanford-conll-wrapper.py
@@ -0,0 +1,111 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

# wrapper to parse a text file with Stanford CoreNLP and print it in CoNLL format
# Input must be tokenized, one sentence per line.

# requirements:
# - Stanford CoreNLP
# - English models for CoreNLP

from __future__ import print_function, unicode_literals
import sys
import codecs
import argparse

from subprocess import Popen, PIPE

def parse_args():

    parser = argparse.ArgumentParser()
    parser.add_argument('--java', type=str, help="path to JAVA runtime binary", default='java')
    parser.add_argument('--corenlp', type=str, required=True, help="path to stanford-corenlp-{version}.jar file")
    parser.add_argument('--corenlp-models', type=str, required=True, help="path to stanford-corenlp-{version}-models.jar")

    return parser.parse_args()

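# launch CoreNLP as a subprocess; it reads the tokenized input from stdin and
# writes its plain-text analyses to stdout, which get_sentences() parses below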
def process_stanford(infile, java, corenlp, corenlp_models):

    stanford = Popen([java,
                      '-cp', corenlp + ':' + corenlp_models,
                      'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                      '-annotators', 'tokenize, ssplit, pos, depparse, lemma',
                      '-ssplit.eolonly', 'true',
                      '-tokenize.whitespace', 'true',
                      '-numThreads', '8',
                      '-textFile', '-',
                      '-outFile', '-'],
                     stdin=infile, stdout=PIPE, stderr=open('/dev/null', 'w'))
    return stanford.stdout


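# parse CoreNLP's plain-text output with a small state machine; `expect` says
# what the next line should contain:
#   0 - the next "Sentence #" header
#   1 - the raw token sequence of the sentence
#   2 - the token annotations, e.g. "[Text=cats ... PartOfSpeech=NNS Lemma=cat] [...]"
#   3 - dependency lines such as "nsubj(sleep-3, cats-2)"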
def get_sentences(instream):
    sentence = []
    expect = 0

    for line in instream:
        if expect == 0 and line.startswith('Sentence #'):
            if sentence:
                yield sentence
                sentence = []
            expect = 1

        elif line == '\n':
            expect = 0

        elif expect == 3:
            try:
                rel, remainder = line.split('(')
            except:
                sys.stderr.write(line + '\n')
                raise
            head, dep = remainder.split()
            head_int = int(head.split('-')[-1][:-1])
            dep_int = int(dep.split('-')[-1][:-1])
            sentence[dep_int-1]['head'] = head_int
            sentence[dep_int-1]['label'] = rel

        elif expect == 2:
            linesplit = line.split('[', 1)[1].rsplit(']', 1)[0].split('] [')
            if len(linesplit) != len(sentence):
                sys.stderr.write('Warning: mismatch in number of words in sentence\n')
                sys.stderr.write(' '.join(w['word'] for w in sentence) + '\n')
                for i in range(len(sentence)):
                    sentence[i]['pos'] = '-'
                    sentence[i]['lemma'] = '-'
                    sentence[i]['head'] = 0
                    sentence[i]['label'] = '-'
                expect = 0
                continue
            for i, w in enumerate(linesplit):
                sentence[i]['pos'] = w.split(' PartOfSpeech=')[-1].split()[0]
                sentence[i]['lemma'] = w.split(' Lemma=')[-1]
            expect = 3

        elif expect == 1:
            for w in line.split():
                sentence.append({'word': w})
            expect = 2

    if sentence:
        yield sentence

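# print one sentence as tab-separated CoNLL-style columns:
# position, word, lemma, POS, POS, placeholder '-', head position, dependency label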
def write(sentence):
    for i, w in enumerate(sentence):
        sys.stdout.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(i+1, w['word'], w['lemma'], w['pos'], w['pos'], '-', w['head'], w['label']))

if __name__ == '__main__':
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    args = parse_args()

    stanford = process_stanford(sys.stdin, args.java, args.corenlp, args.corenlp_models)
    for sentence in get_sentences(codecs.getreader('UTF-8')(stanford)):
        write(sentence)
        sys.stdout.write('\n')
