From 9499a9056d4a3306c681651f470611bf2ee9d067 Mon Sep 17 00:00:00 2001
From: Arild
Date: Fri, 11 Nov 2016 11:30:25 +0100
Subject: [PATCH] regexp_tokenize() handling of parentheses

---
 book/ch03.rst | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/book/ch03.rst b/book/ch03.rst
index 7710ad71..f51f6ae1 100644
--- a/book/ch03.rst
+++ b/book/ch03.rst
@@ -1855,16 +1855,15 @@ NLTK's Regular Expression Tokenizer
 
 The function ``nltk.regexp_tokenize()`` is similar to ``re.findall()`` (as
 we've been using it for tokenization). However, ``nltk.regexp_tokenize()``
-is more efficient for this task, and avoids the need for special treatment of parentheses.
-For readability we break up the regular expression over several lines
-and add a comment about each line. The special ``(?x)`` "verbose flag"
-tells Python to strip out the embedded whitespace and comments.
+is more efficient for this task. For readability we break up the regular
+expression over several lines and add a comment about each line. The special
+``(?x)`` "verbose flag" tells Python to strip out the embedded whitespace and comments.
 
     >>> text = 'That U.S.A. poster-print costs $12.40...'
    >>> pattern = r'''(?x)              # set flag to allow verbose regexps
-    ...     ([A-Z]\.)+                  # abbreviations, e.g. U.S.A.
-    ...     | \w+(-\w+)*                # words with optional internal hyphens
-    ...     | \$?\d+(\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
+    ...     (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
+    ...     | \w+(?:-\w+)*              # words with optional internal hyphens
+    ...     | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
     ...     | \.\.\.                    # ellipsis
     ...     | [][.,;"'?():-_`]          # these are separate tokens; includes ], [
     ... '''
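
A minimal sketch of the patched pattern in use, assuming NLTK is installed. The point of switching to non-capturing ``(?:...)`` groups is that ``nltk.regexp_tokenize()`` then yields whole matches rather than tuples of group captures, so no special treatment of parentheses is needed; the token list shown is the book's expected doctest output:

    >>> import nltk
    >>> text = 'That U.S.A. poster-print costs $12.40...'
    >>> pattern = r'''(?x)              # set flag to allow verbose regexps
    ...     (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
    ...     | \w+(?:-\w+)*              # words with optional internal hyphens
    ...     | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    ...     | \.\.\.                    # ellipsis
    ...     | [][.,;"'?():-_`]          # these are separate tokens; includes ], [
    ... '''
    >>> nltk.regexp_tokenize(text, pattern)
    ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']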