Merge pull request #133 from byexamples/Issue-128-Replace-regex-engine

Issue 128 replace regex engine
byexamples · Nov 26, 2020 · 483c17c · 483c17c
2 parents c636c12 + e03fff6
commit 483c17c
Show file tree

Hide file tree

Showing 23 changed files with 112 additions and 32 deletions.
diff --git a/byexample/common.py b/byexample/common.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
-import pprint, traceback, contextlib, os, re, string, shlex, logging, time
+import pprint, traceback, contextlib, os, string, shlex, logging, time
+from . import regex as re
 '''
 >>> from byexample.common import tohuman, short_string
 >>> import time
@@ -11,7 +12,7 @@ def indent(s, indent=4):
     ''' Indent the given text.
         See doctest._indent for the code that inspired this.
         '''
-    return re.sub('(?m)^(?!$)', indent * ' ', s)
+    return re.compile('(?m)^(?!$)').sub(indent * ' ', s)
 
 
 def short_string(s, max=14, sep='..'):

diff --git a/byexample/differ.py b/byexample/differ.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from .common import colored, ShebangTemplate
-import string, re, difflib, tempfile, os, subprocess
+import string, difflib, tempfile, os, subprocess
+from . import regex as re
 
 # what unicodes are control code?
 #   import unicodedata

diff --git a/byexample/expected.py b/byexample/expected.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 from .log import clog, log_context
-import string, re, time
+import string, time
+from . import regex as re
 '''
 >>> from byexample.log import init_log_system
 >>> init_log_system()

diff --git a/byexample/finder.py b/byexample/finder.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
-import re, os
+import os
+from . import regex as re
 from .common import build_where_msg, tohuman, \
                     enhance_exceptions
 
@@ -476,14 +477,14 @@ def check_keep_matching(self, example_str, match):
         'check_and_remove_indent' and other processing functions.
 
             >>> from byexample.finder import ExampleFinder
-            >>> import re
+            >>> import byexample.regex as re
 
             >>> mfinder = ExampleFinder(0, 'utf8'); mfinder.target = 'python-prompt'
             >>> check_and_remove_indent = mfinder.check_and_remove_indent
             >>> check_keep_matching     = mfinder.check_keep_matching
 
             >>> code = '  >>> 1 + 2'
-            >>> match = re.match(r'[ ]*>>> [^\n]*', code)
+            >>> match = re.compile(r'[ ]*>>> [^\n]*').match(code)
 
             >>> code_i = check_and_remove_indent(code, '  ', (1, 2, 'foo.rst', None))
             >>> code_i != code

diff --git a/byexample/modules/clipboard.py b/byexample/modules/clipboard.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 from byexample.concern import Concern
-import re
+import byexample.regex as re
 from functools import partial
 
 stability = 'provisional'
@@ -52,9 +52,8 @@ def before_build_regex(self, example, options):
         repl = partial(
             self.repl_from_clipboard, clipboard=self.clipboard, missing=[]
         )
-        example.expected_str = re.sub(
-            self.PASTE_RE, repl, example.expected_str
-        )
+        example.expected_str = re.compile(self.PASTE_RE
+                                          ).sub(repl, example.expected_str)
 
         # do not check for missings: we assume that they are capture tags
 
@@ -75,7 +74,7 @@ def finish_parse(self, example, options, exception):
             clipboard=self.clipboard,
             missing=missing
         )
-        example.source = re.sub(self.PASTE_RE, repl, example.source)
+        example.source = re.compile(self.PASTE_RE).sub(repl, example.source)
 
         if missing:
             raise PasteError(example, missing)

diff --git a/byexample/modules/cond.py b/byexample/modules/cond.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 from byexample.concern import Concern
-import re
 from functools import partial
 
 stability = 'experimental'

diff --git a/byexample/modules/cpp.py b/byexample/modules/cpp.py
@@ -30,7 +30,8 @@
 """
 
 from __future__ import unicode_literals
-import re, sys, time
+import sys, time
+import byexample.regex as re
 from byexample.common import constant
 from byexample.parser import ExampleParser
 from byexample.runner import ExampleRunner, PexpectMixin, ShebangTemplate

diff --git a/byexample/modules/delimiters.py b/byexample/modules/delimiters.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
-import re, ast, itertools
+import ast, itertools
+import byexample.regex as re
 from byexample.finder import ZoneDelimiter
 from byexample.common import constant
 from byexample.log import clog

diff --git a/byexample/modules/elixir.py b/byexample/modules/elixir.py
@@ -49,7 +49,8 @@
 """
 
 from __future__ import unicode_literals
-import re, pexpect, sys, time
+import pexpect, sys, time
+import byexample.regex as re
 from byexample.common import constant
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/modules/gdb.py b/byexample/modules/gdb.py
@@ -10,7 +10,8 @@
 """
 
 from __future__ import unicode_literals
-import re, pexpect, sys, time
+import pexpect, sys, time
+import byexample.regex as re
 from byexample.common import constant
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/modules/javascript.py b/byexample/modules/javascript.py
@@ -33,7 +33,7 @@
 """
 
 from __future__ import unicode_literals
-import re
+import byexample.regex as re
 from byexample.common import constant, abspath
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/modules/php.py b/byexample/modules/php.py
@@ -37,7 +37,7 @@
 """
 
 from __future__ import unicode_literals
-import re
+import byexample.regex as re
 from byexample.common import constant
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/modules/python.py b/byexample/modules/python.py
@@ -23,7 +23,8 @@
 """
 
 from __future__ import unicode_literals
-import re, pexpect, sys, time
+import pexpect, sys, time
+import byexample.regex as re
 from byexample.common import constant
 from byexample.log import clog
 from byexample.parser import ExampleParser, ExtendOptionParserMixin

diff --git a/byexample/modules/ruby.py b/byexample/modules/ruby.py
@@ -53,7 +53,8 @@
 """
 
 from __future__ import unicode_literals
-import re, pexpect, sys, time
+import pexpect, sys, time
+import byexample.regex as re
 from byexample.common import constant
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/modules/shell.py b/byexample/modules/shell.py
@@ -24,7 +24,8 @@
 """
 
 from __future__ import unicode_literals
-import re, pexpect, sys, time
+import pexpect, sys, time
+import byexample.regex as re
 from byexample.common import constant, Countdown
 from byexample.parser import ExampleParser
 from byexample.finder import ExampleFinder

diff --git a/byexample/parser.py b/byexample/parser.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
-import re, shlex, argparse, bisect, collections
+import shlex, argparse, bisect, collections
+from . import regex as re
 from .common import tohuman, constant
 from .options import OptionParser, UnrecognizedOption, ExtendOptionParserMixin
 from .expected import _LinearExpected, _RegexExpected
@@ -218,7 +219,7 @@ def expected_as_regexs(
 
             >>> from byexample.parser import ExampleParser
             >>> from functools import partial
-            >>> import re
+            >>> import byexample.regex as re
 
             >>> parser = ExampleParser(0, 'utf8', None); parser.language = 'python'
             >>> _as_regexs = partial(parser.expected_as_regexs, tags_enabled=True, input_enabled=True, normalize_whitespace=False, input_prefix_len_range=(6,12))

diff --git a/byexample/parser_sm.py b/byexample/parser_sm.py
@@ -1,4 +1,4 @@
-import re
+from . import regex as re
 from .common import constant, short_string
 from .log import clog, log_context, DEBUG
 import pprint
@@ -12,7 +12,7 @@
 
 >>> from byexample.parser_sm import SM, SM_NormWS, SM_NotNormWS
 >>> from byexample.parser import ExampleParser
->>> import re
+>>> import byexample.regex as re
 >>> from functools import partial
 
 >>> parser = ExampleParser(0, 'utf8', None); parser.language = 'python'

diff --git a/byexample/regex.py b/byexample/regex.py
@@ -0,0 +1,15 @@
+import sys
+import regex
+
+
+def compile(pattern, flags=0):  # thin wrapper: forwards both arguments to regex.compile
+    return regex.compile(pattern, flags)
+
+
+escape = regex.escape  # re-exported unchanged from the regex module
+
+# Borrow from the regex module its uppercase names (the FLAGS)
+# so they are accessible when importing this module directly
+module = sys.modules[__name__]  # this very module object
+for sym in (sym for sym in dir(regex) if sym.isupper()):  # every all-uppercase name
+    setattr(module, sym, getattr(regex, sym))
diff --git a/byexample/runner.py b/byexample/runner.py
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
-import re, pexpect, time, termios, operator, os, itertools, contextlib
+import pexpect, time, termios, operator, os, itertools, contextlib
+import re as python_re
+from . import regex as re
 from functools import reduce, partial
 from .executor import TimeoutException, InputPrefixNotFound
 from .common import tohuman, ShebangTemplate, Countdown, short_string
@@ -78,6 +80,45 @@ def cancel(self, example, options):
         return False
 
 
+class PexpectSpawnAdapter(pexpect.spawn):  # pexpect.spawn that also accepts regex-module patterns
+    def compile_pattern_list(self, patterns):
+        ''' Extension of pexpect.spawn.compile_pattern_list that accepts
+            not only Python's regex objects (re module) but also Barnett's
+            regexes (third-party regex module). Strings are compiled with
+            Python's re (DOTALL, plus IGNORECASE if self.ignorecase); EOF,
+            TIMEOUT and compiled patterns of both modules are kept as-is.
+            Workaround for https://github.com/pexpect/pexpect/issues/655
+            '''
+        if patterns is None:
+            return []
+        if not isinstance(patterns, list):  # a single pattern is wrapped into a one-item list
+            patterns = [patterns]
+
+        # Allow dot to match \n
+        compile_flags = python_re.DOTALL
+        if self.ignorecase:
+            compile_flags = compile_flags | python_re.IGNORECASE
+        compiled_pattern_list = []
+        cls = pexpect.spawnbase  # NOTE(review): only used below to reach EOF and TIMEOUT
+        for idx, p in enumerate(patterns):  # idx is not used in the loop body
+            if isinstance(p, self.allowed_string_types):
+                p = self._coerce_expect_string(p)
+                compiled_pattern_list.append(
+                    python_re.compile(p, compile_flags)
+                )
+            elif p is cls.EOF:
+                compiled_pattern_list.append(cls.EOF)
+            elif p is cls.TIMEOUT:
+                compiled_pattern_list.append(cls.TIMEOUT)
+            elif isinstance(p, type(python_re.compile(''))):  # compiled by Python's re: kept as-is
+                compiled_pattern_list.append(p)
+            elif isinstance(p, type(re.compile(''))):  # <-- the workaround
+                compiled_pattern_list.append(p)
+            else:
+                self._pattern_type_err(p)  # NOTE(review): presumably raises for unsupported types - confirm
+        return compiled_pattern_list
+
+
 class PexpectMixin(object):
     def __init__(self, PS1_re, any_PS_re):
         self.PS1_re = re.compile(PS1_re)
@@ -104,7 +145,7 @@ def _spawn_interpreter(
         env.update({'LINES': str(rows), 'COLUMNS': str(cols)})
 
         self._drop_output()  # there shouldn't be any output yet but...
-        self.interpreter = pexpect.spawn(
+        self.interpreter = PexpectSpawnAdapter(
             cmd,
             echo=False,
             encoding=self.encoding,
@@ -250,7 +291,7 @@ def _change_terminal_geometry(self, rows, cols, options):
 
     @staticmethod
     def _universal_new_lines(out):
-        return re.sub(PexpectMixin.UNIV_NL, '\n', out)
+        return re.compile(PexpectMixin.UNIV_NL).sub('\n', out)
 
     def _emulate_ansi_terminal(self, chunks, join=True):
         for chunk in chunks:

diff --git a/docs/contrib/how-to-define-new-zones-where-to-find-examples.md b/docs/contrib/how-to-define-new-zones-where-to-find-examples.md
@@ -42,7 +42,7 @@ everything except the code between ``<pre>`` and ``</pre>`` tags.
 This is what you need to write:
 
 ```python
->>> import re
+>>> import byexample.regex as re
 >>> from byexample.finder import ZoneDelimiter
 
 >>> class HTMLPreBlockDelimiter(ZoneDelimiter):
@@ -64,5 +64,10 @@ or set of several extensions.
 The ``zone_regex`` method should return a regular expression to find and capture
 the zones.
 
+While you can use the standard
+[``re`` module](https://docs.python.org/3/library/re.html), it is
+recommended to use ``byexample.regex``, which has some built-in
+optimizations.
+
 And optionally, the ``get_zone`` can be overridden to post-process the captured
 string: use it to remove any spurious string that may have been captured.
diff --git a/docs/contrib/how-to-support-new-finders-and-languages.md b/docs/contrib/how-to-support-new-finders-and-languages.md
@@ -60,7 +60,7 @@ To accomplish this we need to create a regular expression to find the
 ``~~~``, where the snippet of code is and where the expected output is.
 
 ```python
->>> import re
+>>> import byexample.regex as re
 
 >>> example_re = re.compile(r'''
 ...     # begin with ~~~
@@ -107,6 +107,11 @@ The ``indent`` group is to count how many spaces are not part of the example
 and they are just for indentation: ``byexample`` will *drop* the first line that
 has a lower level of indentation and any subsequent line.
 
+While you can use the standard
+[``re`` module](https://docs.python.org/3/library/re.html), it is
+recommended to use ``byexample.regex``, which has some built-in
+optimizations.
+
 ### Detect the language
 
 Then, the finder needs to determine in which language the example
@@ -351,7 +356,7 @@ you do not need to install a real ``ArnoldC`` compiler.
 ...     output = []
 ...     for line in source_code.split('\n'):
 ...         if line.startswith("TALK TO THE HAND"):
-...             to_print = re.search(r'"([^"]*)"', line).group(1)
+...             to_print = re.compile(r'"([^"]*)"').search(line).group(1)
 ...             output.append(to_print + '\n')
 ...
 ...     return '\n'.join(output)

diff --git a/setup.py b/setup.py
@@ -42,6 +42,7 @@
     'pexpect>=4,<5',     # pexpect 4.x.x required
     'appdirs>=1.4.3,<2', # appdirs 1.4.x (x >= 3) required
     'pyte==0.8.0',       # pyte exact version 0.8.0 required
+    'regex>=2017.01.12', # regex's pickle was introduced in 2016
     ]
 
 # these, on the other hand, are optional nice to have

diff --git a/test/Dockerfile b/test/Dockerfile
@@ -54,4 +54,7 @@ RUN wget https://packages.erlang-solutions.com/erlang-solutions_2.0_all.deb && d
         esl-erlang      \
         elixir
 
+RUN DEBIAN_FRONTEND=noninteractive apt-get --no-install-recommends install -y \
+        python3-dev
+
 CMD /bin/bash