From f1dae3eb95dc7b9ad5e5796e70e32aa55021d9dd Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 11:40:09 -0700 Subject: [PATCH 01/14] Add Python build artifacts to gitignore. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 9be761ee..f972b157 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,10 @@ test-suite.log /gumbo-[0-9].[0-9].tar.gz /gumbo-[0-9].[0-9]/ +# Python dist artifacts +dist +python/gumbo.egg-info + # Example binaries clean_text find_links From 8cd6d980ebc93c8d2f965c79908b5a9c0ded7571 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 11:41:35 -0700 Subject: [PATCH 02/14] Change {{VERSION}} placeholder to the current version 0.9.1. Initially this was set via shell script during the export process from Google's repository to the open-source one, but now that most changes are coming from the open-source repository, it doesn't make sense. Since it's only in 2 places and changes will likely be infrequent, it's easier to update it manually. --- Doxyfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doxyfile b/Doxyfile index 7a5bb472..f99fbd6e 100644 --- a/Doxyfile +++ b/Doxyfile @@ -32,7 +32,7 @@ PROJECT_NAME = "Gumbo" # This could be handy for archiving the generated documentation or # if some version control system is used. 
-PROJECT_NUMBER = {{VERSION}} +PROJECT_NUMBER = 0.9.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer diff --git a/setup.py b/setup.py index 849408e4..d92863f4 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def readme(): return f.read() setup(name='gumbo', - version='{{VERSION}}', + version='0.9.1', description='Python bindings for Gumbo HTML parser', long_description=readme(), url='http://github.com/google/gumbo-parser', From 4fe876dca3c5c7ca8c7265596344d18d47c38326 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 12:47:35 -0700 Subject: [PATCH 03/14] Tuned the Python long-description to use reStructured Text, and customized it for Python usage. --- setup.py | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index d92863f4..b120dd0b 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,154 @@ #!/usr/bin/env python from setuptools import setup -def readme(): - with open('README.md') as f: - return f.read() +README = '''Gumbo - A pure-C HTML5 parser. +============================== + +Gumbo is an implementation of the `HTML5 parsing algorithm `_ implemented +as a pure C99 library with no outside dependencies. It's designed to serve +as a building block for other tools and libraries such as linters, +validators, templating languages, and refactoring and analysis tools. This +package contains the library itself, Python ctypes bindings for the library, and +adapters for html5lib and BeautifulSoup (3.2) that give it the same API as those +libraries. + +Goals & features: +----------------- + +- Robust and resilient to bad input. + +- Simple API that can be easily wrapped by other languages. + +- Support for source locations and pointers back to the original text. + +- Relatively lightweight, with no outside dependencies.
+ +- Passes all `html5lib-0.95 tests `_. + +- Tested on over 2.5 billion pages from Google's index. + +Non-goals: +---------- + +- Execution speed. Gumbo gains some of this by virtue of being written in + C, but it is not an important consideration for the intended use-case, and + was not a major design factor. + +- Support for encodings other than UTF-8. For the most part, client code + can convert the input stream to UTF-8 text using another library before + processing. + +- Security. Gumbo was initially designed for a product that worked with + trusted input files only. We're working to harden this and make sure that it + behaves as expected even on malicious input, but for now, Gumbo should only be + run on trusted input or within a sandbox. + +- C89 support. Most major compilers support C99 by now; the major exception + (Microsoft Visual Studio) should be able to compile this in C++ mode with + relatively few changes. (Bug reports welcome.) + +Wishlist (aka "We couldn't get these into the original release, but are +hoping to add them soon"): + +- Support for recent HTML5 spec changes to support the template tag. + +- Support for fragment parsing. + +- Full-featured error reporting. + +- Bindings in other languages. + +Installation +------------ + +``pip install gumbo`` should do it. If you have a local copy, ``python +setup.py install`` from the root directory. + +The `html5lib `_ and +`BeautifulSoup `_ adapters +require that their respective libraries be installed separately to work. + +Basic Usage +----------- + +For the ctypes bindings: + +.. code-block:: python + + from gumbo import gumboc + + with gumboc.parse(text) as output: + root = output.contents.root.contents + # root is a Node object representing the root of the parse tree + # tree-walk over it as necessary. + +For the BeautifulSoup bindings: + +.. 
code-block:: python + + from gumbo import soup_adapter + + soup = soup_adapter.parse(text) + # soup is a BeautifulSoup object representing the parse tree. + +For the html5lib bindings: + +.. code-block:: python + + from gumbo import html5lib_adapter + + doc = html5lib_adapter.parse(text[, treebuilder='lxml']) + +Recommended best-practice for Python usage is to use one of the adapters to +an existing API (personally, I prefer BeautifulSoup) and write your program +in terms of those. The raw CTypes bindings should be considered building +blocks for higher-level libraries and rarely referenced directly. + +See the source code, Pydoc, and implementation of soup_adapter and +html5lib_adapter for more information. + +A note on API/ABI compatibility +------------------------------- + +We'll make a best effort to preserve API compatibility between releases. +The initial release is a 0.9 (beta) release to solicit comments from early +adopters, but if no major problems are found with the API, a 1.0 release +will follow shortly, and the API of that should be considered stable. If +changes are necessary, we follow `semantic versioning <http://semver.org/>`_. + +We make no such guarantees about the ABI, and it's very likely that +subsequent versions may require a recompile of client code. For this +reason, we recommend NOT using Gumbo data structures throughout a program, +and instead limiting them to a translation layer that picks out whatever +data is needed from the parse tree and then converts that to persistent +data structures more appropriate for the application. The API is +structured to encourage this use, with a single delete function for the +whole parse tree, and is not designed with mutation in mind. + +Most of this is transparent to Python usage, as the Python adapters are all +built with this in mind. However, since ctypes requires ABI compatibility, it +does mean you'll have to re-deploy the gumboc library and C extension when +upgrading to a new version. 
+''' + +classifiers = [ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: Unix', + 'Programming Language :: C', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing :: Markup :: HTML' +] setup(name='gumbo', - version='0.9.1', + version='0.9.4', description='Python bindings for Gumbo HTML parser', - long_description=readme(), + long_description=README, url='http://github.com/google/gumbo-parser', keywords='gumbo html html5 parser google html5lib beautifulsoup', author='Jonathan Tang', From cf807fa1d8a1e7f36de024302e44317872c902ce Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 12:49:10 -0700 Subject: [PATCH 04/14] Bumped up development status to production/stable. This library's well-tested enough that it's of comparable stability to other production/stable libraries, despite not yet reaching 1.0. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b120dd0b..b007a807 100644 --- a/setup.py +++ b/setup.py @@ -132,7 +132,7 @@ ''' classifiers = [ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: Apache Software License', 'Operating System :: Unix', From 54debd0bc57bf552dd1fb8dd8cd1991191530cea Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 12:59:28 -0700 Subject: [PATCH 05/14] Add POSIX :: Linux to PyPi classifiers. 
--- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index b007a807..6271fd24 100644 --- a/setup.py +++ b/setup.py @@ -136,6 +136,7 @@ 'Intended Audience :: Developers', 'License :: OSI Approved :: Apache Software License', 'Operating System :: Unix', + 'Operating System :: POSIX :: Linux', 'Programming Language :: C', 'Programming Language :: Python', 'Programming Language :: Python :: 2', From dd62757a9a1abbeccdcf49c0b2f21b1c4ef5ea43 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 12:59:32 -0700 Subject: [PATCH 06/14] Revert "Bumped up development status to production/stable. This library's well-tested enough that it's of comparable stability to other production/stable libraries, despite not yet reaching 1.0." On second thought - can't hurt to be conservative for now and get some people using it off PyPI before claiming production quality. This reverts commit cf807fa1d8a1e7f36de024302e44317872c902ce. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6271fd24..819a8fb1 100644 --- a/setup.py +++ b/setup.py @@ -132,7 +132,7 @@ ''' classifiers = [ - 'Development Status :: 5 - Production/Stable', + 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'License :: OSI Approved :: Apache Software License', 'Operating System :: Unix', From 3d1c75d48e00fa77411186ac07370b07b798aa4d Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 13:57:20 -0700 Subject: [PATCH 07/14] Add Python build directory to gitignore. 
--- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f972b157..0d67508e 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ test-suite.log # Python dist artifacts dist +build python/gumbo.egg-info # Example binaries From 5cb77f8a1ecb9b5924f0409c9527c37e0e37f8ea Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 14:32:04 -0700 Subject: [PATCH 08/14] Updated Python examples to match import structure. --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 819a8fb1..26964de5 100644 --- a/setup.py +++ b/setup.py @@ -75,9 +75,9 @@ .. code-block:: python - from gumbo import gumboc + import gumbo - with gumboc.parse(text) as output: + with gumbo.parse(text) as output: root = output.contents.root.contents # root is a Node object representing the root of the parse tree # tree-walk over it as necessary. @@ -86,18 +86,18 @@ .. code-block:: python - from gumbo import soup_adapter + import gumbo - soup = soup_adapter.parse(text) + soup = gumbo.soup_parse(text) # soup is a BeautifulSoup object representing the parse tree. For the html5lib bindings: .. code-block:: python - from gumbo import html5lib_adapter + from gumbo import html5lib - doc = html5lib_adapter.parse(text[, treebuilder='lxml']) + doc = html5lib.parse(text[, treebuilder='lxml']) Recommended best-practice for Python usage is to use one of the adapters to an existing API (personally, I prefer BeautifulSoup) and write your program From b47cb608ffbccdba630c94b9b33b96e5d8c7c4ae Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 14:33:05 -0700 Subject: [PATCH 09/14] Catch ImportErrors around html5lib_adapter and soup_adapter, so the library can be used even if those pre-reqs aren't installed. 
--- python/gumbo/__init__.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/gumbo/__init__.py b/python/gumbo/__init__.py index a8ea8517..ac5af545 100644 --- a/python/gumbo/__init__.py +++ b/python/gumbo/__init__.py @@ -31,5 +31,15 @@ """ from gumbo.gumboc import * -from gumbo import html5lib_adapter as html5lib -from gumbo.soup_adapter import parse as soup_parse + +try: + from gumbo import html5lib_adapter as html5lib +except ImportError: + # html5lib not installed + pass + +try: + from gumbo.soup_adapter import parse as soup_parse +except ImportError: + # BeautifulSoup not installed + pass From 6d47aca6651c5538d0777cb0768e68345d2b15c1 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 14:33:28 -0700 Subject: [PATCH 10/14] Try to load the DLL from .libs first, to pick up development changes and allow for Python packaging. --- python/gumbo/gumboc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index 99875e23..f3967103 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -24,9 +24,15 @@ import contextlib import ctypes - +import os.path try: + # First look for a freshly-built .so in the .libs directory, for development + # and PyPI packaging. + _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), '..', '..', '.libs', 'libgumbo.so')) +except OSError: + # System library, on unix _dll = ctypes.cdll.LoadLibrary('libgumbo.so') except OSError: # MacOS X From 44a9a850d1dcaba253581c467c6c7b5dfd66cd3e Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 17:29:02 -0700 Subject: [PATCH 11/14] Make setuptools install libgumbo.so as well so the library can be used through pip install. 
--- .gitignore | 2 ++ python/gumbo/gumboc.py | 7 +++++-- setup.py | 19 +++++++++++++++++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 14150c22..cd3878fb 100644 --- a/.gitignore +++ b/.gitignore @@ -49,9 +49,11 @@ test-suite.log /gumbo-[0-9].[0-9]/ # Python dist artifacts +*.pyc dist build python/gumbo.egg-info +python/gumbo/libgumbo.so # Example binaries clean_text diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index f3967103..e90ad788 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -27,10 +27,13 @@ import os.path try: - # First look for a freshly-built .so in the .libs directory, for development - # and PyPI packaging. + # First look for a freshly-built .so in the .libs directory, for development. _dll = ctypes.cdll.LoadLibrary(os.path.join( os.path.dirname(__file__), '..', '..', '.libs', 'libgumbo.so')) +except OSError: + # PyPI or setuptools install, look in the current directory. + _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), 'libgumbo.so')) except OSError: # System library, on unix _dll = ctypes.cdll.LoadLibrary('libgumbo.so') diff --git a/setup.py b/setup.py index 26964de5..e79cb369 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,18 @@ #!/usr/bin/env python from setuptools import setup +from setuptools.command.sdist import sdist + +class CustomSdistCommand(sdist): + """Customized Sdist command, to copy libgumbo.so into the Python directory + so that it can be installed with `pip install`.""" + def run(self): + try: + import shutil + shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so') + sdist.run(self) + except IOError as e: + print e + README = '''Gumbo - A pure-C HTML5 parser. 
============================== @@ -147,7 +160,7 @@ ] setup(name='gumbo', - version='0.9.4', + version='0.9.8', description='Python bindings for Gumbo HTML parser', long_description=README, url='http://github.com/google/gumbo-parser', @@ -157,4 +170,6 @@ license='Apache 2.0', packages=['gumbo'], package_dir={'': 'python'}, - zip_safe=True) + package_data={'gumbo': ['libgumbo.so']}, + cmdclass={ 'sdist': CustomSdistCommand }, + zip_safe=False) From aca2844c85dafcbbb168c28edcfe4b0701e24607 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 17:31:16 -0700 Subject: [PATCH 12/14] Add classifiers. Also update my e-mail to point to the non-Google one, since the Google one now bounces. --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e79cb369..87ede18f 100644 --- a/setup.py +++ b/setup.py @@ -144,7 +144,7 @@ def run(self): upgrading to a new version. ''' -classifiers = [ +CLASSIFIERS = [ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'License :: OSI Approved :: Apache Software License', @@ -160,14 +160,15 @@ def run(self): ] setup(name='gumbo', - version='0.9.8', + version='0.9.9', description='Python bindings for Gumbo HTML parser', long_description=README, url='http://github.com/google/gumbo-parser', keywords='gumbo html html5 parser google html5lib beautifulsoup', author='Jonathan Tang', - author_email='jdtang@google.com', + author_email='jonathan.d.tang@gmail.com', license='Apache 2.0', + classifiers=CLASSIFIERS, packages=['gumbo'], package_dir={'': 'python'}, package_data={'gumbo': ['libgumbo.so']}, From f1ad6a7fac8c23b2d0f21529fbb215b23ff68b98 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 18:27:35 -0700 Subject: [PATCH 13/14] Support Python 3 now. 
--- python/gumbo/gumboc.py | 70 +++++++++++++++++++++++++++++------------- setup.py | 5 +-- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index e90ad788..b88a1342 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -45,22 +45,31 @@ _bitvector = ctypes.c_uint _Ptr = ctypes.POINTER - -class Enum(ctypes.c_uint): - class __metaclass__(type(ctypes.c_uint)): - def __new__(metaclass, name, bases, cls_dict): - cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict) - if name == 'Enum': - return cls - try: - for i, value in enumerate(cls_dict['_values_']): - setattr(cls, value, cls.from_param(i)) - except KeyError: - raise ValueError('No _values_ list found inside enum type.') - except TypeError: - raise ValueError('_values_ must be a list of names of enum constants.') +class EnumMetaclass(type(ctypes.c_uint)): + def __new__(metaclass, name, bases, cls_dict): + cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict) + if name == 'Enum': return cls - + try: + for i, value in enumerate(cls_dict['_values_']): + setattr(cls, value, cls.from_param(i)) + except KeyError: + raise ValueError('No _values_ list found inside enum type.') + except TypeError: + raise ValueError('_values_ must be a list of names of enum constants.') + return cls + +def with_metaclass(mcls): + def decorator(cls): + body = vars(cls).copy() + # clean out class body + body.pop('__dict__', None) + body.pop('__weakref__', None) + return mcls(cls.__name__, cls.__bases__, body) + return decorator + +@with_metaclass(EnumMetaclass) +class Enum(ctypes.c_uint): @classmethod def from_param(cls, param): if isinstance(param, Enum): @@ -154,18 +163,30 @@ def __init__(self, vector): def __iter__(self): return self - def next(self): + def __next__(self): + # Python 3 if self.current >= self.vector.length: raise StopIteration obj = self.vector[self.current] self.current += 1 return obj + def next(self): + # Python 2 + 
return self.__next__() + def __len__(self): return self.length def __getitem__(self, i): - if isinstance(i, (int, long)): + try: + # Python 2 + numeric_types = (int, long) + except NameError: + # Python 3 + numeric_types = int + + if isinstance(i, numeric_types): if i < 0: i += self.length if i > self.length: @@ -433,8 +454,9 @@ class NodeUnion(ctypes.Union): class Node(ctypes.Structure): # _fields_ set later to avoid a circular reference - @property - def contents(self): + def _contents(self): + # Python3 enters an infinite loop if you use an @property within + # __getattr__, so we factor it out to a helper. if self.type == NodeType.DOCUMENT: return self.v.document elif self.type == NodeType.ELEMENT: @@ -442,11 +464,15 @@ def contents(self): else: return self.v.text + @property + def contents(self): + return self._contents() + def __getattr__(self, name): - return getattr(self.contents, name) + return getattr(self._contents(), name) def __setattr__(self, name, value): - return setattr(self.contents, name, value) + return setattr(self._contents(), name, value) def __repr__(self): return repr(self.contents) @@ -501,7 +527,7 @@ def parse(text, **kwargs): # outlives the parse output. If we let ctypes do it automatically on function # call, it creates a temporary buffer which is destroyed when the call # completes, and then the original_text pointers point into invalid memory. - text_ptr = ctypes.c_char_p(text) + text_ptr = ctypes.c_char_p(text.encode('utf-8')) output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) try: yield output diff --git a/setup.py b/setup.py index 87ede18f..1c3d929f 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ def run(self): shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so') sdist.run(self) except IOError as e: - print e + print(e) README = '''Gumbo - A pure-C HTML5 parser. 
@@ -153,8 +153,9 @@ def run(self): 'Programming Language :: C', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML' ] From 2db1796396f0c90a20a06466abca179fcb65be57 Mon Sep 17 00:00:00 2001 From: Jonathan Tang Date: Wed, 6 Aug 2014 18:35:53 -0700 Subject: [PATCH 14/14] Bump the version number back down to 0.9.1 now that we're done testing on testpypi. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1c3d929f..13c67548 100644 --- a/setup.py +++ b/setup.py @@ -161,7 +161,7 @@ def run(self): ] setup(name='gumbo', - version='0.9.9', + version='0.9.1', description='Python bindings for Gumbo HTML parser', long_description=README, url='http://github.com/google/gumbo-parser',