diff --git a/.gitignore b/.gitignore index f29a5aab..cd3878fb 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,13 @@ test-suite.log /gumbo-[0-9].[0-9].tar.gz /gumbo-[0-9].[0-9]/ +# Python dist artifacts +*.pyc +dist +build +python/gumbo.egg-info +python/gumbo/libgumbo.so + # Example binaries clean_text find_links diff --git a/Doxyfile b/Doxyfile index 7a5bb472..f99fbd6e 100644 --- a/Doxyfile +++ b/Doxyfile @@ -32,7 +32,7 @@ PROJECT_NAME = "Gumbo" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = {{VERSION}} +PROJECT_NUMBER = 0.9.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer diff --git a/python/gumbo/__init__.py b/python/gumbo/__init__.py index a8ea8517..ac5af545 100644 --- a/python/gumbo/__init__.py +++ b/python/gumbo/__init__.py @@ -31,5 +31,15 @@ """ from gumbo.gumboc import * -from gumbo import html5lib_adapter as html5lib -from gumbo.soup_adapter import parse as soup_parse + +try: + from gumbo import html5lib_adapter as html5lib +except ImportError: + # html5lib not installed + pass + +try: + from gumbo.soup_adapter import parse as soup_parse +except ImportError: + # BeautifulSoup not installed + pass diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py index 99875e23..b88a1342 100644 --- a/python/gumbo/gumboc.py +++ b/python/gumbo/gumboc.py @@ -24,9 +24,18 @@ import contextlib import ctypes - +import os.path try: + # First look for a freshly-built .so in the .libs directory, for development. + _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), '..', '..', '.libs', 'libgumbo.so')) +except OSError: + # PyPI or setuptools install, look in the current directory. 
+ _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), 'libgumbo.so')) +except OSError: + # System library, on unix _dll = ctypes.cdll.LoadLibrary('libgumbo.so') except OSError: # MacOS X @@ -36,22 +45,31 @@ _bitvector = ctypes.c_uint _Ptr = ctypes.POINTER - -class Enum(ctypes.c_uint): - class __metaclass__(type(ctypes.c_uint)): - def __new__(metaclass, name, bases, cls_dict): - cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict) - if name == 'Enum': - return cls - try: - for i, value in enumerate(cls_dict['_values_']): - setattr(cls, value, cls.from_param(i)) - except KeyError: - raise ValueError('No _values_ list found inside enum type.') - except TypeError: - raise ValueError('_values_ must be a list of names of enum constants.') +class EnumMetaclass(type(ctypes.c_uint)): + def __new__(metaclass, name, bases, cls_dict): + cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict) + if name == 'Enum': return cls - + try: + for i, value in enumerate(cls_dict['_values_']): + setattr(cls, value, cls.from_param(i)) + except KeyError: + raise ValueError('No _values_ list found inside enum type.') + except TypeError: + raise ValueError('_values_ must be a list of names of enum constants.') + return cls + +def with_metaclass(mcls): + def decorator(cls): + body = vars(cls).copy() + # clean out class body + body.pop('__dict__', None) + body.pop('__weakref__', None) + return mcls(cls.__name__, cls.__bases__, body) + return decorator + +@with_metaclass(EnumMetaclass) +class Enum(ctypes.c_uint): @classmethod def from_param(cls, param): if isinstance(param, Enum): @@ -145,18 +163,30 @@ def __init__(self, vector): def __iter__(self): return self - def next(self): + def __next__(self): + # Python 3 if self.current >= self.vector.length: raise StopIteration obj = self.vector[self.current] self.current += 1 return obj + def next(self): + # Python 2 + return self.__next__() + def __len__(self): return self.length def __getitem__(self, 
i): - if isinstance(i, (int, long)): + try: + # Python 2 + numeric_types = (int, long) + except NameError: + # Python 3 + numeric_types = int + + if isinstance(i, numeric_types): if i < 0: i += self.length if i > self.length: @@ -424,8 +454,9 @@ class NodeUnion(ctypes.Union): class Node(ctypes.Structure): # _fields_ set later to avoid a circular reference - @property - def contents(self): + def _contents(self): + # Python3 enters an infinite loop if you use an @property within + # __getattr__, so we factor it out to a helper. if self.type == NodeType.DOCUMENT: return self.v.document elif self.type == NodeType.ELEMENT: @@ -433,11 +464,15 @@ def contents(self): else: return self.v.text + @property + def contents(self): + return self._contents() + def __getattr__(self, name): - return getattr(self.contents, name) + return getattr(self._contents(), name) def __setattr__(self, name, value): - return setattr(self.contents, name, value) + return setattr(self._contents(), name, value) def __repr__(self): return repr(self.contents) @@ -492,7 +527,7 @@ def parse(text, **kwargs): # outlives the parse output. If we let ctypes do it automatically on function # call, it creates a temporary buffer which is destroyed when the call # completes, and then the original_text pointers point into invalid memory. 
- text_ptr = ctypes.c_char_p(text) + text_ptr = ctypes.c_char_p(text.encode('utf-8')) output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) try: yield output diff --git a/setup.py b/setup.py index 849408e4..13c67548 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,177 @@ #!/usr/bin/env python from setuptools import setup +from setuptools.command.sdist import sdist -def readme(): - with open('README.md') as f: - return f.read() +class CustomSdistCommand(sdist): + """Customized Sdist command, to copy libgumbo.so into the Python directory + so that it can be installed with `pip install`.""" + def run(self): + try: + import shutil + shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so') + sdist.run(self) + except IOError as e: + print(e) + + +README = '''Gumbo - A pure-C HTML5 parser. +============================== + +Gumbo is an implementation of the `HTML5 parsing algorithm `_ implemented +as a pure C99 library with no outside dependencies. It's designed to serve +as a building block for other tools and libraries such as linters, +validators, templating languages, and refactoring and analysis tools. This +package contains the library itself, Python ctypes bindings for the library, and +adapters for html5lib and BeautifulSoup (3.2) that give it the same API as those +libraries. + +Goals & features: +----------------- + +- Robust and resilient to bad input. + +- Simple API that can be easily wrapped by other languages. + +- Support for source locations and pointers back to the original text. + +- Relatively lightweight, with no outside dependencies. + +- Passes all `html5lib-0.95 tests `_. + +- Tested on over 2.5 billion pages from Google's index. + +Non-goals: +---------- + +- Execution speed. Gumbo gains some of this by virtue of being written in + C, but it is not an important consideration for the intended use-case, and + was not a major design factor. + +- Support for encodings other than UTF-8. 
For the most part, client code + can convert the input stream to UTF-8 text using another library before + processing. + +- Security. Gumbo was initially designed for a product that worked with + trusted input files only. We're working to harden this and make sure that it + behaves as expected even on malicious input, but for now, Gumbo should only be + run on trusted input or within a sandbox. + +- C89 support. Most major compilers support C99 by now; the major exception + (Microsoft Visual Studio) should be able to compile this in C++ mode with + relatively few changes. (Bug reports welcome.) + +Wishlist (aka "We couldn't get these into the original release, but are +hoping to add them soon"): + +- Support for recent HTML5 spec changes to support the template tag. + +- Support for fragment parsing. + +- Full-featured error reporting. + +- Bindings in other languages. + +Installation +------------ + +```pip install gumbo``` should do it. If you have a local copy, ```python +setup.py install``` from the root directory. + +The `html5lib `_ and +`BeautifulSoup `_ adapters +require that their respective libraries be installed separately to work. + +Basic Usage +----------- + +For the ctypes bindings: + +.. code-block:: python + + import gumbo + + with gumbo.parse(text) as output: + root = output.contents.root.contents + # root is a Node object representing the root of the parse tree + # tree-walk over it as necessary. + +For the BeautifulSoup bindings: + +.. code-block:: python + + import gumbo + + soup = gumbo.soup_parse(text) + # soup is a BeautifulSoup object representing the parse tree. + +For the html5lib bindings: + +.. code-block:: python + + from gumbo import html5lib + + doc = html5lib.parse(text[, treebuilder='lxml']) + +Recommended best-practice for Python usage is to use one of the adapters to +an existing API (personally, I prefer BeautifulSoup) and write your program +in terms of those. 
The raw CTypes bindings should be considered building +blocks for higher-level libraries and rarely referenced directly. + +See the source code, Pydoc, and implementation of soup_adapter and +html5lib_adapter for more information. + +A note on API/ABI compatibility +------------------------------- + +We'll make a best effort to preserve API compatibility between releases. +The initial release is a 0.9 (beta) release to solicit comments from early +adopters, but if no major problems are found with the API, a 1.0 release +will follow shortly, and the API of that should be considered stable. If +changes are necessary, we follow [semantic versioning][]. + +We make no such guarantees about the ABI, and it's very likely that +subsequent versions may require a recompile of client code. For this +reason, we recommend NOT using Gumbo data structures throughout a program, +and instead limiting them to a translation layer that picks out whatever +data is needed from the parse tree and then converts that to persistent +data structures more appropriate for the application. The API is +structured to encourage this use, with a single delete function for the +whole parse tree, and is not designed with mutation in mind. + +Most of this is transparent to Python usage, as the Python adapters are all +built with this in mind. However, since ctypes requires ABI compatibility, it +does mean you'll have to re-deploy the gumboc library and C extension when +upgrading to a new version. 
+''' + +CLASSIFIERS = [ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: Unix', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: C', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing :: Markup :: HTML' +] setup(name='gumbo', - version='{{VERSION}}', + version='0.9.1', description='Python bindings for Gumbo HTML parser', - long_description=readme(), + long_description=README, url='http://github.com/google/gumbo-parser', keywords='gumbo html html5 parser google html5lib beautifulsoup', author='Jonathan Tang', - author_email='jdtang@google.com', + author_email='jonathan.d.tang@gmail.com', license='Apache 2.0', + classifiers=CLASSIFIERS, packages=['gumbo'], package_dir={'': 'python'}, - zip_safe=True) + package_data={'gumbo': ['libgumbo.so']}, + cmdclass={ 'sdist': CustomSdistCommand }, + zip_safe=False)