Merge pull request #243 from nostrademons/pypi_fixes

Pypi fixes
google · Aug 7, 2014 · 4d4c8e6 · 4d4c8e6
2 parents 3a61e9a + 2db1796
commit 4d4c8e6
Show file tree

Hide file tree

Showing 5 changed files with 243 additions and 33 deletions.
diff --git a/.gitignore b/.gitignore
@@ -48,6 +48,13 @@ test-suite.log
 /gumbo-[0-9].[0-9].tar.gz
 /gumbo-[0-9].[0-9]/
 
+# Python dist artifacts
+*.pyc
+dist
+build
+python/gumbo.egg-info
+python/gumbo/libgumbo.so
+
 # Example binaries
 clean_text
 find_links

diff --git a/Doxyfile b/Doxyfile
@@ -32,7 +32,7 @@ PROJECT_NAME           = "Gumbo"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = {{VERSION}}
+PROJECT_NUMBER         = 0.9.1
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer

diff --git a/python/gumbo/__init__.py b/python/gumbo/__init__.py
@@ -31,5 +31,15 @@
 """
 
 from gumbo.gumboc import *
-from gumbo import html5lib_adapter as html5lib
-from gumbo.soup_adapter import parse as soup_parse
+
+try:
+  from gumbo import html5lib_adapter as html5lib
+except ImportError:
+  # html5lib not installed
+  pass
+
+try:
+  from gumbo.soup_adapter import parse as soup_parse
+except ImportError:
+  # BeautifulSoup not installed
+  pass
diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
@@ -24,9 +24,18 @@
 
 import contextlib
 import ctypes
-
+import os.path
 
 try:
+  # First look for a freshly-built .so in the .libs directory, for development.
+  _dll = ctypes.cdll.LoadLibrary(os.path.join(
+      os.path.dirname(__file__), '..', '..', '.libs', 'libgumbo.so'))
+except OSError:
+  # PyPI or setuptools install, look in the current directory.
+  _dll = ctypes.cdll.LoadLibrary(os.path.join(
+      os.path.dirname(__file__), 'libgumbo.so'))
+except OSError:
+  # System library, on unix
   _dll = ctypes.cdll.LoadLibrary('libgumbo.so')
 except OSError:
   # MacOS X
@@ -36,22 +45,31 @@
 _bitvector = ctypes.c_uint
 _Ptr = ctypes.POINTER
 
-
-class Enum(ctypes.c_uint):
-  class __metaclass__(type(ctypes.c_uint)):
-    def __new__(metaclass, name, bases, cls_dict):
-      cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
-      if name == 'Enum':
-        return cls
-      try:
-        for i, value in enumerate(cls_dict['_values_']):
-          setattr(cls, value, cls.from_param(i))
-      except KeyError:
-        raise ValueError('No _values_ list found inside enum type.')
-      except TypeError:
-        raise ValueError('_values_ must be a list of names of enum constants.')
+class EnumMetaclass(type(ctypes.c_uint)):
+  def __new__(metaclass, name, bases, cls_dict):
+    cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
+    if name == 'Enum':
       return cls
-
+    try:
+      for i, value in enumerate(cls_dict['_values_']):
+        setattr(cls, value, cls.from_param(i))
+    except KeyError:
+      raise ValueError('No _values_ list found inside enum type.')
+    except TypeError:
+      raise ValueError('_values_ must be a list of names of enum constants.')
+    return cls
+
+def with_metaclass(mcls):
+    def decorator(cls):
+        body = vars(cls).copy()
+        # clean out class body
+        body.pop('__dict__', None)
+        body.pop('__weakref__', None)
+        return mcls(cls.__name__, cls.__bases__, body)
+    return decorator
+
+@with_metaclass(EnumMetaclass)
+class Enum(ctypes.c_uint):
   @classmethod
   def from_param(cls, param):
     if isinstance(param, Enum):
@@ -145,18 +163,30 @@ def __init__(self, vector):
     def __iter__(self):
       return self
 
-    def next(self):
+    def __next__(self):
+      # Python 3
       if self.current >= self.vector.length:
         raise StopIteration
       obj = self.vector[self.current]
       self.current += 1
       return obj
 
+    def next(self):
+      # Python 2
+      return self.__next__()
+
   def __len__(self):
     return self.length
 
   def __getitem__(self, i):
-    if isinstance(i, (int, long)):
+    try:
+      # Python 2
+      numeric_types = (int, long)
+    except NameError:
+      # Python 3
+      numeric_types = int
+
+    if isinstance(i, numeric_types):
       if i < 0:
         i += self.length
       if i > self.length:
@@ -424,20 +454,25 @@ class NodeUnion(ctypes.Union):
 class Node(ctypes.Structure):
   # _fields_ set later to avoid a circular reference
 
-  @property
-  def contents(self):
+  def _contents(self):
+    # Python3 enters an infinite loop if you use an @property within
+    # __getattr__, so we factor it out to a helper.
     if self.type == NodeType.DOCUMENT:
       return self.v.document
     elif self.type == NodeType.ELEMENT:
       return self.v.element
     else:
       return self.v.text
 
+  @property
+  def contents(self):
+    return self._contents()
+
   def __getattr__(self, name):
-    return getattr(self.contents, name)
+    return getattr(self._contents(), name)
 
   def __setattr__(self, name, value):
-    return setattr(self.contents, name, value)
+    return setattr(self._contents(), name, value)
 
   def __repr__(self):
     return repr(self.contents)
@@ -492,7 +527,7 @@ def parse(text, **kwargs):
   # outlives the parse output.  If we let ctypes do it automatically on function
   # call, it creates a temporary buffer which is destroyed when the call
   # completes, and then the original_text pointers point into invalid memory.
-  text_ptr = ctypes.c_char_p(text)
+  text_ptr = ctypes.c_char_p(text.encode('utf-8'))
   output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
   try:
     yield output

diff --git a/setup.py b/setup.py
@@ -1,19 +1,177 @@
 #!/usr/bin/env python
 from setuptools import setup
+from setuptools.command.sdist import sdist
 
-def readme():
-  with open('README.md') as f:
-    return f.read()
+class CustomSdistCommand(sdist):
+    """Customized Sdist command, to copy libgumbo.so into the Python directory
+    so that it can be installed with `pip install`."""
+    def run(self):
+        try:
+            import shutil
+            shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so')
+            sdist.run(self)
+        except IOError as e:
+            print(e)
+
+
+README = '''Gumbo - A pure-C HTML5 parser.
+==============================
+
+Gumbo is an implementation of the `HTML5 parsing algorithm <http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12>`_ implemented
+as a pure C99 library with no outside dependencies.  It's designed to serve
+as a building block for other tools and libraries such as linters,
+validators, templating languages, and refactoring and analysis tools.  This
+package contains the library itself, Python ctypes bindings for the library, and
+adapters for html5lib and BeautifulSoup (3.2) that give it the same API as those
+libraries.
+
+Goals & features:
+-----------------
+
+- Robust and resilient to bad input.
+
+- Simple API that can be easily wrapped by other languages.
+
+- Support for source locations and pointers back to the original text.
+
+- Relatively lightweight, with no outside dependencies.
+
+- Passes all `html5lib-0.95 tests <https://github.com/html5lib/html5lib-tests>`_.
+
+- Tested on over 2.5 billion pages from Google's index.
+
+Non-goals:
+----------
+
+- Execution speed.  Gumbo gains some of this by virtue of being written in
+  C, but it is not an important consideration for the intended use-case, and
+  was not a major design factor.
+
+- Support for encodings other than UTF-8.  For the most part, client code
+  can convert the input stream to UTF-8 text using another library before
+  processing.
+
+- Security.  Gumbo was initially designed for a product that worked with
+  trusted input files only.  We're working to harden this and make sure that it
+  behaves as expected even on malicious input, but for now, Gumbo should only be
+  run on trusted input or within a sandbox.
+
+- C89 support.  Most major compilers support C99 by now; the major exception
+  (Microsoft Visual Studio) should be able to compile this in C++ mode with
+  relatively few changes.  (Bug reports welcome.)
+
+Wishlist (aka "We couldn't get these into the original release, but are
+hoping to add them soon"):
+
+- Support for recent HTML5 spec changes to support the template tag.
+
+- Support for fragment parsing.
+
+- Full-featured error reporting.
+
+- Bindings in other languages.
+
+Installation
+------------
+
+``pip install gumbo`` should do it.  If you have a local copy, ``python
+setup.py install`` from the root directory.
+
+The `html5lib <https://pypi.python.org/pypi/html5lib/0.999>`_ and
+`BeautifulSoup <https://pypi.python.org/pypi/BeautifulSoup/3.2.1>`_ adapters
+require that their respective libraries be installed separately to work.
+
+Basic Usage
+-----------
+
+For the ctypes bindings:
+
+.. code-block:: python
+
+    import gumbo
+    
+    with gumbo.parse(text) as output:
+        root = output.contents.root.contents
+        # root is a Node object representing the root of the parse tree
+        # tree-walk over it as necessary.
+
+For the BeautifulSoup bindings:
+
+.. code-block:: python
+
+    import gumbo
+
+    soup = gumbo.soup_parse(text)
+    # soup is a BeautifulSoup object representing the parse tree.
+
+For the html5lib bindings:
+
+.. code-block:: python
+
+    from gumbo import html5lib
+
+    doc = html5lib.parse(text[, treebuilder='lxml'])
+
+Recommended best-practice for Python usage is to use one of the adapters to
+an existing API (personally, I prefer BeautifulSoup) and write your program
+in terms of those.  The raw CTypes bindings should be considered building
+blocks for higher-level libraries and rarely referenced directly.
+
+See the source code, Pydoc, and implementation of soup_adapter and
+html5lib_adapter for more information.
+
+A note on API/ABI compatibility
+-------------------------------
+
+We'll make a best effort to preserve API compatibility between releases.
+The initial release is a 0.9 (beta) release to solicit comments from early
+adopters, but if no major problems are found with the API, a 1.0 release
+will follow shortly, and the API of that should be considered stable.  If
+changes are necessary, we follow `semantic versioning <http://semver.org/>`_.
+
+We make no such guarantees about the ABI, and it's very likely that
+subsequent versions may require a recompile of client code.  For this
+reason, we recommend NOT using Gumbo data structures throughout a program,
+and instead limiting them to a translation layer that picks out whatever
+data is needed from the parse tree and then converts that to persistent
+data structures more appropriate for the application.  The API is
+structured to encourage this use, with a single delete function for the
+whole parse tree, and is not designed with mutation in mind.
+
+Most of this is transparent to Python usage, as the Python adapters are all
+built with this in mind.  However, since ctypes requires ABI compatibility, it
+does mean you'll have to re-deploy the gumboc library and C extension when
+upgrading to a new version.
+'''
+
+CLASSIFIERS = [
+    'Development Status :: 4 - Beta',
+    'Intended Audience :: Developers',
+    'License :: OSI Approved :: Apache Software License',
+    'Operating System :: Unix',
+    'Operating System :: POSIX :: Linux',
+    'Programming Language :: C',
+    'Programming Language :: Python',
+    'Programming Language :: Python :: 2',
+    'Programming Language :: Python :: 2.7',
+    'Programming Language :: Python :: 3',
+    'Programming Language :: Python :: 3.4',
+    'Topic :: Software Development :: Libraries :: Python Modules',
+    'Topic :: Text Processing :: Markup :: HTML'
+]
 
 setup(name='gumbo',
-      version='{{VERSION}}',
+      version='0.9.1',
       description='Python bindings for Gumbo HTML parser',
-      long_description=readme(),
+      long_description=README,
       url='http://github.com/google/gumbo-parser',
       keywords='gumbo html html5 parser google html5lib beautifulsoup',
       author='Jonathan Tang',
-      author_email='jdtang@google.com',
+      author_email='jonathan.d.tang@gmail.com',
       license='Apache 2.0',
+      classifiers=CLASSIFIERS,
       packages=['gumbo'],
       package_dir={'': 'python'},
-      zip_safe=True)
+      package_data={'gumbo': ['libgumbo.so']},
+      cmdclass={ 'sdist': CustomSdistCommand },
+      zip_safe=False)