Merge branch 'release/0.2.1'

turicas committed Aug 10, 2016
2 parents f2b7fc3 + 7188f71 commit 0263e3c
Showing 35 changed files with 1,200 additions and 569 deletions.
45 changes: 45 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,50 @@
# rows' Log of Changes

## Version `0.2.1`

**Released on: 2016-08-10**

### Backwards Incompatible Changes

- `rows.utils.export_to_uri` signature now matches `rows.export_to_*`: first
  the `rows.Table` object, then the URI (see the sketch after this list)
- Changed the default table name in `import_from_sqlite` and
  `export_to_sqlite` (from `rows` and `rows_{number}` to `table{number}`)
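
A minimal sketch of both changes (the file names `data.csv`, `data.json` and
`data.sqlite` are hypothetical):

```python
import rows
from rows.utils import export_to_uri

table = rows.import_from_csv('data.csv')  # hypothetical input file

# Before 0.2.1 the URI came first: export_to_uri('data.json', table).
# Now the rows.Table object comes first, matching rows.export_to_*:
export_to_uri(table, 'data.json')

# The SQLite plugin now defaults to table1, table2, ... instead of
# rows / rows_{number}:
rows.export_to_sqlite(table, 'data.sqlite')  # creates table 'table1'
```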


### Bug Fixes

- [#170](https://github.com/turicas/rows/issues/170) (SQLite plugin) Error
  converting `int` and `float` when the value is `None`
- [#168](https://github.com/turicas/rows/issues/168) Use `Field.serialize`
  when the field type is unknown (affects the XLS, XLSX and SQLite plugins)
- [#167](https://github.com/turicas/rows/issues/167) Use more data to detect
  the CSV dialect, restrict the possible delimiters and fall back to the
  `excel` dialect if detection fails
- [#176](https://github.com/turicas/rows/issues/176) Fix problem with quotes
  in the CSV plugin
- [#179](https://github.com/turicas/rows/issues/179) Fix double-underscore
  problem in `rows.utils.slug` (see the sketch after this list)
- [#175](https://github.com/turicas/rows/issues/175) Fix `None`
  serialization/deserialization in all plugins (and also in field types)
- [#172](https://github.com/turicas/rows/issues/172) Expose all tables in
  `rows query` for SQLite databases
- Fix `examples/cli/convert.sh` (missing `-`)
- Avoid SQL injection in table names
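
A minimal sketch of the `slug` fix, assuming (hypothetically) that repeated
separators now collapse into a single underscore:

```python
from rows.plugins.utils import slug  # moved here from rows.utils in 0.2.1

# Before the fix, consecutive separators could leave doubled underscores;
# the input string and expected output below are hypothetical examples
assert slug('Total  de pessoas') == 'total_de_pessoas'
```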


### Enhancements and Refactorings

- Refactor `rows.utils.import_from_uri`
- Encoding and file type are better detected in `rows.utils.import_from_uri`
- Added helper functions to `rows.utils` for encoding and file type/plugin
  detection (see the sketch after this list)
- There's a better description of plugin metadata (accepted MIME types) in
  `rows.utils` (this should be refactored to live inside each plugin)
- Moved the `slug` and `ipartition` functions to `rows.plugins.utils`
- Optimize `rows query` when using only one SQLite source
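
The new helper functions can also be used directly; a minimal sketch, assuming
a local file named `data.csv` (hypothetical):

```python
from rows.utils import detect_source, import_from_source

# detect_source figures out which plugin (and encoding) handles a URI
source = detect_source('data.csv', verify_ssl=True)
print(source.plugin_name)  # e.g. 'csv'

# import_from_source then loads the data using the detected plugin
table = import_from_source(source, 'utf-8')
```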


## Version `0.2.0`

**Released on: 2016-07-15**
4 changes: 2 additions & 2 deletions README.md
@@ -342,10 +342,10 @@ file format you want. Currently we have the following plugins:
installed by default)
- TXT: use `rows.export_to_txt` (no dependencies)
- JSON: use `rows.import_from_json` and `rows.export_to_json` (no dependencies)
- HTML: use `rows.import_from_html` and `rows.export_to_html` (denpendencies
- HTML: use `rows.import_from_html` and `rows.export_to_html` (dependencies
must be installed with `pip install rows[html]`)
- XPath: use `rows.import_from_xpath` passing the following arguments:
`filename_or_fobj`, `rows_xpath` and `fields_xpath` (denpendencies must be
`filename_or_fobj`, `rows_xpath` and `fields_xpath` (dependencies must be
installed with `pip install rows[xpath]`) -- see an example in
`examples/library/ecuador_radiodifusoras.py`.
- Parquet: use `rows.import_from_parquet` passing the filename (dependencies
5 changes: 3 additions & 2 deletions examples/cli/convert.sh
@@ -8,10 +8,11 @@ rows convert --input-locale=$LOCALE --input-encoding=utf-8 $URL $FILENAME.csv
rows convert $FILENAME.csv $FILENAME.html
rows convert $FILENAME.html $FILENAME.xls
rows convert $FILENAME.xls $FILENAME.txt
rows convert $FILENAME.txt $FILENAME.sqlite
rows convert $FILENAME.txt $FILENAME.xlsx
rows convert $FILENAME.xlsx $FILENAME.sqlite
rows convert $FILENAME.sqlite $FILENAME.json
# When converting to JSON we cannot guarantee field order!

# `convert` can also sort the data before saving it into the CSV file
rows convert --input-encoding=utf-8 --input-locale=$LOCALE \
rows convert --input-locale=$LOCALE --input-encoding=utf-8 \
--order-by=^pessoas $URL $FILENAME-sorted.csv
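
A minimal Python sketch of part of this conversion chain, assuming a local
`data.csv` (hypothetical name) and the `html`/`xlsx` extras installed:

```python
import rows

table = rows.import_from_csv('data.csv')  # stands in for $FILENAME.csv
rows.export_to_html(table, 'data.html')
rows.export_to_xlsx(table, 'data.xlsx')
rows.export_to_sqlite(table, 'data.sqlite')  # default table name: 'table1'
```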
2 changes: 1 addition & 1 deletion rows/__init__.py
@@ -66,4 +66,4 @@
pass


__version__ = '0.2.0'
__version__ = '0.2.1'
88 changes: 55 additions & 33 deletions rows/cli.py
@@ -1,6 +1,6 @@
# coding: utf-8

# Copyright 2014-2015 Álvaro Justen <https://github.com/turicas/rows/>
# Copyright 2014-2016 Álvaro Justen <https://github.com/turicas/rows/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -17,11 +17,13 @@

# TODO: define exit codes
# TODO: move default options to base command
# TODO: may move all 'destination' to '--output'
# TODO: test this whole module
# TODO: add option to pass 'create_table' options in command-line (like force
# fields)

import shlex
import sqlite3
import sys

from io import BytesIO
Expand All @@ -31,7 +33,8 @@

import rows

from rows.utils import import_from_uri, export_to_uri
from rows.utils import (detect_source, export_to_uri, import_from_source,
import_from_uri)
from rows.plugins.utils import make_header


@@ -68,13 +71,7 @@ def _get_field_names(field_names, table_field_names, permit_not=False):
click.echo('Table does not have fields: {}'.format(missing), err=True)
sys.exit(1)
else:
result = []
for field_name in table_field_names:
if field_name in new_field_names:
result.append(field_name)
elif '^' + field_name in new_field_names:
result.append('^' + field_name)
return result
return new_field_names


@click.group()
@@ -115,9 +112,9 @@ def convert(input_encoding, output_encoding, input_locale, output_locale,

if output_locale is not None:
with rows.locale_context(output_locale):
export_to_uri(destination, table, encoding=output_encoding)
export_to_uri(table, destination, encoding=output_encoding)
else:
export_to_uri(destination, table, encoding=output_encoding)
export_to_uri(table, destination, encoding=output_encoding)


@cli.command(help='Join tables from `source` URIs using `key(s)` to group '
@@ -159,9 +156,9 @@ def join(input_encoding, output_encoding, input_locale, output_locale,

if output_locale is not None:
with rows.locale_context(output_locale):
export_to_uri(destination, result, encoding=output_encoding)
export_to_uri(result, destination, encoding=output_encoding)
else:
export_to_uri(destination, result, encoding=output_encoding)
export_to_uri(result, destination, encoding=output_encoding)


@cli.command(name='sum',
@@ -201,9 +198,9 @@ def sum_(input_encoding, output_encoding, input_locale, output_locale,

if output_locale is not None:
with rows.locale_context(output_locale):
export_to_uri(destination, result, encoding=output_encoding)
export_to_uri(result, destination, encoding=output_encoding)
else:
export_to_uri(destination, result, encoding=output_encoding)
export_to_uri(result, destination, encoding=output_encoding)


@cli.command(name='print', help='Print a table')
@@ -225,7 +222,6 @@ def print_(input_encoding, output_encoding, input_locale, output_locale,
err=True)
sys.exit(20)

# TODO: may use sys.stdout.encoding if output_file = '-'
output_encoding = output_encoding or sys.stdout.encoding or \
DEFAULT_OUTPUT_ENCODING

@@ -246,6 +242,8 @@ def print_(input_encoding, output_encoding, input_locale, output_locale,
if fields_except is not None:
fields_except = _get_field_names(fields_except, table_field_names)

# TODO: should set `export_fields = None` if `--fields` and
# `--fields-except` are `None`
if fields is not None and fields_except is None:
export_fields = fields
elif fields is not None and fields_except is not None:
@@ -289,35 +287,59 @@ def print_(input_encoding, output_encoding, input_locale, output_locale,
@click.argument('query', required=True)
@click.argument('sources', nargs=-1, required=True)
def query(input_encoding, output_encoding, input_locale, output_locale,
verify_ssl, fields, output, query, sources):
verify_ssl, fields, output, query, sources):

# TODO: may move all 'destination' to '--output'
# TODO: may use sys.stdout.encoding if output_file = '-'
output_encoding = output_encoding or sys.stdout.encoding or \
DEFAULT_OUTPUT_ENCODING

if not query.lower().startswith('select'):
field_names = '*' if fields is None else fields
table_names = ', '.join(['table{}'.format(index)
for index in range(1, len(sources) + 1)])
query = 'SELECT {} FROM {} WHERE {}'.format(field_names, table_names,
query)
if input_locale is not None:
with rows.locale_context(input_locale):

if len(sources) == 1:
source = detect_source(sources[0], verify_ssl=verify_ssl)

if source.plugin_name != 'sqlite':
if input_locale is not None:
with rows.locale_context(input_locale):
table = import_from_source(source, DEFAULT_INPUT_ENCODING)
else:
table = import_from_source(source, DEFAULT_INPUT_ENCODING)

sqlite_connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table,
sqlite_connection,
table_name='table1')
result = rows.import_from_sqlite(sqlite_connection, query=query)

else:
# Optimization: query the SQLite database directly
result = import_from_source(source,
DEFAULT_INPUT_ENCODING,
query=query)

else:
if input_locale is not None:
with rows.locale_context(input_locale):
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl)
for source in sources]
else:
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl)
for source in sources]
else:
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl)
for source in sources]
for source in sources]

sqlite_connection = rows.export_to_sqlite(tables[0], ':memory:',
table_name='table1')
for index, table in enumerate(tables[1:], start=2):
rows.export_to_sqlite(table, sqlite_connection,
table_name='table{}'.format(index))
sqlite_connection = sqlite3.Connection(':memory:')
for index, table in enumerate(tables, start=1):
rows.export_to_sqlite(table,
sqlite_connection,
table_name='table{}'.format(index))

result = rows.import_from_sqlite(sqlite_connection, query=query)
result = rows.import_from_sqlite(sqlite_connection, query=query)

if output is None:
fobj = BytesIO()
Expand All @@ -331,9 +353,9 @@ def query(input_encoding, output_encoding, input_locale, output_locale,
else:
if output_locale is not None:
with rows.locale_context(output_locale):
export_to_uri(output, result, encoding=output_encoding)
export_to_uri(result, output, encoding=output_encoding)
else:
export_to_uri(output, result, encoding=output_encoding)
export_to_uri(result, output, encoding=output_encoding)


if __name__ == '__main__':
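
The single-source path introduced above can be reproduced outside the CLI; a
minimal sketch, assuming a local `data.csv` (hypothetical name):

```python
import sqlite3

import rows

# What `rows query` now does for a single non-SQLite source: import it,
# dump it into an in-memory SQLite database and run the query there
table = rows.import_from_csv('data.csv')  # hypothetical source file
connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table, connection, table_name='table1')
result = rows.import_from_sqlite(connection,
                                 query='SELECT * FROM table1 LIMIT 10')

# For a single SQLite source the CLI now skips this copy entirely and
# runs the query against the original database
```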
16 changes: 4 additions & 12 deletions rows/fields.py
@@ -226,9 +226,7 @@ def serialize(cls, value, *args, **kwargs):
@classmethod
def deserialize(cls, value, *args, **kwargs):
value = super(DecimalField, cls).deserialize(value)
if is_null(value):
return None
elif isinstance(value, cls.TYPE):
if value is None or isinstance(value, cls.TYPE):
return value
elif type(value) in (int, float):
return Decimal(str(value))
@@ -370,11 +368,7 @@ class TextField(Field):

@classmethod
def deserialize(cls, value, *args, **kwargs):
value = super(TextField, cls).deserialize(value)
if value is None:
return None

if isinstance(value, cls.TYPE):
if value is None or isinstance(value, cls.TYPE):
return value
elif 'encoding' in kwargs:
return as_string(value).decode(kwargs['encoding'])
@@ -428,15 +422,12 @@ def deserialize(cls, value, *args, **kwargs):
if isinstance(value, types.UnicodeType):
value = value.encode('utf-8')

if value is None:
return None
elif isinstance(value, cls.TYPE):
if value is None or isinstance(value, cls.TYPE):
return value
else:
return json.loads(value)



AVAILABLE_FIELD_TYPES = [locals()[element] for element in __all__
if 'Field' in element and element != 'Field']

@@ -472,6 +463,7 @@ def detect_types(field_names, field_values, field_types=AVAILABLE_FIELD_TYPES,
*args, **kwargs):
"""Where the magic happens"""

# TODO: look strategy of csv.Sniffer.has_header
# TODO: may receive 'type hints'
# TODO: should support receiving unicode objects directly
# TODO: should expect data in unicode or will be able to use binary data?
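
The simplified `value is None or isinstance(value, cls.TYPE)` pattern above
makes `None` a first-class value in every field type; a minimal sketch:

```python
from rows import fields

# After the fix, deserializing None returns None instead of raising
assert fields.DecimalField.deserialize(None) is None
assert fields.TextField.deserialize(None) is None

# Already-typed values pass through unchanged
assert fields.TextField.deserialize(u'hello') == u'hello'
```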
22 changes: 17 additions & 5 deletions rows/plugins/_json.py
@@ -21,7 +21,7 @@
import decimal
import json

from rows.fields import DateField, DatetimeField, DecimalField, PercentField
from rows import fields
from rows.plugins.utils import (create_table, export_data,
get_filename_and_fobj, prepare_to_export)

@@ -42,10 +42,21 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):


def _convert(value, field_type, *args, **kwargs):
if field_type in (DateField, DatetimeField, DecimalField, PercentField):
value = field_type.serialize(value, *args, **kwargs)

return value
if value is None or field_type in (
fields.BinaryField,
fields.BoolField,
fields.FloatField,
fields.IntegerField,
fields.JSONField,
fields.TextField,
):
# If the field_type is one of those, the value can be passed directly
# to the JSON encoder
return value
else:
# The field type is not represented natively in JSON, then it needs to
# be serialized (converted to a string)
return field_type.serialize(value, *args, **kwargs)


def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None,
Expand All @@ -61,6 +72,7 @@ def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None,

result = json.dumps(data, indent=indent)
if indent is not None:
# clean up empty spaces at the end of lines
result = '\n'.join(line.rstrip() for line in result.splitlines())

return export_data(filename_or_fobj, result)
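
A minimal sketch of how `_convert` routes values (the ISO-formatted date
string below is an assumption about `DateField.serialize`'s output):

```python
import datetime

from rows import fields
from rows.plugins._json import _convert

# Natively JSON-representable values pass through untouched
assert _convert(42, fields.IntegerField) == 42
assert _convert(None, fields.DateField) is None

# Other types are serialized to strings first (assumes ISO date output)
assert _convert(datetime.date(2016, 8, 10), fields.DateField) == '2016-08-10'
```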
14 changes: 10 additions & 4 deletions rows/plugins/csv.py
@@ -24,15 +24,21 @@
from rows.plugins.utils import create_table, get_filename_and_fobj, serialize


def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, *args,
**kwargs):
sniffer = unicodecsv.Sniffer()

def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
sample_size=8192, *args, **kwargs):
'Import data from a CSV file'

filename, fobj = get_filename_and_fobj(filename_or_fobj)

if dialect is None:
sample = fobj.readline().decode(encoding)
dialect = unicodecsv.Sniffer().sniff(sample)
sample = fobj.read(sample_size)
try:
dialect = sniffer.sniff(sample, delimiters=(',', ';', '\t'))
except unicodecsv.Error:
# Could not detect dialect, fall back to 'excel'
dialect = unicodecsv.excel
fobj.seek(0)

kwargs['encoding'] = encoding
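
The new detection logic can be exercised in isolation; a minimal sketch with a
hypothetical semicolon-separated sample:

```python
import unicodecsv

sniffer = unicodecsv.Sniffer()
sample = 'a;b;c\n1;2;3\n4;5;6\n'  # hypothetical decoded sample

try:
    # only the delimiters the plugin accepts are considered
    dialect = sniffer.sniff(sample, delimiters=(',', ';', '\t'))
except unicodecsv.Error:
    # could not detect dialect, fall back to 'excel'
    dialect = unicodecsv.excel

print(dialect.delimiter)  # ';'
```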