Skip to content

Commit

Permalink
Merge pull request #203 from JVickery-TBS/fix/stringish-type-guessing
Browse files Browse the repository at this point in the history
PY2 & PY3 String/Binary Fixes
  • Loading branch information
ThrawnCA authored Jun 25, 2024
2 parents dd475e7 + a688b4e commit a27b993
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 3 deletions.
9 changes: 9 additions & 0 deletions ckanext/xloader/config_declaration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ groups:
type: bool
required: false
legacy_key: ckanext.xloader.just_load_with_messytables
- key: ckanext.xloader.strict_type_guessing
default: True
example: False
description: |
Use with ckanext.xloader.use_type_guessing to enable or disable strict type guessing.
If set to False, the types will always fall back to the string type.
Strict means that a type will not be guessed if parsing fails for even a single cell in the column.
type: bool
- key: ckanext.xloader.max_type_guessing_length
default: 0
example: 100000
Expand Down
12 changes: 10 additions & 2 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import datetime
import itertools
from six import text_type as str, binary_type
import os
import os.path
import tempfile
Expand Down Expand Up @@ -384,7 +385,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
skip_rows.append({'type': 'preset', 'value': 'blank'})

TYPES, TYPE_MAPPING = get_types()
types = type_guess(stream.sample[1:], types=TYPES, strict=True)
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)

# override with types user requested
if existing_info:
Expand Down Expand Up @@ -462,12 +465,17 @@ def row_iterator():


_TYPE_MAPPING = {
"<type 'str'>": 'text',
"<type 'unicode'>": 'text',
"<type 'bytes'>": 'text',
"<type 'bool'>": 'text',
"<type 'int'>": 'numeric',
"<type 'float'>": 'numeric',
"<class 'decimal.Decimal'>": 'numeric',
"<type 'datetime.datetime'>": 'timestamp',
"<class 'str'>": 'text',
"<class 'unicode'>": 'text',
"<class 'bytes'>": 'text',
"<class 'bool'>": 'text',
"<class 'int'>": 'numeric',
"<class 'float'>": 'numeric',
Expand All @@ -476,7 +484,7 @@ def row_iterator():


def get_types():
_TYPES = [int, bool, str, datetime.datetime, float, Decimal]
_TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]
TYPE_MAPPING = config.get('TYPE_MAPPING', _TYPE_MAPPING)
return _TYPES, TYPE_MAPPING

Expand Down
8 changes: 7 additions & 1 deletion ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import json
import datetime

from six import text_type as str, binary_type

from ckan import model
from ckan.lib import search
from collections import defaultdict
Expand All @@ -24,6 +26,8 @@
"application/vnd.oasis.opendocument.spreadsheet",
]

from .job_exceptions import JobError


class XLoaderFormats(object):
formats = None
Expand Down Expand Up @@ -184,7 +188,7 @@ def headers_guess(rows, tolerance=1):
return 0, []


TYPES = [int, bool, str, datetime.datetime, float, Decimal]
TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]


def type_guess(rows, types=TYPES, strict=False):
Expand Down Expand Up @@ -245,6 +249,8 @@ def type_guess(rows, types=TYPES, strict=False):
# element in case of a tie
# See: http://stackoverflow.com/a/6783101/214950
guesses_tuples = [(t, guess[t]) for t in types if t in guess]
if not guesses_tuples:
raise JobError('Failed to guess types')
_columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
return _columns

Expand Down

0 comments on commit a27b993

Please sign in to comment.