From 6287c8bf4b670f6b301730b607295e7449c58dfd Mon Sep 17 00:00:00 2001
From: Jordan Morris
Date: Mon, 5 Feb 2018 17:03:07 +1300
Subject: [PATCH 1/2] Support saving and loading with a user-created tables
 file

---
 .gitignore                |   2 +
 deepdish/io/__init__.py   |   4 +-
 deepdish/io/hdf5io.py     | 231 +++++++++++++++++++++++++++-----------
 deepdish/tests/test_io.py |   8 ++
 4 files changed, 180 insertions(+), 65 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8ea3204..e81856c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@ _site
 .ruby-version
 build
 .coverage
+deepdish.egg-info/**
+dist/**
 
 # Numerous always-ignore extensions
 *.diff
diff --git a/deepdish/io/__init__.py b/deepdish/io/__init__.py
index cd3f6c4..d9cf6a4 100644
--- a/deepdish/io/__init__.py
+++ b/deepdish/io/__init__.py
@@ -8,10 +8,10 @@
     _pytables_ok = False
 
 if _pytables_ok:
-    from .hdf5io import load, save, ForcePickle, Compression
+    from .hdf5io import load, load_from_file, save, save_to_file, ForcePickle, Compression
 else:
     def _f(*args, **kwargs):
         raise ImportError("You need PyTables for this function")
-    load = save = _f
+    load = save = load_from_file = save_to_file = _f
 
-__all__ = ['load', 'save', 'ForcePickle', 'Compression']
+__all__ = ['load', 'load_from_file', 'save', 'save_to_file', 'ForcePickle', 'Compression']
diff --git a/deepdish/io/hdf5io.py b/deepdish/io/hdf5io.py
index 636e4de..44f0774 100644
--- a/deepdish/io/hdf5io.py
+++ b/deepdish/io/hdf5io.py
@@ -564,43 +564,112 @@ def save(path, data, compression='default'):
         `zlib`, since it is highly portable; for much greater speed, try for
         instance `blosc`.
 
+    See also
+    --------
+    load
+
+    """
+    with tables.open_file(path, mode='w') as h5file:
+        save_to_file(h5file, data, compression)
+
+def save_to_file(file, data, compression='default'):
+    """
+    Save any Python structure as HDF5 to a writable file-like object. It is
+    particularly suited for Numpy arrays. This function works similarly to
+    ``numpy.save``, except that if you save a Python object at the top level,
+    you do not need to issue ``data.flat[0]`` to retrieve it from inside a
+    Numpy array of type ``object``.
+
+    Some types of objects get saved natively in HDF5. The rest get serialized
+    automatically. For most needs, you should be able to stick to the natively
+    supported types, which are:
+
+    * Dictionaries
+    * Short lists and tuples (<256 in length)
+    * Basic data types (including strings and None)
+    * Numpy arrays
+    * Scipy sparse matrices
+    * Pandas ``DataFrame``, ``Series``, and ``Panel``
+    * SimpleNamespaces (for Python >= 3.3, but see note below)
+
+    A recommendation is to always convert your data to using only these
+    types. That way your data will be portable and can be opened through any
+    HDF5 reader. A class that helps you with this is
+    :class:`deepdish.util.Saveable`.
+
+    Lists and tuples are supported and can contain heterogeneous types. This
+    is mostly useful and plays well with HDF5 for short lists and tuples. If
+    you have a long list (>256 in length) it will be serialized
+    automatically. However, in such cases it is common for the elements to
+    share a type, in which case we strongly recommend converting to a Numpy
+    array first.
+
+    Note that on earlier versions of Python, SimpleNamespace objects will be
+    read back in as dictionaries.
+
+    This function requires the `PyTables <http://www.pytables.org/>`_ module
+    to be installed.
+
+    You can change the default compression method to ``blosc`` (much faster,
+    but less portable) by creating a ``~/.deepdish.conf`` with::
+
+        [io]
+            compression: blosc
+
+    This is the recommended compression method if you plan to use your HDF5
+    files exclusively through deepdish (or PyTables).
+
+    Parameters
+    ----------
+    file : file-like object
+        A writable file object to which the data is saved.
+    data : anything
+        Data to be saved. This can be anything from a Numpy array, a string,
+        an object, or a dictionary containing all of them including more
+        dictionaries.
+    compression : string or tuple
+        Set compression method, choosing from `blosc`, `zlib`, `lzo`, `bzip2`
+        and more (see PyTables documentation). It can also be specified as a
+        tuple (e.g. ``('blosc', 5)``), with the latter value specifying the
+        level of compression, choosing from 0 (no compression) to 9 (maximum
+        compression). Set to `None` to turn off compression. The default is
+        `zlib`, since it is highly portable; for much greater speed, try for
+        instance `blosc`.
+
+    See also
+    --------
+    load
+
+    """
     filters = _get_compression_filters(compression)
 
-    with tables.open_file(path, mode='w') as h5file:
-        # If the data is a dictionary, put it flatly in the root
-        group = h5file.root
-        group._v_attrs[DEEPDISH_IO_VERSION_STR] = IO_VERSION
-        idtable = {}  # dict to keep track of objects already saved
-        # Sparse matrices match isinstance(data, dict), so we'll have to be
-        # more strict with the type checking
-        if type(data) == type({}) and _dict_native_ok(data):
-            idtable[id(data)] = '/'
-            for key, value in data.items():
-                _save_level(h5file, group, value, name=key,
-                            filters=filters, idtable=idtable)
-
-        elif (_sns and isinstance(data, SimpleNamespace) and
-                _dict_native_ok(data.__dict__)):
-            idtable[id(data)] = '/'
-            group._v_attrs[DEEPDISH_IO_ROOT_IS_SNS] = True
-            for key, value in data.__dict__.items():
-                _save_level(h5file, group, value, name=key,
-                            filters=filters, idtable=idtable)
+    # If the data is a dictionary, put it flatly in the root
+    group = file.root
+    group._v_attrs[DEEPDISH_IO_VERSION_STR] = IO_VERSION
+    idtable = {}  # dict to keep track of objects already saved
+    # Sparse matrices match isinstance(data, dict), so we'll have to be
+    # more strict with the type checking
+    if type(data) == type({}) and _dict_native_ok(data):
+        idtable[id(data)] = '/'
+        for key, value in data.items():
+            _save_level(file, group, value, name=key,
+                        filters=filters, idtable=idtable)
 
-        else:
-            _save_level(h5file, group, data, name='data',
+    elif (_sns and isinstance(data, SimpleNamespace) and
+            _dict_native_ok(data.__dict__)):
+        idtable[id(data)] = '/'
+        group._v_attrs[DEEPDISH_IO_ROOT_IS_SNS] = True
+        for key, value in data.__dict__.items():
+            _save_level(file, group, value, name=key,
                         filters=filters, idtable=idtable)
-            # Mark this to automatically unpack when loaded
-            group._v_attrs[DEEPDISH_IO_UNPACK] = True
+
+    else:
+        _save_level(file, group, data, name='data',
+                    filters=filters, idtable=idtable)
+        # Mark this to automatically unpack when loaded
+        group._v_attrs[DEEPDISH_IO_UNPACK] = True
 
 
 def load(path, group=None, sel=None, unpack=False):
     """
-    Loads an HDF5 saved with `save`.
+    Loads an HDF5 file saved with `save` from a file path.
 
     This function requires the `PyTables <http://www.pytables.org/>`_ module
    to be installed.
@@ -633,47 +702,83 @@ def load(path, group=None, sel=None, unpack=False):
 
     """
     with tables.open_file(path, mode='r') as h5file:
-        pathtable = {}  # dict to keep track of objects already loaded
-        if group is not None:
-            if isinstance(group, str):
-                data = _load_specific_level(h5file, h5file, group, sel=sel,
-                                            pathtable=pathtable)
-            else:  # Assume group is a list or tuple
-                data = []
-                for g in group:
-                    data_i = _load_specific_level(h5file, h5file, g, sel=sel,
-                                                  pathtable=pathtable)
-                    data.append(data_i)
-                data = tuple(data)
+        return load_from_file(h5file, group, sel, unpack)
+
+def load_from_file(file, group=None, sel=None, unpack=False):
+    """
+    Loads an HDF5 file saved with `save` from a file-like object.
+
+    This function requires the `PyTables <http://www.pytables.org/>`_ module
+    to be installed.
+
+    Parameters
+    ----------
+    file : file-like object
+        Readable file from which to load the data.
+    group : string or list
+        Load a specific group in the HDF5 hierarchy. If `group` is a list of
+        strings, then a tuple will be returned with all the groups that were
+        specified.
+    sel : slice or tuple of slices
+        If you specify `group` and the target is a numpy array, then you can
+        use this to slice it. This is useful for opening subsets of large
+        HDF5 files. To compose the selection, you can use `deepdish.aslice`.
+    unpack : bool
+        If True, a single-entry dictionary will be unpacked and the value
+        will be returned directly. That is, if you save ``dict(a=100)``, only
+        ``100`` will be loaded.
+
+    Returns
+    -------
+    data : anything
+        Hopefully an identical reconstruction of the data that was saved.
+
+    See also
+    --------
+    save
+
+    """
+    pathtable = {}  # dict to keep track of objects already loaded
+    if group is not None:
+        if isinstance(group, str):
+            data = _load_specific_level(file, file, group, sel=sel,
+                                        pathtable=pathtable)
+        else:  # Assume group is a list or tuple
+            data = []
+            for g in group:
+                data_i = _load_specific_level(file, file, g, sel=sel,
+                                              pathtable=pathtable)
+                data.append(data_i)
+            data = tuple(data)
+    else:
+        grp = file.root
+        auto_unpack = (DEEPDISH_IO_UNPACK in grp._v_attrs and
+                       grp._v_attrs[DEEPDISH_IO_UNPACK])
+        do_unpack = unpack or auto_unpack
+        if do_unpack and len(grp._v_children) == 1:
+            name = next(iter(grp._v_children))
+            data = _load_specific_level(file, grp, name, sel=sel,
+                                        pathtable=pathtable)
+            do_unpack = False
+        elif sel is not None:
+            raise ValueError("Must specify group with `sel` unless it "
+                             "automatically unpacks")
         else:
-            grp = h5file.root
-            auto_unpack = (DEEPDISH_IO_UNPACK in grp._v_attrs and
-                           grp._v_attrs[DEEPDISH_IO_UNPACK])
-            do_unpack = unpack or auto_unpack
-            if do_unpack and len(grp._v_children) == 1:
-                name = next(iter(grp._v_children))
-                data = _load_specific_level(h5file, grp, name, sel=sel,
-                                            pathtable=pathtable)
-                do_unpack = False
-            elif sel is not None:
-                raise ValueError("Must specify group with `sel` unless it "
-                                 "automatically unpacks")
-            else:
-                data = _load_level(h5file, grp, pathtable)
+            data = _load_level(file, grp, pathtable)
 
-        if DEEPDISH_IO_VERSION_STR in grp._v_attrs:
-            v = grp._v_attrs[DEEPDISH_IO_VERSION_STR]
-        else:
-            v = 0
+    if DEEPDISH_IO_VERSION_STR in grp._v_attrs:
+        v = grp._v_attrs[DEEPDISH_IO_VERSION_STR]
+    else:
+        v = 0
 
-        if v > IO_VERSION:
-            warnings.warn('This file was saved with a newer version of '
-                          'deepdish. Please upgrade to make sure it loads '
-                          'correctly.')
+    if v > IO_VERSION:
+        warnings.warn('This file was saved with a newer version of '
+                      'deepdish. Please upgrade to make sure it loads '
+                      'correctly.')
 
-        # Attributes can't be unpacked with the method above, so fall back
-        # to this
-        if do_unpack and isinstance(data, dict) and len(data) == 1:
-            data = next(iter(data.values()))
+    # Attributes can't be unpacked with the method above, so fall back
+    # to this
+    if do_unpack and isinstance(data, dict) and len(data) == 1:
+        data = next(iter(data.values()))
 
     return data
diff --git a/deepdish/tests/test_io.py b/deepdish/tests/test_io.py
index cdf476c..0db63e9 100644
--- a/deepdish/tests/test_io.py
+++ b/deepdish/tests/test_io.py
@@ -6,6 +6,7 @@
 import deepdish as dd
 import pandas as pd
 from contextlib import contextmanager
+import tables
 
 try:
     from types import SimpleNamespace
@@ -388,5 +389,12 @@ def test_compression_true(self):
         x1 = dd.io.load(fn)
         assert (x == x1).all()
 
+    def test_reconstruction_with_consumer_owned_file(self):
+        with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE",
+                              driver_core_backing_store=0) as file:
+            x = 100
+            dd.io.save_to_file(file, x)
+            x1 = dd.io.load_from_file(file)
+            assert x == x1
+
 if __name__ == '__main__':
     unittest.main()

From 0abb106ff8b34f300e986c9dc1941e9b8d5dbb04 Mon Sep 17 00:00:00 2001
From: Jordan Morris
Date: Mon, 5 Feb 2018 17:19:59 +1300
Subject: [PATCH 2/2] Improve docs for the save_to_file / load_from_file API
 methods

---
 deepdish/io/hdf5io.py | 30 +++++++++++++++---------------
 doc/source/io.rst     | 13 +++++++++++++
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/deepdish/io/hdf5io.py b/deepdish/io/hdf5io.py
index 44f0774..f899281 100644
--- a/deepdish/io/hdf5io.py
+++ b/deepdish/io/hdf5io.py
@@ -571,7 +571,7 @@ def save(path, data, compression='default'):
     with tables.open_file(path, mode='w') as h5file:
         save_to_file(h5file, data, compression)
 
-def save_to_file(file, data, compression='default'):
+def save_to_file(tables_file, data, compression='default'):
     """
     Save any Python structure as HDF5 to a writable file-like object. It is
     particularly suited for Numpy arrays. This function works similarly to
     ``numpy.save``, except that if you save a Python object at the top level,
@@ -619,8 +619,8 @@ def save_to_file(file, data, compression='default'):
     Parameters
     ----------
-    file : file-like object
-        A writable file object to which the data is saved.
+    tables_file : PyTables file object
+        A writable PyTables file object to which the data is saved.
     data : anything
        Data to be saved.
 This can be anything from a Numpy array, a string,
         an object, or a dictionary containing all of them including more
         dictionaries.
     compression : string or tuple
         Set compression method, choosing from `blosc`, `zlib`, `lzo`, `bzip2`
         and more (see PyTables documentation). It can also be specified as a
         tuple (e.g. ``('blosc', 5)``), with the latter value specifying the
         level of compression, choosing from 0 (no compression) to 9 (maximum
         compression). Set to `None` to turn off compression. The default is
         `zlib`, since it is highly portable; for much greater speed, try for
         instance `blosc`.
@@ -641,7 +641,7 @@ def save_to_file(file, data, compression='default'):
     filters = _get_compression_filters(compression)
 
     # If the data is a dictionary, put it flatly in the root
-    group = file.root
+    group = tables_file.root
     group._v_attrs[DEEPDISH_IO_VERSION_STR] = IO_VERSION
     idtable = {}  # dict to keep track of objects already saved
     # Sparse matrices match isinstance(data, dict), so we'll have to be
@@ -649,7 +649,7 @@ def save_to_file(file, data, compression='default'):
     if type(data) == type({}) and _dict_native_ok(data):
         idtable[id(data)] = '/'
         for key, value in data.items():
-            _save_level(file, group, value, name=key,
+            _save_level(tables_file, group, value, name=key,
                         filters=filters, idtable=idtable)
 
     elif (_sns and isinstance(data, SimpleNamespace) and
@@ -657,11 +657,11 @@ def save_to_file(file, data, compression='default'):
         idtable[id(data)] = '/'
         group._v_attrs[DEEPDISH_IO_ROOT_IS_SNS] = True
         for key, value in data.__dict__.items():
-            _save_level(file, group, value, name=key,
+            _save_level(tables_file, group, value, name=key,
                         filters=filters, idtable=idtable)
 
     else:
-        _save_level(file, group, data, name='data',
+        _save_level(tables_file, group, data, name='data',
                     filters=filters, idtable=idtable)
         # Mark this to automatically unpack when loaded
         group._v_attrs[DEEPDISH_IO_UNPACK] = True
@@ -704,7 +704,7 @@ def load(path, group=None, sel=None, unpack=False):
     with tables.open_file(path, mode='r') as h5file:
         return load_from_file(h5file, group, sel, unpack)
 
-def load_from_file(file, group=None, sel=None, unpack=False):
+def load_from_file(tables_file, group=None, sel=None, unpack=False):
     """
     Loads an HDF5 file saved with `save` from a file-like object.
 
@@ -713,8 +713,8 @@ def load_from_file(file, group=None, sel=None, unpack=False):
     Parameters
     ----------
-    file : file-like object
-        Readable file from which to load the data.
+    tables_file : PyTables file object
+        Readable PyTables file from which to load the data.
    group : string or list
         Load a specific group in the HDF5 hierarchy. If `group` is a list of
         strings, then a tuple will be returned with all the groups that were
         specified.
@@ -741,30 +741,30 @@ def load_from_file(file, group=None, sel=None, unpack=False):
     pathtable = {}  # dict to keep track of objects already loaded
     if group is not None:
         if isinstance(group, str):
-            data = _load_specific_level(file, file, group, sel=sel,
+            data = _load_specific_level(tables_file, tables_file, group, sel=sel,
                                         pathtable=pathtable)
         else:  # Assume group is a list or tuple
             data = []
             for g in group:
-                data_i = _load_specific_level(file, file, g, sel=sel,
+                data_i = _load_specific_level(tables_file, tables_file, g, sel=sel,
                                               pathtable=pathtable)
                 data.append(data_i)
             data = tuple(data)
     else:
-        grp = file.root
+        grp = tables_file.root
         auto_unpack = (DEEPDISH_IO_UNPACK in grp._v_attrs and
                        grp._v_attrs[DEEPDISH_IO_UNPACK])
         do_unpack = unpack or auto_unpack
         if do_unpack and len(grp._v_children) == 1:
             name = next(iter(grp._v_children))
-            data = _load_specific_level(file, grp, name, sel=sel,
+            data = _load_specific_level(tables_file, grp, name, sel=sel,
                                         pathtable=pathtable)
             do_unpack = False
         elif sel is not None:
             raise ValueError("Must specify group with `sel` unless it "
                              "automatically unpacks")
         else:
-            data = _load_level(file, grp, pathtable)
+            data = _load_level(tables_file, grp, pathtable)
 
     if DEEPDISH_IO_VERSION_STR in grp._v_attrs:
         v = grp._v_attrs[DEEPDISH_IO_VERSION_STR]
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 19bfb94..1042b04 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -36,6 +36,19 @@
 We can now reconstruct the dictionary from the file using
 
 >>> d = dd.io.load('test.h5')
 
+An alternative ``save_to_file`` / ``load_from_file`` API exists, where you
+control the PyTables file object yourself, e.g.:
+
+>>> import tables
+>>> with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE",
+...                       driver_core_backing_store=0) as my_file:
+...     d = {'foo': np.arange(10), 'bar': np.ones((5, 4, 3))}
+...     dd.io.save_to_file(my_file, d)
+...     # TODO: Save the file somewhere, such as cloud storage.
+...     # TODO: Load the file from the cloud.
+...     d = dd.io.load_from_file(my_file)
+
+This is useful if, for example, you want to read and write HDF5 data
+somewhere other than a local disk. In all other respects these methods
+behave the same as ``save`` and ``load``.
 
 Dictionaries
 ------------
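A note on the two ``TODO`` lines in the example above: PyTables can hand you
the saved HDF5 file as raw bytes, which is one way to shuttle the data to and
from remote storage. The sketch below is a minimal round trip under that
assumption; it is not part of this patch, and the ``upload``/``download``
calls are hypothetical placeholders for whatever storage client you use.
``File.get_file_image()`` and the ``driver_core_image`` parameter are
documented PyTables features for in-memory HDF5 files::

    import numpy as np
    import tables
    import deepdish as dd

    d = {'foo': np.arange(10), 'bar': np.ones((5, 4, 3))}

    # Save into an in-memory HDF5 file and extract its raw byte image.
    with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE",
                          driver_core_backing_store=0) as f:
        dd.io.save_to_file(f, d)
        image = f.get_file_image()  # bytes of the complete HDF5 file

    # upload(image)       # hypothetical: push the bytes to remote storage
    # image = download()  # hypothetical: fetch them back later

    # Reopen the byte image read-only and reconstruct the data.
    with tables.open_file("in_memory.h5", "r", driver="H5FD_CORE",
                          driver_core_image=image,
                          driver_core_backing_store=0) as f:
        d2 = dd.io.load_from_file(f)

    assert (d['foo'] == d2['foo']).all()

Because the byte image is a complete, self-contained HDF5 file, the transport
layer never needs to know about deepdish, and any HDF5 reader can open what
arrives on the other side.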