Extended API which enables the consumer to control the PyTables file object #31

Open · wants to merge 2 commits into master
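
This PR adds save_to_file and load_from_file, which take an already-open
PyTables file object instead of a path, so the consumer controls how and
where the underlying HDF5 file lives (for example, fully in memory). A
minimal usage sketch based on the new test and documentation in this PR;
the file name and variable names are illustrative:

import numpy as np
import tables
import deepdish as dd

# The consumer opens and owns the PyTables file; here it is an in-memory
# HDF5 file (CORE driver with no on-disk backing store).
with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE",
                      driver_core_backing_store=0) as h5file:
    d = {'foo': np.arange(10), 'bar': np.ones((5, 4, 3))}
    dd.io.save_to_file(h5file, d)       # write into the caller-owned file
    d2 = dd.io.load_from_file(h5file)   # read it back from the same file
    assert (d2['foo'] == d['foo']).all()
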
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,6 +4,8 @@ _site
.ruby-version
build
.coverage
deepdish.egg-info/**
dist/**

# Numerous always-ignore extensions
*.diff
4 changes: 2 additions & 2 deletions deepdish/io/__init__.py
@@ -8,10 +8,10 @@
_pytables_ok = False

if _pytables_ok:
from .hdf5io import load, save, ForcePickle, Compression
from .hdf5io import load, load_from_file, save, save_to_file, ForcePickle, Compression
else:
def _f(*args, **kwargs):
raise ImportError("You need PyTables for this function")
load = load_from_file = save = save_to_file = _f

__all__ = ['load', 'save', 'ForcePickle', 'Compression']
__all__ = ['load', 'load_from_file', 'save', 'save_to_file', 'ForcePickle', 'Compression']
231 changes: 168 additions & 63 deletions deepdish/io/hdf5io.py
@@ -564,43 +564,112 @@ def save(path, data, compression='default'):
`zlib`, since it is highly portable; for much greater speed, try for
instance `blosc`.

See also
--------
load
"""
with tables.open_file(path, mode='w') as h5file:
save_to_file(h5file, data, compression)

def save_to_file(tables_file, data, compression='default'):
"""
Save any Python structure as HDF5 to an open, writable PyTables file
object. It is particularly suited for Numpy arrays. This function works
similarly to ``numpy.save``, except that if you save a Python object at
the top level, you do not need to issue ``data.flat[0]`` to retrieve it
from inside a Numpy array of type ``object``.

Some types of objects get saved natively in HDF5. The rest get serialized
automatically. For most needs, you should be able to stick to the natively
supported types, which are:

* Dictionaries
* Short lists and tuples (<256 in length)
* Basic data types (including strings and None)
* Numpy arrays
* Scipy sparse matrices
* Pandas ``DataFrame``, ``Series``, and ``Panel``
* SimpleNamespaces (for Python >= 3.3, but see note below)

We recommend always converting your data to use only these types.
That way your data will be portable and can be opened through any HDF5
reader. A class that helps you with this is
:class:`deepdish.util.Saveable`.

Lists and tuples are supported and can contain heterogeneous types. This
works well with HDF5 mainly for short lists and tuples. If you have a
long list (>256 elements), it will be serialized automatically. However,
in such cases it is common for the elements to have the same type, in which
case we strongly recommend converting to a Numpy array first.

Note that the SimpleNamespace type will be read back as a dictionary on
earlier versions of Python.

This function requires the `PyTables <http://www.pytables.org/>`_ module to
be installed.

You can change the default compression method to ``blosc`` (much faster,
but less portable) by creating a ``~/.deepdish.conf`` with::

[io]
compression: blosc

This is the recommended compression method if you plan to use your HDF5
files exclusively through deepdish (or PyTables).

Parameters
----------
tables_file : PyTables file object
A writable PyTables file object to which the data is saved.
data : anything
Data to be saved. This can be anything from a Numpy array, a string, an
object, or a dictionary containing all of them including more
dictionaries.
compression : string or tuple
Set compression method, choosing from `blosc`, `zlib`, `lzo`, `bzip2`
and more (see PyTables documentation). It can also be specified as a
tuple (e.g. ``('blosc', 5)``), with the latter value specifying the
level of compression, choosing from 0 (no compression) to 9 (maximum
compression). Set to `None` to turn off compression. The default is
`zlib`, since it is highly portable; for much greater speed, try for
instance `blosc`.

See also
--------
load_from_file
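
Examples
--------
A minimal sketch; the output path and data are illustrative, and the
example assumes blosc is available in your PyTables build. The caller
opens and owns the PyTables file:

>>> import tables
>>> with tables.open_file('output.h5', mode='w') as f:
...     save_to_file(f, {'x': [1, 2, 3]}, compression=('blosc', 5))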
"""
filters = _get_compression_filters(compression)

with tables.open_file(path, mode='w') as h5file:
# If the data is a dictionary, put it flatly in the root
group = h5file.root
group._v_attrs[DEEPDISH_IO_VERSION_STR] = IO_VERSION
idtable = {} # dict to keep track of objects already saved
# Sparse matrices match isinstance(data, dict), so we'll have to be
# more strict with the type checking
if type(data) == type({}) and _dict_native_ok(data):
idtable[id(data)] = '/'
for key, value in data.items():
_save_level(h5file, group, value, name=key,
filters=filters, idtable=idtable)

elif (_sns and isinstance(data, SimpleNamespace) and
_dict_native_ok(data.__dict__)):
idtable[id(data)] = '/'
group._v_attrs[DEEPDISH_IO_ROOT_IS_SNS] = True
for key, value in data.__dict__.items():
_save_level(h5file, group, value, name=key,
filters=filters, idtable=idtable)
# If the data is a dictionary, put it flatly in the root
group = tables_file.root
group._v_attrs[DEEPDISH_IO_VERSION_STR] = IO_VERSION
idtable = {} # dict to keep track of objects already saved
# Sparse matrices match isinstance(data, dict), so we'll have to be
# more strict with the type checking
if type(data) == type({}) and _dict_native_ok(data):
idtable[id(data)] = '/'
for key, value in data.items():
_save_level(tables_file, group, value, name=key,
filters=filters, idtable=idtable)

else:
_save_level(h5file, group, data, name='data',
elif (_sns and isinstance(data, SimpleNamespace) and
_dict_native_ok(data.__dict__)):
idtable[id(data)] = '/'
group._v_attrs[DEEPDISH_IO_ROOT_IS_SNS] = True
for key, value in data.__dict__.items():
_save_level(tables_file, group, value, name=key,
filters=filters, idtable=idtable)
# Mark this to automatically unpack when loaded
group._v_attrs[DEEPDISH_IO_UNPACK] = True

else:
_save_level(tables_file, group, data, name='data',
filters=filters, idtable=idtable)
# Mark this to automatically unpack when loaded
group._v_attrs[DEEPDISH_IO_UNPACK] = True


def load(path, group=None, sel=None, unpack=False):
"""
Loads an HDF5 saved with `save`.
Loads an HDF5 file saved with `save` from a file path.

This function requires the `PyTables <http://www.pytables.org/>`_ module to
be installed.
@@ -633,47 +702,83 @@ def load(path, group=None, sel=None, unpack=False):

"""
with tables.open_file(path, mode='r') as h5file:
pathtable = {} # dict to keep track of objects already loaded
if group is not None:
if isinstance(group, str):
data = _load_specific_level(h5file, h5file, group, sel=sel,
pathtable=pathtable)
else: # Assume group is a list or tuple
data = []
for g in group:
data_i = _load_specific_level(h5file, h5file, g, sel=sel,
pathtable=pathtable)
data.append(data_i)
data = tuple(data)
return load_from_file(h5file, group, sel, unpack)

def load_from_file(tables_file, group=None, sel=None, unpack=False):
"""
Loads HDF5 data saved with `save` or `save_to_file` from an open PyTables
file object.

This function requires the `PyTables <http://www.pytables.org/>`_ module to
be installed.

Parameters
----------
tables_file : PyTables file object
Readable PyTables file from which to load the data.
group : string or list
Load a specific group in the HDF5 hierarchy. If `group` is a list of
strings, then a tuple will be returned with all the groups that were
specified.
sel : slice or tuple of slices
If you specify `group` and the target is a numpy array, then you can
use this to slice it. This is useful for opening subsets of large HDF5
files. To compose the selection, you can use `deepdish.aslice`.
unpack : bool
If True, a single-entry dictionary will be unpacked and the value
will be returned directly. That is, if you save ``dict(a=100)``, only
``100`` will be loaded.

Returns
-------
data : anything
Hopefully an identical reconstruction of the data that was saved.

See also
--------
save_to_file

"""
pathtable = {} # dict to keep track of objects already loaded
if group is not None:
if isinstance(group, str):
data = _load_specific_level(tables_file, tables_file, group, sel=sel,
pathtable=pathtable)
else: # Assume group is a list or tuple
data = []
for g in group:
data_i = _load_specific_level(tables_file, tables_file, g, sel=sel,
pathtable=pathtable)
data.append(data_i)
data = tuple(data)
else:
grp = tables_file.root
auto_unpack = (DEEPDISH_IO_UNPACK in grp._v_attrs and
grp._v_attrs[DEEPDISH_IO_UNPACK])
do_unpack = unpack or auto_unpack
if do_unpack and len(grp._v_children) == 1:
name = next(iter(grp._v_children))
data = _load_specific_level(tables_file, grp, name, sel=sel,
pathtable=pathtable)
do_unpack = False
elif sel is not None:
raise ValueError("Must specify group with `sel` unless it "
"automatically unpacks")
else:
grp = h5file.root
auto_unpack = (DEEPDISH_IO_UNPACK in grp._v_attrs and
grp._v_attrs[DEEPDISH_IO_UNPACK])
do_unpack = unpack or auto_unpack
if do_unpack and len(grp._v_children) == 1:
name = next(iter(grp._v_children))
data = _load_specific_level(h5file, grp, name, sel=sel,
pathtable=pathtable)
do_unpack = False
elif sel is not None:
raise ValueError("Must specify group with `sel` unless it "
"automatically unpacks")
else:
data = _load_level(h5file, grp, pathtable)
data = _load_level(tables_file, grp, pathtable)

if DEEPDISH_IO_VERSION_STR in grp._v_attrs:
v = grp._v_attrs[DEEPDISH_IO_VERSION_STR]
else:
v = 0
if DEEPDISH_IO_VERSION_STR in grp._v_attrs:
v = grp._v_attrs[DEEPDISH_IO_VERSION_STR]
else:
v = 0

if v > IO_VERSION:
warnings.warn('This file was saved with a newer version of '
'deepdish. Please upgrade to make sure it loads '
'correctly.')
if v > IO_VERSION:
warnings.warn('This file was saved with a newer version of '
'deepdish. Please upgrade to make sure it loads '
'correctly.')

# Attributes can't be unpacked with the method above, so fall back
# to this
if do_unpack and isinstance(data, dict) and len(data) == 1:
data = next(iter(data.values()))
# Attributes can't be unpacked with the method above, so fall back
# to this
if do_unpack and isinstance(data, dict) and len(data) == 1:
data = next(iter(data.values()))

return data
8 changes: 8 additions & 0 deletions deepdish/tests/test_io.py
@@ -6,6 +6,7 @@
import deepdish as dd
import pandas as pd
from contextlib import contextmanager
import tables

try:
from types import SimpleNamespace
@@ -388,5 +389,12 @@ def test_compression_true(self):
x1 = dd.io.load(fn)
assert (x == x1).all()

def test_reconstruction_with_consumer_owned_file(self):
# The consumer opens and owns the PyTables file; here it is an in-memory
# HDF5 file (CORE driver with no on-disk backing store).
with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE", driver_core_backing_store=0) as file:
x = 100
dd.io.save_to_file(file, x)
x1 = dd.io.load_from_file(file)
assert x == x1

if __name__ == '__main__':
unittest.main()
13 changes: 13 additions & 0 deletions doc/source/io.rst
@@ -36,6 +36,19 @@ We can now reconstruct the dictionary from the file using

>>> d = dd.io.load('test.h5')

An alternative ``save_to_file`` / ``load_from_file`` API lets you control
the PyTables file object yourself, e.g.:

>>> with tables.open_file("in_memory.h5", "w", driver="H5FD_CORE",
...                       driver_core_backing_store=0) as my_file:
...     d = {'foo': np.arange(10), 'bar': np.ones((5, 4, 3))}
...     dd.io.save_to_file(my_file, d)
...     # TODO: Save the file somewhere, such as cloud storage.
...     # TODO: Load the file from the cloud.
...     d = dd.io.load_from_file(my_file)

This is useful if, for example, you want to read and write HDF5 data
somewhere other than local disk. In all other respects these functions
behave the same as ``save`` and ``load``.
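
For instance, to move such an in-memory file around without touching local
disk, one option (a sketch, assuming your PyTables/HDF5 build supports
``File.get_file_image()`` and the CORE driver's ``driver_core_image``
option) is to pass the raw HDF5 image bytes around yourself:

>>> with tables.open_file("img.h5", "w", driver="H5FD_CORE",
...                       driver_core_backing_store=0) as f:
...     dd.io.save_to_file(f, {'foo': np.arange(10)})
...     f.flush()
...     image = f.get_file_image()  # raw HDF5 bytes, e.g. for a cloud upload
>>> with tables.open_file("img.h5", "r", driver="H5FD_CORE",
...                       driver_core_image=image,
...                       driver_core_backing_store=0) as f:
...     d = dd.io.load_from_file(f)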

Dictionaries
------------
