Skip to content

Commit

Permalink
ENH: Function to walk the group hierarchy of a PyTables HDF5 file.
Browse files Browse the repository at this point in the history
  • Loading branch information
Stephen Pascoe authored and victor committed Sep 30, 2018
1 parent 22f17c6 commit ea89912
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ HDFStore: PyTables (HDF5)
HDFStore.select
HDFStore.info
HDFStore.keys
HDFStore.walk

Feather
~~~~~~~
Expand Down
19 changes: 19 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3554,6 +3554,25 @@ everything in the sub-store and **below**, so be *careful*.
store.remove('food')
store
You can walk through the group hierarchy using the ``walk`` method, which
yields a ``(path, subgroups, subkeys)`` tuple for each group: the group's path,
the names of its subgroups, and the relative keys of the pandas objects it contains.

.. versionadded:: 0.24.0


.. ipython:: python

   for (path, subgroups, subkeys) in store.walk():
       for subgroup in subgroups:
           print('GROUP: {}/{}'.format(path, subgroup))
       for subkey in subkeys:
           key = '/'.join([path, subkey])
           print('KEY: {}'.format(key))
           print(store.get(key))

.. warning::

    Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node.
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ Other Enhancements
reflect changes from the `Pandas-GBQ library version 0.5.0
<https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
(:issue:`21627`)

- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
-

.. _whatsnew_0240.api_breaking:

Expand Down
47 changes: 47 additions & 0 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,53 @@ def groups(self):
g._v_name != u('table'))))
]

def walk(self, where="/"):
    """ Walk the pytables group hierarchy for pandas objects

    Generator that visits every group at or below `where` and reports,
    for each one, its path together with the names of its direct
    subgroups and of the pandas objects stored directly inside it.
    Children that are neither a pandas object nor a group are left out.

    Traversal order is the one produced by PyTables' ``walk_groups``:
    the `where` group is listed first (preorder), then each of its
    child groups (following an alphanumerical order) is traversed in
    the same way.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    where : str, optional
        Group where to start walking.
        If not supplied, the root group is used.

    Yields
    ------
    path : str
        Full path to a group (without trailing '/')
    groups : list of str
        names of the groups contained in `path`
    leaves : list of str
        names of the pandas objects contained in `path`
    """
    _tables()
    self._check_if_open()

    for node in self._handle.walk_groups(where):
        # A group tagged with ``pandas_type`` is itself a stored pandas
        # object; it is reported as a leaf of its parent, not walked.
        if getattr(node._v_attrs, 'pandas_type', None) is not None:
            continue

        subgroup_names = []
        pandas_names = []
        for child in node._v_children.values():
            if getattr(child._v_attrs, 'pandas_type', None) is not None:
                pandas_names.append(child._v_name)
            elif isinstance(child, _table_mod.group.Group):
                subgroup_names.append(child._v_name)
            # anything else is a non-pandas, non-group node: ignored

        # rstrip turns the root path '/' into '' so that callers can
        # build keys with '/'.join([path, name])
        yield (node._v_pathname.rstrip('/'), subgroup_names, pandas_names)

def get_node(self, key):
""" return the node with the key or None if it does not exist """
self._check_if_open()
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,57 @@ def test_get(self):

pytest.raises(KeyError, store.get, 'b')

@pytest.mark.parametrize('where, expected', [
    ('/', {
        '': ({'first_group', 'second_group'}, set()),
        '/first_group': (set(), {'df1', 'df2'}),
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    }),
    ('/second_group', {
        '/second_group': ({'third_group'}, {'df3', 's1'}),
        '/second_group/third_group': (set(), {'df4'}),
    })
])
def test_walk(self, where, expected):
    # GH10143
    # `expected` maps each group path that walk() should yield to a pair
    # (set of subgroup names, set of pandas object names).
    objs = {
        'df1': pd.DataFrame([1, 2, 3]),
        'df2': pd.DataFrame([4, 5, 6]),
        'df3': pd.DataFrame([6, 7, 8]),
        'df4': pd.DataFrame([9, 10, 11]),
        's1': pd.Series([10, 9, 8]),
        # Next 3 items aren't pandas objects and should be ignored
        'a1': np.array([[1, 2, 3], [4, 5, 6]]),
        'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'),
        'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i')
    }

    # (store key, name in `objs`) for every pandas object, in write order
    pandas_layout = [
        ('/first_group/df1', 'df1'),
        ('/first_group/df2', 'df2'),
        ('/second_group/df3', 'df3'),
        ('/second_group/s1', 's1'),
        ('/second_group/third_group/df4', 'df4'),
    ]

    with ensure_clean_store('walk_groups.hdf', mode='w') as store:
        for store_key, name in pandas_layout:
            store.put(store_key, objs[name])

        # Create non-pandas objects
        handle = store._handle
        handle.create_array('/first_group', 'a1', objs['a1'])
        handle.create_table('/first_group', 'tb1', obj=objs['tb1'])
        handle.create_table('/second_group', 'tb2', obj=objs['tb2'])

        # Exactly the expected groups are visited, no more and no fewer
        n_visited = sum(1 for _ in store.walk(where=where))
        assert n_visited == len(expected)

        for group_path, subgroups, pandas_names in store.walk(where=where):
            assert group_path in expected
            expected_groups, expected_frames = expected[group_path]
            assert expected_groups == set(subgroups)
            assert expected_frames == set(pandas_names)

            # Every reported leaf must round-trip through store.get
            for name in pandas_names:
                stored = store.get('/'.join([group_path, name]))
                check_equal = (tm.assert_frame_equal if 'df' in name
                               else tm.assert_series_equal)
                check_equal(stored, objs[name])

def test_getattr(self):

with ensure_clean_store(self.path) as store:
Expand Down

0 comments on commit ea89912

Please sign in to comment.