Skip to content

Commit

Permalink
add value_counts to existing extension array
Browse files Browse the repository at this point in the history
  • Loading branch information
Zybulon committed Nov 26, 2023
1 parent d3750dc commit 2b26d21
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 15 deletions.
40 changes: 38 additions & 2 deletions h5pandas/h5array.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from h5pandas.h5datatype import HDF5Dtype
import numbers
from functools import cached_property

import warnings
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -17,6 +17,11 @@

import pandas

from pandas import (
Index,
Series,
)

from pandas.compat.numpy import function as nv

from pandas._libs import lib
Expand Down Expand Up @@ -65,6 +70,8 @@
npt,
)

from pandas.core.algorithms import _ensure_arraylike, value_counts_arraylike


class HDF5ExtensionArray(pandas.core.arraylike.OpsMixin, pandas.api.extensions.ExtensionArray):

Expand Down Expand Up @@ -547,7 +554,7 @@ def copy(self):
-------
ExtensionArray
"""
return HDF5ExtensionArray(self._ndarray)
return HDF5ExtensionArray(np.array(self._ndarray))

def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
"""
Expand Down Expand Up @@ -684,6 +691,35 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# ------------------------------------------------------------------------
# Ops

def value_counts(self, dropna: bool = True) -> Series:
"""
Return a Series containing counts of unique values.
Parameters
----------
dropna : bool, optional
Don't include counts of NaN. The default is True.
Returns
-------
Series
value_counts.
"""
values = _ensure_arraylike(self._ndarray, func_name="value_counts")
keys, counts, _ = value_counts_arraylike(values, dropna)
if keys.dtype == np.float16:
keys = keys.astype(np.float32)

# For backwards compatibility, we let Index do its normal type
# inference, _except_ for if if infers from object to bool.
idx = Index(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)

result = Series(counts, index=idx, copy=False)
return result

def argsort(self, *, ascending=True, kind="quicksort", na_position="last", **kwargs):
    """
    Return the indices that would sort this array.

    The keyword-only parameters mirror ``ExtensionArray.argsort`` so that
    callers passing ``ascending``, ``kind`` or ``na_position`` no longer
    fail with ``TypeError``.

    Parameters
    ----------
    ascending : bool, default True
        Sort ascending when True, descending otherwise.
    kind : str, default "quicksort"
        Sorting algorithm, forwarded to ``numpy.argsort``.
    na_position : {"first", "last"}, default "last"
        Where the positions of missing values go in the result.
    **kwargs
        Forwarded to ``numpy.argsort``.

    Returns
    -------
    type(self)
        The sorting indices wrapped in the same array type.

    Raises
    ------
    ValueError
        If ``na_position`` is neither "first" nor "last".
    """
    if na_position not in ("first", "last"):
        raise ValueError(f"invalid na_position: {na_position}")

    if ascending and na_position == "last":
        # Default request: same numpy call as before, so existing callers
        # get exactly the same result.
        return type(self)(np.argsort(self._ndarray, kind=kind, **kwargs))

    values = np.asarray(self._ndarray)
    missing = pandas.isna(values)
    positions = np.arange(len(values), dtype=np.intp)
    present_pos = positions[~missing]
    present_val = values[~missing]

    # For a descending sort, reverse the input, sort ascending, then reverse
    # the result; with a stable ``kind`` this keeps ties in original order.
    if not ascending:
        present_pos = present_pos[::-1]
        present_val = present_val[::-1]
    order = present_pos[np.argsort(present_val, kind=kind, **kwargs)]
    if not ascending:
        order = order[::-1]

    missing_pos = positions[missing]
    if na_position == "last":
        order = np.concatenate([order, missing_pos])
    else:
        order = np.concatenate([missing_pos, order])
    return type(self)(order)

Expand Down
8 changes: 4 additions & 4 deletions tests/test_HDFExtensionArray.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@


# class TestInterface(base.BaseInterfaceTests):
# # 1 failed, 13 passed
# # 14 passed
# pass


Expand All @@ -74,9 +74,9 @@
# pass


# class TestMethods(base.BaseMethodsTests):
# # 35 failed, 73 passed, 1 skipped, 4 errors
# pass
class TestMethods(base.BaseMethodsTests):
    """Run the ``base.BaseMethodsTests`` suite with no overrides.

    Last recorded result: 32 failed, 77 passed, 4 errors.
    """


# class TestMissing(base.BaseMissingTests):
Expand Down
11 changes: 2 additions & 9 deletions tests/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ def TestH5extensions():
assert (d[0, 1] != 2.)

t1 = time.time()
# assert on time to make sure the file is not loaded
print(d.dtype, t1-t0)
t0 = time.time()

df = dataset_to_dataframe(d)
df = dataset_to_dataframe(d, ["a", "b", "c", "d", "e"])
t1 = time.time()
print(df, t1-t0)
# assert on time to make sure the file is not loaded
assert (t1-t0 < 0.02)
d[0, 0] = 0
assert (df.loc[0, 'a'] == 0)
Expand Down Expand Up @@ -126,18 +126,11 @@ def TestH5Group():

with h5pandas.File("foobar.h5", "r", libver='latest') as f:
df = f['h5pandas']
print(df)
print(type(df))

df = f['named_random_fixed']

print(f['random_fixed'], type(f['random_fixed']))

print(f['random_table'], type(f['random_table']))

print(f['named_random_fixed'], type(f['named_random_fixed']))


if __name__ == '__main__':
# TestH5extensions()
TestH5extensions()
TestH5Group()

0 comments on commit 2b26d21

Please sign in to comment.