From 2b26d214f5591ad1edea237beea52be4bb6ff8c4 Mon Sep 17 00:00:00 2001 From: FMasson <97910143+Zybulon@users.noreply.github.com> Date: Sun, 26 Nov 2023 15:41:51 +0100 Subject: [PATCH] add value_counts to existing extension array --- h5pandas/h5array.py | 40 +++++++++++++++++++++++++++++++-- tests/test_HDFExtensionArray.py | 8 +++---- tests/test_extension.py | 11 ++------- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/h5pandas/h5array.py b/h5pandas/h5array.py index a8862c9..f67e874 100644 --- a/h5pandas/h5array.py +++ b/h5pandas/h5array.py @@ -5,7 +5,7 @@ from h5pandas.h5datatype import HDF5Dtype import numbers from functools import cached_property - +import warnings from typing import ( TYPE_CHECKING, Any, @@ -17,6 +17,11 @@ import pandas +from pandas import ( + Index, + Series, +) + from pandas.compat.numpy import function as nv from pandas._libs import lib @@ -65,6 +70,8 @@ npt, ) +from pandas.core.algorithms import _ensure_arraylike, value_counts_arraylike + class HDF5ExtensionArray(pandas.core.arraylike.OpsMixin, pandas.api.extensions.ExtensionArray): @@ -547,7 +554,7 @@ def copy(self): ------- ExtensionArray """ - return HDF5ExtensionArray(self._ndarray) + return HDF5ExtensionArray(np.array(self._ndarray)) def _accumulate(self, name: str, *, skipna: bool = True, **kwargs): """ @@ -684,6 +691,35 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # ------------------------------------------------------------------------ # Ops + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of unique values. + + Parameters + ---------- + dropna : bool, optional + Don't include counts of NaN. The default is True. + + Returns + ------- + Series + value_counts. 
+ + """ + values = _ensure_arraylike(self._ndarray, func_name="value_counts") + keys, counts, _ = value_counts_arraylike(values, dropna) + if keys.dtype == np.float16: + keys = keys.astype(np.float32) + + # For backwards compatibility, we let Index do its normal type + # inference, _except_ for if it infers from object to bool. + idx = Index(keys) + if idx.dtype == bool and keys.dtype == object: + idx = idx.astype(object) + + result = Series(counts, index=idx, copy=False) + return result + def argsort(self): return type(self)(np.argsort(self._ndarray)) diff --git a/tests/test_HDFExtensionArray.py b/tests/test_HDFExtensionArray.py index c42257d..2a0c424 100644 --- a/tests/test_HDFExtensionArray.py +++ b/tests/test_HDFExtensionArray.py @@ -65,7 +65,7 @@ # class TestInterface(base.BaseInterfaceTests): -# # 1 failed, 13 passed +# # 14 passed # pass @@ -74,9 +74,9 @@ # pass -# class TestMethods(base.BaseMethodsTests): -# # 35 failed, 73 passed, 1 skipped, 4 errors -# pass +class TestMethods(base.BaseMethodsTests): + # 32 failed, 77 passed, 4 errors + pass # class TestMissing(base.BaseMissingTests): diff --git a/tests/test_extension.py b/tests/test_extension.py index ad83504..fae0bbd 100644 --- a/tests/test_extension.py +++ b/tests/test_extension.py @@ -36,7 +36,6 @@ def TestH5extensions(): assert (d[0, 1] != 2.) 
t1 = time.time() - # assert on time to make sure the file is not loaded print(d.dtype, t1-t0) t0 = time.time() @@ -44,6 +43,7 @@ def TestH5extensions(): df = dataset_to_dataframe(d, ["a", "b", "c", "d", "e"]) t1 = time.time() print(df, t1-t0) + # assert on time to make sure the file is not loaded assert (t1-t0 < 0.02) d[0, 0] = 0 assert (df.loc[0, 'a'] == 0) @@ -126,18 +126,11 @@ def TestH5Group(): with h5pandas.File("foobar.h5", "r", libver='latest') as f: df = f['h5pandas'] - print(df) print(type(df)) df = f['named_random_fixed'] - print(f['random_fixed'], type(f['random_fixed'])) - - print(f['random_table'], type(f['random_table'])) - - print(f['named_random_fixed'], type(f['named_random_fixed'])) - if __name__ == '__main__': - # TestH5extensions() + TestH5extensions() TestH5Group()