add value_counts to existing extension array

Zybulon · Nov 26, 2023 · 2b26d21 · 2b26d21
1 parent d3750dc
commit 2b26d21
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 15 deletions.
diff --git a/h5pandas/h5array.py b/h5pandas/h5array.py
@@ -5,7 +5,7 @@
 from h5pandas.h5datatype import HDF5Dtype
 import numbers
 from functools import cached_property
-
+import warnings
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -17,6 +17,11 @@
 
 import pandas
 
+from pandas import (
+    Index,
+    Series,
+)
+
 from pandas.compat.numpy import function as nv
 
 from pandas._libs import lib
@@ -65,6 +70,8 @@
     npt,
 )
 
+from pandas.core.algorithms import _ensure_arraylike, value_counts_arraylike
+
 
 class HDF5ExtensionArray(pandas.core.arraylike.OpsMixin, pandas.api.extensions.ExtensionArray):
 
@@ -547,7 +554,7 @@ def copy(self):
         -------
         ExtensionArray
         """
-        return HDF5ExtensionArray(self._ndarray)
+        return HDF5ExtensionArray(np.array(self._ndarray))
 
     def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
         """
@@ -684,6 +691,35 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
     # ------------------------------------------------------------------------
     # Ops
 
+    def value_counts(self, dropna: bool = True) -> Series:
+        """
+        Return a Series containing counts of unique values.
+
+        Parameters
+        ----------
+        dropna : bool, optional
+            Don't include counts of NaN. The default is True.
+
+        Returns
+        -------
+        Series
+            Count of each unique value, indexed by the unique values.
+
+        """
+        values = _ensure_arraylike(self._ndarray, func_name="value_counts")
+        keys, counts, _ = value_counts_arraylike(values, dropna)
+        if keys.dtype == np.float16:
+            keys = keys.astype(np.float32)
+
+        # For backwards compatibility, we let Index do its normal type
+        #  inference, _except_ if it infers from object to bool.
+        idx = Index(keys)
+        if idx.dtype == bool and keys.dtype == object:
+            idx = idx.astype(object)
+
+        result = Series(counts, index=idx, copy=False)
+        return result
+
     def argsort(self):
         return type(self)(np.argsort(self._ndarray))
 

diff --git a/tests/test_HDFExtensionArray.py b/tests/test_HDFExtensionArray.py
@@ -65,7 +65,7 @@
 
 
 # class TestInterface(base.BaseInterfaceTests):
-#     #   1 failed, 13 passed
+#     #  14 passed
 #     pass
 
 
@@ -74,9 +74,9 @@
 #     pass
 
 
-# class TestMethods(base.BaseMethodsTests):
-#     # 35 failed, 73 passed, 1 skipped, 4 errors
-#     pass
+class TestMethods(base.BaseMethodsTests):
+    # 32 failed, 77 passed, 4 errors
+    pass
 
 
 # class TestMissing(base.BaseMissingTests):

diff --git a/tests/test_extension.py b/tests/test_extension.py
@@ -36,14 +36,14 @@ def TestH5extensions():
         assert (d[0, 1] != 2.)
 
         t1 = time.time()
-        # assert on time to make sure the file is not loaded
         print(d.dtype, t1-t0)
         t0 = time.time()
 
         df = dataset_to_dataframe(d)
         df = dataset_to_dataframe(d, ["a", "b", "c", "d", "e"])
         t1 = time.time()
         print(df, t1-t0)
+        # assert on time to make sure the file is not loaded
         assert (t1-t0 < 0.02)
         d[0, 0] = 0
         assert (df.loc[0, 'a'] == 0)
@@ -126,18 +126,11 @@ def TestH5Group():
 
     with h5pandas.File("foobar.h5", "r", libver='latest') as f:
         df = f['h5pandas']
-        print(df)
         print(type(df))
 
         df = f['named_random_fixed']
 
-        print(f['random_fixed'], type(f['random_fixed']))
-
-        print(f['random_table'], type(f['random_table']))
-
-        print(f['named_random_fixed'], type(f['named_random_fixed']))
-
 
 if __name__ == '__main__':
-    # TestH5extensions()
+    TestH5extensions()
     TestH5Group()