From 3613d76f5c8317367afd0cc9f53a88d536e99d8d Mon Sep 17 00:00:00 2001 From: ajpotts Date: Fri, 2 Feb 2024 15:22:32 -0500 Subject: [PATCH] closes #2927 power divergence statistic (#2932) * closes #2927 power divergence statistic * add scipy to requirements * add arkouda/akstats/_stats_py.pyi * Fix F403 and F401 error codes on flake8 arkouda from arkouda/akmath/__init__.py and arkouda/akstats/__init__.py * un-pin scipy from specific version * add scipy license and minor changes in response to code review * Update tests/akmath/akmath_test.py --------- Co-authored-by: Amanda Potts Co-authored-by: pierce <48131946+pierce314159@users.noreply.github.com> --- PROTO_tests/tests/akmath/akmath_tests.py | 28 +++ PROTO_tests/tests/akstats/akstats_test.py | 73 ++++++++ arkouda-env-dev.yml | 3 +- arkouda-env.yml | 5 +- arkouda/__init__.py | 2 + arkouda/akmath/__init__.py | 5 + arkouda/akmath/_math.py | 52 ++++++ arkouda/akstats/LICENSE.txt | 30 +++ arkouda/akstats/__init__.py | 3 + arkouda/akstats/_stats_py.py | 212 ++++++++++++++++++++++ arkouda/akstats/_stats_py.pyi | 6 + pydoc/requirements.txt | 3 +- pydoc/setup/REQUIREMENTS.md | 1 + pytest.ini | 2 + setup.py | 1 + tests/akmath/akmath_test.py | 30 +++ tests/akstats/akstats_test.py | 77 ++++++++ 17 files changed, 529 insertions(+), 4 deletions(-) create mode 100644 PROTO_tests/tests/akmath/akmath_tests.py create mode 100644 PROTO_tests/tests/akstats/akstats_test.py create mode 100644 arkouda/akmath/__init__.py create mode 100644 arkouda/akmath/_math.py create mode 100644 arkouda/akstats/LICENSE.txt create mode 100644 arkouda/akstats/__init__.py create mode 100644 arkouda/akstats/_stats_py.py create mode 100644 arkouda/akstats/_stats_py.pyi create mode 100644 tests/akmath/akmath_test.py create mode 100644 tests/akstats/akstats_test.py diff --git a/PROTO_tests/tests/akmath/akmath_tests.py b/PROTO_tests/tests/akmath/akmath_tests.py new file mode 100644 index 00000000000..8737ea559ad --- /dev/null +++ b/PROTO_tests/tests/akmath/akmath_tests.py @@ -0,0 +1,28 @@ +import math + +import numpy as np + +import arkouda as ak +from arkouda.akmath import xlogy +from arkouda.pdarrayclass import pdarray + + +class TestStats: + def test_xlogy(self): + from scipy.special import xlogy as scipy_xlogy + + ys = [ak.array([1, 2, 3]), ak.array([10, 100, 100]), ak.array([-1, 0, np.nan])] + xs = [3, 5, np.float64(6), ak.array([1.0, 2.0, 4.5])] + + for y in ys: + for x in xs: + ak_result = xlogy(x, y) + + np_y = y.to_ndarray() + np_x = x + if isinstance(np_x, pdarray): + np_x = np_x.to_ndarray() + + scipy_result = scipy_xlogy(np_x, np_y) + + assert np.allclose(ak_result.to_ndarray(), scipy_result, equal_nan=True) diff --git a/PROTO_tests/tests/akstats/akstats_test.py b/PROTO_tests/tests/akstats/akstats_test.py new file mode 100644 index 00000000000..e9f1476b7a3 --- /dev/null +++ b/PROTO_tests/tests/akstats/akstats_test.py @@ -0,0 +1,73 @@ +import math + +import numpy as np +from scipy.stats import power_divergence as scipy_power_divergence + +import arkouda as ak +from arkouda.akstats import power_divergence as ak_power_divergence + + +class TestStats: + @staticmethod + def create_stat_test_pairs(): + pairs = [ + ( + ak.array([10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000]), + ak.array([10000000, 20000000, 30000000, 40000001, 50000000, 60000000, 70000000]), + ), + (ak.array([10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000]), None), + (ak.array([44, 24, 29, 3]) / 100 * 189, ak.array([43, 52, 54, 40])), + ] + return pairs + + def test_power_divergence(self): + pairs = self.create_stat_test_pairs() + + lambdas = [ + "pearson", + "log-likelihood", + "freeman-tukey", + "mod-log-likelihood", + "neyman", + "cressie-read", + ] + + ddofs = [0, 1, 2, 3, 4, 5] + + for f_obs, f_exp in pairs: + for lambda0 in lambdas: + for ddof in ddofs: + ak_power_div = ak_power_divergence(f_obs, f_exp, ddof=ddof, lambda_=lambda0) + + np_f_obs = f_obs.to_ndarray() + np_f_exp = None + if f_exp is not None: + np_f_exp = f_exp.to_ndarray() + + scipy_power_div = scipy_power_divergence( + np_f_obs, np_f_exp, ddof=ddof, axis=0, lambda_=lambda0 + ) + + assert np.allclose(ak_power_div, scipy_power_div, equal_nan=True) + + def test_chisquare(self): + from scipy.stats import chisquare as scipy_chisquare + + from arkouda.akstats import chisquare as ak_chisquare + + pairs = self.create_stat_test_pairs() + + ddofs = [0, 1, 2, 3, 4, 5] + + for f_obs, f_exp in pairs: + for ddof in ddofs: + ak_chisq = ak_chisquare(f_obs, f_exp, ddof=ddof) + + np_f_obs = f_obs.to_ndarray() + np_f_exp = None + if f_exp is not None: + np_f_exp = f_exp.to_ndarray() + + scipy_chisq = scipy_chisquare(np_f_obs, np_f_exp, ddof=ddof, axis=0) + + assert np.allclose(ak_chisq, scipy_chisq, equal_nan=True) diff --git a/arkouda-env-dev.yml b/arkouda-env-dev.yml index 7977e080753..8d0b335eaa0 100644 --- a/arkouda-env-dev.yml +++ b/arkouda-env-dev.yml @@ -20,6 +20,7 @@ dependencies: - libiconv - libidn2 - jupyter + - scipy # Developer dependencies - pexpect @@ -42,4 +43,4 @@ dependencies: - furo # sphinx theme - myst-parser - linkify-it-py - \ No newline at end of file + diff --git a/arkouda-env.yml b/arkouda-env.yml index 7277a3fa0ba..a9b2aac5414 100644 --- a/arkouda-env.yml +++ b/arkouda-env.yml @@ -20,6 +20,7 @@ dependencies: - libiconv - libidn2 - jupyter - + - scipy + - pip: - - typeguard==2.10.0 \ No newline at end of file + - typeguard==2.10.0 diff --git a/arkouda/__init__.py b/arkouda/__init__.py index fe62c030ddc..42ca2b3f16d 100644 --- a/arkouda/__init__.py +++ b/arkouda/__init__.py @@ -39,3 +39,5 @@ is_registered, broadcast_dims, ) +from arkouda.akmath import * +from arkouda.akstats import * diff --git a/arkouda/akmath/__init__.py b/arkouda/akmath/__init__.py new file mode 100644 index 00000000000..37b27767098 --- /dev/null +++ b/arkouda/akmath/__init__.py @@ -0,0 +1,5 @@ +from ._math import xlogy + +__all__ = [ + "xlogy", +] diff --git a/arkouda/akmath/_math.py b/arkouda/akmath/_math.py new file mode 100644 index 00000000000..63f121762ce --- /dev/null +++ b/arkouda/akmath/_math.py @@ -0,0 +1,52 @@ +from typing import Union +from warnings import warn + +import numpy as np + +from arkouda.numeric import log +from arkouda.pdarrayclass import pdarray + + +def xlogy(x: Union[pdarray, np.float64], y: pdarray): + """ + Computes x * log(y). + + Parameters + ---------- + x : pdarray or np.float64 + x must have a datatype that is castable to float64 + y : pdarray + + Returns + ------- + arkouda.pdarrayclass.pdarray + + Examples + -------- + + >>> import arkouda as ak + >>> ak.connect() + >>> from arkouda.akmath import xlogy + >>> xlogy( ak.array([1, 2, 3, 4]), ak.array([5,6,7,8])) + array([1.6094379124341003 3.5835189384561099 5.8377304471659395 8.317766166719343]) + >>> xlogy( 5.0, ak.array([1, 2, 3, 4])) + array([0.00000000000000000 3.4657359027997265 5.4930614433405491 6.9314718055994531]) + + + """ + if not isinstance(x, (np.float64, pdarray)) and np.can_cast(x, np.float64): + x = np.float64(x) + + if isinstance(x, pdarray) and isinstance(y, pdarray): + if x.size == y.size: + return x * log(y) + else: + msg = "x and y must have the same size." + warn(msg, UserWarning) + return None + elif isinstance(x, np.float64) and isinstance(y, pdarray): + return x * log(y) + else: + msg = "x and y must both be pdarrays or x must be castable to float64 and y must be a pdarray." + warn(msg, UserWarning) + return None diff --git a/arkouda/akstats/LICENSE.txt b/arkouda/akstats/LICENSE.txt new file mode 100644 index 00000000000..117117616e8 --- /dev/null +++ b/arkouda/akstats/LICENSE.txt @@ -0,0 +1,30 @@ +Copyright (c) 2001-2002 Enthought, Inc. 2003-2024, SciPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/arkouda/akstats/__init__.py b/arkouda/akstats/__init__.py new file mode 100644 index 00000000000..d34f5e0f301 --- /dev/null +++ b/arkouda/akstats/__init__.py @@ -0,0 +1,3 @@ +from ._stats_py import Power_divergenceResult, chisquare, power_divergence + +__all__ = ["power_divergence", "chisquare", "Power_divergenceResult"] diff --git a/arkouda/akstats/_stats_py.py b/arkouda/akstats/_stats_py.py new file mode 100644 index 00000000000..1077d5b3ead --- /dev/null +++ b/arkouda/akstats/_stats_py.py @@ -0,0 +1,212 @@ +from collections import namedtuple + +import numpy as np +from numpy import asarray +from scipy.stats import chi2 + +import arkouda as ak +from arkouda.akmath import xlogy +from arkouda.dtypes import float64 as akfloat64 + +__all__ = ["power_divergence", "chisquare", "Power_divergenceResult"] + + +class Power_divergenceResult(namedtuple("Power_divergenceResult", ("statistic", "pvalue"))): + """ + The results of a power divergence statistical test. + + Attributes + ---------- + statistic : numpy.float64 + pvalue : numpy.float64 + """ + + +# Map from names to lambda_ values used in power_divergence(). +_power_div_lambda_names = { + "pearson": 1, + "log-likelihood": 0, + "freeman-tukey": -0.5, + "mod-log-likelihood": -1, + "neyman": -2, + "cressie-read": 2 / 3, +} + + +def power_divergence(f_obs, f_exp=None, ddof=0, lambda_=None): + """ + Computes the power divergence statistic and p-value. + + Parameters + ---------- + f_obs : pdarray + The observed frequency. + f_exp : pdarray, default = None + The expected frequency. + ddof : int + The delta degrees of freedom. + lambda_ : string, default = "pearson" + The power in the Cressie-Read power divergence statistic. + Allowed values: "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood", + "neyman", "cressie-read" + + Powers correspond as follows: + + "pearson": 1 + + "log-likelihood": 0 + + "freeman-tukey": -0.5 + + "mod-log-likelihood": -1 + + "neyman": -2 + + "cressie-read": 2 / 3 + + + Returns + ------- + arkouda.akstats.Power_divergenceResult + + Examples + -------- + + >>> import arkouda as ak + >>> ak.connect() + >>> from arkouda.akstats import power_divergence + >>> x = ak.array([10, 20, 30, 10]) + >>> y = ak.array([10, 30, 20, 10]) + >>> power_divergence(x, y, lambda_="pearson") + Power_divergenceResult(statistic=8.333333333333334, pvalue=0.03960235520756414) + >>> power_divergence(x, y, lambda_="log-likelihood") + Power_divergenceResult(statistic=8.109302162163285, pvalue=0.04380595350226197) + + See Also + ------- + scipy.stats.power_divergence + arkouda.akstats.chisquare + + Notes + ----- + + This is a modified version of scipy.stats.power_divergence [2] + in order to scale using arkouda pdarrays. + + References + ---------- + + [1] "scipy.stats.power_divergence", + https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.power_divergence.html + + [2] Scipy contributors (2024) scipy (Version v1.12.0) [Source code]. + https://github.com/scipy/scipy + + """ + if isinstance(lambda_, str): + if lambda_ not in _power_div_lambda_names: + names = repr(list(_power_div_lambda_names.keys()))[1:-1] + raise ValueError(f"invalid string for lambda_: {lambda_!r}. " f"Valid strings are {names}") + lambda_ = _power_div_lambda_names[lambda_] + elif lambda_ is None: + lambda_ = 1 + + f_obs_float = f_obs.astype(akfloat64) + + if f_exp is not None: + rtol = 1e-8 + with np.errstate(invalid="ignore"): + f_obs_sum = f_obs_float.sum() + f_exp_sum = f_exp.sum() + relative_diff = np.abs(f_obs_sum - f_exp_sum) / np.minimum(f_obs_sum, f_exp_sum) + diff_gt_tol = (relative_diff > rtol).any() + if diff_gt_tol: + msg = ( + f"For each axis slice, the sum of the observed " + f"frequencies must agree with the sum of the " + f"expected frequencies to a relative tolerance " + f"of {rtol}, but the percent differences are:\n" + f"{relative_diff}" + ) + raise ValueError(msg) + + else: + # Ignore 'invalid' errors so the edge case of a data set with length 0 + # is handled without spurious warnings. + with np.errstate(invalid="ignore"): + f_exp = f_obs.mean() + + # `terms` is the array of terms that are summed along `axis` to create + # the test statistic. We use some specialized code for a few special + # cases of lambda_. + if lambda_ == 1: + # Pearson's chi-squared statistic + terms = (f_obs_float - f_exp) ** 2 / f_exp + elif lambda_ == 0: + # Log-likelihood ratio (i.e. G-test) + + if f_exp is not None: + terms = 2.0 * xlogy(f_obs, f_obs / f_exp) + else: + terms = ak.zeros_like(f_obs) + elif lambda_ == -1: + # Modified log-likelihood ratio + if (f_obs is not None) and (f_exp is not None): + terms = 2.0 * xlogy(f_exp, f_exp / f_obs) + else: + terms = ak.array([]) + + else: + # General Cressie-Read power divergence. + terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1) + terms /= 0.5 * lambda_ * (lambda_ + 1) + + stat = terms.sum() + num_obs = terms.size + ddof = asarray(ddof) + p = chi2.sf(stat, num_obs - 1 - ddof) + + return Power_divergenceResult(stat, p) + + +def chisquare(f_obs, f_exp=None, ddof=0): + """ + Computes the chi square statistic and p-value. + + Parameters + ---------- + f_obs : pdarray + The observed frequency. + f_exp : pdarray, default = None + The expected frequency. + ddof : int + The delta degrees of freedom. + + Returns + ------- + arkouda.akstats.Power_divergenceResult + + Examples + -------- + + >>> import arkouda as ak + >>> ak.connect() + >>> from arkouda.akstats import chisquare + >>> chisquare(ak.array([10, 20, 30, 10]), ak.array([10, 30, 20, 10])) + Power_divergenceResult(statistic=8.333333333333334, pvalue=0.03960235520756414) + + + See Also + ------- + scipy.stats.chisquare + arkouda.akstats.power_divergence + + References + ---------- + [1] “Chi-squared test”, https://en.wikipedia.org/wiki/Chi-squared_test + + [2] "scipy.stats.chisquare", + https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html + + """ + return power_divergence(f_obs, f_exp=f_exp, ddof=ddof, lambda_="pearson") diff --git a/arkouda/akstats/_stats_py.pyi b/arkouda/akstats/_stats_py.pyi new file mode 100644 index 00000000000..2dc39af9aed --- /dev/null +++ b/arkouda/akstats/_stats_py.pyi @@ -0,0 +1,6 @@ +from _typeshed import Incomplete + +class Power_divergenceResult: ... + +def power_divergence(f_obs, f_exp: Incomplete | None = ..., ddof: int = ..., lambda_: Incomplete | None = ...): ... +def chisquare(f_obs, f_exp: Incomplete | None = ..., ddof: int = ...): ... diff --git a/pydoc/requirements.txt b/pydoc/requirements.txt index babece521f9..64819aee64a 100644 --- a/pydoc/requirements.txt +++ b/pydoc/requirements.txt @@ -16,6 +16,7 @@ tables>=3.7.0 pyarrow libiconv libidn2 +scipy # Developer dependencies pexpect @@ -30,4 +31,4 @@ linkify-it-py typed-ast mypy>=0.931,<0.990 flake8 -pytest-benchmark \ No newline at end of file +pytest-benchmark diff --git a/pydoc/setup/REQUIREMENTS.md b/pydoc/setup/REQUIREMENTS.md index 50a95a55d91..6afefd3c3d8 100644 --- a/pydoc/setup/REQUIREMENTS.md +++ b/pydoc/setup/REQUIREMENTS.md @@ -32,6 +32,7 @@ The following python packages are required by the Arkouda client package. - `types-tabulate` - `tables>=3.7.0` - `pyarrow` +- `scipy` ### Developer Specific diff --git a/pytest.ini b/pytest.ini index f502e7d5ad4..241d0d8c0ec 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,6 +2,8 @@ filterwarnings = ignore:Version mismatch between client .* testpaths = + tests/akmath/akmath_test.py + tests/akstats/akstats_test.py tests/alignment_tests.py tests/array_view_test.py tests/bitops_test.py diff --git a/setup.py b/setup.py index fe042307fc2..683808ae3ba 100644 --- a/setup.py +++ b/setup.py @@ -144,6 +144,7 @@ 'types-tabulate', 'tables>=3.7.0', 'pyarrow', + 'scipy', ], # List additional groups of dependencies here (e.g. development diff --git a/tests/akmath/akmath_test.py b/tests/akmath/akmath_test.py new file mode 100644 index 00000000000..89c60f925d7 --- /dev/null +++ b/tests/akmath/akmath_test.py @@ -0,0 +1,30 @@ +import numpy as np +from base_test import ArkoudaTest +from context import arkouda as ak + +from arkouda.akmath import xlogy +from arkouda.pdarrayclass import pdarray + + +class StatsTest(ArkoudaTest): + def setUp(self): + ArkoudaTest.setUp(self) + + def test_xlogy(self): + from scipy.special import xlogy as scipy_xlogy + + ys = [ak.array([1, 2, 3]), ak.array([10, 100, 100]), ak.array([-1, 0, np.nan])] + xs = [3, 5, np.float64(6), ak.array([1.0, 2.0, 4.5])] + + for y in ys: + for x in xs: + ak_result = xlogy(x, y) + + np_y = y.to_ndarray() + np_x = x + if isinstance(np_x, pdarray): + np_x = np_x.to_ndarray() + + scipy_result = scipy_xlogy(np_x, np_y) + + assert np.allclose(ak_result.to_ndarray(), scipy_result, equal_nan=True) diff --git a/tests/akstats/akstats_test.py b/tests/akstats/akstats_test.py new file mode 100644 index 00000000000..93982c4a507 --- /dev/null +++ b/tests/akstats/akstats_test.py @@ -0,0 +1,77 @@ +import math + +import numpy as np +from base_test import ArkoudaTest +from context import arkouda as ak + + +class StatsTest(ArkoudaTest): + def setUp(self): + ArkoudaTest.setUp(self) + + def create_stat_test_pairs(self): + pairs = [ + ( + ak.array([10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000]), + ak.array([10000000, 20000000, 30000000, 40000001, 50000000, 60000000, 70000000]), + ), + (ak.array([10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000]), None), + (ak.array([44, 24, 29, 3]) / 100 * 189, ak.array([43, 52, 54, 40])), + ] + return pairs + + def test_power_divergence(self): + from scipy.stats import power_divergence as scipy_power_divergence + + from arkouda.akstats import power_divergence as ak_power_divergence + + pairs = self.create_stat_test_pairs() + + lambdas = [ + "pearson", + "log-likelihood", + "freeman-tukey", + "mod-log-likelihood", + "neyman", + "cressie-read", + ] + + ddofs = [0, 1, 2, 3, 4, 5] + + for f_obs, f_exp in pairs: + for lambda0 in lambdas: + for ddof in ddofs: + ak_power_div = ak_power_divergence(f_obs, f_exp, ddof=ddof, lambda_=lambda0) + + np_f_obs = f_obs.to_ndarray() + np_f_exp = None + if f_exp is not None: + np_f_exp = f_exp.to_ndarray() + + scipy_power_div = scipy_power_divergence( + np_f_obs, np_f_exp, ddof=ddof, axis=0, lambda_=lambda0 + ) + + assert np.allclose(ak_power_div, scipy_power_div, equal_nan=True) + + def test_chisquare(self): + from scipy.stats import chisquare as scipy_chisquare + + from arkouda.akstats import chisquare as ak_chisquare + + pairs = self.create_stat_test_pairs() + + ddofs = [0, 1, 2, 3, 4, 5] + + for f_obs, f_exp in pairs: + for ddof in ddofs: + ak_chisq = ak_chisquare(f_obs, f_exp, ddof=ddof) + + np_f_obs = f_obs.to_ndarray() + np_f_exp = None + if f_exp is not None: + np_f_exp = f_exp.to_ndarray() + + scipy_chisq = scipy_chisquare(np_f_obs, np_f_exp, ddof=ddof, axis=0) + + assert np.allclose(ak_chisq, scipy_chisq, equal_nan=True)