after conflict

narwhals-dev · Nov 10, 2024 · 6368b04 · 6368b04
2 parents 3cbfe53 + b8d160d
commit 6368b04
Show file tree

Hide file tree

Showing 58 changed files with 1,212 additions and 213 deletions.
diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml
@@ -184,6 +184,8 @@ jobs:
         run: |
             uv pip uninstall narwhals --system
             uv pip install -e . --system
+            # temporarily pin websockets to get CI green
+            uv pip install "websockets<14.0" --system
       - name: show-deps
         run: uv pip freeze
       - name: Run `make narwhals-test-integration`

diff --git a/README.md b/README.md
@@ -49,6 +49,7 @@ Join the party!
 - [rio](https://github.com/rio-labs/rio)
 - [scikit-lego](https://github.com/koaning/scikit-lego)
 - [scikit-playtime](https://github.com/koaning/scikit-playtime)
+- [tabmat](https://github.com/Quantco/tabmat)
 - [timebasedcv](https://github.com/FBruzzesi/timebasedcv)
 - [tubular](https://github.com/lvgig/tubular)
 - [wimsey](https://github.com/benrutter/wimsey)

diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md
@@ -30,13 +30,15 @@
         - len
         - max
         - mean
+        - median
         - min
         - mode
         - null_count
         - n_unique
         - over
         - pipe
         - quantile
+        - replace_strict
         - round
         - sample
         - shift

diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md
@@ -30,6 +30,7 @@ Here are the top-level functions available in Narwhals.
         - maybe_set_index
         - mean
         - mean_horizontal
+        - median
         - min
         - min_horizontal
         - narwhalify

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -37,6 +37,7 @@
         - len
         - max
         - mean
+        - median
         - min
         - mode
         - name
@@ -45,6 +46,7 @@
         - pipe
         - quantile
         - rename
+        - replace_strict
         - round
         - sample
         - scatter

diff --git a/docs/installation.md b/docs/installation.md
@@ -29,7 +29,7 @@ To verify the installation, start the Python REPL and execute:
 ```python
 >>> import narwhals
 >>> narwhals.__version__
-'1.13.2'
+'1.13.3'
 ```
 If you see the version number, then the installation was successful!
 

diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -40,6 +40,7 @@
 from narwhals.expr import max_horizontal
 from narwhals.expr import mean
 from narwhals.expr import mean_horizontal
+from narwhals.expr import median
 from narwhals.expr import min
 from narwhals.expr import min_horizontal
 from narwhals.expr import nth
@@ -67,7 +68,7 @@
 from narwhals.utils import maybe_reset_index
 from narwhals.utils import maybe_set_index
 
-__version__ = "1.13.2"
+__version__ = "1.13.3"
 
 __all__ = [
     "dependencies",
@@ -99,6 +100,7 @@
     "max_horizontal",
     "mean",
     "mean_horizontal",
+    "median",
     "min",
     "min_horizontal",
     "nth",

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -4,6 +4,7 @@
 from typing import Any
 from typing import Callable
 from typing import Literal
+from typing import Sequence
 
 from narwhals._expression_parsing import reuse_series_implementation
 from narwhals._expression_parsing import reuse_series_namespace_implementation
@@ -204,6 +205,9 @@ def filter(self, *predicates: IntoArrowExpr) -> Self:
     def mean(self) -> Self:
         return reuse_series_implementation(self, "mean", returns_scalar=True)
 
+    def median(self) -> Self:
+        """Take the median by reusing the series-level `median` as a scalar result."""
+        return reuse_series_implementation(
+            self,
+            "median",
+            returns_scalar=True,
+        )
+
     def count(self) -> Self:
         return reuse_series_implementation(self, "count", returns_scalar=True)
 
@@ -320,6 +324,13 @@ def is_last_distinct(self: Self) -> Self:
     def unique(self: Self, *, maintain_order: bool = False) -> Self:
         return reuse_series_implementation(self, "unique", maintain_order=maintain_order)
 
+    def replace_strict(
+        self: Self,
+        old: Sequence[Any],
+        new: Sequence[Any],
+        *,
+        return_dtype: DType | None,
+    ) -> Self:
+        """Forward `replace_strict` to the series-level implementation.
+
+        `old` and `new` are passed through positionally and `return_dtype` as a
+        keyword, exactly as received.
+        """
+        return reuse_series_implementation(
+            self,
+            "replace_strict",
+            old,
+            new,
+            return_dtype=return_dtype,
+        )
+
     def sort(self: Self, *, descending: bool = False, nulls_last: bool = False) -> Self:
         return reuse_series_implementation(
             self, "sort", descending=descending, nulls_last=nulls_last

diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py
@@ -8,6 +8,7 @@
 
 from narwhals._expression_parsing import is_simple_aggregation
 from narwhals._expression_parsing import parse_into_exprs
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import remove_prefix
 
 if TYPE_CHECKING:
@@ -17,6 +18,7 @@
 
 POLARS_TO_ARROW_AGGREGATIONS = {
     "len": "count",
+    "median": "approximate_median",
     "n_unique": "count_distinct",
     "std": "stddev",
     "var": "variance",  # currently unused, we don't have `var` yet
@@ -79,16 +81,36 @@ def agg(
         )
 
     def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
-        key_values = self._df.select(*self._keys).unique(subset=self._keys, keep="first")
-        nw_namespace = self._df.__narwhals_namespace__()
+        # Yield one `(key_row, sub_frame)` pair per distinct value of a per-row
+        # token built from the group keys.
+        import pyarrow as pa  # ignore-banned-import
+        import pyarrow.compute as pc  # ignore-banned-import
+
+        # Temporary column name, generated against the frame's existing columns.
+        col_token = generate_temporary_column_name(n_bytes=8, columns=self._df.columns)
+        null_token = "__null_token_value__"  # noqa: S105
+
+        table = self._df._native_frame
+        # Per-row token: every key column is cast to string and the pieces are
+        # joined element-wise; nulls are substituted with `null_token`.
+        # NOTE(review): the separator is the empty string, so distinct key tuples
+        # whose string forms concatenate to the same text (e.g. ("a", "bc") and
+        # ("ab", "c")) would presumably share a token and be yielded as a single
+        # group -- confirm. Likewise a real key equal to `null_token` would be
+        # indistinguishable from a null key.
+        key_values = pc.binary_join_element_wise(
+            *[pc.cast(table[key], pa.string()) for key in self._keys],
+            "",
+            null_handling="replace",
+            null_replacement=null_token,
+        )
+        # Add the token as column 0 so rows can be filtered by it below.
+        table = table.add_column(i=0, field_=col_token, column=key_values)
+
         yield from (
             (
-                key_value,
-                self._df.filter(
-                    *[nw_namespace.col(k) == v for k, v in zip(self._keys, key_value)]
+                # First tuple element: the key values, read from the first row of
+                # the group's frame. The walrus binds that frame to `t` so it can
+                # be reused as the second tuple element.
+                next(
+                    (
+                        t := self._df._from_native_frame(
+                            table.filter(pc.equal(table[col_token], v)).drop([col_token])
+                        )
+                    )
+                    .select(*self._keys)
+                    .head(1)
+                    .iter_rows()
                 ),
+                t,
             )
-            for key_value in key_values.iter_rows()
+            for v in pc.unique(key_values)
         )
 
 

diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py
@@ -65,7 +65,7 @@ def _create_expr_from_series(self, series: ArrowSeries) -> ArrowExpr:
     def _create_series_from_scalar(self, value: Any, series: ArrowSeries) -> ArrowSeries:
         from narwhals._arrow.series import ArrowSeries
 
-        if self._backend_version < (13,) and hasattr(value, "as_py"):  # pragma: no cover
+        if self._backend_version < (13,) and hasattr(value, "as_py"):
             value = value.as_py()
         return ArrowSeries._from_iterable(
             [value],
@@ -152,7 +152,7 @@ def lit(self, value: Any, dtype: DType | None) -> ArrowExpr:
         def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries:
             arrow_series = ArrowSeries._from_iterable(
                 data=[value],
-                name="lit",
+                name="literal",
                 backend_version=self._backend_version,
                 dtypes=self._dtypes,
             )
@@ -165,7 +165,7 @@ def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries:
             depth=0,
             function_name="lit",
             root_names=None,
-            output_names=["lit"],
+            output_names=[_lit_arrow_series.__name__],
             backend_version=self._backend_version,
             dtypes=self._dtypes,
         )
@@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes
         ).mean()
 
+    def median(self, *column_names: str) -> ArrowExpr:
+        """Build an expression over `column_names` and take its median."""
+        columns_expr = ArrowExpr.from_column_names(
+            *column_names, backend_version=self._backend_version, dtypes=self._dtypes
+        )
+        return columns_expr.median()
+
     def max(self, *column_names: str) -> ArrowExpr:
         return ArrowExpr.from_column_names(
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -261,6 +261,17 @@ def mean(self) -> int:
 
         return pc.mean(self._native_series)  # type: ignore[no-any-return]
 
+    def median(self) -> int:
+        """Return `pc.approximate_median` of the native series.
+
+        Raises:
+            InvalidOperationError: If the series dtype is not numeric.
+        """
+        import pyarrow.compute as pc  # ignore-banned-import()
+
+        from narwhals._exceptions import InvalidOperationError
+
+        if self.dtype.is_numeric():
+            return pc.approximate_median(self._native_series)  # type: ignore[no-any-return]
+
+        msg = "`median` operation not supported for non-numeric input type."
+        raise InvalidOperationError(msg)
+
     def min(self) -> int:
         import pyarrow.compute as pc  # ignore-banned-import()
 
@@ -655,6 +666,26 @@ def unique(self: Self, *, maintain_order: bool = False) -> ArrowSeries:
 
         return self._from_native_series(pc.unique(self._native_series))
 
+    def replace_strict(
+        self, old: Sequence[Any], new: Sequence[Any], *, return_dtype: DType | None
+    ) -> ArrowSeries:
+        """Replace every value found in `old` with the matching value in `new`.
+
+        Arguments:
+            old: Values to look up; positions pair with `new`.
+            new: Replacement values, one per entry of `old`.
+            return_dtype: If not None, the replaced values are cast to this dtype.
+
+        Returns:
+            A new series holding the replaced values.
+
+        Raises:
+            ValueError: If the result has a different number of nulls than the
+                input, i.e. some non-null value did not get replaced.
+        """
+        import pyarrow as pa  # ignore-banned-import
+        import pyarrow.compute as pc  # ignore-banned-import
+
+        # https://stackoverflow.com/a/79111029/4451315
+        idxs = pc.index_in(self._native_series, pa.array(old))
+        result_native = pc.take(pa.array(new), idxs)
+        if return_dtype is not None:
+            # `cast` returns a new array instead of casting in place, so the
+            # result must be rebound; otherwise `return_dtype` is silently ignored.
+            result_native = result_native.cast(
+                narwhals_to_native_dtype(return_dtype, self._dtypes)
+            )
+        result = self._from_native_series(result_native)
+        if result.is_null().sum() != self.is_null().sum():
+            msg = (
+                "replace_strict did not replace all non-null values.\n\n"
+                f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique().to_list()}"
+            )
+            raise ValueError(msg)
+        return result
+
     def sort(
         self: Self, *, descending: bool = False, nulls_last: bool = False
     ) -> ArrowSeries:

diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py
@@ -350,28 +350,28 @@ def convert_str_slice_to_int_slice(
 
 
 # Regex for date, time, separator and timezone components
-DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
+DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}|\d{8})"
 SEP_RE = r"(?P<sep>\s|T)"
-TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)"  # \s*(?P<period>[AP]M)?)?
+TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?|\d{6}?)"  # \s*(?P<period>[AP]M)?)?
 HMS_RE = r"^(?P<hms>\d{2}:\d{2}:\d{2})$"
 HM_RE = r"^(?P<hm>\d{2}:\d{2})$"
+HMS_RE_NO_SEP = r"^(?P<hms_no_sep>\d{6})$"
 TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})"  # Matches 'Z', '+02:00', '+0200', etc.
 FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"
 
 # Separate regexes for different date formats
 YMD_RE = r"^(?P<year>(?:[12][0-9])?[0-9]{2})(?P<sep1>[-/.])(?P<month>0[1-9]|1[0-2])(?P<sep2>[-/.])(?P<day>0[1-9]|[12][0-9]|3[01])$"
 DMY_RE = r"^(?P<day>0[1-9]|[12][0-9]|3[01])(?P<sep1>[-/.])(?P<month>0[1-9]|1[0-2])(?P<sep2>[-/.])(?P<year>(?:[12][0-9])?[0-9]{2})$"
 MDY_RE = r"^(?P<month>0[1-9]|1[0-2])(?P<sep1>[-/.])(?P<day>0[1-9]|[12][0-9]|3[01])(?P<sep2>[-/.])(?P<year>(?:[12][0-9])?[0-9]{2})$"
+YMD_RE_NO_SEP = r"^(?P<year>(?:[12][0-9])?[0-9]{2})(?P<month>0[1-9]|1[0-2])(?P<day>0[1-9]|[12][0-9]|3[01])$"
 
 DATE_FORMATS = (
+    (YMD_RE_NO_SEP, "%Y%m%d"),
     (YMD_RE, "%Y-%m-%d"),
     (DMY_RE, "%d-%m-%Y"),
     (MDY_RE, "%m-%d-%Y"),
 )
-TIME_FORMATS = (
-    (HMS_RE, "%H:%M:%S"),
-    (HM_RE, "%H:%M"),
-)
+TIME_FORMATS = ((HMS_RE, "%H:%M:%S"), (HM_RE, "%H:%M"), (HMS_RE_NO_SEP, "%H%M%S"))
 
 
 def parse_datetime_format(arr: pa.StringArray) -> str:
@@ -418,7 +418,9 @@ def _parse_date_format(arr: pa.Array) -> str:
 
     for date_rgx, date_fmt in DATE_FORMATS:
         matches = pc.extract_regex(arr, pattern=date_rgx)
-        if (
+        if date_fmt == "%Y%m%d" and pc.all(matches.is_valid()).as_py():
+            return date_fmt
+        elif (
             pc.all(matches.is_valid()).as_py()
             and pc.count(pc.unique(sep1 := matches.field("sep1"))).as_py() == 1
             and pc.count(pc.unique(sep2 := matches.field("sep2"))).as_py() == 1

diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -6,6 +6,7 @@
 from typing import Callable
 from typing import Literal
 from typing import NoReturn
+from typing import Sequence
 
 from narwhals._dask.utils import add_row_index
 from narwhals._dask.utils import maybe_evaluate
@@ -382,6 +383,19 @@ def mean(self) -> Self:
             returns_scalar=True,
         )
 
+    def median(self) -> Self:
+        """Approximate median of the expression; only numeric input is accepted.
+
+        The dtype check lives inside the callable handed to `_from_call`, so
+        `InvalidOperationError` is raised when that callable runs.
+        """
+        from dask_expr._shuffle import _is_numeric_cast_type
+
+        from narwhals._exceptions import InvalidOperationError
+
+        def func(_input: dask_expr.Series) -> dask_expr.Series:
+            if _is_numeric_cast_type(_input.dtype):
+                return _input.median_approximate()
+            msg = "`median` operation not supported for non-numeric input type."
+            raise InvalidOperationError(msg)
+
+        return self._from_call(func, "median", returns_scalar=True)
+
     def min(self) -> Self:
         return self._from_call(
             lambda _input: _input.min(),
@@ -491,6 +505,12 @@ def head(self) -> NoReturn:
         msg = "`Expr.head` is not supported for the Dask backend. Please use `LazyFrame.head` instead."
         raise NotImplementedError(msg)
 
+    def replace_strict(
+        self,
+        old: Sequence[Any],
+        new: Sequence[Any],
+        *,
+        return_dtype: DType | None,
+    ) -> Self:
+        """Always raise: `replace_strict` is not available for Dask expressions."""
+        msg = "`replace_strict` is not yet supported for Dask expressions"
+        raise NotImplementedError(msg)
+
     def sort(self, *, descending: bool = False, nulls_last: bool = False) -> NoReturn:
         # We can't (yet?) allow methods which modify the index
         msg = "`Expr.sort` is not supported for the Dask backend. Please use `LazyFrame.sort` instead."

diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py
@@ -76,14 +76,14 @@ def convert_if_dtype(
 
         return DaskExpr(
             lambda df: [
-                df._native_frame.assign(lit=value)
-                .loc[:, "lit"]
+                df._native_frame.assign(literal=value)
+                .loc[:, "literal"]
                 .pipe(convert_if_dtype, dtype)
             ],
             depth=0,
             function_name="lit",
             root_names=None,
-            output_names=["lit"],
+            output_names=["literal"],
             returns_scalar=False,
             backend_version=self._backend_version,
             dtypes=self._dtypes,
@@ -104,6 +104,11 @@ def mean(self, *column_names: str) -> DaskExpr:
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes
         ).mean()
 
+    def median(self, *column_names: str) -> DaskExpr:
+        """Build an expression over `column_names` and take its median."""
+        columns_expr = DaskExpr.from_column_names(
+            *column_names, backend_version=self._backend_version, dtypes=self._dtypes
+        )
+        return columns_expr.median()
+
     def sum(self, *column_names: str) -> DaskExpr:
         return DaskExpr.from_column_names(
             *column_names, backend_version=self._backend_version, dtypes=self._dtypes

diff --git a/narwhals/_exceptions.py b/narwhals/_exceptions.py
@@ -2,3 +2,6 @@
 
 
 class ColumnNotFoundError(Exception): ...
+
+
+# Raised when an operation is not valid for the input it is applied to,
+# e.g. `median` on a non-numeric input type.
+class InvalidOperationError(Exception): ...