Skip to content

Commit

Permalink
after conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
DeaMariaLeon committed Nov 10, 2024
2 parents 3cbfe53 + b8d160d commit 6368b04
Show file tree
Hide file tree
Showing 58 changed files with 1,212 additions and 213 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/downstream_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ jobs:
run: |
uv pip uninstall narwhals --system
uv pip install -e . --system
# temporarily pin websockets to get CI green
uv pip install "websockets<14.0" --system
- name: show-deps
run: uv pip freeze
- name: Run `make narwhals-test-integration`
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Join the party!
- [rio](https://github.com/rio-labs/rio)
- [scikit-lego](https://github.com/koaning/scikit-lego)
- [scikit-playtime](https://github.com/koaning/scikit-playtime)
- [tabmat](https://github.com/Quantco/tabmat)
- [timebasedcv](https://github.com/FBruzzesi/timebasedcv)
- [tubular](https://github.com/lvgig/tubular)
- [wimsey](https://github.com/benrutter/wimsey)
Expand Down
2 changes: 2 additions & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@
- len
- max
- mean
- median
- min
- mode
- null_count
- n_unique
- over
- pipe
- quantile
- replace_strict
- round
- sample
- shift
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/narwhals.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Here are the top-level functions available in Narwhals.
- maybe_set_index
- mean
- mean_horizontal
- median
- min
- min_horizontal
- narwhalify
Expand Down
2 changes: 2 additions & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
- len
- max
- mean
- median
- min
- mode
- name
Expand All @@ -45,6 +46,7 @@
- pipe
- quantile
- rename
- replace_strict
- round
- sample
- scatter
Expand Down
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
'1.13.2'
'1.13.3'
```
If you see the version number, then the installation was successful!

Expand Down
4 changes: 3 additions & 1 deletion narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from narwhals.expr import max_horizontal
from narwhals.expr import mean
from narwhals.expr import mean_horizontal
from narwhals.expr import median
from narwhals.expr import min
from narwhals.expr import min_horizontal
from narwhals.expr import nth
Expand Down Expand Up @@ -67,7 +68,7 @@
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.13.2"
__version__ = "1.13.3"

__all__ = [
"dependencies",
Expand Down Expand Up @@ -99,6 +100,7 @@
"max_horizontal",
"mean",
"mean_horizontal",
"median",
"min",
"min_horizontal",
"nth",
Expand Down
11 changes: 11 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any
from typing import Callable
from typing import Literal
from typing import Sequence

from narwhals._expression_parsing import reuse_series_implementation
from narwhals._expression_parsing import reuse_series_namespace_implementation
Expand Down Expand Up @@ -204,6 +205,9 @@ def filter(self, *predicates: IntoArrowExpr) -> Self:
def mean(self) -> Self:
    """Build the ``mean`` aggregation for this expression.

    Delegates to the series-level ``mean`` method through
    ``reuse_series_implementation``, flagging the result as a scalar.
    """
    return reuse_series_implementation(
        self,
        "mean",
        returns_scalar=True,
    )

def median(self) -> Self:
    """Build the ``median`` aggregation for this expression.

    Delegates to the series-level ``median`` method through
    ``reuse_series_implementation``, flagging the result as a scalar.
    """
    return reuse_series_implementation(
        self,
        "median",
        returns_scalar=True,
    )

def count(self) -> Self:
    """Build the ``count`` aggregation for this expression.

    Delegates to the series-level ``count`` method through
    ``reuse_series_implementation``, flagging the result as a scalar.
    """
    return reuse_series_implementation(
        self,
        "count",
        returns_scalar=True,
    )

Expand Down Expand Up @@ -320,6 +324,13 @@ def is_last_distinct(self: Self) -> Self:
def unique(self: Self, *, maintain_order: bool = False) -> Self:
    """Build the ``unique`` operation for this expression.

    Delegates to the series-level ``unique`` method through
    ``reuse_series_implementation``, forwarding ``maintain_order`` unchanged.
    """
    return reuse_series_implementation(
        self,
        "unique",
        maintain_order=maintain_order,
    )

def replace_strict(
    self: Self,
    old: Sequence[Any],
    new: Sequence[Any],
    *,
    return_dtype: DType | None,
) -> Self:
    """Build the ``replace_strict`` operation for this expression.

    Delegates to the series-level ``replace_strict`` method through
    ``reuse_series_implementation``; ``old`` and ``new`` are forwarded
    positionally and ``return_dtype`` as a keyword argument.
    """
    forwarded_kwargs = {"return_dtype": return_dtype}
    return reuse_series_implementation(
        self,
        "replace_strict",
        old,
        new,
        **forwarded_kwargs,
    )

def sort(self: Self, *, descending: bool = False, nulls_last: bool = False) -> Self:
return reuse_series_implementation(
self, "sort", descending=descending, nulls_last=nulls_last
Expand Down
34 changes: 28 additions & 6 deletions narwhals/_arrow/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from narwhals._expression_parsing import is_simple_aggregation
from narwhals._expression_parsing import parse_into_exprs
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import remove_prefix

if TYPE_CHECKING:
Expand All @@ -17,6 +18,7 @@

POLARS_TO_ARROW_AGGREGATIONS = {
"len": "count",
"median": "approximate_median",
"n_unique": "count_distinct",
"std": "stddev",
"var": "variance", # currently unused, we don't have `var` yet
Expand Down Expand Up @@ -79,16 +81,36 @@ def agg(
)

def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
key_values = self._df.select(*self._keys).unique(subset=self._keys, keep="first")
nw_namespace = self._df.__narwhals_namespace__()
import pyarrow as pa # ignore-banned-import
import pyarrow.compute as pc # ignore-banned-import

col_token = generate_temporary_column_name(n_bytes=8, columns=self._df.columns)
null_token = "__null_token_value__" # noqa: S105

table = self._df._native_frame
key_values = pc.binary_join_element_wise(
*[pc.cast(table[key], pa.string()) for key in self._keys],
"",
null_handling="replace",
null_replacement=null_token,
)
table = table.add_column(i=0, field_=col_token, column=key_values)

yield from (
(
key_value,
self._df.filter(
*[nw_namespace.col(k) == v for k, v in zip(self._keys, key_value)]
next(
(
t := self._df._from_native_frame(
table.filter(pc.equal(table[col_token], v)).drop([col_token])
)
)
.select(*self._keys)
.head(1)
.iter_rows()
),
t,
)
for key_value in key_values.iter_rows()
for v in pc.unique(key_values)
)


Expand Down
11 changes: 8 additions & 3 deletions narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _create_expr_from_series(self, series: ArrowSeries) -> ArrowExpr:
def _create_series_from_scalar(self, value: Any, series: ArrowSeries) -> ArrowSeries:
from narwhals._arrow.series import ArrowSeries

if self._backend_version < (13,) and hasattr(value, "as_py"): # pragma: no cover
if self._backend_version < (13,) and hasattr(value, "as_py"):
value = value.as_py()
return ArrowSeries._from_iterable(
[value],
Expand Down Expand Up @@ -152,7 +152,7 @@ def lit(self, value: Any, dtype: DType | None) -> ArrowExpr:
def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries:
arrow_series = ArrowSeries._from_iterable(
data=[value],
name="lit",
name="literal",
backend_version=self._backend_version,
dtypes=self._dtypes,
)
Expand All @@ -165,7 +165,7 @@ def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries:
depth=0,
function_name="lit",
root_names=None,
output_names=["lit"],
output_names=[_lit_arrow_series.__name__],
backend_version=self._backend_version,
dtypes=self._dtypes,
)
Expand Down Expand Up @@ -325,6 +325,11 @@ def mean(self, *column_names: str) -> ArrowExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def median(self, *column_names: str) -> ArrowExpr:
    """Return the median expression over the given columns.

    An ``ArrowExpr`` selecting ``column_names`` is created with this
    namespace's backend version and dtypes, then its ``median`` is taken.
    """
    selected = ArrowExpr.from_column_names(
        *column_names,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.median()

def max(self, *column_names: str) -> ArrowExpr:
return ArrowExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
31 changes: 31 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,17 @@ def mean(self) -> int:

return pc.mean(self._native_series) # type: ignore[no-any-return]

def median(self) -> int:
    """Return the approximate median of the series.

    The value is computed with ``pyarrow.compute.approximate_median`` on the
    native series.

    Raises:
        InvalidOperationError: If the series dtype is not numeric.
    """
    import pyarrow.compute as pc  # ignore-banned-import()

    from narwhals._exceptions import InvalidOperationError

    # Happy path first: only numeric dtypes are accepted.
    if self.dtype.is_numeric():
        return pc.approximate_median(self._native_series)  # type: ignore[no-any-return]

    msg = "`median` operation not supported for non-numeric input type."
    raise InvalidOperationError(msg)

def min(self) -> int:
import pyarrow.compute as pc # ignore-banned-import()

Expand Down Expand Up @@ -655,6 +666,26 @@ def unique(self: Self, *, maintain_order: bool = False) -> ArrowSeries:

return self._from_native_series(pc.unique(self._native_series))

def replace_strict(
    self, old: Sequence[Any], new: Sequence[Any], *, return_dtype: DType | None
) -> ArrowSeries:
    """Replace every value of the series using an ``old`` -> ``new`` mapping.

    Each element is looked up in ``old`` and swapped for the element at the
    same position in ``new``.

    Arguments:
        old: Values to look for.
        new: Replacement values, positionally aligned with ``old``.
        return_dtype: If not ``None``, the result is cast to the native
            equivalent of this dtype.

    Returns:
        A new series holding the replaced values.

    Raises:
        ValueError: If the result has a different number of nulls than the
            input, which is taken to mean that some non-null value had no
            replacement.
    """
    import pyarrow as pa  # ignore-banned-import
    import pyarrow.compute as pc  # ignore-banned-import

    # https://stackoverflow.com/a/79111029/4451315
    # `index_in` yields the position of each element within `old` (null when
    # absent); `take` then picks the matching entry of `new`.
    idxs = pc.index_in(self._native_series, pa.array(old))
    result_native = pc.take(pa.array(new), idxs)
    if return_dtype is not None:
        # `cast` returns a new array rather than mutating in place, so the
        # result has to be rebound for `return_dtype` to take effect.
        result_native = result_native.cast(
            narwhals_to_native_dtype(return_dtype, self._dtypes)
        )
    result = self._from_native_series(result_native)
    # Extra nulls in the output are treated as values that were not replaced.
    if result.is_null().sum() != self.is_null().sum():
        msg = (
            "replace_strict did not replace all non-null values.\n\n"
            f"The following did not get replaced: {self.filter(~self.is_null() & result.is_null()).unique().to_list()}"
        )
        raise ValueError(msg)
    return result

def sort(
self: Self, *, descending: bool = False, nulls_last: bool = False
) -> ArrowSeries:
Expand Down
16 changes: 9 additions & 7 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,28 +350,28 @@ def convert_str_slice_to_int_slice(


# Regex for date, time, separator and timezone components
DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4})"
DATE_RE = r"(?P<date>\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}|\d{8})"
SEP_RE = r"(?P<sep>\s|T)"
TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?)" # \s*(?P<period>[AP]M)?)?
TIME_RE = r"(?P<time>\d{2}:\d{2}(?::\d{2})?|\d{6}?)" # \s*(?P<period>[AP]M)?)?
HMS_RE = r"^(?P<hms>\d{2}:\d{2}:\d{2})$"
HM_RE = r"^(?P<hm>\d{2}:\d{2})$"
HMS_RE_NO_SEP = r"^(?P<hms_no_sep>\d{6})$"
TZ_RE = r"(?P<tz>Z|[+-]\d{2}:?\d{2})" # Matches 'Z', '+02:00', '+0200', '+02', etc.
FULL_RE = rf"{DATE_RE}{SEP_RE}?{TIME_RE}?{TZ_RE}?$"

# Separate regexes for different date formats
YMD_RE = r"^(?P<year>(?:[12][0-9])?[0-9]{2})(?P<sep1>[-/.])(?P<month>0[1-9]|1[0-2])(?P<sep2>[-/.])(?P<day>0[1-9]|[12][0-9]|3[01])$"
DMY_RE = r"^(?P<day>0[1-9]|[12][0-9]|3[01])(?P<sep1>[-/.])(?P<month>0[1-9]|1[0-2])(?P<sep2>[-/.])(?P<year>(?:[12][0-9])?[0-9]{2})$"
MDY_RE = r"^(?P<month>0[1-9]|1[0-2])(?P<sep1>[-/.])(?P<day>0[1-9]|[12][0-9]|3[01])(?P<sep2>[-/.])(?P<year>(?:[12][0-9])?[0-9]{2})$"
YMD_RE_NO_SEP = r"^(?P<year>(?:[12][0-9])?[0-9]{2})(?P<month>0[1-9]|1[0-2])(?P<day>0[1-9]|[12][0-9]|3[01])$"

DATE_FORMATS = (
(YMD_RE_NO_SEP, "%Y%m%d"),
(YMD_RE, "%Y-%m-%d"),
(DMY_RE, "%d-%m-%Y"),
(MDY_RE, "%m-%d-%Y"),
)
TIME_FORMATS = (
(HMS_RE, "%H:%M:%S"),
(HM_RE, "%H:%M"),
)
TIME_FORMATS = ((HMS_RE, "%H:%M:%S"), (HM_RE, "%H:%M"), (HMS_RE_NO_SEP, "%H%M%S"))


def parse_datetime_format(arr: pa.StringArray) -> str:
Expand Down Expand Up @@ -418,7 +418,9 @@ def _parse_date_format(arr: pa.Array) -> str:

for date_rgx, date_fmt in DATE_FORMATS:
matches = pc.extract_regex(arr, pattern=date_rgx)
if (
if date_fmt == "%Y%m%d" and pc.all(matches.is_valid()).as_py():
return date_fmt
elif (
pc.all(matches.is_valid()).as_py()
and pc.count(pc.unique(sep1 := matches.field("sep1"))).as_py() == 1
and pc.count(pc.unique(sep2 := matches.field("sep2"))).as_py() == 1
Expand Down
20 changes: 20 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable
from typing import Literal
from typing import NoReturn
from typing import Sequence

from narwhals._dask.utils import add_row_index
from narwhals._dask.utils import maybe_evaluate
Expand Down Expand Up @@ -382,6 +383,19 @@ def mean(self) -> Self:
returns_scalar=True,
)

def median(self) -> Self:
    """Build the ``median`` aggregation for this expression.

    The wrapped callable returns ``median_approximate()`` of its input and is
    registered through ``_from_call`` under the name ``"median"`` with
    ``returns_scalar=True``.

    Raises (when the callable runs):
        InvalidOperationError: If the input dtype is not recognised as
            numeric by ``_is_numeric_cast_type``.
    """
    from dask_expr._shuffle import _is_numeric_cast_type

    from narwhals._exceptions import InvalidOperationError

    def func(_input: dask_expr.Series) -> dask_expr.Series:
        # Happy path first: only numeric dtypes are accepted.
        if _is_numeric_cast_type(_input.dtype):
            return _input.median_approximate()
        msg = "`median` operation not supported for non-numeric input type."
        raise InvalidOperationError(msg)

    return self._from_call(func, "median", returns_scalar=True)

def min(self) -> Self:
return self._from_call(
lambda _input: _input.min(),
Expand Down Expand Up @@ -491,6 +505,12 @@ def head(self) -> NoReturn:
msg = "`Expr.head` is not supported for the Dask backend. Please use `LazyFrame.head` instead."
raise NotImplementedError(msg)

def replace_strict(
    self,
    old: Sequence[Any],
    new: Sequence[Any],
    *,
    return_dtype: DType | None,
) -> Self:
    """Reject ``replace_strict``, which the Dask backend does not implement.

    All arguments are accepted only for signature compatibility and are
    ignored.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "`replace_strict` is not yet supported for Dask expressions"
    )

def sort(self, *, descending: bool = False, nulls_last: bool = False) -> NoReturn:
# We can't (yet?) allow methods which modify the index
msg = "`Expr.sort` is not supported for the Dask backend. Please use `LazyFrame.sort` instead."
Expand Down
11 changes: 8 additions & 3 deletions narwhals/_dask/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ def convert_if_dtype(

return DaskExpr(
lambda df: [
df._native_frame.assign(lit=value)
.loc[:, "lit"]
df._native_frame.assign(literal=value)
.loc[:, "literal"]
.pipe(convert_if_dtype, dtype)
],
depth=0,
function_name="lit",
root_names=None,
output_names=["lit"],
output_names=["literal"],
returns_scalar=False,
backend_version=self._backend_version,
dtypes=self._dtypes,
Expand All @@ -104,6 +104,11 @@ def mean(self, *column_names: str) -> DaskExpr:
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
).mean()

def median(self, *column_names: str) -> DaskExpr:
    """Return the median expression over the given columns.

    A ``DaskExpr`` selecting ``column_names`` is created with this
    namespace's backend version and dtypes, then its ``median`` is taken.
    """
    selected = DaskExpr.from_column_names(
        *column_names,
        backend_version=self._backend_version,
        dtypes=self._dtypes,
    )
    return selected.median()

def sum(self, *column_names: str) -> DaskExpr:
return DaskExpr.from_column_names(
*column_names, backend_version=self._backend_version, dtypes=self._dtypes
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@


class ColumnNotFoundError(Exception):
    """Error type for a column that could not be found.

    NOTE(review): the raising sites are not visible here; confirm the exact
    conditions against the callers.
    """


class InvalidOperationError(Exception):
    """Error type for an operation that is not valid for the given input.

    Raised, for example, by ``median`` when the input is not numeric.
    """
Loading

0 comments on commit 6368b04

Please sign in to comment.