Skip to content

Commit

Permalink
Merge branch 'narwhals-dev:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
ugohuche authored Jul 11, 2024
2 parents 27420ba + 66a1909 commit 7d15b35
Show file tree
Hide file tree
Showing 13 changed files with 142 additions and 40 deletions.
2 changes: 1 addition & 1 deletion docs/overhead.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ vs running pandas via Narwhals:

![Comparison of pandas vs "pandas via Narwhals" timings on TPC-H queries showing negligible overhead](https://github.com/narwhals-dev/narwhals/assets/33491632/71029c26-4121-43bb-90fb-5ac1c16ab8a2)

[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2-w-native)'s the code to
[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2)'s the code to
reproduce the plot above, check the input
sources for notebooks which run each individual query, along with
the data sources.
Expand Down
46 changes: 45 additions & 1 deletion narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,21 @@ def get_column(self, name: str) -> ArrowSeries:
backend_version=self._backend_version,
)

@overload
def __getitem__(self, item: tuple[Sequence[int], str | int]) -> ArrowSeries: ... # type: ignore[overload-overlap]

@overload
def __getitem__(self, item: Sequence[int]) -> ArrowDataFrame: ...

@overload
def __getitem__(self, item: str) -> ArrowSeries: ...

@overload
def __getitem__(self, item: slice) -> ArrowDataFrame: ...

def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame:
def __getitem__(
self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
) -> ArrowSeries | ArrowDataFrame:
if isinstance(item, str):
from narwhals._arrow.series import ArrowSeries

Expand All @@ -87,6 +95,16 @@ def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame:
name=item,
backend_version=self._backend_version,
)
elif isinstance(item, tuple) and len(item) == 2:
from narwhals._arrow.series import ArrowSeries

# PyArrow columns are always strings
col_name = item[1] if isinstance(item[1], str) else self.columns[item[1]]
return ArrowSeries(
self._native_dataframe[col_name].take(item[0]),
name=col_name,
backend_version=self._backend_version,
)

elif isinstance(item, slice):
if item.step is not None and item.step != 1:
Expand Down Expand Up @@ -195,6 +213,32 @@ def sort(
def to_pandas(self) -> Any:
return self._native_dataframe.to_pandas()

def to_numpy(self) -> Any:
    """Convert the native table to a single NumPy array.

    Every native column is converted with its own ``to_numpy`` and the
    resulting arrays are handed to ``numpy.column_stack``, so (for 1-D
    column arrays) column *i* of the table becomes column *i* of the result.

    Returns:
        The array produced by ``numpy.column_stack``.
    """
    import numpy as np

    native_columns = self._native_dataframe.columns
    arrays = [column.to_numpy() for column in native_columns]
    return np.column_stack(arrays)

def to_dict(self, *, as_series: bool) -> Any:
    """Return the table's columns as a dictionary keyed by column name.

    Arguments:
        as_series: If True, each value is an ``ArrowSeries`` wrapping the
            native column. Otherwise each value is the plain Python list
            returned by the column's ``to_pylist``.

    Returns:
        A dict mapping each column name to its values, in the order the
        native table reports its columns.
    """
    table = self._native_dataframe
    names = table.column_names
    columns = table.columns

    if not as_series:
        return {name: col.to_pylist() for name, col in zip(names, columns)}

    # Imported lazily, as elsewhere in this module.
    from narwhals._arrow.series import ArrowSeries

    wrapped = {}
    for name, col in zip(names, columns):
        wrapped[name] = ArrowSeries(
            col, name=name, backend_version=self._backend_version
        )
    return wrapped

def with_row_index(self, name: str) -> Self:
    """Return a new frame with a 0-based row-index column inserted first.

    The index column is placed at position 0 so the resulting column order
    matches Polars' ``with_row_index`` (index first, then the existing
    columns) rather than being appended after the existing columns.

    Arguments:
        name: Name of the new row-index column.

    Returns:
        A frame wrapping the table returned by ``Table.add_column``.
    """
    pa = get_pyarrow()
    df = self._native_dataframe

    # One integer per row: 0, 1, ..., num_rows - 1.
    row_indices = pa.array(range(df.num_rows))
    return self._from_native_dataframe(df.add_column(0, name, row_indices))

def lazy(self) -> Self:
    """Return this frame unchanged.

    This implementation hands back the same object instead of building a
    separate lazy wrapper.
    """
    return self

Expand Down
27 changes: 26 additions & 1 deletion narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,21 @@ def get_column(self, name: str) -> PandasLikeSeries:
backend_version=self._backend_version,
)

@overload
def __getitem__(self, item: tuple[Sequence[int], str | int]) -> PandasLikeSeries: ... # type: ignore[overload-overlap]

@overload
def __getitem__(self, item: Sequence[int]) -> PandasLikeDataFrame: ...

@overload
def __getitem__(self, item: str) -> PandasLikeSeries: ...

@overload
def __getitem__(self, item: slice) -> PandasLikeDataFrame: ...

def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFrame:
def __getitem__(
self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
) -> PandasLikeSeries | PandasLikeDataFrame:
if isinstance(item, str):
from narwhals._pandas_like.series import PandasLikeSeries

Expand All @@ -115,6 +123,23 @@ def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFra
backend_version=self._backend_version,
)

elif isinstance(item, tuple) and len(item) == 2:
from narwhals._pandas_like.series import PandasLikeSeries

if isinstance(item[1], str):
native_series = self._native_dataframe.loc[item]
elif isinstance(item[1], int):
native_series = self._native_dataframe.iloc[item]
else: # pragma: no cover
msg = f"Expected str or int, got: {type(item[1])}"
raise TypeError(msg)

return PandasLikeSeries(
native_series,
implementation=self._implementation,
backend_version=self._backend_version,
)

elif isinstance(item, (slice, Sequence)) or (
(np := get_numpy()) is not None
and isinstance(item, np.ndarray)
Expand Down
15 changes: 11 additions & 4 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,15 +456,20 @@ def get_column(self, name: str) -> Series:
)

@overload
def __getitem__(self, item: Sequence[int]) -> Series: ...
def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap]

@overload
def __getitem__(self, item: Sequence[int]) -> Self: ...

@overload
def __getitem__(self, item: str) -> Series: ...

@overload
def __getitem__(self, item: slice) -> Self: ...

def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
def __getitem__(
self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
) -> Series | Self:
"""
Extract column or slice of DataFrame.
Expand All @@ -473,7 +478,9 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
- str: extract column
- slice or Sequence of integers: slice rows from dataframe.
- tuple of Sequence of integers and str or int: slice rows and extract column at the same time.
If the second element of the tuple is an integer, it is interpreted as the column index. Otherwise,
it is interpreted as the column name.
Notes:
In contrast with Polars, pandas allows non-string column names.
If you don't know whether the column name you're trying to extract
Expand Down Expand Up @@ -508,7 +515,7 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self:
2
]
"""
if isinstance(item, str):
if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2):
from narwhals.series import Series

return Series(
Expand Down
5 changes: 4 additions & 1 deletion narwhals/stable/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ class DataFrame(NwDataFrame[IntoDataFrameT]):
"""

@overload
def __getitem__(self, item: Sequence[int]) -> Series: ...
def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap]

@overload
def __getitem__(self, item: Sequence[int]) -> Self: ...

@overload
def __getitem__(self, item: str) -> Series: ...
Expand Down
22 changes: 22 additions & 0 deletions tests/expr/unary_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Any

import narwhals as nw
from tests.utils import compare_dicts


def test_unary(constructor_with_lazy: Any) -> None:
    """Check that aggregations added via ``with_columns`` reduce to one unique row."""
    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
    frame = nw.from_native(constructor_with_lazy(data))

    # Attach each aggregate as a new column alongside the original data.
    with_aggregates = frame.with_columns(
        a_mean=nw.col("a").mean(),
        a_sum=nw.col("a").sum(),
        b_nunique=nw.col("b").n_unique(),
        z_min=nw.col("z").min(),
        z_max=nw.col("z").max(),
    )
    aggregate_columns = nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max")
    result = with_aggregates.select(aggregate_columns.unique())

    result_native = nw.to_native(result)
    expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]}
    compare_dicts(result_native, expected)
4 changes: 2 additions & 2 deletions tests/frame/pipe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
}


def test_pipe(constructor: Any) -> None:
df = nw.from_native(constructor(data))
def test_pipe(constructor_with_pyarrow: Any) -> None:
df = nw.from_native(constructor_with_pyarrow(data))
columns = df.lazy().collect().columns
result = df.pipe(lambda _df: _df.select([x for x in columns if len(x) == 2]))
expected = {"ab": ["foo", "bars"]}
Expand Down
22 changes: 22 additions & 0 deletions tests/frame/slice_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import polars as pl
import pyarrow as pa
import pytest
from pandas.testing import assert_series_equal

import narwhals.stable.v1 as nw
from narwhals.utils import parse_version
from tests.utils import compare_dicts

data = {
Expand Down Expand Up @@ -69,3 +71,23 @@ def test_gather_pandas_index() -> None:
result = nw.from_native(df, eager_only=True)[[1, 2]]
expected = {"a": [1, 2], "b": [4, 2]}
compare_dicts(result, expected)


def test_gather_rows_cols(constructor_with_pyarrow: Any) -> None:
    """Check ``df[rows, column]`` with the column given by position and by name."""
    native_df = constructor_with_pyarrow(data)
    df = nw.from_native(native_df, eager_only=True)

    # Same rows and same column ("b" is at position 1), addressed two ways.
    selections = (([0, 3, 1], 1), (np.array([0, 3, 1]), "b"))

    pandas_lacks_pyarrow = parse_version(pd.__version__) < parse_version("1.0.0")
    if isinstance(native_df, pa.Table) or pandas_lacks_pyarrow:
        # PyArrow-backed series have no `to_pandas`, so compare via NumPy.
        expected_values = np.array([11, 14, 12])
        for rows, column in selections:
            assert np.array_equal(df[rows, column].to_numpy(), expected_values)
        return

    expected_index = range(3) if isinstance(native_df, pl.DataFrame) else [0, 3, 1]
    expected_series = pd.Series([11, 14, 12], name="b", index=expected_index)
    for rows, column in selections:
        assert_series_equal(
            df[rows, column].to_pandas(), expected_series, check_dtype=False
        )
18 changes: 0 additions & 18 deletions tests/frame/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,24 +180,6 @@ def test_expr_binary(df_raw: Any) -> None:
compare_dicts(result_native, expected)


@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_lazy])
def test_expr_unary(df_raw: Any) -> None:
    """Check that aggregations added via ``with_columns`` reduce to one unique row."""
    frame = nw.from_native(df_raw)

    # Attach each aggregate as a new column alongside the original data.
    with_aggregates = frame.with_columns(
        a_mean=nw.col("a").mean(),
        a_sum=nw.col("a").sum(),
        b_nunique=nw.col("b").n_unique(),
        z_min=nw.col("z").min(),
        z_max=nw.col("z").max(),
    )
    aggregate_columns = nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max")
    result = with_aggregates.select(aggregate_columns.unique())

    result_native = nw.to_native(result)
    expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]}
    compare_dicts(result_native, expected)


@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd, df_lazy])
def test_expr_transform(df_raw: Any) -> None:
result = nw.from_native(df_raw).with_columns(
Expand Down
8 changes: 4 additions & 4 deletions tests/frame/to_dict_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@
import narwhals.stable.v1 as nw


def test_to_dict(constructor: Any) -> None:
def test_to_dict(constructor_with_pyarrow: Any) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]}
df = nw.from_native(constructor(data), eager_only=True)
df = nw.from_native(constructor_with_pyarrow(data), eager_only=True)
result = df.to_dict(as_series=False)
assert result == data


def test_to_dict_as_series(constructor: Any) -> None:
def test_to_dict_as_series(constructor_with_pyarrow: Any) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]}
df = nw.from_native(constructor(data), eager_only=True)
df = nw.from_native(constructor_with_pyarrow(data), eager_only=True)
result = df.to_dict(as_series=True)
assert isinstance(result["a"], nw.Series)
assert isinstance(result["b"], nw.Series)
Expand Down
4 changes: 2 additions & 2 deletions tests/frame/to_numpy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import narwhals.stable.v1 as nw


def test_convert_numpy(constructor: Any) -> None:
def test_convert_numpy(constructor_with_pyarrow: Any) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}
df_raw = constructor(data)
df_raw = constructor_with_pyarrow(data)
result = nw.from_native(df_raw, eager_only=True).to_numpy()

expected = np.array([[1, 3, 2], [4, 4, 6], [7.1, 8, 9]]).T
Expand Down
6 changes: 3 additions & 3 deletions tests/frame/with_row_index_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
}


def test_with_row_index(constructor: Any) -> None:
result = nw.from_native(constructor(data)).with_row_index()
def test_with_row_index(constructor_with_pyarrow: Any) -> None:
result = nw.from_native(constructor_with_pyarrow(data)).with_row_index()
expected = {"a": ["foo", "bars"], "ab": ["foo", "bars"], "index": [0, 1]}
compare_dicts(result, expected)
result = nw.from_native(constructor(data)).lazy().with_row_index()
result = nw.from_native(constructor_with_pyarrow(data)).lazy().with_row_index()
compare_dicts(result, expected)
3 changes: 0 additions & 3 deletions utils/check_backend_completeness.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
"DataFrame.pipe",
"DataFrame.rename",
"DataFrame.tail",
"DataFrame.to_dict",
"DataFrame.to_numpy",
"DataFrame.unique",
"DataFrame.with_row_index",
"DataFrame.write_parquet",
"Series.drop_nulls",
"Series.fill_null",
Expand Down

0 comments on commit 7d15b35

Please sign in to comment.