diff --git a/docs/overhead.md b/docs/overhead.md index 7edfb1f95..1477f6fa6 100644 --- a/docs/overhead.md +++ b/docs/overhead.md @@ -10,7 +10,7 @@ vs running pandas via Narwhals: ![Comparison of pandas vs "pandas via Narwhals" timings on TPC-H queries showing neglibile overhead](https://github.com/narwhals-dev/narwhals/assets/33491632/71029c26-4121-43bb-90fb-5ac1c16ab8a2) -[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2-w-native)'s the code to +[Here](https://www.kaggle.com/code/marcogorelli/narwhals-tpc-h-results-s-2)'s the code to reproduce the plot above, check the input sources for notebooks which run each individual query, along with the data sources. diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 9aab77381..525402b8e 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -72,13 +72,21 @@ def get_column(self, name: str) -> ArrowSeries: backend_version=self._backend_version, ) + @overload + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> ArrowSeries: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> ArrowDataFrame: ... + @overload def __getitem__(self, item: str) -> ArrowSeries: ... @overload def __getitem__(self, item: slice) -> ArrowDataFrame: ... - def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, str): from narwhals._arrow.series import ArrowSeries @@ -87,6 +95,16 @@ def __getitem__(self, item: str | slice) -> ArrowSeries | ArrowDataFrame: name=item, backend_version=self._backend_version, ) + elif isinstance(item, tuple) and len(item) == 2: + from narwhals._arrow.series import ArrowSeries + + # PyArrow columns are always strings + col_name = item[1] if isinstance(item[1], str) else self.columns[item[1]] + return ArrowSeries( + self._native_dataframe[col_name].take(item[0]), + name=col_name, + backend_version=self._backend_version, + ) elif isinstance(item, slice): if item.step is not None and item.step != 1: @@ -195,6 +213,32 @@ def sort( def to_pandas(self) -> Any: return self._native_dataframe.to_pandas() + def to_numpy(self) -> Any: + import numpy as np + + return np.column_stack([col.to_numpy() for col in self._native_dataframe.columns]) + + def to_dict(self, *, as_series: bool) -> Any: + df = self._native_dataframe + + names_and_values = zip(df.column_names, df.columns) + if as_series: + from narwhals._arrow.series import ArrowSeries + + return { + name: ArrowSeries(col, name=name, backend_version=self._backend_version) + for name, col in names_and_values + } + else: + return {name: col.to_pylist() for name, col in names_and_values} + + def with_row_index(self, name: str) -> Self: + pa = get_pyarrow() + df = self._native_dataframe + + row_indices = pa.array(range(df.num_rows)) + return self._from_native_dataframe(df.append_column(name, row_indices)) + def lazy(self) -> Self: return self diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index a889885c1..6be62d3e0 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -99,13 +99,21 @@ def get_column(self, name: str) -> PandasLikeSeries: backend_version=self._backend_version, ) + @overload + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> PandasLikeSeries: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> PandasLikeDataFrame: ... + @overload def __getitem__(self, item: str) -> PandasLikeSeries: ... @overload def __getitem__(self, item: slice) -> PandasLikeDataFrame: ... - def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFrame: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries @@ -115,6 +123,23 @@ def __getitem__(self, item: str | slice) -> PandasLikeSeries | PandasLikeDataFra backend_version=self._backend_version, ) + elif isinstance(item, tuple) and len(item) == 2: + from narwhals._pandas_like.series import PandasLikeSeries + + if isinstance(item[1], str): + native_series = self._native_dataframe.loc[item] + elif isinstance(item[1], int): + native_series = self._native_dataframe.iloc[item] + else: # pragma: no cover + msg = f"Expected str or int, got: {type(item[1])}" + raise TypeError(msg) + + return PandasLikeSeries( + native_series, + implementation=self._implementation, + backend_version=self._backend_version, + ) + elif isinstance(item, (slice, Sequence)) or ( (np := get_numpy()) is not None and isinstance(item, np.ndarray) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index e255844a9..e86380368 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -456,7 +456,10 @@ def get_column(self, name: str) -> Series: ) @overload - def __getitem__(self, item: Sequence[int]) -> Series: ... + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> Self: ... @overload def __getitem__(self, item: str) -> Series: ... @@ -464,7 +467,9 @@ def __getitem__(self, item: str) -> Series: ... @overload def __getitem__(self, item: slice) -> Self: ... - def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: + def __getitem__( + self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + ) -> Series | Self: """ Extract column or slice of DataFrame. @@ -473,7 +478,9 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: - str: extract column - slice or Sequence of integers: slice rows from dataframe. - + - tuple of Sequence of integers and str or int: slice rows and extract column at the same time. + If the second element of the tuple is an integer, it is interpreted as the column index. Otherwise, + it is interpreted as the column name. Notes: In contrast with Polars, pandas allows non-string column names. If you don't know whether the column name you're trying to extract @@ -508,7 +515,7 @@ def __getitem__(self, item: str | slice | Sequence[int]) -> Series | Self: 2 ] """ - if isinstance(item, str): + if isinstance(item, str) or (isinstance(item, tuple) and len(item) == 2): from narwhals.series import Series return Series( diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 86602a1de..e27a49485 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -67,7 +67,10 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): """ @overload - def __getitem__(self, item: Sequence[int]) -> Series: ... + def __getitem__(self, item: tuple[Sequence[int], str | int]) -> Series: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[int]) -> Self: ... @overload def __getitem__(self, item: str) -> Series: ... diff --git a/tests/expr/unary_test.py b/tests/expr/unary_test.py new file mode 100644 index 000000000..c13084436 --- /dev/null +++ b/tests/expr/unary_test.py @@ -0,0 +1,22 @@ +from typing import Any + +import narwhals as nw +from tests.utils import compare_dicts + + +def test_unary(constructor_with_lazy: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + result = ( + nw.from_native(constructor_with_lazy(data)) + .with_columns( + a_mean=nw.col("a").mean(), + a_sum=nw.col("a").sum(), + b_nunique=nw.col("b").n_unique(), + z_min=nw.col("z").min(), + z_max=nw.col("z").max(), + ) + .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) + ) + result_native = nw.to_native(result) + expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]} + compare_dicts(result_native, expected) diff --git a/tests/frame/pipe_test.py b/tests/frame/pipe_test.py index 6f49966b9..9dd66f10a 100644 --- a/tests/frame/pipe_test.py +++ b/tests/frame/pipe_test.py @@ -9,8 +9,8 @@ } -def test_pipe(constructor: Any) -> None: - df = nw.from_native(constructor(data)) +def test_pipe(constructor_with_pyarrow: Any) -> None: + df = nw.from_native(constructor_with_pyarrow(data)) columns = df.lazy().collect().columns result = df.pipe(lambda _df: _df.select([x for x in columns if len(x) == 2])) expected = {"ab": ["foo", "bars"]} diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py index 45390c561..4a911142e 100644 --- a/tests/frame/slice_test.py +++ b/tests/frame/slice_test.py @@ -5,8 +5,10 @@ import polars as pl import pyarrow as pa import pytest +from pandas.testing import assert_series_equal import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import compare_dicts data = { @@ -69,3 +71,23 @@ def test_gather_pandas_index() -> None: result = nw.from_native(df, eager_only=True)[[1, 2]] expected = {"a": [1, 2], "b": [4, 2]} compare_dicts(result, expected) + + +def test_gather_rows_cols(constructor_with_pyarrow: Any) -> None: + native_df = constructor_with_pyarrow(data) + df = nw.from_native(native_df, eager_only=True) + is_pandas_wo_pyarrow = parse_version(pd.__version__) < parse_version("1.0.0") + if isinstance(native_df, pa.Table) or is_pandas_wo_pyarrow: + # PyArrowSeries do not have `to_pandas` + result = df[[0, 3, 1], 1].to_numpy() + expected = np.array([11, 14, 12]) + assert np.array_equal(result, expected) + result = df[np.array([0, 3, 1]), "b"].to_numpy() + assert np.array_equal(result, expected) + else: + result = df[[0, 3, 1], 1].to_pandas() + expected_index = range(3) if isinstance(native_df, pl.DataFrame) else [0, 3, 1] + expected = pd.Series([11, 14, 12], name="b", index=expected_index) + assert_series_equal(result, expected, check_dtype=False) + result = df[np.array([0, 3, 1]), "b"].to_pandas() + assert_series_equal(result, expected, check_dtype=False) diff --git a/tests/frame/test_common.py b/tests/frame/test_common.py index 69be5e6d1..99a16de60 100644 --- a/tests/frame/test_common.py +++ b/tests/frame/test_common.py @@ -180,24 +180,6 @@ def test_expr_binary(df_raw: Any) -> None: compare_dicts(result_native, expected) -@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_lazy]) -def test_expr_unary(df_raw: Any) -> None: - result = ( - nw.from_native(df_raw) - .with_columns( - a_mean=nw.col("a").mean(), - a_sum=nw.col("a").sum(), - b_nunique=nw.col("b").n_unique(), - z_min=nw.col("z").min(), - z_max=nw.col("z").max(), - ) - .select(nw.col("a_mean", "a_sum", "b_nunique", "z_min", "z_max").unique()) - ) - result_native = nw.to_native(result) - expected = {"a_mean": [2], "a_sum": [6], "b_nunique": [2], "z_min": [7], "z_max": [9]} - compare_dicts(result_native, expected) - - @pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd, df_lazy]) def test_expr_transform(df_raw: Any) -> None: result = nw.from_native(df_raw).with_columns( diff --git a/tests/frame/to_dict_test.py b/tests/frame/to_dict_test.py index b0950c5c9..8fa31c336 100644 --- a/tests/frame/to_dict_test.py +++ b/tests/frame/to_dict_test.py @@ -3,16 +3,16 @@ import narwhals.stable.v1 as nw -def test_to_dict(constructor: Any) -> None: +def test_to_dict(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]} - df = nw.from_native(constructor(data), eager_only=True) + df = nw.from_native(constructor_with_pyarrow(data), eager_only=True) result = df.to_dict(as_series=False) assert result == data -def test_to_dict_as_series(constructor: Any) -> None: +def test_to_dict_as_series(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "c": [7.0, 8, 9]} - df = nw.from_native(constructor(data), eager_only=True) + df = nw.from_native(constructor_with_pyarrow(data), eager_only=True) result = df.to_dict(as_series=True) assert isinstance(result["a"], nw.Series) assert isinstance(result["b"], nw.Series) diff --git a/tests/frame/to_numpy_test.py b/tests/frame/to_numpy_test.py index 6f516334d..5cfa69bf7 100644 --- a/tests/frame/to_numpy_test.py +++ b/tests/frame/to_numpy_test.py @@ -7,9 +7,9 @@ import narwhals.stable.v1 as nw -def test_convert_numpy(constructor: Any) -> None: +def test_convert_numpy(constructor_with_pyarrow: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} - df_raw = constructor(data) + df_raw = constructor_with_pyarrow(data) result = nw.from_native(df_raw, eager_only=True).to_numpy() expected = np.array([[1, 3, 2], [4, 4, 6], [7.1, 8, 9]]).T diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py index 1b0ad5792..ef54557bd 100644 --- a/tests/frame/with_row_index_test.py +++ b/tests/frame/with_row_index_test.py @@ -9,9 +9,9 @@ } -def test_with_row_index(constructor: Any) -> None: - result = nw.from_native(constructor(data)).with_row_index() +def test_with_row_index(constructor_with_pyarrow: Any) -> None: + result = nw.from_native(constructor_with_pyarrow(data)).with_row_index() expected = {"a": ["foo", "bars"], "ab": ["foo", "bars"], "index": [0, 1]} compare_dicts(result, expected) - result = nw.from_native(constructor(data)).lazy().with_row_index() + result = nw.from_native(constructor_with_pyarrow(data)).lazy().with_row_index() compare_dicts(result, expected) diff --git a/utils/check_backend_completeness.py b/utils/check_backend_completeness.py index 924115fbb..abed808c6 100644 --- a/utils/check_backend_completeness.py +++ b/utils/check_backend_completeness.py @@ -25,10 +25,7 @@ "DataFrame.pipe", "DataFrame.rename", "DataFrame.tail", - "DataFrame.to_dict", - "DataFrame.to_numpy", "DataFrame.unique", - "DataFrame.with_row_index", "DataFrame.write_parquet", "Series.drop_nulls", "Series.fill_null",