Skip to content

Commit

Permalink
update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Mar 16, 2024
1 parent cc3106a commit 07a59a9
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 117 deletions.
81 changes: 28 additions & 53 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Seamlessly support both, without depending on either!

- ✅ **Just use** a subset of **the Polars API**, no need to learn anything new
- ✅ **No dependencies** (not even Polars), keep your library lightweight
- ✅ Support both **lazy** and eager execution
- ✅ Separate **lazy** and eager APIs
- ✅ Use Polars **Expressions**

**Note: this is a work in progress and a bit of an experiment, so don't take it too seriously**.
Expand All @@ -29,16 +29,16 @@ Or just vendor it, it's only a bunch of pure-Python files.

There are three steps to writing dataframe-agnostic code using Narwhals:

1. use `narwhals.DataFrame` to wrap a pandas or Polars DataFrame to a Narwhals DataFrame
2. use the subset of the Polars API supported by Narwhals. Some methods are only available
if you initialised `narwhals.DataFrame` with `features=['eager']`, or `features=['lazy']`
1. use `narwhals.LazyFrame` or `narwhals.DataFrame` to wrap a pandas or Polars
DataFrame/LazyFrame in a Narwhals class
2. use the subset of the Polars API supported by Narwhals. Just like in Polars,
some methods (e.g. `to_numpy`) are only available for `DataFrame`, not `LazyFrame`
3. use `narwhals.to_native` to return an object to the user in its original
dataframe flavour. For example:

- if you started with a pandas DataFrame, you'll get a pandas DataFrame back
- if you started with a Polars DataFrame, you'll get a Polars DataFrame back
- if you started with a Polars LazyFrame, you'll get a Polars LazyFrame back (unless
you called `.collect`!)
- if you started with pandas, you'll get pandas back
- if you started with Polars, you'll get Polars back

## Example

Expand All @@ -56,29 +56,24 @@ def my_agnostic_function(
suppliers_native,
parts_native,
):
suppliers = nw.DataFrame(suppliers_native)
parts = nw.DataFrame(parts_native)
suppliers = nw.LazyFrame(suppliers_native)
parts = nw.LazyFrame(parts_native)

result = (
suppliers.join(parts, left_on="city", right_on="city")
.filter(
nw.col("color").is_in(["Red", "Green"]),
nw.col("weight") > 14,
)
.group_by("s", "p")
.filter(nw.col("weight") > 10)
.group_by("s")
.agg(
weight_mean=nw.col("weight").mean(),
weight_max=nw.col("weight").max(),
)
).with_columns(nw.col("weight_max").cast(nw.Int64))
)
return nw.to_native(result)

```
You can pass in a pandas or Polars dataframe; the output will be the same!
Let's try it out:

```python

suppliers = {
"s": ["S1", "S2", "S3", "S4", "S5"],
"sname": ["Smith", "Jones", "Blake", "Clark", "Adams"],
Expand All @@ -101,13 +96,6 @@ print(
)
)
print("\nPolars output:")
print(
my_agnostic_function(
pl.DataFrame(suppliers),
pl.DataFrame(parts),
)
)
print("\nPolars lazy output:")
print(
my_agnostic_function(
pl.LazyFrame(suppliers),
Expand All @@ -118,37 +106,24 @@ print(

```
pandas output:
s p weight_mean
0 S1 P6 19.0
1 S2 P2 17.0
2 S3 P2 17.0
3 S4 P6 19.0
s weight_mean weight_max
0 S1 15.0 19.0
1 S2 14.5 17.0
2 S3 14.5 17.0
3 S4 15.0 19.0
Polars output:
shape: (4, 3)
┌─────┬─────┬─────────────┐
│ s ┆ p ┆ weight_mean │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞═════╪═════╪═════════════╡
│ S1 ┆ P6 ┆ 19.0 │
│ S3 ┆ P2 ┆ 17.0 │
│ S4 ┆ P6 ┆ 19.0 │
│ S2 ┆ P2 ┆ 17.0 │
└─────┴─────┴─────────────┘
Polars lazy output:
shape: (4, 3)
┌─────┬─────┬─────────────┐
│ s ┆ p ┆ weight_mean │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞═════╪═════╪═════════════╡
│ S1 ┆ P6 ┆ 19.0 │
│ S3 ┆ P2 ┆ 17.0 │
│ S4 ┆ P6 ┆ 19.0 │
│ S2 ┆ P2 ┆ 17.0 │
└─────┴─────┴─────────────┘
┌─────┬─────────────┬────────────┐
│ s ┆ weight_mean ┆ weight_max │
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 │
╞═════╪═════════════╪════════════╡
│ S2 ┆ 14.5 ┆ 17.0 │
│ S3 ┆ 14.5 ┆ 17.0 │
│ S4 ┆ 15.0 ┆ 19.0 │
│ S1 ┆ 15.0 ┆ 19.0 │
└─────┴─────────────┴────────────┘
```
Magic! 🪄

Expand Down
36 changes: 9 additions & 27 deletions f.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,27 @@
# ruff: noqa
from typing import Any, TYPE_CHECKING, TypeVar
# type: ignore
import pandas as pd
import polars as pl

import narwhals as nw

T = TypeVar("T")


def my_agnostic_function(
suppliers_native: T,
parts_native: T,
) -> T:
suppliers = nw.DataFrame(suppliers_native)
parts = nw.DataFrame(parts_native)
suppliers_native,
parts_native,
):
suppliers = nw.LazyFrame(suppliers_native)
parts = nw.LazyFrame(parts_native)

result = (
suppliers.join(parts, left_on="city", right_on="city")
.filter(
nw.col("color").is_in(["Red", "Green"]),
nw.col("weight") > 14,
)
.group_by("s", "p")
.filter(nw.col("weight") > 10)
.group_by("s")
.agg(
weight_mean=nw.col("weight").mean(),
weight_max=nw.col("weight").max(),
)
).with_columns(nw.col("weight_max").cast(nw.Int64))
)
return nw.to_native(result)


Expand All @@ -52,19 +47,6 @@ def my_agnostic_function(
)
)
print("\nPolars output:")
print(
my_agnostic_function(
pl.DataFrame(suppliers),
pl.DataFrame(parts),
)
)
print(
my_agnostic_function(
pl.DataFrame(suppliers),
pl.DataFrame(parts),
)
)
print("\nPolars lazy output:")
print(
my_agnostic_function(
pl.LazyFrame(suppliers),
Expand Down
26 changes: 6 additions & 20 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Generic
from typing import Iterable
from typing import Literal
from typing import Sequence
Expand All @@ -21,22 +20,9 @@
from narwhals.series import Series
from narwhals.typing import IntoExpr
from narwhals.typing import T
from narwhals.typing import T


def _validate_features(df: Any, features: set[str]) -> None:
if (pl := get_polars()) is not None and isinstance(df, pl.DataFrame):
df_features = {"eager"}
elif (pl := get_polars()) is not None and isinstance(df, pl.LazyFrame):
df_features = {"lazy"}
else:
df_features = df._features
if diff := {f for f in features if f not in df_features}:
msg = f"Features {diff} not supported by {type(df)} DataFrame"
raise TypeError(msg)


class BaseFrame(Generic[T]):
class BaseFrame:
_dataframe: Any
_implementation: str

Expand Down Expand Up @@ -122,7 +108,7 @@ def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self:
self._dataframe.filter(*predicates),
)

def group_by(self, *keys: str | Iterable[str]) -> GroupBy[T]:
def group_by(self, *keys: str | Iterable[str]) -> GroupBy:
from narwhals.group_by import GroupBy

# todo: groupby and lazygroupby
Expand Down Expand Up @@ -156,7 +142,7 @@ def join(
)


class DataFrame(BaseFrame[T]):
class DataFrame(BaseFrame):
def __init__(
self,
df: T,
Expand Down Expand Up @@ -194,7 +180,7 @@ def to_numpy(self) -> Any:
def shape(self) -> tuple[int, int]:
return self._dataframe.shape # type: ignore[no-any-return]

def __getitem__(self, col_name: str) -> Series[Any]:
def __getitem__(self, col_name: str) -> Series:
from narwhals.series import Series

return Series(self._dataframe[col_name], implementation=self._implementation)
Expand All @@ -203,7 +189,7 @@ def to_dict(self, *, as_series: bool = True) -> dict[str, Any]:
return self._dataframe.to_dict(as_series=as_series) # type: ignore[no-any-return]


class LazyFrame(BaseFrame[T]):
class LazyFrame(BaseFrame):
def __init__(
self,
df: T,
Expand All @@ -229,7 +215,7 @@ def __init__(
msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(df)}"
raise TypeError(msg)

def collect(self) -> DataFrame[Any]:
def collect(self) -> DataFrame:
return DataFrame(
self._dataframe.collect(),
implementation=self._implementation,
Expand Down
10 changes: 3 additions & 7 deletions narwhals/group_by.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Generic
from typing import Iterable

if TYPE_CHECKING:
from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.typing import IntoExpr
from narwhals.typing import T

# todo: make groupby and lazygroupby


class GroupBy(Generic[T]):
def __init__(
self, df: DataFrame[T] | LazyFrame[T], *keys: str | Iterable[str]
) -> None:
class GroupBy:
def __init__(self, df: DataFrame | LazyFrame, *keys: str | Iterable[str]) -> None:
self._df = df
self._keys = keys

def agg(
self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr
) -> DataFrame[T] | LazyFrame[T]:
) -> DataFrame | LazyFrame:
aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs)
return self._df.__class__(
self._df._dataframe.group_by(*self._keys).agg(*aggs, **named_aggs),
Expand Down
6 changes: 2 additions & 4 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,18 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Generic

from narwhals.translate import get_pandas
from narwhals.translate import get_polars

if TYPE_CHECKING:
from typing_extensions import Self
from narwhals.typing import T


class Series(Generic[T]):
class Series:
def __init__(
self,
series: T,
series: Any,
*,
implementation: str | None = None,
) -> None:
Expand Down
8 changes: 4 additions & 4 deletions narwhals/translate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Any

from narwhals.dependencies import get_modin
from narwhals.dependencies import get_pandas
Expand All @@ -9,21 +10,20 @@
if TYPE_CHECKING:
from narwhals.dataframe import BaseFrame
from narwhals.series import Series
from narwhals.typing import T


def to_native(obj: BaseFrame[T] | Series[T]) -> T:
def to_native(obj: BaseFrame | Series) -> Any:
from narwhals.dataframe import BaseFrame
from narwhals.series import Series

if isinstance(obj, BaseFrame):
return ( # type: ignore[no-any-return]
return (
obj._dataframe
if obj._implementation == "polars"
else obj._dataframe._dataframe
)
if isinstance(obj, Series):
return obj._series if obj._implementation == "polars" else obj._series._series # type: ignore[no-any-return]
return obj._series if obj._implementation == "polars" else obj._series._series

msg = f"Expected Narwhals object, got {type(obj)}."
raise TypeError(msg)
Expand Down
3 changes: 1 addition & 2 deletions narwhals/typing.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from typing import TYPE_CHECKING
from typing import Any
from typing import TypeAlias
from typing import TypeVar

if TYPE_CHECKING:
from narwhals.expression import Expr
from narwhals.series import Series

IntoExpr: TypeAlias = Expr | str | int | float | Series[Any]
IntoExpr: TypeAlias = Expr | str | int | float | Series

NativeDataFrame = TypeVar("NativeDataFrame")
NativeSeries = TypeVar("NativeSeries")
Expand Down

0 comments on commit 07a59a9

Please sign in to comment.