Skip to content

Commit

Permalink
Support for polars
Browse files Browse the repository at this point in the history
  • Loading branch information
ecomodeller committed Aug 6, 2024
1 parent f649ac6 commit d1af799
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 17 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ docs/api/
.venv/

.testmondata
objects.json
objects.json
.jupyter_cache/
1 change: 1 addition & 0 deletions docs/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ quartodoc:
- read
- read_pfs
- from_pandas
- from_polars
- title: Dataset
desc: ""
contents:
Expand Down
23 changes: 23 additions & 0 deletions docs/user-guide/dfs0.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,29 @@ df = pd.read_csv(
df.to_dfs0("mauna_loa_co2.dfs0")
```

```{python}
import polars as pl
import mikeio
from datetime import datetime
df = pl.DataFrame(
{
"time": [datetime(2021, 1, 1), datetime(2021, 1, 2)],
"A": [1.0, 2.0],
"B": [4.0, 5.0],
}
)
ds = mikeio.from_polars(
df,
items={
"A": mikeio.ItemInfo(mikeio.EUMType.Water_Level),
"B": mikeio.ItemInfo(mikeio.EUMType.Discharge),
},
)
ds
```

## Dfs0 example notebooks

* [Dfs0](https://nbviewer.jupyter.org/github/DHI/mikeio/blob/main/notebooks/Dfs0%20-%20Timeseries.ipynb) - read, write, to_dataframe, non-equidistant, accumulated timestep, extrapolation
Expand Down
3 changes: 2 additions & 1 deletion mikeio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
if "64" not in architecture()[0]:
raise Exception("This library has not been tested for a 32 bit system.")

from .dataset import DataArray, Dataset, from_pandas
from .dataset import DataArray, Dataset, from_pandas, from_polars
from .dfs import Dfs0, Dfs1, Dfs2, Dfs3
from .dfsu import Dfsu, Mesh
from .eum import EUMType, EUMUnit, ItemInfo
Expand Down Expand Up @@ -210,4 +210,5 @@ def open(filename: str | Path, **kwargs: Any) -> Any:
"read",
"open",
"from_pandas",
"from_polars",
]
4 changes: 2 additions & 2 deletions mikeio/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ._dataarray import DataArray
from ._dataset import Dataset, from_pandas
from ._dataset import Dataset, from_pandas, from_polars

__all__ = ["DataArray", "Dataset", "from_pandas"]
__all__ = ["DataArray", "Dataset", "from_pandas", "from_polars"]
109 changes: 98 additions & 11 deletions mikeio/dataset/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

if TYPE_CHECKING:
import xarray
import polars as pl

from ._dataarray import DataArray
from ._data_utils import _to_safe_name, _get_time_idx_list, _n_selected_timesteps
Expand Down Expand Up @@ -1952,27 +1953,113 @@ def from_pandas(
ncol = df.values.shape[1]
data = [df.values[:, i] for i in range(ncol)]

# column names are always used as item names
item_list = _parse_items(df.columns, items)

das = {
item.name: DataArray(data=d, item=item, time=df.index)
for d, item in zip(data, item_list)
}
ds = Dataset(das)
return ds


def from_polars(
    df: "pl.DataFrame",
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None = None,
    datetime_col: str | None = None,
) -> "Dataset":
    """Create a Dataset from a polars DataFrame.

    Parameters
    ----------
    df: pl.DataFrame
        DataFrame
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None, optional
        Mapping of item names to ItemInfo objects, or a sequence of ItemInfo
        objects, or a single ItemInfo object.
    datetime_col: str, optional
        Name of the column containing datetime information, default is to use
        the first datetime column found.

    Returns
    -------
    Dataset
        time series dataset

    Examples
    --------
    ```{python}
    import polars as pl
    import mikeio
    from datetime import datetime
    df = pl.DataFrame(
        {
            "time": [datetime(2021, 1, 1), datetime(2021, 1, 2)],
            "A": [1.0, 2.0],
            "B": [4.0, 5.0],
        }
    )
    ds = mikeio.from_polars(
        df,
        items={
            "A": mikeio.ItemInfo(mikeio.EUMType.Water_Level),
            "B": mikeio.ItemInfo(mikeio.EUMType.Discharge),
        },
    )
    ds
    ```
    """
    # Imported lazily so polars stays an optional dependency of mikeio.
    import polars as pl

    if datetime_col is None:
        # Auto-detect: use the first column with a polars Datetime dtype.
        for col, dtype in zip(df.columns, df.dtypes):
            if isinstance(dtype, pl.Datetime):
                datetime_col = col
                break

        if datetime_col is None:
            # Message fixed to match the actual keyword argument name.
            raise ValueError(
                "Datetime column not found. Please specify datetime_col."
            )

    time = pd.DatetimeIndex(df[datetime_col])
    df = df.drop(datetime_col)

    # Convert the remaining (data) columns to one numpy array per column.
    array = df.to_numpy()
    data = [array[:, i] for i in range(array.shape[1])]

    # Column names are always used as item names.
    item_list = _parse_items(df.columns, items)

    das = {
        item.name: DataArray(data=d, item=item, time=time)
        for d, item in zip(data, item_list)
    }
    return Dataset(das)


def _parse_items(
    column_names: Sequence[str],
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None = None,
) -> List[ItemInfo]:
    """Resolve column names and optional item metadata into a list of ItemInfo.

    Parameters
    ----------
    column_names: Sequence[str]
        Names of the data columns; always used as the item names.
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None, optional
        Item metadata: a mapping from column name to ItemInfo, a sequence of
        ItemInfo matched positionally to the columns, or a single ItemInfo
        whose type/unit is applied to every column. If None, items with
        default type/unit are created.

    Returns
    -------
    List[ItemInfo]
        One ItemInfo per column, in column order.

    Raises
    ------
    TypeError
        If items is not a mapping, sequence, ItemInfo or None.
    """
    if items is None:
        item_list: List[ItemInfo] = [ItemInfo(name) for name in column_names]
    elif isinstance(items, ItemInfo):
        # Single ItemInfo: reuse its type/unit for all columns.
        eum_type = items.type
        eum_unit = items.unit
        item_list = [ItemInfo(name, eum_type, eum_unit) for name in column_names]
    elif isinstance(items, Mapping):
        item_list = [
            ItemInfo(name, items[name].type, items[name].unit)
            for name in column_names
        ]
    elif isinstance(items, Sequence):
        item_list = [
            ItemInfo(col, item.type, item.unit)
            for col, item in zip(column_names, items)
        ]
    else:
        raise TypeError("items must be a mapping, sequence or ItemInfo")

    return item_list
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dev = ["pytest",
"mypy==1.11.1",
]

test = ["pytest", "pytest-cov", "xarray","mypy==1.6.1","shapely","pyproj"]
test = ["pytest", "pytest-cov", "xarray","mypy==1.6.1","shapely","pyproj", "polars"]

notebooks= [
"nbformat",
Expand Down
53 changes: 52 additions & 1 deletion tests/test_dfs0.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
import numpy as np
import pandas as pd
import mikeio
Expand Down Expand Up @@ -349,7 +350,7 @@ def test_from_pandas_use_first_datetime_column() -> None:
assert ds.time[-1].year == 2001


def test_no_time_raises_error() -> None:
def test_from_pandas_no_time_raises_error() -> None:
df = pd.DataFrame(
{
"flow": np.array([1, np.nan, 2]),
Expand All @@ -361,6 +362,56 @@ def test_no_time_raises_error() -> None:
mikeio.from_pandas(df)


def test_from_polars_explicit_time_column() -> None:
    import polars as pl

    # Three hourly timestamps on 2001-01-01; "time" deliberately placed last.
    timestamps = [
        datetime(2001, 1, 1, 0),
        datetime(2001, 1, 1, 1),
        datetime(2001, 1, 1, 2),
    ]
    frame = pl.DataFrame(
        {
            "flow": [1.0, None, 2.0],
            "level": [2, 3.0, -1.3],
            "time": timestamps,
        }
    )

    item_map = {
        "flow": ItemInfo(EUMType.Discharge),
        "level": ItemInfo(EUMType.Water_Level),
    }
    ds = mikeio.from_polars(frame, datetime_col="time", items=item_map)

    assert ds.time[0].year == 2001
    assert ds["flow"].item.type == EUMType.Discharge
    assert ds["level"].item.name == "level"


def test_from_polars_use_first_datetime_column() -> None:
    import polars as pl

    frame = pl.DataFrame(
        {
            "time": [datetime(2001, 1, 1, hour) for hour in range(3)],
            "flow": [1.0, None, 2.0],
            "level": [2, 3.0, -1.3],
        }
    )

    # No datetime_col given: the first Datetime-typed column should be used.
    ds = mikeio.from_polars(frame)

    assert ds.n_timesteps == 3
    assert ds.time[-1].year == 2001
    assert ds["flow"].values[-1] == pytest.approx(2.0)


def test_write_from_pandas_series_monkey_patched(tmp_path):
df = pd.read_csv(
"tests/testdata/co2-mm-mlo.csv",
Expand Down

0 comments on commit d1af799

Please sign in to comment.