Skip to content

Commit

Permalink
fix(python): Handle DataFrame.extend extending by itself (#9897)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Jul 16, 2023
1 parent bb36e4c commit 2021249
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 65 deletions.
40 changes: 30 additions & 10 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5827,6 +5827,10 @@ def vstack(self, other: DataFrame, *, in_place: bool = False) -> Self:
in_place
Modify in place.
See Also
--------
extend
Examples
--------
>>> df1 = pl.DataFrame(
Expand Down Expand Up @@ -5874,26 +5878,36 @@ def extend(self, other: DataFrame) -> Self:
"""
Extend the memory backed by this `DataFrame` with the values from `other`.
Different from `vstack` which adds the chunks from `other` to the chunks of this
`DataFrame` `extend` appends the data from `other` to the underlying memory
locations and thus may cause a reallocation.
Different from ``vstack`` which adds the chunks from ``other`` to the chunks of
this ``DataFrame``, ``extend`` appends the data from `other` to the underlying
memory locations and thus may cause a reallocation.
If this does not cause a reallocation, the resulting data structure will not
have any extra chunks and thus will yield faster queries.
Prefer `extend` over `vstack` when you want to do a query after a single append.
For instance during online operations where you add `n` rows and rerun a query.
Prefer ``extend`` over ``vstack`` when you want to do a query after a single
append. For instance, during online operations where you add `n` rows and rerun
a query.
Prefer `vstack` over `extend` when you want to append many times before doing a
query. For instance when you read in multiple files and when to store them in a
single `DataFrame`. In the latter case, finish the sequence of `vstack`
operations with a `rechunk`.
Prefer ``vstack`` over ``extend`` when you want to append many times before
doing a query. For instance, when you read in multiple files and want to store
them in a single ``DataFrame``. In the latter case, finish the sequence of
``vstack`` operations with a ``rechunk``.
Parameters
----------
other
DataFrame to vertically add.
Warnings
--------
This method modifies the dataframe in-place. The dataframe is returned for
convenience only.
See Also
--------
vstack
Examples
--------
>>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
Expand All @@ -5914,7 +5928,13 @@ def extend(self, other: DataFrame) -> Self:
└─────┴─────┘
"""
self._df.extend(other._df)
try:
self._df.extend(other._df)
except RuntimeError as exc:
if str(exc) == "Already mutably borrowed":
self._df.extend(other._df.clone())
else:
raise exc
return self

def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame:
Expand Down
55 changes: 0 additions & 55 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,61 +704,6 @@ def test_hstack_dataframe(in_place: bool) -> None:
assert_frame_equal(df_out, expected)


def test_extend() -> None:
    """Check that ``DataFrame.extend`` appends the rows of another frame.

    Two scenarios are exercised: extending a frame that holds integer, boolean,
    string, categorical and datetime columns, and the case marked 8745 in which
    the frame being extended is a slice of a longer frame.
    """
    # Both categorical columns are created under one StringCache context —
    # presumably so their category mappings line up when the frames are
    # combined and compared.
    with pl.StringCache():
        df1 = pl.DataFrame(
            {
                "foo": [1, 2],
                "bar": [True, False],
                "ham": ["a", "b"],
                "cat": ["A", "B"],
                "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)],
            }
        ).with_columns(
            [
                pl.col("cat").cast(pl.Categorical),
            ]
        )
        df2 = pl.DataFrame(
            {
                "foo": [3, 4],
                "bar": [True, None],
                "ham": ["c", "d"],
                "cat": ["C", "B"],
                "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)],
            }
        ).with_columns(
            [
                pl.col("cat").cast(pl.Categorical),
            ]
        )

        # ``extend`` mutates ``df1``; the return value is not needed here.
        df1.extend(df2)
        expected = pl.DataFrame(
            {
                "foo": [1, 2, 3, 4],
                "bar": [True, False, True, None],
                "ham": ["a", "b", "c", "d"],
                "cat": ["A", "B", "C", "B"],
                "dates": [
                    datetime(2021, 1, 1),
                    datetime(2021, 2, 1),
                    datetime(2022, 9, 1),
                    datetime(2021, 2, 1),
                ],
            }
        ).with_columns(
            pl.col("cat").cast(pl.Categorical),
        )
        assert_frame_equal(df1, expected)

        # 8745
        df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}])
        # Rebind the same name so that only the slice stays referenced, exactly
        # as in the original reproduction.
        df = df[:-1]
        tail = pl.DataFrame([{"age": 8}])
        # Pass the flag by keyword: a bare positional ``False`` is unreadable.
        assert df.extend(tail).to_dict(as_series=False) == {"age": [1, 2, 8]}


def test_file_buffer() -> None:
f = BytesIO()
f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
Expand Down
81 changes: 81 additions & 0 deletions py-polars/tests/unit/dataframe/test_extend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from datetime import datetime

import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_extend_various_dtypes() -> None:
    """Extend a frame holding int, bool, str, categorical and datetime columns.

    The frame that is extended must afterwards contain its own rows followed by
    the rows of the other frame, for every column.
    """
    # All frames with a categorical column are built inside one StringCache
    # context — presumably so their category mappings are compatible.
    with pl.StringCache():
        head = pl.DataFrame(
            {
                "foo": [1, 2],
                "bar": [True, False],
                "ham": ["a", "b"],
                "cat": ["A", "B"],
                "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)],
            },
            schema_overrides={"cat": pl.Categorical},
        )
        tail = pl.DataFrame(
            {
                "foo": [3, 4],
                "bar": [True, None],
                "ham": ["c", "d"],
                "cat": ["C", "B"],
                "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)],
            },
            schema_overrides={"cat": pl.Categorical},
        )
        # The expected frame does not depend on the frames above, so it can be
        # prepared before the operation under test runs.
        want = pl.DataFrame(
            {
                "foo": [1, 2, 3, 4],
                "bar": [True, False, True, None],
                "ham": ["a", "b", "c", "d"],
                "cat": ["A", "B", "C", "B"],
                "dates": [
                    datetime(2021, 1, 1),
                    datetime(2021, 2, 1),
                    datetime(2022, 9, 1),
                    datetime(2021, 2, 1),
                ],
            },
            schema_overrides={"cat": pl.Categorical},
        )

        head.extend(tail)

        assert_frame_equal(head, want)


def test_extend_slice_offset_8745() -> None:
    """Extending a sliced frame appends right after the slice's last row.

    The frame is first shortened by slicing off its final row and then extended
    with a one-row frame; the result must hold the two kept rows followed by
    the new one.
    """
    df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}])
    # Rebind the same name so that only the slice stays referenced, exactly as
    # in the original reproduction.
    df = df[:-1]
    tail = pl.DataFrame([{"age": 8}])

    # Pass the flag by keyword: a bare positional ``False`` is unreadable.
    assert df.extend(tail).to_dict(as_series=False) == {"age": [1, 2, 8]}


def test_extend_self() -> None:
    """A frame extended with itself ends up holding each of its rows twice."""
    frame = pl.DataFrame({"a": [1, 2], "b": [True, False]})
    doubled = pl.DataFrame({"a": [1, 2, 1, 2], "b": [True, False, True, False]})

    # The same object is both the target and the argument of ``extend``.
    frame.extend(frame)

    assert_frame_equal(frame, doubled)


def test_extend_column_number_mismatch() -> None:
    """Extending with a frame that has fewer columns raises ``ShapeError``."""
    two_columns = pl.DataFrame({"a": [1, 2], "b": [True, False]})
    # Dropping "a" leaves a single-column frame.
    one_column = two_columns.drop("a")

    with pytest.raises(pl.ShapeError):
        two_columns.extend(one_column)


def test_extend_column_name_mismatch() -> None:
    """Extending with a frame whose column names differ raises ``ShapeError``.

    Both frames have the same number of columns; only the name of the first
    column differs, so the failure cannot come from a width mismatch.
    """
    df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]})
    # ``rename`` keeps the width at two columns. Adding an aliased copy with
    # ``with_columns`` would instead produce a three-column frame and only
    # repeat the column-number check.
    df2 = df1.rename({"a": "c"})

    with pytest.raises(pl.ShapeError):
        df1.extend(df2)
14 changes: 14 additions & 0 deletions py-polars/tests/unit/dataframe/test_vstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,17 @@ def test_vstack_self_in_place(df1: pl.DataFrame) -> None:
{"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]}
)
assert_frame_equal(df1, expected)


def test_vstack_column_number_mismatch(df1: pl.DataFrame) -> None:
    """Stacking a frame that lacks one of the columns raises ``ShapeError``."""
    # Same data as ``df1`` but without the "ham" column.
    narrower = df1.drop("ham")

    with pytest.raises(pl.ShapeError):
        df1.vstack(narrower)


def test_vstack_column_name_mismatch(df1: pl.DataFrame) -> None:
    """Stacking a frame whose column names differ raises ``ShapeError``.

    The second frame has the same number of columns as ``df1``; only the name
    of the "foo" column differs, so the failure cannot come from a width
    mismatch.
    """
    # ``rename`` keeps the column count unchanged. Adding an aliased copy with
    # ``with_columns`` would instead add an extra column and only repeat the
    # column-number check.
    df2 = df1.rename({"foo": "oof"})

    with pytest.raises(pl.ShapeError):
        df1.vstack(df2)

0 comments on commit 2021249

Please sign in to comment.