From 20212499e1e39649b96ede8cd28edc8f00ac669d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 16 Jul 2023 14:49:45 +0200 Subject: [PATCH] fix(python): Handle `DataFrame.extend` extending by itself (#9897) --- py-polars/polars/dataframe/frame.py | 40 ++++++--- py-polars/tests/unit/dataframe/test_df.py | 55 ------------- py-polars/tests/unit/dataframe/test_extend.py | 81 +++++++++++++++++++ py-polars/tests/unit/dataframe/test_vstack.py | 14 ++++ 4 files changed, 125 insertions(+), 65 deletions(-) create mode 100644 py-polars/tests/unit/dataframe/test_extend.py diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 6ce26c79211f..f86fbb4c45b1 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5827,6 +5827,10 @@ def vstack(self, other: DataFrame, *, in_place: bool = False) -> Self: in_place Modify in place. + See Also + -------- + extend + Examples -------- >>> df1 = pl.DataFrame( @@ -5874,26 +5878,36 @@ def extend(self, other: DataFrame) -> Self: """ Extend the memory backed by this `DataFrame` with the values from `other`. - Different from `vstack` which adds the chunks from `other` to the chunks of this - `DataFrame` `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation. + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. If this does not cause a reallocation, the resulting data structure will not have any extra chunks and thus will yield faster queries. - Prefer `extend` over `vstack` when you want to do a query after a single append. - For instance during online operations where you add `n` rows and rerun a query. + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. - Prefer `vstack` over `extend` when you want to append many times before doing a - query. For instance when you read in multiple files and when to store them in a - single `DataFrame`. In the latter case, finish the sequence of `vstack` - operations with a `rechunk`. + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. Parameters ---------- other DataFrame to vertically add. + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + Examples -------- >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) @@ -5914,7 +5928,13 @@ def extend(self, other: DataFrame) -> Self: └─────┴─────┘ """ - self._df.extend(other._df) + try: + self._df.extend(other._df) + except RuntimeError as exc: + if str(exc) == "Already mutably borrowed": + self._df.extend(other._df.clone()) + else: + raise exc return self def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index a2056ee74bef..6739a9a6d72f 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -704,61 +704,6 @@ def test_hstack_dataframe(in_place: bool) -> None: assert_frame_equal(df_out, expected) -def test_extend() -> None: - with pl.StringCache(): - df1 = pl.DataFrame( - { - "foo": [1, 2], - "bar": [True, False], - "ham": ["a", "b"], - "cat": ["A", "B"], - "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)], - } - ).with_columns( - [ - pl.col("cat").cast(pl.Categorical), - ] - ) - df2 = pl.DataFrame( - { - "foo": [3, 4], - "bar": [True, None], - "ham": ["c", "d"], - "cat": ["C", "B"], - "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)], - } - ).with_columns( - [ - pl.col("cat").cast(pl.Categorical), - ] - ) - - df1.extend(df2) - expected = pl.DataFrame( - { - "foo": [1, 2, 3, 4], - "bar": [True, False, True, None], - "ham": ["a", "b", "c", "d"], - "cat": ["A", "B", "C", "B"], - "dates": [ - datetime(2021, 1, 1), - datetime(2021, 2, 1), - datetime(2022, 9, 1), - datetime(2021, 2, 1), - ], - } - ).with_columns( - pl.col("cat").cast(pl.Categorical), - ) - assert_frame_equal(df1, expected) - - # 8745 - df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}]) - df = df[:-1] - tail = pl.DataFrame([{"age": 8}]) - assert df.extend(tail).to_dict(False) == {"age": [1, 2, 8]} - - def test_file_buffer() -> None: f = BytesIO() f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12") diff --git a/py-polars/tests/unit/dataframe/test_extend.py b/py-polars/tests/unit/dataframe/test_extend.py new file mode 100644 index 000000000000..08359cc85c19 --- /dev/null +++ b/py-polars/tests/unit/dataframe/test_extend.py @@ -0,0 +1,81 @@ +from datetime import datetime + +import pytest + +import polars as pl +from polars.testing import assert_frame_equal + + +def test_extend_various_dtypes() -> None: + with pl.StringCache(): + df1 = pl.DataFrame( + { + "foo": [1, 2], + "bar": [True, False], + "ham": ["a", "b"], + "cat": ["A", "B"], + "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)], + }, + schema_overrides={"cat": pl.Categorical}, + ) + df2 = pl.DataFrame( + { + "foo": [3, 4], + "bar": [True, None], + "ham": ["c", "d"], + "cat": ["C", "B"], + "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)], + }, + schema_overrides={"cat": pl.Categorical}, + ) + + df1.extend(df2) + + expected = pl.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": [True, False, True, None], + "ham": ["a", "b", "c", "d"], + "cat": ["A", "B", "C", "B"], + "dates": [ + datetime(2021, 1, 1), + datetime(2021, 2, 1), + datetime(2022, 9, 1), + datetime(2021, 2, 1), + ], + }, + schema_overrides={"cat": pl.Categorical}, + ) + assert_frame_equal(df1, expected) + + +def test_extend_slice_offset_8745() -> None: + df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}]) + df = df[:-1] + tail = pl.DataFrame([{"age": 8}]) + assert df.extend(tail).to_dict(False) == {"age": [1, 2, 8]} + + +def test_extend_self() -> None: + df = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + + df.extend(df) + + expected = pl.DataFrame({"a": [1, 2, 1, 2], "b": [True, False, True, False]}) + assert_frame_equal(df, expected) + + +def test_extend_column_number_mismatch() -> None: + df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + df2 = df1.drop("a") + + with pytest.raises(pl.ShapeError): + df1.extend(df2) + + +def test_extend_column_name_mismatch() -> None: + df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]}) + df2 = df1.with_columns(pl.col("a").alias("c")) + + with pytest.raises(pl.ShapeError): + df1.extend(df2) diff --git a/py-polars/tests/unit/dataframe/test_vstack.py b/py-polars/tests/unit/dataframe/test_vstack.py index ecf88a2f987f..504ae9a24b97 100644 --- a/py-polars/tests/unit/dataframe/test_vstack.py +++ b/py-polars/tests/unit/dataframe/test_vstack.py @@ -44,3 +44,17 @@ def test_vstack_self_in_place(df1: pl.DataFrame) -> None: {"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]} ) assert_frame_equal(df1, expected) + + +def test_vstack_column_number_mismatch(df1: pl.DataFrame) -> None: + df2 = df1.drop("ham") + + with pytest.raises(pl.ShapeError): + df1.vstack(df2) + + +def test_vstack_column_name_mismatch(df1: pl.DataFrame) -> None: + df2 = df1.with_columns(pl.col("foo").alias("oof")) + + with pytest.raises(pl.ShapeError): + df1.vstack(df2)