fix(python): Handle DataFrame.extend extending by itself (#9897)

pola-rs · Jul 16, 2023 · 2021249 · 2021249
1 parent bb36e4c
commit 2021249
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 65 deletions.
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
@@ -5827,6 +5827,10 @@ def vstack(self, other: DataFrame, *, in_place: bool = False) -> Self:
         in_place
             Modify in place.
 
+        See Also
+        --------
+        extend
+
         Examples
         --------
         >>> df1 = pl.DataFrame(
@@ -5874,26 +5878,36 @@ def extend(self, other: DataFrame) -> Self:
         """
         Extend the memory backed by this `DataFrame` with the values from `other`.
 
-        Different from `vstack` which adds the chunks from `other` to the chunks of this
-        `DataFrame` `extend` appends the data from `other` to the underlying memory
-        locations and thus may cause a reallocation.
+        Different from ``vstack`` which adds the chunks from ``other`` to the chunks of
+        this ``DataFrame``, ``extend`` appends the data from ``other`` to the underlying
+        memory locations and thus may cause a reallocation.
 
         If this does not cause a reallocation, the resulting data structure will not
         have any extra chunks and thus will yield faster queries.
 
-        Prefer `extend` over `vstack` when you want to do a query after a single append.
-        For instance during online operations where you add `n` rows and rerun a query.
+        Prefer ``extend`` over ``vstack`` when you want to do a query after a single
+        append. For instance, during online operations where you add `n` rows and rerun
+        a query.
 
-        Prefer `vstack` over `extend` when you want to append many times before doing a
-        query. For instance when you read in multiple files and when to store them in a
-        single `DataFrame`. In the latter case, finish the sequence of `vstack`
-        operations with a `rechunk`.
+        Prefer ``vstack`` over ``extend`` when you want to append many times before
+        doing a query. For instance, when you read in multiple files and want to store
+        them in a single ``DataFrame``. In the latter case, finish the sequence of
+        ``vstack`` operations with a ``rechunk``.
 
         Parameters
         ----------
         other
             DataFrame to vertically add.
 
+        Warnings
+        --------
+        This method modifies the dataframe in-place. The dataframe is returned for
+        convenience only.
+
+        See Also
+        --------
+        vstack
+
         Examples
         --------
         >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
@@ -5914,7 +5928,13 @@ def extend(self, other: DataFrame) -> Self:
         └─────┴─────┘
 
         """
-        self._df.extend(other._df)
+        try:
+            self._df.extend(other._df)
+        except RuntimeError as exc:
+            if str(exc) == "Already mutably borrowed":  # self-extend case
+                self._df.extend(other._df.clone())
+            else:
+                raise
         return self
 
     def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame:

diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
@@ -704,61 +704,6 @@ def test_hstack_dataframe(in_place: bool) -> None:
         assert_frame_equal(df_out, expected)
 
 
-def test_extend() -> None:
-    with pl.StringCache():
-        df1 = pl.DataFrame(
-            {
-                "foo": [1, 2],
-                "bar": [True, False],
-                "ham": ["a", "b"],
-                "cat": ["A", "B"],
-                "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)],
-            }
-        ).with_columns(
-            [
-                pl.col("cat").cast(pl.Categorical),
-            ]
-        )
-        df2 = pl.DataFrame(
-            {
-                "foo": [3, 4],
-                "bar": [True, None],
-                "ham": ["c", "d"],
-                "cat": ["C", "B"],
-                "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)],
-            }
-        ).with_columns(
-            [
-                pl.col("cat").cast(pl.Categorical),
-            ]
-        )
-
-        df1.extend(df2)
-        expected = pl.DataFrame(
-            {
-                "foo": [1, 2, 3, 4],
-                "bar": [True, False, True, None],
-                "ham": ["a", "b", "c", "d"],
-                "cat": ["A", "B", "C", "B"],
-                "dates": [
-                    datetime(2021, 1, 1),
-                    datetime(2021, 2, 1),
-                    datetime(2022, 9, 1),
-                    datetime(2021, 2, 1),
-                ],
-            }
-        ).with_columns(
-            pl.col("cat").cast(pl.Categorical),
-        )
-        assert_frame_equal(df1, expected)
-
-        # 8745
-        df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}])
-        df = df[:-1]
-        tail = pl.DataFrame([{"age": 8}])
-        assert df.extend(tail).to_dict(False) == {"age": [1, 2, 8]}
-
-
 def test_file_buffer() -> None:
     f = BytesIO()
     f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")

diff --git a/py-polars/tests/unit/dataframe/test_extend.py b/py-polars/tests/unit/dataframe/test_extend.py
@@ -0,0 +1,81 @@
+from datetime import datetime
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+
+def test_extend_various_dtypes() -> None:
+    """Extending works for int, bool, string, categorical and datetime columns."""
+    overrides = {"cat": pl.Categorical}
+    with pl.StringCache():
+        target = pl.DataFrame(
+            {
+                "foo": [1, 2],
+                "bar": [True, False],
+                "ham": ["a", "b"],
+                "cat": ["A", "B"],
+                "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)],
+            },
+            schema_overrides=overrides,
+        )
+        addition = pl.DataFrame(
+            {
+                "foo": [3, 4],
+                "bar": [True, None],
+                "ham": ["c", "d"],
+                "cat": ["C", "B"],
+                "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)],
+            },
+            schema_overrides=overrides,
+        )
+        target.extend(addition)
+        wanted = pl.DataFrame(
+            {
+                "foo": [1, 2, 3, 4],
+                "bar": [True, False, True, None],
+                "ham": ["a", "b", "c", "d"],
+                "cat": ["A", "B", "C", "B"],
+                "dates": [
+                    datetime(2021, 1, 1),
+                    datetime(2021, 2, 1),
+                    datetime(2022, 9, 1),
+                    datetime(2021, 2, 1),
+                ],
+            },
+            schema_overrides=overrides,
+        )
+        assert_frame_equal(target, wanted)
+
+
+def test_extend_slice_offset_8745() -> None:
+    df = pl.DataFrame([{"age": 1}, {"age": 2}, {"age": 3}])
+    df = df[:-1]  # extend a sliced frame (case referenced as 8745)
+    tail = pl.DataFrame([{"age": 8}])
+    assert df.extend(tail).to_dict(as_series=False) == {"age": [1, 2, 8]}
+
+
+def test_extend_self() -> None:
+    """Extending a frame with itself appends a copy of its own rows."""
+    frame = pl.DataFrame({"a": [1, 2], "b": [True, False]})
+    frame.extend(frame)
+
+    doubled = pl.DataFrame({"a": [1, 2, 1, 2], "b": [True, False, True, False]})
+    assert_frame_equal(frame, doubled)
+
+
+def test_extend_column_number_mismatch() -> None:
+    """Extending with a frame that has fewer columns raises ShapeError."""
+    wide = pl.DataFrame({"a": [1, 2], "b": [True, False]})
+    narrow = wide.drop("a")
+    with pytest.raises(pl.ShapeError):
+        wide.extend(narrow)
+
+
+def test_extend_column_name_mismatch() -> None:
+    """Extending with equally wide but differently named columns raises ShapeError."""
+    df1 = pl.DataFrame({"a": [1, 2], "b": [True, False]})
+    df2 = df1.rename({"a": "c"})  # same width, so only the names differ
+    with pytest.raises(pl.ShapeError):
+        df1.extend(df2)
diff --git a/py-polars/tests/unit/dataframe/test_vstack.py b/py-polars/tests/unit/dataframe/test_vstack.py
@@ -44,3 +44,17 @@ def test_vstack_self_in_place(df1: pl.DataFrame) -> None:
         {"foo": [1, 2, 1, 2], "bar": [6, 7, 6, 7], "ham": ["a", "b", "a", "b"]}
     )
     assert_frame_equal(df1, expected)
+
+
+def test_vstack_column_number_mismatch(df1: pl.DataFrame) -> None:
+    """Stacking a frame that lacks one of the columns raises ShapeError."""
+    narrower = df1.drop("ham")
+    with pytest.raises(pl.ShapeError):
+        df1.vstack(narrower)
+
+
+def test_vstack_column_name_mismatch(df1: pl.DataFrame) -> None:
+    """Stacking an equally wide frame with other column names raises ShapeError."""
+    df2 = df1.rename({"foo": "oof"})  # same width, so only the names differ
+    with pytest.raises(pl.ShapeError):
+        df1.vstack(df2)