From c7dcc2d9513e34a9fa958a2461e27e11af6d9120 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 6 Jun 2024 13:13:12 +0100 Subject: [PATCH 1/6] Update concatenation docs to include relaxed and changes to rechunk default behaviour --- .../transformations/concatenation.py | 24 +++++++++++++++++++ .../transformations/concatenation.rs | 18 ++++++++++++++ .../transformations/concatenation.md | 19 +++++++++++---- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/docs/src/python/user-guide/transformations/concatenation.py b/docs/src/python/user-guide/transformations/concatenation.py index 60a24aed1930..92df4f9ec555 100644 --- a/docs/src/python/user-guide/transformations/concatenation.py +++ b/docs/src/python/user-guide/transformations/concatenation.py @@ -26,6 +26,30 @@ print(df_vertical_concat) # --8<-- [end:vertical] +# --8<-- [start:vertical_relaxed] +df_v1 = pl.DataFrame( + { + "a": [1.0], + "b": [3], + }, +) +df_v2 = pl.DataFrame( + { + "a": [2], + "b": [4], + }, +) +df_vertical_relaxed_concat = pl.concat( + [ + df_v1, + df_v2, + ], + how="vertical_relaxed", +) +print(df_vertical_relaxed_concat) +# --8<-- [end:vertical_relaxed] + + # --8<-- [start:horizontal] df_h1 = pl.DataFrame( { diff --git a/docs/src/rust/user-guide/transformations/concatenation.rs b/docs/src/rust/user-guide/transformations/concatenation.rs index 240ff8e9f59b..0b12b6ebd9de 100644 --- a/docs/src/rust/user-guide/transformations/concatenation.rs +++ b/docs/src/rust/user-guide/transformations/concatenation.rs @@ -20,6 +20,24 @@ fn main() -> Result<(), Box> { println!("{}", &df_vertical_concat); // --8<-- [end:vertical] + // --8<-- [start:vertical_relaxed] + let df_v1 = df!( + "a"=> &[1.0], + "b"=> &[3], +)?; +let df_v2 = df!( + "a"=> &[2], + "b"=> &[4], +)?; +let df_vertical_relaxed_concat = concat( + [df_v1.clone().lazy(), df_v2.clone().lazy()], + UnionArgs::default(), +)? 
+.collect()?; +println!("{}", &df_vertical_relaxed_concat); +// --8<-- [end:vertical_relaxed] + + // --8<-- [start:horizontal] let df_h1 = df!( "l1"=> &[1, 2], diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md index 0708c84c3cd0..57d2a998fae0 100644 --- a/docs/user-guide/transformations/concatenation.md +++ b/docs/user-guide/transformations/concatenation.md @@ -17,8 +17,16 @@ In a vertical concatenation you combine all of the rows from a list of `DataFram --8<-- "python/user-guide/transformations/concatenation.py:vertical" ``` -Vertical concatenation fails when the dataframes do not have the same column names. +Vertical concatenation fails when the dataframes do not have the same column names and dtypes. +For certain differences in dtypes, Polars can do a relaxed vertical concatenation where the differences in dtype are resolved by casting all columns with the same name but different dtypes to a *supertype*. For example, if column `'a'` in the first `DataFrame` is `Float32` but column `'a'` in the second `DataFrame` is `Int64`, then both columns are cast to their supertype `Float64` before concatenation. If the set of dtypes for a column do not have a supertype, the concatenation fails. The supertype mappings are defined internally in Polars. + + +{{code_block('user-guide/transformations/concatenation','vertical_relaxed',['concat'])}} + +```python exec="on" result="text" session="user-guide/transformations/concatenation" +--8<-- "python/user-guide/transformations/concatenation.py:vertical_relaxed" +``` ## Horizontal concatenation - getting wider In a horizontal concatenation you combine all of the columns from a list of `DataFrames` into a single wider `DataFrame`. @@ -42,7 +50,7 @@ columns will be padded with `null` values at the end up to the maximum length. 
## Diagonal concatenation - getting longer, wider and `null`ier -In a diagonal concatenation you combine all of the row and columns from a list of `DataFrames` into a single longer and/or wider `DataFrame`. +In a diagonal concatenation you combine all of the rows and columns from a list of `DataFrames` into a single longer and/or wider `DataFrame`. {{code_block('user-guide/transformations/concatenation','cross',['concat'])}} @@ -50,11 +58,12 @@ In a diagonal concatenation you combine all of the row and columns from a list o --8<-- "python/user-guide/transformations/concatenation.py:cross" ``` -Diagonal concatenation generates nulls when the column names do not overlap. +Diagonal concatenation generates nulls when the column names do not overlap, but fails if the dtypes do not match for columns with the same name. As with vertical concatenation, there is an alternative `diagonal_relaxed` method that tries to cast columns to a supertype if columns with the same name have different dtypes. When the dataframe shapes do not match and we have an overlapping semantic key then [we can join the dataframes](joins.md) instead of concatenating them. ## Rechunking -Before a concatenation we have two dataframes `df1` and `df2`. Each column in `df1` and `df2` is in one or more chunks in memory. By default, during concatenation the chunks in each column are copied to a single new chunk - this is known as **rechunking**. Rechunking is an expensive operation, but is often worth it because future operations will be faster. -If you do not want Polars to rechunk the concatenated `DataFrame` you specify `rechunk = False` when doing the concatenation. +We have a `list` of `DataFrames` and we want to concatenate them. Each column in each `DataFrame` is stored in one or more chunks in memory. When we concatenate the `DataFrames`, the data from each column in each `DataFrame` can be copied to a single location in memory - this is known as **rechunking**. 
Rechunking is an expensive process as it requires copying data from one location to another. However, rechunking can make subsequent operations faster as the data is in a single location in memory. + +By default, when we do a concatenation in eager mode, rechunking does not happen. If we want Polars to rechunk the concatenated `DataFrame`, we specify `rechunk = True` when doing the concatenation. In lazy mode, the query optimizer assesses whether to do rechunking based on the query plan. From 7b56c41c9f47a48ca461441545ddef1cd47dc641 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 6 Jun 2024 13:15:01 +0100 Subject: [PATCH 2/6] Update concatenation docs to include relaxed and changes to rechunk default behaviour --- docs/user-guide/transformations/concatenation.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md index 57d2a998fae0..5d3b7ae696a5 100644 --- a/docs/user-guide/transformations/concatenation.md +++ b/docs/user-guide/transformations/concatenation.md @@ -21,7 +21,6 @@ Vertical concatenation fails when the dataframes do not have the same column nam For certain differences in dtypes, Polars can do a relaxed vertical concatenation where the differences in dtype are resolved by casting all columns with the same name but different dtypes to a *supertype*. For example, if column `'a'` in the first `DataFrame` is `Float32` but column `'a'` in the second `DataFrame` is `Int64`, then both columns are cast to their supertype `Float64` before concatenation. If the set of dtypes for a column do not have a supertype, the concatenation fails. The supertype mappings are defined internally in Polars. 
- {{code_block('user-guide/transformations/concatenation','vertical_relaxed',['concat'])}} ```python exec="on" result="text" session="user-guide/transformations/concatenation" From 305c9ae9fbd3468e360c2fe560ec933b78f43f8e Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 6 Jun 2024 13:31:54 +0100 Subject: [PATCH 3/6] Markdown formatting --- docs/user-guide/transformations/concatenation.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md index 5d3b7ae696a5..29faea42bf9a 100644 --- a/docs/user-guide/transformations/concatenation.md +++ b/docs/user-guide/transformations/concatenation.md @@ -19,13 +19,14 @@ In a vertical concatenation you combine all of the rows from a list of `DataFram Vertical concatenation fails when the dataframes do not have the same column names and dtypes. -For certain differences in dtypes, Polars can do a relaxed vertical concatenation where the differences in dtype are resolved by casting all columns with the same name but different dtypes to a *supertype*. For example, if column `'a'` in the first `DataFrame` is `Float32` but column `'a'` in the second `DataFrame` is `Int64`, then both columns are cast to their supertype `Float64` before concatenation. If the set of dtypes for a column do not have a supertype, the concatenation fails. The supertype mappings are defined internally in Polars. +For certain differences in dtypes, Polars can do a relaxed vertical concatenation where the differences in dtype are resolved by casting all columns with the same name but different dtypes to a _supertype_. For example, if column `'a'` in the first `DataFrame` is `Float32` but column `'a'` in the second `DataFrame` is `Int64`, then both columns are cast to their supertype `Float64` before concatenation. If the set of dtypes for a column does not have a supertype, the concatenation fails. 
The supertype mappings are defined internally in Polars. {{code_block('user-guide/transformations/concatenation','vertical_relaxed',['concat'])}} ```python exec="on" result="text" session="user-guide/transformations/concatenation" --8<-- "python/user-guide/transformations/concatenation.py:vertical_relaxed" ``` + ## Horizontal concatenation - getting wider In a horizontal concatenation you combine all of the columns from a list of `DataFrames` into a single wider `DataFrame`. From aa3905470c1323902f54e3c865b4c79f9cc879d7 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 6 Jun 2024 13:34:23 +0100 Subject: [PATCH 4/6] Cargo fmt --- docs/development/contributing/index.md | 4 ++- .../transformations/concatenation.rs | 29 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/docs/development/contributing/index.md b/docs/development/contributing/index.md index 2aa65ed0fa5e..6296ed9f30b9 100644 --- a/docs/development/contributing/index.md +++ b/docs/development/contributing/index.md @@ -253,7 +253,9 @@ The snippet is delimited by `--8<-- [start:]` and `--8<-- [end: Result<(), Box> { // --8<-- [start:vertical_relaxed] let df_v1 = df!( - "a"=> &[1.0], - "b"=> &[3], -)?; -let df_v2 = df!( - "a"=> &[2], - "b"=> &[4], -)?; -let df_vertical_relaxed_concat = concat( - [df_v1.clone().lazy(), df_v2.clone().lazy()], - UnionArgs::default(), -)? -.collect()?; -println!("{}", &df_vertical_relaxed_concat); -// --8<-- [end:vertical_relaxed] - + "a"=> &[1.0], + "b"=> &[3], + )?; + let df_v2 = df!( + "a"=> &[2], + "b"=> &[4], + )?; + let df_vertical_relaxed_concat = concat( + [df_v1.clone().lazy(), df_v2.clone().lazy()], + UnionArgs::default(), + )? 
+ .collect()?; + println!("{}", &df_vertical_relaxed_concat); + // --8<-- [end:vertical_relaxed] // --8<-- [start:horizontal] let df_h1 = df!( From 237e428530fbb80eb1e58fa8003321d413a20d94 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 6 Jun 2024 13:46:32 +0100 Subject: [PATCH 5/6] Lint contributing doc --- docs/development/contributing/index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/development/contributing/index.md b/docs/development/contributing/index.md index 6296ed9f30b9..341109979fbb 100644 --- a/docs/development/contributing/index.md +++ b/docs/development/contributing/index.md @@ -253,7 +253,8 @@ The snippet is delimited by `--8<-- [start:]` and `--8<-- [end: Date: Thu, 6 Jun 2024 16:53:31 +0100 Subject: [PATCH 6/6] Update with align method --- .../user-guide/transformations/concatenation.py | 15 +++++++++++++++ .../user-guide/transformations/concatenation.rs | 5 +++++ docs/user-guide/transformations/concatenation.md | 7 +++++++ 3 files changed, 27 insertions(+) diff --git a/docs/src/python/user-guide/transformations/concatenation.py b/docs/src/python/user-guide/transformations/concatenation.py index 92df4f9ec555..b07311195aef 100644 --- a/docs/src/python/user-guide/transformations/concatenation.py +++ b/docs/src/python/user-guide/transformations/concatenation.py @@ -97,6 +97,21 @@ print(df_horizontal_concat) # --8<-- [end:horizontal_different_lengths] +# --8<-- [start:horizontal_align] +df_h1 = pl.DataFrame({"a": ["a", "b", "d", "e", "e"], "b": [1, 2, 4, 5, 6]}) +df_h2 = pl.DataFrame({"a": ["a", "b", "c", "d", "e"], "d": ["w", "x", "y", "z", None]}) +df_align = pl.concat( + [ + df_h1, + df_h2, + ], + how="align", +) +print(df_align) + +# --8<-- [end:horizontal_align] + + # --8<-- [start:cross] df_d1 = pl.DataFrame( { diff --git a/docs/src/rust/user-guide/transformations/concatenation.rs b/docs/src/rust/user-guide/transformations/concatenation.rs index 4675fbff2159..7197dc65f67d 100644 --- 
a/docs/src/rust/user-guide/transformations/concatenation.rs +++ b/docs/src/rust/user-guide/transformations/concatenation.rs @@ -64,6 +64,11 @@ fn main() -> Result<(), Box> { println!("{}", &df_horizontal_concat); // --8<-- [end:horizontal_different_lengths] + // --8<-- [start:horizontal_align] + println!("Not available in Rust"); + + // --8<-- [end:horizontal_align] + // --8<-- [start:cross] let df_d1 = df!( "a"=> &[1], diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md index 29faea42bf9a..9990b7bf03df 100644 --- a/docs/user-guide/transformations/concatenation.md +++ b/docs/user-guide/transformations/concatenation.md @@ -48,6 +48,13 @@ columns will be padded with `null` values at the end up to the maximum length. --8<-- "python/user-guide/transformations/concatenation.py:horizontal_different_lengths" ``` +An alternative horizontal concatenation method is `align` where Polars combines frames horizontally by determining the common key columns and aligning rows. +{{code_block('user-guide/transformations/concatenation','horizontal_align',['concat'])}} + +```python exec="on" result="text" session="user-guide/transformations/concatenation" +--8<-- "python/user-guide/transformations/concatenation.py:horizontal_align" +``` + ## Diagonal concatenation - getting longer, wider and `null`ier In a diagonal concatenation you combine all of the rows and columns from a list of `DataFrames` into a single longer and/or wider `DataFrame`.