From 77a8dff6eab35d288abe646f3ef77498700ddf04 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 30 Jul 2024 17:20:13 +0400 Subject: [PATCH] feat: Add `name.replace` expression to support additional column rename options --- crates/polars-plan/src/dsl/name.rs | 13 ++++++ py-polars/polars/expr/name.py | 46 +++++++++++++++++++ py-polars/polars/expr/string.py | 4 +- py-polars/polars/series/string.py | 4 +- py-polars/src/expr/name.rs | 8 ++++ .../unit/operations/namespaces/test_name.py | 38 +++++++++++++++ 6 files changed, 109 insertions(+), 4 deletions(-) diff --git a/crates/polars-plan/src/dsl/name.rs b/crates/polars-plan/src/dsl/name.rs index ab7231b2e151..c7fc9fa1050d 100644 --- a/crates/polars-plan/src/dsl/name.rs +++ b/crates/polars-plan/src/dsl/name.rs @@ -1,3 +1,4 @@ +use regex::Regex; #[cfg(feature = "dtype-struct")] use smartstring::alias::String as SmartString; @@ -48,6 +49,18 @@ impl ExprNameNameSpace { self.map(move |name| Ok(format!("{name}{suffix}"))) } + /// Replace matching string pattern in the root column name with a new value. + pub fn replace(self, pattern: &str, value: &str, literal: bool) -> Expr { + let value = value.to_string(); + let pattern = pattern.to_string(); + if literal { + self.map(move |name| Ok(name.replace(&pattern, &value))) + } else { + let rx = Regex::new(&pattern); + self.map(move |name| Ok(rx.clone()?.replace_all(name, &value).to_string())) + } + } + /// Update the root column name to use lowercase characters. 
#[allow(clippy::wrong_self_convention)] pub fn to_lowercase(self) -> Expr { diff --git a/py-polars/polars/expr/name.py b/py-polars/polars/expr/name.py index 9c730d2d3206..2f90206184f8 100644 --- a/py-polars/polars/expr/name.py +++ b/py-polars/polars/expr/name.py @@ -326,6 +326,52 @@ def prefix_fields(self, prefix: str) -> Expr: """ return self._from_pyexpr(self._pyexpr.name_prefix_fields(prefix)) + def replace(self, pattern: str, value: str, *, literal: bool = False) -> Expr: + """ + Replace matching regex/literal substring in the name with a new value. + + Parameters + ---------- + pattern + A valid regular expression pattern, compatible with the `regex crate + <https://docs.rs/regex/latest/regex/>`_. + value + String that will replace the matched substring. + literal + Treat `pattern` as a literal string, not a regex. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + Consider using `.name.map` for advanced renaming. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "n_foo": [1, 2, 3], + ... "n_bar": ["x", "y", "z"], + ... } + ... ) + >>> df.select(pl.all().name.replace("^n_", "col_")) + shape: (3, 2) + ┌─────────┬─────────┐ + │ col_foo ┆ col_bar │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════════╪═════════╡ + │ 1 ┆ x │ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────────┴─────────┘ + >>> df.select(pl.all().name.replace("(a|e|i|o|u)", "@")).schema + Schema([('n_f@@', Int64), ('n_b@r', String)]) + """ + return self._from_pyexpr(self._pyexpr.name_replace(pattern, value, literal)) + def suffix_fields(self, suffix: str) -> Expr: """ Add a suffix to all fields name of a struct. 
diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 109aaa096893..f8ea6cd22858 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1881,7 +1881,7 @@ def replace( value String that will replace the matched substring. literal - Treat `pattern` as a literal string. + Treat `pattern` as a literal string, not a regex. n Number of matches to replace. @@ -1974,7 +1974,7 @@ def replace_all( value String that will replace the matched substring. literal - Treat `pattern` as a literal string. + Treat `pattern` as a literal string, not a regex. See Also -------- diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 1591a3c516de..ca5c1e4aa9ff 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1095,7 +1095,7 @@ def replace( value String that will replace the matched substring. literal - Treat `pattern` as a literal string. + Treat `pattern` as a literal string, not a regex. n Number of matches to replace. @@ -1169,7 +1169,7 @@ def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Ser value String that will replace the matched substring. literal - Treat `pattern` as a literal string. + Treat `pattern` as a literal string, not a regex. 
See Also -------- diff --git a/py-polars/src/expr/name.rs b/py-polars/src/expr/name.rs index 6bbda4a6668a..313aa76dbd1e 100644 --- a/py-polars/src/expr/name.rs +++ b/py-polars/src/expr/name.rs @@ -44,6 +44,14 @@ impl PyExpr { self.inner.clone().name().to_uppercase().into() } + fn name_replace(&self, pattern: &str, value: &str, literal: bool) -> Self { + self.inner + .clone() + .name() + .replace(pattern, value, literal) + .into() + } + fn name_map_fields(&self, name_mapper: PyObject) -> Self { let name_mapper = Arc::new(move |name: &str| { Python::with_gil(|py| { diff --git a/py-polars/tests/unit/operations/namespaces/test_name.py b/py-polars/tests/unit/operations/namespaces/test_name.py index eac08e537a88..f73f13afffe1 100644 --- a/py-polars/tests/unit/operations/namespaces/test_name.py +++ b/py-polars/tests/unit/operations/namespaces/test_name.py @@ -2,7 +2,10 @@ from collections import OrderedDict +import pytest + import polars as pl +from polars.exceptions import ComputeError def test_name_change_case() -> None: @@ -43,6 +46,41 @@ def test_name_prefix_suffix() -> None: ) +def test_name_replace() -> None: + df = pl.DataFrame( + schema={"n_foo": pl.Int32, "n_bar": pl.String, "misc?": pl.Float64}, + ) + + assert df.select( + pl.all().name.replace("^n_", "col_"), + ).schema == { + "col_foo": pl.Int32, + "col_bar": pl.String, + "misc?": pl.Float64, + } + + assert df.select( + pl.all().name.replace("(a|e|i|o|u)", "#"), + ).schema == { + "n_f##": pl.Int32, + "n_b#r": pl.String, + "m#sc?": pl.Float64, + } + + with pytest.raises(ComputeError, match="repetition operator missing expression"): + df.select( + pl.all().name.replace("?", "!!"), + ) + + assert df.select( + pl.all().name.replace("?", "!!", literal=True), + ).schema == { + "n_foo": pl.Int32, + "n_bar": pl.String, + "misc!!": pl.Float64, + } + + def test_name_update_all() -> None: df = pl.DataFrame( schema={