From 862c7dd00cdd688cdc359d986fa55e07d20ce39c Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 19 Apr 2024 18:32:56 +1000 Subject: [PATCH 01/46] add make_clean_names function that can be applied to polars --- environment-dev.yml | 1 + janitor/functions/__init__.py | 2 + janitor/functions/clean_names.py | 125 ++-------- janitor/functions/polars/__init__.py | 0 janitor/functions/utils.py | 273 +++++++++++++++++++++ tests/functions/polars/test_clean_names.py | 123 ++++++++++ 6 files changed, 414 insertions(+), 110 deletions(-) create mode 100644 janitor/functions/polars/__init__.py create mode 100644 tests/functions/polars/test_clean_names.py diff --git a/environment-dev.yml b/environment-dev.yml index 1f8e48ece..322deec86 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -34,6 +34,7 @@ dependencies: - pipreqs - pip-tools - pre-commit + - pypolars - pyspark>=3.2.0 - pytest - pytest-cov diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 35681b9d9..ef1a69458 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -81,6 +81,7 @@ col, get_columns, get_index_labels, + make_clean_names, patterns, unionize_dataframe_categories, ) @@ -129,6 +130,7 @@ "join_apply", "label_encode", "limit_column_characters", + "make_clean_names", "min_max_scale", "move", "pivot_longer", diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index 71735a7fc..db439d30f 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -1,14 +1,15 @@ -"""Functions for cleaning columns names.""" +"""Functions for cleaning columns/index names and/or column values.""" -import unicodedata -from typing import Hashable, Optional, Union +from typing import Optional, Union import pandas as pd import pandas_flavor as pf from pandas.api.types import is_scalar -from janitor.errors import JanitorError -from janitor.functions.utils import _is_str_or_cat, get_index_labels +from janitor.functions.utils import ( + get_index_labels, + make_clean_names, +) from janitor.utils import deprecated_alias @@ -116,14 +117,15 @@ def clean_names( column_names = [column_names] df = df.copy() for column_name in column_names: - df[column_name] = _clean_names_single_object( - obj=df[column_name], + df[column_name] = make_clean_names( + col=df[column_name], enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, + df_type="pandas", ) return df @@ -136,128 +138,31 @@ def clean_names( for number in range(target_axis.nlevels) ] target_axis = [ - _clean_names_single_object( - obj=obj, + make_clean_names( + col=obj, enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, + df_type="pandas", ) for obj in target_axis ] else: - target_axis = _clean_names_single_object( - obj=target_axis, + target_axis = make_clean_names( + col=target_axis, enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, + df_type="pandas", ) # Store the original column names, if enabled by user if preserve_original_labels: df.__dict__["original_labels"] = getattr(df, axis) setattr(df, axis, target_axis) return df - - -def _clean_names_single_object( - obj: Union[pd.Index, pd.Series], - 
enforce_string, - case_type, - remove_special, - strip_accents, - strip_underscores, - truncate_limit, -): - """ - Apply _clean_names on a single pandas object. - """ - if enforce_string and not (_is_str_or_cat(obj)): - obj = obj.astype(str) - obj = _change_case(obj, case_type) - obj = _normalize_1(obj) - if remove_special: - obj = obj.map(_remove_special) - if strip_accents: - obj = obj.map(_strip_accents) - obj = obj.str.replace(pat="_+", repl="_", regex=True) - obj = _strip_underscores_func(obj, strip_underscores=strip_underscores) - if truncate_limit: - obj = obj.str[:truncate_limit] - return obj - - -def _change_case(col: Union[pd.Index, pd.Series], case_type: str) -> str: - """Change case of labels in pandas object.""" - case_types = {"preserve", "upper", "lower", "snake"} - case_type = case_type.lower() - if case_type not in case_types: - raise JanitorError(f"case_type must be one of: {case_types}") - if case_type == "preserve": - return col - if case_type == "upper": - return col.str.upper() - if case_type == "lower": - return col.str.lower() - # Implementation taken from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - return ( - col.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) - .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) - .str.lower() - ) - - -def _remove_special(label: Hashable) -> str: - """Remove special characters from label.""" - return "".join( - [item for item in str(label) if item.isalnum() or "_" in item] - ) - - -def _normalize_1(col: Union[pd.Index, pd.Series]) -> str: - """Perform normalization of labels in pandas object.""" - FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] - for search, replace in FIXES: - col = col.str.replace(pat=search, repl=replace, regex=True) - return col - - -def _strip_accents(label: Hashable) -> str: - """Remove accents from a label. - - Inspired from [StackOverflow][so]. 
- - [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin - """ # noqa: E501 - - return "".join( - [ - letter - for letter in unicodedata.normalize("NFD", str(label)) - if not unicodedata.combining(letter) - ] - ) - - -def _strip_underscores_func( - col: Union[pd.Index, pd.Series], strip_underscores: Union[str, bool] = None -) -> pd.DataFrame: - """Strip underscores from a pandas object.""" - underscore_options = {None, "left", "right", "both", "l", "r", True} - if strip_underscores not in underscore_options: - raise JanitorError( - f"strip_underscores must be one of: {underscore_options}" - ) - - if strip_underscores in ["left", "l"]: - return col.str.lstrip("_") - if strip_underscores in ["right", "r"]: - return col.str.rstrip("_") - if strip_underscores in {True, "both"}: - return col.str.strip("_") - return col diff --git a/janitor/functions/polars/__init__.py b/janitor/functions/polars/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 8aa4d346b..01e192853 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -5,6 +5,7 @@ import fnmatch import inspect import re +import unicodedata import warnings from collections.abc import Callable as dispatch_callable from dataclasses import dataclass @@ -24,6 +25,7 @@ import numpy as np import pandas as pd +import polars as pl from multipledispatch import dispatch from pandas.api.types import ( is_bool_dtype, @@ -36,6 +38,7 @@ from pandas.core.common import is_bool_indexer from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy +from janitor.errors import JanitorError from janitor.utils import _expand_grid, check, check_column, find_stack_level warnings.simplefilter("always", DeprecationWarning) @@ -1133,3 +1136,273 @@ def __eq__(self, other): """ self.join_args = (self.cols, other.cols, "==") return self + + +def _change_case( + col: Union[pd.Index, pd.Series, pl.Expr, list, str], + case_type: str, + df_type: str, +) -> str: + """Change case of labels in col.""" + case_types = {"preserve", "upper", "lower", "snake"} + case_type = case_type.lower() + if case_type not in case_types: + raise JanitorError(f"df_type must be one of: {case_types}") + + if df_type == "pandas": + if case_type == "preserve": + return col + if case_type == "upper": + return col.str.upper() + if case_type == "lower": + return col.str.lower() + # Implementation taken from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + return ( + col.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) + .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) + .str.lower() + ) + if df_type == "polars": + if case_type == "preserve": + return col + if case_type == "upper": + return col.str.to_uppercase() + if case_type == "lower": + return col.str.to_lowercase() + # Implementation taken from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + return ( + col.str.replace_all( + pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False + ) + .str.replace_all( + pattern=r"([a-z0-9])([A-Z])", value=r"${1}_${2}", literal=False + ) + .str.to_lowercase() + ) + if df_type == "str": + if case_type == "preserve": + return col + if case_type == "upper": + return col.upper() + if case_type == "lower": + return col.lower() + # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + col = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=col) 
+ col = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=col) + return col.lower() + + if case_type == "preserve": + return col + if case_type == "upper": + return [label.upper() for label in col] + if case_type == "lower": + return [label.lower() for label in col] + # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + col = [ + re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=label) + for label in col + ] + col = [ + re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=label) + for label in col + ] + col = [label.lower() for label in col] + return col + + +def _normalize_1( + col: Union[pd.Index, pd.Series, pl.Expr, list, str], df_type: str +) -> str: + """Perform normalization of labels in col.""" + FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] + if df_type == "pandas": + for search, replace in FIXES: + col = col.str.replace(pat=search, repl=replace, regex=True) + elif df_type == "polars": + for search, replace in FIXES: + col = col.str.replace_all( + pattern=search, value=replace, literal=False + ) + elif df_type == "str": + for search, replace in FIXES: + col = re.sub(pattern=search, repl=replace, string=col) + else: + for search, replace in FIXES: + col = [ + re.sub(pattern=search, repl=replace, string=label) + for label in col + ] + return col + + +def _remove_special( + df_type: str, + col: Union[pd.Index, pd.Series, pl.Expr, list, str] = None, +) -> str: + """Remove special characters from col.""" + if df_type == "pandas": + return col.str.replace( + pat="[^A-Za-z_\\d]", repl="", regex=True + ).str.strip() + if df_type == "polars": + return col.str.replace_all( + pattern="[^A-Za-z_\\d]", value="", literal=False + ).str.strip_chars() + elif df_type == "str": + col = [item for item in col if item.isalnum() or (item == "_")] + return "".join(col) + out = [] + for label in col: + word = [item for item in label if item.isalnum() or (item == "_")] + word = "".join(word) + out.append(word) + return out + + +def _strip_accents( + col: Union[pd.Index, pd.Series, pl.Expr, list, str], + df_type: str, +) -> str: + """Remove accents from a label. + + Inspired from [StackOverflow][so]. 
+ + [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin + """ # noqa: E501 + if df_type == "pandas": + return col.map( + lambda f: "".join( + [ + letter + for letter in unicodedata.normalize("NFD", str(f)) + if not unicodedata.combining(letter) + ] + ) + ) + if df_type == "polars": + return col.map_elements( + lambda word: [ + letter + for letter in unicodedata.normalize("NFD", word) + if not unicodedata.combining(letter) + ], + return_dtype=pl.List(pl.Utf8), + ).list.join("") + if df_type == "str": + col = [ + letter + for letter in unicodedata.normalize("NFD", col) + if not unicodedata.combining(letter) + ] + return "".join(col) + out = [] + for label in col: + word = [ + letter + for letter in unicodedata.normalize("NFD", label) + if not unicodedata.combining(letter) + ] + word = "".join(word) + out.append(word) + return out + + +def _strip_underscores_func( + col: Union[pd.Index, pd.Series, pl.Expr, list, str], + df_type: str, + strip_underscores: Union[str, bool] = None, +) -> pd.DataFrame: + """Strip underscores.""" + underscore_options = {None, "left", "right", "both", "l", "r", True} + if strip_underscores not in underscore_options: + raise JanitorError( + f"strip_underscores must be one of: {underscore_options}" + ) + if df_type == "pandas": + if strip_underscores in {"left", "l"}: + return col.str.lstrip("_") + if strip_underscores in {"right", "r"}: + return col.str.rstrip("_") + if strip_underscores in {True, "both"}: + return col.str.strip("_") + return col + + if df_type == "polars": + if strip_underscores in {"left", "l"}: + return col.str.strip_chars_start("_") + if strip_underscores in {"right", "r"}: + return col.str.strip_chars_end("_") + if strip_underscores in {True, "both"}: + return col.str.strip_chars("_") + return col + + if df_type == "str": + if strip_underscores in {"left", "l"}: + return col.lstrip("_") + if strip_underscores in {"right", "r"}: + return col.rstrip("_") + if strip_underscores in {True, "both"}: + return col.strip("_") + return col + + if strip_underscores in {"left", "l"}: + return [label.lstrip("_") for label in col] + if strip_underscores in {"right", "r"}: + return [label.rstrip("_") for label in col] + if strip_underscores in {True, "both"}: + return [label.strip("_") for label in col] + return col + + +def make_clean_names( + col: Union[pd.Index, pd.Series, pl.Expr, list, str], + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, + df_type: str = "pandas", +) -> Union[pd.Index, pd.Series, pl.Expr, list]: + """ + Generic function to clean an object. 
+ """ + if enforce_string and (df_type == "pandas"): + if not (_is_str_or_cat(col)): + col = col.astype(str) + elif enforce_string and (df_type == "python"): + col = [str(label) for label in col] + elif enforce_string and (df_type == "str"): + col = str(col) + elif enforce_string and (df_type == "polars"): + col = col.cast(pl.Utf8) + col = _change_case(col, case_type, df_type=df_type) + col = _normalize_1(col, df_type=df_type) + if remove_special: + col = _remove_special(df_type=df_type, col=col) + if strip_accents: + col = _strip_accents(col=col, df_type=df_type) + if df_type == "pandas": + col = col.str.replace(pat="_+", repl="_", regex=True) + elif df_type == "polars": + col = col.str.replace(pattern="_+", value="_", literal=False) + elif df_type == "str": + col = re.sub(pattern="_+", repl="_", string=col) + else: + col = [re.sub(pattern="_+", repl="_", string=label) for label in col] + col = _strip_underscores_func( + col, strip_underscores=strip_underscores, df_type=df_type + ) + if truncate_limit and (df_type == "pandas"): + col = col.str[:truncate_limit] + elif truncate_limit and (df_type == "polars"): + col = col.str.slice(offset=0, length=truncate_limit) + elif truncate_limit and (df_type == "str"): + col = col[:truncate_limit] + elif truncate_limit: + col = [label[:truncate_limit] for label in col] + return col diff --git a/tests/functions/polars/test_clean_names.py b/tests/functions/polars/test_clean_names.py new file mode 100644 index 000000000..51d6f1ff4 --- /dev/null +++ b/tests/functions/polars/test_clean_names.py @@ -0,0 +1,123 @@ +import polars as pl +import pytest + +from janitor import make_clean_names + + +@pytest.mark.functions +def test_clean_names_method_chain(dataframe): + """Tests clean_names default args in a method chain.""" + df = pl.from_pandas(dataframe) + df = df.rename(lambda col: make_clean_names(col, df_type="str")) + expected_columns = [ + "a", + "bell_chart", + "decorated_elephant", + "animals@#$%^", + "cities", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_special_characters(dataframe): + """Tests clean_names `remove_special` parameter.""" + df = pl.from_pandas(dataframe) + df = df.rename( + lambda col: make_clean_names(col, df_type="str", remove_special=True) + ) + expected_columns = [ + "a", + "bell_chart", + "decorated_elephant", + "animals", + "cities", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_uppercase(dataframe): + """Tests clean_names `case_type` parameter = upper.""" + df = pl.from_pandas(dataframe) + df = df.rename( + lambda col: make_clean_names( + col, df_type="str", remove_special=True, case_type="upper" + ) + ) + expected_columns = [ + "A", + "BELL_CHART", + "DECORATED_ELEPHANT", + "ANIMALS", + "CITIES", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_strip_accents(): + """Tests clean_names `strip_accents` parameter.""" + df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) + df = df.rename( + lambda col: make_clean_names(col, df_type="str", strip_accents=True) + ) + expected_columns = ["joao", "лукася", "kafer"] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_camelcase_to_snake(dataframe): + """Tests clean_names `case_type` parameter = snake.""" + df = pl.from_pandas(dataframe) + df = ( + df.select("a") + .rename({"a": "AColumnName"}) + .rename( + lambda col: make_clean_names( + col, df_type="str", remove_special=True, 
case_type="snake" + ) + ) + ) + assert df.columns == ["a_column_name"] + + +@pytest.mark.functions +def test_clean_names_truncate_limit(dataframe): + """Tests clean_names `truncate_limit` parameter.""" + df = pl.from_pandas(dataframe) + df = df.rename( + lambda col: make_clean_names(col, df_type="str", truncate_limit=7) + ) + # df = dataframe.clean_names(truncate_limit=7) + expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_charac(): + """Ensure non standard characters and spaces have been cleaned up.""" + + df = pl.DataFrame( + { + r"Current accountbalance(in % of GDP)": range(5), + } + ) + df = df.rename( + lambda col: make_clean_names( + col, df_type="str", strip_underscores=True, case_type="lower" + ) + ) + + assert "current_accountbalance_in_%_of_gdp" in df.columns + + +def test_clean_column_values(): + """Clean column values""" + raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + outcome = raw.with_columns( + pl.col("raw").pipe( + make_clean_names, df_type="polars", strip_accents=True + ) + ) + assert list(outcome)[0][0] == "abcde_fgi_j" From 01531cc208486c7b92a851988a06676b602c822a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 19:45:26 +1000 Subject: [PATCH 02/46] add examples for make_clean_names --- examples/notebooks/bla.ipynb | 94 +++++ janitor/functions/clean_names.py | 29 +- janitor/functions/utils.py | 338 ++++++++++++------ janitor/spark/functions.py | 2 +- ...an_names.py => test_clean_names_polars.py} | 28 +- 5 files changed, 351 insertions(+), 140 deletions(-) create mode 100644 examples/notebooks/bla.ipynb rename tests/functions/{polars/test_clean_names.py => test_clean_names_polars.py} (78%) diff --git a/examples/notebooks/bla.ipynb b/examples/notebooks/bla.ipynb new file mode 100644 index 000000000..f47c4b335 --- /dev/null +++ b/examples/notebooks/bla.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "from janitor import make_clean_names" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 3)
AlohaBell ChartAnimals@#$%^
i64i64i64
000
111
222
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌───────┬────────────┬──────────────┐\n", + "│ Aloha ┆ Bell Chart ┆ Animals@#$%^ │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═══════╪════════════╪══════════════╡\n", + "│ 0 ┆ 0 ┆ 0 │\n", + "│ 1 ┆ 1 ┆ 1 │\n", + "│ 2 ┆ 2 ┆ 2 │\n", + "└───────┴────────────┴──────────────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pl.DataFrame(\n", + " {\n", + " \"Aloha\": range(3),\n", + " \"Bell Chart\": range(3),\n", + " \"Animals@#$%^\": range(3)\n", + " }\n", + ")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw.with_columns(\n", + " pl.col(\"raw\").pipe(\n", + " make_clean_names, object_type=\"polars\", strip_accents=True\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyjanitor-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index db439d30f..69af7f33e 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -78,8 +78,9 @@ def clean_names( Column selection is possible using the [`select`][janitor.functions.select.select] syntax. strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', + column names/values. Default None keeps outer underscores. + Values can be either 'left', 'right' or 'both' + or the respective shorthand 'l', 'r' and True. case_type: Whether to make columns lower or uppercase. Current case may be preserved with 'preserve', @@ -89,15 +90,17 @@ def clean_names( remove_special: Remove special characters from columns. Only letters, numbers and underscores are preserved. strip_accents: Whether or not to remove accents from - columns names. + columns names/values. preserve_original_labels: Preserve original names. This is later retrievable using `df.original_labels`. Applies if `axis` is not None. - enforce_string: Whether or not to convert all column names - to string type. Defaults to True, but can be turned off. + enforce_string: Whether or not to convert all + column names/values to string type. + Defaults to True, but can be turned off. Columns with >1 levels will not be converted by default. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. + truncate_limit: Truncates formatted column names/values + to the specified length. + Default None does not truncate. Raises: ValueError: If `axis=None` and `column_names=None`. 
@@ -118,14 +121,14 @@ def clean_names( df = df.copy() for column_name in column_names: df[column_name] = make_clean_names( - col=df[column_name], + obj=df[column_name], enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - df_type="pandas", + object_type="pandas", ) return df @@ -139,27 +142,27 @@ def clean_names( ] target_axis = [ make_clean_names( - col=obj, + obj=obj, enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - df_type="pandas", + object_type="pandas", ) for obj in target_axis ] else: target_axis = make_clean_names( - col=target_axis, + obj=target_axis, enforce_string=enforce_string, case_type=case_type, remove_special=remove_special, strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - df_type="pandas", + object_type="pandas", ) # Store the original column names, if enabled by user if preserve_original_labels: diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 01e192853..197908f92 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1139,41 +1139,41 @@ def __eq__(self, other): def _change_case( - col: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, pl.Expr, list, str], case_type: str, - df_type: str, + object_type: str, ) -> str: - """Change case of labels in col.""" + """Change case of labels in obj.""" case_types = {"preserve", "upper", "lower", "snake"} case_type = case_type.lower() if case_type not in case_types: - raise JanitorError(f"df_type must be one of: {case_types}") + raise JanitorError(f"type must be one of: {case_types}") - if df_type == "pandas": + if object_type == "pandas": if case_type == "preserve": - return col + return obj if case_type == "upper": - return col.str.upper() + return obj.str.upper() if case_type == "lower": - return col.str.lower() + return obj.str.lower() # Implementation taken from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor return ( - col.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) + obj.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) .str.lower() ) - if df_type == "polars": + if object_type == "polars": if case_type == "preserve": - return col + return obj if case_type == "upper": - return col.str.to_uppercase() + return obj.str.to_uppercase() if case_type == "lower": - return col.str.to_lowercase() + return obj.str.to_lowercase() # Implementation taken from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor return ( - col.str.replace_all( + obj.str.replace_all( pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False ) .str.replace_all( @@ -1181,82 +1181,82 @@ def _change_case( ) .str.to_lowercase() ) - if df_type == "str": + if object_type == "string": if case_type == "preserve": - return col + return obj if case_type == "upper": - return col.upper() + return obj.upper() if case_type == "lower": - return col.lower() + return obj.lower() # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor - col = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=col) - col = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=col) - return col.lower() + obj = re.sub(pattern=r"(.)([A-Z][a-z]+)", 
repl=r"\1_\2", string=obj) + obj = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=obj) + return obj.lower() if case_type == "preserve": - return col + return obj if case_type == "upper": - return [label.upper() for label in col] + return [label.upper() for label in obj] if case_type == "lower": - return [label.lower() for label in col] + return [label.lower() for label in obj] # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor - col = [ + obj = [ re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=label) - for label in col + for label in obj ] - col = [ + obj = [ re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=label) - for label in col + for label in obj ] - col = [label.lower() for label in col] - return col + obj = [label.lower() for label in obj] + return obj def _normalize_1( - col: Union[pd.Index, pd.Series, pl.Expr, list, str], df_type: str + obj: Union[pd.Index, pd.Series, pl.Expr, list, str], object_type: str ) -> str: - """Perform normalization of labels in col.""" + """Perform normalization of labels in obj.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] - if df_type == "pandas": + if object_type == "pandas": for search, replace in FIXES: - col = col.str.replace(pat=search, repl=replace, regex=True) - elif df_type == "polars": + obj = obj.str.replace(pat=search, repl=replace, regex=True) + elif object_type == "polars": for search, replace in FIXES: - col = col.str.replace_all( + obj = obj.str.replace_all( pattern=search, value=replace, literal=False ) - elif df_type == "str": + elif object_type == "string": for search, replace in FIXES: - col = re.sub(pattern=search, repl=replace, string=col) + obj = re.sub(pattern=search, repl=replace, string=obj) else: for search, replace in FIXES: - col = [ + obj = [ re.sub(pattern=search, repl=replace, string=label) - for label in col + for label in obj ] - return col + return obj def _remove_special( - df_type: str, - col: Union[pd.Index, pd.Series, pl.Expr, list, str] = None, + object_type: str, + obj: Union[pd.Index, pd.Series, pl.Expr, list, str] = None, ) -> str: - """Remove special characters from col.""" - if df_type == "pandas": - return col.str.replace( + """Remove special characters from obj.""" + if object_type == "pandas": + return obj.str.replace( pat="[^A-Za-z_\\d]", repl="", regex=True ).str.strip() - if df_type == "polars": - return col.str.replace_all( + if object_type == "polars": + return obj.str.replace_all( pattern="[^A-Za-z_\\d]", value="", literal=False ).str.strip_chars() - elif df_type == "str": - col = [item for item in col if item.isalnum() or (item == "_")] - return "".join(col) + elif object_type == "string": + obj = [item for item in obj if item.isalnum() or (item == "_")] + return "".join(obj) out = [] - for label in col: + for label in obj: word = [item for item in label if item.isalnum() or (item == "_")] word = "".join(word) out.append(word) @@ -1264,8 +1264,8 @@ def _remove_special( def _strip_accents( - col: Union[pd.Index, pd.Series, pl.Expr, list, str], - df_type: str, + obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + object_type: str, ) -> str: """Remove accents from a label. 
@@ -1273,8 +1273,8 @@ def _strip_accents( [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin """ # noqa: E501 - if df_type == "pandas": - return col.map( + if object_type == "pandas": + return obj.map( lambda f: "".join( [ letter @@ -1283,8 +1283,8 @@ def _strip_accents( ] ) ) - if df_type == "polars": - return col.map_elements( + if object_type == "polars": + return obj.map_elements( lambda word: [ letter for letter in unicodedata.normalize("NFD", word) @@ -1292,15 +1292,15 @@ def _strip_accents( ], return_dtype=pl.List(pl.Utf8), ).list.join("") - if df_type == "str": - col = [ + if object_type == "string": + obj = [ letter - for letter in unicodedata.normalize("NFD", col) + for letter in unicodedata.normalize("NFD", obj) if not unicodedata.combining(letter) ] - return "".join(col) + return "".join(obj) out = [] - for label in col: + for label in obj: word = [ letter for letter in unicodedata.normalize("NFD", label) @@ -1312,8 +1312,8 @@ def _strip_accents( def _strip_underscores_func( - col: Union[pd.Index, pd.Series, pl.Expr, list, str], - df_type: str, + obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + object_type: str, strip_underscores: Union[str, bool] = None, ) -> pd.DataFrame: """Strip underscores.""" @@ -1322,87 +1322,189 @@ def _strip_underscores_func( raise JanitorError( f"strip_underscores must be one of: {underscore_options}" ) - if df_type == "pandas": + if object_type == "pandas": if strip_underscores in {"left", "l"}: - return col.str.lstrip("_") + return obj.str.lstrip("_") if strip_underscores in {"right", "r"}: - return col.str.rstrip("_") + return obj.str.rstrip("_") if strip_underscores in {True, "both"}: - return col.str.strip("_") - return col + return obj.str.strip("_") + return obj - if df_type == "polars": + if object_type == "polars": if strip_underscores in {"left", "l"}: - return col.str.strip_chars_start("_") + return obj.str.strip_chars_start("_") if strip_underscores in {"right", "r"}: - return col.str.strip_chars_end("_") + return obj.str.strip_chars_end("_") if strip_underscores in {True, "both"}: - return col.str.strip_chars("_") - return col + return obj.str.strip_chars("_") + return obj - if df_type == "str": + if object_type == "string": if strip_underscores in {"left", "l"}: - return col.lstrip("_") + return obj.lstrip("_") if strip_underscores in {"right", "r"}: - return col.rstrip("_") + return obj.rstrip("_") if strip_underscores in {True, "both"}: - return col.strip("_") - return col + return obj.strip("_") + return obj if strip_underscores in {"left", "l"}: - return [label.lstrip("_") for label in col] + return [label.lstrip("_") for label in obj] if strip_underscores in {"right", "r"}: - return [label.rstrip("_") for label in col] + return [label.rstrip("_") for label in obj] if strip_underscores in {True, "both"}: - return [label.strip("_") for label in col] - return col + return [label.strip("_") for label in obj] + return obj def make_clean_names( - col: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, pl.Expr, list, str], strip_underscores: Optional[Union[str, bool]] = None, case_type: str = "lower", remove_special: bool = False, strip_accents: bool = False, enforce_string: bool = False, truncate_limit: int = None, - df_type: str = "pandas", + object_type: str = "pandas", ) -> Union[pd.Index, pd.Series, pl.Expr, list]: """ - Generic function to clean an object. 
- """ - if enforce_string and (df_type == "pandas"): - if not (_is_str_or_cat(col)): - col = col.astype(str) - elif enforce_string and (df_type == "python"): - col = [str(label) for label in col] - elif enforce_string and (df_type == "str"): - col = str(col) - elif enforce_string and (df_type == "polars"): - col = col.cast(pl.Utf8) - col = _change_case(col, case_type, df_type=df_type) - col = _normalize_1(col, df_type=df_type) + Generic function to clean labels in an object. + It can be applied to a pandas Index/Series, a Polars Expression, + or a python string/list. + For pandas, there is a [`clean_names`][janitor.functions.clean_names.clean_names] + method, which is a wrapper around the `make_clean_names` function. + For polars, use this function via existing Polars functions. The examples below + show how you can use this within polars. + + Examples: + >>> import polars as pl + >>> import janitor + >>> df = pl.DataFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... ) + >>> df + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + + Clean the column names, + via [rename](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.rename.html#polars-dataframe-rename): + >>> df.rename( + ... lambda objumn_name: make_clean_names( + ... obj=objumn_name, remove_special=True, object_type="string" + ... ) + ... ) + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + + >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + >>> df + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ Abçdê fgí j │ + └─────────────┘ + + Clean the column values, + via [with_columns](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.with_columns.html#polars-dataframe-with-columns): + >>> df.with_columns( + ... pl.col("raw").pipe( + ... make_clean_names, object_type="polars", strip_accents=True + ... ) + ... ) + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ abcde_fgi_j │ + └─────────────┘ + + !!! info "New in version 0.28.0" + + Args: + obj: The object to clean. It can be a pandas Index, + a pandas Series, a polars Expression, a python string, + or a python list. + strip_underscores: Removes the outer underscores from all + labels. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the labels lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the labels. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the labels. + enforce_string: Whether or not to convert the labels to string. + Defaults to True, but can be turned off. + truncate_limit: Truncates formatted labels to + the specified length. Default None does not truncate. 
+ object_type: The type of object to clean. It should be either `pandas`, + `polars`, a python `string`, or a python `list`. + Returns: + A pandas Index, pandas Series, polars Expression, a python string, + or a python list. + """ # noqa: E501 + if enforce_string and (object_type == "pandas"): + if not (_is_str_or_cat(obj)): + obj = obj.astype(str) + elif enforce_string and (object_type == "list"): + obj = [str(label) for label in obj] + elif enforce_string and (object_type == "string"): + obj = str(obj) + elif enforce_string and (object_type == "polars"): + obj = obj.cast(pl.Utf8) + obj = _change_case(obj, case_type, object_type=object_type) + obj = _normalize_1(obj, object_type=object_type) if remove_special: - col = _remove_special(df_type=df_type, col=col) + obj = _remove_special(object_type=object_type, obj=obj) if strip_accents: - col = _strip_accents(col=col, df_type=df_type) - if df_type == "pandas": - col = col.str.replace(pat="_+", repl="_", regex=True) - elif df_type == "polars": - col = col.str.replace(pattern="_+", value="_", literal=False) - elif df_type == "str": - col = re.sub(pattern="_+", repl="_", string=col) + obj = _strip_accents(obj=obj, object_type=object_type) + if object_type == "pandas": + obj = obj.str.replace(pat="_+", repl="_", regex=True) + elif object_type == "polars": + obj = obj.str.replace(pattern="_+", value="_", literal=False) + elif object_type == "string": + obj = re.sub(pattern="_+", repl="_", string=obj) else: - col = [re.sub(pattern="_+", repl="_", string=label) for label in col] - col = _strip_underscores_func( - col, strip_underscores=strip_underscores, df_type=df_type + obj = [re.sub(pattern="_+", repl="_", string=label) for label in obj] + obj = _strip_underscores_func( + obj, strip_underscores=strip_underscores, object_type=object_type ) - if truncate_limit and (df_type == "pandas"): - col = col.str[:truncate_limit] - elif truncate_limit and (df_type == "polars"): - col = col.str.slice(offset=0, length=truncate_limit) - elif truncate_limit and (df_type == "str"): - col = col[:truncate_limit] + if truncate_limit and (object_type == "pandas"): + obj = obj.str[:truncate_limit] + elif truncate_limit and (object_type == "polars"): + obj = obj.str.slice(offset=0, length=truncate_limit) + elif truncate_limit and (object_type == "string"): + obj = obj[:truncate_limit] elif truncate_limit: - col = [label[:truncate_limit] for label in col] - return col + obj = [label[:truncate_limit] for label in obj] + return obj diff --git a/janitor/spark/functions.py b/janitor/spark/functions.py index a43f7338d..57abd1824 100644 --- a/janitor/spark/functions.py +++ b/janitor/spark/functions.py @@ -4,7 +4,7 @@ from typing import Union from janitor import utils as janitor_utils -from janitor.functions.clean_names import ( +from janitor.functions.utils import ( _change_case, _normalize_1, _remove_special, diff --git a/tests/functions/polars/test_clean_names.py b/tests/functions/test_clean_names_polars.py similarity index 78% rename from tests/functions/polars/test_clean_names.py rename to tests/functions/test_clean_names_polars.py index 51d6f1ff4..cacdfe608 100644 --- a/tests/functions/polars/test_clean_names.py +++ b/tests/functions/test_clean_names_polars.py @@ -8,7 +8,7 @@ def test_clean_names_method_chain(dataframe): """Tests clean_names default args in a method chain.""" df = pl.from_pandas(dataframe) - df = df.rename(lambda col: make_clean_names(col, df_type="str")) + df = df.rename(lambda col: make_clean_names(col, object_type="string")) expected_columns = [ "a", 
"bell_chart", @@ -24,7 +24,9 @@ def test_clean_names_special_characters(dataframe): """Tests clean_names `remove_special` parameter.""" df = pl.from_pandas(dataframe) df = df.rename( - lambda col: make_clean_names(col, df_type="str", remove_special=True) + lambda col: make_clean_names( + col, object_type="string", remove_special=True + ) ) expected_columns = [ "a", @@ -42,7 +44,7 @@ def test_clean_names_uppercase(dataframe): df = pl.from_pandas(dataframe) df = df.rename( lambda col: make_clean_names( - col, df_type="str", remove_special=True, case_type="upper" + col, object_type="string", remove_special=True, case_type="upper" ) ) expected_columns = [ @@ -60,7 +62,9 @@ def test_clean_names_strip_accents(): """Tests clean_names `strip_accents` parameter.""" df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) df = df.rename( - lambda col: make_clean_names(col, df_type="str", strip_accents=True) + lambda col: make_clean_names( + col, object_type="string", strip_accents=True + ) ) expected_columns = ["joao", "лукася", "kafer"] assert df.columns == expected_columns @@ -75,7 +79,10 @@ def test_clean_names_camelcase_to_snake(dataframe): .rename({"a": "AColumnName"}) .rename( lambda col: make_clean_names( - col, df_type="str", remove_special=True, case_type="snake" + col, + object_type="string", + remove_special=True, + case_type="snake", ) ) ) @@ -87,7 +94,9 @@ def test_clean_names_truncate_limit(dataframe): """Tests clean_names `truncate_limit` parameter.""" df = pl.from_pandas(dataframe) df = df.rename( - lambda col: make_clean_names(col, df_type="str", truncate_limit=7) + lambda col: make_clean_names( + col, object_type="string", truncate_limit=7 + ) ) # df = dataframe.clean_names(truncate_limit=7) expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] @@ -105,7 +114,10 @@ def test_charac(): ) df = df.rename( lambda col: make_clean_names( - col, df_type="str", strip_underscores=True, case_type="lower" + col, + object_type="string", + strip_underscores=True, + case_type="lower", ) ) @@ -117,7 +129,7 @@ def test_clean_column_values(): raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) outcome = raw.with_columns( pl.col("raw").pipe( - make_clean_names, df_type="polars", strip_accents=True + make_clean_names, object_type="polars", strip_accents=True ) ) assert list(outcome)[0][0] == "abcde_fgi_j" From 0fb440e84fdf82b93afd1a617da97057370d9fa5 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 19:49:50 +1000 Subject: [PATCH 03/46] changelog --- CHANGELOG.md | 1 + examples/notebooks/bla.ipynb | 94 ------------------------------------ 2 files changed, 1 insertion(+), 94 deletions(-) delete mode 100644 examples/notebooks/bla.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 552de1e50..0fabcc7fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## [Unreleased] +- [ENH] Add `make_clean_names` function which works on a pandas Index/Series, a Polar Expression, or a python string/list. Issue #1343 ## [v0.27.0] - 2024-03-21 diff --git a/examples/notebooks/bla.ipynb b/examples/notebooks/bla.ipynb deleted file mode 100644 index f47c4b335..000000000 --- a/examples/notebooks/bla.ipynb +++ /dev/null @@ -1,94 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "from janitor import make_clean_names" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (3, 3)
AlohaBell ChartAnimals@#$%^
i64i64i64
000
111
222
" - ], - "text/plain": [ - "shape: (3, 3)\n", - "┌───────┬────────────┬──────────────┐\n", - "│ Aloha ┆ Bell Chart ┆ Animals@#$%^ │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ i64 │\n", - "╞═══════╪════════════╪══════════════╡\n", - "│ 0 ┆ 0 ┆ 0 │\n", - "│ 1 ┆ 1 ┆ 1 │\n", - "│ 2 ┆ 2 ┆ 2 │\n", - "└───────┴────────────┴──────────────┘" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pl.DataFrame(\n", - " {\n", - " \"Aloha\": range(3),\n", - " \"Bell Chart\": range(3),\n", - " \"Animals@#$%^\": range(3)\n", - " }\n", - ")\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw.with_columns(\n", - " pl.col(\"raw\").pipe(\n", - " make_clean_names, object_type=\"polars\", strip_accents=True\n", - " )\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyjanitor-dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 5e944b2211ffba92d40a2f5e12b7a8e8d093a625 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:05:02 +1000 Subject: [PATCH 04/46] limit import location for polars --- janitor/functions/utils.py | 8 +++++++- pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 197908f92..39bad2d91 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -12,6 +12,7 @@ from enum import Enum from functools import singledispatch from typing import ( + TYPE_CHECKING, Any, Callable, Hashable, @@ -25,7 +26,6 @@ import numpy as np import pandas as pd -import polars as pl from multipledispatch import dispatch from pandas.api.types import ( is_bool_dtype, @@ -1138,6 +1138,10 @@ def __eq__(self, other): return self +if TYPE_CHECKING: + import polars as pl + + def _change_case( obj: Union[pd.Index, pd.Series, pl.Expr, list, str], case_type: str, @@ -1473,6 +1477,8 @@ def make_clean_names( A pandas Index, pandas Series, polars Expression, a python string, or a python list. """ # noqa: E501 + if object_type == "polars": + import polars as pl if enforce_string and (object_type == "pandas"): if not (_is_str_or_cat(obj)): obj = obj.astype(str) diff --git a/pyproject.toml b/pyproject.toml index af1131d75..f6b98f54b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,6 @@ lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" # Assume Python 3.10 target-version = "py310" -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. 
max-complexity = 10 From 501d9c67b6c2688929c0b40554f552f782c29f27 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:19:11 +1000 Subject: [PATCH 05/46] limit import location for polars --- janitor/functions/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 39bad2d91..9f329e62f 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -39,7 +39,12 @@ from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from janitor.errors import JanitorError -from janitor.utils import _expand_grid, check, check_column, find_stack_level +from janitor.utils import ( + _expand_grid, + check, + check_column, + find_stack_level, +) warnings.simplefilter("always", DeprecationWarning) From 9506832433b8dd57f65c53de513e4e94c7e47bfc Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:25:48 +1000 Subject: [PATCH 06/46] fix polars in environment-dev.yml --- environment-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-dev.yml b/environment-dev.yml index 322deec86..2543e2c76 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -34,7 +34,7 @@ dependencies: - pipreqs - pip-tools - pre-commit - - pypolars + - polars - pyspark>=3.2.0 - pytest - pytest-cov From 1ae8eddbe49274e0ef5613bceec18fa0cd28c9e5 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:35:18 +1000 Subject: [PATCH 07/46] install polars in doctest --- janitor/functions/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 9f329e62f..4d92d2aef 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1387,6 +1387,8 @@ def make_clean_names( show how you can use this within polars. 
Examples: + >>> import subprocess + >>> subprocess.call(['pip', 'install', 'polars']) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 3b1829b2551bd4805ab24c6e4308aacb9c734b99 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:47:06 +1000 Subject: [PATCH 08/46] limit polars imports - user should have polars already installed --- janitor/functions/utils.py | 26 ++++++++++++---------- tests/functions/test_clean_names_polars.py | 9 +++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 4d92d2aef..56700ea68 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1144,11 +1144,11 @@ def __eq__(self, other): if TYPE_CHECKING: - import polars as pl + from polars import Expr def _change_case( - obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, Expr, list, str], case_type: str, object_type: str, ) -> str: @@ -1224,7 +1224,7 @@ def _change_case( def _normalize_1( - obj: Union[pd.Index, pd.Series, pl.Expr, list, str], object_type: str + obj: Union[pd.Index, pd.Series, Expr, list, str], object_type: str ) -> str: """Perform normalization of labels in obj.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] @@ -1250,7 +1250,7 @@ def _normalize_1( def _remove_special( object_type: str, - obj: Union[pd.Index, pd.Series, pl.Expr, list, str] = None, + obj: Union[pd.Index, pd.Series, Expr, list, str] = None, ) -> str: """Remove special characters from obj.""" if object_type == "pandas": @@ -1273,7 +1273,7 @@ def _remove_special( def _strip_accents( - obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, Expr, list, str], object_type: str, ) -> str: """Remove accents from a label. @@ -1293,13 +1293,15 @@ def _strip_accents( ) ) if object_type == "polars": + from polars import List, Utf8 + return obj.map_elements( lambda word: [ letter for letter in unicodedata.normalize("NFD", word) if not unicodedata.combining(letter) ], - return_dtype=pl.List(pl.Utf8), + return_dtype=List(Utf8), ).list.join("") if object_type == "string": obj = [ @@ -1321,7 +1323,7 @@ def _strip_accents( def _strip_underscores_func( - obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, Expr, list, str], object_type: str, strip_underscores: Union[str, bool] = None, ) -> pd.DataFrame: @@ -1368,7 +1370,7 @@ def _strip_underscores_func( def make_clean_names( - obj: Union[pd.Index, pd.Series, pl.Expr, list, str], + obj: Union[pd.Index, pd.Series, Expr, list, str], strip_underscores: Optional[Union[str, bool]] = None, case_type: str = "lower", remove_special: bool = False, @@ -1376,7 +1378,7 @@ def make_clean_names( enforce_string: bool = False, truncate_limit: int = None, object_type: str = "pandas", -) -> Union[pd.Index, pd.Series, pl.Expr, list]: +) -> Union[pd.Index, pd.Series, Expr, list]: """ Generic function to clean labels in an object. It can be applied to a pandas Index/Series, a Polars Expression, @@ -1484,8 +1486,6 @@ def make_clean_names( A pandas Index, pandas Series, polars Expression, a python string, or a python list. 
""" # noqa: E501 - if object_type == "polars": - import polars as pl if enforce_string and (object_type == "pandas"): if not (_is_str_or_cat(obj)): obj = obj.astype(str) @@ -1494,7 +1494,9 @@ def make_clean_names( elif enforce_string and (object_type == "string"): obj = str(obj) elif enforce_string and (object_type == "polars"): - obj = obj.cast(pl.Utf8) + from polars import Utf8 + + obj = obj.cast(Utf8) obj = _change_case(obj, case_type, object_type=object_type) obj = _normalize_1(obj, object_type=object_type) if remove_special: diff --git a/tests/functions/test_clean_names_polars.py b/tests/functions/test_clean_names_polars.py index cacdfe608..814029ae7 100644 --- a/tests/functions/test_clean_names_polars.py +++ b/tests/functions/test_clean_names_polars.py @@ -1,7 +1,10 @@ -import polars as pl -import pytest +import subprocess -from janitor import make_clean_names +subprocess.call(["pip", "install", "polars"]) +import polars as pl # noqa: E402 +import pytest # noqa: E402 + +from janitor import make_clean_names # noqa: E402 @pytest.mark.functions From 52fd80cf5d1e6fafef65f36dc21845272e28fc3f Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:52:18 +1000 Subject: [PATCH 09/46] use subprocess.run --- janitor/functions/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 56700ea68..cb668c620 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1390,7 +1390,7 @@ def make_clean_names( Examples: >>> import subprocess - >>> subprocess.call(['pip', 'install', 'polars']) + >>> subprocess.run(['pip', 'install', 'polars']) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 2dce78b6db0cd607e08c0ca64fad1e4f2105a908 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:57:50 +1000 Subject: [PATCH 10/46] add subprocess.devnull to docstrings --- janitor/functions/utils.py | 4 +++- tests/functions/test_clean_names_polars.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index cb668c620..153911b13 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1390,7 +1390,9 @@ def make_clean_names( Examples: >>> import subprocess - >>> subprocess.run(['pip', 'install', 'polars']) + >>> subprocess.run(['pip', 'install', 'polars'], + ... stdout = subprocess.DEVNULL, + ... stderr = subprocess.STDOUT) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( diff --git a/tests/functions/test_clean_names_polars.py b/tests/functions/test_clean_names_polars.py index 814029ae7..b920aa2e5 100644 --- a/tests/functions/test_clean_names_polars.py +++ b/tests/functions/test_clean_names_polars.py @@ -1,6 +1,6 @@ import subprocess -subprocess.call(["pip", "install", "polars"]) +subprocess.run(["pip", "install", "polars"]) import polars as pl # noqa: E402 import pytest # noqa: E402 From 37b3feb312e720d3f91e3ca7bd9ed0f90390af02 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 20:58:18 +1000 Subject: [PATCH 11/46] add subprocess.devnull to docstrings --- janitor/functions/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 153911b13..1656627db 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1392,7 +1392,7 @@ def make_clean_names( >>> import subprocess >>> subprocess.run(['pip', 'install', 'polars'], ... 
stdout = subprocess.DEVNULL, - ... stderr = subprocess.STDOUT) + ... stderr = subprocess.DEVNULL) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 0953f2d2fb043ea5c127b27a92007e092f73ad9b Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:04:03 +1000 Subject: [PATCH 12/46] add subprocess.devnull to docstrings --- janitor/functions/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 1656627db..557567630 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1390,9 +1390,7 @@ def make_clean_names( Examples: >>> import subprocess - >>> subprocess.run(['pip', 'install', 'polars'], - ... stdout = subprocess.DEVNULL, - ... stderr = subprocess.DEVNULL) + >>> subprocess.call(['pip', 'install', 'polars'], stdout=open(os.devnull, 'wb')) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From d7c71b6498e46d6d3dc3e4a763b2d7c65f68eea4 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:07:31 +1000 Subject: [PATCH 13/46] add subprocess.devnull to docstrings --- janitor/functions/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 557567630..2c650328c 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1390,7 +1390,7 @@ def make_clean_names( Examples: >>> import subprocess - >>> subprocess.call(['pip', 'install', 'polars'], stdout=open(os.devnull, 'wb')) + >>> subprocess.call(['pip', 'install', 'polars'], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 40b850247e4c2d9e589eb96513e0e9682fc55867 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:16:19 +1000 Subject: [PATCH 14/46] add os.devnull --- janitor/functions/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 2c650328c..7912907ae 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1390,7 +1390,8 @@ def make_clean_names( Examples: >>> import subprocess - >>> subprocess.call(['pip', 'install', 'polars'], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) + >>> import os + >>> subprocess.call(['pip', 'install', 'polars'], stdout=open(os.devnull, 'wb')) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 4f11d095bad06291737f19d342c041a4ae876fd8 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:20:08 +1000 Subject: [PATCH 15/46] add polars as requirement for docs --- .requirements/docs.in | 1 + janitor/functions/utils.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.requirements/docs.in b/.requirements/docs.in index f0d4afc29..b23e373aa 100644 --- a/.requirements/docs.in +++ b/.requirements/docs.in @@ -1,4 +1,5 @@ mkdocs +polars mkdocs-material mkdocstrings>=0.19.0 mkdocstrings-python diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 7912907ae..b06744d97 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1389,9 +1389,6 @@ def make_clean_names( show how you can use this within polars. 
Examples: - >>> import subprocess - >>> import os - >>> subprocess.call(['pip', 'install', 'polars'], stdout=open(os.devnull, 'wb')) >>> import polars as pl >>> import janitor >>> df = pl.DataFrame( From 54b179c5ca420d6629c1e32885e7aec6fed04389 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:35:24 +1000 Subject: [PATCH 16/46] add polars to tests requirements --- .requirements/testing.in | 1 + janitor/functions/utils.py | 8 ++++++++ tests/functions/test_clean_names_polars.py | 7 ++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.requirements/testing.in b/.requirements/testing.in index 57e12c2d3..8179653b8 100644 --- a/.requirements/testing.in +++ b/.requirements/testing.in @@ -4,4 +4,5 @@ pytest>=3.4.2 hypothesis>=4.4.0 interrogate pandas-vet +polars py>=1.10.0 diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index b06744d97..dde4880fc 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1455,6 +1455,14 @@ def make_clean_names( │ abcde_fgi_j │ └─────────────┘ + The `make_clean_names` function can also be applied to a python string or list: + >>> raw = ["Abçdê fgí j"] + >>> make_clean_names(raw, object_type='list', strip_accents=True) + ['abcde_fgi_j'] + >>> raw = "Abçdê fgí j" + >>> make_clean_names(raw, object_type='string', strip_accents=True) + 'abcde_fgi_j' + !!! info "New in version 0.28.0" Args: diff --git a/tests/functions/test_clean_names_polars.py b/tests/functions/test_clean_names_polars.py index b920aa2e5..56d0b8e95 100644 --- a/tests/functions/test_clean_names_polars.py +++ b/tests/functions/test_clean_names_polars.py @@ -1,8 +1,5 @@ -import subprocess - -subprocess.run(["pip", "install", "polars"]) -import polars as pl # noqa: E402 -import pytest # noqa: E402 +import polars as pl +import pytest from janitor import make_clean_names # noqa: E402 From 25b39b9d1918a83373e7d784430e0d615a04e315 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:39:24 +1000 Subject: [PATCH 17/46] delete irrelevant folder --- janitor/functions/polars/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 janitor/functions/polars/__init__.py diff --git a/janitor/functions/polars/__init__.py b/janitor/functions/polars/__init__.py deleted file mode 100644 index e69de29bb..000000000 From a09f34bc6f15bcfa2dddf97797b6b1f6fb3ed910 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sat, 20 Apr 2024 21:45:00 +1000 Subject: [PATCH 18/46] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fabcc7fb..6a3492539 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## [Unreleased] -- [ENH] Add `make_clean_names` function which works on a pandas Index/Series, a Polar Expression, or a python string/list. Issue #1343 +- [ENH] Add `make_clean_names` function which works on a pandas Index/Series, a Polars Expression, or a python string/list. 
Issue #1343 ## [v0.27.0] - 2024-03-21 From 1b375f84e4d5cba9e5955d6ade247f767c10c4d3 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 19:35:03 +1000 Subject: [PATCH 19/46] create submodule for polars --- CHANGELOG.md | 2 +- janitor/functions/__init__.py | 2 - janitor/functions/clean_names.py | 125 +++++- janitor/functions/utils.py | 367 ++---------------- janitor/polars/__init__.py | 131 +++++++ janitor/polars/functions.py | 160 ++++++++ .../functions/test_clean_names.py} | 51 +-- 7 files changed, 443 insertions(+), 395 deletions(-) create mode 100644 janitor/polars/__init__.py create mode 100644 janitor/polars/functions.py rename tests/{functions/test_clean_names_polars.py => polars/functions/test_clean_names.py} (66%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a3492539..5717193d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## [Unreleased] -- [ENH] Add `make_clean_names` function which works on a pandas Index/Series, a Polars Expression, or a python string/list. Issue #1343 +- [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values . Issue #1343 ## [v0.27.0] - 2024-03-21 diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index ef1a69458..35681b9d9 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -81,7 +81,6 @@ col, get_columns, get_index_labels, - make_clean_names, patterns, unionize_dataframe_categories, ) @@ -130,7 +129,6 @@ "join_apply", "label_encode", "limit_column_characters", - "make_clean_names", "min_max_scale", "move", "pivot_longer", diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index 69af7f33e..7eb2a7538 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -1,15 +1,16 @@ """Functions for cleaning columns/index names and/or column values.""" +from __future__ import annotations + +import unicodedata from typing import Optional, Union import pandas as pd import pandas_flavor as pf from pandas.api.types import is_scalar -from janitor.functions.utils import ( - get_index_labels, - make_clean_names, -) +from janitor.errors import JanitorError +from janitor.functions.utils import _is_str_or_cat, get_index_labels from janitor.utils import deprecated_alias @@ -120,7 +121,7 @@ def clean_names( column_names = [column_names] df = df.copy() for column_name in column_names: - df[column_name] = make_clean_names( + df[column_name] = _clean_names( obj=df[column_name], enforce_string=enforce_string, case_type=case_type, @@ -128,7 +129,6 @@ def clean_names( strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - object_type="pandas", ) return df @@ -141,7 +141,7 @@ def clean_names( for number in range(target_axis.nlevels) ] target_axis = [ - make_clean_names( + _clean_names( obj=obj, enforce_string=enforce_string, case_type=case_type, @@ -149,12 +149,11 @@ def clean_names( strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - object_type="pandas", ) for obj in target_axis ] else: - target_axis = make_clean_names( + target_axis = _clean_names( obj=target_axis, enforce_string=enforce_string, case_type=case_type, @@ -162,10 +161,116 @@ def clean_names( strip_accents=strip_accents, strip_underscores=strip_underscores, truncate_limit=truncate_limit, - object_type="pandas", ) # Store the original column names, if enabled by user if preserve_original_labels: 
df.__dict__["original_labels"] = getattr(df, axis) setattr(df, axis, target_axis) return df + + +def _clean_names( + obj: Union[pd.Index, pd.Series], + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> Union[pd.Index, pd.Series]: + """ + Generic function to clean labels in a pandas object. + """ + if enforce_string and not (_is_str_or_cat(obj)): + obj = obj.astype(str) + obj = _change_case(obj=obj, case_type=case_type) + obj = _normalize_1(obj=obj) + if remove_special: + obj = obj.str.replace( + pat="[^A-Za-z_\\d]", repl="", regex=True + ).str.strip() + if strip_accents: + obj = _strip_accents(obj=obj) + obj = obj.str.replace(pat="_+", repl="_", regex=True) + obj = _strip_underscores_func( + obj, + strip_underscores=strip_underscores, + ) + if truncate_limit: + obj = obj.str[:truncate_limit] + return obj + + +def _change_case( + obj: Union[pd.Index, pd.Series], + case_type: str, +) -> Union[pd.Index, pd.Series]: + """Change case of labels in obj.""" + case_types = {"preserve", "upper", "lower", "snake"} + case_type = case_type.lower() + if case_type not in case_types: + raise JanitorError(f"case_type must be one of: {case_types}") + + if case_type == "preserve": + return obj + if case_type == "upper": + return obj.str.upper() + if case_type == "lower": + return obj.str.lower() + # Implementation taken from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + return ( + obj.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) + .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) + .str.lower() + ) + + +def _normalize_1( + obj: Union[pd.Index, pd.Series] +) -> Union[pd.Index, pd.Series]: + """Perform normalization of labels in obj.""" + FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] + for search, replace in FIXES: + obj = obj.str.replace(pat=search, repl=replace, regex=True) + + return obj + + +def _strip_accents( + obj: Union[pd.Index, pd.Series], +) -> Union[pd.Index, pd.Series]: + """Remove accents from a label. + + Inspired from [StackOverflow][so]. 
+ + [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin + """ # noqa: E501 + return obj.map( + lambda f: "".join( + [ + letter + for letter in unicodedata.normalize("NFD", str(f)) + if not unicodedata.combining(letter) + ] + ) + ) + + +def _strip_underscores_func( + obj: Union[pd.Index, pd.Series], + strip_underscores: Union[str, bool] = None, +) -> Union[pd.Index, pd.Series]: + """Strip underscores.""" + underscore_options = {None, "left", "right", "both", "l", "r", True} + if strip_underscores not in underscore_options: + raise JanitorError( + f"strip_underscores must be one of: {underscore_options}" + ) + if strip_underscores in {"left", "l"}: + return obj.str.lstrip("_") + if strip_underscores in {"right", "r"}: + return obj.str.rstrip("_") + if strip_underscores in {True, "both"}: + return obj.str.strip("_") + return obj diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index dde4880fc..4bf0d0eea 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -12,7 +12,6 @@ from enum import Enum from functools import singledispatch from typing import ( - TYPE_CHECKING, Any, Callable, Hashable, @@ -1143,138 +1142,48 @@ def __eq__(self, other): return self -if TYPE_CHECKING: - from polars import Expr - - def _change_case( - obj: Union[pd.Index, pd.Series, Expr, list, str], + obj: str, case_type: str, - object_type: str, ) -> str: - """Change case of labels in obj.""" + """Change case of obj.""" case_types = {"preserve", "upper", "lower", "snake"} case_type = case_type.lower() if case_type not in case_types: raise JanitorError(f"type must be one of: {case_types}") - if object_type == "pandas": - if case_type == "preserve": - return obj - if case_type == "upper": - return obj.str.upper() - if case_type == "lower": - return obj.str.lower() - # Implementation taken from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - return ( - obj.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) - .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) - .str.lower() - ) - if object_type == "polars": - if case_type == "preserve": - return obj - if case_type == "upper": - return obj.str.to_uppercase() - if case_type == "lower": - return obj.str.to_lowercase() - # Implementation taken from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - return ( - obj.str.replace_all( - pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False - ) - .str.replace_all( - pattern=r"([a-z0-9])([A-Z])", value=r"${1}_${2}", literal=False - ) - .str.to_lowercase() - ) - if object_type == "string": - if case_type == "preserve": - return obj - if case_type == "upper": - return obj.upper() - if case_type == "lower": - return obj.lower() - # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - obj = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=obj) - obj = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=obj) - return obj.lower() - if case_type == "preserve": return obj if case_type == "upper": - return [label.upper() for label in obj] + return obj.upper() if case_type == "lower": - return [label.lower() for label in obj] + return obj.lower() # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor - obj = [ - re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=label) - for label in obj - ] - obj = [ - re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=label) - for label in obj - ] - obj 
= [label.lower() for label in obj] - return obj + obj = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=obj) + obj = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=obj) + return obj.lower() -def _normalize_1( - obj: Union[pd.Index, pd.Series, Expr, list, str], object_type: str -) -> str: +def _normalize_1(obj: str) -> str: """Perform normalization of labels in obj.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] - if object_type == "pandas": - for search, replace in FIXES: - obj = obj.str.replace(pat=search, repl=replace, regex=True) - elif object_type == "polars": - for search, replace in FIXES: - obj = obj.str.replace_all( - pattern=search, value=replace, literal=False - ) - elif object_type == "string": - for search, replace in FIXES: - obj = re.sub(pattern=search, repl=replace, string=obj) - else: - for search, replace in FIXES: - obj = [ - re.sub(pattern=search, repl=replace, string=label) - for label in obj - ] + for search, replace in FIXES: + obj = re.sub(pattern=search, repl=replace, string=obj) + return obj def _remove_special( - object_type: str, - obj: Union[pd.Index, pd.Series, Expr, list, str] = None, + obj: str, ) -> str: """Remove special characters from obj.""" - if object_type == "pandas": - return obj.str.replace( - pat="[^A-Za-z_\\d]", repl="", regex=True - ).str.strip() - if object_type == "polars": - return obj.str.replace_all( - pattern="[^A-Za-z_\\d]", value="", literal=False - ).str.strip_chars() - elif object_type == "string": - obj = [item for item in obj if item.isalnum() or (item == "_")] - return "".join(obj) - out = [] - for label in obj: - word = [item for item in label if item.isalnum() or (item == "_")] - word = "".join(word) - out.append(word) - return out + obj = [item for item in obj if item.isalnum() or (item == "_")] + return "".join(obj) def _strip_accents( - obj: Union[pd.Index, pd.Series, Expr, list, str], - object_type: str, + obj: str, ) -> str: """Remove accents from a label. 
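# [editor's aside -- illustration, not part of the patch series]
# The two regex passes used by the string version of `_change_case` earlier in
# this hunk, traced on the label from the camelCase-to-snake test:
import re

label = "AColumnName"
step1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", label)   # -> "A_Column_Name"
step2 = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", step1)  # no-op here; catches "camelCase"
assert step2.lower() == "a_column_name"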
@@ -1282,250 +1191,30 @@ def _strip_accents( [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin """ # noqa: E501 - if object_type == "pandas": - return obj.map( - lambda f: "".join( - [ - letter - for letter in unicodedata.normalize("NFD", str(f)) - if not unicodedata.combining(letter) - ] - ) - ) - if object_type == "polars": - from polars import List, Utf8 - - return obj.map_elements( - lambda word: [ - letter - for letter in unicodedata.normalize("NFD", word) - if not unicodedata.combining(letter) - ], - return_dtype=List(Utf8), - ).list.join("") - if object_type == "string": - obj = [ - letter - for letter in unicodedata.normalize("NFD", obj) - if not unicodedata.combining(letter) - ] - return "".join(obj) - out = [] - for label in obj: - word = [ - letter - for letter in unicodedata.normalize("NFD", label) - if not unicodedata.combining(letter) - ] - word = "".join(word) - out.append(word) - return out + + obj = [ + letter + for letter in unicodedata.normalize("NFD", obj) + if not unicodedata.combining(letter) + ] + return "".join(obj) def _strip_underscores_func( - obj: Union[pd.Index, pd.Series, Expr, list, str], - object_type: str, + obj: str, strip_underscores: Union[str, bool] = None, -) -> pd.DataFrame: - """Strip underscores.""" +) -> str: + """Strip underscores from obj.""" underscore_options = {None, "left", "right", "both", "l", "r", True} if strip_underscores not in underscore_options: raise JanitorError( f"strip_underscores must be one of: {underscore_options}" ) - if object_type == "pandas": - if strip_underscores in {"left", "l"}: - return obj.str.lstrip("_") - if strip_underscores in {"right", "r"}: - return obj.str.rstrip("_") - if strip_underscores in {True, "both"}: - return obj.str.strip("_") - return obj - - if object_type == "polars": - if strip_underscores in {"left", "l"}: - return obj.str.strip_chars_start("_") - if strip_underscores in {"right", "r"}: - return obj.str.strip_chars_end("_") - if strip_underscores in {True, "both"}: - return obj.str.strip_chars("_") - return obj - - if object_type == "string": - if strip_underscores in {"left", "l"}: - return obj.lstrip("_") - if strip_underscores in {"right", "r"}: - return obj.rstrip("_") - if strip_underscores in {True, "both"}: - return obj.strip("_") - return obj if strip_underscores in {"left", "l"}: - return [label.lstrip("_") for label in obj] + return obj.lstrip("_") if strip_underscores in {"right", "r"}: - return [label.rstrip("_") for label in obj] + return obj.rstrip("_") if strip_underscores in {True, "both"}: - return [label.strip("_") for label in obj] - return obj - - -def make_clean_names( - obj: Union[pd.Index, pd.Series, Expr, list, str], - strip_underscores: Optional[Union[str, bool]] = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, - object_type: str = "pandas", -) -> Union[pd.Index, pd.Series, Expr, list]: - """ - Generic function to clean labels in an object. - It can be applied to a pandas Index/Series, a Polars Expression, - or a python string/list. - For pandas, there is a [`clean_names`][janitor.functions.clean_names.clean_names] - method, which is a wrapper around the `make_clean_names` function. - For polars, use this function via existing Polars functions. The examples below - show how you can use this within polars. - - Examples: - >>> import polars as pl - >>> import janitor - >>> df = pl.DataFrame( - ... 
{ - ... "Aloha": range(3), - ... "Bell Chart": range(3), - ... "Animals@#$%^": range(3) - ... } - ... ) - >>> df - shape: (3, 3) - ┌───────┬────────────┬──────────────┐ - │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪══════════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴──────────────┘ - - Clean the column names, - via [rename](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.rename.html#polars-dataframe-rename): - >>> df.rename( - ... lambda objumn_name: make_clean_names( - ... obj=objumn_name, remove_special=True, object_type="string" - ... ) - ... ) - shape: (3, 3) - ┌───────┬────────────┬─────────┐ - │ aloha ┆ bell_chart ┆ animals │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪═════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴─────────┘ - - >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) - >>> df - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ Abçdê fgí j │ - └─────────────┘ - - Clean the column values, - via [with_columns](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.with_columns.html#polars-dataframe-with-columns): - >>> df.with_columns( - ... pl.col("raw").pipe( - ... make_clean_names, object_type="polars", strip_accents=True - ... ) - ... ) - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ abcde_fgi_j │ - └─────────────┘ - - The `make_clean_names` function can also be applied to a python string or list: - >>> raw = ["Abçdê fgí j"] - >>> make_clean_names(raw, object_type='list', strip_accents=True) - ['abcde_fgi_j'] - >>> raw = "Abçdê fgí j" - >>> make_clean_names(raw, object_type='string', strip_accents=True) - 'abcde_fgi_j' - - !!! info "New in version 0.28.0" - - Args: - obj: The object to clean. It can be a pandas Index, - a pandas Series, a polars Expression, a python string, - or a python list. - strip_underscores: Removes the outer underscores from all - labels. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the labels lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the labels. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - enforce_string: Whether or not to convert the labels to string. - Defaults to True, but can be turned off. - truncate_limit: Truncates formatted labels to - the specified length. Default None does not truncate. - object_type: The type of object to clean. It should be either `pandas`, - `polars`, a python `string`, or a python `list`. - Returns: - A pandas Index, pandas Series, polars Expression, a python string, - or a python list. 
- """ # noqa: E501 - if enforce_string and (object_type == "pandas"): - if not (_is_str_or_cat(obj)): - obj = obj.astype(str) - elif enforce_string and (object_type == "list"): - obj = [str(label) for label in obj] - elif enforce_string and (object_type == "string"): - obj = str(obj) - elif enforce_string and (object_type == "polars"): - from polars import Utf8 - - obj = obj.cast(Utf8) - obj = _change_case(obj, case_type, object_type=object_type) - obj = _normalize_1(obj, object_type=object_type) - if remove_special: - obj = _remove_special(object_type=object_type, obj=obj) - if strip_accents: - obj = _strip_accents(obj=obj, object_type=object_type) - if object_type == "pandas": - obj = obj.str.replace(pat="_+", repl="_", regex=True) - elif object_type == "polars": - obj = obj.str.replace(pattern="_+", value="_", literal=False) - elif object_type == "string": - obj = re.sub(pattern="_+", repl="_", string=obj) - else: - obj = [re.sub(pattern="_+", repl="_", string=label) for label in obj] - obj = _strip_underscores_func( - obj, strip_underscores=strip_underscores, object_type=object_type - ) - if truncate_limit and (object_type == "pandas"): - obj = obj.str[:truncate_limit] - elif truncate_limit and (object_type == "polars"): - obj = obj.str.slice(offset=0, length=truncate_limit) - elif truncate_limit and (object_type == "string"): - obj = obj[:truncate_limit] - elif truncate_limit: - obj = [label[:truncate_limit] for label in obj] + return obj.strip("_") return obj diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py new file mode 100644 index 000000000..843002a5b --- /dev/null +++ b/janitor/polars/__init__.py @@ -0,0 +1,131 @@ +from typing import Optional, Union + +from janitor.utils import import_message + +from .functions import _clean_names + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +@pl.api.register_dataframe_namespace("janitor") +class Frame: + def __init__(self, df: pl.DataFrame) -> pl.DataFrame: + self._df = df + + def clean_names( + self, + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, + ) -> pl.DataFrame: + """ + Clean the column names in a polars DataFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... 
) + >>> df + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + >>> df.janitor.clean_names(remove_special=True) + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + """ + return self._df.rename( + lambda col: _clean_names( + obj=col, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + enforce_string=enforce_string, + truncate_limit=truncate_limit, + ) + ) + + +@pl.api.register_expr_namespace("janitor") +class PolarsExpr: + def __init__(self, expr: pl.Expr) -> pl.Expr: + self._expr = expr + + def clean_names( + self, + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, + ) -> pl.Expr: + """ + Clean the labels in a polars Expression. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + >>> df + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ Abçdê fgí j │ + └─────────────┘ + + Clean the column values: + >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True)) + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ abcde_fgi_j │ + └─────────────┘ + """ + return _clean_names( + obj=self._expr, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + enforce_string=enforce_string, + truncate_limit=truncate_limit, + ) diff --git a/janitor/polars/functions.py b/janitor/polars/functions.py new file mode 100644 index 000000000..c180ccd85 --- /dev/null +++ b/janitor/polars/functions.py @@ -0,0 +1,160 @@ +"""General purpose data cleaning functions for pyspark.""" + +import re +import unicodedata +from typing import Optional, Union + +from janitor.errors import JanitorError +from janitor.functions.utils import ( + _change_case, + _normalize_1, + _remove_special, + _strip_accents, + _strip_underscores_func, +) +from janitor.utils import import_message + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +def _change_case_expr( + obj: pl.Expr, + case_type: str, +) -> pl.Expr: + """Change case of obj.""" + case_types = {"preserve", "upper", "lower", "snake"} + case_type = case_type.lower() + if case_type not in case_types: + raise JanitorError(f"type must be one of: {case_types}") + + if case_type == "preserve": + return obj + if case_type == "upper": + return obj.str.to_uppercase() + if case_type == "lower": + return obj.str.to_lowercase() + # Implementation taken from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + return ( + obj.str.replace_all( + pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False + ) + .str.replace_all( + pattern=r"([a-z0-9])([A-Z])", value=r"${1}_${2}", literal=False + ) + .str.to_lowercase() + ) + + +def _normalize_expr(obj: pl.Expr) -> pl.Expr: + """Perform normalization of labels in 
obj.""" + FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] + for search, replace in FIXES: + obj = obj.str.replace_all(pattern=search, value=replace, literal=False) + return obj + + +def _remove_special_expr( + obj: pl.Expr, +) -> pl.Expr: + """Remove special characters from obj.""" + return obj.str.replace_all( + pattern="[^A-Za-z_\\d]", value="", literal=False + ).str.strip_chars() + + +def _strip_accents_expr( + obj: pl.Expr, +) -> pl.Expr: + """Remove accents from a label. + + Inspired from [StackOverflow][so]. + + [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin + """ # noqa: E501 + # TODO: possible implementation in Rust + # or use a pyarrow implementation? + # https://github.com/pola-rs/polars/issues/11455 + return obj.map_elements( + lambda word: [ + letter + for letter in unicodedata.normalize("NFD", word) + if not unicodedata.combining(letter) + ], + return_dtype=pl.List(pl.Utf8), + ).list.join("") + + +def _strip_underscores_func_expr( + obj: pl.Expr, + strip_underscores: Union[str, bool] = None, +) -> pl.Expr: + """Strip underscores from obj.""" + underscore_options = {None, "left", "right", "both", "l", "r", True} + if strip_underscores not in underscore_options: + raise JanitorError( + f"strip_underscores must be one of: {underscore_options}" + ) + if strip_underscores in {"left", "l"}: + return obj.str.strip_chars_start("_") + if strip_underscores in {"right", "r"}: + return obj.str.strip_chars_end("_") + if strip_underscores in {True, "both"}: + return obj.str.strip_chars("_") + return obj + + +def _clean_names( + obj: Union[str, pl.Expr], + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> str: + """ + Generic function to clean labels. + Applies either to the columns of a polars DataFrame, + or a polars Expression. 
+ """ + if isinstance(obj, str): + if enforce_string: + obj = str(obj) + obj = _change_case(obj=obj, case_type=case_type) + obj = _normalize_1(obj=obj) + if remove_special: + obj = _remove_special(obj=obj) + if strip_accents: + obj = _strip_accents(obj=obj) + obj = re.sub(pattern="_+", repl="_", string=obj) + obj = _strip_underscores_func( + obj, + strip_underscores=strip_underscores, + ) + obj = obj[:truncate_limit] + return obj + if enforce_string: + obj = obj.cast(pl.Utf8) + obj = _change_case_expr(obj=obj, case_type=case_type) + obj = _normalize_expr(obj=obj) + if remove_special: + obj = _remove_special_expr(obj=obj) + if strip_accents: + obj = _strip_accents_expr(obj=obj) + obj = obj.str.replace(pattern="_+", value="_", literal=False) + obj = _strip_underscores_func_expr( + obj, + strip_underscores=strip_underscores, + ) + if truncate_limit: + obj = obj.str.slice(offset=0, length=truncate_limit) + return obj diff --git a/tests/functions/test_clean_names_polars.py b/tests/polars/functions/test_clean_names.py similarity index 66% rename from tests/functions/test_clean_names_polars.py rename to tests/polars/functions/test_clean_names.py index 56d0b8e95..5ed77c8e8 100644 --- a/tests/functions/test_clean_names_polars.py +++ b/tests/polars/functions/test_clean_names.py @@ -1,14 +1,12 @@ import polars as pl import pytest -from janitor import make_clean_names # noqa: E402 - @pytest.mark.functions def test_clean_names_method_chain(dataframe): """Tests clean_names default args in a method chain.""" df = pl.from_pandas(dataframe) - df = df.rename(lambda col: make_clean_names(col, object_type="string")) + df = df.janitor.clean_names() expected_columns = [ "a", "bell_chart", @@ -23,11 +21,7 @@ def test_clean_names_method_chain(dataframe): def test_clean_names_special_characters(dataframe): """Tests clean_names `remove_special` parameter.""" df = pl.from_pandas(dataframe) - df = df.rename( - lambda col: make_clean_names( - col, object_type="string", remove_special=True - ) - ) + df = df.janitor.clean_names(remove_special=True) expected_columns = [ "a", "bell_chart", @@ -42,11 +36,7 @@ def test_clean_names_special_characters(dataframe): def test_clean_names_uppercase(dataframe): """Tests clean_names `case_type` parameter = upper.""" df = pl.from_pandas(dataframe) - df = df.rename( - lambda col: make_clean_names( - col, object_type="string", remove_special=True, case_type="upper" - ) - ) + df = df.janitor.clean_names(remove_special=True, case_type="upper") expected_columns = [ "A", "BELL_CHART", @@ -61,11 +51,7 @@ def test_clean_names_uppercase(dataframe): def test_clean_names_strip_accents(): """Tests clean_names `strip_accents` parameter.""" df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) - df = df.rename( - lambda col: make_clean_names( - col, object_type="string", strip_accents=True - ) - ) + df = df.janitor.clean_names(strip_accents=True) expected_columns = ["joao", "лукася", "kafer"] assert df.columns == expected_columns @@ -77,14 +63,7 @@ def test_clean_names_camelcase_to_snake(dataframe): df = ( df.select("a") .rename({"a": "AColumnName"}) - .rename( - lambda col: make_clean_names( - col, - object_type="string", - remove_special=True, - case_type="snake", - ) - ) + .janitor.clean_names(remove_special=True, case_type="snake") ) assert df.columns == ["a_column_name"] @@ -93,12 +72,7 @@ def test_clean_names_camelcase_to_snake(dataframe): def test_clean_names_truncate_limit(dataframe): """Tests clean_names `truncate_limit` parameter.""" df = pl.from_pandas(dataframe) - df 
= df.rename( - lambda col: make_clean_names( - col, object_type="string", truncate_limit=7 - ) - ) - # df = dataframe.clean_names(truncate_limit=7) + df = df.janitor.clean_names(truncate_limit=7) expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] assert df.columns == expected_columns @@ -112,14 +86,7 @@ def test_charac(): r"Current accountbalance(in % of GDP)": range(5), } ) - df = df.rename( - lambda col: make_clean_names( - col, - object_type="string", - strip_underscores=True, - case_type="lower", - ) - ) + df = df.janitor.clean_names(strip_underscores=True, case_type="lower") assert "current_accountbalance_in_%_of_gdp" in df.columns @@ -128,8 +95,6 @@ def test_clean_column_values(): """Clean column values""" raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) outcome = raw.with_columns( - pl.col("raw").pipe( - make_clean_names, object_type="polars", strip_accents=True - ) + pl.col("raw").janitor.clean_names(strip_accents=True) ) assert list(outcome)[0][0] == "abcde_fgi_j" From 799532f7a5592498e5c8ce5a535d3f3083f6fec9 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 20:54:05 +1000 Subject: [PATCH 20/46] fix doctests --- janitor/functions/utils.py | 4 +- janitor/polars/__init__.py | 6 +- janitor/polars/functions.py | 61 +++++++++++-------- ...an_names.py => test_clean_names_polars.py} | 0 4 files changed, 41 insertions(+), 30 deletions(-) rename tests/polars/functions/{test_clean_names.py => test_clean_names_polars.py} (100%) diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 4bf0d0eea..4e1f443ee 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -1166,7 +1166,7 @@ def _change_case( def _normalize_1(obj: str) -> str: - """Perform normalization of labels in obj.""" + """Perform normalization of obj.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] for search, replace in FIXES: obj = re.sub(pattern=search, repl=replace, string=obj) @@ -1185,7 +1185,7 @@ def _remove_special( def _strip_accents( obj: str, ) -> str: - """Remove accents from a label. + """Remove accents from obj. Inspired from [StackOverflow][so]. 
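# [editor's aside -- illustration, not part of the patch series]
# After this commit the public entry point is the registered "janitor"
# namespace; the updated tests above reduce to calls like this:
import polars as pl

import janitor.polars  # noqa: F401  -- registers the DataFrame/Expr namespaces

df = pl.DataFrame({"Bell Chart": [1, 2], "Animals@#$%^": [3, 4]})
assert df.janitor.clean_names(remove_special=True).columns == ["bell_chart", "animals"]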
diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 843002a5b..d8c8bf3df 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -2,7 +2,7 @@ from janitor.utils import import_message -from .functions import _clean_names +from .functions import _clean_column_names, _clean_expr_names try: import polars as pl @@ -66,7 +66,7 @@ def clean_names( └───────┴────────────┴─────────┘ """ return self._df.rename( - lambda col: _clean_names( + lambda col: _clean_column_names( obj=col, strip_accents=strip_accents, strip_underscores=strip_underscores, @@ -120,7 +120,7 @@ def clean_names( │ abcde_fgi_j │ └─────────────┘ """ - return _clean_names( + return _clean_expr_names( obj=self._expr, strip_accents=strip_accents, strip_underscores=strip_underscores, diff --git a/janitor/polars/functions.py b/janitor/polars/functions.py index c180ccd85..4322fe79c 100644 --- a/janitor/polars/functions.py +++ b/janitor/polars/functions.py @@ -1,4 +1,4 @@ -"""General purpose data cleaning functions for pyspark.""" +"""functions for polars.""" import re import unicodedata @@ -29,7 +29,7 @@ def _change_case_expr( obj: pl.Expr, case_type: str, ) -> pl.Expr: - """Change case of obj.""" + """Change case of labels in obj.""" case_types = {"preserve", "upper", "lower", "snake"} case_type = case_type.lower() if case_type not in case_types: @@ -65,7 +65,7 @@ def _normalize_expr(obj: pl.Expr) -> pl.Expr: def _remove_special_expr( obj: pl.Expr, ) -> pl.Expr: - """Remove special characters from obj.""" + """Remove special characters from the labels in obj.""" return obj.str.replace_all( pattern="[^A-Za-z_\\d]", value="", literal=False ).str.strip_chars() @@ -74,7 +74,7 @@ def _remove_special_expr( def _strip_accents_expr( obj: pl.Expr, ) -> pl.Expr: - """Remove accents from a label. + """Remove accents from the labels in obj. Inspired from [StackOverflow][so]. @@ -112,8 +112,8 @@ def _strip_underscores_func_expr( return obj -def _clean_names( - obj: Union[str, pl.Expr], +def _clean_column_names( + obj: str, strip_underscores: Optional[Union[str, bool]] = None, case_type: str = "lower", remove_special: bool = False, @@ -122,26 +122,37 @@ def _clean_names( truncate_limit: int = None, ) -> str: """ - Generic function to clean labels. - Applies either to the columns of a polars DataFrame, - or a polars Expression. + Function to clean the column names of a polars DataFrame. + """ + if enforce_string: + obj = str(obj) + obj = _change_case(obj=obj, case_type=case_type) + obj = _normalize_1(obj=obj) + if remove_special: + obj = _remove_special(obj=obj) + if strip_accents: + obj = _strip_accents(obj=obj) + obj = re.sub(pattern="_+", repl="_", string=obj) + obj = _strip_underscores_func( + obj, + strip_underscores=strip_underscores, + ) + obj = obj[:truncate_limit] + return obj + + +def _clean_expr_names( + obj: pl.Expr, + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> pl.Expr: + """ + Function to clean the labels of a polars Expression. 
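# [editor's aside -- illustration, not part of the patch series]
# Because `_clean_expr_names` builds a polars expression, value cleaning also
# composes with lazy execution -- a sketch, assuming the namespaces registered
# in __init__.py above:
import polars as pl

import janitor.polars  # noqa: F401

lf = pl.LazyFrame({"raw": ["Bell Chart", "Animals@#$%^"]})
cleaned = lf.with_columns(
    pl.col("raw").janitor.clean_names(remove_special=True)
).collect()
# cleaned["raw"] -> ["bell_chart", "animals"]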
""" - if isinstance(obj, str): - if enforce_string: - obj = str(obj) - obj = _change_case(obj=obj, case_type=case_type) - obj = _normalize_1(obj=obj) - if remove_special: - obj = _remove_special(obj=obj) - if strip_accents: - obj = _strip_accents(obj=obj) - obj = re.sub(pattern="_+", repl="_", string=obj) - obj = _strip_underscores_func( - obj, - strip_underscores=strip_underscores, - ) - obj = obj[:truncate_limit] - return obj if enforce_string: obj = obj.cast(pl.Utf8) obj = _change_case_expr(obj=obj, case_type=case_type) diff --git a/tests/polars/functions/test_clean_names.py b/tests/polars/functions/test_clean_names_polars.py similarity index 100% rename from tests/polars/functions/test_clean_names.py rename to tests/polars/functions/test_clean_names_polars.py From dbce4b934fcb2a0e82899358c11d03700a0694b5 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:16:32 +1000 Subject: [PATCH 21/46] fix tests; add polars to documentation --- janitor/polars/__init__.py | 2 +- mkdocs.yml | 1 + mkdocs/api/polars.md | 3 +++ pyproject.toml | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 mkdocs/api/polars.md diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index d8c8bf3df..6aa5fcc64 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -16,7 +16,7 @@ @pl.api.register_dataframe_namespace("janitor") -class Frame: +class PolarsFrame: def __init__(self, df: pl.DataFrame) -> pl.DataFrame: self._df = df diff --git a/mkdocs.yml b/mkdocs.yml index 639d71bea..a7545afc5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,6 +45,7 @@ nav: - Machine Learning: api/ml.md - Math: api/math.md # - PySpark: api/pyspark.md # will be added back later + - Polars: api/polars.md - Timeseries: api/timeseries.md - XArray: api/xarray.md - Development Guide: devguide.md diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md new file mode 100644 index 000000000..db5b5d14f --- /dev/null +++ b/mkdocs/api/polars.md @@ -0,0 +1,3 @@ +# Polars + +::: janitor.polars diff --git a/pyproject.toml b/pyproject.toml index f6b98f54b..52dc3f172 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 10 ignore-init-method = true ignore-init-module = true ignore-module = false From 1c642e6ba49ab1ea69b3145f6f492909f1876496 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:16:40 +1000 Subject: [PATCH 22/46] fix tests; add polars to documentation --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 52dc3f172..f6b98f54b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 10 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 407d21b90314bd51cd16537fb6197e99b36fa7ce Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:23:45 +1000 Subject: [PATCH 23/46] import janitor.polars --- pyproject.toml | 1 + tests/polars/functions/test_clean_names_polars.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f6b98f54b..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ markers = [ "utils: utility tests", "engineering: tests for engineering", "ml: tests for 
machine learning", + "polars: tests for polars methods", "spark_functions: tests for pyspark functions", "xarray: tests for xarray functions", "timeseries: tests for timeseries", diff --git a/tests/polars/functions/test_clean_names_polars.py b/tests/polars/functions/test_clean_names_polars.py index 5ed77c8e8..23ce38742 100644 --- a/tests/polars/functions/test_clean_names_polars.py +++ b/tests/polars/functions/test_clean_names_polars.py @@ -1,6 +1,8 @@ import polars as pl import pytest +from janitor import polars # noqa: F401 + @pytest.mark.functions def test_clean_names_method_chain(dataframe): From aedfc65c7e23fba3dd967fd7ac29ed1c95f6d52d Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:49:44 +1000 Subject: [PATCH 24/46] control docs output for polars submodule --- mkdocs/api/polars.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index db5b5d14f..905d9ed56 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -1,3 +1,19 @@ -# Polars +# PolarsExpr -::: janitor.polars +::: janitor.polars.PolarsExpr + handler: python + options: + members: + - clean_names + show_root_heading: false + show_source: true + +# PolarsFrame + +::: janitor.polars.PolarsFrame + handler: python + options: + members: + - clean_names + show_root_heading: false + show_source: true From db9b48649f7c89dcb2ea85b32769320b4e433f12 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:58:47 +1000 Subject: [PATCH 25/46] exclude functions in docs rendering --- mkdocs/api/polars.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index 905d9ed56..4a6c92f09 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -7,6 +7,7 @@ - clean_names show_root_heading: false show_source: true + show_submodules: true # PolarsFrame @@ -17,3 +18,4 @@ - clean_names show_root_heading: false show_source: true + show_submodules: true From 6a91e673bbc6c274bb8c8aa7c7811272f11196ae Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 21:59:29 +1000 Subject: [PATCH 26/46] exclude functions in docs rendering --- mkdocs/api/polars.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index 4a6c92f09..e6dba459d 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -7,7 +7,7 @@ - clean_names show_root_heading: false show_source: true - show_submodules: true + show_submodules: false # PolarsFrame @@ -18,4 +18,4 @@ - clean_names show_root_heading: false show_source: true - show_submodules: true + show_submodules: false From 7a8807855bb13ec7663bb69004e57de8ba941f91 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 22:03:42 +1000 Subject: [PATCH 27/46] show_submodules=true --- mkdocs/api/polars.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index e6dba459d..905d9ed56 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -7,7 +7,6 @@ - clean_names show_root_heading: false show_source: true - show_submodules: false # PolarsFrame @@ -18,4 +17,3 @@ - clean_names show_root_heading: false show_source: true - show_submodules: false From 6d7885e9952e5c4395e34fc3eb0a027b5fbd4665 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 22:45:31 +1000 Subject: [PATCH 28/46] fix docstring rendering for polars --- janitor/polars/__init__.py | 49 ++++++++++++++++++++++++++++++++++--- 
janitor/polars/functions.py | 3 --- mkdocs/api/polars.md | 20 +++------------ 3 files changed, 50 insertions(+), 22 deletions(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 6aa5fcc64..9d2ca41de 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -26,7 +26,6 @@ def clean_names( case_type: str = "lower", remove_special: bool = False, strip_accents: bool = False, - enforce_string: bool = False, truncate_limit: int = None, ) -> pl.DataFrame: """ @@ -64,7 +63,28 @@ def clean_names( │ 1 ┆ 1 ┆ 1 │ │ 2 ┆ 2 ┆ 2 │ └───────┴────────────┴─────────┘ - """ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores from all + column names. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the column names lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the column names. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the labels. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. + Returns: + A polars DataFrame. + """ # noqa: E501 return self._df.rename( lambda col: _clean_column_names( obj=col, @@ -72,7 +92,6 @@ def clean_names( strip_underscores=strip_underscores, case_type=case_type, remove_special=remove_special, - enforce_string=enforce_string, truncate_limit=truncate_limit, ) ) @@ -119,6 +138,30 @@ def clean_names( ╞═════════════╡ │ abcde_fgi_j │ └─────────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores + from all labels in the Expression. + Default None keeps outer underscores. + Values can be either 'left', 'right' + or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the labels in the expression lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the values in the expression. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the expression. + enforce_string: Whether or not to cast the expression to a string type. + truncate_limit: Truncates formatted labels in the expression to + the specified length. Default None does not truncate. + Returns: + A polars Expression. """ return _clean_expr_names( obj=self._expr, diff --git a/janitor/polars/functions.py b/janitor/polars/functions.py index 4322fe79c..31e6106d5 100644 --- a/janitor/polars/functions.py +++ b/janitor/polars/functions.py @@ -118,14 +118,11 @@ def _clean_column_names( case_type: str = "lower", remove_special: bool = False, strip_accents: bool = False, - enforce_string: bool = False, truncate_limit: int = None, ) -> str: """ Function to clean the column names of a polars DataFrame. 
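# [editor's aside -- illustration, not part of the patch series]
# `_clean_column_names` is a plain str -> str helper, so it can be exercised
# without building a DataFrame; the input is borrowed from test_charac in the
# test suite (module path as of this commit, before the later rename):
from janitor.polars.functions import _clean_column_names

cleaned = _clean_column_names(
    obj="Current accountbalance(in % of GDP)",
    strip_underscores=True,
    case_type="lower",
)
assert cleaned == "current_accountbalance_in_%_of_gdp"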
""" - if enforce_string: - obj = str(obj) obj = _change_case(obj=obj, case_type=case_type) obj = _normalize_1(obj=obj) if remove_special: diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index 905d9ed56..a2cbd574c 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -1,19 +1,7 @@ -# PolarsExpr +# Polars -::: janitor.polars.PolarsExpr - handler: python +::: janitor.polars options: members: - - clean_names - show_root_heading: false - show_source: true - -# PolarsFrame - -::: janitor.polars.PolarsFrame - handler: python - options: - members: - - clean_names - show_root_heading: false - show_source: true + - PolarsExpr + - PolarsFrame From 944fa0215a8c9a7bb3c50b78ea96627234bb8271 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 21 Apr 2024 23:05:03 +1000 Subject: [PATCH 29/46] Expression -> expression --- janitor/functions/clean_names.py | 2 +- janitor/polars/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index 7eb2a7538..a38753fa8 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -181,7 +181,7 @@ def _clean_names( """ Generic function to clean labels in a pandas object. """ - if enforce_string and not (_is_str_or_cat(obj)): + if enforce_string and not _is_str_or_cat(obj): obj = obj.astype(str) obj = _change_case(obj=obj, case_type=case_type) obj = _normalize_1(obj=obj) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 9d2ca41de..5637575f5 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -143,7 +143,7 @@ def clean_names( Args: strip_underscores: Removes the outer underscores - from all labels in the Expression. + from all labels in the expression. Default None keeps outer underscores. 
Values can be either 'left', 'right' or 'both' or the respective shorthand 'l', From e9c370a50face684f2eb24db857090ff1f72deda Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 23 Apr 2024 13:31:54 +1000 Subject: [PATCH 30/46] rename functions.py --- janitor/polars/__init__.py | 2 +- janitor/polars/{functions.py => clean_names.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename janitor/polars/{functions.py => clean_names.py} (99%) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 5637575f5..449651504 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -2,7 +2,7 @@ from janitor.utils import import_message -from .functions import _clean_column_names, _clean_expr_names +from .clean_names import _clean_column_names, _clean_expr_names try: import polars as pl diff --git a/janitor/polars/functions.py b/janitor/polars/clean_names.py similarity index 99% rename from janitor/polars/functions.py rename to janitor/polars/clean_names.py index 31e6106d5..3226c9d33 100644 --- a/janitor/polars/functions.py +++ b/janitor/polars/clean_names.py @@ -1,4 +1,4 @@ -"""functions for polars.""" +"""clean_names implementation for polars.""" import re import unicodedata From ee66d2ae7659f7be6e80668fc0aa3264afbac6eb Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 29 Apr 2024 22:56:46 +1000 Subject: [PATCH 31/46] pivot_longer implemented for polars --- janitor/functions/pivot.py | 14 +- janitor/polars/__init__.py | 433 ++++++--- janitor/polars/clean_names.py | 168 ---- janitor/polars/pivot_longer.py | 669 +++++++++++++ mkdocs/api/polars.md | 1 - .../functions/test_clean_names_polars.py | 102 -- .../functions/test_pivot_longer_polars.py | 913 ++++++++++++++++++ 7 files changed, 1887 insertions(+), 413 deletions(-) delete mode 100644 janitor/polars/clean_names.py create mode 100644 janitor/polars/pivot_longer.py delete mode 100644 tests/polars/functions/test_clean_names_polars.py create mode 100644 tests/polars/functions/test_pivot_longer_polars.py diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 7efeba45b..51bc78419 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -98,7 +98,7 @@ def pivot_longer( 6 setosa Petal.Width 0.2 7 virginica Petal.Width 1.8 - Split the column labels into parts: + Split the column labels into individual columns: >>> df.pivot_longer( ... index = 'Species', ... names_to = ('part', 'dimension'), @@ -167,7 +167,7 @@ def pivot_longer( value int64 dtype: object - Use multiple `.value` to reshape dataframe: + Use multiple `.value` to reshape the dataframe: >>> df = pd.DataFrame( ... [ ... { @@ -265,16 +265,6 @@ def pivot_longer( ... "Gin": [16, 200, 34], ... "Vodka": [20, 33, 18], ... }, - ... columns=[ - ... "City", - ... "State", - ... "Name", - ... "Mango", - ... "Orange", - ... "Watermelon", - ... "Gin", - ... "Vodka", - ... ], ... 
)
        >>> df
              City    State      Name  Mango  Orange  Watermelon  Gin  Vodka
diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py
index 449651504..59b15ff72 100644
--- a/janitor/polars/__init__.py
+++ b/janitor/polars/__init__.py
@@ -1,8 +1,10 @@
-from typing import Optional, Union
+from typing import Any, Optional, Sequence, Union
+
+from polars.type_aliases import ColumnNameOrSelector

 from janitor.utils import import_message

-from .clean_names import _clean_column_names, _clean_expr_names
+from .pivot_longer import _pivot_longer

 try:
     import polars as pl
@@ -20,155 +22,326 @@ class PolarsFrame:
     def __init__(self, df: pl.DataFrame) -> pl.DataFrame:
         self._df = df

-    def clean_names(
+    def pivot_longer(
         self,
-        strip_underscores: Optional[Union[str, bool]] = None,
-        case_type: str = "lower",
-        remove_special: bool = False,
-        strip_accents: bool = False,
-        truncate_limit: int = None,
+        index: Union[
+            ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None
+        ] = None,
+        column_names: Union[
+            ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None
+        ] = None,
+        names_to: Optional[Union[list, tuple, str]] = "variable",
+        values_to: Optional[Union[list, tuple, str]] = "value",
+        names_sep: Optional[Union[str, None]] = None,
+        names_pattern: Optional[Union[list, tuple, str, None]] = None,
+        names_transform: Optional[Any] = pl.Utf8,
     ) -> pl.DataFrame:
         """
-        Clean the column names in a polars DataFrame.
+        Unpivots a DataFrame from *wide* to *long* format.
+
+        It is modeled after the `pivot_longer` function in R's tidyr package,
+        and also takes inspiration from the `melt` function in R's data.table package.
+
+        This function is useful to massage a DataFrame into a format where
+        one or more columns are considered measured variables, and all other
+        columns are considered as identifier variables.
+
+        All measured variables are *unpivoted* (and typically duplicated) along the
+        row axis.

        Examples:
            >>> import polars as pl
+            >>> import polars.selectors as cs
            >>> import janitor.polars
            >>> df = pl.DataFrame(
            ...     {
-            ...         "Aloha": range(3),
-            ...         "Bell Chart": range(3),
-            ...         "Animals@#$%^": range(3)
+            ...         "Sepal.Length": [5.1, 5.9],
+            ...         "Sepal.Width": [3.5, 3.0],
+            ...         "Petal.Length": [1.4, 5.1],
+            ...         "Petal.Width": [0.2, 1.8],
+            ...         "Species": ["setosa", "virginica"],
            ...     }
            ... )
            >>> df
-            shape: (3, 3)
-            ┌───────┬────────────┬──────────────┐
-            │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
-            │ --- ┆ --- ┆ --- │
-            │ i64 ┆ i64 ┆ i64 │
-            ╞═══════╪════════════╪══════════════╡
-            │ 0 ┆ 0 ┆ 0 │
-            │ 1 ┆ 1 ┆ 1 │
-            │ 2 ┆ 2 ┆ 2 │
-            └───────┴────────────┴──────────────┘
-            >>> df.janitor.clean_names(remove_special=True)
-            shape: (3, 3)
-            ┌───────┬────────────┬─────────┐
-            │ aloha ┆ bell_chart ┆ animals │
-            │ --- ┆ --- ┆ --- │
-            │ i64 ┆ i64 ┆ i64 │
-            ╞═══════╪════════════╪═════════╡
-            │ 0 ┆ 0 ┆ 0 │
-            │ 1 ┆ 1 ┆ 1 │
-            │ 2 ┆ 2 ┆ 2 │
-            └───────┴────────────┴─────────┘
-
-        !!! 
info "New in version 0.28.0" + Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): + >>> df.janitor.pivot_longer(index = 'Species') + shape: (8, 3) + ┌───────────┬──────────────┬───────┐ + │ Species ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Sepal.Length ┆ 5.1 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ setosa ┆ Sepal.Width ┆ 3.5 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ virginica ┆ Petal.Length ┆ 5.1 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ + │ virginica ┆ Petal.Width ┆ 1.8 │ + └───────────┴──────────────┴───────┘ - Args: - strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the column names lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the column names. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. - Returns: - A polars DataFrame. - """ # noqa: E501 - return self._df.rename( - lambda col: _clean_column_names( - obj=col, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - truncate_limit=truncate_limit, - ) - ) + Split the column labels into individual columns: + >>> df.janitor.pivot_longer( + ... index = 'Species', + ... names_to = ('part', 'dimension'), + ... names_sep = '.', + ... ) + shape: (8, 4) + ┌───────────┬───────┬───────────┬───────┐ + │ Species ┆ part ┆ dimension ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ f64 │ + ╞═══════════╪═══════╪═══════════╪═══════╡ + │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ + │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ + │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ + │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ + │ setosa ┆ Petal ┆ Length ┆ 1.4 │ + │ virginica ┆ Petal ┆ Length ┆ 5.1 │ + │ setosa ┆ Petal ┆ Width ┆ 0.2 │ + │ virginica ┆ Petal ┆ Width ┆ 1.8 │ + └───────────┴───────┴───────────┴───────┘ + Retain parts of the column names as headers: + >>> df.janitor.pivot_longer( + ... index = 'Species', + ... names_to = ('part', '.value'), + ... names_sep = '.', + ... 
) + shape: (4, 4) + ┌───────────┬───────┬────────┬───────┐ + │ Species ┆ part ┆ Length ┆ Width │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 ┆ f64 │ + ╞═══════════╪═══════╪════════╪═══════╡ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ + │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + └───────────┴───────┴────────┴───────┘ -@pl.api.register_expr_namespace("janitor") -class PolarsExpr: - def __init__(self, expr: pl.Expr) -> pl.Expr: - self._expr = expr + Split the column labels based on regex: + >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) + >>> df + shape: (1, 3) + ┌─────┬──────────────┬────────────┐ + │ id ┆ new_sp_m5564 ┆ newrel_f65 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════════╪════════════╡ + │ 1 ┆ 2 ┆ 3 │ + └─────┴──────────────┴────────────┘ + >>> df.janitor.pivot_longer( + ... index = 'id', + ... names_to = ('diagnosis', 'gender', 'age'), + ... names_pattern = r"new_?(.+)_(.)(\\d+)", + ... ) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ str ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + └─────┴───────────┴────────┴──────┴───────┘ - def clean_names( - self, - strip_underscores: Optional[Union[str, bool]] = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, - ) -> pl.Expr: - """ - Clean the labels in a polars Expression. + Convert the dtypes of specific columns with `names_transform`: + >>> ( + ... df.janitor.pivot_longer( + ... index="id", + ... names_to=("diagnosis", "gender", "age"), + ... names_pattern=r"new_?(.+)_(.)(\\d+)", + ... names_transform={"age": pl.Int32}, + ... ) + ... ) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + └─────┴───────────┴────────┴──────┴───────┘ - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + Use multiple `.value` to reshape the dataframe: + >>> df = pl.DataFrame( + ... [ + ... { + ... "x_1_mean": 10, + ... "x_2_mean": 20, + ... "y_1_mean": 30, + ... "y_2_mean": 40, + ... "unit": 50, + ... } + ... ] + ... ) >>> df - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ Abçdê fgí j │ - └─────────────┘ - - Clean the column values: - >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True)) - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ abcde_fgi_j │ - └─────────────┘ + shape: (1, 5) + ┌──────────┬──────────┬──────────┬──────────┬──────┐ + │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════════╪══════════╪══════════╪══════════╪══════╡ + │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ + └──────────┴──────────┴──────────┴──────────┴──────┘ + >>> df.janitor.pivot_longer( + ... index="unit", + ... names_to=(".value", "time", ".value"), + ... names_pattern=r"(x|y)_([0-9])(_mean)", + ... 
)
+            shape: (2, 4)
+            ┌──────┬──────┬────────┬────────┐
+            │ unit ┆ time ┆ x_mean ┆ y_mean │
+            │ --- ┆ --- ┆ --- ┆ --- │
+            │ i64 ┆ str ┆ i64 ┆ i64 │
+            ╞══════╪══════╪════════╪════════╡
+            │ 50 ┆ 1 ┆ 10 ┆ 30 │
+            │ 50 ┆ 2 ┆ 20 ┆ 40 │
+            └──────┴──────┴────────┴────────┘
+
+            Reshape the dataframe by passing a sequence to `names_pattern`:
+            >>> df = pl.DataFrame({'hr1': [514, 573],
+            ...                    'hr2': [545, 526],
+            ...                    'team': ['Red Sox', 'Yankees'],
+            ...                    'year1': [2007, 2007],
+            ...                    'year2': [2008, 2008]})
+            >>> df
+            shape: (2, 5)
+            ┌─────┬─────┬─────────┬───────┬───────┐
+            │ hr1 ┆ hr2 ┆ team ┆ year1 ┆ year2 │
+            │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+            │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │
+            ╞═════╪═════╪═════════╪═══════╪═══════╡
+            │ 514 ┆ 545 ┆ Red Sox ┆ 2007 ┆ 2008 │
+            │ 573 ┆ 526 ┆ Yankees ┆ 2007 ┆ 2008 │
+            └─────┴─────┴─────────┴───────┴───────┘
+            >>> df.janitor.pivot_longer(
+            ...     index = 'team',
+            ...     names_to = ['year', 'hr'],
+            ...     names_pattern = ['year', 'hr']
+            ... )
+            shape: (4, 3)
+            ┌─────────┬─────┬──────┐
+            │ team ┆ hr ┆ year │
+            │ --- ┆ --- ┆ --- │
+            │ str ┆ i64 ┆ i64 │
+            ╞═════════╪═════╪══════╡
+            │ Red Sox ┆ 514 ┆ 2007 │
+            │ Yankees ┆ 573 ┆ 2007 │
+            │ Red Sox ┆ 545 ┆ 2008 │
+            │ Yankees ┆ 526 ┆ 2008 │
+            └─────────┴─────┴──────┘
+
+            Multiple `values_to`:
+            >>> df = pl.DataFrame(
+            ...     {
+            ...         "City": ["Houston", "Austin", "Hoover"],
+            ...         "State": ["Texas", "Texas", "Alabama"],
+            ...         "Name": ["Aria", "Penelope", "Niko"],
+            ...         "Mango": [4, 10, 90],
+            ...         "Orange": [10, 8, 14],
+            ...         "Watermelon": [40, 99, 43],
+            ...         "Gin": [16, 200, 34],
+            ...         "Vodka": [20, 33, 18],
+            ...     },
+            ... )
+            >>> df
+            shape: (3, 8)
+            ┌─────────┬─────────┬──────────┬───────┬────────┬────────────┬─────┬───────┐
+            │ City ┆ State ┆ Name ┆ Mango ┆ Orange ┆ Watermelon ┆ Gin ┆ Vodka │
+            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+            │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
+            ╞═════════╪═════════╪══════════╪═══════╪════════╪════════════╪═════╪═══════╡
+            │ Houston ┆ Texas ┆ Aria ┆ 4 ┆ 10 ┆ 40 ┆ 16 ┆ 20 │
+            │ Austin ┆ Texas ┆ Penelope ┆ 10 ┆ 8 ┆ 99 ┆ 200 ┆ 33 │
+            │ Hoover ┆ Alabama ┆ Niko ┆ 90 ┆ 14 ┆ 43 ┆ 34 ┆ 18 │
+            └─────────┴─────────┴──────────┴───────┴────────┴────────────┴─────┴───────┘
+
+            >>> df.janitor.pivot_longer(
+            ...     index=["City", "State"],
+            ...     column_names=cs.numeric(),
+            ...     names_to=("Fruit", "Drink"),
+            ...     values_to=("Pounds", "Ounces"),
+            ...     names_pattern=["M|O|W", "G|V"],
+            ... )
+            shape: (9, 6)
+            ┌─────────┬─────────┬────────────┬────────┬───────┬────────┐
+            │ City ┆ State ┆ Fruit ┆ Pounds ┆ Drink ┆ Ounces │
+            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
+            │ str ┆ str ┆ str ┆ i64 ┆ str ┆ i64 │
+            ╞═════════╪═════════╪════════════╪════════╪═══════╪════════╡
+            │ Houston ┆ Texas ┆ Mango ┆ 4 ┆ Gin ┆ 16 │
+            │ Austin ┆ Texas ┆ Mango ┆ 10 ┆ Gin ┆ 200 │
+            │ Hoover ┆ Alabama ┆ Mango ┆ 90 ┆ Gin ┆ 34 │
+            │ Houston ┆ Texas ┆ Orange ┆ 10 ┆ Vodka ┆ 20 │
+            │ Austin ┆ Texas ┆ Orange ┆ 8 ┆ Vodka ┆ 33 │
+            │ Hoover ┆ Alabama ┆ Orange ┆ 14 ┆ Vodka ┆ 18 │
+            │ Houston ┆ Texas ┆ Watermelon ┆ 40 ┆ null ┆ null │
+            │ Austin ┆ Texas ┆ Watermelon ┆ 99 ┆ null ┆ null │
+            │ Hoover ┆ Alabama ┆ Watermelon ┆ 43 ┆ null ┆ null │
+            └─────────┴─────────┴────────────┴────────┴───────┴────────┘

        !!! info "New in version 0.28.0"

        Args:
-            strip_underscores: Removes the outer underscores
-                from all labels in the expression.
-                Default None keeps outer underscores.
-                Values can be either 'left', 'right'
-                or 'both' or the respective shorthand 'l',
-                'r' and True.
-            case_type: Whether to make the labels in the expression lower or uppercase. 
-                Current case may be preserved with 'preserve',
-                while snake case conversion (from CamelCase or camelCase only)
-                can be turned on using "snake".
-                Default 'lower' makes all characters lowercase.
-            remove_special: Remove special characters from the values in the expression.
-                Only letters, numbers and underscores are preserved.
-            strip_accents: Whether or not to remove accents from
-                the expression.
-            enforce_string: Whether or not to cast the expression to a string type.
-            truncate_limit: Truncates formatted labels in the expression to
-                the specified length. Default None does not truncate.
+            index: Column(s) or selector(s) to use as identifier variables.
+            column_names: Column(s) or selector(s) to unpivot.
+            names_to: Name of new column as a string that will contain
+                what were previously the column names in `column_names`.
+                The default is `variable` if no value is provided. It can
+                also be a list/tuple of strings that will serve as new column
+                names, if `names_sep` or `names_pattern` is provided.
+                If `.value` is in `names_to`, new column names will be extracted
+                from part of the existing column names, and `values_to` is overridden.
+            values_to: Name of new column as a string that will contain what
+                were previously the values of the columns in `column_names`.
+                `values_to` can also be a list/tuple
+                and requires that `names_pattern` is also a list/tuple.
+            names_sep: Determines how the column name is broken up, if
+                `names_to` contains multiple values. It takes the same
+                specification as polars' `str.split` method.
+            names_pattern: Determines how the column name is broken up.
+                It can be a regular expression containing matching groups.
+                It takes the same
+                specification as polars' `str.extract_groups` method.
+                `names_pattern` can also be a list/tuple of regular expressions.
+                It can also be a list/tuple of strings;
+                the strings will be treated as regular expressions.
+                Under the hood it is processed with polars' `str.contains` function.
+                For a list/tuple of regular expressions,
+                `names_to` must also be a list/tuple and the lengths of both
+                arguments must match.
+            names_transform: Use this option to change the types of columns that
+                have been transformed to rows.
+                This does not apply to the value columns.
+                It can be a single valid polars dtype,
+                or a dictionary pairing the new column names
+                with a valid polars dtype.
+                Applicable only if one of names_sep
+                or names_pattern is provided.

        Returns:
-            A polars Expression.
-        """
-        return _clean_expr_names(
-            obj=self._expr,
-            strip_accents=strip_accents,
-            strip_underscores=strip_underscores,
-            case_type=case_type,
-            remove_special=remove_special,
-            enforce_string=enforce_string,
-            truncate_limit=truncate_limit,
+            A polars DataFrame that has been unpivoted from wide to long
+            format. 
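A minimal end-to-end sketch of the API documented above; it assumes the `janitor` namespace registration added in this patch, and the column names are invented for illustration:

import polars as pl
import janitor.polars  # noqa: F401 -- registers the DataFrame "janitor" namespace

df = pl.DataFrame(
    {"id": [1, 2], "score_2012": [3.0, 4.0], "score_2016": [5.0, 6.0]}
)
out = df.janitor.pivot_longer(
    index="id",
    names_to=(".value", "year"),
    names_sep="_",
    names_transform={"year": pl.Int32},
)
# Expected: shape (4, 3) with columns id, year (cast to Int32) and score.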
+ """ # noqa: E501 + return _pivot_longer( + df=self._df, + index=index, + column_names=column_names, + names_pattern=names_pattern, + names_sep=names_sep, + names_to=names_to, + values_to=values_to, + names_transform=names_transform, ) diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py deleted file mode 100644 index 3226c9d33..000000000 --- a/janitor/polars/clean_names.py +++ /dev/null @@ -1,168 +0,0 @@ -"""clean_names implementation for polars.""" - -import re -import unicodedata -from typing import Optional, Union - -from janitor.errors import JanitorError -from janitor.functions.utils import ( - _change_case, - _normalize_1, - _remove_special, - _strip_accents, - _strip_underscores_func, -) -from janitor.utils import import_message - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -def _change_case_expr( - obj: pl.Expr, - case_type: str, -) -> pl.Expr: - """Change case of labels in obj.""" - case_types = {"preserve", "upper", "lower", "snake"} - case_type = case_type.lower() - if case_type not in case_types: - raise JanitorError(f"type must be one of: {case_types}") - - if case_type == "preserve": - return obj - if case_type == "upper": - return obj.str.to_uppercase() - if case_type == "lower": - return obj.str.to_lowercase() - # Implementation taken from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - return ( - obj.str.replace_all( - pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False - ) - .str.replace_all( - pattern=r"([a-z0-9])([A-Z])", value=r"${1}_${2}", literal=False - ) - .str.to_lowercase() - ) - - -def _normalize_expr(obj: pl.Expr) -> pl.Expr: - """Perform normalization of labels in obj.""" - FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] - for search, replace in FIXES: - obj = obj.str.replace_all(pattern=search, value=replace, literal=False) - return obj - - -def _remove_special_expr( - obj: pl.Expr, -) -> pl.Expr: - """Remove special characters from the labels in obj.""" - return obj.str.replace_all( - pattern="[^A-Za-z_\\d]", value="", literal=False - ).str.strip_chars() - - -def _strip_accents_expr( - obj: pl.Expr, -) -> pl.Expr: - """Remove accents from the labels in obj. - - Inspired from [StackOverflow][so]. - - [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin - """ # noqa: E501 - # TODO: possible implementation in Rust - # or use a pyarrow implementation? 
- # https://github.com/pola-rs/polars/issues/11455 - return obj.map_elements( - lambda word: [ - letter - for letter in unicodedata.normalize("NFD", word) - if not unicodedata.combining(letter) - ], - return_dtype=pl.List(pl.Utf8), - ).list.join("") - - -def _strip_underscores_func_expr( - obj: pl.Expr, - strip_underscores: Union[str, bool] = None, -) -> pl.Expr: - """Strip underscores from obj.""" - underscore_options = {None, "left", "right", "both", "l", "r", True} - if strip_underscores not in underscore_options: - raise JanitorError( - f"strip_underscores must be one of: {underscore_options}" - ) - if strip_underscores in {"left", "l"}: - return obj.str.strip_chars_start("_") - if strip_underscores in {"right", "r"}: - return obj.str.strip_chars_end("_") - if strip_underscores in {True, "both"}: - return obj.str.strip_chars("_") - return obj - - -def _clean_column_names( - obj: str, - strip_underscores: Optional[Union[str, bool]] = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, -) -> str: - """ - Function to clean the column names of a polars DataFrame. - """ - obj = _change_case(obj=obj, case_type=case_type) - obj = _normalize_1(obj=obj) - if remove_special: - obj = _remove_special(obj=obj) - if strip_accents: - obj = _strip_accents(obj=obj) - obj = re.sub(pattern="_+", repl="_", string=obj) - obj = _strip_underscores_func( - obj, - strip_underscores=strip_underscores, - ) - obj = obj[:truncate_limit] - return obj - - -def _clean_expr_names( - obj: pl.Expr, - strip_underscores: Optional[Union[str, bool]] = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, -) -> pl.Expr: - """ - Function to clean the labels of a polars Expression. 
- """ - if enforce_string: - obj = obj.cast(pl.Utf8) - obj = _change_case_expr(obj=obj, case_type=case_type) - obj = _normalize_expr(obj=obj) - if remove_special: - obj = _remove_special_expr(obj=obj) - if strip_accents: - obj = _strip_accents_expr(obj=obj) - obj = obj.str.replace(pattern="_+", value="_", literal=False) - obj = _strip_underscores_func_expr( - obj, - strip_underscores=strip_underscores, - ) - if truncate_limit: - obj = obj.str.slice(offset=0, length=truncate_limit) - return obj diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py new file mode 100644 index 000000000..9bb94000b --- /dev/null +++ b/janitor/polars/pivot_longer.py @@ -0,0 +1,669 @@ +"""pivot_longer implementation for polars.""" + +from collections import defaultdict +from itertools import chain +from typing import Any, Mapping, Optional, Pattern, Sequence, Union + +from janitor.utils import check, import_message + +try: + import polars as pl + import polars.selectors as cs + from polars.datatypes.classes import DataTypeClass + from polars.type_aliases import ColumnNameOrSelector, PolarsDataType +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +def _pivot_longer( + df: pl.DataFrame, + index: Union[ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None], + column_names: Union[ + ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None + ], + names_to: Optional[Union[list, str]], + values_to: Optional[str], + names_sep: Optional[Union[str, Pattern, None]], + names_pattern: Optional[Union[list, tuple, str, Pattern, None]], + names_transform: Optional[Union[PolarsDataType, dict]], +) -> pl.DataFrame: + + ( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, + names_transform, + ) = _data_checks_pivot_longer( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + values_to=values_to, + names_sep=names_sep, + names_pattern=names_pattern, + names_transform=names_transform, + ) + + if not column_names: + return df + + if all((names_pattern is None, names_sep is None)): + return df.melt( + id_vars=index, + value_vars=column_names, + variable_name=names_to[0], + value_name=values_to, + ) + + if names_sep is not None: + return _pivot_longer_names_sep( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + names_sep=names_sep, + values_to=values_to, + names_transform=names_transform, + ) + + if isinstance(names_pattern, (str, Pattern)): + return _pivot_longer_names_pattern_str( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + names_pattern=names_pattern, + values_to=values_to, + names_transform=names_transform, + ) + if isinstance(values_to, (list, tuple)): + return _pivot_longer_values_to_sequence( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + names_pattern=names_pattern, + values_to=values_to, + names_transform=names_transform, + ) + + return _pivot_longer_names_pattern_sequence( + df=df, + index=index, + column_names=column_names, + names_to=names_to, + names_pattern=names_pattern, + ) + + +def _pivot_longer_names_sep( + df: pl.DataFrame, + index: Sequence, + column_names: Sequence, + names_to: Sequence, + names_sep: str, + values_to: str, + names_transform: dict, +) -> pl.DataFrame: + """ + This takes care of pivoting scenarios where + names_sep is provided. 
+ """ + + columns = df.select(column_names).columns + outcome = ( + pl.Series(columns) + .str.split(by=names_sep) + .list.to_struct(n_field_strategy="max_width") + ) + len_outcome = len(outcome.struct.fields) + len_names_to = len(names_to) + if len_names_to != len_outcome: + raise ValueError( + "The length of names_to does not match " + "the number of fields extracted. " + f"The length of names_to is {len_names_to} " + "while the number of fields extracted is " + f"{len_outcome}." + ) + + if ".value" not in names_to: + outcome = outcome.struct.rename_fields(names_to) + return _pivot_longer_no_dot_value( + df=df, + outcome=outcome, + values_to=values_to, + index=index, + columns=columns, + names_to=names_to, + names_transform=names_transform, + ) + if all(label == ".value" for label in names_to): + return _pivot_longer_dot_value_only( + df=df, + names_to=names_to, + columns=columns, + index=index, + outcome=outcome, + ) + return _pivot_longer_dot_value( + df=df, + names_to=names_to, + columns=columns, + index=index, + outcome=outcome, + names_transform=names_transform, + ) + + +def _pivot_longer_names_pattern_str( + df: pl.DataFrame, + index: Union[Sequence, None], + column_names: Union[Sequence, None], + names_to: Sequence, + names_pattern: str, + values_to: str, + names_transform: dict, +) -> pl.DataFrame: + """ + This takes care of pivoting scenarios where + names_pattern is a string. + """ + + columns = df.select(column_names).columns + outcome = pl.Series(columns).str.extract_groups(names_pattern) + len_outcome = len(outcome.struct.fields) + len_names_to = len(names_to) + if len_names_to != len_outcome: + raise ValueError( + f"The length of names_to does not match " + "the number of fields extracted. " + f"The length of names_to is {len_names_to} " + "while the number of fields extracted is " + f"{len_outcome}." + ) + if ".value" not in names_to: + outcome = outcome.struct.rename_fields(names_to) + return _pivot_longer_no_dot_value( + df=df, + outcome=outcome, + values_to=values_to, + index=index, + columns=columns, + names_to=names_to, + names_transform=names_transform, + ) + if all(label == ".value" for label in names_to): + return _pivot_longer_dot_value_only( + df=df, + names_to=names_to, + columns=columns, + index=index, + outcome=outcome, + ) + return _pivot_longer_dot_value( + df=df, + names_to=names_to, + columns=columns, + index=index, + outcome=outcome, + names_transform=names_transform, + ) + + +def _pivot_longer_values_to_sequence( + df: pl.DataFrame, + index: Union[Sequence, None], + column_names: Union[Sequence, None], + names_to: Sequence, + names_pattern: Sequence, + values_to: Sequence, + names_transform: dict, +) -> pl.DataFrame: + """ + This takes care of pivoting scenarios where + values_to is a list/tuple. + """ + columns = df.select(column_names).columns + outcome = pl.DataFrame({"cols": columns}) + expressions = [ + pl.col("cols").str.contains(pattern).alias(f"cols{num}") + for num, pattern in enumerate(names_pattern) + ] + outcome = outcome.with_columns(expressions) + booleans = outcome.select(pl.exclude("cols").any()) + for position in range(len(names_pattern)): + if not booleans.to_series(position).item(): + raise ValueError( + "No match was returned for the regex " + f"at position {position} -> {names_pattern[position]}." 
+ ) + names_booleans = pl + values_booleans = pl + for boolean, repl_name, repl_value in zip( + booleans.columns, names_to, values_to + ): + names_booleans = names_booleans.when(pl.col(boolean)).then( + pl.lit(repl_name) + ) + values_booleans = values_booleans.when(pl.col(boolean)).then( + pl.lit(repl_value) + ) + names_booleans = names_booleans.alias("value") + values_booleans = values_booleans.alias(".value") + filter_expr = pl.col(".value").is_not_null() + cum_expr = pl.col(".value").cum_count().over(".value").sub(1).alias("idx") + outcome = ( + outcome.select(names_booleans, values_booleans, pl.col("cols")) + .filter(filter_expr) + .with_columns(cum_expr) + ) + headers_dict = defaultdict(list) + non_headers_dict = defaultdict(list) + for num, col_name, value_header, name_header in zip( + outcome.get_column("idx"), + outcome.get_column("cols"), + outcome.get_column(".value"), + outcome.get_column("value"), + ): + non_headers_dict[num].append((col_name, name_header)) + headers_dict[num].append((col_name, value_header)) + contents = [] + for key, value in headers_dict.items(): + expression = [] if index is None else [pl.col(index)] + columns_to_select = [ + pl.col(col_name).alias(repl_name) for col_name, repl_name in value + ] + expression.extend(columns_to_select) + columns_to_append = [ + pl.lit(col_name, dtype=names_transform[repl_name]).alias(repl_name) + for col_name, repl_name in non_headers_dict[key] + ] + + contents.append(df.select(expression).with_columns(columns_to_append)) + columns_to_select = [] if not index else list(index) + columns_to_select.extend(chain.from_iterable(zip(names_to, values_to))) + return pl.concat(contents, how="diagonal_relaxed").select( + columns_to_select + ) + + +def _pivot_longer_names_pattern_sequence( + df: pl.DataFrame, + index: Union[Sequence, None], + column_names: Union[Sequence, None], + names_to: Sequence, + names_pattern: Sequence, +) -> pl.DataFrame: + """ + This takes care of pivoting scenarios where + names_pattern is a list/tuple. + """ + columns = df.select(column_names).columns + outcome = pl.DataFrame({"cols": columns}) + expressions = [ + pl.col("cols").str.contains(pattern).alias(f"cols{num}") + for num, pattern in enumerate(names_pattern) + ] + outcome = outcome.with_columns(expressions) + booleans = outcome.select(pl.exclude("cols").any()) + for position in range(len(names_pattern)): + if not booleans.to_series(position).item(): + raise ValueError( + "No match was returned for the regex " + f"at position {position} -> {names_pattern[position]}." 
+ ) + names_booleans = pl + for boolean, repl_name in zip(booleans.columns, names_to): + names_booleans = names_booleans.when(pl.col(boolean)).then( + pl.lit(repl_name) + ) + + names_booleans = names_booleans.alias(".value") + filter_expr = pl.col(".value").is_not_null() + cum_expr = pl.col(".value").cum_count().over(".value").sub(1).alias("idx") + outcome = ( + outcome.select(names_booleans, pl.col("cols")) + .filter(filter_expr) + .with_columns(cum_expr) + ) + headers_dict = defaultdict(list) + for num, col_name, name_header in zip( + outcome.get_column("idx"), + outcome.get_column("cols"), + outcome.get_column(".value"), + ): + headers_dict[num].append((col_name, name_header)) + + contents = [] + for _, value in headers_dict.items(): + expression = [] if index is None else [pl.col(index)] + columns_to_select = [ + pl.col(col_name).alias(repl_name) for col_name, repl_name in value + ] + expression.extend(columns_to_select) + + contents.append(df.select(expression)) + return pl.concat(contents, how="diagonal_relaxed") + + +def _pivot_longer_no_dot_value( + df: pl.DataFrame, + outcome: Mapping, + names_to: Sequence, + values_to: str, + index: Sequence, + columns: Sequence, + names_transform: dict, +): + """ + Reshape the data for scenarios where .value + is not present in names_to, + or names_to is not a list/tuple. + + Returns a DataFrame. + """ + contents = [] + for col_name, mapping in zip(columns, outcome): + expression = ( + [pl.col(col_name)] + if index is None + else [pl.col(index), pl.col(col_name).alias(values_to)] + ) + columns_to_append = [ + pl.lit(label, dtype=names_transform[header]).alias(header) + for header, label in mapping.items() + ] + _frame = df.select(expression).with_columns(columns_to_append) + contents.append(_frame) + columns_to_select = [] if not index else list(index) + columns_to_select.extend(names_to) + columns_to_select.append(values_to) + return pl.concat(contents, how="diagonal_relaxed").select( + pl.col(columns_to_select) + ) + + +def _pivot_longer_dot_value( + df: pl.DataFrame, + names_to: Sequence, + outcome: pl.DataFrame, + index: Sequence, + columns: Sequence, + names_transform: Union[PolarsDataType, dict], +) -> pl.DataFrame: + """ + Pivots the dataframe into the final form, + for scenarios where .value is in names_to. + + Returns a DataFrame. + """ + booleans = outcome.struct.unnest().select(pl.all().is_null().any()) + for position in range(len(names_to)): + if booleans.to_series(position).item(): + raise ValueError( + f"Column labels '{columns[position]}' " + "could not be matched with any of the groups " + "in the provided regex. Kindly provide a regular expression " + "(with the correct groups) that matches all labels in the columns." 
+ ) + if names_to.count(".value") > 1: + cols = outcome.struct.fields + dot_value = [ + cols[num] + for num, label in enumerate(names_to) + if label == ".value" + ] + not_dot_value = [ + pl.col(field_name).alias(repl_name) + for field_name, repl_name in zip(cols, names_to) + if field_name not in dot_value + ] + + outcome = outcome.struct.unnest().select( + pl.concat_str(dot_value).alias(".value"), *not_dot_value + ) + else: + outcome = outcome.struct.rename_fields(names_to).struct.unnest() + idx = "".join(names_to) + not_dot_value = [name for name in names_to if name != ".value"] + outcome = outcome.with_row_index(idx).with_columns( + pl.col(idx).first().over(not_dot_value).rank("dense").sub(1), + pl.struct(not_dot_value), + ) + headers_dict = defaultdict(list) + for num, col_name, repl_name in zip( + outcome.get_column(idx), + columns, + outcome.get_column(".value"), + ): + headers_dict[num].append((col_name, repl_name)) + + non_headers_dict = dict() + outcome = outcome.select(idx, not_dot_value[0]).unique() + + for key, value in zip(outcome.to_series(0), outcome.to_series(1)): + value = [ + pl.lit(stub_name, dtype=names_transform[repl_name]).alias( + repl_name + ) + for repl_name, stub_name in value.items() + ] + non_headers_dict[key] = value + contents = [] + for key, value in headers_dict.items(): + expression = [] if index is None else [pl.col(index)] + columns_to_select = [ + pl.col(col_name).alias(repl_name) for col_name, repl_name in value + ] + expression.extend(columns_to_select) + _frame = df.select(expression).with_columns(non_headers_dict[key]) + contents.append(_frame) + columns_to_select = [] if not index else list(index) + columns_to_select.extend(not_dot_value) + return pl.concat(contents, how="diagonal_relaxed").select( + pl.col(columns_to_select), pl.exclude(columns_to_select) + ) + + +def _pivot_longer_dot_value_only( + df: pl.DataFrame, + names_to: Sequence, + outcome: pl.DataFrame, + index: Sequence, + columns: Sequence, +) -> pl.DataFrame: + """ + Pivots the dataframe into the final form, + for scenarios where only '.value' is present in names_to. + + Returns a DataFrame. + """ + + if names_to.count(".value") > 1: + outcome = outcome.struct.unnest().select( + pl.concat_str(pl.all()).alias(".value") + ) + else: + outcome = outcome.struct.rename_fields(names_to).struct.unnest() + outcome = outcome.with_columns( + pl.col(".value").cum_count().over(".value").sub(1).alias("idx") + ) + headers_dict = defaultdict(list) + for num, col_name, repl_name in zip( + outcome.get_column("idx"), + columns, + outcome.get_column(".value"), + ): + headers_dict[num].append((col_name, repl_name)) + + contents = [] + for _, value in headers_dict.items(): + expression = [] if index is None else [pl.col(index)] + columns_to_select = [ + pl.col(col_name).alias(repl_name) for col_name, repl_name in value + ] + expression.extend(columns_to_select) + contents.append(df.select(expression)) + + return pl.concat(contents, how="diagonal_relaxed") + + +def _data_checks_pivot_longer( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, + names_transform, +) -> tuple: + """ + This function majorly does type checks on the passed arguments. + + This function is executed before proceeding to the computation phase. + + Type annotations are not provided because this function is where type + checking happens. 
+ """ + + def _check_type(arg_name: str, arg_value: Any): + """ + Raise if argument is not a valid type + """ + + def _check_type_single(entry): + if ( + not isinstance(entry, str) + and not cs.is_selector(entry) + and not isinstance(entry, pl.Expr) + ): + raise TypeError( + f"The argument passed to the {arg_name} parameter " + "should be a string type, a ColumnSelector, " + "or a list/tuple that contains " + "a string and/or a ColumnSelector." + ) + + if isinstance(arg_value, (list, tuple)): + for entry in arg_value: + _check_type_single(entry=entry) + else: + _check_type_single(entry=arg_value) + + if (index is None) and (column_names is None): + column_names = cs.expand_selector(df, pl.all()) + index = [] + elif (index is not None) and (column_names is not None): + _check_type(arg_name="index", arg_value=index) + index = cs.expand_selector(df, index) + _check_type(arg_name="column_names", arg_value=column_names) + column_names = cs.expand_selector(df, column_names) + + elif (index is None) and (column_names is not None): + _check_type(arg_name="column_names", arg_value=column_names) + column_names = cs.expand_selector(df, column_names) + index = cs.expand_selector(df, pl.exclude(column_names)) + + elif (index is not None) and (column_names is None): + _check_type(arg_name="index", arg_value=index) + index = cs.expand_selector(df, index) + column_names = cs.expand_selector(df, pl.exclude(index)) + + check("names_to", names_to, [list, tuple, str]) + if isinstance(names_to, (list, tuple)): + uniques = set() + for word in names_to: + check(f"'{word}' in names_to", word, [str]) + if (word in uniques) and (word != ".value"): + raise ValueError(f"'{word}' is duplicated in names_to.") + uniques.add(word) + names_to = [names_to] if isinstance(names_to, str) else names_to + + if names_sep and names_pattern: + raise ValueError( + "Only one of names_pattern or names_sep should be provided." + ) + + if names_sep is not None: + check("names_sep", names_sep, [str]) + + if names_pattern is not None: + check("names_pattern", names_pattern, [str, list, tuple]) + if isinstance(names_pattern, (list, tuple)): + for word in names_pattern: + check(f"'{word}' in names_pattern", word, [str]) + if ".value" in names_to: + raise ValueError( + ".value is not accepted in names_to " + "if names_pattern is a list/tuple." + ) + if len(names_pattern) != len(names_to): + raise ValueError( + f"The length of names_to does not match " + "the number of regexes in names_pattern. " + f"The length of names_to is {len(names_to)} " + f"while the number of regexes is {len(names_pattern)}." + ) + + check("values_to", values_to, [str, list, tuple]) + values_to_is_a_sequence = isinstance(values_to, (list, tuple)) + names_pattern_is_a_sequence = isinstance(names_pattern, (list, tuple)) + if values_to_is_a_sequence: + if not names_pattern_is_a_sequence: + raise TypeError( + "values_to can be a list/tuple only " + "if names_pattern is a list/tuple." + ) + + if len(names_pattern) != len(values_to): + raise ValueError( + f"The length of values_to does not match " + "the number of regexes in names_pattern. " + f"The length of values_to is {len(values_to)} " + f"while the number of regexes is {len(names_pattern)}." 
+ ) + uniques = set() + for word in values_to: + check(f"{word} in values_to", word, [str]) + if word in uniques: + raise ValueError(f"'{word}' is duplicated in values_to.") + uniques.add(word) + + columns_to_append = any(label != ".value" for label in names_to) + if values_to_is_a_sequence or columns_to_append: + check("names_transform", names_transform, [DataTypeClass, dict]) + if isinstance(names_transform, dict): + for _, dtype in names_transform.items(): + check( + "dtype in the names_transform mapping", + dtype, + [DataTypeClass], + ) + names_transform = { + label: names_transform.get(label, pl.Utf8) + for label in names_to + } + else: + names_transform = {label: names_transform for label in names_to} + + return ( + df, + index, + column_names, + names_to, + values_to, + names_sep, + names_pattern, + names_transform, + ) diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index a2cbd574c..17a6a87aa 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -3,5 +3,4 @@ ::: janitor.polars options: members: - - PolarsExpr - PolarsFrame diff --git a/tests/polars/functions/test_clean_names_polars.py b/tests/polars/functions/test_clean_names_polars.py deleted file mode 100644 index 23ce38742..000000000 --- a/tests/polars/functions/test_clean_names_polars.py +++ /dev/null @@ -1,102 +0,0 @@ -import polars as pl -import pytest - -from janitor import polars # noqa: F401 - - -@pytest.mark.functions -def test_clean_names_method_chain(dataframe): - """Tests clean_names default args in a method chain.""" - df = pl.from_pandas(dataframe) - df = df.janitor.clean_names() - expected_columns = [ - "a", - "bell_chart", - "decorated_elephant", - "animals@#$%^", - "cities", - ] - assert df.columns == expected_columns - - -@pytest.mark.functions -def test_clean_names_special_characters(dataframe): - """Tests clean_names `remove_special` parameter.""" - df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(remove_special=True) - expected_columns = [ - "a", - "bell_chart", - "decorated_elephant", - "animals", - "cities", - ] - assert df.columns == expected_columns - - -@pytest.mark.functions -def test_clean_names_uppercase(dataframe): - """Tests clean_names `case_type` parameter = upper.""" - df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(remove_special=True, case_type="upper") - expected_columns = [ - "A", - "BELL_CHART", - "DECORATED_ELEPHANT", - "ANIMALS", - "CITIES", - ] - assert df.columns == expected_columns - - -@pytest.mark.functions -def test_clean_names_strip_accents(): - """Tests clean_names `strip_accents` parameter.""" - df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) - df = df.janitor.clean_names(strip_accents=True) - expected_columns = ["joao", "лукася", "kafer"] - assert df.columns == expected_columns - - -@pytest.mark.functions -def test_clean_names_camelcase_to_snake(dataframe): - """Tests clean_names `case_type` parameter = snake.""" - df = pl.from_pandas(dataframe) - df = ( - df.select("a") - .rename({"a": "AColumnName"}) - .janitor.clean_names(remove_special=True, case_type="snake") - ) - assert df.columns == ["a_column_name"] - - -@pytest.mark.functions -def test_clean_names_truncate_limit(dataframe): - """Tests clean_names `truncate_limit` parameter.""" - df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(truncate_limit=7) - expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] - assert df.columns == expected_columns - - -@pytest.mark.functions -def test_charac(): - """Ensure non standard characters and 
spaces have been cleaned up.""" - - df = pl.DataFrame( - { - r"Current accountbalance(in % of GDP)": range(5), - } - ) - df = df.janitor.clean_names(strip_underscores=True, case_type="lower") - - assert "current_accountbalance_in_%_of_gdp" in df.columns - - -def test_clean_column_values(): - """Clean column values""" - raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) - outcome = raw.with_columns( - pl.col("raw").janitor.clean_names(strip_accents=True) - ) - assert list(outcome)[0][0] == "abcde_fgi_j" diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py new file mode 100644 index 000000000..86d2636c8 --- /dev/null +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -0,0 +1,913 @@ +import polars as pl +import polars.selectors as cs +import pytest +from polars.testing import assert_frame_equal + +from janitor import polars # noqa: F401 + + +@pytest.fixture +def df_checks(): + """fixture dataframe""" + return pl.DataFrame( + { + "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + } + ) + + +def test_type_index(df_checks): + """Raise TypeError if wrong type is provided for the index.""" + msg = "The argument passed to the index parameter " + msg += "should be a string type, a ColumnSelector.+" + with pytest.raises(TypeError, match=msg): + df_checks.janitor.pivot_longer(index=2007, names_sep="_") + + +def test_type_column_names(df_checks): + """Raise TypeError if wrong type is provided for column_names.""" + msg = "The argument passed to the column_names parameter " + msg += "should be a string type, a ColumnSelector.+" + with pytest.raises(TypeError, match=msg): + df_checks.janitor.pivot_longer(column_names=2007, names_sep="_") + + +def test_type_names_to(df_checks): + """Raise TypeError if wrong type is provided for names_to.""" + msg = "names_to should be one of .+" + with pytest.raises(TypeError, match=msg): + df_checks.janitor.pivot_longer(names_to=2007, names_sep="_") + + +def test_subtype_names_to(df_checks): + """ + Raise TypeError if names_to is a sequence + and the wrong type is provided for entries + in names_to. + """ + with pytest.raises(TypeError, match="'1' in names_to.+"): + df_checks.janitor.pivot_longer(names_to=[1], names_sep="_") + + +def test_duplicate_names_to(df_checks): + """Raise error if names_to contains duplicates.""" + with pytest.raises(ValueError, match="'y' is duplicated in names_to."): + df_checks.janitor.pivot_longer( + names_to=["y", "y"], names_pattern="(.+)(.)" + ) + + +def test_both_names_sep_and_pattern(df_checks): + """ + Raise ValueError if both names_sep + and names_pattern is provided. + """ + with pytest.raises( + ValueError, + match="Only one of names_pattern or names_sep should be provided.", + ): + df_checks.janitor.pivot_longer( + names_to=["rar", "bar"], names_sep="-", names_pattern="(.+)(.)" + ) + + +def test_name_pattern_wrong_type(df_checks): + """Raise TypeError if the wrong type is provided for names_pattern.""" + with pytest.raises(TypeError, match="names_pattern should be one of.+"): + df_checks.janitor.pivot_longer( + names_to=["rar", "bar"], names_pattern=2007 + ) + + +def test_names_pattern_wrong_subtype(df_checks): + """ + Raise TypeError if names_pattern is a list/tuple + and wrong subtype is supplied. 
+ """ + with pytest.raises(TypeError, match="'1' in names_pattern.+"): + df_checks.janitor.pivot_longer( + names_to=["ht", "num"], names_pattern=[1, "\\d"] + ) + + +def test_names_pattern_names_to_unequal_length(df_checks): + """ + Raise ValueError if names_pattern is a list/tuple + and wrong number of items in names_to. + """ + with pytest.raises( + ValueError, + match="The length of names_to does not match " + "the number of regexes in names_pattern.+", + ): + df_checks.janitor.pivot_longer( + names_to=["variable"], names_pattern=["^ht", ".+i.+"] + ) + + +def test_names_pattern_names_to_dot_value(df_checks): + """ + Raise Error if names_pattern is a list/tuple and + .value in names_to. + """ + with pytest.raises( + ValueError, + match=".value is not accepted in names_to " + "if names_pattern is a list/tuple.", + ): + df_checks.janitor.pivot_longer( + names_to=["variable", ".value"], names_pattern=["^ht", ".+i.+"] + ) + + +def test_name_sep_wrong_type(df_checks): + """Raise TypeError if the wrong type is provided for names_sep.""" + with pytest.raises(TypeError, match="names_sep should be one of.+"): + df_checks.janitor.pivot_longer( + names_to=[".value", "num"], names_sep=["_"] + ) + + +def test_values_to_wrong_type(df_checks): + """Raise TypeError if the wrong type is provided for `values_to`.""" + with pytest.raises(TypeError, match="values_to should be one of.+"): + df_checks.janitor.pivot_longer(values_to={"salvo"}, names_sep="_") + + +def test_values_to_wrong_type_names_pattern(df_checks): + """ + Raise TypeError if `values_to` is a list, + and names_pattern is not. + """ + with pytest.raises( + TypeError, + match="values_to can be a list/tuple only " + "if names_pattern is a list/tuple.", + ): + df_checks.janitor.pivot_longer( + values_to=["salvo"], names_pattern=r"(.)" + ) + + +def test_values_to_names_pattern_unequal_length(df_checks): + """ + Raise ValueError if `values_to` is a list, + and the length of names_pattern + does not match the length of values_to. + """ + with pytest.raises( + ValueError, + match="The length of values_to does not match " + "the number of regexes in names_pattern.+", + ): + df_checks.janitor.pivot_longer( + values_to=["salvo"], + names_pattern=["ht", r"\d"], + names_to=["foo", "bar"], + ) + + +def test_sub_values_to(df_checks): + """Raise error if values_to is a sequence, and contains non strings.""" + with pytest.raises(TypeError, match="1 in values_to.+"): + df_checks.janitor.pivot_longer( + names_to=["x", "y"], + names_pattern=[r"ht", r"\d"], + values_to=[1, "salvo"], + ) + + +def test_duplicate_values_to(df_checks): + """Raise error if values_to is a sequence, and contains duplicates.""" + with pytest.raises( + ValueError, match="'salvo' is duplicated in values_to." + ): + df_checks.janitor.pivot_longer( + names_to=["x", "y"], + names_pattern=[r"ht", r"\d"], + values_to=["salvo", "salvo"], + ) + + +def test_names_transform_wrong_type(df_checks): + """Raise TypeError if the wrong type is provided for `names_transform`.""" + with pytest.raises(TypeError, match="names_transform should be one of.+"): + df_checks.janitor.pivot_longer(names_sep="_", names_transform=1) + + +def test_names_transform_wrong_subtype(df_checks): + """ + Raise TypeError if the wrong subtype + is provided for values in the + `names_transform` dictionary. 
+ """ + with pytest.raises( + TypeError, + match="dtype in the names_transform mapping should be one of.+", + ): + df_checks.janitor.pivot_longer( + names_sep="_", names_transform={"rar": 1} + ) + + +def test_names_pattern_list_empty_any(df_checks): + """ + Raise ValueError if names_pattern is a list, + and not all matches are returned. + """ + with pytest.raises( + ValueError, match="No match was returned for the regex.+" + ): + df_checks.janitor.pivot_longer( + index=["famid", "birth"], + names_to=["ht"], + names_pattern=["rar"], + ) + + +def test_names_pattern_no_match(df_checks): + """Raise error if names_pattern is a regex and returns no matches.""" + with pytest.raises( + ValueError, match="Column labels .+ could not be matched with any .+" + ): + df_checks.janitor.pivot_longer( + index="famid", + names_to=[".value", "value"], + names_pattern=r"(rar)(.)", + ) + + +def test_names_pattern_incomplete_match(df_checks): + """ + Raise error if names_pattern is a regex + and returns incomplete matches. + """ + with pytest.raises( + ValueError, match="Column labels .+ could not be matched with any .+" + ): + df_checks.janitor.pivot_longer( + index="famid", + names_to=[".value", "value"], + names_pattern=r"(ht)(.)", + ) + + +def test_names_sep_len(df_checks): + """ + Raise error if names_sep, + and the number of matches returned + is not equal to the length of names_to. + """ + msg = "The length of names_to does not match " + msg += "the number of fields extracted.+ " + with pytest.raises(ValueError, match=msg): + df_checks.janitor.pivot_longer(names_to=".value", names_sep="t") + + +def test_pivot_index_only(df_checks): + """Test output if only index is passed.""" + result = df_checks.janitor.pivot_longer( + index=["famid", "birth"], + names_to="dim", + values_to="num", + ) + + actual = df_checks.melt( + ["famid", "birth"], variable_name="dim", value_name="num" + ) + + assert_frame_equal(result, actual) + + +def test_pivot_column_only(df_checks): + """Test output if only column_names is passed.""" + result = df_checks.janitor.pivot_longer( + column_names=["ht1", "ht2"], + names_to="dim", + values_to="num", + ) + + actual = df_checks.melt( + id_vars=["famid", "birth"], + variable_name="dim", + value_name="num", + ) + + assert_frame_equal(result, actual) + + +def test_names_pat_str(df_checks): + """ + Test output when names_pattern is a string, + and .value is present. 
+ """ + result = df_checks.janitor.pivot_longer( + column_names=cs.starts_with("ht"), + names_to=(".value", "age"), + names_pattern="(.+)(.)", + names_transform={"age": pl.Int64}, + ).sort(by=cs.all()) + + actual = [ + {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, + {"famid": 1, "birth": 1, "age": 2, "ht": 3.4}, + {"famid": 1, "birth": 2, "age": 1, "ht": 2.9}, + {"famid": 1, "birth": 2, "age": 2, "ht": 3.8}, + {"famid": 1, "birth": 3, "age": 1, "ht": 2.2}, + {"famid": 1, "birth": 3, "age": 2, "ht": 2.9}, + {"famid": 2, "birth": 1, "age": 1, "ht": 2.0}, + {"famid": 2, "birth": 1, "age": 2, "ht": 3.2}, + {"famid": 2, "birth": 2, "age": 1, "ht": 1.8}, + {"famid": 2, "birth": 2, "age": 2, "ht": 2.8}, + {"famid": 2, "birth": 3, "age": 1, "ht": 1.9}, + {"famid": 2, "birth": 3, "age": 2, "ht": 2.4}, + {"famid": 3, "birth": 1, "age": 1, "ht": 2.2}, + {"famid": 3, "birth": 1, "age": 2, "ht": 3.3}, + {"famid": 3, "birth": 2, "age": 1, "ht": 2.3}, + {"famid": 3, "birth": 2, "age": 2, "ht": 3.4}, + {"famid": 3, "birth": 3, "age": 1, "ht": 2.1}, + {"famid": 3, "birth": 3, "age": 2, "ht": 2.9}, + ] + actual = pl.DataFrame(actual).sort(by=cs.all()) + + assert_frame_equal(result, actual, check_dtype=False) + + +def test_no_column_names(df_checks): + """ + Test output if all the columns + are assigned to the index parameter. + """ + assert_frame_equal( + df_checks.janitor.pivot_longer(index=cs.all()), + df_checks, + ) + + +@pytest.fixture +def test_df(): + """Fixture DataFrame""" + return pl.DataFrame( + { + "off_loc": ["A", "B", "C", "D", "E", "F"], + "pt_loc": ["G", "H", "I", "J", "K", "L"], + "pt_lat": [ + 100.07548220000001, + 75.191326, + 122.65134479999999, + 124.13553329999999, + 124.13553329999999, + 124.01028909999998, + ], + "off_lat": [ + 121.271083, + 75.93845266, + 135.043791, + 134.51128400000002, + 134.484374, + 137.962195, + ], + "pt_long": [ + 4.472089953, + -144.387785, + -40.45611048, + -46.07156181, + -46.07156181, + -46.01594293, + ], + "off_long": [ + -7.188632000000001, + -143.2288569, + 21.242563, + 40.937416999999996, + 40.78472, + 22.905889000000002, + ], + } + ) + + +actual = [ + { + "set": "off", + "loc": "A", + "lat": 121.271083, + "long": -7.188632000000001, + }, + {"set": "off", "loc": "B", "lat": 75.93845266, "long": -143.2288569}, + {"set": "off", "loc": "C", "lat": 135.043791, "long": 21.242563}, + { + "set": "off", + "loc": "D", + "lat": 134.51128400000002, + "long": 40.937416999999996, + }, + {"set": "off", "loc": "E", "lat": 134.484374, "long": 40.78472}, + { + "set": "off", + "loc": "F", + "lat": 137.962195, + "long": 22.905889000000002, + }, + { + "set": "pt", + "loc": "G", + "lat": 100.07548220000001, + "long": 4.472089953, + }, + {"set": "pt", "loc": "H", "lat": 75.191326, "long": -144.387785}, + { + "set": "pt", + "loc": "I", + "lat": 122.65134479999999, + "long": -40.45611048, + }, + { + "set": "pt", + "loc": "J", + "lat": 124.13553329999999, + "long": -46.07156181, + }, + { + "set": "pt", + "loc": "K", + "lat": 124.13553329999999, + "long": -46.07156181, + }, + { + "set": "pt", + "loc": "L", + "lat": 124.01028909999998, + "long": -46.01594293, + }, +] + +actual = pl.DataFrame(actual).sort(by=pl.all()) + + +def test_names_pattern_str(test_df): + """Test output for names_pattern and .value.""" + + result = test_df.janitor.pivot_longer( + column_names=cs.all(), + names_to=["set", ".value"], + names_pattern="(.+)_(.+)", + ).sort(by=cs.all()) + assert_frame_equal(result, actual) + + +def test_names_sep_str(test_df): + """Test output for names_pattern and .value.""" + 
+ result = test_df.janitor.pivot_longer( + column_names=cs.all(), + names_to=["set", ".value"], + names_sep="_", + ).sort(by=cs.all()) + assert_frame_equal(result, actual) + + +def test_names_pattern_list(): + """Test output if names_pattern is a list/tuple.""" + + df = pl.DataFrame( + { + "Activity": ["P1", "P2"], + "General": ["AA", "BB"], + "m1": ["A1", "B1"], + "t1": ["TA1", "TB1"], + "m2": ["A2", "B2"], + "t2": ["TA2", "TB2"], + "m3": ["A3", "B3"], + "t3": ["TA3", "TB3"], + } + ) + + result = ( + df.janitor.pivot_longer( + index=["Activity", "General"], + names_pattern=["^m", "^t"], + names_to=["M", "Task"], + ) + .select(["Activity", "General", "Task", "M"]) + .sort(by=pl.all()) + ) + + actual = [ + {"Activity": "P1", "General": "AA", "Task": "TA1", "M": "A1"}, + {"Activity": "P1", "General": "AA", "Task": "TA2", "M": "A2"}, + {"Activity": "P1", "General": "AA", "Task": "TA3", "M": "A3"}, + {"Activity": "P2", "General": "BB", "Task": "TB1", "M": "B1"}, + {"Activity": "P2", "General": "BB", "Task": "TB2", "M": "B2"}, + {"Activity": "P2", "General": "BB", "Task": "TB3", "M": "B3"}, + ] + + actual = pl.DataFrame(actual).sort(by=pl.all()) + + assert_frame_equal(result, actual) + + +@pytest.fixture +def not_dot_value(): + """Fixture DataFrame""" + return pl.DataFrame( + { + "country": ["United States", "Russia", "China"], + "vault_2012": [48.1, 46.4, 44.3], + "floor_2012": [45.4, 41.6, 40.8], + "vault_2016": [46.9, 45.7, 44.3], + "floor_2016": [46.0, 42.0, 42.1], + } + ) + + +actual2 = [ + {"country": "China", "event": "floor", "year": "2012", "score": 40.8}, + {"country": "China", "event": "floor", "year": "2016", "score": 42.1}, + {"country": "China", "event": "vault", "year": "2012", "score": 44.3}, + {"country": "China", "event": "vault", "year": "2016", "score": 44.3}, + {"country": "Russia", "event": "floor", "year": "2012", "score": 41.6}, + {"country": "Russia", "event": "floor", "year": "2016", "score": 42.0}, + {"country": "Russia", "event": "vault", "year": "2012", "score": 46.4}, + {"country": "Russia", "event": "vault", "year": "2016", "score": 45.7}, + { + "country": "United States", + "event": "floor", + "year": "2012", + "score": 45.4, + }, + { + "country": "United States", + "event": "floor", + "year": "2016", + "score": 46.0, + }, + { + "country": "United States", + "event": "vault", + "year": "2012", + "score": 48.1, + }, + { + "country": "United States", + "event": "vault", + "year": "2016", + "score": 46.9, + }, +] +actual2 = pl.DataFrame(actual2).sort(by=pl.all()) + + +def test_not_dot_value_sep(not_dot_value): + """Test output when names_sep and no dot_value""" + + result = not_dot_value.janitor.pivot_longer( + "country", + names_to=("event", "year"), + names_sep="_", + values_to="score", + ).sort(by=pl.all()) + + assert_frame_equal(result, actual2) + + +def test_not_dot_value_sep2(not_dot_value): + """Test output when names_sep and no dot_value""" + + result = not_dot_value.janitor.pivot_longer( + "country", + names_to="event", + names_sep="/", + values_to="score", + ) + + actual = not_dot_value.melt( + "country", variable_name="event", value_name="score" + ) + + assert_frame_equal(result, actual) + + +def test_not_dot_value_pattern(not_dot_value): + """Test output when names_pattern is a string and no dot_value""" + + result = not_dot_value.janitor.pivot_longer( + index="country", + names_to=("event", "year"), + names_pattern=r"(.+)_(.+)", + values_to="score", + ).sort(by=cs.all()) + + assert_frame_equal(result, actual2) + + +def test_multiple_dot_value(): + 
"""Test output for multiple .value.""" + df = pl.DataFrame( + { + "x_1_mean": [1, 2, 3, 4], + "x_2_mean": [1, 1, 0, 0], + "x_1_sd": [0, 1, 1, 1], + "x_2_sd": [0.739, 0.219, 1.46, 0.918], + "y_1_mean": [1, 2, 3, 4], + "y_2_mean": [1, 1, 0, 0], + "y_1_sd": [0, 1, 1, 1], + "y_2_sd": [-0.525, 0.623, -0.705, 0.662], + "unit": [1, 2, 3, 4], + } + ) + + result = df.janitor.pivot_longer( + index="unit", + names_to=(".value", "time", ".value"), + names_pattern=r"(x|y)_([0-9])(_mean|_sd)", + names_transform={"time": pl.Int64}, + ).sort(by=cs.all()) + + actual = { + "unit": [1, 2, 3, 4, 1, 2, 3, 4], + "time": [1, 1, 1, 1, 2, 2, 2, 2], + "x_mean": [1, 2, 3, 4, 1, 1, 0, 0], + "x_sd": [0.0, 1.0, 1.0, 1.0, 0.739, 0.219, 1.46, 0.918], + "y_mean": [1, 2, 3, 4, 1, 1, 0, 0], + "y_sd": [0.0, 1.0, 1.0, 1.0, -0.525, 0.623, -0.705, 0.662], + } + + actual = pl.DataFrame(actual).sort(by=cs.all()) + + assert_frame_equal(result, actual) + + +@pytest.fixture +def single_val(): + """fixture dataframe""" + return pl.DataFrame( + { + "id": [1, 2, 3], + "x1": [4, 5, 6], + "x2": [5, 6, 7], + } + ) + + +def test_multiple_dot_value2(single_val): + """Test output for multiple .value.""" + + result = single_val.janitor.pivot_longer( + index="id", names_to=(".value", ".value"), names_pattern="(.)(.)" + ) + + assert_frame_equal(result, single_val) + + +actual3 = [ + {"id": 1, "x": 4}, + {"id": 2, "x": 5}, + {"id": 3, "x": 6}, + {"id": 1, "x": 5}, + {"id": 2, "x": 6}, + {"id": 3, "x": 7}, +] + +actual3 = pl.DataFrame(actual3) + + +def test_names_pattern_sequence_single_unique_column(single_val): + """ + Test output if names_pattern is a sequence of length 1. + """ + + result = single_val.janitor.pivot_longer( + "id", names_to=["x"], names_pattern=("x",) + ) + + assert_frame_equal(result, actual3) + + +def test_names_pattern_single_column(single_val): + """ + Test output if names_to is only '.value'. + """ + + result = single_val.janitor.pivot_longer( + "id", names_to=".value", names_pattern="(.)." + ) + + assert_frame_equal(result, actual3) + + +def test_names_pattern_single_column_not_dot_value(single_val): + """ + Test output if names_to is not '.value'. + """ + result = single_val.janitor.pivot_longer( + index="id", column_names="x1", names_to="yA", names_pattern="(.+)" + ) + + assert_frame_equal( + result, + single_val.melt(id_vars="id", value_vars="x1", variable_name="yA"), + ) + + +def test_names_pattern_single_column_not_dot_value1(single_val): + """ + Test output if names_to is not '.value'. + """ + result = single_val.select("x1").janitor.pivot_longer( + names_to="yA", names_pattern="(.+)" + ) + + assert_frame_equal( + result, single_val.select("x1").melt(variable_name="yA") + ) + + +@pytest.fixture +def df_null(): + "Dataframe with nulls." 
+ return pl.DataFrame( + { + "family": [1, 2, 3, 4, 5], + "dob_child1": [ + "1998-11-26", + "1996-06-22", + "2002-07-11", + "2004-10-10", + "2000-12-05", + ], + "dob_child2": [ + "2000-01-29", + None, + "2004-04-05", + "2009-08-27", + "2005-02-28", + ], + "gender_child1": [1, 2, 2, 1, 2], + "gender_child2": [2.0, None, 2.0, 1.0, 1.0], + } + ) + + +def test_names_pattern_nulls_in_data(df_null): + """Test output if nulls are present in data.""" + result = df_null.janitor.pivot_longer( + index="family", + names_to=[".value", "child"], + names_pattern=r"(.+)_(.+)", + ).sort(by=pl.all()) + + actual = [ + {"family": 1, "child": "child1", "dob": "1998-11-26", "gender": 1.0}, + {"family": 2, "child": "child1", "dob": "1996-06-22", "gender": 2.0}, + {"family": 3, "child": "child1", "dob": "2002-07-11", "gender": 2.0}, + {"family": 4, "child": "child1", "dob": "2004-10-10", "gender": 1.0}, + {"family": 5, "child": "child1", "dob": "2000-12-05", "gender": 2.0}, + {"family": 1, "child": "child2", "dob": "2000-01-29", "gender": 2.0}, + {"family": 2, "child": "child2", "dob": None, "gender": None}, + {"family": 3, "child": "child2", "dob": "2004-04-05", "gender": 2.0}, + {"family": 4, "child": "child2", "dob": "2009-08-27", "gender": 1.0}, + {"family": 5, "child": "child2", "dob": "2005-02-28", "gender": 1.0}, + ] + + actual = pl.DataFrame(actual).sort(by=pl.all()) + + assert_frame_equal(result, actual) + + +@pytest.fixture +def multiple_values_to(): + """fixture for multiple values_to""" + # https://stackoverflow.com/q/51519101/7175713 + return pl.DataFrame( + { + "City": ["Houston", "Austin", "Hoover"], + "State": ["Texas", "Texas", "Alabama"], + "Name": ["Aria", "Penelope", "Niko"], + "Mango": [4, 10, 90], + "Orange": [10, 8, 14], + "Watermelon": [40, 99, 43], + "Gin": [16, 200, 34], + "Vodka": [20, 33, 18], + }, + ) + + +def test_output_values_to_seq(multiple_values_to): + """Test output when values_to is a list/tuple.""" + + expected = multiple_values_to.janitor.pivot_longer( + index=["City", "State"], + column_names=cs.numeric(), + names_to=("Fruit"), + values_to=("Pounds",), + names_pattern=[r"M|O|W"], + ).sort(by=cs.all()) + + actual = [ + {"City": "Houston", "State": "Texas", "Fruit": "Mango", "Pounds": 4}, + {"City": "Austin", "State": "Texas", "Fruit": "Mango", "Pounds": 10}, + {"City": "Hoover", "State": "Alabama", "Fruit": "Mango", "Pounds": 90}, + {"City": "Houston", "State": "Texas", "Fruit": "Orange", "Pounds": 10}, + {"City": "Austin", "State": "Texas", "Fruit": "Orange", "Pounds": 8}, + { + "City": "Hoover", + "State": "Alabama", + "Fruit": "Orange", + "Pounds": 14, + }, + { + "City": "Houston", + "State": "Texas", + "Fruit": "Watermelon", + "Pounds": 40, + }, + { + "City": "Austin", + "State": "Texas", + "Fruit": "Watermelon", + "Pounds": 99, + }, + { + "City": "Hoover", + "State": "Alabama", + "Fruit": "Watermelon", + "Pounds": 43, + }, + ] + + actual = pl.DataFrame(actual).sort(by=pl.all()) + + assert_frame_equal(expected, actual) + + +def test_output_values_to_seq1(multiple_values_to): + """Test output when values_to is a list/tuple.""" + # https://stackoverflow.com/a/51520155/7175713 + expected = ( + multiple_values_to.janitor.pivot_longer( + index=["City", "State"], + column_names=cs.numeric(), + names_to=("Fruit", "Drink"), + values_to=("Pounds", "Ounces"), + names_pattern=[r"M|O|W", r"G|V"], + ) + .with_columns(pl.col("Ounces").cast(float)) + .sort(by=pl.all()) + ) + + actual = { + "City": [ + "Houston", + "Austin", + "Hoover", + "Houston", + "Austin", + "Hoover", + "Houston", 
+ "Austin", + "Hoover", + ], + "State": [ + "Texas", + "Texas", + "Alabama", + "Texas", + "Texas", + "Alabama", + "Texas", + "Texas", + "Alabama", + ], + "Fruit": [ + "Mango", + "Mango", + "Mango", + "Orange", + "Orange", + "Orange", + "Watermelon", + "Watermelon", + "Watermelon", + ], + "Pounds": [4, 10, 90, 10, 8, 14, 40, 99, 43], + "Drink": [ + "Gin", + "Gin", + "Gin", + "Vodka", + "Vodka", + "Vodka", + None, + None, + None, + ], + "Ounces": [16.0, 200.0, 34.0, 20.0, 33.0, 18.0, None, None, None], + } + + actual = pl.DataFrame(actual).sort(by=pl.all()) + + assert_frame_equal(expected, actual) From 959b08295a1a01337693ee741fef119006180590 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 10:46:39 +1000 Subject: [PATCH 32/46] changelog --- CHANGELOG.md | 2 +- janitor/polars/__init__.py | 14 +-- janitor/polars/pivot_longer.py | 98 +++++++++---------- .../functions/test_pivot_longer_polars.py | 22 ++--- 4 files changed, 65 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5717193d6..8d95acbb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## [Unreleased] -- [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values . Issue #1343 +- [ENH] Added a `pivot_longer` method for polars - Issue #1352 ## [v0.27.0] - 2024-03-21 diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 59b15ff72..c44cd635d 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,6 +1,6 @@ -from typing import Any, Optional, Sequence, Union +from typing import Any, Iterable, Optional, Union -from polars.type_aliases import ColumnNameOrSelector +from polars.type_aliases import IntoExpr from janitor.utils import import_message @@ -24,12 +24,8 @@ def __init__(self, df: pl.DataFrame) -> pl.DataFrame: def pivot_longer( self, - index: Union[ - ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None - ] = None, - column_names: Union[ - ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None - ] = None, + index: Union[IntoExpr, Iterable[IntoExpr], None] = None, + column_names: Union[IntoExpr, Iterable[IntoExpr], None] = None, names_to: Optional[Union[list, tuple, str]] = "variable", values_to: Optional[Union[list, tuple, str]] = "value", names_sep: Optional[Union[str, None]] = None, @@ -317,8 +313,6 @@ def pivot_longer( It takes the same specification as polars' `str.extract_groups` method. `names_pattern` can also be a list/tuple of regular expressions. - It can also be a list/tuple of strings; - the strings will be treated as regular expressions. Under the hood it is processed with polars' `str.contains` function. 
For a list/tuple of regular expressions, `names_to` must also be a list/tuple and the lengths of both diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 9bb94000b..ceb94903b 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -2,7 +2,7 @@ from collections import defaultdict from itertools import chain -from typing import Any, Mapping, Optional, Pattern, Sequence, Union +from typing import Any, Iterable, Optional, Union from janitor.utils import check, import_message @@ -10,7 +10,7 @@ import polars as pl import polars.selectors as cs from polars.datatypes.classes import DataTypeClass - from polars.type_aliases import ColumnNameOrSelector, PolarsDataType + from polars.type_aliases import IntoExpr, PolarsDataType except ImportError: import_message( submodule="polars", @@ -22,16 +22,17 @@ def _pivot_longer( df: pl.DataFrame, - index: Union[ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None], - column_names: Union[ - ColumnNameOrSelector, Sequence[ColumnNameOrSelector], None - ], + index: Union[IntoExpr, Iterable[IntoExpr], None], + column_names: Union[IntoExpr, Iterable[IntoExpr], None], names_to: Optional[Union[list, str]], values_to: Optional[str], - names_sep: Optional[Union[str, Pattern, None]], - names_pattern: Optional[Union[list, tuple, str, Pattern, None]], + names_sep: Optional[Union[str, None]], + names_pattern: Optional[Union[list, tuple, str, None]], names_transform: Optional[Union[PolarsDataType, dict]], ) -> pl.DataFrame: + """ + Unpivots a DataFrame to long form. + """ ( df, @@ -64,6 +65,11 @@ def _pivot_longer( value_name=values_to, ) + # the core idea is to do the transformation on the columns + # before flipping into long form + # typically less work is done this way + # compared to flipping and then processing the columns + if names_sep is not None: return _pivot_longer_names_sep( df=df, @@ -75,7 +81,7 @@ def _pivot_longer( names_transform=names_transform, ) - if isinstance(names_pattern, (str, Pattern)): + if isinstance(names_pattern, str): return _pivot_longer_names_pattern_str( df=df, index=index, @@ -107,15 +113,15 @@ def _pivot_longer( def _pivot_longer_names_sep( df: pl.DataFrame, - index: Sequence, - column_names: Sequence, - names_to: Sequence, + index: Iterable, + column_names: Iterable, + names_to: Iterable, names_sep: str, values_to: str, names_transform: dict, ) -> pl.DataFrame: """ - This takes care of pivoting scenarios where + This takes care of unpivoting scenarios where names_sep is provided. """ @@ -167,15 +173,15 @@ def _pivot_longer_names_sep( def _pivot_longer_names_pattern_str( df: pl.DataFrame, - index: Union[Sequence, None], - column_names: Union[Sequence, None], - names_to: Sequence, + index: Iterable, + column_names: Iterable, + names_to: Iterable, names_pattern: str, values_to: str, names_transform: dict, ) -> pl.DataFrame: """ - This takes care of pivoting scenarios where + This takes care of unpivoting scenarios where names_pattern is a string. 
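+
+    For example (illustrative): a column label like `new_sp_m5564`,
+    matched against `names_pattern=r"new_?(.+)_(.)(\d+)"`, is split by
+    polars' `str.extract_groups` into the fields `("sp", "m", "5564")`,
+    which are then paired with the entries of `names_to` before the
+    frame is flipped into long form.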
""" @@ -222,15 +228,15 @@ def _pivot_longer_names_pattern_str( def _pivot_longer_values_to_sequence( df: pl.DataFrame, - index: Union[Sequence, None], - column_names: Union[Sequence, None], - names_to: Sequence, - names_pattern: Sequence, - values_to: Sequence, + index: Iterable, + column_names: Iterable, + names_to: Iterable, + names_pattern: Iterable, + values_to: Iterable, names_transform: dict, ) -> pl.DataFrame: """ - This takes care of pivoting scenarios where + This takes care of unpivoting scenarios where values_to is a list/tuple. """ columns = df.select(column_names).columns @@ -299,13 +305,13 @@ def _pivot_longer_values_to_sequence( def _pivot_longer_names_pattern_sequence( df: pl.DataFrame, - index: Union[Sequence, None], - column_names: Union[Sequence, None], - names_to: Sequence, - names_pattern: Sequence, + index: Iterable, + column_names: Iterable, + names_to: Iterable, + names_pattern: Iterable, ) -> pl.DataFrame: """ - This takes care of pivoting scenarios where + This takes care of unpivoting scenarios where names_pattern is a list/tuple. """ columns = df.select(column_names).columns @@ -358,19 +364,17 @@ def _pivot_longer_names_pattern_sequence( def _pivot_longer_no_dot_value( df: pl.DataFrame, - outcome: Mapping, - names_to: Sequence, + outcome: pl.Series, + names_to: Iterable, values_to: str, - index: Sequence, - columns: Sequence, + index: Iterable, + columns: Iterable, names_transform: dict, -): +) -> pl.DataFrame: """ Reshape the data for scenarios where .value is not present in names_to, or names_to is not a list/tuple. - - Returns a DataFrame. """ contents = [] for col_name, mapping in zip(columns, outcome): @@ -395,17 +399,15 @@ def _pivot_longer_no_dot_value( def _pivot_longer_dot_value( df: pl.DataFrame, - names_to: Sequence, - outcome: pl.DataFrame, - index: Sequence, - columns: Sequence, + names_to: Iterable, + outcome: pl.Series, + index: Iterable, + columns: Iterable, names_transform: Union[PolarsDataType, dict], ) -> pl.DataFrame: """ Pivots the dataframe into the final form, for scenarios where .value is in names_to. - - Returns a DataFrame. """ booleans = outcome.struct.unnest().select(pl.all().is_null().any()) for position in range(len(names_to)): @@ -477,16 +479,14 @@ def _pivot_longer_dot_value( def _pivot_longer_dot_value_only( df: pl.DataFrame, - names_to: Sequence, - outcome: pl.DataFrame, - index: Sequence, - columns: Sequence, + names_to: Iterable, + outcome: pl.Series, + index: Iterable, + columns: Iterable, ) -> pl.DataFrame: """ Pivots the dataframe into the final form, for scenarios where only '.value' is present in names_to. - - Returns a DataFrame. """ if names_to.count(".value") > 1: @@ -551,8 +551,8 @@ def _check_type_single(entry): raise TypeError( f"The argument passed to the {arg_name} parameter " "should be a string type, a ColumnSelector, " - "or a list/tuple that contains " - "a string and/or a ColumnSelector." + "an expression or a list/tuple that contains " + "a string and/or a ColumnSelector and/or an expression." 
) if isinstance(arg_value, (list, tuple)): diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py index 86d2636c8..ee3b59b60 100644 --- a/tests/polars/functions/test_pivot_longer_polars.py +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -309,7 +309,7 @@ def test_names_pat_str(df_checks): names_to=(".value", "age"), names_pattern="(.+)(.)", names_transform={"age": pl.Int64}, - ).sort(by=cs.all()) + ).sort(by=pl.all()) actual = [ {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, @@ -331,7 +331,7 @@ def test_names_pat_str(df_checks): {"famid": 3, "birth": 3, "age": 1, "ht": 2.1}, {"famid": 3, "birth": 3, "age": 2, "ht": 2.9}, ] - actual = pl.DataFrame(actual).sort(by=cs.all()) + actual = pl.DataFrame(actual).sort(by=pl.all()) assert_frame_equal(result, actual, check_dtype=False) @@ -342,7 +342,7 @@ def test_no_column_names(df_checks): are assigned to the index parameter. """ assert_frame_equal( - df_checks.janitor.pivot_longer(index=cs.all()), + df_checks.janitor.pivot_longer(index=pl.all()), df_checks, ) @@ -452,10 +452,10 @@ def test_names_pattern_str(test_df): """Test output for names_pattern and .value.""" result = test_df.janitor.pivot_longer( - column_names=cs.all(), + column_names=pl.all(), names_to=["set", ".value"], names_pattern="(.+)_(.+)", - ).sort(by=cs.all()) + ).sort(by=pl.all()) assert_frame_equal(result, actual) @@ -463,10 +463,10 @@ def test_names_sep_str(test_df): """Test output for names_pattern and .value.""" result = test_df.janitor.pivot_longer( - column_names=cs.all(), + column_names=pl.all(), names_to=["set", ".value"], names_sep="_", - ).sort(by=cs.all()) + ).sort(by=pl.all()) assert_frame_equal(result, actual) @@ -599,7 +599,7 @@ def test_not_dot_value_pattern(not_dot_value): names_to=("event", "year"), names_pattern=r"(.+)_(.+)", values_to="score", - ).sort(by=cs.all()) + ).sort(by=pl.all()) assert_frame_equal(result, actual2) @@ -625,7 +625,7 @@ def test_multiple_dot_value(): names_to=(".value", "time", ".value"), names_pattern=r"(x|y)_([0-9])(_mean|_sd)", names_transform={"time": pl.Int64}, - ).sort(by=cs.all()) + ).sort(by=pl.all()) actual = { "unit": [1, 2, 3, 4, 1, 2, 3, 4], @@ -636,7 +636,7 @@ def test_multiple_dot_value(): "y_sd": [0.0, 1.0, 1.0, 1.0, -0.525, 0.623, -0.705, 0.662], } - actual = pl.DataFrame(actual).sort(by=cs.all()) + actual = pl.DataFrame(actual).sort(by=pl.all()) assert_frame_equal(result, actual) @@ -805,7 +805,7 @@ def test_output_values_to_seq(multiple_values_to): names_to=("Fruit"), values_to=("Pounds",), names_pattern=[r"M|O|W"], - ).sort(by=cs.all()) + ).sort(by=pl.all()) actual = [ {"City": "Houston", "State": "Texas", "Fruit": "Mango", "Pounds": 4}, From 317750379e3d24c5d6d79f6241ceafa0388e2b4a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 10:53:57 +1000 Subject: [PATCH 33/46] keep changes related only to pivot_longer --- janitor/functions/clean_names.py | 137 ++++++++++++++----------------- janitor/functions/utils.py | 87 +------------------- janitor/spark/functions.py | 2 +- 3 files changed, 64 insertions(+), 162 deletions(-) diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index a38753fa8..71735a7fc 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -1,9 +1,7 @@ -"""Functions for cleaning columns/index names and/or column values.""" - -from __future__ import annotations +"""Functions for cleaning columns names.""" import unicodedata -from typing import Optional, 
Union +from typing import Hashable, Optional, Union import pandas as pd import pandas_flavor as pf @@ -79,9 +77,8 @@ def clean_names( Column selection is possible using the [`select`][janitor.functions.select.select] syntax. strip_underscores: Removes the outer underscores from all - column names/values. Default None keeps outer underscores. - Values can be either 'left', 'right' or 'both' - or the respective shorthand 'l', + column names. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', 'r' and True. case_type: Whether to make columns lower or uppercase. Current case may be preserved with 'preserve', @@ -91,17 +88,15 @@ def clean_names( remove_special: Remove special characters from columns. Only letters, numbers and underscores are preserved. strip_accents: Whether or not to remove accents from - columns names/values. + columns names. preserve_original_labels: Preserve original names. This is later retrievable using `df.original_labels`. Applies if `axis` is not None. - enforce_string: Whether or not to convert all - column names/values to string type. - Defaults to True, but can be turned off. + enforce_string: Whether or not to convert all column names + to string type. Defaults to True, but can be turned off. Columns with >1 levels will not be converted by default. - truncate_limit: Truncates formatted column names/values - to the specified length. - Default None does not truncate. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. Raises: ValueError: If `axis=None` and `column_names=None`. @@ -121,7 +116,7 @@ def clean_names( column_names = [column_names] df = df.copy() for column_name in column_names: - df[column_name] = _clean_names( + df[column_name] = _clean_names_single_object( obj=df[column_name], enforce_string=enforce_string, case_type=case_type, @@ -141,7 +136,7 @@ def clean_names( for number in range(target_axis.nlevels) ] target_axis = [ - _clean_names( + _clean_names_single_object( obj=obj, enforce_string=enforce_string, case_type=case_type, @@ -153,7 +148,7 @@ def clean_names( for obj in target_axis ] else: - target_axis = _clean_names( + target_axis = _clean_names_single_object( obj=target_axis, enforce_string=enforce_string, case_type=case_type, @@ -169,108 +164,100 @@ def clean_names( return df -def _clean_names( +def _clean_names_single_object( obj: Union[pd.Index, pd.Series], - strip_underscores: Optional[Union[str, bool]] = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, -) -> Union[pd.Index, pd.Series]: + enforce_string, + case_type, + remove_special, + strip_accents, + strip_underscores, + truncate_limit, +): """ - Generic function to clean labels in a pandas object. + Apply _clean_names on a single pandas object. 
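+
+    For example (illustrative), a label like `Sepal.Length` becomes
+    `sepal_length` with `case_type="lower"`, since spaces, periods and
+    similar characters are normalized to underscores.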
""" - if enforce_string and not _is_str_or_cat(obj): + if enforce_string and not (_is_str_or_cat(obj)): obj = obj.astype(str) - obj = _change_case(obj=obj, case_type=case_type) - obj = _normalize_1(obj=obj) + obj = _change_case(obj, case_type) + obj = _normalize_1(obj) if remove_special: - obj = obj.str.replace( - pat="[^A-Za-z_\\d]", repl="", regex=True - ).str.strip() + obj = obj.map(_remove_special) if strip_accents: - obj = _strip_accents(obj=obj) + obj = obj.map(_strip_accents) obj = obj.str.replace(pat="_+", repl="_", regex=True) - obj = _strip_underscores_func( - obj, - strip_underscores=strip_underscores, - ) + obj = _strip_underscores_func(obj, strip_underscores=strip_underscores) if truncate_limit: obj = obj.str[:truncate_limit] return obj -def _change_case( - obj: Union[pd.Index, pd.Series], - case_type: str, -) -> Union[pd.Index, pd.Series]: - """Change case of labels in obj.""" +def _change_case(col: Union[pd.Index, pd.Series], case_type: str) -> str: + """Change case of labels in pandas object.""" case_types = {"preserve", "upper", "lower", "snake"} case_type = case_type.lower() if case_type not in case_types: raise JanitorError(f"case_type must be one of: {case_types}") - if case_type == "preserve": - return obj + return col if case_type == "upper": - return obj.str.upper() + return col.str.upper() if case_type == "lower": - return obj.str.lower() + return col.str.lower() # Implementation taken from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor return ( - obj.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) + col.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) .str.lower() ) -def _normalize_1( - obj: Union[pd.Index, pd.Series] -) -> Union[pd.Index, pd.Series]: - """Perform normalization of labels in obj.""" +def _remove_special(label: Hashable) -> str: + """Remove special characters from label.""" + return "".join( + [item for item in str(label) if item.isalnum() or "_" in item] + ) + + +def _normalize_1(col: Union[pd.Index, pd.Series]) -> str: + """Perform normalization of labels in pandas object.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] for search, replace in FIXES: - obj = obj.str.replace(pat=search, repl=replace, regex=True) - - return obj + col = col.str.replace(pat=search, repl=replace, regex=True) + return col -def _strip_accents( - obj: Union[pd.Index, pd.Series], -) -> Union[pd.Index, pd.Series]: +def _strip_accents(label: Hashable) -> str: """Remove accents from a label. Inspired from [StackOverflow][so]. 
[so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin """ # noqa: E501 - return obj.map( - lambda f: "".join( - [ - letter - for letter in unicodedata.normalize("NFD", str(f)) - if not unicodedata.combining(letter) - ] - ) + + return "".join( + [ + letter + for letter in unicodedata.normalize("NFD", str(label)) + if not unicodedata.combining(letter) + ] ) def _strip_underscores_func( - obj: Union[pd.Index, pd.Series], - strip_underscores: Union[str, bool] = None, -) -> Union[pd.Index, pd.Series]: - """Strip underscores.""" + col: Union[pd.Index, pd.Series], strip_underscores: Union[str, bool] = None +) -> pd.DataFrame: + """Strip underscores from a pandas object.""" underscore_options = {None, "left", "right", "both", "l", "r", True} if strip_underscores not in underscore_options: raise JanitorError( f"strip_underscores must be one of: {underscore_options}" ) - if strip_underscores in {"left", "l"}: - return obj.str.lstrip("_") - if strip_underscores in {"right", "r"}: - return obj.str.rstrip("_") + + if strip_underscores in ["left", "l"]: + return col.str.lstrip("_") + if strip_underscores in ["right", "r"]: + return col.str.rstrip("_") if strip_underscores in {True, "both"}: - return obj.str.strip("_") - return obj + return col.str.strip("_") + return col diff --git a/janitor/functions/utils.py b/janitor/functions/utils.py index 4e1f443ee..8aa4d346b 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -5,7 +5,6 @@ import fnmatch import inspect import re -import unicodedata import warnings from collections.abc import Callable as dispatch_callable from dataclasses import dataclass @@ -37,13 +36,7 @@ from pandas.core.common import is_bool_indexer from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy -from janitor.errors import JanitorError -from janitor.utils import ( - _expand_grid, - check, - check_column, - find_stack_level, -) +from janitor.utils import _expand_grid, check, check_column, find_stack_level warnings.simplefilter("always", DeprecationWarning) @@ -1140,81 +1133,3 @@ def __eq__(self, other): """ self.join_args = (self.cols, other.cols, "==") return self - - -def _change_case( - obj: str, - case_type: str, -) -> str: - """Change case of obj.""" - case_types = {"preserve", "upper", "lower", "snake"} - case_type = case_type.lower() - if case_type not in case_types: - raise JanitorError(f"type must be one of: {case_types}") - - if case_type == "preserve": - return obj - if case_type == "upper": - return obj.upper() - if case_type == "lower": - return obj.lower() - # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 - # by @jtaylor - obj = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=obj) - obj = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=obj) - return obj.lower() - - -def _normalize_1(obj: str) -> str: - """Perform normalization of obj.""" - FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] - for search, replace in FIXES: - obj = re.sub(pattern=search, repl=replace, string=obj) - - return obj - - -def _remove_special( - obj: str, -) -> str: - """Remove special characters from obj.""" - obj = [item for item in obj if item.isalnum() or (item == "_")] - return "".join(obj) - - -def _strip_accents( - obj: str, -) -> str: - """Remove accents from obj. - - Inspired from [StackOverflow][so]. 
- - [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin - """ # noqa: E501 - - obj = [ - letter - for letter in unicodedata.normalize("NFD", obj) - if not unicodedata.combining(letter) - ] - return "".join(obj) - - -def _strip_underscores_func( - obj: str, - strip_underscores: Union[str, bool] = None, -) -> str: - """Strip underscores from obj.""" - underscore_options = {None, "left", "right", "both", "l", "r", True} - if strip_underscores not in underscore_options: - raise JanitorError( - f"strip_underscores must be one of: {underscore_options}" - ) - - if strip_underscores in {"left", "l"}: - return obj.lstrip("_") - if strip_underscores in {"right", "r"}: - return obj.rstrip("_") - if strip_underscores in {True, "both"}: - return obj.strip("_") - return obj diff --git a/janitor/spark/functions.py b/janitor/spark/functions.py index 57abd1824..a43f7338d 100644 --- a/janitor/spark/functions.py +++ b/janitor/spark/functions.py @@ -4,7 +4,7 @@ from typing import Union from janitor import utils as janitor_utils -from janitor.functions.utils import ( +from janitor.functions.clean_names import ( _change_case, _normalize_1, _remove_special, From ee899b2404da373e15b840d8666f55769b6da662 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 10:58:23 +1000 Subject: [PATCH 34/46] pd -> pl --- janitor/polars/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index c44cd635d..d22e0b581 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -49,7 +49,7 @@ def pivot_longer( >>> import polars as pl >>> import polars.selectors as cs >>> import janitor.polars - >>> df = pd.DataFrame( + >>> df = pl.DataFrame( ... { ... "Sepal.Length": [5.1, 5.9], ... 
"Sepal.Width": [3.5, 3.0], diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..e1faf6275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 10 ignore-init-method = true ignore-init-module = true ignore-module = false From 8ea9b712c83f592ebfcf582063023f2fbd1b9000 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 10:58:30 +1000 Subject: [PATCH 35/46] pd -> pl --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1faf6275..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 10 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From d12ae1aa00329be2699cdc2c3beab9a037a50ebf Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:03:12 +1000 Subject: [PATCH 36/46] df.pivot_longer -> df.janitor.pivot_longer --- janitor/polars/__init__.py | 14 +++++++------- pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index d22e0b581..ba0930584 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -224,7 +224,7 @@ def pivot_longer( │ 514 ┆ 545 ┆ Red Sox ┆ 2007 ┆ 2008 │ │ 573 ┆ 526 ┆ Yankees ┆ 2007 ┆ 2008 │ └─────┴─────┴─────────┴───────┴───────┘ - >>> df.pivot_longer( + >>> df.janitor.pivot_longer( ... index = 'team', ... names_to = ['year', 'hr'], ... names_pattern = ['year', 'hr'] @@ -265,12 +265,12 @@ def pivot_longer( │ Hoover ┆ Alabama ┆ Niko ┆ 90 ┆ 14 ┆ 43 ┆ 34 ┆ 18 │ └─────────┴─────────┴──────────┴───────┴────────┴────────────┴─────┴───────┘ - >>> df.pivot_longer( - ... index=["City", "State"], - ... column_names=cs.numeric(), - ... names_to=("Fruit", "Drink"), - ... values_to=("Pounds", "Ounces"), - ... names_pattern=["M|O|W", "G|V"], + >>> df.janitor.pivot_longer( + ... index=["City", "State"], + ... column_names=cs.numeric(), + ... names_to=("Fruit", "Drink"), + ... values_to=("Pounds", "Ounces"), + ... names_pattern=["M|O|W", "G|V"], ... 
) shape: (9, 6) ┌─────────┬─────────┬────────────┬────────┬───────┬────────┐ diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..e1faf6275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 10 ignore-init-method = true ignore-init-module = true ignore-module = false From 652f3e3ffe78f361464ddb50569f12531b4cf6b2 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:03:18 +1000 Subject: [PATCH 37/46] df.pivot_longer -> df.janitor.pivot_longer --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1faf6275..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 10 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 9b9c1a940ba1ba833d4d5e755921f47c4c40e65a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:07:51 +1000 Subject: [PATCH 38/46] pd -> pl --- janitor/polars/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index ba0930584..19f344386 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -242,7 +242,7 @@ def pivot_longer( └─────────┴─────┴──────┘ Multiple `values_to`: - >>> df = pd.DataFrame( + >>> df = pl.DataFrame( ... { ... "City": ["Houston", "Austin", "Hoover"], ... "State": ["Texas", "Texas", "Alabama"], diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..e1faf6275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 10 ignore-init-method = true ignore-init-module = true ignore-module = false From 69c273fb08bdf616de158730348970239bf4a160 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:08:01 +1000 Subject: [PATCH 39/46] pd -> pl --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1faf6275..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 10 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From b3391e8d5e166aa2a3b8f081d2bc8cf52c39db9b Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:12:50 +1000 Subject: [PATCH 40/46] add >>> df --- janitor/polars/__init__.py | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 19f344386..5c94ea870 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -254,6 +254,7 @@ def pivot_longer( ... "Vodka": [20, 33, 18], ... }, ... 
) + >>> df shape: (3, 8) ┌─────────┬─────────┬──────────┬───────┬────────┬────────────┬─────┬───────┐ │ City ┆ State ┆ Name ┆ Mango ┆ Orange ┆ Watermelon ┆ Gin ┆ Vodka │ diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..e1faf6275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 10 ignore-init-method = true ignore-init-module = true ignore-module = false From 4ffaac5060ebc76a5f5599ffc8c4f863f034963d Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 11:13:02 +1000 Subject: [PATCH 41/46] add >>> df --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1faf6275..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 10 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 1de57bbfc44324ae1c56b2c2bb06cd42418f6a6c Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Tue, 30 Apr 2024 20:08:18 +1000 Subject: [PATCH 42/46] keep changes related only to polars pivot_longer --- janitor/functions/pivot.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 51bc78419..7efeba45b 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -98,7 +98,7 @@ def pivot_longer( 6 setosa Petal.Width 0.2 7 virginica Petal.Width 1.8 - Split the column labels into individual columns: + Split the column labels into parts: >>> df.pivot_longer( ... index = 'Species', ... names_to = ('part', 'dimension'), @@ -167,7 +167,7 @@ def pivot_longer( value int64 dtype: object - Use multiple `.value` to reshape the dataframe: + Use multiple `.value` to reshape dataframe: >>> df = pd.DataFrame( ... [ ... { @@ -265,6 +265,16 @@ def pivot_longer( ... "Gin": [16, 200, 34], ... "Vodka": [20, 33, 18], ... }, + ... columns=[ + ... "City", + ... "State", + ... "Name", + ... "Mango", + ... "Orange", + ... "Watermelon", + ... "Gin", + ... "Vodka", + ... ], ... 
)
    >>> df
          City    State      Name  Mango  Orange  Watermelon  Gin  Vodka

From e4957908d1f9877ad8c8cd4b07e1a3a78d8da161 Mon Sep 17 00:00:00 2001
From: "samuel.oranyeli"
Date: Wed, 1 May 2024 20:16:31 +1000
Subject: [PATCH 43/46] add polars support to read_commandline

---
 CHANGELOG.md                   |   2 +-
 janitor/io.py                  |  28 +-
 janitor/polars/__init__.py     | 342 -----------------
 janitor/polars/pivot_longer.py | 669 ---------------------------------
 4 files changed, 23 insertions(+), 1018 deletions(-)
 delete mode 100644 janitor/polars/__init__.py
 delete mode 100644 janitor/polars/pivot_longer.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8d95acbb5..9aea6a879 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 # Changelog

 ## [Unreleased]
-- [ENH] Added a `pivot_longer` method for polars - Issue #1352
+- [ENH] `read_commandline` function now supports polars - Issue #1352

 ## [v0.27.0] - 2024-03-21

diff --git a/janitor/io.py b/janitor/io.py
index 1912afe8c..4741cb4d2 100644
--- a/janitor/io.py
+++ b/janitor/io.py
@@ -8,7 +8,7 @@
 from glob import glob
 from io import StringIO
 from itertools import chain
-from typing import IO, TYPE_CHECKING, Any, Iterable, Union
+from typing import IO, TYPE_CHECKING, Any, Iterable, Mapping, Union

 import pandas as pd

@@ -93,7 +93,9 @@ def read_csvs(
     return dfs_dict


-def read_commandline(cmd: str, **kwargs: Any) -> pd.DataFrame:
+def read_commandline(
+    cmd: str, engine: str = "pandas", **kwargs: Any
+) -> Mapping:
     """Read a CSV file based on a command-line command.

     For example, you may wish to run the following command on `sep-quarter.csv`
@@ -111,26 +111,42 @@ def read_commandline(cmd: str, **kwargs: Any) -> pd.DataFrame:
     ```

     This function assumes that your command line command will return
-    an output that is parsable using `pandas.read_csv` and StringIO.
-    We default to using `pd.read_csv` underneath the hood.
+    an output that is parsable using the relevant engine and StringIO.
+    This function defaults to using `pd.read_csv` under the hood.
     Keyword arguments are passed through to read_csv.

     Args:
         cmd: Shell command to preprocess a file on disk.
+        engine: DataFrame engine to process the output of the shell command.
+            Currently supports both pandas and polars.
         **kwargs: Keyword arguments that are passed through to
-            `pd.read_csv()`.
+            the engine's csv reader.
+
     Returns:
-        A pandas DataFrame parsed from the stdout of the underlying
+        A DataFrame parsed from the stdout of the underlying
         shell.
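+
+    A quick polars sketch (assuming `sep-quarter.csv` exists on disk
+    and is readable by `cat`):
+
+    ```python
+    from janitor.io import read_commandline
+
+    # hypothetical usage: parse the shell output with polars
+    df = read_commandline("cat sep-quarter.csv", engine="polars")
+    ```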
""" check("cmd", cmd, [str]) + if engine not in {"pandas", "polars"}: + raise ValueError("engine should be either pandas or polars.") # adding check=True ensures that an explicit, clear error # is raised, so that the user can see the reason for the failure outcome = subprocess.run( cmd, shell=True, capture_output=True, text=True, check=True ) + if engine == "polars": + try: + import polars as pl + except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + return pl.read_csv(StringIO(outcome.stdout), **kwargs) return pd.read_csv(StringIO(outcome.stdout), **kwargs) diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py deleted file mode 100644 index 5c94ea870..000000000 --- a/janitor/polars/__init__.py +++ /dev/null @@ -1,342 +0,0 @@ -from typing import Any, Iterable, Optional, Union - -from polars.type_aliases import IntoExpr - -from janitor.utils import import_message - -from .pivot_longer import _pivot_longer - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@pl.api.register_dataframe_namespace("janitor") -class PolarsFrame: - def __init__(self, df: pl.DataFrame) -> pl.DataFrame: - self._df = df - - def pivot_longer( - self, - index: Union[IntoExpr, Iterable[IntoExpr], None] = None, - column_names: Union[IntoExpr, Iterable[IntoExpr], None] = None, - names_to: Optional[Union[list, tuple, str]] = "variable", - values_to: Optional[Union[list, tuple, str]] = "value", - names_sep: Optional[Union[str, None]] = None, - names_pattern: Optional[Union[list, tuple, str, None]] = None, - names_transform: Optional[Any] = pl.Utf8, - ) -> pl.DataFrame: - """ - Unpivots a DataFrame from *wide* to *long* format. - - It is modeled after the `pivot_longer` function in R's tidyr package, - and also takes inspiration from the `melt` function in R's data.table package. - - This function is useful to massage a DataFrame into a format where - one or more columns are considered measured variables, and all other - columns are considered as identifier variables. - - All measured variables are *unpivoted* (and typically duplicated) along the - row axis. - - Examples: - >>> import polars as pl - >>> import polars.selectors as cs - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - - Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.janitor.pivot_longer(index = 'Species') - shape: (8, 3) - ┌───────────┬──────────────┬───────┐ - │ Species ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞═══════════╪══════════════╪═══════╡ - │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ - │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ - │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ - │ virginica ┆ Petal.Width ┆ 1.8 │ - └───────────┴──────────────┴───────┘ - - Split the column labels into individual columns: - >>> df.janitor.pivot_longer( - ... index = 'Species', - ... names_to = ('part', 'dimension'), - ... names_sep = '.', - ... ) - shape: (8, 4) - ┌───────────┬───────┬───────────┬───────┐ - │ Species ┆ part ┆ dimension ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ f64 │ - ╞═══════════╪═══════╪═══════════╪═══════╡ - │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ - │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ - │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ - │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ - │ setosa ┆ Petal ┆ Length ┆ 1.4 │ - │ virginica ┆ Petal ┆ Length ┆ 5.1 │ - │ setosa ┆ Petal ┆ Width ┆ 0.2 │ - │ virginica ┆ Petal ┆ Width ┆ 1.8 │ - └───────────┴───────┴───────────┴───────┘ - - Retain parts of the column names as headers: - >>> df.janitor.pivot_longer( - ... index = 'Species', - ... names_to = ('part', '.value'), - ... names_sep = '.', - ... ) - shape: (4, 4) - ┌───────────┬───────┬────────┬───────┐ - │ Species ┆ part ┆ Length ┆ Width │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 ┆ f64 │ - ╞═══════════╪═══════╪════════╪═══════╡ - │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ - │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ - │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ - │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ - └───────────┴───────┴────────┴───────┘ - - Split the column labels based on regex: - >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) - >>> df - shape: (1, 3) - ┌─────┬──────────────┬────────────┐ - │ id ┆ new_sp_m5564 ┆ newrel_f65 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════════╪════════════╡ - │ 1 ┆ 2 ┆ 3 │ - └─────┴──────────────┴────────────┘ - >>> df.janitor.pivot_longer( - ... index = 'id', - ... names_to = ('diagnosis', 'gender', 'age'), - ... names_pattern = r"new_?(.+)_(.)(\\d+)", - ... ) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ str ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Convert the dtypes of specific columns with `names_transform`: - >>> ( - ... df.janitor.pivot_longer( - ... index="id", - ... names_to=("diagnosis", "gender", "age"), - ... names_pattern=r"new_?(.+)_(.)(\\d+)", - ... names_transform={"age": pl.Int32}, - ... ) - ... 
) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Use multiple `.value` to reshape the dataframe: - >>> df = pl.DataFrame( - ... [ - ... { - ... "x_1_mean": 10, - ... "x_2_mean": 20, - ... "y_1_mean": 30, - ... "y_2_mean": 40, - ... "unit": 50, - ... } - ... ] - ... ) - >>> df - shape: (1, 5) - ┌──────────┬──────────┬──────────┬──────────┬──────┐ - │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞══════════╪══════════╪══════════╪══════════╪══════╡ - │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ - └──────────┴──────────┴──────────┴──────────┴──────┘ - >>> df.janitor.pivot_longer( - ... index="unit", - ... names_to=(".value", "time", ".value"), - ... names_pattern=r"(x|y)_([0-9])(_mean)", - ... ) - shape: (2, 4) - ┌──────┬──────┬────────┬────────┐ - │ unit ┆ time ┆ x_mean ┆ y_mean │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ i64 │ - ╞══════╪══════╪════════╪════════╡ - │ 50 ┆ 1 ┆ 10 ┆ 30 │ - │ 50 ┆ 2 ┆ 20 ┆ 40 │ - └──────┴──────┴────────┴────────┘ - - Reshape the dataframe by passing a sequence to `names_pattern`: - >>> df = pl.DataFrame({'hr1': [514, 573], - ... 'hr2': [545, 526], - ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2007], - ... 'year2': [2008, 2008]}) - >>> df - shape: (2, 5) - ┌─────┬─────┬─────────┬───────┬───────┐ - │ hr1 ┆ hr2 ┆ team ┆ year1 ┆ year2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════════╪═══════╪═══════╡ - │ 514 ┆ 545 ┆ Red Sox ┆ 2007 ┆ 2008 │ - │ 573 ┆ 526 ┆ Yankees ┆ 2007 ┆ 2008 │ - └─────┴─────┴─────────┴───────┴───────┘ - >>> df.janitor.pivot_longer( - ... index = 'team', - ... names_to = ['year', 'hr'], - ... names_pattern = ['year', 'hr'] - ... ) - shape: (4, 3) - ┌─────────┬─────┬──────┐ - │ team ┆ hr ┆ year │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════════╪═════╪══════╡ - │ Red Sox ┆ 514 ┆ 2007 │ - │ Yankees ┆ 573 ┆ 2007 │ - │ Red Sox ┆ 545 ┆ 2008 │ - │ Yankees ┆ 526 ┆ 2008 │ - └─────────┴─────┴──────┘ - - Multiple `values_to`: - >>> df = pl.DataFrame( - ... { - ... "City": ["Houston", "Austin", "Hoover"], - ... "State": ["Texas", "Texas", "Alabama"], - ... "Name": ["Aria", "Penelope", "Niko"], - ... "Mango": [4, 10, 90], - ... "Orange": [10, 8, 14], - ... "Watermelon": [40, 99, 43], - ... "Gin": [16, 200, 34], - ... "Vodka": [20, 33, 18], - ... }, - ... ) - >>> df - shape: (3, 8) - ┌─────────┬─────────┬──────────┬───────┬────────┬────────────┬─────┬───────┐ - │ City ┆ State ┆ Name ┆ Mango ┆ Orange ┆ Watermelon ┆ Gin ┆ Vodka │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════╪═════════╪══════════╪═══════╪════════╪════════════╪═════╪═══════╡ - │ Houston ┆ Texas ┆ Aria ┆ 4 ┆ 10 ┆ 40 ┆ 16 ┆ 20 │ - │ Austin ┆ Texas ┆ Penelope ┆ 10 ┆ 8 ┆ 99 ┆ 200 ┆ 33 │ - │ Hoover ┆ Alabama ┆ Niko ┆ 90 ┆ 14 ┆ 43 ┆ 34 ┆ 18 │ - └─────────┴─────────┴──────────┴───────┴────────┴────────────┴─────┴───────┘ - - >>> df.janitor.pivot_longer( - ... index=["City", "State"], - ... column_names=cs.numeric(), - ... names_to=("Fruit", "Drink"), - ... values_to=("Pounds", "Ounces"), - ... names_pattern=["M|O|W", "G|V"], - ... 
) - shape: (9, 6) - ┌─────────┬─────────┬────────────┬────────┬───────┬────────┐ - │ City ┆ State ┆ Fruit ┆ Pounds ┆ Drink ┆ Ounces │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════════╪═════════╪════════════╪════════╪═══════╪════════╡ - │ Houston ┆ Texas ┆ Mango ┆ 4 ┆ Gin ┆ 16 │ - │ Austin ┆ Texas ┆ Mango ┆ 10 ┆ Gin ┆ 200 │ - │ Hoover ┆ Alabama ┆ Mango ┆ 90 ┆ Gin ┆ 34 │ - │ Houston ┆ Texas ┆ Orange ┆ 10 ┆ Vodka ┆ 20 │ - │ Austin ┆ Texas ┆ Orange ┆ 8 ┆ Vodka ┆ 33 │ - │ Hoover ┆ Alabama ┆ Orange ┆ 14 ┆ Vodka ┆ 18 │ - │ Houston ┆ Texas ┆ Watermelon ┆ 40 ┆ null ┆ null │ - │ Austin ┆ Texas ┆ Watermelon ┆ 99 ┆ null ┆ null │ - │ Hoover ┆ Alabama ┆ Watermelon ┆ 43 ┆ null ┆ null │ - └─────────┴─────────┴────────────┴────────┴───────┴────────┘ - - !!! info "New in version 0.28.0" - - Args: - index: Column(s) or selector(s) to use as identifier variables. - column_names: Column(s) or selector(s) to unpivot. - names_to: Name of new column as a string that will contain - what were previously the column names in `column_names`. - The default is `variable` if no value is provided. It can - also be a list/tuple of strings that will serve as new column - names, if `name_sep` or `names_pattern` is provided. - If `.value` is in `names_to`, new column names will be extracted - from part of the existing column names and overrides `values_to`. - values_to: Name of new column as a string that will contain what - were previously the values of the columns in `column_names`. - `values_to` can also be a list/tuple - and requires that `names_pattern` is also a list/tuple. - names_sep: Determines how the column name is broken up, if - `names_to` contains multiple values. It takes the same - specification as polars' `str.split` method. - names_pattern: Determines how the column name is broken up. - It can be a regular expression containing matching groups. - It takes the same - specification as polars' `str.extract_groups` method. - `names_pattern` can also be a list/tuple of regular expressions. - Under the hood it is processed with polars' `str.contains` function. - For a list/tuple of regular expressions, - `names_to` must also be a list/tuple and the lengths of both - arguments must match. - names_transform: Use this option to change the types of columns that - have been transformed to rows. - This does not applies to the values' columns. - It can be a single valid polars dtype, - or a dictionary pairing the new column names - with a valid polars dtype. - Applicable only if one of names_sep - or names_pattern is provided. - Returns: - A polars DataFrame that has been unpivoted from wide to long - format. 
- """ # noqa: E501 - return _pivot_longer( - df=self._df, - index=index, - column_names=column_names, - names_pattern=names_pattern, - names_sep=names_sep, - names_to=names_to, - values_to=values_to, - names_transform=names_transform, - ) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py deleted file mode 100644 index ceb94903b..000000000 --- a/janitor/polars/pivot_longer.py +++ /dev/null @@ -1,669 +0,0 @@ -"""pivot_longer implementation for polars.""" - -from collections import defaultdict -from itertools import chain -from typing import Any, Iterable, Optional, Union - -from janitor.utils import check, import_message - -try: - import polars as pl - import polars.selectors as cs - from polars.datatypes.classes import DataTypeClass - from polars.type_aliases import IntoExpr, PolarsDataType -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -def _pivot_longer( - df: pl.DataFrame, - index: Union[IntoExpr, Iterable[IntoExpr], None], - column_names: Union[IntoExpr, Iterable[IntoExpr], None], - names_to: Optional[Union[list, str]], - values_to: Optional[str], - names_sep: Optional[Union[str, None]], - names_pattern: Optional[Union[list, tuple, str, None]], - names_transform: Optional[Union[PolarsDataType, dict]], -) -> pl.DataFrame: - """ - Unpivots a DataFrame to long form. - """ - - ( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, - ) = _data_checks_pivot_longer( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - values_to=values_to, - names_sep=names_sep, - names_pattern=names_pattern, - names_transform=names_transform, - ) - - if not column_names: - return df - - if all((names_pattern is None, names_sep is None)): - return df.melt( - id_vars=index, - value_vars=column_names, - variable_name=names_to[0], - value_name=values_to, - ) - - # the core idea is to do the transformation on the columns - # before flipping into long form - # typically less work is done this way - # compared to flipping and then processing the columns - - if names_sep is not None: - return _pivot_longer_names_sep( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - names_sep=names_sep, - values_to=values_to, - names_transform=names_transform, - ) - - if isinstance(names_pattern, str): - return _pivot_longer_names_pattern_str( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - names_pattern=names_pattern, - values_to=values_to, - names_transform=names_transform, - ) - if isinstance(values_to, (list, tuple)): - return _pivot_longer_values_to_sequence( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - names_pattern=names_pattern, - values_to=values_to, - names_transform=names_transform, - ) - - return _pivot_longer_names_pattern_sequence( - df=df, - index=index, - column_names=column_names, - names_to=names_to, - names_pattern=names_pattern, - ) - - -def _pivot_longer_names_sep( - df: pl.DataFrame, - index: Iterable, - column_names: Iterable, - names_to: Iterable, - names_sep: str, - values_to: str, - names_transform: dict, -) -> pl.DataFrame: - """ - This takes care of unpivoting scenarios where - names_sep is provided. 
- """ - - columns = df.select(column_names).columns - outcome = ( - pl.Series(columns) - .str.split(by=names_sep) - .list.to_struct(n_field_strategy="max_width") - ) - len_outcome = len(outcome.struct.fields) - len_names_to = len(names_to) - if len_names_to != len_outcome: - raise ValueError( - "The length of names_to does not match " - "the number of fields extracted. " - f"The length of names_to is {len_names_to} " - "while the number of fields extracted is " - f"{len_outcome}." - ) - - if ".value" not in names_to: - outcome = outcome.struct.rename_fields(names_to) - return _pivot_longer_no_dot_value( - df=df, - outcome=outcome, - values_to=values_to, - index=index, - columns=columns, - names_to=names_to, - names_transform=names_transform, - ) - if all(label == ".value" for label in names_to): - return _pivot_longer_dot_value_only( - df=df, - names_to=names_to, - columns=columns, - index=index, - outcome=outcome, - ) - return _pivot_longer_dot_value( - df=df, - names_to=names_to, - columns=columns, - index=index, - outcome=outcome, - names_transform=names_transform, - ) - - -def _pivot_longer_names_pattern_str( - df: pl.DataFrame, - index: Iterable, - column_names: Iterable, - names_to: Iterable, - names_pattern: str, - values_to: str, - names_transform: dict, -) -> pl.DataFrame: - """ - This takes care of unpivoting scenarios where - names_pattern is a string. - """ - - columns = df.select(column_names).columns - outcome = pl.Series(columns).str.extract_groups(names_pattern) - len_outcome = len(outcome.struct.fields) - len_names_to = len(names_to) - if len_names_to != len_outcome: - raise ValueError( - f"The length of names_to does not match " - "the number of fields extracted. " - f"The length of names_to is {len_names_to} " - "while the number of fields extracted is " - f"{len_outcome}." - ) - if ".value" not in names_to: - outcome = outcome.struct.rename_fields(names_to) - return _pivot_longer_no_dot_value( - df=df, - outcome=outcome, - values_to=values_to, - index=index, - columns=columns, - names_to=names_to, - names_transform=names_transform, - ) - if all(label == ".value" for label in names_to): - return _pivot_longer_dot_value_only( - df=df, - names_to=names_to, - columns=columns, - index=index, - outcome=outcome, - ) - return _pivot_longer_dot_value( - df=df, - names_to=names_to, - columns=columns, - index=index, - outcome=outcome, - names_transform=names_transform, - ) - - -def _pivot_longer_values_to_sequence( - df: pl.DataFrame, - index: Iterable, - column_names: Iterable, - names_to: Iterable, - names_pattern: Iterable, - values_to: Iterable, - names_transform: dict, -) -> pl.DataFrame: - """ - This takes care of unpivoting scenarios where - values_to is a list/tuple. - """ - columns = df.select(column_names).columns - outcome = pl.DataFrame({"cols": columns}) - expressions = [ - pl.col("cols").str.contains(pattern).alias(f"cols{num}") - for num, pattern in enumerate(names_pattern) - ] - outcome = outcome.with_columns(expressions) - booleans = outcome.select(pl.exclude("cols").any()) - for position in range(len(names_pattern)): - if not booleans.to_series(position).item(): - raise ValueError( - "No match was returned for the regex " - f"at position {position} -> {names_pattern[position]}." 
- ) - names_booleans = pl - values_booleans = pl - for boolean, repl_name, repl_value in zip( - booleans.columns, names_to, values_to - ): - names_booleans = names_booleans.when(pl.col(boolean)).then( - pl.lit(repl_name) - ) - values_booleans = values_booleans.when(pl.col(boolean)).then( - pl.lit(repl_value) - ) - names_booleans = names_booleans.alias("value") - values_booleans = values_booleans.alias(".value") - filter_expr = pl.col(".value").is_not_null() - cum_expr = pl.col(".value").cum_count().over(".value").sub(1).alias("idx") - outcome = ( - outcome.select(names_booleans, values_booleans, pl.col("cols")) - .filter(filter_expr) - .with_columns(cum_expr) - ) - headers_dict = defaultdict(list) - non_headers_dict = defaultdict(list) - for num, col_name, value_header, name_header in zip( - outcome.get_column("idx"), - outcome.get_column("cols"), - outcome.get_column(".value"), - outcome.get_column("value"), - ): - non_headers_dict[num].append((col_name, name_header)) - headers_dict[num].append((col_name, value_header)) - contents = [] - for key, value in headers_dict.items(): - expression = [] if index is None else [pl.col(index)] - columns_to_select = [ - pl.col(col_name).alias(repl_name) for col_name, repl_name in value - ] - expression.extend(columns_to_select) - columns_to_append = [ - pl.lit(col_name, dtype=names_transform[repl_name]).alias(repl_name) - for col_name, repl_name in non_headers_dict[key] - ] - - contents.append(df.select(expression).with_columns(columns_to_append)) - columns_to_select = [] if not index else list(index) - columns_to_select.extend(chain.from_iterable(zip(names_to, values_to))) - return pl.concat(contents, how="diagonal_relaxed").select( - columns_to_select - ) - - -def _pivot_longer_names_pattern_sequence( - df: pl.DataFrame, - index: Iterable, - column_names: Iterable, - names_to: Iterable, - names_pattern: Iterable, -) -> pl.DataFrame: - """ - This takes care of unpivoting scenarios where - names_pattern is a list/tuple. - """ - columns = df.select(column_names).columns - outcome = pl.DataFrame({"cols": columns}) - expressions = [ - pl.col("cols").str.contains(pattern).alias(f"cols{num}") - for num, pattern in enumerate(names_pattern) - ] - outcome = outcome.with_columns(expressions) - booleans = outcome.select(pl.exclude("cols").any()) - for position in range(len(names_pattern)): - if not booleans.to_series(position).item(): - raise ValueError( - "No match was returned for the regex " - f"at position {position} -> {names_pattern[position]}." 
- ) - names_booleans = pl - for boolean, repl_name in zip(booleans.columns, names_to): - names_booleans = names_booleans.when(pl.col(boolean)).then( - pl.lit(repl_name) - ) - - names_booleans = names_booleans.alias(".value") - filter_expr = pl.col(".value").is_not_null() - cum_expr = pl.col(".value").cum_count().over(".value").sub(1).alias("idx") - outcome = ( - outcome.select(names_booleans, pl.col("cols")) - .filter(filter_expr) - .with_columns(cum_expr) - ) - headers_dict = defaultdict(list) - for num, col_name, name_header in zip( - outcome.get_column("idx"), - outcome.get_column("cols"), - outcome.get_column(".value"), - ): - headers_dict[num].append((col_name, name_header)) - - contents = [] - for _, value in headers_dict.items(): - expression = [] if index is None else [pl.col(index)] - columns_to_select = [ - pl.col(col_name).alias(repl_name) for col_name, repl_name in value - ] - expression.extend(columns_to_select) - - contents.append(df.select(expression)) - return pl.concat(contents, how="diagonal_relaxed") - - -def _pivot_longer_no_dot_value( - df: pl.DataFrame, - outcome: pl.Series, - names_to: Iterable, - values_to: str, - index: Iterable, - columns: Iterable, - names_transform: dict, -) -> pl.DataFrame: - """ - Reshape the data for scenarios where .value - is not present in names_to, - or names_to is not a list/tuple. - """ - contents = [] - for col_name, mapping in zip(columns, outcome): - expression = ( - [pl.col(col_name)] - if index is None - else [pl.col(index), pl.col(col_name).alias(values_to)] - ) - columns_to_append = [ - pl.lit(label, dtype=names_transform[header]).alias(header) - for header, label in mapping.items() - ] - _frame = df.select(expression).with_columns(columns_to_append) - contents.append(_frame) - columns_to_select = [] if not index else list(index) - columns_to_select.extend(names_to) - columns_to_select.append(values_to) - return pl.concat(contents, how="diagonal_relaxed").select( - pl.col(columns_to_select) - ) - - -def _pivot_longer_dot_value( - df: pl.DataFrame, - names_to: Iterable, - outcome: pl.Series, - index: Iterable, - columns: Iterable, - names_transform: Union[PolarsDataType, dict], -) -> pl.DataFrame: - """ - Pivots the dataframe into the final form, - for scenarios where .value is in names_to. - """ - booleans = outcome.struct.unnest().select(pl.all().is_null().any()) - for position in range(len(names_to)): - if booleans.to_series(position).item(): - raise ValueError( - f"Column labels '{columns[position]}' " - "could not be matched with any of the groups " - "in the provided regex. Kindly provide a regular expression " - "(with the correct groups) that matches all labels in the columns." 
- ) - if names_to.count(".value") > 1: - cols = outcome.struct.fields - dot_value = [ - cols[num] - for num, label in enumerate(names_to) - if label == ".value" - ] - not_dot_value = [ - pl.col(field_name).alias(repl_name) - for field_name, repl_name in zip(cols, names_to) - if field_name not in dot_value - ] - - outcome = outcome.struct.unnest().select( - pl.concat_str(dot_value).alias(".value"), *not_dot_value - ) - else: - outcome = outcome.struct.rename_fields(names_to).struct.unnest() - idx = "".join(names_to) - not_dot_value = [name for name in names_to if name != ".value"] - outcome = outcome.with_row_index(idx).with_columns( - pl.col(idx).first().over(not_dot_value).rank("dense").sub(1), - pl.struct(not_dot_value), - ) - headers_dict = defaultdict(list) - for num, col_name, repl_name in zip( - outcome.get_column(idx), - columns, - outcome.get_column(".value"), - ): - headers_dict[num].append((col_name, repl_name)) - - non_headers_dict = dict() - outcome = outcome.select(idx, not_dot_value[0]).unique() - - for key, value in zip(outcome.to_series(0), outcome.to_series(1)): - value = [ - pl.lit(stub_name, dtype=names_transform[repl_name]).alias( - repl_name - ) - for repl_name, stub_name in value.items() - ] - non_headers_dict[key] = value - contents = [] - for key, value in headers_dict.items(): - expression = [] if index is None else [pl.col(index)] - columns_to_select = [ - pl.col(col_name).alias(repl_name) for col_name, repl_name in value - ] - expression.extend(columns_to_select) - _frame = df.select(expression).with_columns(non_headers_dict[key]) - contents.append(_frame) - columns_to_select = [] if not index else list(index) - columns_to_select.extend(not_dot_value) - return pl.concat(contents, how="diagonal_relaxed").select( - pl.col(columns_to_select), pl.exclude(columns_to_select) - ) - - -def _pivot_longer_dot_value_only( - df: pl.DataFrame, - names_to: Iterable, - outcome: pl.Series, - index: Iterable, - columns: Iterable, -) -> pl.DataFrame: - """ - Pivots the dataframe into the final form, - for scenarios where only '.value' is present in names_to. - """ - - if names_to.count(".value") > 1: - outcome = outcome.struct.unnest().select( - pl.concat_str(pl.all()).alias(".value") - ) - else: - outcome = outcome.struct.rename_fields(names_to).struct.unnest() - outcome = outcome.with_columns( - pl.col(".value").cum_count().over(".value").sub(1).alias("idx") - ) - headers_dict = defaultdict(list) - for num, col_name, repl_name in zip( - outcome.get_column("idx"), - columns, - outcome.get_column(".value"), - ): - headers_dict[num].append((col_name, repl_name)) - - contents = [] - for _, value in headers_dict.items(): - expression = [] if index is None else [pl.col(index)] - columns_to_select = [ - pl.col(col_name).alias(repl_name) for col_name, repl_name in value - ] - expression.extend(columns_to_select) - contents.append(df.select(expression)) - - return pl.concat(contents, how="diagonal_relaxed") - - -def _data_checks_pivot_longer( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, -) -> tuple: - """ - This function majorly does type checks on the passed arguments. - - This function is executed before proceeding to the computation phase. - - Type annotations are not provided because this function is where type - checking happens. 
- """ - - def _check_type(arg_name: str, arg_value: Any): - """ - Raise if argument is not a valid type - """ - - def _check_type_single(entry): - if ( - not isinstance(entry, str) - and not cs.is_selector(entry) - and not isinstance(entry, pl.Expr) - ): - raise TypeError( - f"The argument passed to the {arg_name} parameter " - "should be a string type, a ColumnSelector, " - "an expression or a list/tuple that contains " - "a string and/or a ColumnSelector and/or an expression." - ) - - if isinstance(arg_value, (list, tuple)): - for entry in arg_value: - _check_type_single(entry=entry) - else: - _check_type_single(entry=arg_value) - - if (index is None) and (column_names is None): - column_names = cs.expand_selector(df, pl.all()) - index = [] - elif (index is not None) and (column_names is not None): - _check_type(arg_name="index", arg_value=index) - index = cs.expand_selector(df, index) - _check_type(arg_name="column_names", arg_value=column_names) - column_names = cs.expand_selector(df, column_names) - - elif (index is None) and (column_names is not None): - _check_type(arg_name="column_names", arg_value=column_names) - column_names = cs.expand_selector(df, column_names) - index = cs.expand_selector(df, pl.exclude(column_names)) - - elif (index is not None) and (column_names is None): - _check_type(arg_name="index", arg_value=index) - index = cs.expand_selector(df, index) - column_names = cs.expand_selector(df, pl.exclude(index)) - - check("names_to", names_to, [list, tuple, str]) - if isinstance(names_to, (list, tuple)): - uniques = set() - for word in names_to: - check(f"'{word}' in names_to", word, [str]) - if (word in uniques) and (word != ".value"): - raise ValueError(f"'{word}' is duplicated in names_to.") - uniques.add(word) - names_to = [names_to] if isinstance(names_to, str) else names_to - - if names_sep and names_pattern: - raise ValueError( - "Only one of names_pattern or names_sep should be provided." - ) - - if names_sep is not None: - check("names_sep", names_sep, [str]) - - if names_pattern is not None: - check("names_pattern", names_pattern, [str, list, tuple]) - if isinstance(names_pattern, (list, tuple)): - for word in names_pattern: - check(f"'{word}' in names_pattern", word, [str]) - if ".value" in names_to: - raise ValueError( - ".value is not accepted in names_to " - "if names_pattern is a list/tuple." - ) - if len(names_pattern) != len(names_to): - raise ValueError( - f"The length of names_to does not match " - "the number of regexes in names_pattern. " - f"The length of names_to is {len(names_to)} " - f"while the number of regexes is {len(names_pattern)}." - ) - - check("values_to", values_to, [str, list, tuple]) - values_to_is_a_sequence = isinstance(values_to, (list, tuple)) - names_pattern_is_a_sequence = isinstance(names_pattern, (list, tuple)) - if values_to_is_a_sequence: - if not names_pattern_is_a_sequence: - raise TypeError( - "values_to can be a list/tuple only " - "if names_pattern is a list/tuple." - ) - - if len(names_pattern) != len(values_to): - raise ValueError( - f"The length of values_to does not match " - "the number of regexes in names_pattern. " - f"The length of values_to is {len(values_to)} " - f"while the number of regexes is {len(names_pattern)}." 
- ) - uniques = set() - for word in values_to: - check(f"{word} in values_to", word, [str]) - if word in uniques: - raise ValueError(f"'{word}' is duplicated in values_to.") - uniques.add(word) - - columns_to_append = any(label != ".value" for label in names_to) - if values_to_is_a_sequence or columns_to_append: - check("names_transform", names_transform, [DataTypeClass, dict]) - if isinstance(names_transform, dict): - for _, dtype in names_transform.items(): - check( - "dtype in the names_transform mapping", - dtype, - [DataTypeClass], - ) - names_transform = { - label: names_transform.get(label, pl.Utf8) - for label in names_to - } - else: - names_transform = {label: names_transform for label in names_to} - - return ( - df, - index, - column_names, - names_to, - values_to, - names_sep, - names_pattern, - names_transform, - ) From a5c331a6062834cd21ad4b772970bae9b88a9d56 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Wed, 1 May 2024 20:23:27 +1000 Subject: [PATCH 44/46] remove irrelevant files --- .requirements/docs.in | 1 - mkdocs.yml | 1 - mkdocs/api/polars.md | 6 - .../functions/test_pivot_longer_polars.py | 913 ------------------ 4 files changed, 921 deletions(-) delete mode 100644 mkdocs/api/polars.md delete mode 100644 tests/polars/functions/test_pivot_longer_polars.py diff --git a/.requirements/docs.in b/.requirements/docs.in index b23e373aa..f0d4afc29 100644 --- a/.requirements/docs.in +++ b/.requirements/docs.in @@ -1,5 +1,4 @@ mkdocs -polars mkdocs-material mkdocstrings>=0.19.0 mkdocstrings-python diff --git a/mkdocs.yml b/mkdocs.yml index a7545afc5..639d71bea 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,7 +45,6 @@ nav: - Machine Learning: api/ml.md - Math: api/math.md # - PySpark: api/pyspark.md # will be added back later - - Polars: api/polars.md - Timeseries: api/timeseries.md - XArray: api/xarray.md - Development Guide: devguide.md diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md deleted file mode 100644 index 17a6a87aa..000000000 --- a/mkdocs/api/polars.md +++ /dev/null @@ -1,6 +0,0 @@ -# Polars - -::: janitor.polars - options: - members: - - PolarsFrame diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py deleted file mode 100644 index ee3b59b60..000000000 --- a/tests/polars/functions/test_pivot_longer_polars.py +++ /dev/null @@ -1,913 +0,0 @@ -import polars as pl -import polars.selectors as cs -import pytest -from polars.testing import assert_frame_equal - -from janitor import polars # noqa: F401 - - -@pytest.fixture -def df_checks(): - """fixture dataframe""" - return pl.DataFrame( - { - "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], - "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], - } - ) - - -def test_type_index(df_checks): - """Raise TypeError if wrong type is provided for the index.""" - msg = "The argument passed to the index parameter " - msg += "should be a string type, a ColumnSelector.+" - with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(index=2007, names_sep="_") - - -def test_type_column_names(df_checks): - """Raise TypeError if wrong type is provided for column_names.""" - msg = "The argument passed to the column_names parameter " - msg += "should be a string type, a ColumnSelector.+" - with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(column_names=2007, names_sep="_") - - -def test_type_names_to(df_checks): - """Raise TypeError if wrong 
type is provided for names_to.""" - msg = "names_to should be one of .+" - with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(names_to=2007, names_sep="_") - - -def test_subtype_names_to(df_checks): - """ - Raise TypeError if names_to is a sequence - and the wrong type is provided for entries - in names_to. - """ - with pytest.raises(TypeError, match="'1' in names_to.+"): - df_checks.janitor.pivot_longer(names_to=[1], names_sep="_") - - -def test_duplicate_names_to(df_checks): - """Raise error if names_to contains duplicates.""" - with pytest.raises(ValueError, match="'y' is duplicated in names_to."): - df_checks.janitor.pivot_longer( - names_to=["y", "y"], names_pattern="(.+)(.)" - ) - - -def test_both_names_sep_and_pattern(df_checks): - """ - Raise ValueError if both names_sep - and names_pattern is provided. - """ - with pytest.raises( - ValueError, - match="Only one of names_pattern or names_sep should be provided.", - ): - df_checks.janitor.pivot_longer( - names_to=["rar", "bar"], names_sep="-", names_pattern="(.+)(.)" - ) - - -def test_name_pattern_wrong_type(df_checks): - """Raise TypeError if the wrong type is provided for names_pattern.""" - with pytest.raises(TypeError, match="names_pattern should be one of.+"): - df_checks.janitor.pivot_longer( - names_to=["rar", "bar"], names_pattern=2007 - ) - - -def test_names_pattern_wrong_subtype(df_checks): - """ - Raise TypeError if names_pattern is a list/tuple - and wrong subtype is supplied. - """ - with pytest.raises(TypeError, match="'1' in names_pattern.+"): - df_checks.janitor.pivot_longer( - names_to=["ht", "num"], names_pattern=[1, "\\d"] - ) - - -def test_names_pattern_names_to_unequal_length(df_checks): - """ - Raise ValueError if names_pattern is a list/tuple - and wrong number of items in names_to. - """ - with pytest.raises( - ValueError, - match="The length of names_to does not match " - "the number of regexes in names_pattern.+", - ): - df_checks.janitor.pivot_longer( - names_to=["variable"], names_pattern=["^ht", ".+i.+"] - ) - - -def test_names_pattern_names_to_dot_value(df_checks): - """ - Raise Error if names_pattern is a list/tuple and - .value in names_to. - """ - with pytest.raises( - ValueError, - match=".value is not accepted in names_to " - "if names_pattern is a list/tuple.", - ): - df_checks.janitor.pivot_longer( - names_to=["variable", ".value"], names_pattern=["^ht", ".+i.+"] - ) - - -def test_name_sep_wrong_type(df_checks): - """Raise TypeError if the wrong type is provided for names_sep.""" - with pytest.raises(TypeError, match="names_sep should be one of.+"): - df_checks.janitor.pivot_longer( - names_to=[".value", "num"], names_sep=["_"] - ) - - -def test_values_to_wrong_type(df_checks): - """Raise TypeError if the wrong type is provided for `values_to`.""" - with pytest.raises(TypeError, match="values_to should be one of.+"): - df_checks.janitor.pivot_longer(values_to={"salvo"}, names_sep="_") - - -def test_values_to_wrong_type_names_pattern(df_checks): - """ - Raise TypeError if `values_to` is a list, - and names_pattern is not. - """ - with pytest.raises( - TypeError, - match="values_to can be a list/tuple only " - "if names_pattern is a list/tuple.", - ): - df_checks.janitor.pivot_longer( - values_to=["salvo"], names_pattern=r"(.)" - ) - - -def test_values_to_names_pattern_unequal_length(df_checks): - """ - Raise ValueError if `values_to` is a list, - and the length of names_pattern - does not match the length of values_to. 
- """ - with pytest.raises( - ValueError, - match="The length of values_to does not match " - "the number of regexes in names_pattern.+", - ): - df_checks.janitor.pivot_longer( - values_to=["salvo"], - names_pattern=["ht", r"\d"], - names_to=["foo", "bar"], - ) - - -def test_sub_values_to(df_checks): - """Raise error if values_to is a sequence, and contains non strings.""" - with pytest.raises(TypeError, match="1 in values_to.+"): - df_checks.janitor.pivot_longer( - names_to=["x", "y"], - names_pattern=[r"ht", r"\d"], - values_to=[1, "salvo"], - ) - - -def test_duplicate_values_to(df_checks): - """Raise error if values_to is a sequence, and contains duplicates.""" - with pytest.raises( - ValueError, match="'salvo' is duplicated in values_to." - ): - df_checks.janitor.pivot_longer( - names_to=["x", "y"], - names_pattern=[r"ht", r"\d"], - values_to=["salvo", "salvo"], - ) - - -def test_names_transform_wrong_type(df_checks): - """Raise TypeError if the wrong type is provided for `names_transform`.""" - with pytest.raises(TypeError, match="names_transform should be one of.+"): - df_checks.janitor.pivot_longer(names_sep="_", names_transform=1) - - -def test_names_transform_wrong_subtype(df_checks): - """ - Raise TypeError if the wrong subtype - is provided for values in the - `names_transform` dictionary. - """ - with pytest.raises( - TypeError, - match="dtype in the names_transform mapping should be one of.+", - ): - df_checks.janitor.pivot_longer( - names_sep="_", names_transform={"rar": 1} - ) - - -def test_names_pattern_list_empty_any(df_checks): - """ - Raise ValueError if names_pattern is a list, - and not all matches are returned. - """ - with pytest.raises( - ValueError, match="No match was returned for the regex.+" - ): - df_checks.janitor.pivot_longer( - index=["famid", "birth"], - names_to=["ht"], - names_pattern=["rar"], - ) - - -def test_names_pattern_no_match(df_checks): - """Raise error if names_pattern is a regex and returns no matches.""" - with pytest.raises( - ValueError, match="Column labels .+ could not be matched with any .+" - ): - df_checks.janitor.pivot_longer( - index="famid", - names_to=[".value", "value"], - names_pattern=r"(rar)(.)", - ) - - -def test_names_pattern_incomplete_match(df_checks): - """ - Raise error if names_pattern is a regex - and returns incomplete matches. - """ - with pytest.raises( - ValueError, match="Column labels .+ could not be matched with any .+" - ): - df_checks.janitor.pivot_longer( - index="famid", - names_to=[".value", "value"], - names_pattern=r"(ht)(.)", - ) - - -def test_names_sep_len(df_checks): - """ - Raise error if names_sep, - and the number of matches returned - is not equal to the length of names_to. 
- """ - msg = "The length of names_to does not match " - msg += "the number of fields extracted.+ " - with pytest.raises(ValueError, match=msg): - df_checks.janitor.pivot_longer(names_to=".value", names_sep="t") - - -def test_pivot_index_only(df_checks): - """Test output if only index is passed.""" - result = df_checks.janitor.pivot_longer( - index=["famid", "birth"], - names_to="dim", - values_to="num", - ) - - actual = df_checks.melt( - ["famid", "birth"], variable_name="dim", value_name="num" - ) - - assert_frame_equal(result, actual) - - -def test_pivot_column_only(df_checks): - """Test output if only column_names is passed.""" - result = df_checks.janitor.pivot_longer( - column_names=["ht1", "ht2"], - names_to="dim", - values_to="num", - ) - - actual = df_checks.melt( - id_vars=["famid", "birth"], - variable_name="dim", - value_name="num", - ) - - assert_frame_equal(result, actual) - - -def test_names_pat_str(df_checks): - """ - Test output when names_pattern is a string, - and .value is present. - """ - result = df_checks.janitor.pivot_longer( - column_names=cs.starts_with("ht"), - names_to=(".value", "age"), - names_pattern="(.+)(.)", - names_transform={"age": pl.Int64}, - ).sort(by=pl.all()) - - actual = [ - {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, - {"famid": 1, "birth": 1, "age": 2, "ht": 3.4}, - {"famid": 1, "birth": 2, "age": 1, "ht": 2.9}, - {"famid": 1, "birth": 2, "age": 2, "ht": 3.8}, - {"famid": 1, "birth": 3, "age": 1, "ht": 2.2}, - {"famid": 1, "birth": 3, "age": 2, "ht": 2.9}, - {"famid": 2, "birth": 1, "age": 1, "ht": 2.0}, - {"famid": 2, "birth": 1, "age": 2, "ht": 3.2}, - {"famid": 2, "birth": 2, "age": 1, "ht": 1.8}, - {"famid": 2, "birth": 2, "age": 2, "ht": 2.8}, - {"famid": 2, "birth": 3, "age": 1, "ht": 1.9}, - {"famid": 2, "birth": 3, "age": 2, "ht": 2.4}, - {"famid": 3, "birth": 1, "age": 1, "ht": 2.2}, - {"famid": 3, "birth": 1, "age": 2, "ht": 3.3}, - {"famid": 3, "birth": 2, "age": 1, "ht": 2.3}, - {"famid": 3, "birth": 2, "age": 2, "ht": 3.4}, - {"famid": 3, "birth": 3, "age": 1, "ht": 2.1}, - {"famid": 3, "birth": 3, "age": 2, "ht": 2.9}, - ] - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(result, actual, check_dtype=False) - - -def test_no_column_names(df_checks): - """ - Test output if all the columns - are assigned to the index parameter. 
- """ - assert_frame_equal( - df_checks.janitor.pivot_longer(index=pl.all()), - df_checks, - ) - - -@pytest.fixture -def test_df(): - """Fixture DataFrame""" - return pl.DataFrame( - { - "off_loc": ["A", "B", "C", "D", "E", "F"], - "pt_loc": ["G", "H", "I", "J", "K", "L"], - "pt_lat": [ - 100.07548220000001, - 75.191326, - 122.65134479999999, - 124.13553329999999, - 124.13553329999999, - 124.01028909999998, - ], - "off_lat": [ - 121.271083, - 75.93845266, - 135.043791, - 134.51128400000002, - 134.484374, - 137.962195, - ], - "pt_long": [ - 4.472089953, - -144.387785, - -40.45611048, - -46.07156181, - -46.07156181, - -46.01594293, - ], - "off_long": [ - -7.188632000000001, - -143.2288569, - 21.242563, - 40.937416999999996, - 40.78472, - 22.905889000000002, - ], - } - ) - - -actual = [ - { - "set": "off", - "loc": "A", - "lat": 121.271083, - "long": -7.188632000000001, - }, - {"set": "off", "loc": "B", "lat": 75.93845266, "long": -143.2288569}, - {"set": "off", "loc": "C", "lat": 135.043791, "long": 21.242563}, - { - "set": "off", - "loc": "D", - "lat": 134.51128400000002, - "long": 40.937416999999996, - }, - {"set": "off", "loc": "E", "lat": 134.484374, "long": 40.78472}, - { - "set": "off", - "loc": "F", - "lat": 137.962195, - "long": 22.905889000000002, - }, - { - "set": "pt", - "loc": "G", - "lat": 100.07548220000001, - "long": 4.472089953, - }, - {"set": "pt", "loc": "H", "lat": 75.191326, "long": -144.387785}, - { - "set": "pt", - "loc": "I", - "lat": 122.65134479999999, - "long": -40.45611048, - }, - { - "set": "pt", - "loc": "J", - "lat": 124.13553329999999, - "long": -46.07156181, - }, - { - "set": "pt", - "loc": "K", - "lat": 124.13553329999999, - "long": -46.07156181, - }, - { - "set": "pt", - "loc": "L", - "lat": 124.01028909999998, - "long": -46.01594293, - }, -] - -actual = pl.DataFrame(actual).sort(by=pl.all()) - - -def test_names_pattern_str(test_df): - """Test output for names_pattern and .value.""" - - result = test_df.janitor.pivot_longer( - column_names=pl.all(), - names_to=["set", ".value"], - names_pattern="(.+)_(.+)", - ).sort(by=pl.all()) - assert_frame_equal(result, actual) - - -def test_names_sep_str(test_df): - """Test output for names_pattern and .value.""" - - result = test_df.janitor.pivot_longer( - column_names=pl.all(), - names_to=["set", ".value"], - names_sep="_", - ).sort(by=pl.all()) - assert_frame_equal(result, actual) - - -def test_names_pattern_list(): - """Test output if names_pattern is a list/tuple.""" - - df = pl.DataFrame( - { - "Activity": ["P1", "P2"], - "General": ["AA", "BB"], - "m1": ["A1", "B1"], - "t1": ["TA1", "TB1"], - "m2": ["A2", "B2"], - "t2": ["TA2", "TB2"], - "m3": ["A3", "B3"], - "t3": ["TA3", "TB3"], - } - ) - - result = ( - df.janitor.pivot_longer( - index=["Activity", "General"], - names_pattern=["^m", "^t"], - names_to=["M", "Task"], - ) - .select(["Activity", "General", "Task", "M"]) - .sort(by=pl.all()) - ) - - actual = [ - {"Activity": "P1", "General": "AA", "Task": "TA1", "M": "A1"}, - {"Activity": "P1", "General": "AA", "Task": "TA2", "M": "A2"}, - {"Activity": "P1", "General": "AA", "Task": "TA3", "M": "A3"}, - {"Activity": "P2", "General": "BB", "Task": "TB1", "M": "B1"}, - {"Activity": "P2", "General": "BB", "Task": "TB2", "M": "B2"}, - {"Activity": "P2", "General": "BB", "Task": "TB3", "M": "B3"}, - ] - - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(result, actual) - - -@pytest.fixture -def not_dot_value(): - """Fixture DataFrame""" - return pl.DataFrame( - { - "country": ["United States", 
"Russia", "China"], - "vault_2012": [48.1, 46.4, 44.3], - "floor_2012": [45.4, 41.6, 40.8], - "vault_2016": [46.9, 45.7, 44.3], - "floor_2016": [46.0, 42.0, 42.1], - } - ) - - -actual2 = [ - {"country": "China", "event": "floor", "year": "2012", "score": 40.8}, - {"country": "China", "event": "floor", "year": "2016", "score": 42.1}, - {"country": "China", "event": "vault", "year": "2012", "score": 44.3}, - {"country": "China", "event": "vault", "year": "2016", "score": 44.3}, - {"country": "Russia", "event": "floor", "year": "2012", "score": 41.6}, - {"country": "Russia", "event": "floor", "year": "2016", "score": 42.0}, - {"country": "Russia", "event": "vault", "year": "2012", "score": 46.4}, - {"country": "Russia", "event": "vault", "year": "2016", "score": 45.7}, - { - "country": "United States", - "event": "floor", - "year": "2012", - "score": 45.4, - }, - { - "country": "United States", - "event": "floor", - "year": "2016", - "score": 46.0, - }, - { - "country": "United States", - "event": "vault", - "year": "2012", - "score": 48.1, - }, - { - "country": "United States", - "event": "vault", - "year": "2016", - "score": 46.9, - }, -] -actual2 = pl.DataFrame(actual2).sort(by=pl.all()) - - -def test_not_dot_value_sep(not_dot_value): - """Test output when names_sep and no dot_value""" - - result = not_dot_value.janitor.pivot_longer( - "country", - names_to=("event", "year"), - names_sep="_", - values_to="score", - ).sort(by=pl.all()) - - assert_frame_equal(result, actual2) - - -def test_not_dot_value_sep2(not_dot_value): - """Test output when names_sep and no dot_value""" - - result = not_dot_value.janitor.pivot_longer( - "country", - names_to="event", - names_sep="/", - values_to="score", - ) - - actual = not_dot_value.melt( - "country", variable_name="event", value_name="score" - ) - - assert_frame_equal(result, actual) - - -def test_not_dot_value_pattern(not_dot_value): - """Test output when names_pattern is a string and no dot_value""" - - result = not_dot_value.janitor.pivot_longer( - index="country", - names_to=("event", "year"), - names_pattern=r"(.+)_(.+)", - values_to="score", - ).sort(by=pl.all()) - - assert_frame_equal(result, actual2) - - -def test_multiple_dot_value(): - """Test output for multiple .value.""" - df = pl.DataFrame( - { - "x_1_mean": [1, 2, 3, 4], - "x_2_mean": [1, 1, 0, 0], - "x_1_sd": [0, 1, 1, 1], - "x_2_sd": [0.739, 0.219, 1.46, 0.918], - "y_1_mean": [1, 2, 3, 4], - "y_2_mean": [1, 1, 0, 0], - "y_1_sd": [0, 1, 1, 1], - "y_2_sd": [-0.525, 0.623, -0.705, 0.662], - "unit": [1, 2, 3, 4], - } - ) - - result = df.janitor.pivot_longer( - index="unit", - names_to=(".value", "time", ".value"), - names_pattern=r"(x|y)_([0-9])(_mean|_sd)", - names_transform={"time": pl.Int64}, - ).sort(by=pl.all()) - - actual = { - "unit": [1, 2, 3, 4, 1, 2, 3, 4], - "time": [1, 1, 1, 1, 2, 2, 2, 2], - "x_mean": [1, 2, 3, 4, 1, 1, 0, 0], - "x_sd": [0.0, 1.0, 1.0, 1.0, 0.739, 0.219, 1.46, 0.918], - "y_mean": [1, 2, 3, 4, 1, 1, 0, 0], - "y_sd": [0.0, 1.0, 1.0, 1.0, -0.525, 0.623, -0.705, 0.662], - } - - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(result, actual) - - -@pytest.fixture -def single_val(): - """fixture dataframe""" - return pl.DataFrame( - { - "id": [1, 2, 3], - "x1": [4, 5, 6], - "x2": [5, 6, 7], - } - ) - - -def test_multiple_dot_value2(single_val): - """Test output for multiple .value.""" - - result = single_val.janitor.pivot_longer( - index="id", names_to=(".value", ".value"), names_pattern="(.)(.)" - ) - - assert_frame_equal(result, single_val) 
- - -actual3 = [ - {"id": 1, "x": 4}, - {"id": 2, "x": 5}, - {"id": 3, "x": 6}, - {"id": 1, "x": 5}, - {"id": 2, "x": 6}, - {"id": 3, "x": 7}, -] - -actual3 = pl.DataFrame(actual3) - - -def test_names_pattern_sequence_single_unique_column(single_val): - """ - Test output if names_pattern is a sequence of length 1. - """ - - result = single_val.janitor.pivot_longer( - "id", names_to=["x"], names_pattern=("x",) - ) - - assert_frame_equal(result, actual3) - - -def test_names_pattern_single_column(single_val): - """ - Test output if names_to is only '.value'. - """ - - result = single_val.janitor.pivot_longer( - "id", names_to=".value", names_pattern="(.)." - ) - - assert_frame_equal(result, actual3) - - -def test_names_pattern_single_column_not_dot_value(single_val): - """ - Test output if names_to is not '.value'. - """ - result = single_val.janitor.pivot_longer( - index="id", column_names="x1", names_to="yA", names_pattern="(.+)" - ) - - assert_frame_equal( - result, - single_val.melt(id_vars="id", value_vars="x1", variable_name="yA"), - ) - - -def test_names_pattern_single_column_not_dot_value1(single_val): - """ - Test output if names_to is not '.value'. - """ - result = single_val.select("x1").janitor.pivot_longer( - names_to="yA", names_pattern="(.+)" - ) - - assert_frame_equal( - result, single_val.select("x1").melt(variable_name="yA") - ) - - -@pytest.fixture -def df_null(): - "Dataframe with nulls." - return pl.DataFrame( - { - "family": [1, 2, 3, 4, 5], - "dob_child1": [ - "1998-11-26", - "1996-06-22", - "2002-07-11", - "2004-10-10", - "2000-12-05", - ], - "dob_child2": [ - "2000-01-29", - None, - "2004-04-05", - "2009-08-27", - "2005-02-28", - ], - "gender_child1": [1, 2, 2, 1, 2], - "gender_child2": [2.0, None, 2.0, 1.0, 1.0], - } - ) - - -def test_names_pattern_nulls_in_data(df_null): - """Test output if nulls are present in data.""" - result = df_null.janitor.pivot_longer( - index="family", - names_to=[".value", "child"], - names_pattern=r"(.+)_(.+)", - ).sort(by=pl.all()) - - actual = [ - {"family": 1, "child": "child1", "dob": "1998-11-26", "gender": 1.0}, - {"family": 2, "child": "child1", "dob": "1996-06-22", "gender": 2.0}, - {"family": 3, "child": "child1", "dob": "2002-07-11", "gender": 2.0}, - {"family": 4, "child": "child1", "dob": "2004-10-10", "gender": 1.0}, - {"family": 5, "child": "child1", "dob": "2000-12-05", "gender": 2.0}, - {"family": 1, "child": "child2", "dob": "2000-01-29", "gender": 2.0}, - {"family": 2, "child": "child2", "dob": None, "gender": None}, - {"family": 3, "child": "child2", "dob": "2004-04-05", "gender": 2.0}, - {"family": 4, "child": "child2", "dob": "2009-08-27", "gender": 1.0}, - {"family": 5, "child": "child2", "dob": "2005-02-28", "gender": 1.0}, - ] - - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(result, actual) - - -@pytest.fixture -def multiple_values_to(): - """fixture for multiple values_to""" - # https://stackoverflow.com/q/51519101/7175713 - return pl.DataFrame( - { - "City": ["Houston", "Austin", "Hoover"], - "State": ["Texas", "Texas", "Alabama"], - "Name": ["Aria", "Penelope", "Niko"], - "Mango": [4, 10, 90], - "Orange": [10, 8, 14], - "Watermelon": [40, 99, 43], - "Gin": [16, 200, 34], - "Vodka": [20, 33, 18], - }, - ) - - -def test_output_values_to_seq(multiple_values_to): - """Test output when values_to is a list/tuple.""" - - expected = multiple_values_to.janitor.pivot_longer( - index=["City", "State"], - column_names=cs.numeric(), - names_to=("Fruit"), - values_to=("Pounds",), - 
names_pattern=[r"M|O|W"], - ).sort(by=pl.all()) - - actual = [ - {"City": "Houston", "State": "Texas", "Fruit": "Mango", "Pounds": 4}, - {"City": "Austin", "State": "Texas", "Fruit": "Mango", "Pounds": 10}, - {"City": "Hoover", "State": "Alabama", "Fruit": "Mango", "Pounds": 90}, - {"City": "Houston", "State": "Texas", "Fruit": "Orange", "Pounds": 10}, - {"City": "Austin", "State": "Texas", "Fruit": "Orange", "Pounds": 8}, - { - "City": "Hoover", - "State": "Alabama", - "Fruit": "Orange", - "Pounds": 14, - }, - { - "City": "Houston", - "State": "Texas", - "Fruit": "Watermelon", - "Pounds": 40, - }, - { - "City": "Austin", - "State": "Texas", - "Fruit": "Watermelon", - "Pounds": 99, - }, - { - "City": "Hoover", - "State": "Alabama", - "Fruit": "Watermelon", - "Pounds": 43, - }, - ] - - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(expected, actual) - - -def test_output_values_to_seq1(multiple_values_to): - """Test output when values_to is a list/tuple.""" - # https://stackoverflow.com/a/51520155/7175713 - expected = ( - multiple_values_to.janitor.pivot_longer( - index=["City", "State"], - column_names=cs.numeric(), - names_to=("Fruit", "Drink"), - values_to=("Pounds", "Ounces"), - names_pattern=[r"M|O|W", r"G|V"], - ) - .with_columns(pl.col("Ounces").cast(float)) - .sort(by=pl.all()) - ) - - actual = { - "City": [ - "Houston", - "Austin", - "Hoover", - "Houston", - "Austin", - "Hoover", - "Houston", - "Austin", - "Hoover", - ], - "State": [ - "Texas", - "Texas", - "Alabama", - "Texas", - "Texas", - "Alabama", - "Texas", - "Texas", - "Alabama", - ], - "Fruit": [ - "Mango", - "Mango", - "Mango", - "Orange", - "Orange", - "Orange", - "Watermelon", - "Watermelon", - "Watermelon", - ], - "Pounds": [4, 10, 90, 10, 8, 14, 40, 99, 43], - "Drink": [ - "Gin", - "Gin", - "Gin", - "Vodka", - "Vodka", - "Vodka", - None, - None, - None, - ], - "Ounces": [16.0, 200.0, 34.0, 20.0, 33.0, 18.0, None, None, None], - } - - actual = pl.DataFrame(actual).sort(by=pl.all()) - - assert_frame_equal(expected, actual) From 4d9c35feff182ac99bb760ed47bfd73472c0b0b1 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Wed, 1 May 2024 20:30:39 +1000 Subject: [PATCH 45/46] minor edit to docs --- janitor/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/io.py b/janitor/io.py index 4741cb4d2..4522be258 100644 --- a/janitor/io.py +++ b/janitor/io.py @@ -113,7 +113,7 @@ def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping: This function assumes that your command line command will return an output that is parsable using the relevant engine and StringIO. This function defaults to using `pd.read_csv` underneath the hood. - Keyword arguments are passed through to read_csv. + Keyword arguments are passed through as-is. Args: cmd: Shell command to preprocess a file on disk. 
From 3b781c1a8e01848cc81a5341ad6b31914341a2bb Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Wed, 1 May 2024 21:42:09 +1000 Subject: [PATCH 46/46] xlsx_table now supports polars --- .requirements/docs.in | 1 + CHANGELOG.md | 2 +- janitor/io.py | 76 +++++++++++++++++++++++++++---------------- mkdocs.yml | 1 + 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/.requirements/docs.in b/.requirements/docs.in index f0d4afc29..b23e373aa 100644 --- a/.requirements/docs.in +++ b/.requirements/docs.in @@ -1,4 +1,5 @@ mkdocs +polars mkdocs-material mkdocstrings>=0.19.0 mkdocstrings-python diff --git a/CHANGELOG.md b/CHANGELOG.md index 9aea6a879..7e0651811 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## [Unreleased] -- [ENH] `read_commandline` function now supports polars - Issue #1352 +- [ENH] `xlsx_table` function now supports polars - Issue #1352 ## [v0.27.0] - 2024-03-21 diff --git a/janitor/io.py b/janitor/io.py index 4522be258..4829b3e1c 100644 --- a/janitor/io.py +++ b/janitor/io.py @@ -93,7 +93,7 @@ def read_csvs( return dfs_dict -def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping: +def read_commandline(cmd: str, **kwargs: Any) -> pd.DataFrame: """Read a CSV file based on a command-line command. For example, you may wish to run the following command on `sep-quarter.csv` @@ -111,42 +111,26 @@ def read_commandline(cmd: str, engine="pandas", **kwargs: Any) -> Mapping: ``` This function assumes that your command line command will return - an output that is parsable using the relevant engine and StringIO. - This function defaults to using `pd.read_csv` underneath the hood. - Keyword arguments are passed through as-is. + an output that is parsable using `pandas.read_csv` and StringIO. + We default to using `pd.read_csv` underneath the hood. + Keyword arguments are passed through to read_csv. Args: cmd: Shell command to preprocess a file on disk. - engine: DataFrame engine to process the output of the shell command. - Currently supports both pandas and polars. **kwargs: Keyword arguments that are passed through to - the engine's csv reader. - + `pd.read_csv()`. Returns: - A DataFrame parsed from the stdout of the underlying + A pandas DataFrame parsed from the stdout of the underlying shell. """ check("cmd", cmd, [str]) - if engine not in {"pandas", "polars"}: - raise ValueError("engine should be either pandas or polars.") # adding check=True ensures that an explicit, clear error # is raised, so that the user can see the reason for the failure outcome = subprocess.run( cmd, shell=True, capture_output=True, text=True, check=True ) - if engine == "polars": - try: - import polars as pl - except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - return pl.read_csv(StringIO(outcome.stdout), **kwargs) return pd.read_csv(StringIO(outcome.stdout), **kwargs) @@ -158,14 +142,15 @@ def xlsx_table( path: Union[str, IO, Workbook], sheetname: str = None, table: Union[str, list, tuple] = None, -) -> Union[pd.DataFrame, dict]: + engine: str = "pandas", +) -> Mapping: """Returns a DataFrame of values in a table in the Excel file. This applies to an Excel file, where the data range is explicitly specified as a Microsoft Excel table. 
If there is a single table in the sheet, or a string is provided
-    as an argument to the `table` parameter, a pandas DataFrame is returned;
+    as an argument to the `table` parameter, a DataFrame is returned;
     if there is more than one table in the sheet,
     and the `table` argument is `None`,
     or a list/tuple of names,
     a dictionary of DataFrames is returned,
     where the keys of the dictionary
     are the table names.

     Examples:
         >>> import pandas as pd
+        >>> import polars as pl
         >>> from janitor import xlsx_table
         >>> filename="../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"

         >>> xlsx_table(filename, table='dCategory')
            CategoryID       Category
         0           1       Beginner
         1           2       Advanced
         2           3      Freestyle
         3           4    Competition
         4           5  Long Distance

+        >>> xlsx_table(filename, table='dCategory', engine='polars')
+        shape: (5, 2)
+        ┌────────────┬───────────────┐
+        │ CategoryID ┆ Category      │
+        │ ---        ┆ ---           │
+        │ i64        ┆ str           │
+        ╞════════════╪═══════════════╡
+        │ 1          ┆ Beginner      │
+        │ 2          ┆ Advanced      │
+        │ 3          ┆ Freestyle     │
+        │ 4          ┆ Competition   │
+        │ 5          ┆ Long Distance │
+        └────────────┴───────────────┘
+
         Multiple tables:

         >>> out=xlsx_table(filename, table=["dCategory", "dSalesReps"])

     Args:
         path: Path to the Excel File. It can also be an openpyxl Workbook.
         table: Name of a table, or list of tables in the sheet.
+        engine: DataFrame engine. Should be either pandas or polars.
+            Defaults to pandas.

     Raises:
         AttributeError: If a workbook is provided, and is a ReadOnlyWorksheet.

         KeyError: If the provided table does not exist in the sheet.

     Returns:
-        A pandas DataFrame, or a dictionary of DataFrames,
+        A DataFrame, or a dictionary of DataFrames,
         if there are multiple arguments for the `table` parameter,
         or the argument to `table` is `None`.
     """  # noqa : E501

             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if engine not in {"pandas", "polars"}:
+        raise ValueError("engine should be one of pandas or polars.")
+    base_engine = pd
+    if engine == "polars":
+        try:
+            import polars as pl
+
+            base_engine = pl
+        except ImportError:
+            import_message(
+                submodule="polars",
+                package="polars",
+                conda_channel="conda-forge",
+                pip_install=True,
+            )
+
     if table is not None:
         check("table", table, [str, list, tuple])
         if isinstance(table, (list, tuple)):

 def _create_dataframe_or_dictionary_from_table(
             header_exist = contents.headerRowCount
             coordinates = contents.ref
             data = worksheet[coordinates]
-            data = [[entry.value for entry in cell] for cell in data]
             if header_exist:
                 header, *data = data
+                header = [cell.value for cell in header]
             else:
                 header = [f"C{num}" for num in range(len(data[0]))]
-            data = pd.DataFrame(data, columns=header)
-            dictionary[table_name] = data
+            data = zip(*data)
+            data = ([entry.value for entry in cell] for cell in data)
+            data = dict(zip(header, data))
+            dictionary[table_name] = base_engine.DataFrame(data)
         return dictionary

     worksheets = [worksheet for worksheet in ws if worksheet.tables.items()]
diff --git a/mkdocs.yml b/mkdocs.yml
index 639d71bea..a7545afc5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -45,6 +45,7 @@ nav:
   - Machine Learning: api/ml.md
   - Math: api/math.md
 # - PySpark: api/pyspark.md # will be added back later
+  - Polars: api/polars.md
   - Timeseries: api/timeseries.md
   - XArray: api/xarray.md
 - Development Guide: devguide.md
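
Taken together, the final patch makes the output frame type a single keyword switch. A short usage sketch mirroring the docstring example above (the test-data path comes from that docstring and assumes the pyjanitor repository layout; polars must be installed for the second call):

```python
from janitor import xlsx_table

filename = "../pyjanitor/tests/test_data/016-MSPTDA-Excel.xlsx"

# Default engine: the Excel table comes back as a pandas DataFrame.
categories_pd = xlsx_table(filename, table="dCategory")

# engine="polars": the same table, as a polars DataFrame.
categories_pl = xlsx_table(filename, table="dCategory", engine="polars")
```

Note the design choice in `_create_dataframe_or_dictionary_from_table`: the cell values are transposed into a `dict` mapping column names to value sequences, which both `pd.DataFrame` and `pl.DataFrame` accept, so `base_engine.DataFrame(data)` works unchanged for either engine.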