Skip to content

Commit

Permalink
Move and update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mcrumiller committed Sep 26, 2023
1 parent 71a70a2 commit 84f5a6c
Show file tree
Hide file tree
Showing 3 changed files with 272 additions and 212 deletions.
272 changes: 272 additions & 0 deletions py-polars/tests/unit/operations/test_cut.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
import pytest

import polars as pl
from polars.testing import assert_frame_equal, assert_series_equal

# Positive infinity: the break point expected for values that fall in the
# open-ended last bin in the include_breaks tests of this module.
inf = float("inf")


def test_cut() -> None:
    """Check the default interval labels of ``cut`` for a Series and an expression."""
    lower, middle, upper = "(-inf, -1]", "(-1, 1]", "(1, inf]"

    # Series API
    values = pl.Series("foo", [-2, -1, 0, 1, 2])
    series_result = values.cut([-1, 1])
    series_expected = pl.Series(
        "foo",
        [lower, lower, middle, middle, upper],
        dtype=pl.Categorical,
    )
    assert_series_equal(series_result, series_expected, categorical_as_str=True)

    # Expression API, applied through DataFrame.select
    frame = pl.DataFrame(values)
    frame_result = frame.select(pl.col("foo").cut([-1, 1]))
    frame_expected = pl.DataFrame(series_expected)
    assert_frame_equal(frame_result, frame_expected, categorical_as_str=True)


def test_cut_with_labels() -> None:
    """Check that user-supplied labels are used as the categories by ``cut``."""
    data = [-2, -1, 0, 1, 2]
    binned = ["a", "a", "b", "b", "c"]

    # Series API
    values = pl.Series("foo", data)
    series_result = values.cut([-1, 1], labels=["a", "b", "c"])
    series_expected = pl.Series("foo", binned, dtype=pl.Categorical)
    assert_series_equal(series_result, series_expected, categorical_as_str=True)

    # Expression API, appended as an extra "cut" column
    frame = pl.DataFrame(values)
    cut_expr = pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut")
    frame_result = frame.with_columns(cut_expr)
    frame_expected = pl.DataFrame(
        {"foo": data, "cut": pl.Series(binned, dtype=pl.Categorical)}
    )
    assert_frame_equal(frame_result, frame_expected, categorical_as_str=True)


def test_cut_include_breaks() -> None:
    """Check the struct output of ``cut(..., include_breaks=True)``.

    Covers the Series API (with custom labels) and the expression API in
    both eager and lazy execution, where the struct is unnested into columns.
    """
    values = pl.Series("a", [-2, -1, 0, 1, 2])

    # Series API: compared against a struct built from a two-column frame
    struct_result = values.cut(
        [-1.5, 0.25, 1.0],
        labels=["a", "b", "c", "d"],
        include_breaks=True,
    )
    struct_source = pl.DataFrame(
        {
            "break_point": [-1.5, 0.25, 0.25, 1.0, inf],
            "category": ["a", "b", "b", "c", "d"],
        },
        schema_overrides={"category": pl.Categorical},
    )
    struct_expected = struct_source.to_struct("a")
    assert_series_equal(struct_result, struct_expected, categorical_as_str=True)

    # Expression API: shared input, expectation and expression
    frame = pl.DataFrame(values)
    lower, middle, upper = "(-inf, -1]", "(-1, 1]", "(1, inf]"
    frame_expected = pl.DataFrame(
        {
            "a": [-2, -1, 0, 1, 2],
            "brk": [-1.0, -1.0, 1.0, 1.0, inf],
            "a_bin": pl.Series(
                [lower, lower, middle, middle, upper],
                dtype=pl.Categorical,
            ),
        }
    )
    schema_expected = {"a": pl.Int64, "brk": pl.Float64, "a_bin": pl.Categorical}
    cut_expr = pl.col("a").cut([-1, 1], include_breaks=True).alias("cut")

    # eager execution
    eager_result = frame.with_columns(cut_expr).unnest("cut")
    assert eager_result.schema == schema_expected
    assert_frame_equal(eager_result, frame_expected, categorical_as_str=True)

    # lazy execution
    lazy_result = frame.lazy().with_columns(cut_expr).unnest("cut").collect()
    assert lazy_result.schema == schema_expected
    assert_frame_equal(lazy_result, frame_expected, categorical_as_str=True)


def test_qcut() -> None:
    """Check ``qcut`` with explicit quantile probabilities and default labels."""
    lower, middle, upper = "(-inf, -1]", "(-1, 1]", "(1, inf]"

    # Series API
    values = pl.Series("foo", [-2, -1, 0, 1, 2])
    series_expected = pl.Series(
        "foo",
        [lower, lower, middle, middle, upper],
        dtype=pl.Categorical,
    )
    series_result = values.qcut([0.25, 0.75])
    assert_series_equal(series_result, series_expected, categorical_as_str=True)

    # Expression API with pre-defined quantile probabilities
    frame = pl.DataFrame(values)
    frame_expected = pl.DataFrame(
        {"foo": [-2, -1, 0, 1, 2], "qcut": series_expected}
    )
    qcut_expr = pl.col("foo").qcut([0.25, 0.75]).alias("qcut")

    # eager execution
    eager_result = frame.with_columns(qcut_expr)
    assert_frame_equal(eager_result, frame_expected, categorical_as_str=True)

    # lazy execution
    lazy_result = frame.lazy().with_columns(qcut_expr).collect()
    assert lazy_result.schema == {"foo": pl.Int64, "qcut": pl.Categorical}
    assert_frame_equal(lazy_result, frame_expected, categorical_as_str=True)


def test_qcut_with_labels() -> None:
    """Check ``qcut`` with custom labels.

    Exercises explicit quantile probabilities and an integer quantile count,
    each through eager and lazy execution.
    """
    data = [-2, -1, 0, 1, 2]
    schema_expected = {"foo": pl.Int64, "qcut": pl.Categorical}

    # Series API
    values = pl.Series("foo", data)
    series_expected = pl.Series(
        "foo", ["a", "a", "b", "b", "c"], dtype=pl.Categorical
    )
    series_result = values.qcut([0.25, 0.75], labels=["a", "b", "c"])
    assert_series_equal(series_result, series_expected, categorical_as_str=True)

    frame = pl.DataFrame(values)

    # --- pre-defined quantile probabilities ---
    frame_expected = pl.DataFrame({"foo": data, "qcut": series_expected})
    explicit_expr = (
        pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut")
    )

    # eager execution
    eager_result = frame.with_columns(explicit_expr)
    assert_frame_equal(eager_result, frame_expected, categorical_as_str=True)

    # lazy execution
    lazy_result = frame.lazy().with_columns(explicit_expr).collect()
    assert lazy_result.schema == schema_expected
    assert_frame_equal(lazy_result, frame_expected, categorical_as_str=True)

    # --- uniform quantile probabilities (integer number of quantiles) ---
    frame_expected = pl.DataFrame(
        {
            "foo": data,
            "qcut": pl.Series(
                ["low", "low", "high", "high", "high"], dtype=pl.Categorical
            ),
        }
    )
    uniform_expr = (
        pl.col("foo")
        .qcut(2, labels=["low", "high"], left_closed=True)
        .alias("qcut")
    )

    # eager execution
    eager_result = frame.with_columns(uniform_expr)
    assert_frame_equal(eager_result, frame_expected, categorical_as_str=True)

    # lazy execution
    lazy_result = frame.lazy().with_columns(uniform_expr).collect()
    assert lazy_result.schema == schema_expected
    assert_frame_equal(lazy_result, frame_expected, categorical_as_str=True)


def test_qcut_include_breaks() -> None:
    """Check ``qcut(..., include_breaks=True)`` unnested in eager and lazy mode."""
    data = [-2, -1, 0, 1, 2]
    frame = pl.DataFrame({"foo": data})

    lower, middle, upper = "(-inf, -1]", "(-1, 1]", "(1, inf]"
    frame_expected = pl.DataFrame(
        {
            "foo": [-2, -1, 0, 1, 2],
            "brk": [-1.0, -1.0, 1.0, 1.0, inf],
            "foo_bin": pl.Series(
                [lower, lower, middle, middle, upper],
                dtype=pl.Categorical,
            ),
        }
    )
    qcut_expr = pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut")

    # eager execution
    eager_result = frame.with_columns(qcut_expr).unnest("qcut")
    assert_frame_equal(eager_result, frame_expected, categorical_as_str=True)

    # lazy execution; the schema is also checked here
    lazy_result = frame.lazy().with_columns(qcut_expr).unnest("qcut").collect()
    assert lazy_result.schema == {
        "foo": pl.Int64,
        "brk": pl.Float64,
        "foo_bin": pl.Categorical,
    }
    assert_frame_equal(lazy_result, frame_expected, categorical_as_str=True)


def test_cut_null_values() -> None:
    """Check that null inputs to ``cut`` are expected to stay null in the output."""
    values = pl.Series([-1.0, None, 1.0, 2.0, None, 8.0, 4.0])

    binned = values.cut([1.5, 5.0], labels=["a", "b", "c"])

    assert_series_equal(
        binned,
        pl.Series(["a", None, "a", "b", None, "c", "b"], dtype=pl.Categorical),
        categorical_as_str=True,
    )


def test_cut_deprecated_as_series() -> None:
    """Check that ``as_series=False`` warns as deprecated and returns a frame."""
    values = pl.Series("a", [tenths / 10 for tenths in range(-30, 30, 5)])

    with pytest.deprecated_call():
        frame = values.cut(breaks=[-1, 1], as_series=False)

    assert frame.shape == (12, 3)

    # Keep only rows whose break point is below 1e9, which drops the rows
    # of the open-ended last bin.
    bounded_rows = frame.filter(pl.col("break_point") < 1e9)
    expected = {
        "a": [-3.0, -2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0],
        "break_point": [-1.0] * 5 + [1.0] * 4,
        "category": ["(-inf, -1]"] * 5 + ["(-1, 1]"] * 4,
    }
    assert bounded_rows.to_dict(False) == expected


def test_cut_deprecated_label_name() -> None:
    """Check that each label-renaming keyword of ``cut`` warns as deprecated."""
    values = pl.Series([1.0, 2.0])

    # Each keyword is checked in its own deprecated_call context.
    for keyword in ("category_label", "break_point_label"):
        with pytest.deprecated_call():
            values.cut([0.1], **{keyword: "x"})
80 changes: 0 additions & 80 deletions py-polars/tests/unit/series/test_cut.py

This file was deleted.

Loading

0 comments on commit 84f5a6c

Please sign in to comment.