From 031c9263dd649d01ac9284dfcbb9656343eaf0a8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 28 Apr 2024 15:02:46 +0200 Subject: [PATCH] fix: Set default limit for String column display to 30 and fix edge cases (#15934) --- crates/polars-core/src/fmt.rs | 111 ++++++++++++++++------------ py-polars/polars/config.py | 16 ++-- py-polars/polars/dataframe/_html.py | 4 +- py-polars/polars/expr/string.py | 18 ++--- py-polars/polars/series/string.py | 4 +- py-polars/polars/series/struct.py | 4 +- py-polars/src/series/mod.rs | 18 +++-- py-polars/tests/unit/test_config.py | 2 +- py-polars/tests/unit/test_format.py | 44 ++++++++++- 9 files changed, 137 insertions(+), 84 deletions(-) diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 2ced3591c0a3..5cc4d6be8d00 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -1,6 +1,7 @@ #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter, Write}; +use std::str::FromStr; use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; use std::sync::RwLock; use std::{fmt, str}; @@ -28,7 +29,11 @@ use crate::prelude::*; // Note: see https://github.com/pola-rs/polars/pull/13699 for the rationale // behind choosing 10 as the default value for default number of rows displayed -const LIMIT: usize = 10; +const DEFAULT_ROW_LIMIT: usize = 10; +#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] +const DEFAULT_COL_LIMIT: usize = 8; +const DEFAULT_STR_LEN_LIMIT: usize = 30; +const DEFAULT_LIST_LEN_LIMIT: usize = 3; #[derive(Copy, Clone)] #[repr(u8)] @@ -86,6 +91,40 @@ pub fn set_trim_decimal_zeros(trim: Option) { TRIM_DECIMAL_ZEROS.store(trim.unwrap_or(false), Ordering::Relaxed) } +/// Parses an environment variable value. +fn parse_env_var(name: &str) -> Option { + std::env::var(name).ok().and_then(|v| v.parse().ok()) +} +/// Parses an environment variable value as a limit or set a default. +/// +/// Negative values (e.g. -1) are parsed as 'no limit' or [`usize::MAX`]. +fn parse_env_var_limit(name: &str, default: usize) -> usize { + parse_env_var(name).map_or( + default, + |n: i64| { + if n < 0 { + usize::MAX + } else { + n as usize + } + }, + ) +} + +fn get_row_limit() -> usize { + parse_env_var_limit(FMT_MAX_ROWS, DEFAULT_ROW_LIMIT) +} +#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] +fn get_col_limit() -> usize { + parse_env_var_limit(FMT_MAX_COLS, DEFAULT_COL_LIMIT) +} +fn get_str_len_limit() -> usize { + parse_env_var_limit(FMT_STR_LEN, DEFAULT_STR_LEN_LIMIT) +} +fn get_list_len_limit() -> usize { + parse_env_var_limit(FMT_TABLE_CELL_LIST_LEN, DEFAULT_LIST_LEN_LIMIT) +} + macro_rules! format_array { ($f:ident, $a:expr, $dtype:expr, $name:expr, $array_type:expr) => {{ write!( @@ -96,43 +135,38 @@ macro_rules! format_array { $name, $dtype )?; - let truncate = matches!($a.dtype(), DataType::String); - let truncate_len = if truncate { - std::env::var(FMT_STR_LEN) - .as_deref() - .unwrap_or("") - .parse() - .unwrap_or(15) - } else { - 15 - }; - let limit: usize = { - let limit = std::env::var(FMT_MAX_ROWS) - .as_deref() - .unwrap_or("") - .parse() - .map_or(LIMIT, |n: i64| if n < 0 { $a.len() } else { n as usize }); - std::cmp::min(limit, $a.len()) + + let truncate = match $a.dtype() { + DataType::String => true, + #[cfg(feature = "dtype-categorical")] + DataType::Categorical(_, _) | DataType::Enum(_, _) => true, + _ => false, }; + let truncate_len = if truncate { get_str_len_limit() } else { 0 }; + let write_fn = |v, f: &mut Formatter| -> fmt::Result { if truncate { let v = format!("{}", v); - let v_trunc = &v[..v + let v_no_quotes = &v[1..v.len() - 1]; + let v_trunc = &v_no_quotes[..v_no_quotes .char_indices() .take(truncate_len) .last() .map(|(i, c)| i + c.len_utf8()) .unwrap_or(0)]; - if v == v_trunc { + if v_no_quotes == v_trunc { write!(f, "\t{}\n", v)?; } else { - write!(f, "\t{}…\n", v_trunc)?; + write!(f, "\t\"{}…\n", v_trunc)?; } } else { write!(f, "\t{}\n", v)?; }; Ok(()) }; + + let limit = get_row_limit(); + if $a.len() > limit { let half = limit / 2; let rest = limit % 2; @@ -166,7 +200,7 @@ fn format_object_array( ) -> fmt::Result { match object.dtype() { DataType::Object(inner_type, _) => { - let limit = std::cmp::min(LIMIT, object.len()); + let limit = std::cmp::min(DEFAULT_ROW_LIMIT, object.len()); write!( f, "shape: ({},)\n{}: '{}' [o][{}]\n[\n", @@ -232,7 +266,7 @@ where T: PolarsObject, { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - let limit = std::cmp::min(LIMIT, self.len()); + let limit = std::cmp::min(DEFAULT_ROW_LIMIT, self.len()); let inner_type = T::type_name(); write!( f, @@ -497,14 +531,6 @@ fn fmt_df_shape((shape0, shape1): &(usize, usize)) -> String { ) } -fn get_str_width() -> usize { - std::env::var(FMT_STR_LEN) - .as_deref() - .unwrap_or("") - .parse() - .unwrap_or(32) -} - impl Display for DataFrame { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] @@ -514,19 +540,10 @@ impl Display for DataFrame { self.columns.iter().all(|s| s.len() == height), "The column lengths in the DataFrame are not equal." ); - let str_truncate = get_str_width(); - let max_n_cols = std::env::var(FMT_MAX_COLS) - .as_deref() - .unwrap_or("") - .parse() - .map_or(8, |n: i64| if n < 0 { self.width() } else { n as usize }); - - let max_n_rows = std::env::var(FMT_MAX_ROWS) - .as_deref() - .unwrap_or("") - .parse() - .map_or(LIMIT, |n: i64| if n < 0 { height } else { n as usize }); + let max_n_cols = get_col_limit(); + let max_n_rows = get_row_limit(); + let str_truncate = get_str_len_limit(); let (n_first, n_last) = if self.width() > max_n_cols { ((max_n_cols + 1) / 2, max_n_cols / 2) @@ -965,7 +982,7 @@ fn format_duration(f: &mut Formatter, v: i64, sizes: &[i64], names: &[&str]) -> } fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { - let width = get_str_width() * 2; + let width = get_str_len_limit() * 2; write!(f, "b\"")?; for b in bytes.iter().take(width) { @@ -1109,11 +1126,7 @@ impl Series { return "[]".to_owned(); } - let max_items = std::env::var(FMT_TABLE_CELL_LIST_LEN) - .as_deref() - .unwrap_or("") - .parse() - .map_or(3, |n: i64| if n < 0 { self.len() } else { n as usize }); + let max_items = get_list_len_limit(); match max_items { 0 => "[…]".to_owned(), diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index d714a87aa31b..0c09f3a3fcbf 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -663,14 +663,14 @@ def set_fmt_str_lengths(cls, n: int | None) -> type[Config]: ... ) >>> df.with_columns(pl.col("txt").str.len_bytes().alias("len")) shape: (2, 2) - ┌───────────────────────────────────┬─────┐ - │ txt ┆ len │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════════════════════════════════╪═════╡ - │ Play it, Sam. Play 'As Time Goes… ┆ 37 │ - │ This is the beginning of a beaut… ┆ 48 │ - └───────────────────────────────────┴─────┘ + ┌─────────────────────────────────┬─────┐ + │ txt ┆ len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═════════════════════════════════╪═════╡ + │ Play it, Sam. Play 'As Time Go… ┆ 37 │ + │ This is the beginning of a bea… ┆ 48 │ + └─────────────────────────────────┴─────┘ >>> with pl.Config(fmt_str_lengths=50): ... print(df) shape: (2, 1) diff --git a/py-polars/polars/dataframe/_html.py b/py-polars/polars/dataframe/_html.py index 38a77cec60a8..64c0847f250e 100644 --- a/py-polars/polars/dataframe/_html.py +++ b/py-polars/polars/dataframe/_html.py @@ -107,7 +107,7 @@ def write_header(self) -> None: def write_body(self) -> None: """Write the body of an HTML table.""" - str_lengths = int(os.environ.get("POLARS_FMT_STR_LEN", "15")) + str_len_limit = int(os.environ.get("POLARS_FMT_STR_LEN", default=30)) with Tag(self.elements, "tbody"): for r in self.row_idx: with Tag(self.elements, "tr"): @@ -118,7 +118,7 @@ def write_body(self) -> None: else: series = self.df[:, c] self.elements.append( - html.escape(series._s.get_fmt(r, str_lengths)) + html.escape(series._s.get_fmt(r, str_len_limit)) ) def write(self, inner: str) -> None: diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 47319b9f2893..77044dcf4bc6 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1669,15 +1669,15 @@ def extract_groups(self, pattern: str) -> Expr: ... ).with_columns(name=pl.col("captures").struct["1"].str.to_uppercase()) ... ) shape: (3, 3) - ┌───────────────────────────────────┬───────────────────────┬──────────┐ - │ url ┆ captures ┆ name │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[2] ┆ str │ - ╞═══════════════════════════════════╪═══════════════════════╪══════════╡ - │ http://vote.com/ballon_dor?candi… ┆ {"messi","python"} ┆ MESSI │ - │ http://vote.com/ballon_dor?candi… ┆ {"weghorst","polars"} ┆ WEGHORST │ - │ http://vote.com/ballon_dor?error… ┆ {null,null} ┆ null │ - └───────────────────────────────────┴───────────────────────┴──────────┘ + ┌─────────────────────────────────┬───────────────────────┬──────────┐ + │ url ┆ captures ┆ name │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[2] ┆ str │ + ╞═════════════════════════════════╪═══════════════════════╪══════════╡ + │ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │ + │ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │ + │ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │ + └─────────────────────────────────┴───────────────────────┴──────────┘ """ return wrap_expr(self._pyexpr.str_extract_groups(pattern)) diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 9979d1041622..9854f1fac33c 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1582,8 +1582,8 @@ def to_titlecase(self) -> Series: shape: (2,) Series: 'sing' [str] [ - "Welcome To My … - "There's No Tur… + "Welcome To My World" + "There's No Turning Back" ] """ diff --git a/py-polars/polars/series/struct.py b/py-polars/polars/series/struct.py index ceb843078c15..3d4c164b0056 100644 --- a/py-polars/polars/series/struct.py +++ b/py-polars/polars/series/struct.py @@ -136,7 +136,7 @@ def json_encode(self) -> Series: shape: (2,) Series: 'a' [str] [ - "{"a":[1,2],"b"… - "{"a":[9,1,3],"… + "{"a":[1,2],"b":[45]}" + "{"a":[9,1,3],"b":null}" ] """ diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index 773c39e3adfd..2ffda7f01d68 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -125,24 +125,26 @@ impl PySeries { }) } - fn get_fmt(&self, index: usize, str_lengths: usize) -> String { - let val = format!("{}", self.series.get(index).unwrap()); + /// Returns the string format of a single element of the Series. + fn get_fmt(&self, index: usize, str_len_limit: usize) -> String { + let v = format!("{}", self.series.get(index).unwrap()); if let DataType::String | DataType::Categorical(_, _) | DataType::Enum(_, _) = self.series.dtype() { - let v_trunc = &val[..val + let v_no_quotes = &v[1..v.len() - 1]; + let v_trunc = &v_no_quotes[..v_no_quotes .char_indices() - .take(str_lengths) + .take(str_len_limit) .last() .map(|(i, c)| i + c.len_utf8()) .unwrap_or(0)]; - if val == v_trunc { - val + if v_no_quotes == v_trunc { + v } else { - format!("{v_trunc}…") + format!("\"{v_trunc}…") } } else { - val + v } } diff --git a/py-polars/tests/unit/test_config.py b/py-polars/tests/unit/test_config.py index 00b454c7e717..41e71be0dd2a 100644 --- a/py-polars/tests/unit/test_config.py +++ b/py-polars/tests/unit/test_config.py @@ -339,7 +339,7 @@ def test_set_tbl_width_chars() -> None: "this is 10": [4, 5, 6], } ) - assert max(len(line) for line in str(df).split("\n")) == 70 + assert max(len(line) for line in str(df).split("\n")) == 68 pl.Config.set_tbl_width_chars(60) assert max(len(line) for line in str(df).split("\n")) == 60 diff --git a/py-polars/tests/unit/test_format.py b/py-polars/tests/unit/test_format.py index c403e2af7de4..748e3de54b19 100644 --- a/py-polars/tests/unit/test_format.py +++ b/py-polars/tests/unit/test_format.py @@ -1,5 +1,6 @@ from __future__ import annotations +import string from decimal import Decimal as D from typing import TYPE_CHECKING, Any, Iterator @@ -26,7 +27,7 @@ def _environ() -> Iterator[None]: """shape: (1,) Series: 'foo' [str] [ - "Somelongstring… + "Somelongstringt… ] """, ["Somelongstringto eeat wit me oundaf"], @@ -36,7 +37,7 @@ def _environ() -> Iterator[None]: """shape: (1,) Series: 'foo' [str] [ - "😀😁😂😃😄😅😆😇😈😉😊😋😌😎… + "😀😁😂😃😄😅😆😇😈😉😊😋😌😎😏… ] """, ["😀😁😂😃😄😅😆😇😈😉😊😋😌😎😏😐😑😒😓"], @@ -78,11 +79,48 @@ def test_fmt_series( capfd: pytest.CaptureFixture[str], expected: str, values: list[Any] ) -> None: s = pl.Series(name="foo", values=values) - print(s) + with pl.Config(fmt_str_lengths=15): + print(s) out, err = capfd.readouterr() assert out == expected +def test_fmt_series_string_truncate_default(capfd: pytest.CaptureFixture[str]) -> None: + values = [ + string.ascii_lowercase + "123", + string.ascii_lowercase + "1234", + string.ascii_lowercase + "12345", + ] + s = pl.Series(name="foo", values=values) + print(s) + out, _ = capfd.readouterr() + expected = """shape: (3,) +Series: 'foo' [str] +[ + "abcdefghijklmnopqrstuvwxyz123" + "abcdefghijklmnopqrstuvwxyz1234" + "abcdefghijklmnopqrstuvwxyz1234… +] +""" + assert out == expected + + +@pytest.mark.parametrize( + "dtype", [pl.String, pl.Categorical, pl.Enum(["abc", "abcd", "abcde"])] +) +def test_fmt_series_string_truncate_cat( + dtype: pl.PolarsDataType, capfd: pytest.CaptureFixture[str] +) -> None: + s = pl.Series(name="foo", values=["abc", "abcd", "abcde"], dtype=dtype) + with pl.Config(fmt_str_lengths=4): + print(s) + out, _ = capfd.readouterr() + result = [s.strip() for s in out.split("\n")[3:6]] + expected = ['"abc"', '"abcd"', '"abcd…'] + print(result) + assert result == expected + + @pytest.mark.parametrize( ("values", "dtype", "expected"), [