Skip to content

Commit

Permalink
fix: Set default limit for String column display to 30 and fix edge c…
Browse files Browse the repository at this point in the history
…ases (#15934)
  • Loading branch information
stinodego authored Apr 28, 2024
1 parent 14b352f commit 031c926
Show file tree
Hide file tree
Showing 9 changed files with 137 additions and 84 deletions.
111 changes: 62 additions & 49 deletions crates/polars-core/src/fmt.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
use std::borrow::Cow;
use std::fmt::{Debug, Display, Formatter, Write};
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
use std::sync::RwLock;
use std::{fmt, str};
Expand Down Expand Up @@ -28,7 +29,11 @@ use crate::prelude::*;

// Note: see https://github.com/pola-rs/polars/pull/13699 for the rationale
// behind choosing 10 as the default value for default number of rows displayed
const LIMIT: usize = 10;
const DEFAULT_ROW_LIMIT: usize = 10;
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
const DEFAULT_COL_LIMIT: usize = 8;
const DEFAULT_STR_LEN_LIMIT: usize = 30;
const DEFAULT_LIST_LEN_LIMIT: usize = 3;

#[derive(Copy, Clone)]
#[repr(u8)]
Expand Down Expand Up @@ -86,6 +91,40 @@ pub fn set_trim_decimal_zeros(trim: Option<bool>) {
TRIM_DECIMAL_ZEROS.store(trim.unwrap_or(false), Ordering::Relaxed)
}

/// Parses an environment variable value.
fn parse_env_var<T: FromStr>(name: &str) -> Option<T> {
std::env::var(name).ok().and_then(|v| v.parse().ok())
}
/// Parses an environment variable value as a limit or set a default.
///
/// Negative values (e.g. -1) are parsed as 'no limit' or [`usize::MAX`].
fn parse_env_var_limit(name: &str, default: usize) -> usize {
parse_env_var(name).map_or(
default,
|n: i64| {
if n < 0 {
usize::MAX
} else {
n as usize
}
},
)
}

fn get_row_limit() -> usize {
parse_env_var_limit(FMT_MAX_ROWS, DEFAULT_ROW_LIMIT)
}
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
fn get_col_limit() -> usize {
parse_env_var_limit(FMT_MAX_COLS, DEFAULT_COL_LIMIT)
}
fn get_str_len_limit() -> usize {
parse_env_var_limit(FMT_STR_LEN, DEFAULT_STR_LEN_LIMIT)
}
fn get_list_len_limit() -> usize {
parse_env_var_limit(FMT_TABLE_CELL_LIST_LEN, DEFAULT_LIST_LEN_LIMIT)
}

macro_rules! format_array {
($f:ident, $a:expr, $dtype:expr, $name:expr, $array_type:expr) => {{
write!(
Expand All @@ -96,43 +135,38 @@ macro_rules! format_array {
$name,
$dtype
)?;
let truncate = matches!($a.dtype(), DataType::String);
let truncate_len = if truncate {
std::env::var(FMT_STR_LEN)
.as_deref()
.unwrap_or("")
.parse()
.unwrap_or(15)
} else {
15
};
let limit: usize = {
let limit = std::env::var(FMT_MAX_ROWS)
.as_deref()
.unwrap_or("")
.parse()
.map_or(LIMIT, |n: i64| if n < 0 { $a.len() } else { n as usize });
std::cmp::min(limit, $a.len())

let truncate = match $a.dtype() {
DataType::String => true,
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(_, _) | DataType::Enum(_, _) => true,
_ => false,
};
let truncate_len = if truncate { get_str_len_limit() } else { 0 };

let write_fn = |v, f: &mut Formatter| -> fmt::Result {
if truncate {
let v = format!("{}", v);
let v_trunc = &v[..v
let v_no_quotes = &v[1..v.len() - 1];
let v_trunc = &v_no_quotes[..v_no_quotes
.char_indices()
.take(truncate_len)
.last()
.map(|(i, c)| i + c.len_utf8())
.unwrap_or(0)];
if v == v_trunc {
if v_no_quotes == v_trunc {
write!(f, "\t{}\n", v)?;
} else {
write!(f, "\t{}…\n", v_trunc)?;
write!(f, "\t\"{}…\n", v_trunc)?;
}
} else {
write!(f, "\t{}\n", v)?;
};
Ok(())
};

let limit = get_row_limit();

if $a.len() > limit {
let half = limit / 2;
let rest = limit % 2;
Expand Down Expand Up @@ -166,7 +200,7 @@ fn format_object_array(
) -> fmt::Result {
match object.dtype() {
DataType::Object(inner_type, _) => {
let limit = std::cmp::min(LIMIT, object.len());
let limit = std::cmp::min(DEFAULT_ROW_LIMIT, object.len());
write!(
f,
"shape: ({},)\n{}: '{}' [o][{}]\n[\n",
Expand Down Expand Up @@ -232,7 +266,7 @@ where
T: PolarsObject,
{
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
let limit = std::cmp::min(LIMIT, self.len());
let limit = std::cmp::min(DEFAULT_ROW_LIMIT, self.len());
let inner_type = T::type_name();
write!(
f,
Expand Down Expand Up @@ -497,14 +531,6 @@ fn fmt_df_shape((shape0, shape1): &(usize, usize)) -> String {
)
}

fn get_str_width() -> usize {
std::env::var(FMT_STR_LEN)
.as_deref()
.unwrap_or("")
.parse()
.unwrap_or(32)
}

impl Display for DataFrame {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
Expand All @@ -514,19 +540,10 @@ impl Display for DataFrame {
self.columns.iter().all(|s| s.len() == height),
"The column lengths in the DataFrame are not equal."
);
let str_truncate = get_str_width();

let max_n_cols = std::env::var(FMT_MAX_COLS)
.as_deref()
.unwrap_or("")
.parse()
.map_or(8, |n: i64| if n < 0 { self.width() } else { n as usize });

let max_n_rows = std::env::var(FMT_MAX_ROWS)
.as_deref()
.unwrap_or("")
.parse()
.map_or(LIMIT, |n: i64| if n < 0 { height } else { n as usize });
let max_n_cols = get_col_limit();
let max_n_rows = get_row_limit();
let str_truncate = get_str_len_limit();

let (n_first, n_last) = if self.width() > max_n_cols {
((max_n_cols + 1) / 2, max_n_cols / 2)
Expand Down Expand Up @@ -965,7 +982,7 @@ fn format_duration(f: &mut Formatter, v: i64, sizes: &[i64], names: &[&str]) ->
}

fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result {
let width = get_str_width() * 2;
let width = get_str_len_limit() * 2;
write!(f, "b\"")?;

for b in bytes.iter().take(width) {
Expand Down Expand Up @@ -1109,11 +1126,7 @@ impl Series {
return "[]".to_owned();
}

let max_items = std::env::var(FMT_TABLE_CELL_LIST_LEN)
.as_deref()
.unwrap_or("")
.parse()
.map_or(3, |n: i64| if n < 0 { self.len() } else { n as usize });
let max_items = get_list_len_limit();

match max_items {
0 => "[…]".to_owned(),
Expand Down
16 changes: 8 additions & 8 deletions py-polars/polars/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,14 +663,14 @@ def set_fmt_str_lengths(cls, n: int | None) -> type[Config]:
... )
>>> df.with_columns(pl.col("txt").str.len_bytes().alias("len"))
shape: (2, 2)
┌───────────────────────────────────┬─────┐
│ txt ┆ len │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════════════════════════════════╪═════╡
│ Play it, Sam. Play 'As Time Goes… ┆ 37 │
│ This is the beginning of a beaut… ┆ 48 │
└───────────────────────────────────┴─────┘
┌─────────────────────────────────┬─────┐
│ txt ┆ len │
│ --- ┆ --- │
│ str ┆ u32 │
╞═════════════════════════════════╪═════╡
│ Play it, Sam. Play 'As Time Go… ┆ 37 │
│ This is the beginning of a bea… ┆ 48 │
└─────────────────────────────────┴─────┘
>>> with pl.Config(fmt_str_lengths=50):
... print(df)
shape: (2, 1)
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/dataframe/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def write_header(self) -> None:

def write_body(self) -> None:
"""Write the body of an HTML table."""
str_lengths = int(os.environ.get("POLARS_FMT_STR_LEN", "15"))
str_len_limit = int(os.environ.get("POLARS_FMT_STR_LEN", default=30))
with Tag(self.elements, "tbody"):
for r in self.row_idx:
with Tag(self.elements, "tr"):
Expand All @@ -118,7 +118,7 @@ def write_body(self) -> None:
else:
series = self.df[:, c]
self.elements.append(
html.escape(series._s.get_fmt(r, str_lengths))
html.escape(series._s.get_fmt(r, str_len_limit))
)

def write(self, inner: str) -> None:
Expand Down
18 changes: 9 additions & 9 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,15 +1669,15 @@ def extract_groups(self, pattern: str) -> Expr:
... ).with_columns(name=pl.col("captures").struct["1"].str.to_uppercase())
... )
shape: (3, 3)
┌───────────────────────────────────┬───────────────────────┬──────────┐
│ url ┆ captures ┆ name │
│ --- ┆ --- ┆ --- │
│ str ┆ struct[2] ┆ str │
╞═══════════════════════════════════╪═══════════════════════╪══════════╡
│ http://vote.com/ballon_dor?candi… ┆ {"messi","python"} ┆ MESSI │
│ http://vote.com/ballon_dor?candi… ┆ {"weghorst","polars"} ┆ WEGHORST │
│ http://vote.com/ballon_dor?error… ┆ {null,null} ┆ null │
└───────────────────────────────────┴───────────────────────┴──────────┘
┌─────────────────────────────────┬───────────────────────┬──────────┐
│ url ┆ captures ┆ name │
│ --- ┆ --- ┆ --- │
│ str ┆ struct[2] ┆ str │
╞═════════════════════════════════╪═══════════════════════╪══════════╡
│ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │
│ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
│ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │
└─────────────────────────────────┴───────────────────────┴──────────┘
"""
return wrap_expr(self._pyexpr.str_extract_groups(pattern))

Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,8 +1582,8 @@ def to_titlecase(self) -> Series:
shape: (2,)
Series: 'sing' [str]
[
"Welcome To My
"There's No Tur…
"Welcome To My World"
"There's No Turning Back"
]
"""

Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/series/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def json_encode(self) -> Series:
shape: (2,)
Series: 'a' [str]
[
"{"a":[1,2],"b"
"{"a":[9,1,3],"
"{"a":[1,2],"b":[45]}"
"{"a":[9,1,3],"b":null}"
]
"""
18 changes: 10 additions & 8 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,24 +125,26 @@ impl PySeries {
})
}

fn get_fmt(&self, index: usize, str_lengths: usize) -> String {
let val = format!("{}", self.series.get(index).unwrap());
/// Returns the string format of a single element of the Series.
fn get_fmt(&self, index: usize, str_len_limit: usize) -> String {
let v = format!("{}", self.series.get(index).unwrap());
if let DataType::String | DataType::Categorical(_, _) | DataType::Enum(_, _) =
self.series.dtype()
{
let v_trunc = &val[..val
let v_no_quotes = &v[1..v.len() - 1];
let v_trunc = &v_no_quotes[..v_no_quotes
.char_indices()
.take(str_lengths)
.take(str_len_limit)
.last()
.map(|(i, c)| i + c.len_utf8())
.unwrap_or(0)];
if val == v_trunc {
val
if v_no_quotes == v_trunc {
v
} else {
format!("{v_trunc}…")
format!("\"{v_trunc}…")
}
} else {
val
v
}
}

Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def test_set_tbl_width_chars() -> None:
"this is 10": [4, 5, 6],
}
)
assert max(len(line) for line in str(df).split("\n")) == 70
assert max(len(line) for line in str(df).split("\n")) == 68

pl.Config.set_tbl_width_chars(60)
assert max(len(line) for line in str(df).split("\n")) == 60
Expand Down
Loading

0 comments on commit 031c926

Please sign in to comment.