Skip to content

Commit

Permalink
convert integer columns with large values to str
Browse files Browse the repository at this point in the history
  • Loading branch information
mwouts committed Mar 26, 2023
1 parent a4a39f7 commit 5de5756
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 13 deletions.
7 changes: 4 additions & 3 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ ITables ChangeLog
1.5.2 (2023-03-??)
------------------

**Added**
- We have added a CI configuration that tests `itables` against pre-release versions of `pandas`

**Fixed**
- Integers that are too big for JavaScript are converted to `str` ([#152](https://github.com/mwouts/itables/issues/152))
- When downsampling occurs, the downsampling message is displayed even if the table only has a few rows

**Added**
- We have added a CI configuration that tests `itables` against pre-release versions of `pandas`


1.5.1 (2023-03-12)
------------------
Expand Down
10 changes: 10 additions & 0 deletions docs/polars_dataframes.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,13 @@ show(dict_of_test_dfs["long_column_names"], scrollX=True)
```{code-cell}
show(dict_of_test_dfs["named_column_index"])
```

## big_integers

```{code-cell}
import itables.options as opt
opt.warn_on_int_to_str_conversion = False
show(dict_of_test_dfs["big_integers"])
```
10 changes: 10 additions & 0 deletions docs/sample_dataframes.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,13 @@ show(dict_of_test_dfs["duplicated_columns"])
```{code-cell}
show(dict_of_test_dfs["named_column_index"])
```

## big_integers

```{code-cell}
import itables.options as opt
opt.warn_on_int_to_str_conversion = False
show(dict_of_test_dfs["big_integers"])
```
55 changes: 51 additions & 4 deletions itables/datatables_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
import pandas as pd
import pandas.io.formats.format as fmt

import itables.options as opt
try:
import polars as pl
except ImportError:
pl = None


JS_MAX_SAFE_INTEGER = 2**53 - 1
JS_MIN_SAFE_INTEGER = -(2**53 - 1)


def _format_column(x):
Expand Down Expand Up @@ -48,7 +55,7 @@ def default(self, obj):
warnings.warn(
"Unexpected type '{}' for '{}'.\n"
"You can report this warning at https://github.com/mwouts/itables/issues\n"
"To ignore the warning, please run:\n"
"To silence this warning, please run:\n"
" import itables.options as opt\n"
" opt.warn_on_unexpected_types = False".format(type(obj), obj),
category=RuntimeWarning,
Expand All @@ -58,8 +65,48 @@ def default(self, obj):
return TableValuesEncoder


def datatables_rows(df, count=None):
def convert_bigints_to_str(df, warn_on_int_to_str_conversion):
    """Return df with the integer columns that Javascript cannot hold cast to str.

    In Javascript, integers have to remain between JS_MIN_SAFE_INTEGER and
    JS_MAX_SAFE_INTEGER. A column with at least one value outside of that
    range is converted to strings so that no digit is lost.

    Parameters
    ----------
    df : pandas.DataFrame or polars.DataFrame
        The table to inspect. It is never modified: when a conversion is
        required, a new DataFrame is returned instead.
    warn_on_int_to_str_conversion : bool
        Should a warning listing the converted columns be emitted?

    Returns
    -------
    The input DataFrame if no column had to be converted, and otherwise
    a new DataFrame in which the offending columns contain strings.
    """
    converted = []
    is_private_copy = False

    # Columns are accessed by position since pandas column names can be duplicated
    for i, col in enumerate(df.columns):
        try:
            x = df.iloc[:, i]
        except AttributeError:
            # No .iloc accessor: the table is handled as a Polars DataFrame
            x = df[col]
            if (
                x.dtype in pl.INTEGER_DTYPES
                and ((x < JS_MIN_SAFE_INTEGER) | (x > JS_MAX_SAFE_INTEGER)).any()
            ):
                df = df.with_columns(pl.col(col).cast(pl.Utf8))
                converted.append(col)
            continue

        kind = x.dtype.kind
        if kind == "i":
            out_of_range = (x < JS_MIN_SAFE_INTEGER) | (x > JS_MAX_SAFE_INTEGER)
        elif kind == "u":
            # Unsigned integers can only exceed the upper bound
            out_of_range = x > JS_MAX_SAFE_INTEGER
        else:
            continue

        if not (~x.isnull() & out_of_range).any():
            continue

        as_text = x.astype(str)

        # Work on a copy (made only once) so that the DataFrame
        # of the caller keeps its integer columns
        if not is_private_copy:
            df = df.copy()
            is_private_copy = True

        if hasattr(df, "isetitem"):
            # Replaces the column rather than writing str values into an int array
            df.isetitem(i, as_text)
        else:
            # Older versions of pandas have no isetitem
            df.iloc[:, i] = as_text
        converted.append(col)

    if converted and warn_on_int_to_str_conversion:
        warnings.warn(
            "The columns {} contains integers that are too large for Javascript.\n"
            "They have been converted to str.\n"
            "To silence this warning, please run:\n"
            "    import itables.options as opt\n"
            "    opt.warn_on_int_to_str_conversion = False".format(converted)
        )

    return df


def datatables_rows(
df, count=None, warn_on_unexpected_types=False, warn_on_int_to_str_conversion=False
):
"""Format the values in the table and return the data, row by row, as requested by DataTables"""
df = convert_bigints_to_str(df, warn_on_int_to_str_conversion)

# We iterate over columns using an index rather than the column name
# to avoid an issue in case of duplicated column names #89
if count is None or len(df.columns) == count:
Expand All @@ -73,7 +120,7 @@ def datatables_rows(df, count=None):
try:
# Pandas DataFrame
data = list(zip(*(empty_columns + [_format_column(x) for _, x in df.items()])))
return json.dumps(data, cls=generate_encoder(opt.warn_on_unexpected_types))
return json.dumps(data, cls=generate_encoder(warn_on_unexpected_types))
except AttributeError:
# Polars DataFrame
data = list(df.iter_rows())
Expand Down
11 changes: 9 additions & 2 deletions itables/javascript.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
if (
option not in kwargs
and not option.startswith("__")
and option not in ["read_package_file", "warn_on_unexpected_types"]
and option not in ["read_package_file"]
):
kwargs[option] = getattr(opt, option)

Expand Down Expand Up @@ -274,6 +274,8 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
maxColumns = kwargs.pop("maxColumns", pd.get_option("display.max_columns") or 0)
eval_functions = kwargs.pop("eval_functions", None)
pre_dt_code = kwargs.pop("pre_dt_code")
warn_on_unexpected_types = kwargs.pop("warn_on_unexpected_types", False)
warn_on_int_to_str_conversion = kwargs.pop("warn_on_int_to_str_conversion", False)

if isinstance(df, (np.ndarray, np.generic)):
df = pd.DataFrame(df)
Expand Down Expand Up @@ -390,7 +392,12 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
# When the header has an extra column, we add
# an extra empty column in the table data #141
column_count = _column_count_in_header(table_header)
dt_data = datatables_rows(df, column_count)
dt_data = datatables_rows(
df,
column_count,
warn_on_unexpected_types=warn_on_unexpected_types,
warn_on_int_to_str_conversion=warn_on_int_to_str_conversion,
)

output = replace_value(
output, "const data = [];", "const data = {};".format(dt_data)
Expand Down
3 changes: 3 additions & 0 deletions itables/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,6 @@

"""Should a warning appear when we have to encode an unexpected type?"""
warn_on_unexpected_types = True

"""Should a warning appear when we convert large integers to str?"""
warn_on_int_to_str_conversion = True
14 changes: 14 additions & 0 deletions itables/sample_dfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,20 @@ def get_dict_of_test_dfs(N=100, M=100, polars=False):
),
),
"named_column_index": pd.DataFrame({"a": [1]}).rename_axis("columns", axis=1),
"big_integers": pd.DataFrame(
{
"bigint": [
1234567890123456789,
2345678901234567890,
3456789012345678901,
],
"expected": [
"1234567890123456789",
"2345678901234567890",
"3456789012345678901",
],
}
),
}

if polars:
Expand Down
18 changes: 17 additions & 1 deletion tests/test_datatables_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@
.T.reset_index(),
[[None, "a", 1, 2]],
),
(
pd.DataFrame(
{
"long": [
1234567890123456789,
2345678901234567890,
3456789012345678901,
]
}
),
'[["1234567890123456789"], ["2345678901234567890"], ["3456789012345678901"]]',
),
],
ids=[
"bool",
Expand All @@ -80,6 +92,7 @@
"object_dict",
"df_with_named_column_axis",
"transposed_df",
"big_integers",
],
)
def test_datatables_rows(df, expected):
Expand All @@ -95,7 +108,10 @@ def test_datatables_rows(df, expected):
)
column_count = _column_count_in_header(table_header)
actual = datatables_rows(df, count=column_count)
assert actual == json.dumps(expected)
if isinstance(expected, str):
assert actual == expected
else:
assert actual == json.dumps(expected)


@pytest.mark.skipif(
Expand Down
5 changes: 5 additions & 0 deletions tests/test_javascript.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ def test_warn_on_unexpected_types_not_in_html(df):
assert "warn_on_unexpected_types" not in html


def test_warn_on_int_to_str_conversion_not_in_html(df):
    """The warn_on_int_to_str_conversion option must not appear in the generated HTML."""
    option_name = "warn_on_int_to_str_conversion"
    assert option_name not in to_html_datatable(df)


def test_df_fits_in_one_page(df, lengthMenu):
kwargs = dict(lengthMenu=lengthMenu)
kwargs = {key: value for key, value in kwargs.items() if value is not None}
Expand Down
12 changes: 9 additions & 3 deletions tests/test_sample_dfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,15 @@ def kwargs_remove_none(**kwargs):
return {key: value for key, value in kwargs.items() if value is not None}


def test_show_test_dfs(df, lengthMenu):
def test_show_test_dfs(df, lengthMenu, monkeypatch):
if "bigint" in df.columns:
monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
show(df, **kwargs_remove_none(lengthMenu=lengthMenu))


def test_to_html_datatable(df, lengthMenu):
def test_to_html_datatable(df, lengthMenu, monkeypatch):
if "bigint" in df.columns:
monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
to_html_datatable(df, **kwargs_remove_none(lengthMenu=lengthMenu))


Expand All @@ -73,7 +77,9 @@ def test_format_column(series_name, series):


@pytest.mark.parametrize("series_name,series", get_dict_of_test_series().items())
def test_show_test_series(series_name, series):
def test_show_test_series(series_name, series, monkeypatch):
if "bigint" in series_name:
monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
show(series)


Expand Down

0 comments on commit 5de5756

Please sign in to comment.