convert integer columns with large values to str

mwouts · Mar 26, 2023 · 5de5756 · 5de5756
1 parent a4a39f7
commit 5de5756
Show file tree

Hide file tree

Showing 10 changed files with 132 additions and 13 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -4,12 +4,13 @@ ITables ChangeLog
 1.5.2 (2023-03-??)
 ------------------
 
-**Added**
-- We have added a CI configuration where we test `itables` against `pandas` in pre-release versions
-
 **Fixed**
+- Integers that are too big for JavaScript are converted to str ([#152](https://github.com/mwouts/itables/issues/152))
 - If a downsampling occurs, the downsampling message is displayed even if the table only has a few rows
 
+**Added**
+- We have added a CI configuration where we test `itables` against `pandas` in pre-release versions
+
 
 1.5.1 (2023-03-12)
 ------------------

diff --git a/docs/polars_dataframes.md b/docs/polars_dataframes.md
@@ -140,3 +140,13 @@ show(dict_of_test_dfs["long_column_names"], scrollX=True)
 ```{code-cell}
 show(dict_of_test_dfs["named_column_index"])
 ```
+
+## big_integers
+
+```{code-cell}
+import itables.options as opt
+
+opt.warn_on_int_to_str_conversion = False
+
+show(dict_of_test_dfs["big_integers"])
+```
diff --git a/docs/sample_dataframes.md b/docs/sample_dataframes.md
@@ -151,3 +151,13 @@ show(dict_of_test_dfs["duplicated_columns"])
 ```{code-cell}
 show(dict_of_test_dfs["named_column_index"])
 ```
+
+## big_integers
+
+```{code-cell}
+import itables.options as opt
+
+opt.warn_on_int_to_str_conversion = False
+
+show(dict_of_test_dfs["big_integers"])
+```
diff --git a/itables/datatables_format.py b/itables/datatables_format.py
@@ -5,7 +5,14 @@
 import pandas as pd
 import pandas.io.formats.format as fmt
 
-import itables.options as opt
+try:
+    import polars as pl
+except ImportError:
+    pl = None
+
+
+JS_MAX_SAFE_INTEGER = 2**53 - 1
+JS_MIN_SAFE_INTEGER = -(2**53 - 1)
 
 
 def _format_column(x):
@@ -48,7 +55,7 @@ def default(self, obj):
                 warnings.warn(
                     "Unexpected type '{}' for '{}'.\n"
                     "You can report this warning at https://github.com/mwouts/itables/issues\n"
-                    "To ignore the warning, please run:\n"
+                    "To silence this warning, please run:\n"
                     "    import itables.options as opt\n"
                     "    opt.warn_on_unexpected_types = False".format(type(obj), obj),
                     category=RuntimeWarning,
@@ -58,8 +65,48 @@ def default(self, obj):
     return TableValuesEncoder
 
 
-def datatables_rows(df, count=None):
+def convert_bigints_to_str(df, warn_on_int_to_str_conversion):
+    """Convert the integer columns that Javascript cannot represent exactly to str.
+
+    In Javascript, integers have to remain between JS_MIN_SAFE_INTEGER and
+    JS_MAX_SAFE_INTEGER. Every integer column (signed or unsigned) that holds
+    at least one value outside of that range is converted to str.
+
+    Parameters
+    ----------
+    df : pandas or polars DataFrame
+        The table to be displayed. It is not modified: when a conversion is
+        required, it is done on a copy (pandas) or on a new frame (polars).
+    warn_on_int_to_str_conversion : bool
+        If true, a warning that lists the converted columns is issued.
+
+    Returns
+    -------
+    The input DataFrame when no column had to be converted, otherwise a new
+    DataFrame in which the converted columns contain str values.
+    """
+    converted = []
+    copied = False
+    for i, col in enumerate(df.columns):
+        try:
+            # Columns are accessed by position rather than by name since
+            # pandas allows duplicated column names. Only this access is
+            # guarded, so that an AttributeError raised by the pandas
+            # processing below is not mistaken for "this is a polars frame".
+            x = df.iloc[:, i]
+        except AttributeError:
+            # No .iloc accessor: the DataFrame is handled as a polars one
+            x = df[col]
+            if (
+                x.dtype in pl.INTEGER_DTYPES
+                and ((x < JS_MIN_SAFE_INTEGER) | (x > JS_MAX_SAFE_INTEGER)).any()
+            ):
+                # with_columns returns a new DataFrame
+                df = df.with_columns(pl.col(col).cast(pl.Utf8))
+                converted.append(col)
+            continue
+
+        kind = x.dtype.kind
+        if kind == "i":
+            out_of_range = (x < JS_MIN_SAFE_INTEGER) | (x > JS_MAX_SAFE_INTEGER)
+        elif kind == "u":
+            # Unsigned integers can only exceed the upper bound
+            out_of_range = x > JS_MAX_SAFE_INTEGER
+        else:
+            continue
+
+        if (~x.isnull() & out_of_range).any():
+            if not copied:
+                # Do not alter the DataFrame that was passed by the caller
+                df = df.copy()
+                copied = True
+            as_str = x.astype(str)
+            if hasattr(df, "isetitem"):
+                # isetitem replaces the column with a new array rather than
+                # trying to write str values into the integer column in place
+                df.isetitem(i, as_str)
+            else:
+                # Older versions of pandas
+                df.iloc[:, i] = as_str
+            converted.append(col)
+
+    if converted and warn_on_int_to_str_conversion:
+        warnings.warn(
+            "The columns {} contains integers that are too large for Javascript.\n"
+            "They have been converted to str.\n"
+            "To silence this warning, please run:\n"
+            "    import itables.options as opt\n"
+            "    opt.warn_on_int_to_str_conversion = False".format(converted)
+        )
+
+    return df
+
+
+def datatables_rows(
+    df, count=None, warn_on_unexpected_types=False, warn_on_int_to_str_conversion=False
+):
     """Format the values in the table and return the data, row by row, as requested by DataTables"""
+    df = convert_bigints_to_str(df, warn_on_int_to_str_conversion)
+
     # We iterate over columns using an index rather than the column name
     # to avoid an issue in case of duplicated column names #89
     if count is None or len(df.columns) == count:
@@ -73,7 +120,7 @@ def datatables_rows(df, count=None):
     try:
         # Pandas DataFrame
         data = list(zip(*(empty_columns + [_format_column(x) for _, x in df.items()])))
-        return json.dumps(data, cls=generate_encoder(opt.warn_on_unexpected_types))
+        return json.dumps(data, cls=generate_encoder(warn_on_unexpected_types))
     except AttributeError:
         # Polars DataFrame
         data = list(df.iter_rows())

diff --git a/itables/javascript.py b/itables/javascript.py
@@ -245,7 +245,7 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
         if (
             option not in kwargs
             and not option.startswith("__")
-            and option not in ["read_package_file", "warn_on_unexpected_types"]
+            and option not in ["read_package_file"]
         ):
             kwargs[option] = getattr(opt, option)
 
@@ -274,6 +274,8 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
     maxColumns = kwargs.pop("maxColumns", pd.get_option("display.max_columns") or 0)
     eval_functions = kwargs.pop("eval_functions", None)
     pre_dt_code = kwargs.pop("pre_dt_code")
+    warn_on_unexpected_types = kwargs.pop("warn_on_unexpected_types", False)
+    warn_on_int_to_str_conversion = kwargs.pop("warn_on_int_to_str_conversion", False)
 
     if isinstance(df, (np.ndarray, np.generic)):
         df = pd.DataFrame(df)
@@ -390,7 +392,12 @@ def to_html_datatable(df=None, caption=None, tableId=None, connected=True, **kwa
     # When the header has an extra column, we add
     # an extra empty column in the table data #141
     column_count = _column_count_in_header(table_header)
-    dt_data = datatables_rows(df, column_count)
+    dt_data = datatables_rows(
+        df,
+        column_count,
+        warn_on_unexpected_types=warn_on_unexpected_types,
+        warn_on_int_to_str_conversion=warn_on_int_to_str_conversion,
+    )
 
     output = replace_value(
         output, "const data = [];", "const data = {};".format(dt_data)

diff --git a/itables/options.py b/itables/options.py
@@ -46,3 +46,6 @@
 
 """Should a warning appear when we have to encode an unexpected type?"""
 warn_on_unexpected_types = True
+
+"""Should a warning appear when we convert large integers to str?"""
+warn_on_int_to_str_conversion = True
diff --git a/itables/sample_dfs.py b/itables/sample_dfs.py
@@ -240,6 +240,20 @@ def get_dict_of_test_dfs(N=100, M=100, polars=False):
             ),
         ),
         "named_column_index": pd.DataFrame({"a": [1]}).rename_axis("columns", axis=1),
+        "big_integers": pd.DataFrame(
+            {
+                "bigint": [
+                    1234567890123456789,
+                    2345678901234567890,
+                    3456789012345678901,
+                ],
+                "expected": [
+                    "1234567890123456789",
+                    "2345678901234567890",
+                    "3456789012345678901",
+                ],
+            }
+        ),
     }
 
     if polars:

diff --git a/tests/test_datatables_format.py b/tests/test_datatables_format.py
@@ -64,6 +64,18 @@
             .T.reset_index(),
             [[None, "a", 1, 2]],
         ),
+        (
+            pd.DataFrame(
+                {
+                    "long": [
+                        1234567890123456789,
+                        2345678901234567890,
+                        3456789012345678901,
+                    ]
+                }
+            ),
+            '[["1234567890123456789"], ["2345678901234567890"], ["3456789012345678901"]]',
+        ),
     ],
     ids=[
         "bool",
@@ -80,6 +92,7 @@
         "object_dict",
         "df_with_named_column_axis",
         "transposed_df",
+        "big_integers",
     ],
 )
 def test_datatables_rows(df, expected):
@@ -95,7 +108,10 @@ def test_datatables_rows(df, expected):
     )
     column_count = _column_count_in_header(table_header)
     actual = datatables_rows(df, count=column_count)
-    assert actual == json.dumps(expected)
+    if isinstance(expected, str):
+        assert actual == expected
+    else:
+        assert actual == json.dumps(expected)
 
 
 @pytest.mark.skipif(

diff --git a/tests/test_javascript.py b/tests/test_javascript.py
@@ -6,6 +6,11 @@ def test_warn_on_unexpected_types_not_in_html(df):
     assert "warn_on_unexpected_types" not in html
 
 
+def test_warn_on_int_to_str_conversion_not_in_html(df):
+    """The name of the warn_on_int_to_str_conversion option must not appear in the HTML."""
+    rendered = to_html_datatable(df)
+    assert "warn_on_int_to_str_conversion" not in rendered
+
+
 def test_df_fits_in_one_page(df, lengthMenu):
     kwargs = dict(lengthMenu=lengthMenu)
     kwargs = {key: value for key, value in kwargs.items() if value is not None}

diff --git a/tests/test_sample_dfs.py b/tests/test_sample_dfs.py
@@ -52,11 +52,15 @@ def kwargs_remove_none(**kwargs):
     return {key: value for key, value in kwargs.items() if value is not None}
 
 
-def test_show_test_dfs(df, lengthMenu):
+def test_show_test_dfs(df, lengthMenu, monkeypatch):
+    if "bigint" in df.columns:
+        monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
     show(df, **kwargs_remove_none(lengthMenu=lengthMenu))
 
 
-def test_to_html_datatable(df, lengthMenu):
+def test_to_html_datatable(df, lengthMenu, monkeypatch):
+    if "bigint" in df.columns:
+        monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
     to_html_datatable(df, **kwargs_remove_none(lengthMenu=lengthMenu))
 
 
@@ -73,7 +77,9 @@ def test_format_column(series_name, series):
 
 
 @pytest.mark.parametrize("series_name,series", get_dict_of_test_series().items())
-def test_show_test_series(series_name, series):
+def test_show_test_series(series_name, series, monkeypatch):
+    if "bigint" in series_name:
+        monkeypatch.setattr("itables.options.warn_on_int_to_str_conversion", False)
     show(series)