diff --git a/doc/source/api.rst b/doc/source/api.rst
index 21d8cc62..48e4375d 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -12,6 +12,7 @@ API Reference
    :param not_null_columns: Columns which should be considered "NOT NULL" in the target Hyper database. By default, all columns are considered nullable
    :param json_columns: Columns to be written as a JSON data type
    :param geo_columns: Columns to be written as a GEOGRAPHY data type
+   :param process_params: Parameters to pass to the Hyper Process constructor.
 
 .. py:function:: frame_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, table: Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], return_type: Literal["pandas", "pyarrow", "polars"] = "pandas")
 
@@ -20,6 +21,7 @@ API Reference
    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
    :param table: Table to read.
    :param return_type: The type of DataFrame to be returned
+   :param process_params: Parameters to pass to the Hyper Process constructor.
 
 .. py:function:: frames_to_hyper(dict_of_frames: Dict[Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], pd.DataFrame], database: Union[str, pathlib.Path], *, table_mode: str = "w", not_null_columns: Optional[Iterable[str]] = None, json_columns: Optional[Iterable[str]] = None, geo_columns: Optional[Iterable[str]] = None,) -> None:
 
@@ -32,6 +34,7 @@ API Reference
    :param not_null_columns: Columns which should be considered "NOT NULL" in the target Hyper database. By default, all columns are considered nullable
    :param json_columns: Columns to be written as a JSON data type
    :param geo_columns: Columns to be written as a GEOGRAPHY data type
+   :param process_params: Parameters to pass to the Hyper Process constructor.
 
 .. py:function:: frames_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, return_type: Literal["pandas", "pyarrow", "polars"] = "pandas") -> dict:
 
@@ -39,6 +42,7 @@ API Reference
    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
    :param return_type: The type of DataFrame to be returned
+   :param process_params: Parameters to pass to the Hyper Process constructor.
 
 .. py:function:: frame_from_hyper_query(source: Union[str, pathlib.Path, tab_api.Connection], query: str, *, return_type: Literal["pandas", "polars", "pyarrow"] = "pandas",)
 
@@ -48,3 +52,4 @@ API Reference
    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
    :param query: SQL query to execute.
    :param return_type: The type of DataFrame to be returned
+   :param process_params: Parameters to pass to the Hyper Process constructor.
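Every function above gains the same ``process_params`` keyword. A minimal sketch of a read-side call, assuming the readers forward the keyword just as ``frame_to_hyper`` does in the tests below (the file name is a placeholder):

.. code-block:: python

    import pantab as pt

    # "log_config": "" matches the value pantab applies by default when the
    # key is absent (see reader.cpp below), so passing it explicitly is safe
    df = pt.frame_from_hyper(
        "example.hyper",
        table="animals",
        process_params={"log_config": ""},
    )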
diff --git a/doc/source/examples.rst b/doc/source/examples.rst
index 68382d1c..58cd1ba9 100644
--- a/doc/source/examples.rst
+++ b/doc/source/examples.rst
@@ -30,6 +30,98 @@ Reading a Hyper Extract
     df = pt.frame_from_hyper("example.hyper", table="animals")
     print(df)
 
+Overriding Nullability
+----------------------
+
+By default, all data written to the Hyper database will be nullable. If you want to enforce nullability checks in the Hyper database, pass the names of those columns to the ``not_null_columns`` argument.
+
+.. code-block:: python
+
+    import pandas as pd
+    import pantab as pt
+
+    df = pd.DataFrame([
+        ["dog", 4],
+        ["cat", 4],
+    ], columns=["animal", "num_of_legs"])
+
+    pt.frame_to_hyper(
+        df,
+        "example.hyper",
+        table="animals",
+        not_null_columns=["animal", "num_of_legs"]
+    )
+
+Writing JSON data
+-----------------
+
+The Hyper database can store JSON data. Although the Arrow specification has an extension type that can store JSON, support for it is still very limited.
+
+As such, if you want to store JSON in a Hyper database, you should send it as a string and add the column names to the ``json_columns`` argument.
+
+.. code-block:: python
+
+    import pandas as pd
+    import pantab as pt
+
+    df = pd.DataFrame({"json": ['{"key": "value"}']})
+
+    pt.frame_to_hyper(
+        df,
+        "example.hyper",
+        table="test",
+        json_columns=["json"]
+    )
+
+Geo Support
+-----------
+
+The Hyper database supports the storage of Geography data. The easiest way to write this data is to specify your input as a string in the `WKT `_ format and supply the name of the column(s) via the ``geo_columns`` argument.
+
+.. code-block:: python
+
+    import pandas as pd
+    import pantab as pt
+
+    df = pd.DataFrame(
+        {"geo": ["point(-122.338083 47.647528)", "point(11.584329 48.139257)"]}
+    )
+
+    pt.frame_to_hyper(
+        df,
+        "example.hyper",
+        table="test",
+        geo_columns=["geo"]
+    )
+
+When reading such data back from a Hyper database, it will be returned as a binary field containing WKB. You may write WKB back to Hyper using the same pattern shown above. If you need to translate between WKB and WKT, consider using a geo-dataframe library like `GeoArrow `_ or `GeoPandas `_.
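As a sketch of that last step (not part of this diff), converting the WKB bytes pantab returns back into WKT with GeoPandas might look like the following; the ``geo`` column name matches the example above:

.. code-block:: python

    import geopandas as gpd
    import pantab as pt

    df = pt.frame_from_hyper("example.hyper", table="test")

    # GeoSeries.from_wkb parses the binary WKB column; to_wkt() renders
    # the parsed geometries back as WKT strings
    df["geo"] = gpd.GeoSeries.from_wkb(df["geo"]).to_wkt()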
+Controlling Hyper Process Parameters
+------------------------------------
+
+pantab is responsible for starting and managing its own Hyper Process. Arguments to this process can be provided via the ``process_params`` parameter.
+
+The parameter users most commonly need to control is ``default_database_version``. While pantab specifies a value internally, older tools may not work with the default pantab provides, and some newer Hyper features may require a more recent version. For details on this parameter and its effects, please refer to Tableau's `default_database_version `_ documentation.
+
+For a full listing of valid parameters, please refer to the `Tableau Documentation `_.
+
+.. code-block:: python
+
+    import pandas as pd
+    import pantab as pt
+
+    # single precision float support requires database version 4+
+    df = pd.DataFrame(
+        {"float32": pd.Series([3.14], dtype="float32")}
+    )
+
+    pt.frame_to_hyper(
+        df,
+        "example.hyper",
+        table="test",
+        process_params={"default_database_version": "4"}
+    )
+
 Working with Schemas
 --------------------
diff --git a/src/pantab/reader.cpp b/src/pantab/reader.cpp
index 30a3e7a2..f1c01858 100644
--- a/src/pantab/reader.cpp
+++ b/src/pantab/reader.cpp
@@ -439,7 +439,7 @@ auto read_from_hyper_query(
   if (!process_params.count("log_config"))
     process_params["log_config"] = "";
   if (!process_params.count("default_database_version"))
-    process_params["default_database_version"] = "4";
+    process_params["default_database_version"] = "2";
 
   const hyperapi::HyperProcess hyper{
       hyperapi::Telemetry::DoNotSendUsageDataToTableau, "",
diff --git a/src/pantab/writer.cpp b/src/pantab/writer.cpp
index ea7fe199..b5b0e027 100644
--- a/src/pantab/writer.cpp
+++ b/src/pantab/writer.cpp
@@ -736,7 +736,7 @@ void write_to_hyper(
   if (!process_params.count("log_config"))
     process_params["log_config"] = "";
   if (!process_params.count("default_database_version"))
-    process_params["default_database_version"] = "4";
+    process_params["default_database_version"] = "2";
 
   const hyperapi::HyperProcess hyper{
       hyperapi::Telemetry::DoNotSendUsageDataToTableau, "",
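With the built-in default lowered to ``2``, callers that need a newer file format must opt in per call. A hedged sketch using ``frames_to_hyper``, assuming it forwards ``process_params`` the same way the other writers do (table names and data are placeholders):

.. code-block:: python

    import pandas as pd
    import pantab as pt

    df = pd.DataFrame({"animal": ["dog", "cat"]})

    # opt back in to the newer format version for every table written
    pt.frames_to_hyper(
        {"animals": df, "animals_copy": df},
        "example.hyper",
        process_params={"default_database_version": "4"},
    )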
diff --git a/tests/test_decimal.py b/tests/test_decimal.py
index 1b30c438..83d1e39c 100644
--- a/tests/test_decimal.py
+++ b/tests/test_decimal.py
@@ -62,6 +62,7 @@ def test_decimal_roundtrip(tmp_hyper, compat):
         tbl,
         tmp_hyper,
         table="decimals",
+        process_params={"default_database_version": "3"},
     )
 
     result = pt.frame_from_hyper(tmp_hyper, table="decimals", return_type="pyarrow")
@@ -131,6 +132,7 @@ def test_decimal_negative(tmp_hyper, compat):
         tbl,
         tmp_hyper,
         table="decimals",
+        process_params={"default_database_version": "3"},
     )
 
     result = pt.frame_from_hyper(tmp_hyper, table="decimals", return_type="pyarrow")
diff --git a/tests/test_reader.py b/tests/test_reader.py
index bf2c96b3..7eae0d71 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -9,7 +9,12 @@
 
 
 def test_read_doesnt_modify_existing_file(frame, tmp_hyper):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
     last_modified = tmp_hyper.stat().st_mtime
 
     # Try out our read methods
@@ -56,7 +61,12 @@ def test_reads_nullable_columns(tmp_hyper, compat):
 
 
 def test_read_query(frame, tmp_hyper):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
 
     query = "SELECT int16 AS i, '_' || int32 AS _i2 FROM test"
     result = pt.frame_from_hyper_query(tmp_hyper, query)
@@ -137,13 +147,23 @@ def test_reader_handles_duplicate_columns(tmp_hyper):
 
 
 def test_frame_from_hyper_doesnt_generate_hyperd_log(frame, tmp_hyper):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
     pt.frame_from_hyper(tmp_hyper, table="test")
     assert not pathlib.Path("hyperd.log").is_file()
 
 
 def test_frames_from_hyper_doesnt_generate_hyperd_log(frame, tmp_hyper):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
     pt.frames_from_hyper(tmp_hyper)
     assert not pathlib.Path("hyperd.log").is_file()
diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py
index 4893cd2d..8adc485d 100644
--- a/tests/test_roundtrip.py
+++ b/tests/test_roundtrip.py
@@ -25,6 +25,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
     )
     pt.frame_to_hyper(
         frame,
@@ -33,6 +34,7 @@ def test_basic(frame, roundtripped, tmp_hyper, table_name, table_mode, compat):
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
     )
 
     result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)
@@ -66,6 +68,7 @@ def test_multiple_tables(
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
     )
     pt.frames_to_hyper(
         {table_name: frame, "table2": frame},
@@ -73,6 +76,7 @@ def test_multiple_tables(
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
     )
 
     result = pt.frames_from_hyper(tmp_hyper, return_type=return_type)
@@ -120,6 +124,7 @@ def test_empty_roundtrip(
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
     )
     pt.frame_to_hyper(
         empty,
@@ -128,6 +133,7 @@ def test_empty_roundtrip(
         table_mode=table_mode,
         json_columns={"json"},
         geo_columns={"geography"},
+        process_params={"default_database_version": "4"},
    )
 
     result = pt.frame_from_hyper(tmp_hyper, table=table_name, return_type=return_type)
@@ -171,7 +177,12 @@ def test_roundtrip_works_without_tableauhyperapi(frame, tmp_hyper, monkeypatch):
         if mod.startswith(libname):
             monkeypatch.delitem(sys.modules, mod)
 
-    pt.frame_to_hyper(frame, tmp_hyper, table="foo")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="foo",
+        process_params={"default_database_version": "4"},
+    )
     pt.frames_from_hyper(tmp_hyper)
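``frame_from_hyper_query`` is documented above to take ``process_params`` as well; a speculative sketch mirroring ``test_read_query`` (file name and query are placeholders):

.. code-block:: python

    import pantab as pt

    # read-side calls should accept the same keyword, per the api.rst entries
    result = pt.frame_from_hyper_query(
        "example.hyper",
        "SELECT int16 AS i FROM test",
        process_params={"log_config": ""},
    )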
diff --git a/tests/test_writer.py b/tests/test_writer.py
index d4cdd7dc..10fcebe9 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -32,21 +32,43 @@ def test_append_mode_raises_column_dtype_mismatch(
     new_dtype, hyper_type_name, frame, tmp_hyper, table_name, compat
 ):
     frame = compat.select_columns(frame, ["int16"])
-    pt.frame_to_hyper(frame, tmp_hyper, table=table_name)
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table=table_name,
+        process_params={"default_database_version": "4"},
+    )
 
     frame = compat.cast_column_to_type(frame, "int16", new_dtype)
     msg = f"Column type mismatch at index 0; new: {hyper_type_name} old: SMALLINT"
     with pytest.raises(ValueError, match=msg):
-        pt.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
+        pt.frame_to_hyper(
+            frame,
+            tmp_hyper,
+            table=table_name,
+            table_mode="a",
+            process_params={"default_database_version": "4"},
+        )
 
 
 def test_append_mode_raises_ncolumns_mismatch(frame, tmp_hyper, table_name, compat):
-    pt.frame_to_hyper(frame, tmp_hyper, table=table_name)
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table=table_name,
+        process_params={"default_database_version": "4"},
+    )
 
     frame = compat.drop_columns(frame, ["int16"])
     msg = "Number of columns"
     with pytest.raises(ValueError, match=msg):
-        pt.frame_to_hyper(frame, tmp_hyper, table=table_name, table_mode="a")
+        pt.frame_to_hyper(
+            frame,
+            tmp_hyper,
+            table=table_name,
+            table_mode="a",
+            process_params={"default_database_version": "4"},
+        )
 
 
 @pytest.mark.parametrize("container_t", [set, list, tuple])
@@ -156,6 +178,7 @@ def test_failed_write_doesnt_overwrite_file(
         tmp_hyper,
         table="test",
         table_mode=table_mode,
+        process_params={"default_database_version": "4"},
     )
     last_modified = tmp_hyper.stat().st_mtime
 
@@ -220,7 +243,12 @@ def test_utc_bug(tmp_hyper):
 
 
 def test_uint32_actually_writes_as_oid(tmp_hyper, frame):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
     with tab_api.HyperProcess(
         tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU,
         parameters={"log_config": ""},
@@ -237,7 +265,12 @@ def test_uint32_actually_writes_as_oid(tmp_hyper, frame):
 
 @pytest.mark.parametrize("container_t", [set, list, tuple])
 def test_geo_and_json_columns_writes_proper_type(tmp_hyper, frame, container_t):
-    pt.frame_to_hyper(frame, tmp_hyper, table="test")
+    pt.frame_to_hyper(
+        frame,
+        tmp_hyper,
+        table="test",
+        process_params={"default_database_version": "4"},
+    )
 
     with tab_api.HyperProcess(
         tab_api.Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU,
@@ -260,6 +293,7 @@ def test_geo_and_json_columns_writes_proper_type(tmp_hyper, frame, container_t):
         table="test",
         json_columns=container_t(("json",)),
         geo_columns=container_t(("geography",)),
+        process_params={"default_database_version": "4"},
     )
 
     with tab_api.HyperProcess(
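The ``test_decimal.py`` changes earlier pass ``"3"`` rather than ``"4"``, which suggests high-precision decimals need at least database version 3. A hedged sketch of the equivalent user code (column data and precision are illustrative only):

.. code-block:: python

    import decimal

    import pyarrow as pa
    import pantab as pt

    tbl = pa.table(
        {"dec": pa.array([decimal.Decimal("123.45")], type=pa.decimal128(38, 2))}
    )

    # assumption drawn from the tests above: 38-digit precision requires
    # default_database_version >= 3, so opt in explicitly
    pt.frame_to_hyper(
        tbl,
        "decimals.hyper",
        table="decimals",
        process_params={"default_database_version": "3"},
    )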