Skip to content

Commit

Permalink
feat(rust, python): Add cloudpickle for serializing python UDFs (#9921)
Browse files Browse the repository at this point in the history
  • Loading branch information
OneRaynyDay authored Jul 17, 2023
1 parent f93e796 commit f5a8c6c
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 6 deletions.
14 changes: 9 additions & 5 deletions polars/polars-lazy/polars-plan/src/dsl/python_udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ impl Serialize for PythonFunction {
S: Serializer,
{
Python::with_gil(|py| {
let pickle = PyModule::import(py, "pickle")
.expect("Unable to import 'pickle'")
let pickle = PyModule::import(py, "cloudpickle")
.or(PyModule::import(py, "pickle"))
.expect("Unable to import 'cloudpickle' or 'pickle'")
.getattr("dumps")
.unwrap();

Expand All @@ -83,7 +84,8 @@ impl<'a> Deserialize<'a> for PythonFunction {
let bytes = Vec::<u8>::deserialize(deserializer)?;

Python::with_gil(|py| {
let pickle = PyModule::import(py, "pickle")
let pickle = PyModule::import(py, "cloudpickle")
.or(PyModule::import(py, "pickle"))
.expect("Unable to import 'pickle'")
.getattr("loads")
.unwrap();
Expand Down Expand Up @@ -122,7 +124,8 @@ impl PythonUdfExpression {
let remainder = &buf[reader.position() as usize..];

Python::with_gil(|py| {
let pickle = PyModule::import(py, "pickle")
let pickle = PyModule::import(py, "cloudpickle")
.or(PyModule::import(py, "pickle"))
.expect("Unable to import 'pickle'")
.getattr("loads")
.unwrap();
Expand Down Expand Up @@ -169,7 +172,8 @@ impl SeriesUdf for PythonUdfExpression {
ciborium::ser::into_writer(&self.output_type, &mut *buf).unwrap();

Python::with_gil(|py| {
let pickle = PyModule::import(py, "pickle")
let pickle = PyModule::import(py, "cloudpickle")
.or(PyModule::import(py, "pickle"))
.expect("Unable to import 'pickle'")
.getattr("dumps")
.unwrap();
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/utils/show_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def _get_dependency_info() -> dict[str, str]:
# see the list of dependencies in pyproject.toml
opt_deps = [
"adbc_driver_sqlite",
"cloudpickle",
"connectorx",
"deltalake",
"fsspec",
Expand Down
3 changes: 2 additions & 1 deletion py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ pydantic = ["pydantic"]
sqlalchemy = ["sqlalchemy", "pandas"]
xlsxwriter = ["xlsxwriter"]
adbc = ["adbc_driver_sqlite"]
cloudpickle = ["cloudpickle"]
all = [
"polars[pyarrow,pandas,numpy,fsspec,connectorx,xlsx2csv,deltalake,timezone,matplotlib,pydantic,sqlalchemy,xlsxwriter,adbc]",
"polars[pyarrow,pandas,numpy,fsspec,connectorx,xlsx2csv,deltalake,timezone,matplotlib,pydantic,sqlalchemy,xlsxwriter,adbc,cloudpickle]",
]

[tool.mypy]
Expand Down
1 change: 1 addition & 0 deletions py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ xlsx2csv
XlsxWriter
adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows'
connectorx==0.3.2a5; python_version >= '3.8' # Latest full release is broken - unpin when 0.3.2 released
cloudpickle

# Tooling
hypothesis==6.79.4; python_version < '3.8'
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/unit/test_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,17 @@ def test_pickle_lazyframe_udf() -> None:

q = pickle.loads(b)
assert q.collect()["a"].to_list() == [2, 4, 6]


def test_pickle_lazyframe_nested_function_udf() -> None:
df = pl.DataFrame({"a": [1, 2, 3]})

# NOTE: This is only possible when we're using cloudpickle.
def inner_df_times2(df: pl.DataFrame) -> pl.DataFrame:
return df.select(pl.all() * 2)

q = df.lazy().map(inner_df_times2)
b = pickle.dumps(q)

q = pickle.loads(b)
assert q.collect()["a"].to_list() == [2, 4, 6]

0 comments on commit f5a8c6c

Please sign in to comment.