Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(python): Improved import polars timing test; now much more consistent/reliable #12478

Merged
merged 1 commit into from
Nov 15, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 59 additions & 34 deletions py-polars/tests/unit/test_polars_import.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,83 @@
from __future__ import annotations

import compileall
import subprocess
import sys
from pathlib import Path

import pytest

import polars as pl
from polars import selectors as cs

# set a maximum cutoff at 0.2 secs; note that we are typically much faster
# than this (more like ~0.07 secs, depending on hardware), but we allow a
# margin of error to account for frequent noise from slow/contended CI.
MAX_ALLOWED_IMPORT_TIME = 200_000 # << microseconds


def _import_time_from_frame(tm: pl.DataFrame) -> int:
return int(
tm.filter(pl.col("import").str.strip_chars() == "polars")
.select("cumulative_time")
.item()
)


def _import_timings() -> bytes:
# assemble suitable command to get polars module import timing;
# run in a separate process to ensure clean timing results.
cmd = f'{sys.executable} -X importtime -c "import polars"'
return (
output = (
subprocess.run(cmd, shell=True, capture_output=True)
.stderr.replace(b"import time:", b"")
.strip()
)
return output


def _import_timings_as_frame(best_of: int) -> pl.DataFrame:
# create master frame as minimum of 'best_of' timings.
import_timings = [
pl.read_csv(
source=_import_timings(),
separator="|",
has_header=True,
new_columns=["own_time", "cumulative_time", "import"],
).with_columns(
pl.col(["own_time", "cumulative_time"]).str.strip_chars().cast(pl.Int32)
def _import_timings_as_frame(n_tries: int) -> tuple[pl.DataFrame, int]:
import_timings = []
for _ in range(n_tries):
df_import = (
pl.read_csv(
source=_import_timings(),
separator="|",
has_header=True,
new_columns=["own_time", "cumulative_time", "import"],
)
.with_columns(cs.ends_with("_time").str.strip_chars().cast(pl.UInt32))
.select("import", "own_time", "cumulative_time")
.reverse()
)
for _ in range(best_of)
]
return (
sorted(
import_timings,
key=lambda tm: int(tm["cumulative_time"].max()), # type: ignore[arg-type]
)[0]
.reverse()
.select("import", "own_time", "cumulative_time")
)
polars_import_time = _import_time_from_frame(df_import)
if polars_import_time < MAX_ALLOWED_IMPORT_TIME:
return df_import, polars_import_time

import_timings.append(df_import)

# note: if a qualifying import time was already achieved, we won't get here
df_fastest_import = sorted(import_timings, key=_import_time_from_frame)[0]
return df_fastest_import, _import_time_from_frame(df_fastest_import)


@pytest.mark.skipif(sys.platform == "win32", reason="Unreliable on Windows")
@pytest.mark.slow()
def test_polars_import() -> None:
# note: take the fastest of several runs to reduce noise.
df_import = _import_timings_as_frame(best_of=3)
# up-front compile '.py' -> '.pyc' before timing
polars_path = Path(pl.__file__).parent
compileall.compile_dir(polars_path, quiet=1)

with pl.Config() as cfg:
# get a complete view of what's going on in case of failure
cfg.set_tbl_rows(250)
cfg.set_fmt_str_lengths(100)
cfg.set_tbl_hide_dataframe_shape()
# note: reduce noise by allowing up to 'n' tries (but return immediately if/when
# a qualifying time is achieved, so we don't waste time running unnecessary tests)
df_import, polars_import_time = _import_timings_as_frame(n_tries=5)

with pl.Config(
# get a complete view of what's going on in case of failure
tbl_rows=250,
fmt_str_lengths=100,
tbl_hide_dataframe_shape=True,
):
# ensure that we have not broken lazy-loading (numpy, pandas, pyarrow, etc).
lazy_modules = [
dep for dep in pl.dependencies.__all__ if not dep.startswith("_")
Expand All @@ -62,9 +88,8 @@ def test_polars_import() -> None:
assert not_imported, f"{if_err}\n{df_import}"

# ensure that we do not have an import speed regression.
polars_import = df_import.filter(pl.col("import").str.strip_chars() == "polars")
polars_import_time = polars_import["cumulative_time"].item()
assert isinstance(polars_import_time, int)

if_err = f"Possible import speed regression; took {polars_import_time//1_000}ms"
assert polars_import_time < 200_000, f"{if_err}\n{df_import}"
if polars_import_time > MAX_ALLOWED_IMPORT_TIME:
import_time_ms = polars_import_time // 1_000
raise AssertionError(
f"Possible import speed regression; took {import_time_ms}ms\n{df_import}"
)