Use positional arguments when calling kll update (#1545)

## Description

Fixes for long-running instances of the rolling logger:

* Use positional arguments when calling kll update
* Add a load test to the rolling logger
* Update usage stats: add `segment_on_column`, update `condition_validator`

```
poetry run pytest -o log_level=INFO -o log_cli=true tests/api/logger/test_rolling.py::test_rolling_logger_load_test --load
```

- [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md) and the [Code of Conduct](CODE_OF_CONDUCT.md).
whylabs · Jul 18, 2024 · 13e089c · 13e089c
1 parent b6a6dae
commit 13e089c
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 3 deletions.
diff --git a/python/tests/api/logger/test_rolling.py b/python/tests/api/logger/test_rolling.py
@@ -2,6 +2,7 @@
 import math
 import os
 from datetime import datetime, timezone
+from logging import getLogger
 from os import listdir
 from os.path import isfile
 from typing import Any, Optional, Tuple
@@ -22,6 +23,8 @@
 from whylogs.core.schema import DatasetSchema
 from whylogs.core.segmentation_partition import segment_on_column
 
+TEST_LOGGER = getLogger(__name__)
+
 
 class TimerContext:
     def __enter__(self) -> "TimerContext":
@@ -265,6 +268,50 @@ def test_rolling_row_messages_with_segments(tmp_path: Any) -> None:
     assert rolling_callback.call_count == 2
 
 
+@pytest.mark.load
+def test_rolling_logger_load_test(tmp_path: Any) -> None:
+    import gc
+    import tracemalloc
+
+    num_messages = 10
+    messages = [{"col1": i % 2, "col2": i * i * 1.2, "col3": "a"} for i in range(num_messages)]
+    tracemalloc.start()
+    # Create an aggressively rolling logger (interval=1, when="S") to probe for memory pressure in
+    # long-running rolling logger instances over time. Don't do this in actual integrations.
+    rolling_logger = why.logger(
+        mode="rolling",
+        interval=1,
+        when="S",
+        base_name="test_base_name",
+    )
+    rolling_logger.append_writer("local", base_dir=tmp_path)
+    # Load parameters: 10,000 iterations x 10 messages per iteration -> 100k log calls in total.
+    test_iterations = 10000
+
+    def loop_test(rolling_logger, messages, test_iterations):
+        for i in range(test_iterations):
+            if i % 1000 == 0:
+                print(f"Iteration {i} out of {test_iterations}")
+            for message in messages:
+                rolling_logger.log(message)
+
+    loop_test(rolling_logger, messages, test_iterations)
+    gc.collect()
+    snapshot = tracemalloc.take_snapshot()
+    top_stats = snapshot.statistics("lineno")
+
+    # Before the memory fix, this would show hundreds of megabytes surviving garbage collection;
+    # the expected result now is no more than a few megabytes for any of the top lines.
+    # TODO: use an assert to catch catastrophic regressions.
+    TEST_LOGGER.info("Top memory-consuming lines:")
+    for stat in top_stats[:5]:
+        TEST_LOGGER.info(stat)
+
+    TEST_LOGGER.info(f"load test with {test_iterations} iterations each using {num_messages}")
+
+    rolling_logger.close()
+
+
 def test_rolling_do_rollover():
     import pandas as pd
 

diff --git a/python/whylogs/core/metrics/metrics.py b/python/whylogs/core/metrics/metrics.py
@@ -262,7 +262,7 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
         if view.numpy.len > 0:
             for arr in [view.numpy.floats, view.numpy.ints]:
                 if arr is not None:
-                    self.kll.value.update(arr)
+                    self.kll.value.update(array=arr)
                     n_b = len(arr)
                     if n_b > 1:
                         n_b = len(arr)
@@ -281,7 +281,7 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
 
         for lst in [view.list.ints, view.list.floats]:
             if lst is not None and len(lst) > 0:
-                self.kll.value.update_list(lst)
+                self.kll.value.update_list(num_items=lst)
                 n_b = len(lst)
                 if n_b > 1:
                     mean_b = statistics.mean(lst)

diff --git a/python/whylogs/core/segmentation_partition.py b/python/whylogs/core/segmentation_partition.py
@@ -4,6 +4,7 @@
 from dataclasses import dataclass, field
 from typing import Callable, List, Mapping, Optional
 
+from whylogs.api.usage_stats import emit_usage
 from whylogs.core.projectors import FieldProjector
 
 logger = logging.getLogger(__name__)
@@ -69,4 +70,6 @@ def __hash__(self):
 
 
 def segment_on_column(column_name: str) -> Mapping[str, SegmentationPartition]:
+    emit_usage("segment_on_column")
+
     return {column_name: SegmentationPartition(name=column_name, mapper=ColumnMapperFunction(col_names=[column_name]))}
diff --git a/python/whylogs/core/validators/condition_validator.py b/python/whylogs/core/validators/condition_validator.py
@@ -37,7 +37,7 @@ class ConditionValidator(Validator):
     def __post_init__(self):
         from whylogs.api.usage_stats import emit_usage
 
-        emit_usage("condition_validators")
+        emit_usage("condition_validator")
         for cond_name in self.conditions.keys():
             if cond_name not in self.failures:
                 self.failures[cond_name] = 0