
Commit

TH2-5099 Fix big performance degradation when we have many data files that iterate each other (#295)

* Update __add__ logic

* Update release_notes.md

* Update iteration_on_data_files.py

* Add more tests

* Refactor

---------

Co-authored-by: Slava Ermakov <[email protected]>
davitmamrikishvili and ConnectDIY committed Feb 7, 2025
1 parent 4bf83fb commit 3182d9c
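
The degradation in plain terms: `Data.__add__` previously wrapped its two operands in a fresh iterable, so a chain like `d0 + d1 + ... + d121` produced a structure nested 121 levels deep, and every record yielded during iteration had to pass through all of those wrapper frames. A minimal, self-contained sketch of the effect (illustrative only; `NestedChain` and `FlatChain` are hypothetical stand-ins, not th2-data-services classes):

import itertools
import time


class NestedChain:
    """Pairwise wrapper: a + b + c builds ((a + b) + c), one level per `+`."""

    def __init__(self, left, right):
        self.left, self.right = left, right

    def __add__(self, other):
        return NestedChain(self, other)

    def __iter__(self):
        yield from self.left
        yield from self.right


class FlatChain:
    """Flat source list: `+` concatenates the lists instead of nesting."""

    def __init__(self, sources):
        self.sources = list(sources)

    def __add__(self, other):
        return FlatChain(self.sources + other.sources)

    def __iter__(self):
        return itertools.chain.from_iterable(self.sources)


def timed_count(iterable):
    start = time.time()
    count = sum(1 for _ in iterable)
    return count, time.time() - start


parts = [list(range(10_000)) for _ in range(122)]

nested = NestedChain(parts[0], parts[1])
for part in parts[2:]:
    nested = nested + part  # each `+` adds another wrapper level

flat = FlatChain([parts[0]])
for part in parts[1:]:
    flat = flat + FlatChain([part])  # each `+` only extends a flat list

print("nested:", timed_count(nested))  # every record climbs ~120 frames
print("flat:  ", timed_count(flat))    # every record passes one chain step

The nested variant's per-record cost grows with the number of `+` operations, while the flat variant's stays constant.
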
Showing 3 changed files with 62 additions and 1 deletion.
1 change: 1 addition & 0 deletions release_notes.md
@@ -585,6 +585,7 @@ If you want to use RDP you have to specify dependency in square brackets `[ ]`
 8. [TH2-5201] Performance improvements have been made to converters:
 9. [TH2-5101] Data.update_metadata() now takes a `change_type` argument (values: `update` (default) or
    `change`), which denotes whether to update or overwrite with new values.
+10. [TH2-5099] Fixed slow iteration for Data objects created with many addition operators.
 Benchmark.
 - 1mln iterations per test
57 changes: 57 additions & 0 deletions iteration_on_data_files.py
@@ -0,0 +1,57 @@
import os
import time
import random

from th2_data_services.data import Data


def get_iteration_speed_plus():
    # Chain 122 files with the `+` operator, rebinding the result each time.
    data = Data.from_json("benchmark/json0.gz", gzip=True)
    for i in range(1, 122):
        data = data + Data.from_json(f"benchmark/json{i}.gz", gzip=True)

    _iterate_and_print_stats(data)


def get_iteration_speed_plus_equals():
    # Chain 122 files with the `+=` operator.
    data = Data.from_json("benchmark/json0.gz", gzip=True)
    for i in range(1, 122):
        data += Data.from_json(f"benchmark/json{i}.gz", gzip=True)

    _iterate_and_print_stats(data)


def get_iteration_speed_list_comprehension():
    # Build one Data object over a flat list of 122 sources in a single step.
    data = Data([Data.from_json(f"benchmark/json{i}.gz", gzip=True) for i in range(122)])

    _iterate_and_print_stats(data)


def _generate_data():
    # Write 122 gzipped JSON-lines files of 10 000 random integers each.
    n = 10_000
    data = Data([random.randint(1, 100_000) for _ in range(n)])
    os.makedirs("benchmark", exist_ok=True)
    data.to_json_lines("benchmark/json0.gz", gzip=True, overwrite=True)
    for i in range(1, 122):
        data = Data([random.randint(1, 100_000) for _ in range(n)])
        data.to_json_lines(f"benchmark/json{i}.gz", gzip=True, overwrite=True)


def _iterate_and_print_stats(data):
    start_time = time.time()
    j = 0
    for _ in data:
        j += 1

    print(f"Number of records iterated: {j}")
    print(f"Time taken: {time.time() - start_time} seconds")


if __name__ == "__main__":
    _generate_data()
    print("get_iteration_speed_plus()")
    get_iteration_speed_plus()
    print("get_iteration_speed_plus_equals()")
    get_iteration_speed_plus_equals()
    print("get_iteration_speed_list_comprehension()")
    get_iteration_speed_list_comprehension()
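
Running `python iteration_on_data_files.py` first generates the 122 files, then times the three strategies over the same 1,220,000 records (122 files × 10,000 each). Before this commit the two operator-based variants should be markedly slower than the list-comprehension variant, which already builds a flat list of sources; with the fix, all three are expected to perform comparably.
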
5 changes: 4 additions & 1 deletion th2_data_services/data.py
@@ -201,6 +201,7 @@ def __init__(
         self._pending_cache_path = (
             self._cache_path.with_name("[PENDING]" + self._cache_filename).resolve().absolute()
         )
+        self._data_list = [self]  # flat list of source Data objects; extended by __add__
         self._cache_file_obj: Optional[BinaryIO] = None
         self._len: Optional[int] = None
         self.workflow = DataWorkflow()
@@ -255,7 +256,9 @@ def __add__(self, other_data: "Data") -> "Data[DataIterValues]":
"""
if not isinstance(other_data, Data):
raise TypeError("Addition only works between Data objects")
data = Data(self._create_data_set_from_iterables([self, other_data]))
new_data_list = self._data_list + other_data._data_list
data = Data(new_data_list)
data._data_list = new_data_list
data._set_metadata(self.metadata)
if "source_file" in data.metadata:
data.update_metadata({"source_files": [data.metadata["source_file"]]})
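
In plain terms: every `Data` now carries `_data_list`, a flat list of its source objects (seeded as `[self]` in `__init__`), and `__add__` concatenates those lists rather than wrapping the two operands in a new iterable, so chained additions stay one level deep. A minimal sketch of the resulting behavior (mirrors the diff above; uses the list-constructor form seen in the benchmark):

from th2_data_services.data import Data

# Three single-record sources; Data accepts a list, as in the benchmark.
a, b, c = Data([1]), Data([2]), Data([3])
d = a + b + c

# d._data_list is now the flat [a, b, c] rather than a nested ((a + b) + c),
# so iterating d visits each source exactly once.
assert list(d) == [1, 2, 3]
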
