diff --git a/release_notes.md b/release_notes.md
index 8150a3a9..9a112989 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -585,6 +585,7 @@
 If you want to use RDP you have to specify dependency in square brackets `[ ]`
 8. [TH2-5201] Performance improvements have been made to converters:
 9. [TH2-5101] Data.update_metadata() now takes `change_type` argument (values: `update` default, `change` which denotes whether to update or overwrite with new values.
+10. [TH2-5099] Fixed slow iteration for Data objects created with many addition operators.
 
 Benchmark.
 - 1mln iterations per test
diff --git a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py
new file mode 100644
index 00000000..f0d1b3f5
--- /dev/null
+++ b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py
@@ -0,0 +1,57 @@
+import os
+import time
+import random
+
+from th2_data_services.data import Data
+
+
+def get_iteration_speed_plus():
+    data = Data.from_json("benchmark/json0.gz", gzip=True)
+    for i in range(1, 122):
+        data = data + Data.from_json(f"benchmark/json{i}.gz", gzip=True)
+
+    _iterate_and_print_stats(data)
+
+
+def get_iteration_speed_plus_equals():
+    data = Data.from_json("benchmark/json0.gz", gzip=True)
+    for i in range(1, 122):
+        data += Data.from_json(f"benchmark/json{i}.gz", gzip=True)
+
+    _iterate_and_print_stats(data)
+
+
+def get_iteration_speed_list_comprehension():
+    data = Data([Data.from_json(f"benchmark/json{i}.gz", gzip=True) for i in range(122)])
+
+    _iterate_and_print_stats(data)
+
+
+def _generate_data():
+    n = 10_000
+    data = Data([random.randint(1, 100_000) for _ in range(n)])
+    os.makedirs("benchmark", exist_ok=True)
+    data.to_json_lines("benchmark/json0.gz", gzip=True, overwrite=True)
+    for i in range(1, 122):
+        data = Data([random.randint(1, 100_000) for _ in range(n)])
+        data.to_json_lines(f"benchmark/json{i}.gz", gzip=True, overwrite=True)
+
+
+def _iterate_and_print_stats(data):
+    start_time = time.time()
+    j = 0
+    for _ in data:
+        j += 1
+
+    print(f"Number of records iterated: {j}")
+    print(f"Time took: {time.time() - start_time} seconds")
+
+
+if __name__ == "__main__":
+    _generate_data()
+    print("get_iteration_speed_plus()")
+    get_iteration_speed_plus()
+    print("get_iteration_speed_plus_equals()")
+    get_iteration_speed_plus_equals()
+    print("get_iteration_speed_list_comprehension()")
+    get_iteration_speed_list_comprehension()
diff --git a/th2_data_services/data.py b/th2_data_services/data.py
index ae779abf..76311f73 100644
--- a/th2_data_services/data.py
+++ b/th2_data_services/data.py
@@ -201,6 +201,7 @@ def __init__(
         self._pending_cache_path = (
             self._cache_path.with_name("[PENDING]" + self._cache_filename).resolve().absolute()
         )
+        self._data_list = [self]
         self._cache_file_obj: Optional[BinaryIO] = None
         self._len: Optional[int] = None
         self.workflow = DataWorkflow()
@@ -255,7 +256,9 @@ def __add__(self, other_data: "Data") -> "Data[DataIterValues]":
         """
         if not isinstance(other_data, Data):
             raise TypeError("Addition only works between Data objects")
-        data = Data(self._create_data_set_from_iterables([self, other_data]))
+        new_data_list = self._data_list + other_data._data_list
+        data = Data(new_data_list)
+        data._data_list = new_data_list
         data._set_metadata(self.metadata)
         if "source_file" in data.metadata:
             data.update_metadata({"source_files": [data.metadata["source_file"]]})