From d07139b2656f09accfad0de9bd00957a33c5719d Mon Sep 17 00:00:00 2001 From: DavitMamrikishvili Date: Mon, 5 Aug 2024 14:33:07 +0400 Subject: [PATCH 1/5] Update __add__ logic --- .../test_iteration_on_data_files.py | 28 +++++++++++++++++++ th2_data_services/data.py | 5 +++- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py diff --git a/tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py new file mode 100644 index 00000000..627986a4 --- /dev/null +++ b/tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py @@ -0,0 +1,28 @@ +import os +import time +import random + +from th2_data_services.data import Data + + +def test_iteration_speed(): + n = 10_000 + data = Data([random.randint(1, 100_000) for _ in range(n)]) + directory = "tests/tests_unit/test_data/test_performance/benchmark" + os.makedirs(directory, exist_ok=True) + data.to_json_lines(f"{directory}/json0.gz", gzip=True, overwrite=True) + for i in range(1, 122): + data = Data([random.randint(1, 100_000) for _ in range(n)]) + data.to_json_lines(f"{directory}/json{i}.gz", gzip=True, overwrite=True) + + data = Data.from_json(f"{directory}/json0.gz", gzip=True) + for i in range(1, 122): + data = data + Data.from_json(f"{directory}/json{i}.gz", gzip=True) + + start_time = time.time() + j = 0 + for _ in data: + j += 1 + + print(f"Number of records: {j}") + print(f"Time took: {time.time() - start_time}") diff --git a/th2_data_services/data.py b/th2_data_services/data.py index 927e3e4e..93b87516 100644 --- a/th2_data_services/data.py +++ b/th2_data_services/data.py @@ -197,6 +197,7 @@ def __init__( self._pending_cache_path = ( self._cache_path.with_name("[PENDING]" + self._cache_filename).resolve().absolute() ) + self._data_list = [self] self._cache_file_obj: Optional[BinaryIO] = None self._len: Optional[int] = None self.workflow = DataWorkflow() @@ -251,7 +252,9 @@ def __add__(self, other_data: "Data") -> "Data[DataIterValues]": """ if not isinstance(other_data, Data): raise TypeError("Addition only works between Data objects") - data = Data(self._create_data_set_from_iterables([self, other_data])) + new_data_list = self._data_list + other_data._data_list + data = Data(new_data_list) + data._data_list = new_data_list data._set_metadata(self.metadata) if "source_file" in data.metadata: data.update_metadata({"source_files": [data.metadata["source_file"]]}) From 702f1b14d08196e77a7ed21d4316baef0c91dc64 Mon Sep 17 00:00:00 2001 From: DavitMamrikishvili Date: Mon, 5 Aug 2024 15:21:32 +0400 Subject: [PATCH 2/5] Update release_notes.md --- release_notes.md | 1 + ...ration_on_data_files.py => iteration_on_data_files.py} | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) rename tests/tests_unit/test_data/test_performance/{test_iteration_on_data_files.py => iteration_on_data_files.py} (84%) diff --git a/release_notes.md b/release_notes.md index 90e2f467..cf928fab 100644 --- a/release_notes.md +++ b/release_notes.md @@ -584,6 +584,7 @@ If you want to use RDP you have to specify dependency in square brackets `[ ]` 8. [TH2-5201] Performance improvements have been made to converters: 9. [TH2-5101] Data.update_metadata() now takes `change_type` argument (values: `update` default, `change` which denotes whether to update or overwrite with new values. +10. [TH2-5099] Fixed slow iteration for Data objects created with many addition operators. Benchmark. - 1mln iterations per test diff --git a/tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py similarity index 84% rename from tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py rename to tests/tests_unit/test_data/test_performance/iteration_on_data_files.py index 627986a4..c19a997b 100644 --- a/tests/tests_unit/test_data/test_performance/test_iteration_on_data_files.py +++ b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py @@ -5,7 +5,7 @@ from th2_data_services.data import Data -def test_iteration_speed(): +def get_iteration_speed(): n = 10_000 data = Data([random.randint(1, 100_000) for _ in range(n)]) directory = "tests/tests_unit/test_data/test_performance/benchmark" @@ -25,4 +25,8 @@ def test_iteration_speed(): j += 1 print(f"Number of records: {j}") - print(f"Time took: {time.time() - start_time}") + print(f"Time took: {time.time() - start_time} seconds") + + +if __name__ == "__main__": + get_iteration_speed() From 38fac152fb59815cd3de1c3e78deba4bc015670b Mon Sep 17 00:00:00 2001 From: DavitMamrikishvili Date: Tue, 6 Aug 2024 13:52:41 +0400 Subject: [PATCH 3/5] Update iteration_on_data_files.py --- .../test_data/test_performance/iteration_on_data_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py index c19a997b..c9eb8a0a 100644 --- a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py +++ b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py @@ -8,7 +8,7 @@ def get_iteration_speed(): n = 10_000 data = Data([random.randint(1, 100_000) for _ in range(n)]) - directory = "tests/tests_unit/test_data/test_performance/benchmark" + directory = "benchmark" os.makedirs(directory, exist_ok=True) data.to_json_lines(f"{directory}/json0.gz", gzip=True, overwrite=True) for i in range(1, 122): From 27e540479503bf623bfd8d567a09c3659d938ea8 Mon Sep 17 00:00:00 2001 From: DavitMamrikishvili Date: Thu, 8 Aug 2024 11:05:40 +0400 Subject: [PATCH 4/5] Add more tests --- .../iteration_on_data_files.py | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py index c9eb8a0a..841f934f 100644 --- a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py +++ b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py @@ -5,28 +5,55 @@ from th2_data_services.data import Data -def get_iteration_speed(): +def get_iteration_speed_plus(): + data = Data.from_json(f"benchmark/json0.gz", gzip=True) + for i in range(1, 122): + data = data + Data.from_json(f"benchmark/json{i}.gz", gzip=True) + + start_time = time.time() + _iterate_and_print_stats(start_time, data) + + +def get_iteration_speed_plus_equals(): + data = Data.from_json(f"benchmark/json0.gz", gzip=True) + for i in range(1, 122): + data += Data.from_json(f"benchmark/json{i}.gz", gzip=True) + + start_time = time.time() + _iterate_and_print_stats(start_time, data) + + +def get_iteration_speed_list_comprehension(): + data = Data([Data.from_json(f"benchmark/json{i}.gz", gzip=True) for i in range(122)]) + + start_time = time.time() + _iterate_and_print_stats(start_time, data) + + +def _generate_data(): n = 10_000 data = Data([random.randint(1, 100_000) for _ in range(n)]) - directory = "benchmark" - os.makedirs(directory, exist_ok=True) - data.to_json_lines(f"{directory}/json0.gz", gzip=True, overwrite=True) + os.makedirs("benchmark", exist_ok=True) + data.to_json_lines(f"benchmark/json0.gz", gzip=True, overwrite=True) for i in range(1, 122): data = Data([random.randint(1, 100_000) for _ in range(n)]) - data.to_json_lines(f"{directory}/json{i}.gz", gzip=True, overwrite=True) + data.to_json_lines(f"benchmark/json{i}.gz", gzip=True, overwrite=True) - data = Data.from_json(f"{directory}/json0.gz", gzip=True) - for i in range(1, 122): - data = data + Data.from_json(f"{directory}/json{i}.gz", gzip=True) - start_time = time.time() +def _iterate_and_print_stats(start_time, data): j = 0 for _ in data: j += 1 - print(f"Number of records: {j}") + print(f"Number of records iterated: {j}") print(f"Time took: {time.time() - start_time} seconds") if __name__ == "__main__": - get_iteration_speed() + _generate_data() + print("get_iteration_speed_plus()") + get_iteration_speed_plus() + print("get_iteration_speed_plus_equals()") + get_iteration_speed_plus_equals() + print("get_iteration_speed_list_comprehension()") + get_iteration_speed_list_comprehension() From 9ead2c8f5aec6c42abce3ef4364c16f77be9a773 Mon Sep 17 00:00:00 2001 From: DavitMamrikishvili Date: Thu, 8 Aug 2024 11:07:20 +0400 Subject: [PATCH 5/5] Refactor --- .../test_performance/iteration_on_data_files.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py index 841f934f..f0d1b3f5 100644 --- a/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py +++ b/tests/tests_unit/test_data/test_performance/iteration_on_data_files.py @@ -10,8 +10,7 @@ def get_iteration_speed_plus(): for i in range(1, 122): data = data + Data.from_json(f"benchmark/json{i}.gz", gzip=True) - start_time = time.time() - _iterate_and_print_stats(start_time, data) + _iterate_and_print_stats(data) def get_iteration_speed_plus_equals(): @@ -19,15 +18,13 @@ def get_iteration_speed_plus_equals(): for i in range(1, 122): data += Data.from_json(f"benchmark/json{i}.gz", gzip=True) - start_time = time.time() - _iterate_and_print_stats(start_time, data) + _iterate_and_print_stats(data) def get_iteration_speed_list_comprehension(): data = Data([Data.from_json(f"benchmark/json{i}.gz", gzip=True) for i in range(122)]) - start_time = time.time() - _iterate_and_print_stats(start_time, data) + _iterate_and_print_stats(data) def _generate_data(): @@ -40,7 +37,8 @@ def _generate_data(): data.to_json_lines(f"benchmark/json{i}.gz", gzip=True, overwrite=True) -def _iterate_and_print_stats(start_time, data): +def _iterate_and_print_stats(data): + start_time = time.time() j = 0 for _ in data: j += 1