From 0a49406c485543fc193da02fd369cd4ed1b5ee4f Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Mon, 21 Nov 2022 21:45:05 +0300 Subject: [PATCH 01/13] add glom like searching --- dictdatabase/index_manager.py | 28 +++++++++++++++++ dictdatabase/io_unsafe.py | 17 ++++------- dictdatabase/searching.py | 50 +++++++++++++++++++++++++++++++ tests/test_glom_like_searching.py | 21 +++++++++++++ 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 dictdatabase/index_manager.py create mode 100644 dictdatabase/searching.py create mode 100644 tests/test_glom_like_searching.py diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py new file mode 100644 index 0000000..a7dd8d3 --- /dev/null +++ b/dictdatabase/index_manager.py @@ -0,0 +1,28 @@ +import hashlib + +from dictdatabase import utils + + +class IndexManager: + @staticmethod + def create_index(all_file_bytes: bytes, key: str, start, end): + """ + It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its + value + + Args: + all_file_bytes (bytes): The entire file as a byte string. + key (str): The key of the value we're indexing. + start: the start of the value in the file + end: the end of the value in the file + + Returns: + The key, start, end, indent_level, indent_with, value_hash, end + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, key_start + ) + value_bytes = all_file_bytes[start:end] + value_hash = hashlib.sha256(value_bytes).hexdigest() + return key, start, end, indent_level, indent_with, value_hash, end diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 7204a30..5d2f242 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -5,6 +5,8 @@ import json import hashlib from . import config, utils, byte_codes, indexing, io_bytes +from .index_manager import IndexManager +from .searching import Searcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -79,21 +81,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) - - if key_end == -1: + start, end, found = Searcher().search(all_file_bytes, key) + if not found: return None - - # Key found, now determine the bounding byte indices of the value - start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) - end = utils.seek_index_through_value_bytes(all_file_bytes, start) - - indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) value_bytes = all_file_bytes[start:end] - value_hash = hashlib.sha256(value_bytes).hexdigest() - # Write key info to index file - indexer.write(key, start, end, indent_level, indent_with, value_hash, end) + indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end)) return orjson.loads(value_bytes) diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py new file mode 100644 index 0000000..0dc55b4 --- /dev/null +++ b/dictdatabase/searching.py @@ -0,0 +1,50 @@ +from dictdatabase import byte_codes +from dictdatabase import utils + + +class Searcher: + @staticmethod + def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: + """ + It finds the start and end indices of the value of a key in a JSON file + + Args: + file (bytes): bytes + key (str): The key to find in the JSON file. + + Returns: + A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return -1, -1, False + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return start, end, True + + def search( + self, all_file_bytes: bytes, key: str, glom_searching=True + ) -> tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + start, end, found = self.find_start_end_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not found: + return -1, -1, False + original_end = original_start + end + original_start += start + return original_start, original_end, True diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py new file mode 100644 index 0000000..cea6670 --- /dev/null +++ b/tests/test_glom_like_searching.py @@ -0,0 +1,21 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.job").read() == 'Software Engineer' + + +def test_without_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="Ben").read() == { + "job": {"age": 30, "job": "Software Engineer"} + } From c83342c99729de29807a920453e1b868e3bff71e Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Mon, 21 Nov 2022 21:46:59 +0300 Subject: [PATCH 02/13] reformat imports --- dictdatabase/indexing.py | 5 ++++- dictdatabase/io_unsafe.py | 15 +++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py index c5eaabc..b147f92 100644 --- a/dictdatabase/indexing.py +++ b/dictdatabase/indexing.py @@ -1,7 +1,10 @@ -import orjson import os + +import orjson + from . import config + # Problem: Multiple read processes will concurrently read and write the same file # In some cases this will result in a empty read error, thats why the try-except exists diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 5d2f242..5a2b09a 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -1,10 +1,17 @@ from __future__ import annotations -from typing import Tuple + +import hashlib +import json from dataclasses import dataclass +from typing import Tuple + import orjson -import json -import hashlib -from . import config, utils, byte_codes, indexing, io_bytes + +from . import byte_codes +from . import config +from . import indexing +from . import io_bytes +from . import utils from .index_manager import IndexManager from .searching import Searcher From e75f8ebbf55b832b617f3cdb56f54e4f7862fd7c Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 10:27:26 +0300 Subject: [PATCH 03/13] fix type hinting --- dictdatabase/searching.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 0dc55b4..81c4e30 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -1,10 +1,12 @@ +from typing import Tuple + from dictdatabase import byte_codes from dictdatabase import utils class Searcher: @staticmethod - def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: + def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ It finds the start and end indices of the value of a key in a JSON file @@ -24,7 +26,7 @@ def find_start_end_in_bytes(file: bytes, key: str) -> tuple[int, int, bool]: def search( self, all_file_bytes: bytes, key: str, glom_searching=True - ) -> tuple[int, int, bool]: + ) -> Tuple[int, int, bool]: """ It takes a byte string, a key, and a boolean, and returns a tuple of three integers From 66a6a41a22e672ffa7a96634c60beb0995875b5e Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 11:20:54 +0300 Subject: [PATCH 04/13] add tests --- tests/test_glom_like_searching.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py index cea6670..cc1ec05 100644 --- a/tests/test_glom_like_searching.py +++ b/tests/test_glom_like_searching.py @@ -11,7 +11,7 @@ def test_glom_searching(): DDB.at("users").create(data, force_overwrite=True) - assert DDB.at("users", key="users.Ben.job").read() == 'Software Engineer' + assert DDB.at("users", key="users.Ben.job").read() == "Software Engineer" def test_without_glom_searching(): @@ -19,3 +19,13 @@ def test_without_glom_searching(): assert DDB.at("users", key="Ben").read() == { "job": {"age": 30, "job": "Software Engineer"} } + + +def test_glom_searching_if_key_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Job.Ben").read() is None + + +def test_glom_searching_if_subkey_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.SUBKEYNOTEXISTS").read() is None From 325d3c7bac2cf439798066363b2b3f97df894772 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Tue, 22 Nov 2022 11:21:07 +0300 Subject: [PATCH 05/13] rename Searcher -> KeySearcher --- dictdatabase/io_unsafe.py | 4 ++-- dictdatabase/searching.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 5a2b09a..d9ea9ab 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -13,7 +13,7 @@ from . import io_bytes from . import utils from .index_manager import IndexManager -from .searching import Searcher +from .searching import KeySearcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = Searcher().search(all_file_bytes, key) + start, end, found = KeySearcher().search(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 81c4e30..cc6177d 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -4,7 +4,7 @@ from dictdatabase import utils -class Searcher: +class KeySearcher: @staticmethod def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ From e856f71f8d37fdf6ab14452b8fbad60b960a14b9 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:08:18 +0300 Subject: [PATCH 06/13] add partial write --- dictdatabase/io_unsafe.py | 8 +-- dictdatabase/searching.py | 103 ++++++++++++++++++++++--------------- tests/test_glom_writing.py | 17 ++++++ 3 files changed, 82 insertions(+), 46 deletions(-) create mode 100644 tests/test_glom_writing.py diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index d9ea9ab..83a85a9 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -11,9 +11,9 @@ from . import config from . import indexing from . import io_bytes +from . import searching from . import utils from .index_manager import IndexManager -from .searching import KeySearcher @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = KeySearcher().search(all_file_bytes, key) + start, end, found = searching.search_value_by_key(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] @@ -185,9 +185,9 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + key_start, key_end, found = searching.search_key(all_file_bytes, key) - if key_end == -1: + if not found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") # Key found, now determine the bounding byte indices of the value diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index cc6177d..f661bde 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -1,52 +1,71 @@ from typing import Tuple +import orjson + from dictdatabase import byte_codes from dictdatabase import utils -class KeySearcher: - @staticmethod - def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: - """ - It finds the start and end indices of the value of a key in a JSON file +def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: + """ + It finds the start and end indices of the value of a key in a JSON file + + Args: + file (bytes): bytes + key (str): The key to find in the JSON file. - Args: - file (bytes): bytes - key (str): The key to find in the JSON file. + Returns: + A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return -1, -1, False + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return start, end, True - Returns: - A tuple of the start and end index of the key, and a boolean value indicating whether the key was found. - """ - key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + +def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: + original_value_start = 0 + original_value_end = len(file) + original_key_start = 0 + original_key_end = len(file) + for k in key.split(".") if glom_searching else [key]: + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) if key_end == -1: return -1, -1, False - start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) - end = utils.seek_index_through_value_bytes(file, start) - return start, end, True - - def search( - self, all_file_bytes: bytes, key: str, glom_searching=True - ) -> Tuple[int, int, bool]: - """ - It takes a byte string, a key, and a boolean, and returns a tuple of three integers - - Args: - all_file_bytes (bytes): The bytes of the file you're searching in. - key (str): The key to search for. - glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to - True - - Returns: - The start and end of the key in the file. - """ - original_start = 0 - original_end = len(all_file_bytes) - for k in key.split(".") if glom_searching else [key]: - start, end, found = self.find_start_end_in_bytes( - all_file_bytes[original_start:original_end], k - ) - if not found: - return -1, -1, False - original_end = original_start + end - original_start += start - return original_start, original_end, True + original_key_end = original_value_start + key_end + original_key_start = original_value_start + key_start + value_start, value_end, found = find_start_end_in_bytes(file, k) + original_value_end = original_value_start + original_value_end + original_value_start += value_start + file = file[original_value_start:original_value_end] + return original_key_start, original_key_end, True + + +def search_value_by_key( + all_file_bytes: bytes, key: str, glom_searching=True +) -> Tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + start, end, found = find_start_end_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not found: + return -1, -1, False + original_end = original_start + end + original_start += start + return original_start, original_end, True diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py new file mode 100644 index 0000000..2702884 --- /dev/null +++ b/tests/test_glom_writing.py @@ -0,0 +1,17 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_writing(): + DDB.at("users").create(data, force_overwrite=True) + with DDB.at("users", key="users.Ben").session() as (session, purchase): + purchase["status"] = "cancelled" + session.write() + assert DDB.at("users", key="users.Ben.status").read() == "cancelled" From 0a105d93fba2916866be53e8ab625fccab71b4f7 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:10:51 +0300 Subject: [PATCH 07/13] fix print compatibility --- tests/benchmark/run_parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/run_parallel.py b/tests/benchmark/run_parallel.py index a3a1b6a..edce37c 100644 --- a/tests/benchmark/run_parallel.py +++ b/tests/benchmark/run_parallel.py @@ -88,9 +88,9 @@ class Scenario: ops: int = 10 def print(self): - res = f"✨ Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)" - res += ", 🔸 compression" if self.use_compression else "" - res += ", 💎 big file" if self.big_file else "" + res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)" + res += ", [] compression" if self.use_compression else "" + res += ", {} big file" if self.big_file else "" print(res) From a1c544b5a048cdca94e9ac286cfa6d0895e3211c Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:16:08 +0300 Subject: [PATCH 08/13] renaming --- dictdatabase/io_unsafe.py | 4 ++-- dictdatabase/searching.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 83a85a9..fbc405b 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -88,7 +88,7 @@ def partial_read_only(db_name: str, key: str) -> dict | None: # Not found in index file, search for key in the entire file all_file_bytes = io_bytes.read(db_name) - start, end, found = searching.search_value_by_key(all_file_bytes, key) + start, end, found = searching.search_value_position_in_db(all_file_bytes, key) if not found: return None value_bytes = all_file_bytes[start:end] @@ -185,7 +185,7 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end, found = searching.search_key(all_file_bytes, key) + key_start, key_end, found = searching.search_key_position_in_db(all_file_bytes, key) if not found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index f661bde..62ed857 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -6,7 +6,7 @@ from dictdatabase import utils -def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: +def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ It finds the start and end indices of the value of a key in a JSON file @@ -25,7 +25,7 @@ def find_start_end_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: return start, end, True -def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: +def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: original_value_start = 0 original_value_end = len(file) original_key_start = 0 @@ -36,14 +36,14 @@ def search_key(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bo return -1, -1, False original_key_end = original_value_start + key_end original_key_start = original_value_start + key_start - value_start, value_end, found = find_start_end_in_bytes(file, k) + value_start, value_end, found = find_key_position_in_bytes(file, k) original_value_end = original_value_start + original_value_end original_value_start += value_start file = file[original_value_start:original_value_end] return original_key_start, original_key_end, True -def search_value_by_key( +def search_value_position_in_db( all_file_bytes: bytes, key: str, glom_searching=True ) -> Tuple[int, int, bool]: """ @@ -61,7 +61,7 @@ def search_value_by_key( original_start = 0 original_end = len(all_file_bytes) for k in key.split(".") if glom_searching else [key]: - start, end, found = find_start_end_in_bytes( + start, end, found = find_key_position_in_bytes( all_file_bytes[original_start:original_end], k ) if not found: From 21b8a9f756ee10ac985e4326e1ead94f8b6c29e2 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:25:27 +0300 Subject: [PATCH 09/13] add negative test --- tests/test_glom_writing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py index 2702884..4255cd0 100644 --- a/tests/test_glom_writing.py +++ b/tests/test_glom_writing.py @@ -1,3 +1,5 @@ +import pytest + import dictdatabase as DDB data = { @@ -15,3 +17,11 @@ def test_glom_writing(): purchase["status"] = "cancelled" session.write() assert DDB.at("users", key="users.Ben.status").read() == "cancelled" + + +def test_glom_writing_sub_key_not_exists(): + DDB.at("users").create(data, force_overwrite=True) + with pytest.raises(KeyError): + with DDB.at("users", key="users.SUBKEY").session() as (session, purchase): + purchase["status"] = "cancelled" + session.write() From 456aef225ca018dc9113b4131ad8d7202a59df65 Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Thu, 24 Nov 2022 00:38:11 +0300 Subject: [PATCH 10/13] add dataclass for searching --- dictdatabase/dataclasses.py | 8 ++++++++ dictdatabase/io_unsafe.py | 8 ++++---- dictdatabase/searching.py | 27 +++++++++++++++------------ 3 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 dictdatabase/dataclasses.py diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py new file mode 100644 index 0000000..2c54e19 --- /dev/null +++ b/dictdatabase/dataclasses.py @@ -0,0 +1,8 @@ +import dataclasses + + +@dataclasses.dataclass(frozen=True) +class SearchResult: + start_byte: int + end_byte: int + found: bool diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index fbc405b..7c8cba2 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -185,16 +185,16 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: return partial_handle # Not found in index file, search for key in the entire file - key_start, key_end, found = searching.search_key_position_in_db(all_file_bytes, key) + position = searching.search_key_position_in_db(all_file_bytes, key) - if not found: + if not position.found: raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") # Key found, now determine the bounding byte indices of the value - start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0) + start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0) end = utils.seek_index_through_value_bytes(all_file_bytes, start) - indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start) + indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte) partial_value = orjson.loads(all_file_bytes[start:end]) prefix_bytes = all_file_bytes[:start] if config.use_compression else None diff --git a/dictdatabase/searching.py b/dictdatabase/searching.py index 62ed857..697819b 100644 --- a/dictdatabase/searching.py +++ b/dictdatabase/searching.py @@ -4,9 +4,10 @@ from dictdatabase import byte_codes from dictdatabase import utils +from dictdatabase.dataclasses import SearchResult -def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: +def find_key_position_in_bytes(file: bytes, key: str) -> SearchResult: """ It finds the start and end indices of the value of a key in a JSON file @@ -19,13 +20,15 @@ def find_key_position_in_bytes(file: bytes, key: str) -> Tuple[int, int, bool]: """ key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) if key_end == -1: - return -1, -1, False + return SearchResult(start_byte=-1, end_byte=-1, found=False) start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) end = utils.seek_index_through_value_bytes(file, start) - return start, end, True + return SearchResult(start_byte=start, end_byte=end, found=True) -def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tuple[int, int, bool]: +def search_key_position_in_db( + file: bytes, key: str, glom_searching=True +) -> SearchResult: original_value_start = 0 original_value_end = len(file) original_key_start = 0 @@ -33,14 +36,14 @@ def search_key_position_in_db(file: bytes, key: str, glom_searching=True) -> Tup for k in key.split(".") if glom_searching else [key]: key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) if key_end == -1: - return -1, -1, False + return SearchResult(start_byte=-1, end_byte=-1, found=False) original_key_end = original_value_start + key_end original_key_start = original_value_start + key_start - value_start, value_end, found = find_key_position_in_bytes(file, k) + position = find_key_position_in_bytes(file, k) original_value_end = original_value_start + original_value_end - original_value_start += value_start + original_value_start += position.start_byte file = file[original_value_start:original_value_end] - return original_key_start, original_key_end, True + return SearchResult(start_byte=original_key_start, end_byte=original_key_end, found=True) def search_value_position_in_db( @@ -61,11 +64,11 @@ def search_value_position_in_db( original_start = 0 original_end = len(all_file_bytes) for k in key.split(".") if glom_searching else [key]: - start, end, found = find_key_position_in_bytes( + position = find_key_position_in_bytes( all_file_bytes[original_start:original_end], k ) - if not found: + if not position.found: return -1, -1, False - original_end = original_start + end - original_start += start + original_end = original_start + position.end_byte + original_start += position.start_byte return original_start, original_end, True From 4d0c4b807896dac394bc8c88f9db5f66fe8c83bb Mon Sep 17 00:00:00 2001 From: Danil Tolmachev Date: Sun, 27 Nov 2022 18:00:20 +0300 Subject: [PATCH 11/13] add Index dataclass --- dictdatabase/dataclasses.py | 11 ++ dictdatabase/index_manager.py | 41 ++-- dictdatabase/indexing.py | 121 ++++++------ dictdatabase/io_unsafe.py | 363 ++++++++++++++++++---------------- 4 files changed, 289 insertions(+), 247 deletions(-) diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py index 2c54e19..9743ffb 100644 --- a/dictdatabase/dataclasses.py +++ b/dictdatabase/dataclasses.py @@ -6,3 +6,14 @@ class SearchResult: start_byte: int end_byte: int found: bool + + +@dataclasses.dataclass(frozen=True) +class Index: + key: str + key_start: int + key_end: int + indent_level: int + indent_with: str + value_hash: str + old_value_end: int diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py index a7dd8d3..579a97f 100644 --- a/dictdatabase/index_manager.py +++ b/dictdatabase/index_manager.py @@ -1,28 +1,27 @@ import hashlib from dictdatabase import utils +from dictdatabase.dataclasses import Index -class IndexManager: - @staticmethod - def create_index(all_file_bytes: bytes, key: str, start, end): - """ - It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its - value +def create_index(all_file_bytes: bytes, key: str, start, end) -> Index: + """ + It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its + value - Args: - all_file_bytes (bytes): The entire file as a byte string. - key (str): The key of the value we're indexing. - start: the start of the value in the file - end: the end of the value in the file + Args: + all_file_bytes (bytes): The entire file as a byte string. + key (str): The key of the value we're indexing. + start: the start of the value in the file + end: the end of the value in the file - Returns: - The key, start, end, indent_level, indent_with, value_hash, end - """ - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) - indent_level, indent_with = utils.detect_indentation_in_json_bytes( - all_file_bytes, key_start - ) - value_bytes = all_file_bytes[start:end] - value_hash = hashlib.sha256(value_bytes).hexdigest() - return key, start, end, indent_level, indent_with, value_hash, end + Returns: + The key, start, end, indent_level, indent_with, value_hash, end + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, key_start + ) + value_bytes = all_file_bytes[start:end] + value_hash = hashlib.sha256(value_bytes).hexdigest() + return Index(key, start, end, indent_level, indent_with, value_hash, end) diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py index b147f92..bb8e36a 100644 --- a/dictdatabase/indexing.py +++ b/dictdatabase/indexing.py @@ -3,6 +3,7 @@ import orjson from . import config +from .dataclasses import Index # Problem: Multiple read processes will concurrently read and write the same file @@ -25,61 +26,65 @@ class Indexer: - """ - The Indexer takes the name of a database file, and tries to load the .index file - of the corresponding database file. - - The name of the index file is the name of the database file, with the extension - .index and all "/" replaced with "___" - - The content of the index file is a json object, where the keys are keys inside - the database json file, and the values are lists of 5 elements: - - start_index: The index of the first byte of the value of the key in the database file - - end_index: The index of the last byte of the value of the key in the database file - - indent_level: The indent level of the key in the database file - - indent_with: The indent string used. - - value_hash: The hash of the value bytes - """ - - __slots__ = ("data", "path") - - def __init__(self, db_name: str): - # Make path of index file - db_name = db_name.replace("/", "___") - self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index") - - os.makedirs(os.path.dirname(self.path), exist_ok=True) - if not os.path.exists(self.path): - self.data = {} - return - - try: - with open(self.path, "rb") as f: - self.data = orjson.loads(f.read()) - except orjson.JSONDecodeError: - self.data = {} - - - def get(self, key): - """ - Returns a list of 5 elements for a key if it exists, otherwise None - Elements:[start_index, end_index, indent_level, indent_with, value_hash] - """ - return self.data.get(key, None) - - - def write(self, key, start_index, end_index, indent_level, indent_with, value_hash, old_value_end): - """ - Write index information for a key to the index file - """ - - if self.data.get(key, None) is not None: - delta = end_index - old_value_end - for entry in self.data.values(): - if entry[0] > old_value_end: - entry[0] += delta - entry[1] += delta - - self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash] - with open(self.path, "wb") as f: - f.write(orjson.dumps(self.data)) + """ + The Indexer takes the name of a database file, and tries to load the .index file + of the corresponding database file. + + The name of the index file is the name of the database file, with the extension + .index and all "/" replaced with "___" + + The content of the index file is a json object, where the keys are keys inside + the database json file, and the values are lists of 5 elements: + - start_index: The index of the first byte of the value of the key in the database file + - end_index: The index of the last byte of the value of the key in the database file + - indent_level: The indent level of the key in the database file + - indent_with: The indent string used. + - value_hash: The hash of the value bytes + """ + + __slots__ = ("data", "path") + + def __init__(self, db_name: str): + # Make path of index file + db_name = db_name.replace("/", "___") + self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index") + + os.makedirs(os.path.dirname(self.path), exist_ok=True) + if not os.path.exists(self.path): + self.data = {} + return + + try: + with open(self.path, "rb") as f: + self.data = orjson.loads(f.read()) + except orjson.JSONDecodeError: + self.data = {} + + def get(self, key): + """ + Returns a list of 5 elements for a key if it exists, otherwise None + Elements:[start_index, end_index, indent_level, indent_with, value_hash] + """ + return self.data.get(key, None) + + def write(self, index: Index): + """ + Write index information for a key to the index file + """ + + if self.data.get(index.key, None) is not None: + delta = index.key_end - index.old_value_end + for entry in self.data.values(): + if entry[0] > index.old_value_end: + entry[0] += delta + entry[1] += delta + + self.data[index.key] = [ + index.key_start, + index.key_end, + index.indent_level, + index.indent_with, + index.value_hash, + ] + with open(self.path, "wb") as f: + f.write(orjson.dumps(self.data)) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 7c8cba2..cd77b70 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -13,26 +13,27 @@ from . import io_bytes from . import searching from . import utils -from .index_manager import IndexManager +from .dataclasses import Index +from .index_manager import create_index @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 class PartialDict: - prefix: bytes - key: str - value: dict - value_start: int - value_end: int - suffix: bytes + prefix: bytes + key: str + value: dict + value_start: int + value_end: int + suffix: bytes @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 class PartialFileHandle: - db_name: str - partial_dict: PartialDict - indent_level: int - indent_with: str - indexer: indexing.Indexer + db_name: str + partial_dict: PartialDict + indent_level: int + indent_with: str + indexer: indexing.Indexer ######################################################################################## @@ -41,13 +42,13 @@ class PartialFileHandle: def read(db_name: str) -> dict: - """ - Read the file at db_path from the configured storage directory. - Make sure the file exists. If it does notnot a FileNotFoundError is - raised. - """ - # Always use orjson to read the file, because it is faster - return orjson.loads(io_bytes.read(db_name)) + """ + Read the file at db_path from the configured storage directory. + Make sure the file exists. If it does notnot a FileNotFoundError is + raised. + """ + # Always use orjson to read the file, because it is faster + return orjson.loads(io_bytes.read(db_name)) ######################################################################################## @@ -55,46 +56,48 @@ def read(db_name: str) -> dict: ######################################################################################## -def try_read_bytes_using_indexer(indexer: indexing.Indexer, db_name: str, key: str) -> bytes | None: - """ - Check if the key info is saved in the file's index file. - If it is and the value has not changed, return the value bytes. - Otherwise return None. - """ +def try_read_bytes_using_indexer( + indexer: indexing.Indexer, db_name: str, key: str +) -> bytes | None: + """ + Check if the key info is saved in the file's index file. + If it is and the value has not changed, return the value bytes. + Otherwise return None. + """ - if (index := indexer.get(key)) is None: - return None - start, end, _, _, value_hash = index - partial_bytes = io_bytes.read(db_name, start=start, end=end) - if value_hash != hashlib.sha256(partial_bytes).hexdigest(): - return None - return partial_bytes + if (index := indexer.get(key)) is None: + return None + start, end, _, _, value_hash = index + partial_bytes = io_bytes.read(db_name, start=start, end=end) + if value_hash != hashlib.sha256(partial_bytes).hexdigest(): + return None + return partial_bytes def partial_read_only(db_name: str, key: str) -> dict | None: - """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. - - If the key is not found, a `KeyError` is raised. - """ - - # Search for key in the index file - indexer = indexing.Indexer(db_name) - if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None: - return orjson.loads(value_bytes) - - # Not found in index file, search for key in the entire file - all_file_bytes = io_bytes.read(db_name) - start, end, found = searching.search_value_position_in_db(all_file_bytes, key) - if not found: - return None - value_bytes = all_file_bytes[start:end] - # Write key info to index file - indexer.write(*IndexManager.create_index(all_file_bytes, key, start, end)) - return orjson.loads(value_bytes) + """ + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. + + If the key is not found, a `KeyError` is raised. + """ + + # Search for key in the index file + indexer = indexing.Indexer(db_name) + if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None: + return orjson.loads(value_bytes) + + # Not found in index file, search for key in the entire file + all_file_bytes = io_bytes.read(db_name) + start, end, found = searching.search_value_position_in_db(all_file_bytes, key) + if not found: + return None + value_bytes = all_file_bytes[start:end] + # Write key info to index file + indexer.write(create_index(all_file_bytes, key, start, end)) + return orjson.loads(value_bytes) ################################################################################ @@ -103,26 +106,26 @@ def partial_read_only(db_name: str, key: str) -> dict | None: def serialize_data_to_json_bytes(data: dict) -> bytes: - """ - Serialize the data as json bytes. Depending on the config, - this can be done with orjson or the standard json module. - Additionally config.indent is respected. - """ - if config.use_orjson: - option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS - return orjson.dumps(data, option=option) - else: - db_dump = json.dumps(data, indent=config.indent, sort_keys=True) - return db_dump.encode() + """ + Serialize the data as json bytes. Depending on the config, + this can be done with orjson or the standard json module. + Additionally config.indent is respected. + """ + if config.use_orjson: + option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS + return orjson.dumps(data, option=option) + else: + db_dump = json.dumps(data, indent=config.indent, sort_keys=True) + return db_dump.encode() def write(db_name: str, data: dict): - """ - Write the dict db dumped as a json string - to the file of the db_path. - """ - data_bytes = serialize_data_to_json_bytes(data) - io_bytes.write(db_name, data_bytes) + """ + Write the dict db dumped as a json string + to the file of the db_path. + """ + data_bytes = serialize_data_to_json_bytes(data) + io_bytes.write(db_name, data_bytes) ################################################################################ @@ -130,105 +133,129 @@ def write(db_name: str, data: dict): ################################################################################ -def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key) -> Tuple[PartialFileHandle | None, bytes | None]: - """ - Try to get a partial file handle by using the key entry in the index file. - - If the data could be read from the index file, a tuple of the partial file - handle and None is returned. - If the data could not be read from the index file, a tuple of None and the file - bytes is returned, so that the file bytes can be searched for the key. - """ - - if (index := indexer.get(key)) is None: - return None, io_bytes.read(db_name) - start, end, indent_level, indent_with, value_hash = index - - # If compression is enabled, all data has to be read from the file - if config.use_compression: - all_file_bytes = io_bytes.read(db_name) - value_bytes = all_file_bytes[start:end] - if value_hash != hashlib.sha256(value_bytes).hexdigest(): - return None, all_file_bytes - value_data = orjson.loads(value_bytes) - partial_dict = PartialDict(all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:]) - - # If compression is disabled, only the value and suffix have to be read - else: - value_and_suffix_bytes = io_bytes.read(db_name, start=start) - value_length = end - start - value_bytes = value_and_suffix_bytes[:value_length] - if value_hash != hashlib.sha256(value_bytes).hexdigest(): - # If the hashes don't match, read the prefix to concat the full file bytes - prefix_bytes = io_bytes.read(db_name, end=start) - return None, prefix_bytes + value_and_suffix_bytes - value_data = orjson.loads(value_bytes) - partial_dict = PartialDict(None, key, value_data, start, end, value_and_suffix_bytes[value_length:]) - - return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None +def try_get_parial_file_handle_by_index( + indexer: indexing.Indexer, db_name, key +) -> Tuple[PartialFileHandle | None, bytes | None]: + """ + Try to get a partial file handle by using the key entry in the index file. + + If the data could be read from the index file, a tuple of the partial file + handle and None is returned. + If the data could not be read from the index file, a tuple of None and the file + bytes is returned, so that the file bytes can be searched for the key. + """ + + if (index := indexer.get(key)) is None: + return None, io_bytes.read(db_name) + start, end, indent_level, indent_with, value_hash = index + + # If compression is enabled, all data has to be read from the file + if config.use_compression: + all_file_bytes = io_bytes.read(db_name) + value_bytes = all_file_bytes[start:end] + if value_hash != hashlib.sha256(value_bytes).hexdigest(): + return None, all_file_bytes + value_data = orjson.loads(value_bytes) + partial_dict = PartialDict( + all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:] + ) + + # If compression is disabled, only the value and suffix have to be read + else: + value_and_suffix_bytes = io_bytes.read(db_name, start=start) + value_length = end - start + value_bytes = value_and_suffix_bytes[:value_length] + if value_hash != hashlib.sha256(value_bytes).hexdigest(): + # If the hashes don't match, read the prefix to concat the full file bytes + prefix_bytes = io_bytes.read(db_name, end=start) + return None, prefix_bytes + value_and_suffix_bytes + value_data = orjson.loads(value_bytes) + partial_dict = PartialDict( + None, key, value_data, start, end, value_and_suffix_bytes[value_length:] + ) + + return ( + PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), + None, + ) def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: - """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. - - If the key is not found, a `KeyError` is raised. - """ - - # Search for key in the index file - indexer = indexing.Indexer(db_name) - partial_handle, all_file_bytes = try_get_parial_file_handle_by_index(indexer, db_name, key) - if partial_handle is not None: - return partial_handle - - # Not found in index file, search for key in the entire file - position = searching.search_key_position_in_db(all_file_bytes, key) - - if not position.found: - raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"") - - # Key found, now determine the bounding byte indices of the value - start = position.end_byte + (1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0) - end = utils.seek_index_through_value_bytes(all_file_bytes, start) - - indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, position.start_byte) - - partial_value = orjson.loads(all_file_bytes[start:end]) - prefix_bytes = all_file_bytes[:start] if config.use_compression else None - partial_dict = PartialDict(prefix_bytes, key, partial_value, start, end, all_file_bytes[end:]) - return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer) + """ + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. + + If the key is not found, a `KeyError` is raised. + """ + + # Search for key in the index file + indexer = indexing.Indexer(db_name) + partial_handle, all_file_bytes = try_get_parial_file_handle_by_index( + indexer, db_name, key + ) + if partial_handle is not None: + return partial_handle + + # Not found in index file, search for key in the entire file + position = searching.search_key_position_in_db(all_file_bytes, key) + + if not position.found: + raise KeyError(f'Key "{key}" not found in db "{db_name}"') + + # Key found, now determine the bounding byte indices of the value + start = position.end_byte + ( + 1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0 + ) + end = utils.seek_index_through_value_bytes(all_file_bytes, start) + + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, position.start_byte + ) + + partial_value = orjson.loads(all_file_bytes[start:end]) + prefix_bytes = all_file_bytes[:start] if config.use_compression else None + partial_dict = PartialDict( + prefix_bytes, key, partial_value, start, end, all_file_bytes[end:] + ) + return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer) def partial_write(pf: PartialFileHandle): - """ - Write a partial file handle to the db. - """ - - partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value) - - # Add indentation - if pf.indent_level > 0 and pf.indent_with: - replace_this = "\n".encode() - replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode() - partial_bytes = partial_bytes.replace(replace_this, replace_with) - - # Write key info to index file - pf.indexer.write( - key=pf.partial_dict.key, - start_index=pf.partial_dict.value_start, - end_index=pf.partial_dict.value_start + len(partial_bytes), - indent_level=pf.indent_level, - indent_with=pf.indent_with, - value_hash=hashlib.sha256(partial_bytes).hexdigest(), - old_value_end=pf.partial_dict.value_end, - ) - - if pf.partial_dict.prefix is None: - # Prefix could not be determined due to compression, so write the entire file - io_bytes.write(pf.db_name, partial_bytes + pf.partial_dict.suffix, start=pf.partial_dict.value_start) - else: - # Prefix was determined, so only write the changed part and the suffix - io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix) + """ + Write a partial file handle to the db. + """ + + partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value) + + # Add indentation + if pf.indent_level > 0 and pf.indent_with: + replace_this = "\n".encode() + replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode() + partial_bytes = partial_bytes.replace(replace_this, replace_with) + + # Write key info to index file + index = Index( + key=pf.partial_dict.key, + key_start=pf.partial_dict.value_start, + key_end=pf.partial_dict.value_start + len(partial_bytes), + indent_level=pf.indent_level, + indent_with=pf.indent_with, + value_hash=hashlib.sha256(partial_bytes).hexdigest(), + old_value_end=pf.partial_dict.value_end, + ) + pf.indexer.write(index) + + if pf.partial_dict.prefix is None: + # Prefix could not be determined due to compression, so write the entire file + io_bytes.write( + pf.db_name, + partial_bytes + pf.partial_dict.suffix, + start=pf.partial_dict.value_start, + ) + else: + # Prefix was determined, so only write the changed part and the suffix + io_bytes.write( + pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix + ) From f53b4566e55522916124bbd88994c8d5e6bb83ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= Date: Sun, 27 Nov 2022 16:39:56 +0100 Subject: [PATCH 12/13] use tabs --- dictdatabase/index_manager.py | 36 ++-- dictdatabase/io_unsafe.py | 378 +++++++++++++++++----------------- 2 files changed, 207 insertions(+), 207 deletions(-) diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py index 579a97f..eee755a 100644 --- a/dictdatabase/index_manager.py +++ b/dictdatabase/index_manager.py @@ -5,23 +5,23 @@ def create_index(all_file_bytes: bytes, key: str, start, end) -> Index: - """ - It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its - value + """ + It takes a JSON file, a key, and a start and end position, and returns a tuple of information about the key and its + value - Args: - all_file_bytes (bytes): The entire file as a byte string. - key (str): The key of the value we're indexing. - start: the start of the value in the file - end: the end of the value in the file + Args: + all_file_bytes (bytes): The entire file as a byte string. + key (str): The key of the value we're indexing. + start: the start of the value in the file + end: the end of the value in the file - Returns: - The key, start, end, indent_level, indent_with, value_hash, end - """ - key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) - indent_level, indent_with = utils.detect_indentation_in_json_bytes( - all_file_bytes, key_start - ) - value_bytes = all_file_bytes[start:end] - value_hash = hashlib.sha256(value_bytes).hexdigest() - return Index(key, start, end, indent_level, indent_with, value_hash, end) + Returns: + The key, start, end, indent_level, indent_with, value_hash, end + """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key) + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, key_start + ) + value_bytes = all_file_bytes[start:end] + value_hash = hashlib.sha256(value_bytes).hexdigest() + return Index(key, start, end, indent_level, indent_with, value_hash, end) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index cd77b70..337063f 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -19,21 +19,21 @@ @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 class PartialDict: - prefix: bytes - key: str - value: dict - value_start: int - value_end: int - suffix: bytes + prefix: bytes + key: str + value: dict + value_start: int + value_end: int + suffix: bytes @dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9 class PartialFileHandle: - db_name: str - partial_dict: PartialDict - indent_level: int - indent_with: str - indexer: indexing.Indexer + db_name: str + partial_dict: PartialDict + indent_level: int + indent_with: str + indexer: indexing.Indexer ######################################################################################## @@ -42,13 +42,13 @@ class PartialFileHandle: def read(db_name: str) -> dict: - """ - Read the file at db_path from the configured storage directory. - Make sure the file exists. If it does notnot a FileNotFoundError is - raised. - """ - # Always use orjson to read the file, because it is faster - return orjson.loads(io_bytes.read(db_name)) + """ + Read the file at db_path from the configured storage directory. + Make sure the file exists. If it does notnot a FileNotFoundError is + raised. + """ + # Always use orjson to read the file, because it is faster + return orjson.loads(io_bytes.read(db_name)) ######################################################################################## @@ -57,47 +57,47 @@ def read(db_name: str) -> dict: def try_read_bytes_using_indexer( - indexer: indexing.Indexer, db_name: str, key: str + indexer: indexing.Indexer, db_name: str, key: str ) -> bytes | None: - """ - Check if the key info is saved in the file's index file. - If it is and the value has not changed, return the value bytes. - Otherwise return None. - """ + """ + Check if the key info is saved in the file's index file. + If it is and the value has not changed, return the value bytes. + Otherwise return None. + """ - if (index := indexer.get(key)) is None: - return None - start, end, _, _, value_hash = index - partial_bytes = io_bytes.read(db_name, start=start, end=end) - if value_hash != hashlib.sha256(partial_bytes).hexdigest(): - return None - return partial_bytes + if (index := indexer.get(key)) is None: + return None + start, end, _, _, value_hash = index + partial_bytes = io_bytes.read(db_name, start=start, end=end) + if value_hash != hashlib.sha256(partial_bytes).hexdigest(): + return None + return partial_bytes def partial_read_only(db_name: str, key: str) -> dict | None: - """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. - - If the key is not found, a `KeyError` is raised. - """ - - # Search for key in the index file - indexer = indexing.Indexer(db_name) - if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None: - return orjson.loads(value_bytes) - - # Not found in index file, search for key in the entire file - all_file_bytes = io_bytes.read(db_name) - start, end, found = searching.search_value_position_in_db(all_file_bytes, key) - if not found: - return None - value_bytes = all_file_bytes[start:end] - # Write key info to index file - indexer.write(create_index(all_file_bytes, key, start, end)) - return orjson.loads(value_bytes) + """ + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. + + If the key is not found, a `KeyError` is raised. + """ + + # Search for key in the index file + indexer = indexing.Indexer(db_name) + if (value_bytes := try_read_bytes_using_indexer(indexer, db_name, key)) is not None: + return orjson.loads(value_bytes) + + # Not found in index file, search for key in the entire file + all_file_bytes = io_bytes.read(db_name) + start, end, found = searching.search_value_position_in_db(all_file_bytes, key) + if not found: + return None + value_bytes = all_file_bytes[start:end] + # Write key info to index file + indexer.write(create_index(all_file_bytes, key, start, end)) + return orjson.loads(value_bytes) ################################################################################ @@ -106,26 +106,26 @@ def partial_read_only(db_name: str, key: str) -> dict | None: def serialize_data_to_json_bytes(data: dict) -> bytes: - """ - Serialize the data as json bytes. Depending on the config, - this can be done with orjson or the standard json module. - Additionally config.indent is respected. - """ - if config.use_orjson: - option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS - return orjson.dumps(data, option=option) - else: - db_dump = json.dumps(data, indent=config.indent, sort_keys=True) - return db_dump.encode() + """ + Serialize the data as json bytes. Depending on the config, + this can be done with orjson or the standard json module. + Additionally config.indent is respected. + """ + if config.use_orjson: + option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS + return orjson.dumps(data, option=option) + else: + db_dump = json.dumps(data, indent=config.indent, sort_keys=True) + return db_dump.encode() def write(db_name: str, data: dict): - """ - Write the dict db dumped as a json string - to the file of the db_path. - """ - data_bytes = serialize_data_to_json_bytes(data) - io_bytes.write(db_name, data_bytes) + """ + Write the dict db dumped as a json string + to the file of the db_path. + """ + data_bytes = serialize_data_to_json_bytes(data) + io_bytes.write(db_name, data_bytes) ################################################################################ @@ -134,128 +134,128 @@ def write(db_name: str, data: dict): def try_get_parial_file_handle_by_index( - indexer: indexing.Indexer, db_name, key + indexer: indexing.Indexer, db_name, key ) -> Tuple[PartialFileHandle | None, bytes | None]: - """ - Try to get a partial file handle by using the key entry in the index file. - - If the data could be read from the index file, a tuple of the partial file - handle and None is returned. - If the data could not be read from the index file, a tuple of None and the file - bytes is returned, so that the file bytes can be searched for the key. - """ - - if (index := indexer.get(key)) is None: - return None, io_bytes.read(db_name) - start, end, indent_level, indent_with, value_hash = index - - # If compression is enabled, all data has to be read from the file - if config.use_compression: - all_file_bytes = io_bytes.read(db_name) - value_bytes = all_file_bytes[start:end] - if value_hash != hashlib.sha256(value_bytes).hexdigest(): - return None, all_file_bytes - value_data = orjson.loads(value_bytes) - partial_dict = PartialDict( - all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:] - ) - - # If compression is disabled, only the value and suffix have to be read - else: - value_and_suffix_bytes = io_bytes.read(db_name, start=start) - value_length = end - start - value_bytes = value_and_suffix_bytes[:value_length] - if value_hash != hashlib.sha256(value_bytes).hexdigest(): - # If the hashes don't match, read the prefix to concat the full file bytes - prefix_bytes = io_bytes.read(db_name, end=start) - return None, prefix_bytes + value_and_suffix_bytes - value_data = orjson.loads(value_bytes) - partial_dict = PartialDict( - None, key, value_data, start, end, value_and_suffix_bytes[value_length:] - ) - - return ( - PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), - None, - ) + """ + Try to get a partial file handle by using the key entry in the index file. + + If the data could be read from the index file, a tuple of the partial file + handle and None is returned. + If the data could not be read from the index file, a tuple of None and the file + bytes is returned, so that the file bytes can be searched for the key. + """ + + if (index := indexer.get(key)) is None: + return None, io_bytes.read(db_name) + start, end, indent_level, indent_with, value_hash = index + + # If compression is enabled, all data has to be read from the file + if config.use_compression: + all_file_bytes = io_bytes.read(db_name) + value_bytes = all_file_bytes[start:end] + if value_hash != hashlib.sha256(value_bytes).hexdigest(): + return None, all_file_bytes + value_data = orjson.loads(value_bytes) + partial_dict = PartialDict( + all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:] + ) + + # If compression is disabled, only the value and suffix have to be read + else: + value_and_suffix_bytes = io_bytes.read(db_name, start=start) + value_length = end - start + value_bytes = value_and_suffix_bytes[:value_length] + if value_hash != hashlib.sha256(value_bytes).hexdigest(): + # If the hashes don't match, read the prefix to concat the full file bytes + prefix_bytes = io_bytes.read(db_name, end=start) + return None, prefix_bytes + value_and_suffix_bytes + value_data = orjson.loads(value_bytes) + partial_dict = PartialDict( + None, key, value_data, start, end, value_and_suffix_bytes[value_length:] + ) + + return ( + PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), + None, + ) def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: - """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. - - If the key is not found, a `KeyError` is raised. - """ - - # Search for key in the index file - indexer = indexing.Indexer(db_name) - partial_handle, all_file_bytes = try_get_parial_file_handle_by_index( - indexer, db_name, key - ) - if partial_handle is not None: - return partial_handle - - # Not found in index file, search for key in the entire file - position = searching.search_key_position_in_db(all_file_bytes, key) - - if not position.found: - raise KeyError(f'Key "{key}" not found in db "{db_name}"') - - # Key found, now determine the bounding byte indices of the value - start = position.end_byte + ( - 1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0 - ) - end = utils.seek_index_through_value_bytes(all_file_bytes, start) - - indent_level, indent_with = utils.detect_indentation_in_json_bytes( - all_file_bytes, position.start_byte - ) - - partial_value = orjson.loads(all_file_bytes[start:end]) - prefix_bytes = all_file_bytes[:start] if config.use_compression else None - partial_dict = PartialDict( - prefix_bytes, key, partial_value, start, end, all_file_bytes[end:] - ) - return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer) + """ + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. + + If the key is not found, a `KeyError` is raised. + """ + + # Search for key in the index file + indexer = indexing.Indexer(db_name) + partial_handle, all_file_bytes = try_get_parial_file_handle_by_index( + indexer, db_name, key + ) + if partial_handle is not None: + return partial_handle + + # Not found in index file, search for key in the entire file + position = searching.search_key_position_in_db(all_file_bytes, key) + + if not position.found: + raise KeyError(f'Key "{key}" not found in db "{db_name}"') + + # Key found, now determine the bounding byte indices of the value + start = position.end_byte + ( + 1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0 + ) + end = utils.seek_index_through_value_bytes(all_file_bytes, start) + + indent_level, indent_with = utils.detect_indentation_in_json_bytes( + all_file_bytes, position.start_byte + ) + + partial_value = orjson.loads(all_file_bytes[start:end]) + prefix_bytes = all_file_bytes[:start] if config.use_compression else None + partial_dict = PartialDict( + prefix_bytes, key, partial_value, start, end, all_file_bytes[end:] + ) + return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer) def partial_write(pf: PartialFileHandle): - """ - Write a partial file handle to the db. - """ - - partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value) - - # Add indentation - if pf.indent_level > 0 and pf.indent_with: - replace_this = "\n".encode() - replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode() - partial_bytes = partial_bytes.replace(replace_this, replace_with) - - # Write key info to index file - index = Index( - key=pf.partial_dict.key, - key_start=pf.partial_dict.value_start, - key_end=pf.partial_dict.value_start + len(partial_bytes), - indent_level=pf.indent_level, - indent_with=pf.indent_with, - value_hash=hashlib.sha256(partial_bytes).hexdigest(), - old_value_end=pf.partial_dict.value_end, - ) - pf.indexer.write(index) - - if pf.partial_dict.prefix is None: - # Prefix could not be determined due to compression, so write the entire file - io_bytes.write( - pf.db_name, - partial_bytes + pf.partial_dict.suffix, - start=pf.partial_dict.value_start, - ) - else: - # Prefix was determined, so only write the changed part and the suffix - io_bytes.write( - pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix - ) + """ + Write a partial file handle to the db. + """ + + partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value) + + # Add indentation + if pf.indent_level > 0 and pf.indent_with: + replace_this = "\n".encode() + replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode() + partial_bytes = partial_bytes.replace(replace_this, replace_with) + + # Write key info to index file + index = Index( + key=pf.partial_dict.key, + key_start=pf.partial_dict.value_start, + key_end=pf.partial_dict.value_start + len(partial_bytes), + indent_level=pf.indent_level, + indent_with=pf.indent_with, + value_hash=hashlib.sha256(partial_bytes).hexdigest(), + old_value_end=pf.partial_dict.value_end, + ) + pf.indexer.write(index) + + if pf.partial_dict.prefix is None: + # Prefix could not be determined due to compression, so write the entire file + io_bytes.write( + pf.db_name, + partial_bytes + pf.partial_dict.suffix, + start=pf.partial_dict.value_start, + ) + else: + # Prefix was determined, so only write the changed part and the suffix + io_bytes.write( + pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix + ) From d5499a04e5f4d6343447f2074fcaf5f6ff6d2dcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Kr=C3=B6ker?= Date: Sun, 27 Nov 2022 16:42:54 +0100 Subject: [PATCH 13/13] revert docstring indent --- dictdatabase/io_unsafe.py | 54 +++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py index 337063f..170fcb0 100644 --- a/dictdatabase/io_unsafe.py +++ b/dictdatabase/io_unsafe.py @@ -43,9 +43,9 @@ class PartialFileHandle: def read(db_name: str) -> dict: """ - Read the file at db_path from the configured storage directory. - Make sure the file exists. If it does notnot a FileNotFoundError is - raised. + Read the file at db_path from the configured storage directory. + Make sure the file exists. If it does notnot a FileNotFoundError is + raised. """ # Always use orjson to read the file, because it is faster return orjson.loads(io_bytes.read(db_name)) @@ -60,9 +60,9 @@ def try_read_bytes_using_indexer( indexer: indexing.Indexer, db_name: str, key: str ) -> bytes | None: """ - Check if the key info is saved in the file's index file. - If it is and the value has not changed, return the value bytes. - Otherwise return None. + Check if the key info is saved in the file's index file. + If it is and the value has not changed, return the value bytes. + Otherwise return None. """ if (index := indexer.get(key)) is None: @@ -76,12 +76,12 @@ def try_read_bytes_using_indexer( def partial_read_only(db_name: str, key: str) -> dict | None: """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. - If the key is not found, a `KeyError` is raised. + If the key is not found, a `KeyError` is raised. """ # Search for key in the index file @@ -107,9 +107,9 @@ def partial_read_only(db_name: str, key: str) -> dict | None: def serialize_data_to_json_bytes(data: dict) -> bytes: """ - Serialize the data as json bytes. Depending on the config, - this can be done with orjson or the standard json module. - Additionally config.indent is respected. + Serialize the data as json bytes. Depending on the config, + this can be done with orjson or the standard json module. + Additionally config.indent is respected. """ if config.use_orjson: option = (orjson.OPT_INDENT_2 if config.indent else 0) | orjson.OPT_SORT_KEYS @@ -121,8 +121,8 @@ def serialize_data_to_json_bytes(data: dict) -> bytes: def write(db_name: str, data: dict): """ - Write the dict db dumped as a json string - to the file of the db_path. + Write the dict db dumped as a json string + to the file of the db_path. """ data_bytes = serialize_data_to_json_bytes(data) io_bytes.write(db_name, data_bytes) @@ -137,12 +137,12 @@ def try_get_parial_file_handle_by_index( indexer: indexing.Indexer, db_name, key ) -> Tuple[PartialFileHandle | None, bytes | None]: """ - Try to get a partial file handle by using the key entry in the index file. + Try to get a partial file handle by using the key entry in the index file. - If the data could be read from the index file, a tuple of the partial file - handle and None is returned. - If the data could not be read from the index file, a tuple of None and the file - bytes is returned, so that the file bytes can be searched for the key. + If the data could be read from the index file, a tuple of the partial file + handle and None is returned. + If the data could not be read from the index file, a tuple of None and the file + bytes is returned, so that the file bytes can be searched for the key. """ if (index := indexer.get(key)) is None: @@ -182,12 +182,12 @@ def try_get_parial_file_handle_by_index( def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: """ - Partially read a key from a db. - The key MUST be unique in the entire db, otherwise the behavior is undefined. - This is a lot faster than reading the entire db, because it does not parse - the entire file, but only the part part of the : pair. + Partially read a key from a db. + The key MUST be unique in the entire db, otherwise the behavior is undefined. + This is a lot faster than reading the entire db, because it does not parse + the entire file, but only the part part of the : pair. - If the key is not found, a `KeyError` is raised. + If the key is not found, a `KeyError` is raised. """ # Search for key in the index file @@ -224,7 +224,7 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle: def partial_write(pf: PartialFileHandle): """ - Write a partial file handle to the db. + Write a partial file handle to the db. """ partial_bytes = serialize_data_to_json_bytes(pf.partial_dict.value)