diff --git a/dictdatabase/dataclasses.py b/dictdatabase/dataclasses.py
new file mode 100644
index 0000000..9743ffb
--- /dev/null
+++ b/dictdatabase/dataclasses.py
@@ -0,0 +1,19 @@
+import dataclasses
+
+
+@dataclasses.dataclass(frozen=True)
+class SearchResult:
+    start_byte: int
+    end_byte: int
+    found: bool
+
+
+@dataclasses.dataclass(frozen=True)
+class Index:
+    key: str
+    value_start: int
+    value_end: int
+    indent_level: int
+    indent_with: str
+    value_hash: str
+    old_value_end: int
diff --git a/dictdatabase/index_manager.py b/dictdatabase/index_manager.py
new file mode 100644
index 0000000..eee755a
--- /dev/null
+++ b/dictdatabase/index_manager.py
@@ -0,0 +1,27 @@
+import hashlib
+
+from dictdatabase import utils
+from dictdatabase.dataclasses import Index
+
+
+def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> Index:
+    """
+    Takes the whole JSON file as bytes, a key, and the byte span of the key's value,
+    and builds the Index entry describing that key and its value.
+
+    Args:
+        all_file_bytes (bytes): The entire file as a byte string.
+        key (str): The key of the value we're indexing.
+        start (int): The start of the value in the file.
+        end (int): The end of the value in the file.
+
+    Returns:
+        An Index holding the key, the value span, the key's indentation, and the hash of the value bytes.
+    """
+    key_start, _ = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
+    indent_level, indent_with = utils.detect_indentation_in_json_bytes(
+        all_file_bytes, key_start
+    )
+    value_bytes = all_file_bytes[start:end]
+    value_hash = hashlib.sha256(value_bytes).hexdigest()
+    return Index(key, start, end, indent_level, indent_with, value_hash, end)
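Note on `create_index` above: an index entry pairs the byte span of a key's value with a SHA-256 fingerprint of those bytes, so later reads can tell whether the span is still valid. A minimal standalone sketch (illustrative byte offsets, not library code):

```python
import hashlib

# A compact file where the value of "Ben" occupies bytes [8, 19).
all_file_bytes = b'{"Ben": {"age": 30}}'
start, end = 8, 19
value_bytes = all_file_bytes[start:end]
assert value_bytes == b'{"age": 30}'

# The hash is what lets later reads detect a stale index entry.
value_hash = hashlib.sha256(value_bytes).hexdigest()

# Field order mirrors the Index dataclass:
# (key, value_start, value_end, indent_level, indent_with, value_hash, old_value_end)
entry = ("Ben", start, end, 0, "\t", value_hash, end)
```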
diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py
index c5eaabc..bb8e36a 100644
--- a/dictdatabase/indexing.py
+++ b/dictdatabase/indexing.py
@@ -1,6 +1,10 @@
-import orjson
 import os
+
+import orjson
+
 from . import config
+from .dataclasses import Index
+
 
 # Problem: Multiple read processes will concurrently read and write the same file
 # In some cases this will result in a empty read error, thats why the try-except exists
@@ -22,61 +26,65 @@ class Indexer:
-	"""
-	The Indexer takes the name of a database file, and tries to load the .index file
-	of the corresponding database file.
-
-	The name of the index file is the name of the database file, with the extension
-	.index and all "/" replaced with "___"
-
-	The content of the index file is a json object, where the keys are keys inside
-	the database json file, and the values are lists of 5 elements:
-	- start_index: The index of the first byte of the value of the key in the database file
-	- end_index: The index of the last byte of the value of the key in the database file
-	- indent_level: The indent level of the key in the database file
-	- indent_with: The indent string used.
-	- value_hash: The hash of the value bytes
-	"""
-
-	__slots__ = ("data", "path")
-
-	def __init__(self, db_name: str):
-		# Make path of index file
-		db_name = db_name.replace("/", "___")
-		self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
-
-		os.makedirs(os.path.dirname(self.path), exist_ok=True)
-		if not os.path.exists(self.path):
-			self.data = {}
-			return
-
-		try:
-			with open(self.path, "rb") as f:
-				self.data = orjson.loads(f.read())
-		except orjson.JSONDecodeError:
-			self.data = {}
-
-
-	def get(self, key):
-		"""
-		Returns a list of 5 elements for a key if it exists, otherwise None
-		Elements: [start_index, end_index, indent_level, indent_with, value_hash]
-		"""
-		return self.data.get(key, None)
-
-
-	def write(self, key, start_index, end_index, indent_level, indent_with, value_hash, old_value_end):
-		"""
-		Write index information for a key to the index file
-		"""
-
-		if self.data.get(key, None) is not None:
-			delta = end_index - old_value_end
-			for entry in self.data.values():
-				if entry[0] > old_value_end:
-					entry[0] += delta
-					entry[1] += delta
-
-		self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
-		with open(self.path, "wb") as f:
-			f.write(orjson.dumps(self.data))
+    """
+    The Indexer takes the name of a database file, and tries to load the .index file
+    of the corresponding database file.
+
+    The name of the index file is the name of the database file, with the extension
+    .index and all "/" replaced with "___"
+
+    The content of the index file is a json object, where the keys are keys inside
+    the database json file, and the values are lists of 5 elements:
+    - start_index: The index of the first byte of the value of the key in the database file
+    - end_index: The index of the last byte of the value of the key in the database file
+    - indent_level: The indent level of the key in the database file
+    - indent_with: The indent string used.
+    - value_hash: The hash of the value bytes
+    """
+
+    __slots__ = ("data", "path")
+
+    def __init__(self, db_name: str):
+        # Make path of index file
+        db_name = db_name.replace("/", "___")
+        self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
+
+        os.makedirs(os.path.dirname(self.path), exist_ok=True)
+        if not os.path.exists(self.path):
+            self.data = {}
+            return
+
+        try:
+            with open(self.path, "rb") as f:
+                self.data = orjson.loads(f.read())
+        except orjson.JSONDecodeError:
+            self.data = {}
+
+    def get(self, key):
+        """
+        Returns a list of 5 elements for a key if it exists, otherwise None
+        Elements: [start_index, end_index, indent_level, indent_with, value_hash]
+        """
+        return self.data.get(key, None)
+
+    def write(self, index: Index):
+        """
+        Write index information for a key to the index file
+        """
+
+        if self.data.get(index.key, None) is not None:
+            delta = index.value_end - index.old_value_end
+            for entry in self.data.values():
+                if entry[0] > index.old_value_end:
+                    entry[0] += delta
+                    entry[1] += delta
+
+        self.data[index.key] = [
+            index.value_start,
+            index.value_end,
+            index.indent_level,
+            index.indent_with,
+            index.value_hash,
+        ]
+        with open(self.path, "wb") as f:
+            f.write(orjson.dumps(self.data))
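Note on `Indexer.write` above: when a value changes length, every index entry that starts after the edited value has to shift by the size difference, otherwise its byte span would point into the wrong part of the file. A standalone simulation with illustrative values:

```python
import orjson

# On-disk index layout: key -> [value_start, value_end, indent_level, indent_with, value_hash]
index_data = {
    "Ben": [10, 50, 1, "\t", "a1b2"],
    "Bob": [60, 90, 1, "\t", "c3d4"],
}

old_value_end, new_value_end = 50, 58  # Ben's value grew by 8 bytes
delta = new_value_end - old_value_end
for entry in index_data.values():
    if entry[0] > old_value_end:  # entries after the edit move right
        entry[0] += delta
        entry[1] += delta
assert index_data["Bob"] == [68, 98, 1, "\t", "c3d4"]

serialized = orjson.dumps(index_data)  # what Indexer.write() persists to the .index file
```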
diff --git a/dictdatabase/io_unsafe.py b/dictdatabase/io_unsafe.py
index 7204a30..170fcb0 100644
--- a/dictdatabase/io_unsafe.py
+++ b/dictdatabase/io_unsafe.py
@@ -1,10 +1,20 @@
 from __future__ import annotations
-from typing import Tuple
+
+import hashlib
+import json
 from dataclasses import dataclass
+from typing import Tuple
+
 import orjson
-import json
-import hashlib
-from . import config, utils, byte_codes, indexing, io_bytes
+
+from . import byte_codes
+from . import config
+from . import indexing
+from . import io_bytes
+from . import searching
+from . import utils
+from .dataclasses import Index
+from .index_manager import create_index
 
 
 @dataclass(frozen=True)  # slots=True not supported by python 3.8 and 3.9
@@ -46,7 +56,9 @@ def read(db_name: str) -> dict:
 ########################################################################################
 
 
-def try_read_bytes_using_indexer(indexer: indexing.Indexer, db_name: str, key: str) -> bytes | None:
+def try_read_bytes_using_indexer(
+	indexer: indexing.Indexer, db_name: str, key: str
+) -> bytes | None:
 	"""
 	Check if the key info is saved in the file's index file.
 	If it is and the value has not changed, return the value bytes.
@@ -79,21 +91,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:
 
 	# Not found in index file, search for key in the entire file
 	all_file_bytes = io_bytes.read(db_name)
-	key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
-
-	if key_end == -1:
+	start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
+	if not found:
 		return None
-
-	# Key found, now determine the bounding byte indices of the value
-	start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
-	end = utils.seek_index_through_value_bytes(all_file_bytes, start)
-
-	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
 	value_bytes = all_file_bytes[start:end]
-	value_hash = hashlib.sha256(value_bytes).hexdigest()
-
 	# Write key info to index file
-	indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
+	indexer.write(create_index(all_file_bytes, key, start, end))
 	return orjson.loads(value_bytes)
@@ -130,7 +133,9 @@
 ################################################################################
 
 
-def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key) -> Tuple[PartialFileHandle | None, bytes | None]:
+def try_get_partial_file_handle_by_index(
+	indexer: indexing.Indexer, db_name, key
+) -> Tuple[PartialFileHandle | None, bytes | None]:
 	"""
 	Try to get a partial file handle by using the key entry in the index file.
@@ -151,7 +156,9 @@ def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key)
 		if value_hash != hashlib.sha256(value_bytes).hexdigest():
 			return None, all_file_bytes
 		value_data = orjson.loads(value_bytes)
-		partial_dict = PartialDict(all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:])
+		partial_dict = PartialDict(
+			all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:]
+		)
 
 	# If compression is disabled, only the value and suffix have to be read
 	else:
@@ -163,9 +170,14 @@
 			prefix_bytes = io_bytes.read(db_name, end=start)
 			return None, prefix_bytes + value_and_suffix_bytes
 		value_data = orjson.loads(value_bytes)
-		partial_dict = PartialDict(None, key, value_data, start, end, value_and_suffix_bytes[value_length:])
+		partial_dict = PartialDict(
+			None, key, value_data, start, end, value_and_suffix_bytes[value_length:]
+		)
 
-	return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None
+	return (
+		PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer),
+		None,
+	)
 
 
 def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
@@ -180,25 +192,33 @@
 
 	# Search for key in the index file
 	indexer = indexing.Indexer(db_name)
-	partial_handle, all_file_bytes = try_get_parial_file_handle_by_index(indexer, db_name, key)
+	partial_handle, all_file_bytes = try_get_partial_file_handle_by_index(
+		indexer, db_name, key
+	)
 	if partial_handle is not None:
 		return partial_handle
 
 	# Not found in index file, search for key in the entire file
-	key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
+	position = searching.search_key_position_in_db(all_file_bytes, key)
 
-	if key_end == -1:
-		raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")
+	if not position.found:
+		raise KeyError(f'Key "{key}" not found in db "{db_name}"')
 
 	# Key found, now determine the bounding byte indices of the value
-	start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
+	start = position.end_byte + (
+		1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0
+	)
 	end = utils.seek_index_through_value_bytes(all_file_bytes, start)
 
-	indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
+	indent_level, indent_with = utils.detect_indentation_in_json_bytes(
+		all_file_bytes, position.start_byte
+	)
 	partial_value = orjson.loads(all_file_bytes[start:end])
 	prefix_bytes = all_file_bytes[:start] if config.use_compression else None
-	partial_dict = PartialDict(prefix_bytes, key, partial_value, start, end, all_file_bytes[end:])
+	partial_dict = PartialDict(
+		prefix_bytes, key, partial_value, start, end, all_file_bytes[end:]
+	)
 	return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)
 
 
@@ -216,19 +236,26 @@
 		partial_bytes = partial_bytes.replace(replace_this, replace_with)
 
 	# Write key info to index file
-	pf.indexer.write(
+	index = Index(
 		key=pf.partial_dict.key,
-		start_index=pf.partial_dict.value_start,
-		end_index=pf.partial_dict.value_start + len(partial_bytes),
+		value_start=pf.partial_dict.value_start,
+		value_end=pf.partial_dict.value_start + len(partial_bytes),
 		indent_level=pf.indent_level,
 		indent_with=pf.indent_with,
 		value_hash=hashlib.sha256(partial_bytes).hexdigest(),
 		old_value_end=pf.partial_dict.value_end,
 	)
+	pf.indexer.write(index)
 
 	if pf.partial_dict.prefix is None:
 		# Prefix could not be determined due to compression, so write the entire file
-		io_bytes.write(pf.db_name, partial_bytes + pf.partial_dict.suffix, start=pf.partial_dict.value_start)
+		io_bytes.write(
+			pf.db_name,
+			partial_bytes + pf.partial_dict.suffix,
+			start=pf.partial_dict.value_start,
+		)
 	else:
 		# Prefix was determined, so only write the changed part and the suffix
-		io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix)
+		io_bytes.write(
+			pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix
+		)
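Note on the two index fast paths above (`try_read_bytes_using_indexer` and `try_get_partial_file_handle_by_index`): both only trust an index entry if the stored hash still matches the bytes on disk. A sketch of that staleness check, with illustrative values:

```python
import hashlib

import orjson

# Indexed span of Ben's value in this file, recorded at indexing time.
all_file_bytes = b'{"Ben": {"age": 30}}'
start, end = 8, 19
stored_hash = hashlib.sha256(all_file_bytes[start:end]).hexdigest()

# ... the file may be rewritten by another process in the meantime ...

value_bytes = all_file_bytes[start:end]
if hashlib.sha256(value_bytes).hexdigest() == stored_hash:
    value = orjson.loads(value_bytes)  # fast path: the indexed span is still valid
else:
    value = None  # stale entry: fall back to searching the whole file
```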
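Note on the glom-style search above: each dot-separated path segment narrows the byte window to the value matched so far, so the offsets returned at the end are relative to the whole file. A small walk-through, using a compact document so the offsets are easy to follow (it assumes the byte scanners in `utils` resolve top-level keys in the current window, as they do for partial reads):

```python
from dictdatabase import searching

file = b'{"users": {"Ben": {"age": 30}}}'
start, end, found = searching.search_value_position_in_db(file, "users.Ben")
assert found
assert file[start:end] == b'{"age": 30}'  # byte span of the nested value
```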
+ """ + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, key) + if key_end == -1: + return SearchResult(start_byte=-1, end_byte=-1, found=False) + start = key_end + (1 if file[key_end] == byte_codes.SPACE else 0) + end = utils.seek_index_through_value_bytes(file, start) + return SearchResult(start_byte=start, end_byte=end, found=True) + + +def search_key_position_in_db( + file: bytes, key: str, glom_searching=True +) -> SearchResult: + original_value_start = 0 + original_value_end = len(file) + original_key_start = 0 + original_key_end = len(file) + for k in key.split(".") if glom_searching else [key]: + key_start, key_end = utils.find_outermost_key_in_json_bytes(file, k) + if key_end == -1: + return SearchResult(start_byte=-1, end_byte=-1, found=False) + original_key_end = original_value_start + key_end + original_key_start = original_value_start + key_start + position = find_key_position_in_bytes(file, k) + original_value_end = original_value_start + original_value_end + original_value_start += position.start_byte + file = file[original_value_start:original_value_end] + return SearchResult(start_byte=original_key_start, end_byte=original_key_end, found=True) + + +def search_value_position_in_db( + all_file_bytes: bytes, key: str, glom_searching=True +) -> Tuple[int, int, bool]: + """ + It takes a byte string, a key, and a boolean, and returns a tuple of three integers + + Args: + all_file_bytes (bytes): The bytes of the file you're searching in. + key (str): The key to search for. + glom_searching: If True, then the key is a glom path, and we need to search for each part of the path. Defaults to + True + + Returns: + The start and end of the key in the file. + """ + original_start = 0 + original_end = len(all_file_bytes) + for k in key.split(".") if glom_searching else [key]: + position = find_key_position_in_bytes( + all_file_bytes[original_start:original_end], k + ) + if not position.found: + return -1, -1, False + original_end = original_start + position.end_byte + original_start += position.start_byte + return original_start, original_end, True diff --git a/tests/benchmark/run_parallel.py b/tests/benchmark/run_parallel.py index a3a1b6a..edce37c 100644 --- a/tests/benchmark/run_parallel.py +++ b/tests/benchmark/run_parallel.py @@ -88,9 +88,9 @@ class Scenario: ops: int = 10 def print(self): - res = f"✨ Scenario: {'🔹' * self.readers}{'🔻' * self.writers} ({self.readers}r{self.writers}w)" - res += ", 🔸 compression" if self.use_compression else "" - res += ", 💎 big file" if self.big_file else "" + res = f"Scenario: {'*' * self.readers}{'#' * self.writers} ({self.readers}r{self.writers}w)" + res += ", [] compression" if self.use_compression else "" + res += ", {} big file" if self.big_file else "" print(res) diff --git a/tests/test_glom_like_searching.py b/tests/test_glom_like_searching.py new file mode 100644 index 0000000..cc1ec05 --- /dev/null +++ b/tests/test_glom_like_searching.py @@ -0,0 +1,31 @@ +import dictdatabase as DDB + +data = { + "users": { + "Ben": {"age": 30, "job": "Software Engineer"}, + "Bob": {"age": 30, "job": "Plumbers"}, + }, + "Ben": {"job": {"age": 30, "job": "Software Engineer"}}, +} + + +def test_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="users.Ben.job").read() == "Software Engineer" + + +def test_without_glom_searching(): + DDB.at("users").create(data, force_overwrite=True) + assert DDB.at("users", key="Ben").read() == { + "job": {"age": 30, "job": "Software Engineer"} + } + + +def 
diff --git a/tests/test_glom_writing.py b/tests/test_glom_writing.py
new file mode 100644
index 0000000..4255cd0
--- /dev/null
+++ b/tests/test_glom_writing.py
@@ -0,0 +1,27 @@
+import pytest
+
+import dictdatabase as DDB
+
+data = {
+    "users": {
+        "Ben": {"age": 30, "job": "Software Engineer"},
+        "Bob": {"age": 30, "job": "Plumbers"},
+    },
+    "Ben": {"job": {"age": 30, "job": "Software Engineer"}},
+}
+
+
+def test_glom_writing():
+    DDB.at("users").create(data, force_overwrite=True)
+    with DDB.at("users", key="users.Ben").session() as (session, ben):
+        ben["status"] = "cancelled"
+        session.write()
+    assert DDB.at("users", key="users.Ben.status").read() == "cancelled"
+
+
+def test_glom_writing_sub_key_not_exists():
+    DDB.at("users").create(data, force_overwrite=True)
+    with pytest.raises(KeyError):
+        with DDB.at("users", key="users.SUBKEY").session() as (session, ben):
+            ben["status"] = "cancelled"
+            session.write()
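The write tests mirror the read semantics: a dotted path opens a session on the nested value only, and a missing path raises KeyError. On write, only the selected subtree is re-serialized; a subsequent full read sees the change. A usage sketch (assuming a configured storage directory, as above):

```python
import dictdatabase as DDB

data = {"users": {"Ben": {"age": 30}}}
DDB.at("users").create(data, force_overwrite=True)

with DDB.at("users", key="users.Ben").session() as (session, ben):
    ben["status"] = "cancelled"  # mutate just the nested value
    session.write()              # re-serializes only this subtree

assert DDB.at("users").read()["users"]["Ben"]["status"] == "cancelled"
```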