Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Batched indexing for files bigger than RAM #40

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions assets/coverage.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions dictdatabase/byte_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
SPACE = 32
TAB = 9
NEWLINE = 10
COLON = 58
COMMA = 44
40 changes: 39 additions & 1 deletion dictdatabase/indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
import os
from typing import Optional

import orjson

from . import config
from . import config, utils, byte_codes, io_bytes

# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in an empty read error, that's why the try-except exists
Expand All @@ -21,6 +22,42 @@
# - Leave everything as is. While not ideal, it works. When empty read error occurs, don't use the index for that read






@dataclass
class KeyFinderState:
    """
    Mutable scanner state used while searching a JSON object for its
    top-level keys.

    The state is kept in an object (instead of local variables) so that a
    scan can be suspended at the end of one chunk of bytes and resumed later.

    Every attribute is a real dataclass field, so each instance owns its
    values. In particular `indices` is created per instance; a plain class
    level `[]` would be shared by all instances and would leak results from
    one indexing run into the next.
    """

    # True when the previous byte was a backslash, i.e. the current byte is escaped
    skip_next: bool = False
    # True while the scan position is inside a JSON string
    in_str: bool = False
    # Number of currently open square brackets
    list_depth: int = 0
    # Number of currently open curly braces; starts at 1 because the scan
    # begins behind the opening brace of the top-level object (see `i`)
    dict_depth: int = 1
    # Position of the first byte of the current key (behind its opening quote)
    key_start: Optional[int] = None
    # Position of the closing quote of the current key
    key_end: Optional[int] = None
    # Position of the first byte of the current value
    value_start: Optional[int] = None
    # Slot for the end position of the current value
    value_end: Optional[int] = None
    # Collected result tuples, one per top-level key
    indices: list = field(default_factory=list)
    # Current scan position; 1 skips the opening curly brace at position 0
    i: int = 1


def batched_find_all_top_level_keys(db_name):
    """
    Collect the top-level key index of a database file by reading it in
    fixed-size batches instead of loading it into memory at once.

    Args:
    - `db_name`: Name of the database, passed on to `io_bytes.read_bytes`.

    Returns:
    - The `indices` list of the scanner state, filled by
      `utils.find_all_top_level_keys`.

    Raises:
    - `ValueError`: If the file is empty or does not start with an opening
      curly brace.
    """
    batch_size = 10_000_000
    state = KeyFinderState()
    batch_start = 0

    while True:
        batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_start + batch_size)

        # Check for emptiness before looking at the first byte, otherwise an
        # empty file fails with an IndexError instead of a meaningful error.
        if batch_start == 0 and (not batch_bytes or batch_bytes[0] != byte_codes.OPEN_CURLY):
            raise ValueError("The first byte of the database file must be an opening curly brace")
        if not batch_bytes:
            break

        utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))

        # Advance to the next batch. Without this the first batch is read
        # again and again and the loop never terminates.
        batch_start += batch_size

    # NOTE(review): `state.i` and the recorded positions are offsets into the
    # batch that was passed to the scanner. They are not rebased between
    # batches here, so files larger than one batch are not indexed completely
    # yet - this needs support for absolute offsets in the scanner itself.
    return state.indices





class Indexer:
"""
The Indexer takes the name of a database file, and tries to load the .index file
Expand Down Expand Up @@ -57,6 +94,7 @@ def __init__(self, db_name: str):
self.data = {}



def get(self, key):
"""
Returns a list of 5 elements for a key if it exists, otherwise None
Expand Down
57 changes: 56 additions & 1 deletion dictdatabase/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple
import os
import glob
from . import config, byte_codes
from . indexing import KeyFinderState


def file_info(db_name: str) -> Tuple[str, bool, str, bool]:
Expand Down Expand Up @@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]:
return files_all



def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
    """
    In the bytes of the json object find all top level keys and the start and
    end indices of their values.

    The scan resumes at `state.i` and runs up to `batch_size` (clamped to the
    length of `json_bytes`). For every completed top-level entry a tuple
    `(key, value_start, value_end)` is appended to `state.indices`, such that
    `json_bytes[value_start:value_end]` is exactly the value without
    surrounding whitespace. All positions are offsets into `json_bytes`.

    Example:

    0123456789
    {"a": 12}

    Appends ("a", 6, 8).

    Args:
    - `json_bytes`: Bytes of a JSON object. The scan expects to start behind
      the opening curly brace (a fresh state has i = 1 and dict_depth = 1).
    - `state`: The scanner state, it is modified in place.
    - `batch_size`: Index in `json_bytes` at which the scan stops.

    Returns:
    - The same `state` object that was passed in.
    """

    whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)
    # The state may not declare value_start. A missing attribute means that
    # no value has been started yet.
    if not hasattr(state, "value_start"):
        state.value_start = None
    # Never read behind the end of the buffer, even if batch_size is larger
    end = min(batch_size, len(json_bytes))

    while state.i < end:
        current = json_bytes[state.i]

        if state.in_str:
            # Inside a string only escapes and the closing quote matter
            if state.skip_next:
                state.skip_next = False
            elif current == byte_codes.BACKSLASH:
                state.skip_next = True
            elif current == byte_codes.QUOTE:
                state.in_str = False
                if state.value_start is None:
                    # No value started yet, so this string was a top-level key
                    state.key_end = state.i
                else:
                    state.value_end = state.i + 1
        elif current in whitespace:
            pass
        elif state.value_start is None:
            if state.key_end is None:
                # Waiting for the next key (or the end of the object)
                if current == byte_codes.QUOTE:
                    state.key_start = state.i + 1
                    state.in_str = True
                elif current == byte_codes.CLOSE_CURLY:
                    state.dict_depth -= 1
            elif current != byte_codes.COLON:
                # First byte of the value. Do not advance, so that the byte
                # is handled again as part of the value and an opening
                # bracket, brace or quote is counted.
                state.value_start = state.i
                continue
        elif (
            state.dict_depth == 1
            and state.list_depth == 0
            and current in (byte_codes.COMMA, byte_codes.CLOSE_CURLY)
        ):
            # A comma or the closing brace at the top level ends the value
            key = json_bytes[state.key_start:state.key_end].decode()
            state.indices.append((key, state.value_start, state.value_end))
            state.key_start = state.key_end = None
            state.value_start = state.value_end = None
            if current == byte_codes.CLOSE_CURLY:
                state.dict_depth -= 1
        else:
            # A byte that belongs to the current value
            if current == byte_codes.QUOTE:
                state.in_str = True
            elif current == byte_codes.OPEN_SQUARE:
                state.list_depth += 1
            elif current == byte_codes.CLOSE_SQUARE:
                state.list_depth -= 1
            elif current == byte_codes.OPEN_CURLY:
                state.dict_depth += 1
            elif current == byte_codes.CLOSE_CURLY:
                state.dict_depth -= 1
            state.value_end = state.i + 1

        state.i += 1

    return state







def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
"""
Finds the index of the next comma or closing bracket/brace after the value
of a key-value pair in a bytes object containing valid JSON when decoded.

Valid start indices are the index after the colon or the index after that.

Example:

01234567
"2": {},

Valid start indices are 4 and 5. Returns 7.

Args:
- `json_bytes`: A bytes object containing valid JSON when decoded
- `index`: The start index in json_bytes

Returns:
- The end index of the value.
- The end index of the first byte right after the value's bytes.
"""

# See https://www.json.org/json-en.html for the JSON syntax
Expand Down