Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
mkrd committed Nov 20, 2022
1 parent 8fbad87 commit f6387eb
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 2 deletions.
1 change: 1 addition & 0 deletions dictdatabase/byte_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@
SPACE = 32  # ASCII code of " "
TAB = 9  # ASCII code of "\t"
NEWLINE = 10  # ASCII code of "\n"
COLON = 58  # ASCII code of ":"
COMMA = 44  # ASCII code of ","
40 changes: 39 additions & 1 deletion dictdatabase/indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
from typing import Optional
import orjson
import os
from . import config
from . import config, utils, byte_codes, io_bytes

# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in an empty read error, that's why the try-except exists
Expand All @@ -21,6 +22,42 @@
# - Leave everything as is. While not ideal, it works. When an empty read error occurs, don't use the index for that read






@dataclass
class KeyFinderState:
    """
    Mutable scanner state used while searching a JSON object for its
    top-level keys.

    The state is kept outside the scanning function so that a scan can be
    suspended at the end of one batch of bytes and resumed on the next one.

    Every attribute is a real dataclass field, so each instance owns its own
    values. In particular `indices` is created per instance; a plain
    `indices = []` class attribute would be shared by all instances and would
    leak results from one scan into the next.
    """

    # True when the previous byte was a backslash, i.e. the next byte is escaped.
    skip_next: bool = False
    # True while the scanner is inside a JSON string.
    in_str: bool = False
    # Number of currently open "[" brackets.
    list_depth: int = 0
    # Number of currently open "{" braces. Starts at 1 because the scan begins
    # after the opening brace of the root object (see `i`).
    dict_depth: int = 1
    # Index of the first byte of the current key (right after its opening quote).
    key_start: Optional[int] = None
    # Index of the closing quote of the current key.
    key_end: Optional[int] = None
    # Index of the first byte of the current key's value.
    value_start: Optional[int] = None
    # Index right after the last byte of the current key's value.
    value_end: Optional[int] = None
    # Collected results, one tuple per top-level key.
    indices: list = field(default_factory=list)
    # Current read position. Starts at 1 to skip the root object's opening brace.
    i: int = 1


def batched_find_all_top_level_keys(db_name, batch_size=10_000_000):
    """
    Read a database file batch by batch and collect its top-level keys.

    Args:
    - `db_name`: Name of the database, passed through to `io_bytes.read_bytes`.
    - `batch_size`: Number of bytes requested per read.

    Returns:
    - The `indices` list of the `KeyFinderState` that was filled by
      `utils.find_all_top_level_keys`.

    Raises:
    - `ValueError` if the file is empty or does not start with "{".

    NOTE(review): the scanner position `state.i` is not rebased between
    batches and `utils.find_all_top_level_keys` indexes into the batch buffer,
    so it looks like only the first batch is effectively scanned and reported
    offsets are relative to that buffer. Files larger than `batch_size` need
    follow-up work; confirm against `io_bytes.read_bytes` and the scanner.
    """
    state = KeyFinderState()
    batch_index = 0
    while True:
        batch_start = batch_index * batch_size
        batch_end = batch_start + batch_size
        batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_end)

        # Check the length before touching the first byte, so that an empty
        # file raises the descriptive ValueError instead of an IndexError.
        if batch_index == 0 and (len(batch_bytes) == 0 or batch_bytes[0] != byte_codes.OPEN_CURLY):
            raise ValueError("The first byte of the database file must be an opening curly brace")
        # An empty read means the end of the file has been reached.
        if len(batch_bytes) == 0:
            break

        utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))
        # Advance to the next batch; without this the same batch is read forever.
        batch_index += 1
    return state.indices





class Indexer:
"""
The Indexer takes the name of a database file, and tries to load the .index file
Expand Down Expand Up @@ -57,6 +94,7 @@ def __init__(self, db_name: str):
self.data = {}



def get(self, key):
"""
Returns a list of 5 elements for a key if it exists, otherwise None
Expand Down
57 changes: 56 additions & 1 deletion dictdatabase/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Tuple
import os
import glob
from . import config, byte_codes
from . indexing import KeyFinderState


def file_info(db_name: str) -> Tuple[str, bool, str, bool]:
Expand Down Expand Up @@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]:
return files_all



def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
    """
    In the bytes of the json object find all top level keys and the start and end
    indices of their values.

    Scanning starts at `state.i` and stops at `batch_size` or at the end of
    `json_bytes`, whichever comes first. For every completed top-level
    key-value pair a tuple `(key, value_start, value_end)` is appended to
    `state.indices`, where `json_bytes[value_start:value_end]` is the value
    without surrounding whitespace.

    Args:
    - `json_bytes`: Bytes of a JSON object. `state.i` and all recorded
      positions are indices into this buffer.
    - `state`: Scanner state; it is mutated in place.
    - `batch_size`: Upper bound (exclusive) for `state.i`.

    Returns:
    - The same `state` object that was passed in.

    NOTE(review): key and value positions are relative to `json_bytes`, so a
    key or value that straddles two separately scanned buffers is not handled.
    """
    whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)
    # Never index past the buffer, even if batch_size overstates its length.
    limit = min(batch_size, len(json_bytes))

    while state.i < limit:
        current = json_bytes[state.i]

        # A key has been read but its value has not started yet: skip the
        # separator, then fall through so that the first byte of the value is
        # processed like any other byte (it may open a string, list or dict).
        if state.key_end is not None and state.value_start is None:
            if current == byte_codes.COLON or current in whitespace:
                state.i += 1
                continue
            state.value_start = state.value_end = state.i

        # While key_end is set, everything that is read belongs to a value.
        in_value = state.key_end is not None
        at_top_level = state.dict_depth == 1 and state.list_depth == 0

        if state.in_str:
            if state.skip_next:
                # This byte is escaped and has no structural meaning.
                state.skip_next = False
            elif current == byte_codes.BACKSLASH:
                state.skip_next = True
            elif current == byte_codes.QUOTE:
                state.in_str = False
                if not in_value and state.key_start is not None:
                    # Closing quote of a top-level key.
                    state.key_end = state.i
                    state.value_start = None
        elif current == byte_codes.QUOTE:
            state.in_str = True
            if not in_value and at_top_level:
                # Opening quote of a top-level key.
                state.key_start = state.i + 1
        elif in_value and at_top_level and current in (byte_codes.COMMA, byte_codes.CLOSE_CURLY):
            # A comma or the closing brace of the root object ends the value.
            key = json_bytes[state.key_start:state.key_end].decode()
            state.indices.append((key, state.value_start, state.value_end))
            state.key_start = state.key_end = None
            state.value_start = state.value_end = None
            in_value = False
            if current == byte_codes.CLOSE_CURLY:
                state.dict_depth -= 1
        elif current == byte_codes.OPEN_SQUARE:
            state.list_depth += 1
        elif current == byte_codes.CLOSE_SQUARE:
            state.list_depth -= 1
        elif current == byte_codes.OPEN_CURLY:
            state.dict_depth += 1
        elif current == byte_codes.CLOSE_CURLY:
            state.dict_depth -= 1

        # Remember the end of the value seen so far; whitespace only counts
        # when it is part of a string.
        if in_value and (state.in_str or current not in whitespace):
            state.value_end = state.i + 1

        state.i += 1

    return state







def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
"""
Finds the index of the next comma or closing bracket/brace after the value
of a key-value pair in a bytes object containing valid JSON when decoded.
Valid start indices are the index after the colon or the index after that.
Example:
01234567
"2": {},
Valid start indices are 4 and 5. Returns 7.
Args:
- `json_bytes`: A bytes object containing valid JSON when decoded
- `index`: The start index in json_bytes
Returns:
- The end index of the value.
- The end index of the first byte right after the value's bytes.
"""

# See https://www.json.org/json-en.html for the JSON syntax
Expand Down

0 comments on commit f6387eb

Please sign in to comment.