mkrd · mkrd · Nov 19, 2022 · Nov 20, 2022 · Nov 20, 2022 · Nov 20, 2022
diff --git a/assets/coverage.svg b/assets/coverage.svg
diff --git a/dictdatabase/byte_codes.py b/dictdatabase/byte_codes.py
@@ -8,4 +8,5 @@
 SPACE = 32
 TAB = 9
 NEWLINE = 10
+COLON = 58
 COMMA = 44
diff --git a/dictdatabase/indexing.py b/dictdatabase/indexing.py
@@ -1,6 +1,7 @@
+from dataclasses import dataclass, field
+from typing import Optional
 import orjson
 import os
-from . import config
+from . import config, utils, byte_codes, io_bytes
 
 # Problem: Multiple read processes will concurrently read and write the same file.
 # In some cases this will result in an empty read error; that's why the try-except exists.
@@ -21,6 +22,42 @@
 # - Leave everything as is. While not ideal, it works. When empty read error occurs, don't use the index for that read
 
 
+
+
+
+
+@dataclass
+class KeyFinderState:
+	"""
+		Mutable scanner state that is handed to utils.find_all_top_level_keys
+		and updated in place while the bytes of a database file are scanned.
+
+		Every attribute is a real dataclass field, so each instance owns its
+		own values. In particular, each instance gets its own `indices` list
+		instead of sharing one list across all instances.
+	"""
+
+	# True right after a backslash: the next byte is escaped and must be ignored
+	skip_next: bool = False
+	# True while the scanner is inside a JSON string
+	in_str: bool = False
+	# Nesting depth of lists and dicts at the current position.
+	# dict_depth starts at 1 and i at 1, presumably because scanning starts
+	# right after the opening curly brace at index 0.
+	list_depth: int = 0
+	dict_depth: int = 1
+	# Byte positions of the key that is currently being processed
+	key_start: Optional[int] = None
+	key_end: Optional[int] = None
+	# Byte positions of the value that belongs to the current key
+	value_start: Optional[int] = None
+	value_end: Optional[int] = None
+	# Collected results, one tuple (key, value_start, value_end) per key
+	indices: list = field(default_factory=list)
+	# Index of the next byte to look at
+	i: int = 1
+
+
+def batched_find_all_top_level_keys(db_name):
+	"""
+		Read the database file in fixed-size batches and collect the top level
+		keys that utils.find_all_top_level_keys records in the scanner state.
+
+		Args:
+		- `db_name`: Name of the database, passed on to io_bytes.read_bytes
+
+		Returns:
+		- The `indices` list of the scanner state.
+
+		Raises:
+		- `ValueError` if the file is empty or does not start with an opening
+		  curly brace.
+	"""
+	batch_size = 10_000_000
+	state, b = KeyFinderState(), 0
+	while True:
+		batch_start = b * batch_size
+		batch_end = batch_start + batch_size
+
+		batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_end)
+
+		# Check for emptiness before indexing, so that an empty file is reported
+		# with the intended error instead of an IndexError.
+		if batch_start == 0 and (len(batch_bytes) == 0 or batch_bytes[0] != byte_codes.OPEN_CURLY):
+			raise ValueError("The first byte of the database file must be an opening curly brace")
+		# Assumes read_bytes returns no bytes once batch_start is past the end
+		# of the file. TODO confirm against io_bytes.
+		if len(batch_bytes) == 0:
+			break
+		utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))
+		# Advance to the next batch, otherwise the same bytes are read forever
+		b += 1
+		# NOTE(review): state.i is carried over unchanged between batches and
+		# the scanner indexes into the batch it is given, so the offsets it
+		# records are relative to that batch. Files larger than one batch need
+		# further work before their results can be trusted.
+	return state.indices
+
+
+
+
+
 class Indexer:
 	"""
 		The Indexer takes the name of a database file, and tries to load the .index file
@@ -57,6 +94,7 @@ def __init__(self, db_name: str):
 			self.data = {}
 
 
+
 	def get(self, key):
 		"""
 			Returns a list of 5 elements for a key if it exists, otherwise None

diff --git a/dictdatabase/utils.py b/dictdatabase/utils.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
+from dataclasses import dataclass
 from typing import Tuple
 import os
 import glob
 from . import config, byte_codes
+from . indexing import KeyFinderState
 
 
 def file_info(db_name: str) -> Tuple[str, bool, str, bool]:
@@ -37,17 +39,70 @@ def find_all(file_name: str) -> list[str]:
 	return files_all
 
 
+
+def find_all_top_level_keys(json_bytes: bytes, state: KeyFinderState, batch_size: int) -> KeyFinderState:
+	"""
+		In the bytes of the json object find all top level keys and the start and end
+		indices of their values.
+
+		For every top level key a tuple (key, value_start, value_end) is appended
+		to `state.indices`. `value_start` is the index of the first byte of the
+		value and `value_end` is the index of the first byte right after the
+		value's bytes. Both are indices into `json_bytes`.
+
+		Args:
+		- `json_bytes`: The bytes to scan
+		- `state`: The scanner state, it is updated in place
+		- `batch_size`: Scanning stops at this index or at the end of `json_bytes`
+
+		Returns:
+		- The same `state` object that was passed in.
+	"""
+
+	whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)
+	# Never index past the bytes that were actually handed in
+	end = min(batch_size, len(json_bytes))
+
+	while state.i < end:
+		current = json_bytes[state.i]
+		if state.skip_next:
+			# Escaped byte, it has no structural meaning
+			state.skip_next = False
+		elif current == byte_codes.BACKSLASH:
+			state.skip_next = True
+		elif state.in_str:
+			if current == byte_codes.QUOTE:
+				state.in_str = False
+				if state.key_start is not None and state.key_end is None:
+					# Closing quote of a top level key, its value comes next
+					state.key_end = state.i
+					state.value_start = None
+				else:
+					# Closing quote of a string that is part of the value
+					state.value_end = state.i + 1
+		elif current in whitespace:
+			pass
+		elif state.key_start is None:
+			# Between two entries: only a new key or the final brace matter here
+			if current == byte_codes.QUOTE and state.dict_depth == 1:
+				state.key_start = state.i + 1
+				state.in_str = True
+			elif current == byte_codes.CLOSE_CURLY:
+				state.dict_depth -= 1
+		elif state.value_start is None and current == byte_codes.COLON:
+			# Separator between the key and its value
+			pass
+		elif (
+			state.value_start is not None
+			and state.dict_depth == 1
+			and state.list_depth == 0
+			and current in (byte_codes.COMMA, byte_codes.CLOSE_CURLY)
+		):
+			# A top level comma or closing brace terminates the current value
+			key = json_bytes[state.key_start:state.key_end].decode()
+			state.indices.append((key, state.value_start, state.value_end))
+			state.key_start = state.key_end = None
+			state.value_start = state.value_end = None
+			if current == byte_codes.CLOSE_CURLY:
+				state.dict_depth -= 1
+		else:
+			# A byte of the value that lies outside of any string
+			if state.value_start is None:
+				state.value_start = state.i
+			state.value_end = state.i + 1
+			if current == byte_codes.QUOTE:
+				state.in_str = True
+			elif current == byte_codes.OPEN_SQUARE:
+				state.list_depth += 1
+			elif current == byte_codes.CLOSE_SQUARE:
+				state.list_depth -= 1
+			elif current == byte_codes.OPEN_CURLY:
+				state.dict_depth += 1
+			elif current == byte_codes.CLOSE_CURLY:
+				state.dict_depth -= 1
+		state.i += 1
+	# NOTE(review): state.i is not reset here and keys are sliced out of the
+	# json_bytes of the current call, so a key or value that spans two calls
+	# is not handled. Confirm how the caller is meant to resume.
+	return state
+
+
+
+
+
+
+
 def seek_index_through_value_bytes(json_bytes: bytes, index: int) -> int:
 	"""
 	Finds the index of the next comma or closing bracket/brace after the value
 	of a key-value pair in a bytes object containing valid JSON when decoded.
 
+	Valid start indices are the index after the colon or the index after that.
+
+	Example:
+
+	01234567
+	"2": {},
+
+	Valid start indices are 4 and 5. Returns 7.
+
 	Args:
 	- `json_bytes`: A bytes object containing valid JSON when decoded
 	- `index`: The start index in json_bytes
 
 	Returns:
-	- The end index of the value.
+	- The end index of the first byte right after the value's bytes.
 	"""
 
 	# See https://www.json.org/json-en.html for the JSON syntax