[refactor] Extract hash computation logic for reuse

smd1121 · Jan 7, 2024 · 0e87c4f · 0e87c4f
1 parent f8eabbe
commit 0e87c4f
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 36 deletions.
diff --git a/xgit/commands/cat_file.py b/xgit/commands/cat_file.py
@@ -1,13 +1,13 @@
 import sys
-import zlib
 from typing import Optional
 
 import typer
 from typer import Option, Argument
 from typing_extensions import Annotated
 
+from xgit.utils.sha import extract_data
 from xgit.types.types import Factory
-from xgit.utils.utils import get_object, check_exist
+from xgit.utils.utils import check_exist
 
 
 def cat_file(
@@ -57,8 +57,7 @@ def cat_file(
         typer.echo(f"fatal: Not a valid obj name {obj}", err=True)
         sys.exit(128)
 
-    with get_object(obj=obj).open("rb") as f:
-        data = zlib.decompress(f.read())
+    data = extract_data(object_id=obj)
 
     hdr, data = data.split(b"\x00", maxsplit=1)
     type_, size = hdr.split(b" ", maxsplit=1)

diff --git a/xgit/commands/hash_object.py b/xgit/commands/hash_object.py
@@ -1,14 +1,11 @@
 import sys
-import zlib
-import hashlib
 from typing import Optional
 
 import typer
 from typer import Option, Argument
 from typing_extensions import Annotated
 
-from xgit.utils.utils import find_repo
-from xgit.utils.constants import GIT_DIR
+from xgit.utils.sha import hash_file, do_hash_object
 
 
 def hash_object(
@@ -29,32 +26,5 @@ def hash_object(
         return
 
     for file in files:
-        with open(file, "rb") as f:
-            data = f.read()
-        object_id = do_hash_object(data, obj_type, write)
+        object_id = hash_file(file, write)
         typer.echo(object_id)
-
-
-def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
-    """
-    计算对象的哈希值；如果 write 为 True，则将内容写入到对象数据库中。
-
-    在 Git 中，对象的哈希值的计算依据以及存入文件的内容是这样的一个 bytes：
-    一个指定对象类型的字符串 + 空格 + 对象内容的长度 + \x00 + 对象内容
-    参见 https://stackoverflow.com/questions/22968856
-
-    在存储时，会将上述内容进行 zlib 压缩，然后计算 SHA-1 哈希值，作为文件名。
-    为了避免在一个目录下存储过多的文件导致性能问题，会将文件名的前两位作为目录名。
-    """
-
-    result = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
-    object_id = hashlib.sha1(result).hexdigest()
-
-    if write:
-        repo_dir = find_repo()
-        file = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
-        file.parent.mkdir(exist_ok=True)
-        with open(file, "wb") as out:
-            out.write(zlib.compress(result))
-
-    return object_id
diff --git a/xgit/utils/sha.py b/xgit/utils/sha.py
@@ -0,0 +1,41 @@
+import zlib
+import hashlib
+
+from xgit.utils.utils import find_repo, get_object
+from xgit.utils.constants import GIT_DIR
+
+
+def hash_file(file: str, write: bool = False) -> str:
+    with open(file, "rb") as f:
+        data = f.read()
+    return do_hash_object(data, "blob", write)
+
+
+def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
+    """
+    计算对象的哈希值；如果 write 为 True，则将内容写入到对象数据库中。
+
+    在 Git 中，对象的哈希值的计算依据以及存入文件的内容是这样的一个 bytes：
+    一个指定对象类型的字符串 + 空格 + 对象内容的长度 + \x00 + 对象内容
+    参见 https://stackoverflow.com/questions/22968856
+
+    在存储时，会将上述内容进行 zlib 压缩，然后计算 SHA-1 哈希值，作为文件名。
+    为了避免在一个目录下存储过多的文件导致性能问题，会将文件名的前两位作为目录名。
+    """
+
+    result = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
+    object_id = hashlib.sha1(result).hexdigest()
+
+    if write:
+        repo_dir = find_repo()
+        file = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
+        file.parent.mkdir(exist_ok=True)
+        with open(file, "wb") as out:
+            out.write(zlib.compress(result))
+
+    return object_id
+
+
+def extract_data(object_id: str) -> bytes:
+    with get_object(obj=object_id).open("rb") as f:
+        return zlib.decompress(f.read())