Skip to content

Commit

Permalink
[refactor] Extract hash computation logic for reuse
Browse files Browse the repository at this point in the history
  • Loading branch information
smd1121 committed Jan 7, 2024
1 parent f8eabbe commit 0e87c4f
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 36 deletions.
7 changes: 3 additions & 4 deletions xgit/commands/cat_file.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import sys
import zlib
from typing import Optional

import typer
from typer import Option, Argument
from typing_extensions import Annotated

from xgit.utils.sha import extract_data
from xgit.types.types import Factory
from xgit.utils.utils import get_object, check_exist
from xgit.utils.utils import check_exist


def cat_file(
Expand Down Expand Up @@ -57,8 +57,7 @@ def cat_file(
typer.echo(f"fatal: Not a valid obj name {obj}", err=True)
sys.exit(128)

with get_object(obj=obj).open("rb") as f:
data = zlib.decompress(f.read())
data = extract_data(object_id=obj)

hdr, data = data.split(b"\x00", maxsplit=1)
type_, size = hdr.split(b" ", maxsplit=1)
Expand Down
34 changes: 2 additions & 32 deletions xgit/commands/hash_object.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import sys
import zlib
import hashlib
from typing import Optional

import typer
from typer import Option, Argument
from typing_extensions import Annotated

from xgit.utils.utils import find_repo
from xgit.utils.constants import GIT_DIR
from xgit.utils.sha import hash_file, do_hash_object


def hash_object(
Expand All @@ -29,32 +26,5 @@ def hash_object(
return

for file in files:
with open(file, "rb") as f:
data = f.read()
object_id = do_hash_object(data, obj_type, write)
object_id = hash_file(file, write)
typer.echo(object_id)


def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
    """
    Compute the Git object ID of ``data`` and optionally store the object.

    Git hashes, and stores, the following byte string:
    object type + one space + content length in bytes + a NUL byte + content
    (see https://stackoverflow.com/questions/22968856).

    The SHA-1 is computed over that *uncompressed* byte string; its hex digest
    is the object ID. Only the copy written to disk is zlib-compressed.
    To avoid putting too many files in a single directory, the first two hex
    characters of the ID are used as a directory name and the remaining
    characters as the file name.

    Args:
        data: Raw object content.
        obj_type: Object type written into the header (e.g. "blob").
        write: If True, also write the compressed object under
            ``<repo>/<GIT_DIR>/objects/``.

    Returns:
        The hexadecimal SHA-1 object ID.
    """
    payload = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
    object_id = hashlib.sha1(payload).hexdigest()

    if write:
        repo_dir = find_repo()
        object_path = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
        # Objects are content-addressed, so an existing file already holds this
        # exact payload. Skipping the rewrite avoids needless I/O and avoids
        # failing on object files that are read-only (as files written by real
        # Git usually are).
        if not object_path.exists():
            object_path.parent.mkdir(parents=True, exist_ok=True)
            with open(object_path, "wb") as out:
                out.write(zlib.compress(payload))

    return object_id
41 changes: 41 additions & 0 deletions xgit/utils/sha.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import zlib
import hashlib

from xgit.utils.utils import find_repo, get_object
from xgit.utils.constants import GIT_DIR


def hash_file(file: str, write: bool = False, obj_type: str = "blob") -> str:
    """
    Compute the Git object ID of a file's contents.

    Args:
        file: Path of the file to read (opened in binary mode).
        write: If True, the object is also written to the object database.
        obj_type: Object type used in the hashed header. Defaults to "blob",
            which keeps existing ``hash_file(file, write)`` calls unchanged
            while letting callers pass an explicit type through.

    Returns:
        The hexadecimal object ID returned by ``do_hash_object``.
    """
    with open(file, "rb") as f:
        data = f.read()
    return do_hash_object(data, obj_type, write)


def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
    """
    Compute the Git object ID of ``data`` and optionally store the object.

    Git hashes, and stores, the following byte string:
    object type + one space + content length in bytes + a NUL byte + content
    (see https://stackoverflow.com/questions/22968856).

    The SHA-1 is computed over that *uncompressed* byte string; its hex digest
    is the object ID. Only the copy written to disk is zlib-compressed.
    To avoid putting too many files in a single directory, the first two hex
    characters of the ID are used as a directory name and the remaining
    characters as the file name.

    Args:
        data: Raw object content.
        obj_type: Object type written into the header (e.g. "blob").
        write: If True, also write the compressed object under
            ``<repo>/<GIT_DIR>/objects/``.

    Returns:
        The hexadecimal SHA-1 object ID.
    """
    payload = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
    object_id = hashlib.sha1(payload).hexdigest()

    if write:
        repo_dir = find_repo()
        object_path = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
        # Objects are content-addressed, so an existing file already holds this
        # exact payload. Skipping the rewrite avoids needless I/O and avoids
        # failing on object files that are read-only (as files written by real
        # Git usually are).
        if not object_path.exists():
            object_path.parent.mkdir(parents=True, exist_ok=True)
            with open(object_path, "wb") as out:
                out.write(zlib.compress(payload))

    return object_id


def extract_data(object_id: str) -> bytes:
    """
    Read a stored object and return its zlib-decompressed bytes.

    The object file is looked up with ``get_object`` and read in binary mode.
    The decompressed bytes are returned as-is; no header parsing happens here.
    """
    object_file = get_object(obj=object_id)
    with object_file.open("rb") as stream:
        compressed = stream.read()
    return zlib.decompress(compressed)

0 comments on commit 0e87c4f

Please sign in to comment.