diff --git a/xgit/commands/cat_file.py b/xgit/commands/cat_file.py
index 9cc5a08..65558ea 100644
--- a/xgit/commands/cat_file.py
+++ b/xgit/commands/cat_file.py
@@ -1,13 +1,13 @@
 import sys
-import zlib
 from typing import Optional
 
 import typer
 from typer import Option, Argument
 from typing_extensions import Annotated
 
+from xgit.utils.sha import extract_data
 from xgit.types.types import Factory
-from xgit.utils.utils import get_object, check_exist
+from xgit.utils.utils import check_exist
 
 
 def cat_file(
@@ -57,8 +57,7 @@ def cat_file(
         typer.echo(f"fatal: Not a valid obj name {obj}", err=True)
         sys.exit(128)
 
-    with get_object(obj=obj).open("rb") as f:
-        data = zlib.decompress(f.read())
+    data = extract_data(object_id=obj)
     hdr, data = data.split(b"\x00", maxsplit=1)
     type_, size = hdr.split(b" ", maxsplit=1)
 
diff --git a/xgit/commands/hash_object.py b/xgit/commands/hash_object.py
index 848ae7b..f0f568e 100644
--- a/xgit/commands/hash_object.py
+++ b/xgit/commands/hash_object.py
@@ -1,14 +1,11 @@
 import sys
-import zlib
-import hashlib
 from typing import Optional
 
 import typer
 from typer import Option, Argument
 from typing_extensions import Annotated
 
-from xgit.utils.utils import find_repo
-from xgit.utils.constants import GIT_DIR
+from xgit.utils.sha import hash_file, do_hash_object
 
 
 def hash_object(
@@ -29,32 +26,5 @@ def hash_object(
         return
 
     for file in files:
-        with open(file, "rb") as f:
-            data = f.read()
-        object_id = do_hash_object(data, obj_type, write)
+        object_id = hash_file(file, write, obj_type)
         typer.echo(object_id)
-
-
-def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
-    """
-    计算对象的哈希值;如果 write 为 True,则将内容写入到对象数据库中。
-
-    在 Git 中,对象的哈希值的计算依据以及存入文件的内容是这样的一个 bytes:
-    一个指定对象类型的字符串 + 空格 + 对象内容的长度 + \x00 + 对象内容
-    参见 https://stackoverflow.com/questions/22968856
-
-    在存储时,会将上述内容进行 zlib 压缩,然后计算 SHA-1 哈希值,作为文件名。
-    为了避免在一个目录下存储过多的文件导致性能问题,会将文件名的前两位作为目录名。
-    """
-
-    result = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
-    object_id = hashlib.sha1(result).hexdigest()
-
-    if write:
-        repo_dir = find_repo()
-        file = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
-        file.parent.mkdir(exist_ok=True)
-        with open(file, "wb") as out:
-            out.write(zlib.compress(result))
-
-    return object_id
diff --git a/xgit/utils/sha.py b/xgit/utils/sha.py
new file mode 100644
index 0000000..90e5633
--- /dev/null
+++ b/xgit/utils/sha.py
@@ -0,0 +1,58 @@
+import zlib
+import hashlib
+
+from xgit.utils.utils import find_repo, get_object
+from xgit.utils.constants import GIT_DIR
+
+
+def hash_file(file: str, write: bool = False, obj_type: str = "blob") -> str:
+    """
+    Hash the contents of a file as a Git object.
+
+    Args:
+        file: Path of the file whose bytes are hashed.
+        write: When True, also store the object in the object database.
+        obj_type: Object type written into the header; defaults to "blob".
+
+    Returns:
+        The hexadecimal SHA-1 object id.
+    """
+    with open(file, "rb") as f:
+        data = f.read()
+    return do_hash_object(data, obj_type, write)
+
+
+def do_hash_object(data: bytes, obj_type: str, write: bool) -> str:
+    r"""
+    Compute the id of an object; when write is True, also store the object.
+
+    Both the id and the stored file are derived from one byte string:
+        object type + b" " + decimal content length + b"\x00" + content
+    See https://stackoverflow.com/questions/22968856
+
+    The id is the SHA-1 of that uncompressed byte string. The stored file
+    holds its zlib-compressed form under objects/first-two-hex-digits/, so
+    that no single directory accumulates too many files.
+    """
+
+    result = obj_type.encode() + b" " + str(len(data)).encode() + b"\x00" + data
+    object_id = hashlib.sha1(result).hexdigest()
+
+    if write:
+        repo_dir = find_repo()
+        file = repo_dir / GIT_DIR / f"objects/{object_id[:2]}/{object_id[2:]}"
+        # parents=True so a missing objects/ directory is created, not an error.
+        file.parent.mkdir(parents=True, exist_ok=True)
+        with open(file, "wb") as out:
+            out.write(zlib.compress(result))
+
+    return object_id
+
+
+def extract_data(object_id: str) -> bytes:
+    """
+    Read the object file located by get_object and return its bytes after
+    zlib decompression.
+    """
+    with get_object(obj=object_id).open("rb") as f:
+        return zlib.decompress(f.read())