Skip to content

Commit

Permalink
change default chunk size to 1 MB (#342)
Browse files Browse the repository at this point in the history
This value was based on the benchmarks in f0a6e96. The exact improvement
is platform dependent, but we observed improvements of 50-80%.

Signed-off-by: Spencer Schrock <[email protected]>
  • Loading branch information
spencerschrock authored Jan 22, 2025
1 parent 1fa9614 commit 51657ea
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 18 deletions.
2 changes: 1 addition & 1 deletion benchmarks/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def create_file_of_given_size(path: str, size: int) -> None:
"""
file_path = pathlib.Path(path)
file_path.parent.mkdir(parents=True, exist_ok=True)
chunk_size = 8192
chunk_size = 1048576
num_chunks = size // chunk_size

with file_path.open("wb") as f:
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,10 @@ def build_parser() -> argparse.ArgumentParser:

param_groups = parser.add_argument_group("Internal parameters to fine-tune")
param_groups.add_argument(
"--chunk", help="chunk size (default: 8192)", type=int, default=8192
"--chunk",
help="chunk size (default: 1048576)",
type=int,
default=1048576,
)
param_groups.add_argument(
"--shard",
Expand Down
24 changes: 12 additions & 12 deletions src/model_signing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,13 @@ def _build_stream_hasher(
def _build_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
) -> Callable[[pathlib.Path], file.SimpleFileHasher]:
"""Builds the hasher factory for a serialization by file.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
Expand All @@ -184,14 +184,14 @@ def factory(path: pathlib.Path) -> file.SimpleFileHasher:
def _build_sharded_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
"""Builds the hasher factory for a serialization by file shards.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down Expand Up @@ -220,7 +220,7 @@ def set_serialize_by_file_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
) -> Self:
Expand All @@ -232,7 +232,7 @@ def set_serialize_by_file_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
max_workers: Maximum number of workers to use in parallel. Default
Expand All @@ -256,7 +256,7 @@ def set_serialize_by_file_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
allow_symlinks: bool = False,
) -> Self:
"""Configures serialization to a single digest, at file granularity.
Expand All @@ -269,7 +269,7 @@ def set_serialize_by_file_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
allow_symlinks: Controls whether symbolic links are included. If a
Expand Down Expand Up @@ -298,7 +298,7 @@ def set_serialize_by_file_shard_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -312,7 +312,7 @@ def set_serialize_by_file_shard_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file shard
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand All @@ -339,7 +339,7 @@ def set_serialize_by_file_shard_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -354,7 +354,7 @@ def set_serialize_by_file_shard_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file shard
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down
8 changes: 4 additions & 4 deletions src/model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(
file: pathlib.Path,
content_hasher: hashing.StreamingHashEngine,
*,
chunk_size: int = 8192,
chunk_size: int = 1048576,
digest_name_override: Optional[str] = None,
):
"""Initializes an instance to hash a file with a specific `HashEngine`.
Expand All @@ -92,7 +92,7 @@ def __init__(
file: The file to hash. Use `set_file` to reset it.
content_hasher: A `hashing.StreamingHashEngine` instance used to
compute the digest of the file.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
digest_name_override: Optional string to allow overriding the
Expand Down Expand Up @@ -235,7 +235,7 @@ def __init__(
*,
start: int,
end: int,
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
digest_name_override: Optional[str] = None,
):
Expand All @@ -250,7 +250,7 @@ def __init__(
end: The file offset to stop reading at. Must be strictly greater
than start. If past the file size, or -1, it will be trimmed.
Reset with `set_shard`.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down

0 comments on commit 51657ea

Please sign in to comment.