From 71fc7905506386a671e30a853e620d0aa6debe7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Thu, 10 Oct 2024 18:01:42 +0200 Subject: [PATCH 1/3] Don't set UncompressedSize on chunked pull MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current value obtained by summing the sizes of regular file contents does not match the size of the uncompressed layer tarball. We don't have a convenient source to compute the correct size for estargz without pulling the full layer and defeating the point; so we must allow for the size being unknown. For recent zstd:chunked images, we have the full tar-split, so we can compute the correct size; that will happen in the following commits. Signed-off-by: Miloslav Trmač --- drivers/driver.go | 2 +- layers.go | 13 ++++++++----- pkg/chunked/storage_linux.go | 24 +++++++----------------- store.go | 2 +- 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/drivers/driver.go b/drivers/driver.go index 91b240c450..957a348643 100644 --- a/drivers/driver.go +++ b/drivers/driver.go @@ -189,7 +189,7 @@ type Driver interface { type DriverWithDifferOutput struct { Differ Differ Target string - Size int64 + Size int64 // Size of the uncompressed layer, -1 if unknown. Must be known if UncompressedDigest is set. UIDs []uint32 GIDs []uint32 UncompressedDigest digest.Digest diff --git a/layers.go b/layers.go index 7491ce00ff..45696fd417 100644 --- a/layers.go +++ b/layers.go @@ -136,9 +136,12 @@ type Layer struct { TOCDigest digest.Digest `json:"toc-digest,omitempty"` // UncompressedSize is the length of the blob that was last passed to - // ApplyDiff() or create(), after we decompressed it. If - // UncompressedDigest is not set, this should be treated as if it were - // an uninitialized value. + // ApplyDiff() or create(), after we decompressed it. + // + // - If UncompressedDigest is set, this must be set to a valid value. + // - Otherwise, if TOCDigest is set, this is either valid or -1. + // - If neither of this digests is set, this should be treated as if it were + // an uninitialized value. UncompressedSize int64 `json:"diff-size,omitempty"` // CompressionType is the type of compression which we detected on the blob @@ -1214,8 +1217,8 @@ func (r *layerStore) Size(name string) (int64, error) { // We use the presence of a non-empty digest as an indicator that the size value was intentionally set, and that // a zero value is not just present because it was never set to anything else (which can happen if the layer was // created by a version of this library that didn't keep track of digest and size information). - if layer.TOCDigest != "" || layer.UncompressedDigest != "" { - return layer.UncompressedSize, nil + if layer.UncompressedDigest != "" || layer.TOCDigest != "" { + return layer.UncompressedSize, nil // This may return -1 if only TOCDigest is set } return -1, nil } diff --git a/pkg/chunked/storage_linux.go b/pkg/chunked/storage_linux.go index 26e8f1aba1..2106cd6c0c 100644 --- a/pkg/chunked/storage_linux.go +++ b/pkg/chunked/storage_linux.go @@ -1153,7 +1153,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff var compressedDigest digest.Digest var uncompressedDigest digest.Digest - var convertedBlobSize int64 + var uncompressedSize int64 = -1 if c.convertToZstdChunked { fd, err := unix.Open(dest, unix.O_TMPFILE|unix.O_RDWR|unix.O_CLOEXEC, 0o600) @@ -1185,7 +1185,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff if err != nil { return graphdriver.DriverWithDifferOutput{}, err } - convertedBlobSize = tarSize + uncompressedSize = tarSize // fileSource is a O_TMPFILE file descriptor, so we // need to keep it open until the entire file is processed. defer fileSource.Close() @@ -1255,6 +1255,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff TOCDigest: c.tocDigest, UncompressedDigest: uncompressedDigest, CompressedDigest: compressedDigest, + Size: uncompressedSize, } // When the hard links deduplication is used, file attributes are ignored because setting them @@ -1268,19 +1269,12 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff var missingParts []missingPart - mergedEntries, totalSizeFromTOC, err := c.mergeTocEntries(c.fileType, toc.Entries) + mergedEntries, err := c.mergeTocEntries(c.fileType, toc.Entries) if err != nil { return output, err } output.UIDs, output.GIDs = collectIDs(mergedEntries) - if convertedBlobSize > 0 { - // if the image was converted, store the original tar size, so that - // it can be recreated correctly. - output.Size = convertedBlobSize - } else { - output.Size = totalSizeFromTOC - } if err := maybeDoIDRemap(mergedEntries, options); err != nil { return output, err @@ -1597,9 +1591,7 @@ func mustSkipFile(fileType compressedFileType, e internal.FileMetadata) bool { return false } -func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []internal.FileMetadata) ([]fileMetadata, int64, error) { - var totalFilesSize int64 - +func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []internal.FileMetadata) ([]fileMetadata, error) { countNextChunks := func(start int) int { count := 0 for _, e := range entries[start:] { @@ -1629,10 +1621,8 @@ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []i continue } - totalFilesSize += e.Size - if e.Type == TypeChunk { - return nil, -1, fmt.Errorf("chunk type without a regular file") + return nil, fmt.Errorf("chunk type without a regular file") } if e.Type == TypeReg { @@ -1668,7 +1658,7 @@ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []i lastChunkOffset = mergedEntries[i].chunks[j].Offset } } - return mergedEntries, totalFilesSize, nil + return mergedEntries, nil } // validateChunkChecksum checks if the file at $root/$path[offset:chunk.ChunkSize] has the diff --git a/store.go b/store.go index 8f33ea217c..46a0c6ff78 100644 --- a/store.go +++ b/store.go @@ -2201,7 +2201,7 @@ func (s *store) ImageSize(id string) (int64, error) { } // The UncompressedSize is only valid if there's a digest to go with it. n := layer.UncompressedSize - if layer.UncompressedDigest == "" { + if layer.UncompressedDigest == "" || n == -1 { // Compute the size. n, err = layerStore.DiffSize("", layer.ID) if err != nil { From 7eb4a104ef10d0bcd4eaf62cf110ae0535fdd97b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Sat, 12 Oct 2024 00:04:27 +0200 Subject: [PATCH 2/3] Explicitly differentiate between empty and missing tar-split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Empty tar-split shouldn't ever happen, but being precise here doesn't hurt. Signed-off-by: Miloslav Trmač --- drivers/driver.go | 2 +- layers.go | 2 +- pkg/chunked/compression_linux.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/driver.go b/drivers/driver.go index 957a348643..dc9b12c3ca 100644 --- a/drivers/driver.go +++ b/drivers/driver.go @@ -196,7 +196,7 @@ type DriverWithDifferOutput struct { CompressedDigest digest.Digest Metadata string BigData map[string][]byte - TarSplit []byte + TarSplit []byte // nil if not available TOCDigest digest.Digest // RootDirMode is the mode of the root directory of the layer, if specified. RootDirMode *os.FileMode diff --git a/layers.go b/layers.go index 45696fd417..e29e350e9a 100644 --- a/layers.go +++ b/layers.go @@ -2513,7 +2513,7 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver return err } - if len(diffOutput.TarSplit) != 0 { + if diffOutput.TarSplit != nil { tsdata := bytes.Buffer{} compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed) if err != nil { diff --git a/pkg/chunked/compression_linux.go b/pkg/chunked/compression_linux.go index e660e2a376..5f69e3a208 100644 --- a/pkg/chunked/compression_linux.go +++ b/pkg/chunked/compression_linux.go @@ -139,7 +139,7 @@ func readEstargzChunkedManifest(blobStream ImageSourceSeekable, blobSize int64, } // readZstdChunkedManifest reads the zstd:chunked manifest from the seekable stream blobStream. -// Returns (manifest blob, parsed manifest, tar-split blob, manifest offset). +// Returns (manifest blob, parsed manifest, tar-split blob or nil, manifest offset). func readZstdChunkedManifest(blobStream ImageSourceSeekable, tocDigest digest.Digest, annotations map[string]string) ([]byte, *internal.TOC, []byte, int64, error) { offsetMetadata := annotations[internal.ManifestInfoKey] if offsetMetadata == "" { @@ -214,7 +214,7 @@ func readZstdChunkedManifest(blobStream ImageSourceSeekable, tocDigest digest.Di return nil, nil, nil, 0, fmt.Errorf("unmarshaling TOC: %w", err) } - decodedTarSplit := []byte{} + var decodedTarSplit []byte = nil if toc.TarSplitDigest != "" { if tarSplitChunk.Offset <= 0 { return nil, nil, nil, 0, fmt.Errorf("TOC requires a tar-split, but the %s annotation does not describe a position", internal.TarSplitInfoKey) From f979bada647d1d1f755e5c29bbcb0da5e9879041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Sat, 12 Oct 2024 00:24:23 +0200 Subject: [PATCH 3/3] Compute the layer size from tar-split for zstd:chunked layers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Miloslav Trmač --- pkg/chunked/compression_linux.go | 30 +++++++++++++ pkg/chunked/compression_linux_test.go | 45 +++++++++++++++++++ pkg/chunked/storage_linux.go | 63 ++++++++++++++++----------- 3 files changed, 112 insertions(+), 26 deletions(-) create mode 100644 pkg/chunked/compression_linux_test.go diff --git a/pkg/chunked/compression_linux.go b/pkg/chunked/compression_linux.go index 5f69e3a208..2dac463543 100644 --- a/pkg/chunked/compression_linux.go +++ b/pkg/chunked/compression_linux.go @@ -288,6 +288,36 @@ func ensureTOCMatchesTarSplit(toc *internal.TOC, tarSplit []byte) error { return nil } +// tarSizeFromTarSplit computes the total tarball size, using only the tarSplit metadata +func tarSizeFromTarSplit(tarSplit []byte) (int64, error) { + var res int64 = 0 + + unpacker := storage.NewJSONUnpacker(bytes.NewReader(tarSplit)) + for { + entry, err := unpacker.Next() + if err != nil { + if err == io.EOF { + break + } + return -1, fmt.Errorf("reading tar-split entries: %w", err) + } + switch entry.Type { + case storage.SegmentType: + res += int64(len(entry.Payload)) + case storage.FileType: + // entry.Size is the “logical size”, which might not be the physical size for sparse entries; + // but the way tar-split/tar/asm.WriteOutputTarStream combines FileType entries and returned files contents, + // sparse files are not supported. + // Also https://github.com/opencontainers/image-spec/blob/main/layer.md says + // > Sparse files SHOULD NOT be used because they lack consistent support across tar implementations. + res += entry.Size + default: + return -1, fmt.Errorf("unexpected tar-split entry type %q", entry.Type) + } + } + return res, nil +} + // ensureTimePointersMatch ensures that a and b are equal func ensureTimePointersMatch(a, b *time.Time) error { // We didn’t always use “timeIfNotZero” when creating the TOC, so treat time.IsZero the same as nil. diff --git a/pkg/chunked/compression_linux_test.go b/pkg/chunked/compression_linux_test.go new file mode 100644 index 0000000000..1b4a18e0db --- /dev/null +++ b/pkg/chunked/compression_linux_test.go @@ -0,0 +1,45 @@ +package chunked + +import ( + "bytes" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/vbatts/tar-split/archive/tar" + "github.com/vbatts/tar-split/tar/asm" + "github.com/vbatts/tar-split/tar/storage" +) + +func TestTarSizeFromTarSplit(t *testing.T) { + var tarball bytes.Buffer + tarWriter := tar.NewWriter(&tarball) + for _, e := range someFiles { + tf, err := typeToTarType(e.Type) + require.NoError(t, err) + err = tarWriter.WriteHeader(&tar.Header{ + Typeflag: tf, + Name: e.Name, + Size: e.Size, + Mode: e.Mode, + }) + require.NoError(t, err) + data := make([]byte, e.Size) + _, err = tarWriter.Write(data) + require.NoError(t, err) + } + err := tarWriter.Close() + require.NoError(t, err) + expectedTarSize := int64(tarball.Len()) + + var tarSplit bytes.Buffer + tsReader, err := asm.NewInputTarStream(&tarball, storage.NewJSONPacker(&tarSplit), storage.NewDiscardFilePutter()) + require.NoError(t, err) + _, err = io.Copy(io.Discard, tsReader) + require.NoError(t, err) + + res, err := tarSizeFromTarSplit(tarSplit.Bytes()) + require.NoError(t, err) + assert.Equal(t, expectedTarSize, res) +} diff --git a/pkg/chunked/storage_linux.go b/pkg/chunked/storage_linux.go index 2106cd6c0c..f438d66a47 100644 --- a/pkg/chunked/storage_linux.go +++ b/pkg/chunked/storage_linux.go @@ -89,7 +89,8 @@ type chunkedDiffer struct { // is no TOC referenced by the manifest. blobDigest digest.Digest - blobSize int64 + blobSize int64 + uncompressedTarSize int64 // -1 if unknown pullOptions map[string]string @@ -216,6 +217,7 @@ func makeConvertFromRawDiffer(store storage.Store, blobDigest digest.Digest, blo fsVerityDigests: make(map[string]string), blobDigest: blobDigest, blobSize: blobSize, + uncompressedTarSize: -1, // Will be computed later convertToZstdChunked: true, copyBuffer: makeCopyBuffer(), layersCache: layersCache, @@ -229,24 +231,33 @@ func makeZstdChunkedDiffer(store storage.Store, blobSize int64, tocDigest digest if err != nil { return nil, fmt.Errorf("read zstd:chunked manifest: %w", err) } + var uncompressedTarSize int64 = -1 + if tarSplit != nil { + uncompressedTarSize, err = tarSizeFromTarSplit(tarSplit) + if err != nil { + return nil, fmt.Errorf("computing size from tar-split") + } + } + layersCache, err := getLayersCache(store) if err != nil { return nil, err } return &chunkedDiffer{ - fsVerityDigests: make(map[string]string), - blobSize: blobSize, - tocDigest: tocDigest, - copyBuffer: makeCopyBuffer(), - fileType: fileTypeZstdChunked, - layersCache: layersCache, - manifest: manifest, - toc: toc, - pullOptions: pullOptions, - stream: iss, - tarSplit: tarSplit, - tocOffset: tocOffset, + fsVerityDigests: make(map[string]string), + blobSize: blobSize, + uncompressedTarSize: uncompressedTarSize, + tocDigest: tocDigest, + copyBuffer: makeCopyBuffer(), + fileType: fileTypeZstdChunked, + layersCache: layersCache, + manifest: manifest, + toc: toc, + pullOptions: pullOptions, + stream: iss, + tarSplit: tarSplit, + tocOffset: tocOffset, }, nil } @@ -261,16 +272,17 @@ func makeEstargzChunkedDiffer(store storage.Store, blobSize int64, tocDigest dig } return &chunkedDiffer{ - fsVerityDigests: make(map[string]string), - blobSize: blobSize, - tocDigest: tocDigest, - copyBuffer: makeCopyBuffer(), - fileType: fileTypeEstargz, - layersCache: layersCache, - manifest: manifest, - pullOptions: pullOptions, - stream: iss, - tocOffset: tocOffset, + fsVerityDigests: make(map[string]string), + blobSize: blobSize, + uncompressedTarSize: -1, // We would have to read and decompress the whole layer + tocDigest: tocDigest, + copyBuffer: makeCopyBuffer(), + fileType: fileTypeEstargz, + layersCache: layersCache, + manifest: manifest, + pullOptions: pullOptions, + stream: iss, + tocOffset: tocOffset, }, nil } @@ -1153,7 +1165,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff var compressedDigest digest.Digest var uncompressedDigest digest.Digest - var uncompressedSize int64 = -1 if c.convertToZstdChunked { fd, err := unix.Open(dest, unix.O_TMPFILE|unix.O_RDWR|unix.O_CLOEXEC, 0o600) @@ -1185,7 +1196,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff if err != nil { return graphdriver.DriverWithDifferOutput{}, err } - uncompressedSize = tarSize + c.uncompressedTarSize = tarSize // fileSource is a O_TMPFILE file descriptor, so we // need to keep it open until the entire file is processed. defer fileSource.Close() @@ -1255,7 +1266,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff TOCDigest: c.tocDigest, UncompressedDigest: uncompressedDigest, CompressedDigest: compressedDigest, - Size: uncompressedSize, + Size: c.uncompressedTarSize, } // When the hard links deduplication is used, file attributes are ignored because setting them