From 80f093bf20c04bf166fa7fb07429e2940a9fa11b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 11:58:09 +0000 Subject: [PATCH] Bump github.com/klauspost/compress from 1.17.11 to 1.18.0 Bumps [github.com/klauspost/compress](https://github.com/klauspost/compress) from 1.17.11 to 1.18.0. - [Release notes](https://github.com/klauspost/compress/releases) - [Changelog](https://github.com/klauspost/compress/blob/master/.goreleaser.yml) - [Commits](https://github.com/klauspost/compress/compare/v1.17.11...v1.18.0) --- updated-dependencies: - dependency-name: github.com/klauspost/compress dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 +- .../github.com/klauspost/compress/README.md | 140 ++---- .../klauspost/compress/flate/fast_encoder.go | 63 ++- .../compress/flate/huffman_bit_writer.go | 19 +- .../klauspost/compress/flate/level1.go | 48 +- .../klauspost/compress/flate/level2.go | 2 +- .../klauspost/compress/flate/level3.go | 2 +- .../klauspost/compress/flate/level4.go | 10 +- .../klauspost/compress/flate/level5.go | 40 +- .../klauspost/compress/flate/level6.go | 32 +- .../compress/flate/matchlen_amd64.go | 16 - .../klauspost/compress/flate/matchlen_amd64.s | 66 --- .../compress/flate/matchlen_generic.go | 15 +- .../klauspost/compress/flate/stateless.go | 13 +- .../klauspost/compress/huff0/bitreader.go | 25 +- .../klauspost/compress/internal/le/le.go | 5 + .../compress/internal/le/unsafe_disabled.go | 42 ++ .../compress/internal/le/unsafe_enabled.go | 55 +++ .../klauspost/compress/s2/README.md | 2 +- .../klauspost/compress/s2/decode_other.go | 26 +- .../klauspost/compress/s2/encode_all.go | 422 +++++++++++++++++- .../klauspost/compress/s2/encode_better.go | 416 ++++++++++++++++- .../klauspost/compress/s2/encode_go.go | 12 + vendor/github.com/klauspost/compress/s2sx.mod | 3 +- .../klauspost/compress/zstd/README.md | 2 +- .../klauspost/compress/zstd/bitreader.go | 37 +- .../klauspost/compress/zstd/blockdec.go | 19 - .../klauspost/compress/zstd/blockenc.go | 27 +- .../klauspost/compress/zstd/decoder.go | 3 +- .../klauspost/compress/zstd/enc_base.go | 2 +- .../compress/zstd/matchlen_generic.go | 11 +- .../klauspost/compress/zstd/seqdec.go | 2 +- .../klauspost/compress/zstd/seqdec_amd64.s | 64 +-- .../klauspost/compress/zstd/seqdec_generic.go | 2 +- .../klauspost/compress/zstd/seqenc.go | 2 - .../klauspost/compress/zstd/snappy.go | 4 +- .../klauspost/compress/zstd/zstd.go | 7 +- vendor/modules.txt | 5 +- 39 files changed, 1217 insertions(+), 450 deletions(-) delete mode 100644 vendor/github.com/klauspost/compress/flate/matchlen_amd64.go delete mode 100644 vendor/github.com/klauspost/compress/flate/matchlen_amd64.s create mode 100644 vendor/github.com/klauspost/compress/internal/le/le.go create mode 100644 vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go create mode 100644 vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go diff --git a/go.mod b/go.mod index 0a4c4feff..d6267b37c 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/golang/snappy v0.0.4 // indirect github.com/google/go-cmp v0.6.0 github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect - github.com/klauspost/compress v1.17.11 + github.com/klauspost/compress v1.18.0 github.com/lomik/graphite-pickle v0.0.0-20171221213606-614e8df42119 github.com/lomik/og-rek v0.0.0-20170411191824-628eefeb8d80 
github.com/lomik/stop v0.0.0-20161127103810-188e98d969bd diff --git a/go.sum b/go.sum index f2ee570ee..0fa45d716 100644 --- a/go.sum +++ b/go.sum @@ -166,8 +166,8 @@ github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0/go.mod h1:1NbS8ALr github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.12.2/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index de264c85a..244ee19c4 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -14,8 +14,34 @@ This package provides various compression algorithms. [![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml) [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge) +# package usage + +Use `go get github.com/klauspost/compress@latest` to add it to your project. + +This package will support the current Go version and 2 versions back. + +* Use the `nounsafe` tag to disable all use of the "unsafe" package. +* Use the `noasm` tag to disable all assembly across packages. + +Use the links above for more information on each. + # changelog +* Feb 19th, 2025 - [1.18.0](https://github.com/klauspost/compress/releases/tag/v1.18.0) + * Add unsafe little endian loaders https://github.com/klauspost/compress/pull/1036 + * fix: check `r.err != nil` but return a nil value error `err` by @alingse in https://github.com/klauspost/compress/pull/1028 + * flate: Simplify L4-6 loading https://github.com/klauspost/compress/pull/1043 + * flate: Simplify matchlen (remove asm) https://github.com/klauspost/compress/pull/1045 + * s2: Improve small block compression speed w/o asm https://github.com/klauspost/compress/pull/1048 + * flate: Fix matchlen L5+L6 https://github.com/klauspost/compress/pull/1049 + * flate: Cleanup & reduce casts https://github.com/klauspost/compress/pull/1050 + +* Oct 11th, 2024 - [1.17.11](https://github.com/klauspost/compress/releases/tag/v1.17.11) + * zstd: Fix extra CRC written with multiple Close calls https://github.com/klauspost/compress/pull/1017 + * s2: Don't use stack for index tables https://github.com/klauspost/compress/pull/1014 + * gzhttp: No content-type on no body response code by @juliens in https://github.com/klauspost/compress/pull/1011 + * gzhttp: Do not set the content-type when response has no body by @kevinpollet in https://github.com/klauspost/compress/pull/1013 + * Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10) * gzhttp: Add TransportAlwaysDecompress option. 
https://github.com/klauspost/compress/pull/978 * gzhttp: Add supported decompress request body by @mirecl in https://github.com/klauspost/compress/pull/1002 @@ -65,9 +91,9 @@ https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/comp * zstd: Fix rare *CORRUPTION* output in "best" mode. See https://github.com/klauspost/compress/pull/876 * Oct 14th, 2023 - [v1.17.1](https://github.com/klauspost/compress/releases/tag/v1.17.1) - * s2: Fix S2 "best" dictionary wrong encoding by @klauspost in https://github.com/klauspost/compress/pull/871 + * s2: Fix S2 "best" dictionary wrong encoding https://github.com/klauspost/compress/pull/871 * flate: Reduce allocations in decompressor and minor code improvements by @fakefloordiv in https://github.com/klauspost/compress/pull/869 - * s2: Fix EstimateBlockSize on 6&7 length input by @klauspost in https://github.com/klauspost/compress/pull/867 + * s2: Fix EstimateBlockSize on 6&7 length input https://github.com/klauspost/compress/pull/867 * Sept 19th, 2023 - [v1.17.0](https://github.com/klauspost/compress/releases/tag/v1.17.0) * Add experimental dictionary builder https://github.com/klauspost/compress/pull/853 @@ -124,7 +150,7 @@ https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/comp See changes to v1.15.x * Jan 21st, 2023 (v1.15.15) - * deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739 + * deflate: Improve level 7-9 https://github.com/klauspost/compress/pull/739 * zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728 * zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745 * gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740 @@ -167,7 +193,7 @@ https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/comp * zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645 * zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644 - * zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643 + * zstd: Allow single segments up to "max decoded size" https://github.com/klauspost/compress/pull/643 * July 13, 2022 (v1.15.8) @@ -209,7 +235,7 @@ https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/comp * zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599 * zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593 * huff0: Do not check max size when reading table. 
https://github.com/klauspost/compress/pull/586 - * flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590 + * flate: Inplace hashing for level 7-9 https://github.com/klauspost/compress/pull/590 * May 11, 2022 (v1.15.4) @@ -236,12 +262,12 @@ https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/comp * zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523) * Mar 3, 2022 (v1.15.0) - * zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498) - * zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505) + * zstd: Refactor decoder [#498](https://github.com/klauspost/compress/pull/498) + * zstd: Add stream encoding without goroutines [#505](https://github.com/klauspost/compress/pull/505) * huff0: Prevent single blocks exceeding 16 bits by @klauspost in[#507](https://github.com/klauspost/compress/pull/507) - * flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509) - * gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400) - * gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510) + * flate: Inline literal emission [#509](https://github.com/klauspost/compress/pull/509) + * gzhttp: Add zstd to transport [#400](https://github.com/klauspost/compress/pull/400) + * gzhttp: Make content-type optional [#510](https://github.com/klauspost/compress/pull/510) Both compression and decompression now supports "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines. @@ -258,7 +284,7 @@ While the release has been extensively tested, it is recommended to testing when * flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503) * zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502) * zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501) #501 - * huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500) + * huff0: Use static decompression buffer up to 30% faster [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500) * Feb 17, 2022 (v1.14.3) * flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494) [#478](https://github.com/klauspost/compress/pull/478) @@ -565,12 +591,14 @@ While the release has been extensively tested, it is recommended to testing when The packages are drop-in replacements for standard libraries. 
Simply replace the import path to use them: -| old import | new import | Documentation -|--------------------|-----------------------------------------|--------------------| -| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) -| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) -| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) -| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) +Typical speed is about 2x of the standard library packages. + +| old import | new import | Documentation | +|------------------|---------------------------------------|-------------------------------------------------------------------------| +| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) | +| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) | +| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) | +| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) | * Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib). @@ -625,84 +653,6 @@ This will only use up to 4KB in memory when the writer is idle. Compression is almost always worse than the fastest compression level and each write will allocate (a little) memory. -# Performance Update 2018 - -It has been a while since we have been looking at the speed of this package compared to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD. - -The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates i could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet. - -The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller is the compressed output compared to stdlib. Negative means the output was bigger. *Loss* means the loss (or gain) in compression as a percentage difference of the input. - -The `gzstd` (standard library gzip) and `gzkp` (this package gzip) only uses one CPU core. [`pgzip`](https://github.com/klauspost/pgzip), [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) uses all 4 cores. 
[`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet). - - -## Overall differences. - -There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels. - -The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted by library to give a smoother transition between the compression levels than the standard library. - -This package attempts to provide a more smooth transition, where "1" is taking a lot of shortcuts, "5" is the reasonable trade-off and "9" is the "give me the best compression", and the values in between gives something reasonable in between. The standard library has big differences in levels 1-4, but levels 5-9 having no significant gains - often spending a lot more time than can be justified by the achieved compression. - -There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab. - -## Web Content - -This test set aims to emulate typical use in a web server. The test-set is 4GB data in 53k files, and is a mixture of (mostly) HTML, JS, CSS. - -Since level 1 and 9 are close to being the same code, they are quite close. But looking at the levels in-between the differences are quite big. - -Looking at level 6, this package is 88% faster, but will output about 6% more data. For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case. - -## Object files - -This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible. - -The picture is similar to the web content, but with small differences since this is very compressible. Levels 2-3 offer good speed, but is sacrificing quite a bit of compression. - -The standard library seems suboptimal on level 3 and 4 - offering both worse compression and speed than level 6 & 7 of this package respectively. - -## Highly Compressible File - -This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real life terms we are dealing with something like a highly redundant stream of data, etc. - -It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5 and levels 7 and 8 offering great speed for the achieved compression. - -So if you know you content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground". - -## Medium-High Compressible - -This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text based compression and more data heavy streams. - -We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both. 
- -## Medium Compressible - -I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario. - -The most notable thing is how quickly the standard library drops to very low compression speeds around level 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior. - - -## Un-compressible Content - -This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed. The only downside is that it might skip some compressible data on false detections. - - -## Huffman only compression - -This compression library adds a special compression level, named `HuffmanOnly`, which allows near linear time compression. This is done by completely disabling matching of previous data, and only reduce the number of bits to represent each character. - -This means that often used characters, like 'e' and ' ' (space) in text use the fewest bits to represent, and rare characters like '¤' takes more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM). - -Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core. - -The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%). - -The linear time compression can be used as a "better than nothing" mode, where you cannot risk the encoder to slow down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4 times speedup. - -For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/). - -This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip. # Other packages diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index c8124b5c4..0e8b1630c 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -6,8 +6,10 @@ package flate import ( - "encoding/binary" "fmt" + "math/bits" + + "github.com/klauspost/compress/internal/le" ) type fastEnc interface { @@ -58,11 +60,11 @@ const ( ) func load3232(b []byte, i int32) uint32 { - return binary.LittleEndian.Uint32(b[i:]) + return le.Load32(b, i) } func load6432(b []byte, i int32) uint64 { - return binary.LittleEndian.Uint64(b[i:]) + return le.Load64(b, i) } type tableEntry struct { @@ -134,8 +136,8 @@ func hashLen(u uint64, length, mls uint8) uint32 { // matchlen will return the match length between offsets and t in src. // The maximum length returned is maxMatchLength - 4. // It is assumed that s > t, that t >=0 and s < len(src). 
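// --- Editorial sketch, not part of the patch ---------------------------------
// The rewritten matchlen/matchlenLong below compare 8 bytes per iteration:
// they XOR two little-endian 64-bit loads and, at the first difference, count
// the equal low-order bytes with bits.TrailingZeros64(diff)>>3. A minimal
// standalone version of the same idea, using encoding/binary where the patch
// uses the internal le loaders (commonPrefix is a hypothetical name, not an
// identifier from this package):

// commonPrefix returns the length of the longest common prefix of a and b.
// Assumes imports "encoding/binary" and "math/bits".
func commonPrefix(a, b []byte) (n int) {
	// Compare 8 bytes at a time while both slices have at least 8 left.
	for len(a)-n >= 8 && len(b)-n >= 8 {
		diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if diff != 0 {
			// First differing byte: trailing zero bits / 8 = matching bytes.
			return n + bits.TrailingZeros64(diff)>>3
		}
		n += 8
	}
	// Finish the tail byte by byte.
	for n < len(a) && n < len(b) && a[n] == b[n] {
		n++
	}
	return n
}
// ------------------------------------------------------------------------------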
-func (e *fastGen) matchlen(s, t int32, src []byte) int32 { - if debugDecode { +func (e *fastGen) matchlen(s, t int, src []byte) int32 { + if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) } @@ -149,18 +151,34 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")")) } } - s1 := int(s) + maxMatchLength - 4 - if s1 > len(src) { - s1 = len(src) + s1 := min(s+maxMatchLength-4, len(src)) + left := s1 - s + n := int32(0) + for left >= 8 { + diff := le.Load64(src, s) ^ le.Load64(src, t) + if diff != 0 { + return n + int32(bits.TrailingZeros64(diff)>>3) + } + s += 8 + t += 8 + n += 8 + left -= 8 } - // Extend the match to be as long as possible. - return int32(matchLen(src[s:s1], src[t:])) + a := src[s:s1] + b := src[t:] + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n } // matchlenLong will return the match length between offsets and t in src. // It is assumed that s > t, that t >=0 and s < len(src). -func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { +func (e *fastGen) matchlenLong(s, t int, src []byte) int32 { if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) @@ -176,7 +194,28 @@ func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { } } // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) + left := len(src) - s + n := int32(0) + for left >= 8 { + diff := le.Load64(src, s) ^ le.Load64(src, t) + if diff != 0 { + return n + int32(bits.TrailingZeros64(diff)>>3) + } + s += 8 + t += 8 + n += 8 + left -= 8 + } + + a := src[s:] + b := src[t:] + for i := range a { + if a[i] != b[i] { + break + } + n++ + } + return n } // Reset the encoding table. diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index f70594c34..afdc8c053 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -5,10 +5,11 @@ package flate import ( - "encoding/binary" "fmt" "io" "math" + + "github.com/klauspost/compress/internal/le" ) const ( @@ -438,7 +439,7 @@ func (w *huffmanBitWriter) writeOutBits() { n := w.nbytes // We over-write, but faster... 
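// Editorial note: le.Store64 below writes a full 8 bytes even though only 6
// are committed (n += 6). The writer's byte buffer keeps spare room past
// bufferFlushSize, so the 2-byte over-write is harmless and lets one wide
// store replace several narrower ones, as the "We over-write" comment above
// already hints.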
- binary.LittleEndian.PutUint64(w.bytes[n:], bits) + le.Store64(w.bytes[n:], bits) n += 6 if n >= bufferFlushSize { @@ -854,7 +855,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -882,7 +883,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -905,7 +906,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -931,7 +932,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -953,7 +954,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) nbits += uint8(offsetComb) if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -1107,7 +1108,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // We must have at least 48 bits free. if nbits >= 8 { n := nbits >> 3 - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) bits >>= (n * 8) & 63 nbits -= n * 8 nbytes += n @@ -1136,7 +1137,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Remaining... 
for _, t := range input { if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 703b9a89a..c3581a342 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -1,9 +1,9 @@ package flate import ( - "encoding/binary" "fmt" - "math/bits" + + "github.com/klauspost/compress/internal/le" ) // fastGen maintains the table for matches, @@ -77,6 +77,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry + var t int32 for { nextHash := hashLen(cv, tableBits, hashBytes) candidate = e.table[nextHash] @@ -88,9 +89,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} nextHash = hashLen(now, tableBits, hashBytes) - - offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } @@ -103,8 +103,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { now >>= 8 e.table[nextHash] = tableEntry{offset: s + e.cur} - offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { + t = candidate.offset - e.cur + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } @@ -120,36 +120,10 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // literal bytes prior to s. // Extend the 4-byte match as long as possible. - t := candidate.offset - e.cur - var l = int32(4) - if false { - l = e.matchlenLong(s+4, t+4, src) + 4 - } else { - // inlined: - a := src[s+4:] - b := src[t+4:] - for len(a) >= 8 { - if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { - l += int32(bits.TrailingZeros64(diff) >> 3) - break - } - l += 8 - a = a[8:] - b = b[8:] - } - if len(a) < 8 { - b = b[:len(a)] - for i := range a { - if a[i] != b[i] { - break - } - l++ - } - } - } + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 // Extend backwards - for t > 0 && s > nextEmit && src[t-1] == src[s-1] { + for t > 0 && s > nextEmit && le.Load8(src, t-1) == le.Load8(src, s-1) { s-- t-- l++ @@ -221,8 +195,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} - offset := s - (candidate.offset - e.cur) - if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { + t = candidate.offset - e.cur + if s-t > maxMatchOffset || uint32(x) != load3232(src, t) { cv = x >> 8 s++ break diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index 876dfbe30..c8d047f2d 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -126,7 +126,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. 
t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index 7aa2b72a1..33f9fb152 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -135,7 +135,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. // t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index 23c08b325..88509e197 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -98,19 +98,19 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { e.bTable[nextHashL] = entry t = lCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // We got a long match. Use that. break } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Found a 4 match... lCandidate = e.bTable[hash7(next, tableBits)] // If the next long is a candidate, check if we should use that instead... - lOff := nextS - (lCandidate.offset - e.cur) - if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) { + lOff := lCandidate.offset - e.cur + if nextS-lOff < maxMatchOffset && load3232(src, lOff) == uint32(next) { l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:]) if l2 > l1 { s = nextS @@ -127,7 +127,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // them as literal bytes. // Extend the 4-byte match as long as possible. 
- l := e.matchlenLong(s+4, t+4, src) + 4 + l := e.matchlenLong(int(s+4), int(t+4), src) + 4 // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index 1f61ec182..6e5c21502 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -111,16 +111,16 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { - if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + if uint32(cv) == load3232(src, t) { // Store the next match e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur t2 := lCandidate.Prev.offset - e.cur - if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { - l = e.matchlen(s+4, t+4, src) + 4 - ml1 := e.matchlen(s+4, t2+4, src) + 4 + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) { + l = e.matchlen(int(s+4), int(t+4), src) + 4 + ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4 if ml1 > l { t = t2 l = ml1 @@ -130,7 +130,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { break } t = lCandidate.Prev.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Store the next match e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] @@ -140,9 +140,9 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Found a 4 match... - l = e.matchlen(s+4, t+4, src) + 4 + l = e.matchlen(int(s+4), int(t+4), src) + 4 lCandidate = e.bTable[nextHashL] // Store the next match @@ -153,8 +153,8 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // If the next long is a candidate, use that... t2 := lCandidate.Cur.offset - e.cur if nextS-t2 < maxMatchOffset { - if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { - ml := e.matchlen(nextS+4, t2+4, src) + 4 + if load3232(src, t2) == uint32(next) { + ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4 if ml > l { t = t2 s = nextS @@ -164,8 +164,8 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { } // If the previous long is a candidate, use that... t2 = lCandidate.Prev.offset - e.cur - if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { - ml := e.matchlen(nextS+4, t2+4, src) + 4 + if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) { + ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4 if ml > l { t = t2 s = nextS @@ -185,9 +185,9 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { if l == 0 { // Extend the 4-byte match as long as possible. - l = e.matchlenLong(s+4, t+4, src) + 4 + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 } else if l == maxMatchLength { - l += e.matchlenLong(s+l, t+l, src) + l += e.matchlenLong(int(s+l), int(t+l), src) } // Try to locate a better match by checking the end of best match... 
@@ -203,7 +203,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { s2 := s + skipBeginning off := s2 - t2 if t2 >= 0 && off < maxMatchOffset && off > 0 { - if l2 := e.matchlenLong(s2, t2, src); l2 > l { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { t = t2 l = l2 s = s2 @@ -423,14 +423,14 @@ func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { - if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + if uint32(cv) == load3232(src, t) { // Store the next match e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur t2 := lCandidate.Prev.offset - e.cur - if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) { l = e.matchlen(s+4, t+4, src) + 4 ml1 := e.matchlen(s+4, t2+4, src) + 4 if ml1 > l { @@ -442,7 +442,7 @@ func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { break } t = lCandidate.Prev.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Store the next match e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] @@ -452,7 +452,7 @@ func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Found a 4 match... l = e.matchlen(s+4, t+4, src) + 4 lCandidate = e.bTable[nextHashL] @@ -465,7 +465,7 @@ func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { // If the next long is a candidate, use that... t2 := lCandidate.Cur.offset - e.cur if nextS-t2 < maxMatchOffset { - if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { + if load3232(src, t2) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 @@ -476,7 +476,7 @@ func (e *fastEncL5Window) Encode(dst *tokens, src []byte) { } // If the previous long is a candidate, use that... t2 = lCandidate.Prev.offset - e.cur - if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { + if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) { ml := e.matchlen(nextS+4, t2+4, src) + 4 if ml > l { t = t2 diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index f1e9d98fa..96f5bb430 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -113,7 +113,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t = lCandidate.Cur.offset - e.cur if s-t < maxMatchOffset { - if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) { + if uint32(cv) == load3232(src, t) { // Long candidate matches at least 4 bytes. // Store the next match @@ -123,9 +123,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Check the previous long candidate as well. 
t2 := lCandidate.Prev.offset - e.cur - if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { - l = e.matchlen(s+4, t+4, src) + 4 - ml1 := e.matchlen(s+4, t2+4, src) + 4 + if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) { + l = e.matchlen(int(s+4), int(t+4), src) + 4 + ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4 if ml1 > l { t = t2 l = ml1 @@ -136,7 +136,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { } // Current value did not match, but check if previous long value does. t = lCandidate.Prev.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Store the next match e.table[nextHashS] = tableEntry{offset: nextS + e.cur} eLong := &e.bTable[nextHashL] @@ -146,9 +146,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { } t = sCandidate.offset - e.cur - if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) { + if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) { // Found a 4 match... - l = e.matchlen(s+4, t+4, src) + 4 + l = e.matchlen(int(s+4), int(t+4), src) + 4 // Look up next long candidate (at nextS) lCandidate = e.bTable[nextHashL] @@ -162,7 +162,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { const repOff = 1 t2 := s - repeat + repOff if load3232(src, t2) == uint32(cv>>(8*repOff)) { - ml := e.matchlen(s+4+repOff, t2+4, src) + 4 + ml := e.matchlen(int(s+4+repOff), int(t2+4), src) + 4 if ml > l { t = t2 l = ml @@ -175,8 +175,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // If the next long is a candidate, use that... t2 = lCandidate.Cur.offset - e.cur if nextS-t2 < maxMatchOffset { - if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) { - ml := e.matchlen(nextS+4, t2+4, src) + 4 + if load3232(src, t2) == uint32(next) { + ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4 if ml > l { t = t2 s = nextS @@ -186,8 +186,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { } // If the previous long is a candidate, use that... t2 = lCandidate.Prev.offset - e.cur - if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) { - ml := e.matchlen(nextS+4, t2+4, src) + 4 + if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) { + ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4 if ml > l { t = t2 s = nextS @@ -207,9 +207,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. if l == 0 { - l = e.matchlenLong(s+4, t+4, src) + 4 + l = e.matchlenLong(int(s+4), int(t+4), src) + 4 } else if l == maxMatchLength { - l += e.matchlenLong(s+l, t+l, src) + l += e.matchlenLong(int(s+l), int(t+l), src) } // Try to locate a better match by checking the end-of-match... 
@@ -227,7 +227,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { off := s2 - t2 if off < maxMatchOffset { if off > 0 && t2 >= 0 { - if l2 := e.matchlenLong(s2, t2, src); l2 > l { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { t = t2 l = l2 s = s2 @@ -237,7 +237,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t2 = eLong.Prev.offset - e.cur - l + skipBeginning off := s2 - t2 if off > 0 && off < maxMatchOffset && t2 >= 0 { - if l2 := e.matchlenLong(s2, t2, src); l2 > l { + if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l { t = t2 l = l2 s = s2 diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go deleted file mode 100644 index 4bd388584..000000000 --- a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.go +++ /dev/null @@ -1,16 +0,0 @@ -//go:build amd64 && !appengine && !noasm && gc -// +build amd64,!appengine,!noasm,gc - -// Copyright 2019+ Klaus Post. All rights reserved. -// License information can be found in the LICENSE file. - -package flate - -// matchLen returns how many bytes match in a and b -// -// It assumes that: -// -// len(a) <= len(b) and len(a) > 0 -// -//go:noescape -func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s b/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s deleted file mode 100644 index 0782b86e3..000000000 --- a/vendor/github.com/klauspost/compress/flate/matchlen_amd64.s +++ /dev/null @@ -1,66 +0,0 @@ -// Copied from S2 implementation. - -//go:build !appengine && !noasm && gc && !noasm - -#include "textflag.h" - -// func matchLen(a []byte, b []byte) int -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - CMPL DX, $0x08 - JB matchlen_match4_standalone - -matchlen_loopback_standalone: - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - JZ matchlen_loop_standalone - -#ifdef GOAMD64_v3 - TZCNTQ BX, BX -#else - BSFQ BX, BX -#endif - SHRL $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_loop_standalone: - LEAL -8(DX), DX - LEAL 8(SI), SI - CMPL DX, $0x08 - JAE matchlen_loopback_standalone - -matchlen_match4_standalone: - CMPL DX, $0x04 - JB matchlen_match2_standalone - MOVL (AX)(SI*1), BX - CMPL (CX)(SI*1), BX - JNE matchlen_match2_standalone - LEAL -4(DX), DX - LEAL 4(SI), SI - -matchlen_match2_standalone: - CMPL DX, $0x02 - JB matchlen_match1_standalone - MOVW (AX)(SI*1), BX - CMPW (CX)(SI*1), BX - JNE matchlen_match1_standalone - LEAL -2(DX), DX - LEAL 2(SI), SI - -matchlen_match1_standalone: - CMPL DX, $0x01 - JB gen_match_len_end - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - INCL SI - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_generic.go b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go index ad5cd814b..6149384aa 100644 --- a/vendor/github.com/klauspost/compress/flate/matchlen_generic.go +++ b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go @@ -1,27 +1,29 @@ -//go:build !amd64 || appengine || !gc || noasm -// +build !amd64 appengine !gc noasm - // Copyright 2019+ Klaus Post. All rights reserved. // License information can be found in the LICENSE file. package flate import ( - "encoding/binary" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // matchLen returns the maximum common prefix length of a and b. 
// a must be the shortest of the two. func matchLen(a, b []byte) (n int) { - for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { - diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + left := len(a) + for left >= 8 { + diff := le.Load64(a, n) ^ le.Load64(b, n) if diff != 0 { return n + bits.TrailingZeros64(diff)>>3 } n += 8 + left -= 8 } + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { break @@ -29,5 +31,4 @@ func matchLen(a, b []byte) (n int) { n++ } return n - } diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index f3d4139ef..13b9b100d 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -4,6 +4,8 @@ import ( "io" "math" "sync" + + "github.com/klauspost/compress/internal/le" ) const ( @@ -152,18 +154,11 @@ func hashSL(u uint32) uint32 { } func load3216(b []byte, i int16) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return le.Load32(b, i) } func load6416(b []byte, i int16) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 + return le.Load64(b, i) } func statelessEnc(dst *tokens, src []byte, startAt int16) { diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go index e36d9742f..bfc7a523d 100644 --- a/vendor/github.com/klauspost/compress/huff0/bitreader.go +++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go @@ -6,10 +6,11 @@ package huff0 import ( - "encoding/binary" "errors" "fmt" "io" + + "github.com/klauspost/compress/internal/le" ) // bitReader reads a bitstream in reverse. @@ -46,7 +47,7 @@ func (b *bitReaderBytes) init(in []byte) error { return nil } -// peekBitsFast requires that at least one bit is requested every time. +// peekByteFast requires that at least one byte is requested every time. // There are no checks if the buffer is filled. func (b *bitReaderBytes) peekByteFast() uint8 { got := uint8(b.value >> 56) @@ -66,8 +67,7 @@ func (b *bitReaderBytes) fillFast() { } // 2 bounds checks. - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 b.off -= 4 @@ -76,7 +76,7 @@ func (b *bitReaderBytes) fillFast() { // fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read. func (b *bitReaderBytes) fillFastStart() { // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + b.value = le.Load64(b.in, b.off-8) b.bitsRead = 0 b.off -= 8 } @@ -86,9 +86,8 @@ func (b *bitReaderBytes) fill() { if b.bitsRead < 32 { return } - if b.off > 4 { - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + if b.off >= 4 { + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 b.off -= 4 @@ -175,9 +174,7 @@ func (b *bitReaderShifted) fillFast() { return } - // 2 bounds checks. 
- v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 b.off -= 4 @@ -185,8 +182,7 @@ func (b *bitReaderShifted) fillFast() { // fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read. func (b *bitReaderShifted) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + b.value = le.Load64(b.in, b.off-8) b.bitsRead = 0 b.off -= 8 } @@ -197,8 +193,7 @@ func (b *bitReaderShifted) fill() { return } if b.off > 4 { - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 b.off -= 4 diff --git a/vendor/github.com/klauspost/compress/internal/le/le.go b/vendor/github.com/klauspost/compress/internal/le/le.go new file mode 100644 index 000000000..e54909e16 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/le/le.go @@ -0,0 +1,5 @@ +package le + +type Indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} diff --git a/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go b/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go new file mode 100644 index 000000000..0cfb5c0e2 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go @@ -0,0 +1,42 @@ +//go:build !(amd64 || arm64 || ppc64le || riscv64) || nounsafe || purego || appengine + +package le + +import ( + "encoding/binary" +) + +// Load8 will load from b at index i. +func Load8[I Indexer](b []byte, i I) byte { + return b[i] +} + +// Load16 will load from b at index i. +func Load16[I Indexer](b []byte, i I) uint16 { + return binary.LittleEndian.Uint16(b[i:]) +} + +// Load32 will load from b at index i. +func Load32[I Indexer](b []byte, i I) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +// Load64 will load from b at index i. +func Load64[I Indexer](b []byte, i I) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +// Store16 will store v at b. +func Store16(b []byte, v uint16) { + binary.LittleEndian.PutUint16(b, v) +} + +// Store32 will store v at b. +func Store32(b []byte, v uint32) { + binary.LittleEndian.PutUint32(b, v) +} + +// Store64 will store v at b. +func Store64(b []byte, v uint64) { + binary.LittleEndian.PutUint64(b, v) +} diff --git a/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go b/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go new file mode 100644 index 000000000..ada45cd90 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go @@ -0,0 +1,55 @@ +// We enable 64 bit LE platforms: + +//go:build (amd64 || arm64 || ppc64le || riscv64) && !nounsafe && !purego && !appengine + +package le + +import ( + "unsafe" +) + +// Load8 will load from b at index i. +func Load8[I Indexer](b []byte, i I) byte { + //return binary.LittleEndian.Uint16(b[i:]) + //return *(*uint16)(unsafe.Pointer(&b[i])) + return *(*byte)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Load16 will load from b at index i. 
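// Editorial note: this file is compiled only on the 64-bit little-endian
// targets named in its build constraint and when none of the
// nounsafe/purego/appengine tags are set; every other build uses the
// encoding/binary fallbacks in unsafe_disabled.go above. The generic Indexer
// constraint from le.go lets call sites pass whatever integer index type they
// already have (int16 in flate/stateless.go, int32 in the flate encoders)
// without converting it first.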
+func Load16[I Indexer](b []byte, i I) uint16 { + //return binary.LittleEndian.Uint16(b[i:]) + //return *(*uint16)(unsafe.Pointer(&b[i])) + return *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Load32 will load from b at index i. +func Load32[I Indexer](b []byte, i I) uint32 { + //return binary.LittleEndian.Uint32(b[i:]) + //return *(*uint32)(unsafe.Pointer(&b[i])) + return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Load64 will load from b at index i. +func Load64[I Indexer](b []byte, i I) uint64 { + //return binary.LittleEndian.Uint64(b[i:]) + //return *(*uint64)(unsafe.Pointer(&b[i])) + return *(*uint64)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Store16 will store v at b. +func Store16(b []byte, v uint16) { + //binary.LittleEndian.PutUint16(b, v) + *(*uint16)(unsafe.Pointer(unsafe.SliceData(b))) = v +} + +// Store32 will store v at b. +func Store32(b []byte, v uint32) { + //binary.LittleEndian.PutUint32(b, v) + *(*uint32)(unsafe.Pointer(unsafe.SliceData(b))) = v +} + +// Store64 will store v at b. +func Store64(b []byte, v uint64) { + //binary.LittleEndian.PutUint64(b, v) + *(*uint64)(unsafe.Pointer(unsafe.SliceData(b))) = v +} diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 8284bb081..1d9220cbf 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -79,7 +79,7 @@ This will take ownership of the buffer until the stream is closed. func EncodeStream(src []byte, dst io.Writer) error { enc := s2.NewWriter(dst) // The encoder owns the buffer until Flush or Close is called. - err := enc.EncodeBuffer(buf) + err := enc.EncodeBuffer(src) if err != nil { enc.Close() return err diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go index 2cb55c2c7..c99d40b69 100644 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -11,6 +11,8 @@ package s2 import ( "fmt" "strconv" + + "github.com/klauspost/compress/internal/le" ) // decode writes the decoding of src to dst. It assumes that the varint-encoded @@ -38,21 +40,18 @@ func s2Decode(dst, src []byte) int { case x < 60: s++ case x == 60: + x = uint32(src[s+1]) s += 2 - x = uint32(src[s-1]) case x == 61: - in := src[s : s+3] - x = uint32(in[1]) | uint32(in[2])<<8 + x = uint32(le.Load16(src, s+1)) s += 3 case x == 62: - in := src[s : s+4] // Load as 32 bit and shift down. 
- x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x = le.Load32(src, s) x >>= 8 s += 4 case x == 63: - in := src[s : s+5] - x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + x = le.Load32(src, s+1) s += 5 } length = int(x) + 1 @@ -85,8 +84,7 @@ func s2Decode(dst, src []byte) int { length = int(src[s]) + 4 s += 1 case 6: - in := src[s : s+2] - length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + length = int(le.Load16(src, s)) + 1<<8 s += 2 case 7: in := src[s : s+3] @@ -99,15 +97,13 @@ func s2Decode(dst, src []byte) int { } length += 4 case tagCopy2: - in := src[s : s+3] - offset = int(uint32(in[1]) | uint32(in[2])<<8) - length = 1 + int(in[0])>>2 + offset = int(le.Load16(src, s+1)) + length = 1 + int(src[s])>>2 s += 3 case tagCopy4: - in := src[s : s+5] - offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) - length = 1 + int(in[0])>>2 + offset = int(le.Load32(src, s+1)) + length = 1 + int(src[s])>>2 s += 5 } diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go index 997704569..a473b6452 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -10,14 +10,16 @@ import ( "encoding/binary" "fmt" "math/bits" + + "github.com/klauspost/compress/internal/le" ) func load32(b []byte, i int) uint32 { - return binary.LittleEndian.Uint32(b[i:]) + return le.Load32(b, i) } func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) + return le.Load64(b, i) } // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. @@ -44,7 +46,12 @@ func encodeGo(dst, src []byte) []byte { d += emitLiteral(dst[d:], src) return dst[:d] } - n := encodeBlockGo(dst[d:], src) + var n int + if len(src) < 64<<10 { + n = encodeBlockGo64K(dst[d:], src) + } else { + n = encodeBlockGo(dst[d:], src) + } if n > 0 { d += n return dst[:d] @@ -70,7 +77,6 @@ func encodeBlockGo(dst, src []byte) (d int) { debug = false ) - var table [maxTableSize]uint32 // sLimit is when to stop looking for offset/length copies. The inputMargin @@ -277,13 +283,228 @@ emitRemainder: return d } -func encodeBlockSnappyGo(dst, src []byte) (d int) { +// encodeBlockGo64K is a specialized version for compressing blocks <= 64KB +func encodeBlockGo64K(dst, src []byte) (d int) { // Initialize the hash table. const ( tableBits = 14 maxTableSize = 1 << tableBits + + debug = false ) + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. 
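// Editorial note: because this specialized path only handles blocks below
// 64 KiB (see the dispatch in encodeGo above), every position fits in a
// uint16, so the hash table above stores uint16 offsets instead of the
// uint32 offsets used by encodeBlockGo - half the table footprint for the
// small-block case.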
+ s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. 
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) var table [maxTableSize]uint32 // sLimit is when to stop looking for offset/length copies. The inputMargin @@ -467,6 +688,197 @@ emitRemainder: return d } +// encodeBlockSnappyGo64K is a special version of encodeBlockSnappyGo for sizes <64KB +func encodeBlockSnappyGo64K(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. 
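Editor's note (not part of the upstream patch): a minimal sketch of how the new small-block path above is reached from the public API. Per the encode_go.go hunk later in this patch, pure-Go builds (no amd64 assembly) route inputs of 64 KiB or less to the new *Go64K encoders, so an ordinary round trip through s2.Encode / s2.Decode is enough to exercise them.

package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/klauspost/compress/s2"
)

func main() {
	// ~20 KiB payload, well under the 64 KiB threshold checked in encode_go.go.
	src := bytes.Repeat([]byte("small-block payload "), 1024)

	// Block-format encode; on builds without the assembly encoders this
	// is handled by encodeBlockGo64K with its uint16 hash table.
	encoded := s2.Encode(nil, src)

	decoded, err := s2.Decode(nil, encoded)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%d -> %d bytes, roundtrip ok: %v\n",
		len(src), len(encoded), bytes.Equal(src, decoded))
}

The 64K variants can use uint16 hash-table entries because every candidate offset in a <=64 KiB block fits in 16 bits, roughly halving table memory compared to the uint32 tables in encodeBlockGo.
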
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go index 544cb1e17..90ebf89c2 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -348,12 +348,7 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { nextS := 0 for { // Next src position to check - nextS = (s-nextEmit)>>7 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } + nextS = min(s+(s-nextEmit)>>7+1, s+maxSkip) if nextS > sLimit { goto emitRemainder @@ -483,6 +478,415 @@ emitRemainder: return d } +func encodeBlockBetterGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>6 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. 
+ d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. 
+ sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 15 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = min(s+(s-nextEmit)>>6+1, s+maxSkip) + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. 
It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index dd1c973ca..e25b78445 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -21,6 +21,9 @@ func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockGo64K(dst, src) + } return encodeBlockGo(dst, src) } @@ -32,6 +35,9 @@ func encodeBlock(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterGo64K(dst, src) + } return encodeBlockBetterGo(dst, src) } @@ -43,6 +49,9 @@ func encodeBlockBetter(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetterSnappy(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterSnappyGo64K(dst, src) + } return encodeBlockBetterSnappyGo(dst, src) } @@ -57,6 +66,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockSnappyGo64K(dst, src) + } return encodeBlockSnappyGo(dst, src) } diff --git a/vendor/github.com/klauspost/compress/s2sx.mod b/vendor/github.com/klauspost/compress/s2sx.mod index 5a4412f90..81bda5e29 100644 --- a/vendor/github.com/klauspost/compress/s2sx.mod +++ b/vendor/github.com/klauspost/compress/s2sx.mod @@ -1,4 +1,3 @@ module github.com/klauspost/compress -go 1.19 - +go 1.22 diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md index 92e2347bb..c11d7fa28 100644 --- a/vendor/github.com/klauspost/compress/zstd/README.md +++ b/vendor/github.com/klauspost/compress/zstd/README.md @@ -6,7 +6,7 @@ A high performance compression algorithm is implemented. For now focused on spee This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. -This package is pure Go and without use of "unsafe". +This package is pure Go. Use `noasm` and `nounsafe` to disable relevant features. The `zstd` package is provided as open source software using a Go standard license. diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go index 25ca98394..d41e3e170 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitreader.go +++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go @@ -5,11 +5,12 @@ package zstd import ( - "encoding/binary" "errors" "fmt" "io" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // bitReader reads a bitstream in reverse. @@ -18,6 +19,7 @@ import ( type bitReader struct { in []byte value uint64 // Maybe use [16]byte, but shifting is awkward. 
+ cursor int // offset where next read should end bitsRead uint8 } @@ -32,6 +34,7 @@ func (b *bitReader) init(in []byte) error { if v == 0 { return errors.New("corrupt stream, did not find end of stream") } + b.cursor = len(in) b.bitsRead = 64 b.value = 0 if len(in) >= 8 { @@ -67,18 +70,15 @@ func (b *bitReader) fillFast() { if b.bitsRead < 32 { return } - v := b.in[len(b.in)-4:] - b.in = b.in[:len(b.in)-4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) + b.cursor -= 4 + b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor)) b.bitsRead -= 32 } // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. func (b *bitReader) fillFastStart() { - v := b.in[len(b.in)-8:] - b.in = b.in[:len(b.in)-8] - b.value = binary.LittleEndian.Uint64(v) + b.cursor -= 8 + b.value = le.Load64(b.in, b.cursor) b.bitsRead = 0 } @@ -87,25 +87,23 @@ func (b *bitReader) fill() { if b.bitsRead < 32 { return } - if len(b.in) >= 4 { - v := b.in[len(b.in)-4:] - b.in = b.in[:len(b.in)-4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) + if b.cursor >= 4 { + b.cursor -= 4 + b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor)) b.bitsRead -= 32 return } - b.bitsRead -= uint8(8 * len(b.in)) - for len(b.in) > 0 { - b.value = (b.value << 8) | uint64(b.in[len(b.in)-1]) - b.in = b.in[:len(b.in)-1] + b.bitsRead -= uint8(8 * b.cursor) + for b.cursor > 0 { + b.cursor -= 1 + b.value = (b.value << 8) | uint64(b.in[b.cursor]) } } // finished returns true if all bits have been read from the bit stream. func (b *bitReader) finished() bool { - return len(b.in) == 0 && b.bitsRead >= 64 + return b.cursor == 0 && b.bitsRead >= 64 } // overread returns true if more bits have been requested than is on the stream. @@ -115,13 +113,14 @@ func (b *bitReader) overread() bool { // remain returns the number of bits remaining. func (b *bitReader) remain() uint { - return 8*uint(len(b.in)) + 64 - uint(b.bitsRead) + return 8*uint(b.cursor) + 64 - uint(b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReader) close() error { // Release reference. b.in = nil + b.cursor = 0 if !b.finished() { return fmt.Errorf("%d extra bits on block, should be 0", b.remain()) } diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index 9c28840c3..0dd742fd2 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -5,14 +5,10 @@ package zstd import ( - "bytes" - "encoding/binary" "errors" "fmt" "hash/crc32" "io" - "os" - "path/filepath" "sync" "github.com/klauspost/compress/huff0" @@ -648,21 +644,6 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) { println("initializing sequences:", err) return err } - // Extract blocks... 
- if false && hist.dict == nil { - fatalErr := func(err error) { - if err != nil { - panic(err) - } - } - fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize) - var buf bytes.Buffer - fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse)) - fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse)) - fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse)) - buf.Write(in) - os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm) - } return nil } diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go index 32a7f401d..fd35ea148 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockenc.go +++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go @@ -9,6 +9,7 @@ import ( "fmt" "math" "math/bits" + "slices" "github.com/klauspost/compress/huff0" ) @@ -457,16 +458,7 @@ func fuzzFseEncoder(data []byte) int { // All 0 return 0 } - maxCount := func(a []uint32) int { - var max uint32 - for _, v := range a { - if v > max { - max = v - } - } - return int(max) - } - cnt := maxCount(hist[:maxSym]) + cnt := int(slices.Max(hist[:maxSym])) if cnt == len(data) { // RLE return 0 @@ -884,15 +876,6 @@ func (b *blockEnc) genCodes() { } } } - maxCount := func(a []uint32) int { - var max uint32 - for _, v := range a { - if v > max { - max = v - } - } - return int(max) - } if debugAsserts && mlMax > maxMatchLengthSymbol { panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax)) } @@ -903,7 +886,7 @@ func (b *blockEnc) genCodes() { panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax)) } - b.coders.mlEnc.HistogramFinished(mlMax, maxCount(mlH[:mlMax+1])) - b.coders.ofEnc.HistogramFinished(ofMax, maxCount(ofH[:ofMax+1])) - b.coders.llEnc.HistogramFinished(llMax, maxCount(llH[:llMax+1])) + b.coders.mlEnc.HistogramFinished(mlMax, int(slices.Max(mlH[:mlMax+1]))) + b.coders.ofEnc.HistogramFinished(ofMax, int(slices.Max(ofH[:ofMax+1]))) + b.coders.llEnc.HistogramFinished(llMax, int(slices.Max(llH[:llMax+1]))) } diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index bbca17234..ea2a19376 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -123,7 +123,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { } // Read bytes from the decompressed stream into p. -// Returns the number of bytes written and any error that occurred. +// Returns the number of bytes read and any error that occurred. // When the stream is done, io.EOF will be returned. 
func (d *Decoder) Read(p []byte) (int, error) { var n int @@ -323,6 +323,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { frame.bBuf = nil if frame.history.decoders.br != nil { frame.history.decoders.br.in = nil + frame.history.decoders.br.cursor = 0 } d.decoders <- block }() diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go index 5ca46038a..7d250c67f 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_base.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go @@ -116,7 +116,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 { panic(err) } if t < 0 { - err := fmt.Sprintf("s (%d) < 0", s) + err := fmt.Sprintf("t (%d) < 0", t) panic(err) } if s-t > e.maxMatchOff { diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go index 57b9c31c0..bea1779e9 100644 --- a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go +++ b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go @@ -7,20 +7,25 @@ package zstd import ( - "encoding/binary" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // matchLen returns the maximum common prefix length of a and b. // a must be the shortest of the two. func matchLen(a, b []byte) (n int) { - for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { - diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + left := len(a) + for left >= 8 { + diff := le.Load64(a, n) ^ le.Load64(b, n) if diff != 0 { return n + bits.TrailingZeros64(diff)>>3 } n += 8 + left -= 8 } + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go index d7fe6d82d..9a7de82f9 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go @@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error { return io.ErrUnexpectedEOF } var ll, mo, ml int - if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { + if br.cursor > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index f5591fa1e..a708ca6d3 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -7,9 +7,9 @@ TEXT ·sequenceDecs_decode_amd64(SB), $8-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -299,8 +299,8 @@ sequenceDecs_decode_amd64_match_len_ofs_ok: MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -335,9 +335,9 @@ error_overread: TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -598,8 +598,8 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok: MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -634,9 +634,9 @@ 
error_overread: TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -884,8 +884,8 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok: MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -920,9 +920,9 @@ error_overread: TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -1141,8 +1141,8 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok: MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -1787,9 +1787,9 @@ empty_seqs: TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -2281,8 +2281,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Update the context MOVQ ctx+16(FP), AX @@ -2349,9 +2349,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -2801,8 +2801,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Update the context MOVQ ctx+16(FP), AX @@ -2869,9 +2869,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -3465,8 +3465,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Update the context MOVQ ctx+16(FP), AX @@ -3533,9 +3533,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -4087,8 +4087,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Update the context MOVQ ctx+16(FP), AX diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go index 2fb35b788..7cec2197c 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go @@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { } for i := range seqs { var ll, mo, ml int - if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { + if br.cursor > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go index 8014174a7..65045eabd 100644 --- 
a/vendor/github.com/klauspost/compress/zstd/seqenc.go +++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go @@ -69,7 +69,6 @@ var llBitsTable = [maxLLCode + 1]byte{ func llCode(litLength uint32) uint8 { const llDeltaCode = 19 if litLength <= 63 { - // Compiler insists on bounds check (Go 1.12) return llCodeTable[litLength&63] } return uint8(highBit(litLength)) + llDeltaCode @@ -102,7 +101,6 @@ var mlBitsTable = [maxMLCode + 1]byte{ func mlCode(mlBase uint32) uint8 { const mlDeltaCode = 36 if mlBase <= 127 { - // Compiler insists on bounds check (Go 1.12) return mlCodeTable[mlBase&127] } return uint8(highBit(mlBase)) + mlDeltaCode diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go index ec13594e8..a17381b8f 100644 --- a/vendor/github.com/klauspost/compress/zstd/snappy.go +++ b/vendor/github.com/klauspost/compress/zstd/snappy.go @@ -197,7 +197,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { n, r.err = w.Write(r.block.output) if r.err != nil { - return written, err + return written, r.err } written += int64(n) continue @@ -239,7 +239,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) { } n, r.err = w.Write(r.block.output) if r.err != nil { - return written, err + return written, r.err } written += int64(n) continue diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index 066bef2a4..6252b46ae 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -5,10 +5,11 @@ package zstd import ( "bytes" - "encoding/binary" "errors" "log" "math" + + "github.com/klauspost/compress/internal/le" ) // enable debug printing @@ -110,11 +111,11 @@ func printf(format string, a ...interface{}) { } func load3232(b []byte, i int32) uint32 { - return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:]) + return le.Load32(b, i) } func load6432(b []byte, i int32) uint64 { - return binary.LittleEndian.Uint64(b[:len(b):len(b)][i:]) + return le.Load64(b, i) } type byter interface { diff --git a/vendor/modules.txt b/vendor/modules.txt index 469131394..294574dc8 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -218,14 +218,15 @@ github.com/jcmturner/rpc/v2/ndr # github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 ## explicit github.com/kardianos/osext -# github.com/klauspost/compress v1.17.11 -## explicit; go 1.21 +# github.com/klauspost/compress v1.18.0 +## explicit; go 1.22 github.com/klauspost/compress github.com/klauspost/compress/flate github.com/klauspost/compress/fse github.com/klauspost/compress/gzip github.com/klauspost/compress/huff0 github.com/klauspost/compress/internal/cpuinfo +github.com/klauspost/compress/internal/le github.com/klauspost/compress/internal/race github.com/klauspost/compress/internal/snapref github.com/klauspost/compress/s2
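Editor's note (not part of the upstream patch): a hedged aside on the internal/le package added in this bump. On little-endian targets its unsafe_enabled.go loaders replace calls such as binary.LittleEndian.Uint32(b[i:]) with a bounds-check-free pointer read. The sketch below uses a local load32 stand-in (not the internal API) to show the equivalence; callers must guarantee the offset is in range, as the encoder/decoder hot loops do.

package main

import (
	"encoding/binary"
	"fmt"
	"unsafe"
)

// load32 mirrors the pattern of internal/le.Load32: read four bytes at
// offset i without a slice bounds check. Caller guarantees i >= 0 and
// i+4 <= len(b).
func load32(b []byte, i int) uint32 {
	return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
}

func main() {
	b := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	// Equal on little-endian architectures, which is what the
	// unsafe_enabled build constraints guarantee; the nounsafe tag or a
	// big-endian target selects the portable encoding/binary fallback.
	fmt.Println(load32(b, 2) == binary.LittleEndian.Uint32(b[2:]))
}

The same idea motivates the zstd bitReader change in this patch: tracking an integer cursor instead of re-slicing b.in lets fill and fillFast read backwards with le.Load32 / le.Load64 directly.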