From ab45429c5f8212e551ae6115559bf255b16ee1bb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 12 Jul 2024 12:08:39 -0700 Subject: [PATCH 1/5] - add WARC-Protocol header combining http + tls protocols as comma separated - add WARC-Cipher-Suite header, mapping Chrome NetworkSecurityDetails to known cipher suites - fixes #641 --- src/util/recorder.ts | 8 +++++++ src/util/reqresp.ts | 53 ++++++++++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index be36cc3b..e6a90942 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1755,6 +1755,14 @@ function createResponse( "WARC-Page-ID": pageid, }; + if (reqresp.protocols.length) { + warcHeaders["WARC-Protocol"] = reqresp.protocols.join(", "); + } + + if (reqresp.cipher) { + warcHeaders["WARC-Cipher-Suite"] = reqresp.cipher; + } + if (reqresp.resourceType) { warcHeaders["WARC-Resource-Type"] = reqresp.resourceType; } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 5548cb5b..1ed47ed9 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -25,7 +25,9 @@ export class RequestResponseInfo { method?: string; url!: string; - protocol?: string = "HTTP/1.1"; + + protocols: string[] = []; + cipher?: string; mimeType?: string; @@ -132,7 +134,9 @@ export class RequestResponseInfo { this.setStatus(response.status, response.statusText); - this.protocol = response.protocol; + if (response.protocol) { + this.protocols.push(response.protocol); + } if (resourceType) { this.resourceType = resourceType.toLowerCase(); @@ -153,11 +157,16 @@ export class RequestResponseInfo { this.fromServiceWorker = !!response.fromServiceWorker; - if (response.securityDetails) { - const issuer: string = response.securityDetails.issuer || ""; + const { securityDetails } = response; + + if (securityDetails) { + this.protocols.push( + securityDetails.protocol.replaceAll(" ", "/").toLowerCase(), + ); + this.cipher = 
getCipher(securityDetails); + const issuer: string = securityDetails.issuer || ""; const ctc: string = - response.securityDetails.certificateTransparencyCompliance === - "compliant" + securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0"; this.extraOpts.cert = { issuer, ctc }; @@ -204,21 +213,6 @@ export class RequestResponseInfo { this.requestHeaders = params.headers; } - getResponseHeadersText() { - let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`; - - if (this.responseHeaders) { - for (const header of Object.keys(this.responseHeaders)) { - headers += `${header}: ${this.responseHeaders[header].replace( - /\n/g, - ", ", - )}\r\n`; - } - } - headers += "\r\n"; - return headers; - } - hasRequest() { return this.method && (this.requestHeaders || this.requestHeadersText); } @@ -417,3 +411,20 @@ export function isHTMLMime(mime: string) { export function isRedirectStatus(status: number) { return status >= 300 && status < 400 && status !== 304; } + +function getCipher({ + keyExchange, + keyExchangeGroup, + cipher, +}: Protocol.Network.SecurityDetails): string { + const key = `${keyExchange} ${keyExchangeGroup} ${cipher}`; + const mapping: Record = { + " X25519Kyber768Draft00 AES_128_GCM": "TLS_AES_128_GCM_SHA256", + " X25519 AES_128_GCM": "TLS_AES_128_GCM_SHA256", + "ECDHE_RSA X25519 AES_128_GCM": "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", + " X25519Kyber768Draft00 AES_256_GCM": "TLS_AES_256_GCM_SHA384", + " X25519 AES_256_GCM": "TLS_AES_256_GCM_SHA384", + "ECDHE_RSA X25519 AES_256_GCM": "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", + }; + return mapping[key] || ""; +} From 764930bf0f991f339f324a5b63afeb097e301596 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 Aug 2024 15:06:32 -0700 Subject: [PATCH 2/5] cleanup, add fallbacks for tls/1.3 to common cipher values --- src/util/reqresp.ts | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/util/reqresp.ts 
b/src/util/reqresp.ts index 1ed47ed9..d247ff64 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -4,6 +4,7 @@ import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; import { Response } from "undici"; +import { logger } from "./logger.js"; const CONTENT_LENGTH = "content-length"; const CONTENT_RANGE = "content-range"; @@ -160,10 +161,11 @@ export class RequestResponseInfo { const { securityDetails } = response; if (securityDetails) { - this.protocols.push( - securityDetails.protocol.replaceAll(" ", "/").toLowerCase(), - ); - this.cipher = getCipher(securityDetails); + const securityProtocol = securityDetails.protocol + .replaceAll(" ", "/") + .toLowerCase(); + this.protocols.push(securityProtocol); + this.cipher = getCipher(securityDetails, securityProtocol, this.url); const issuer: string = securityDetails.issuer || ""; const ctc: string = securityDetails.certificateTransparencyCompliance === "compliant" @@ -412,19 +414,35 @@ export function isRedirectStatus(status: number) { return status >= 300 && status < 400 && status !== 304; } -function getCipher({ - keyExchange, - keyExchangeGroup, - cipher, -}: Protocol.Network.SecurityDetails): string { +function getCipher( + { keyExchange, keyExchangeGroup, cipher }: Protocol.Network.SecurityDetails, + protocol: string, + url: string, +): string { const key = `${keyExchange} ${keyExchangeGroup} ${cipher}`; const mapping: Record = { + "ECDHE_RSA X25519 AES_128_GCM": "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", + "ECDHE_RSA X25519 AES_256_GCM": "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", " X25519Kyber768Draft00 AES_128_GCM": "TLS_AES_128_GCM_SHA256", " X25519 AES_128_GCM": "TLS_AES_128_GCM_SHA256", - "ECDHE_RSA X25519 AES_128_GCM": "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", " X25519Kyber768Draft00 AES_256_GCM": "TLS_AES_256_GCM_SHA384", " X25519 AES_256_GCM": "TLS_AES_256_GCM_SHA384", - "ECDHE_RSA X25519 AES_256_GCM": 
"TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", + " P-256 AES_256_GCM": "TLS_AES_256_GCM_SHA384", }; - return mapping[key] || ""; + let cipherString = mapping[key] || ""; + if (!cipherString && protocol === "tls/1.3") { + switch (cipher) { + case "AES_256_GCM": + cipherString = "TLS_AES_256_GCM_SHA384"; + break; + + case "AES_128_GCM": + cipherString = "TLS_AES_128_GCM_SHA256"; + break; + } + } + if (!cipherString) { + logger.debug("No cipher for", { key, url }); + } + return cipherString; } From 1a185de28e9b41693554ebfaf8ff4b0260c3e496 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Nov 2024 11:47:24 -0800 Subject: [PATCH 3/5] update to latest warcio support WARC-Protocol as multiple headers tests: add tests for WARC-Protocol, WARC-Cipher-Suite --- package.json | 2 +- src/util/recorder.ts | 7 +++++-- tests/pageinfo-records.test.js | 5 +++++ yarn.lock | 38 +++++++++++++++++++++++----------- 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/package.json b/package.json index a42ae2a7..05e90ac2 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.3.1", + "warcio": "^2.4.2", "ws": "^7.4.4", "yargs": "^17.7.2" }, diff --git a/src/util/recorder.ts b/src/util/recorder.ts index e6a90942..6c9c0ba8 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -12,7 +12,7 @@ import { fetch, getGlobalDispatcher, Response } from "undici"; import { getCustomRewriter, rewriteDASH, rewriteHLS } from "@webrecorder/wabac"; -import { WARCRecord } from "warcio"; +import { WARCRecord, multiValueHeader } from "warcio"; import { TempFileBuffer, WARCSerializer } from "warcio/node"; import { WARCWriter } from "./warcwriter.js"; import { RedisCrawlState, WorkerId } from "./state.js"; @@ -1756,7 +1756,10 @@ function createResponse( }; if (reqresp.protocols.length) { - warcHeaders["WARC-Protocol"] = reqresp.protocols.join(", "); + warcHeaders["WARC-Protocol"] = multiValueHeader( + 
"WARC-Protocol", + reqresp.protocols, + ); } if (reqresp.cipher) { diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 01dc77a4..3fe64d74 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -24,6 +24,11 @@ test("run warc and ensure pageinfo records contain the correct resources", async let foundInvalid = false; for await (const record of parser) { + if (record.warcType === "response") { + expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3"); + expect(record.warcHeaders.headers.get("WARC-Cipher-Suite")).toBe("TLS_AES_128_GCM_SHA256"); + } + if ( !foundIndex && record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/" diff --git a/yarn.lock b/yarn.lock index 8b0b963e..59276595 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1312,16 +1312,16 @@ resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ== -"@webrecorder/wabac@^2.20.0-beta.4": - version "2.20.0-beta.4" - resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.0-beta.4.tgz#c60fcd00f449cca52ce1a0bef305a06922c9e3e8" - integrity sha512-enHYcZoqs7cOu2tdTqVeB/zB27uL4wmCMzvF55bJqdB8d5zgPpY+/fpRA3eLxGrPc0nFYAjsI/aNaa62FH7WKQ== +"@webrecorder/wabac@^2.20.0": + version "2.20.3" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.3.tgz#4e3faa476477b64ba2c2315f57d38372d0acf4c0" + integrity sha512-bik2YbIJwox5LctL3QwZ1pvG89ORR31do3mFHTF1l4zcvjqeLoqCHIImEsgl9uH7KYq27UxeG4y25Jo3PxA5qQ== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" "@types/js-levenshtein" "^1.1.3" - "@webrecorder/wombat" "^3.8.2" + "@webrecorder/wombat" "^3.8.6" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1340,14 +1340,14 @@ path-parser "^6.1.0" process "^0.11.10" 
stream-browserify "^3.0.0" - warcio "^2.3.1" + warcio "^2.4.0" -"@webrecorder/wombat@^3.8.2": - version "3.8.2" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.2.tgz#e46e18719834d633175eec52ce753a4dc4e48e27" - integrity sha512-uUZr9V4UYpVOpM64Tm27ND/hMjDbT37+/qyNaNV6loqDuVzBVQh5w7SfTEy0Bbjj1MYyNZP244mOtWtotTpUEA== +"@webrecorder/wombat@^3.8.6": + version "3.8.6" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.6.tgz#3aa99d9519f6263434a9e0b963f6ef86d3e0494a" + integrity sha512-+IxV0bkoc6QdHYzwejsPFPC31dRjaxa6zGuR9F08aFb4Ooeekf9AK16ZIYweizs/wm7nvTG5E12ZwW0LUUzX8w== dependencies: - warcio "^2.3.1" + warcio "^2.4.0" "@zxing/text-encoding@0.9.0": version "0.9.0" @@ -5283,7 +5283,7 @@ walker@^1.0.8: dependencies: makeerror "1.0.12" -warcio@^2.3.1: +warcio@^2.3.1, warcio@^2.4.0: version "2.3.1" resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.1.tgz#8ac9de897de1a556161168f2a3938b60929908ca" integrity sha512-PjcWqzXfs6HdWfHi1V/i8MoMmV5M0Csg3rOa2mqCJ1dmCJXswVfQ0VXbEVumwavNIW2oFFj6LJoCHHeL4Ls/zw== @@ -5297,6 +5297,20 @@ warcio@^2.3.1: uuid-random "^1.3.2" yargs "^17.6.2" +warcio@^2.4.2: + version "2.4.2" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.2.tgz#782d8dcb0769f271b0ae96521fb4969e2570e9b3" + integrity sha512-QYbZ3EGYtnAIrzL7Bajo7ak87pipilpkIfaFIzFQWUX4wuXNuKqnfQy/EAoi2tEIl3VJgsWcL+wjjk4+15MKbQ== + dependencies: + "@types/pako" "^1.0.7" + "@types/stream-buffers" "^3.0.7" + base32-encode "^2.0.0" + hash-wasm "^4.9.0" + pako "^1.0.11" + tempy "^3.1.0" + uuid-random "^1.3.2" + yargs "^17.7.2" + web-encoding@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864" From b180e31c0726697aab9e9558af879ba34e2373ec Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Nov 2024 11:50:34 -0800 Subject: [PATCH 4/5] Update src/util/reqresp.ts Co-authored-by: Tessa Walsh --- src/util/reqresp.ts | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index d247ff64..a57cd14c 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -2,8 +2,9 @@ import { getCustomRewriter, getStatusText } from "@webrecorder/wabac"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; -import { HTML_TYPES } from "./constants.js"; import { Response } from "undici"; + +import { HTML_TYPES } from "./constants.js"; import { logger } from "./logger.js"; const CONTENT_LENGTH = "content-length"; From 5df11fe641e68ba362c43859629541f7b6c2d322 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Nov 2024 13:27:47 -0800 Subject: [PATCH 5/5] tests: only test html for warc-protocol header --- tests/pageinfo-records.test.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 3fe64d74..15a0a98b 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -24,7 +24,8 @@ test("run warc and ensure pageinfo records contain the correct resources", async let foundInvalid = false; for await (const record of parser) { - if (record.warcType === "response") { + if (record.warcType === "response" && + (record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) { expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3"); expect(record.warcHeaders.headers.get("WARC-Cipher-Suite")).toBe("TLS_AES_128_GCM_SHA256"); }