From c1abfd962c015507b701007b18d9b54970c1ebe8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:10:13 +0000 Subject: [PATCH 01/15] Bump actions/setup-python from 5.2.0 to 5.3.0 (#3810) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.2.0 to 5.3.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v5.2.0...v5.3.0) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/code-static-analysis.yml | 2 +- .github/workflows/lintsql.yml | 2 +- .github/workflows/predeploy.yml | 2 +- .github/workflows/test_website.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/code-static-analysis.yml b/.github/workflows/code-static-analysis.yml index 479dcf9e194..909dd326305 100644 --- a/.github/workflows/code-static-analysis.yml +++ b/.github/workflows/code-static-analysis.yml @@ -35,7 +35,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Python 3.12 if: ${{ matrix.language == 'python' }} - uses: actions/setup-python@v5.2.0 + uses: actions/setup-python@v5.3.0 with: python-version: '3.12' - name: Install dependencies diff --git a/.github/workflows/lintsql.yml b/.github/workflows/lintsql.yml index a7f1758bc2b..f0e9b40f4d7 100644 --- a/.github/workflows/lintsql.yml +++ b/.github/workflows/lintsql.yml @@ -19,7 +19,7 @@ jobs: # Full git history is needed to get a proper list of changed files within `super-linter` fetch-depth: 0 - name: Set up Python 3.12 - uses: actions/setup-python@v5.2.0 + uses: actions/setup-python@v5.3.0 with: python-version: '3.12' - name: Lint SQL code diff --git a/.github/workflows/predeploy.yml b/.github/workflows/predeploy.yml index 9ebd32c4001..fa03fbf44ac 100644 --- a/.github/workflows/predeploy.yml +++ b/.github/workflows/predeploy.yml @@ -37,7 +37,7 @@ jobs: with: node-version: '20' - name: Set up Python 3.12 - uses: actions/setup-python@v5.2.0 + uses: actions/setup-python@v5.3.0 with: python-version: '3.12' - name: Install Asian Fonts diff --git a/.github/workflows/test_website.yml b/.github/workflows/test_website.yml index 885240b918d..a6489abf9f4 100644 --- a/.github/workflows/test_website.yml +++ b/.github/workflows/test_website.yml @@ -30,7 +30,7 @@ jobs: with: node-version: '20' - name: Set up Python 3.12 - uses: actions/setup-python@v5.2.0 + uses: actions/setup-python@v5.3.0 with: python-version: '3.12' - name: Run the website From dc7f46b37fd9ef3bfca81016c620c4dfde7861b7 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 28 Oct 2024 21:12:34 +0100 Subject: [PATCH 02/15] Figure generation workaround (#3818) --- src/tools/generate/generate_figure_images.js | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/tools/generate/generate_figure_images.js b/src/tools/generate/generate_figure_images.js index e491152a0b9..8b297291466 100644 --- a/src/tools/generate/generate_figure_images.js +++ b/src/tools/generate/generate_figure_images.js @@ -7,7 +7,8 @@ const take_single_screenshot = async (graphUrl, filename) => { const sheets_chart = graphUrl.startsWith('https://docs.google.com/spreadsheets') ? true : false; - const chartUrl = sheets_chart ? 
graphUrl : 'http://localhost:8080/' + graphUrl; + // Temporarily replace `&format=interactive` with `&format=image` + const chartUrl = sheets_chart ? graphUrl.replaceAll('&format=interactive', '&format=image') : 'http://localhost:8080/' + graphUrl; const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.setViewport({ @@ -18,7 +19,11 @@ const take_single_screenshot = async (graphUrl, filename) => { await page.goto(chartUrl, { waitUntil: 'networkidle2', }); - const el = sheets_chart ? await page.$('#embed_chart') : await page.$('main'); + + + // Temporarily handle `&format=image` instead of `&format=interactive` + // const el = sheets_chart ? await page.$('#embed_chart') : await page.$('main'); + const el = sheets_chart ? await page.$('img') : await page.$('main'); await el.screenshot({ path: filename }); await browser.close(); } From de9109f01fa27de53243ca080dd6f3867e723037 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 28 Oct 2024 23:20:01 +0100 Subject: [PATCH 03/15] Correct Sheets URLs (#3819) --- src/content/en/2024/accessibility.md | 2 +- src/content/en/2024/capabilities.md | 2 +- src/content/en/2024/cdn.md | 2 +- src/content/en/2024/cms.md | 2 +- src/content/en/2024/cookies.md | 2 +- src/content/en/2024/css.md | 2 +- src/content/en/2024/fonts.md | 2 +- src/content/en/2024/http.md | 2 +- src/content/en/2024/jamstack.md | 2 +- src/content/en/2024/javascript.md | 2 +- src/content/en/2024/markup.md | 2 +- src/content/en/2024/media.md | 2 +- src/content/en/2024/mobile-web.md | 2 +- src/content/en/2024/page-weight.md | 2 +- src/content/en/2024/performance.md | 2 +- src/content/en/2024/privacy.md | 2 +- src/content/en/2024/pwa.md | 2 +- src/content/en/2024/security.md | 2 +- src/content/en/2024/seo.md | 2 +- src/content/en/2024/structured-data.md | 2 +- src/content/en/2024/sustainability.md | 2 +- src/content/en/2024/third-parties.md | 2 +- src/content/en/2024/webassembly.md | 2 +- 23 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/content/en/2024/accessibility.md b/src/content/en/2024/accessibility.md index be008bff84e..09a45973ede 100644 --- a/src/content/en/2024/accessibility.md +++ b/src/content/en/2024/accessibility.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1btB1r9QpdgTyToPhn7glcGAdMFs7eq4UcQSVIHBqiYQ/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1btB1r9QpdgTyToPhn7glcGAdMFs7eq4UcQSVIHBqiYQ/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/capabilities.md b/src/content/en/2024/capabilities.md index 987416d38bc..4861e249bd4 100644 --- a/src/content/en/2024/capabilities.md +++ b/src/content/en/2024/capabilities.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1Ig-821tyjr897i8QqPvXiRMY9o444qsFAmZt4AFyBjk/edit#gid=0 +results: https://docs.google.com/spreadsheets/d/1Ig-821tyjr897i8QqPvXiRMY9o444qsFAmZt4AFyBjk/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/cdn.md b/src/content/en/2024/cdn.md index a89fba04f26..673006e5161 100644 --- a/src/content/en/2024/cdn.md +++ b/src/content/en/2024/cdn.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/15YXQQjyoQ0Bnfw9KNSz_YuGDiCfW978_WKEHvDXjdm4/edit#gid=745368492 +results: https://docs.google.com/spreadsheets/d/15YXQQjyoQ0Bnfw9KNSz_YuGDiCfW978_WKEHvDXjdm4/ 
featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/cms.md b/src/content/en/2024/cms.md index 5fb493f6626..a63bc37bdf6 100644 --- a/src/content/en/2024/cms.md +++ b/src/content/en/2024/cms.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/118lwQV_GwFYqIxXvsm57oeadJdjAJEOMCRq1PsTqhfs/edit#gid=355498918 +results: https://docs.google.com/spreadsheets/d/118lwQV_GwFYqIxXvsm57oeadJdjAJEOMCRq1PsTqhfs/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/cookies.md b/src/content/en/2024/cookies.md index 86a4d3bdc57..d7465b03ff2 100644 --- a/src/content/en/2024/cookies.md +++ b/src/content/en/2024/cookies.md @@ -5,7 +5,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1wDGnUkO0rgcU5_V6hmUrhm1pq60VU2XbeMHgYJEEaSM/edit#gid=454016814 +results: https://docs.google.com/spreadsheets/d/1wDGnUkO0rgcU5_V6hmUrhm1pq60VU2XbeMHgYJEEaSM/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/css.md b/src/content/en/2024/css.md index 59989d6a7b3..9f62d2c03d3 100644 --- a/src/content/en/2024/css.md +++ b/src/content/en/2024/css.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1R40dZYFdszjciIpS2jFMC3mPpZC-xHJnEE42xTEVY2w/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1R40dZYFdszjciIpS2jFMC3mPpZC-xHJnEE42xTEVY2w/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/fonts.md b/src/content/en/2024/fonts.md index 87d3acc14dc..3b7174d00f2 100644 --- a/src/content/en/2024/fonts.md +++ b/src/content/en/2024/fonts.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1EkdvJ8e0B9Rr42evC2Ds5Ekwq6gF9oLBW0BA5cmSUT4/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1EkdvJ8e0B9Rr42evC2Ds5Ekwq6gF9oLBW0BA5cmSUT4/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/http.md b/src/content/en/2024/http.md index 7b1b05190f0..91f2238648a 100644 --- a/src/content/en/2024/http.md +++ b/src/content/en/2024/http.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1PfTZkbmgyLA3NmEICeCyxpMWgP7cKY7EsZl9RciE5S4/edit#gid=140668849 +results: https://docs.google.com/spreadsheets/d/1PfTZkbmgyLA3NmEICeCyxpMWgP7cKY7EsZl9RciE5S4/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/jamstack.md b/src/content/en/2024/jamstack.md index 1bf3d2b28cc..5d15f0eb404 100644 --- a/src/content/en/2024/jamstack.md +++ b/src/content/en/2024/jamstack.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1wKswSnp8TuN4aZb63ir7hE5eCikIu8zEdQYPp4Xo6O0/edit#gid=807625051 +results: https://docs.google.com/spreadsheets/d/1wKswSnp8TuN4aZb63ir7hE5eCikIu8zEdQYPp4Xo6O0/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/javascript.md b/src/content/en/2024/javascript.md index e6b28bd910f..6949564a476 100644 --- a/src/content/en/2024/javascript.md +++ b/src/content/en/2024/javascript.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/16isMe5_rvmRmJHtK5Je66AhwO8SowGgq0EFqXyjEXw8/edit#gid=1778117656 +results: 
https://docs.google.com/spreadsheets/d/16isMe5_rvmRmJHtK5Je66AhwO8SowGgq0EFqXyjEXw8/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/markup.md b/src/content/en/2024/markup.md index d5910514940..a606fdf3773 100644 --- a/src/content/en/2024/markup.md +++ b/src/content/en/2024/markup.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1TtOMr_w58HvqNBv4RIWX021Lxm6m5ajYOcRykrPdAJc/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1TtOMr_w58HvqNBv4RIWX021Lxm6m5ajYOcRykrPdAJc/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/media.md b/src/content/en/2024/media.md index ea3afeda708..43bc77a8c2f 100644 --- a/src/content/en/2024/media.md +++ b/src/content/en/2024/media.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1Q2ITOe6ZMIXGKHtIxqK9XmUA1eQBX9CLQkxarQOJFCk/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1Q2ITOe6ZMIXGKHtIxqK9XmUA1eQBX9CLQkxarQOJFCk/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/mobile-web.md b/src/content/en/2024/mobile-web.md index 67aac160a70..e8f701a65c6 100644 --- a/src/content/en/2024/mobile-web.md +++ b/src/content/en/2024/mobile-web.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/183HhK6E_kygGbIpOVGIGsQvGzLBQSzjvRzabVC6e2-4/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/183HhK6E_kygGbIpOVGIGsQvGzLBQSzjvRzabVC6e2-4/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/page-weight.md b/src/content/en/2024/page-weight.md index ac2dbff4e5f..f7bc46a0f66 100644 --- a/src/content/en/2024/page-weight.md +++ b/src/content/en/2024/page-weight.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1GHTFwsjJokf1U5dZmDg-7bBlH_Lu-rlOHVmmzTj0D98/edit#gid=825716732 +results: https://docs.google.com/spreadsheets/d/1GHTFwsjJokf1U5dZmDg-7bBlH_Lu-rlOHVmmzTj0D98/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/performance.md b/src/content/en/2024/performance.md index b86f000cdef..216f632c63e 100644 --- a/src/content/en/2024/performance.md +++ b/src/content/en/2024/performance.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/15038wEIoqY53Y_kR8U6QWM-PBO31ZySQGi147ABTNBc/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/15038wEIoqY53Y_kR8U6QWM-PBO31ZySQGi147ABTNBc/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/privacy.md b/src/content/en/2024/privacy.md index 75936442532..3391b362744 100644 --- a/src/content/en/2024/privacy.md +++ b/src/content/en/2024/privacy.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/pwa.md b/src/content/en/2024/pwa.md index 6f8247b6bc0..890fbc94b78 100644 --- a/src/content/en/2024/pwa.md +++ b/src/content/en/2024/pwa.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] 
translators: [] -results: https://docs.google.com/spreadsheets/d/1hbPPlZxu8RNf0Ef6qZbsrdOZEL-wB6P6-528XrSDZ8Y/edit#gid=692219124 +results: https://docs.google.com/spreadsheets/d/1hbPPlZxu8RNf0Ef6qZbsrdOZEL-wB6P6-528XrSDZ8Y/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/security.md b/src/content/en/2024/security.md index ce38866ece6..de65b569cda 100644 --- a/src/content/en/2024/security.md +++ b/src/content/en/2024/security.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1b9IEGbfQjKCEaTBmcv_zyCyWEsq35StCa-dVOe6V1Cs/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1b9IEGbfQjKCEaTBmcv_zyCyWEsq35StCa-dVOe6V1Cs/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/seo.md b/src/content/en/2024/seo.md index 0cd19ca67ba..33b0ba9e729 100644 --- a/src/content/en/2024/seo.md +++ b/src/content/en/2024/seo.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1lAQKcOF7l6xz9v7yvnI9I1F8yiSqcz3Xx6u-5ady1DQ/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1lAQKcOF7l6xz9v7yvnI9I1F8yiSqcz3Xx6u-5ady1DQ/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/structured-data.md b/src/content/en/2024/structured-data.md index 19729ad7f71..1a1c8cd7367 100644 --- a/src/content/en/2024/structured-data.md +++ b/src/content/en/2024/structured-data.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1GWniSGupK6KgME7urV7ff0iWStzopGXqnQvJ3_-ynD4/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1GWniSGupK6KgME7urV7ff0iWStzopGXqnQvJ3_-ynD4/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/sustainability.md b/src/content/en/2024/sustainability.md index e1961b014bf..71af8fe11ec 100644 --- a/src/content/en/2024/sustainability.md +++ b/src/content/en/2024/sustainability.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1E6wPVck2-5NDUpFKWjbJXKJKNx0E9fWwIdeM9hUKl8c/edit#gid=806519964 +results: https://docs.google.com/spreadsheets/d/1E6wPVck2-5NDUpFKWjbJXKJKNx0E9fWwIdeM9hUKl8c/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/third-parties.md b/src/content/en/2024/third-parties.md index 9e7e6777f93..676618734fa 100644 --- a/src/content/en/2024/third-parties.md +++ b/src/content/en/2024/third-parties.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/18uTDBygSqgT_PNFldOz4guLSuXyMzDthRGnAG5if4sU/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/18uTDBygSqgT_PNFldOz4guLSuXyMzDthRGnAG5if4sU/ featured_quote: featured_stat_1: featured_stat_label_1: diff --git a/src/content/en/2024/webassembly.md b/src/content/en/2024/webassembly.md index 57412665c58..303fa93981e 100644 --- a/src/content/en/2024/webassembly.md +++ b/src/content/en/2024/webassembly.md @@ -7,7 +7,7 @@ reviewers: [] editors: [] analysts: [] translators: [] -results: https://docs.google.com/spreadsheets/d/1Qfp5d_xWuUolLJVBt8iQWUpqlubpHHf3_SY9TKYYzXM/edit#gid=1778117656 +results: https://docs.google.com/spreadsheets/d/1Qfp5d_xWuUolLJVBt8iQWUpqlubpHHf3_SY9TKYYzXM/ featured_quote: featured_stat_1: featured_stat_label_1: From 
31fc2383653c965537ab710a2bbb56556d4c2a3a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 20:42:45 +0000 Subject: [PATCH 04/15] Bump puppeteer from 23.6.0 to 23.6.1 in /src (#3820) Bumps [puppeteer](https://github.com/puppeteer/puppeteer) from 23.6.0 to 23.6.1. - [Release notes](https://github.com/puppeteer/puppeteer/releases) - [Changelog](https://github.com/puppeteer/puppeteer/blob/main/release-please-config.json) - [Commits](https://github.com/puppeteer/puppeteer/compare/puppeteer-v23.6.0...puppeteer-v23.6.1) --- updated-dependencies: - dependency-name: puppeteer dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/package-lock.json | 82 +++++++++++++++++++------------------------ src/package.json | 2 +- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/src/package-lock.json b/src/package-lock.json index c100b87aa8c..fd2e4fb3dd3 100644 --- a/src/package-lock.json +++ b/src/package-lock.json @@ -15,7 +15,7 @@ "node-fetch": "3.3.2", "node-watch": "0.7.4", "prettier": "3.3.3", - "puppeteer": "23.6.0", + "puppeteer": "23.6.1", "rainbow-code": "2.1.7", "recursive-readdir": "2.2.3", "run-script-os": "1.1.6", @@ -214,13 +214,13 @@ "dev": true }, "node_modules/@types/node": { - "version": "22.7.6", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.6.tgz", - "integrity": "sha512-/d7Rnj0/ExXDMcioS78/kf1lMzYk4BZV8MZGTBKzTGZ6/406ukkbYlIsZmMPhcR5KlkunDHQLrtAVmSq7r+mSw==", + "version": "22.8.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.8.2.tgz", + "integrity": "sha512-NzaRNFV+FZkvK/KLCsNdTvID0SThyrs5SHB6tsD/lajr22FGC73N2QeDPM2wHtVde8mgcXuSsHQkH5cX1pbPLw==", "dev": true, "optional": true, "dependencies": { - "undici-types": "~6.19.2" + "undici-types": "~6.19.8" } }, "node_modules/@types/yauzl": { @@ -348,13 +348,12 @@ } }, "node_modules/bare-stream": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.3.0.tgz", - "integrity": "sha512-pVRWciewGUeCyKEuRxwv06M079r+fRjAQjBEK2P6OYGrO43O+Z0LrPZZEjlc4mB6C2RpZ9AxJ1s7NLEtOHO6eA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.3.2.tgz", + "integrity": "sha512-EFZHSIBkDgSHIwj2l2QZfP4U5OcD4xFAOwhSb/vlr9PIqyGJGvB/nfClJbcnh3EY4jtPE4zsb5ztae96bVF79A==", "dev": true, "optional": true, "dependencies": { - "b4a": "^1.6.6", "streamx": "^2.20.0" } }, @@ -1460,9 +1459,9 @@ } }, "node_modules/puppeteer": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.6.0.tgz", - "integrity": "sha512-l+Fgo8SVFSd51STtq2crz8t1Y3VXowsuR4zfR64qDOn6oggz7YIZauWiNR4IJjczQ6nvFs3S4cgng55/nesxTQ==", + "version": "23.6.1", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.6.1.tgz", + "integrity": "sha512-8+ALGQgwXd3P/tGcuSsxTPGDaOQIjcDIm04I5hpWZv/PiN5q8bQNHRUyfYrifT+flnM9aTWCP7tLEzuB6SlIgA==", "dev": true, "hasInstallScript": true, "dependencies": { @@ -1470,7 +1469,7 @@ "chromium-bidi": "0.8.0", "cosmiconfig": "^9.0.0", "devtools-protocol": "0.0.1354347", - "puppeteer-core": "23.6.0", + "puppeteer-core": "23.6.1", "typed-query-selector": "^2.12.0" }, "bin": { @@ -1481,9 +1480,9 @@ } }, "node_modules/puppeteer-core": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.6.0.tgz", - "integrity": 
"sha512-se1bhgUpR9C529SgHGr/eyT92mYyQPAhA2S9pGtGrVG2xob9qE6Pbp7TlqiSPlnnY1lINqhn6/67EwkdzOmKqQ==", + "version": "23.6.1", + "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.6.1.tgz", + "integrity": "sha512-DoNLAzQfGklPauEn33N4h9cM9GubJSINEn+AUMwAXwW159Y9JLk5y34Jsbv4c7kG8P0puOYWV9leu2siMZ/QpQ==", "dev": true, "dependencies": { "@puppeteer/browsers": "2.4.0", @@ -1758,13 +1757,10 @@ } }, "node_modules/text-decoder": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.0.tgz", - "integrity": "sha512-n1yg1mOj9DNpk3NeZOx7T6jchTbyJS3i3cucbNN6FcdPriMZx7NsgrGpWWdWZZGxD7ES1XB+3uoqHMgOKaN+fg==", - "dev": true, - "dependencies": { - "b4a": "^1.6.4" - } + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.1.tgz", + "integrity": "sha512-x9v3H/lTKIJKQQe7RPQkLfKAnc9lUTkWDypIQgTzPJAq+5/GCDHonmshfvlsNSj58yyshbIJJDLmU15qNERrXQ==", + "dev": true }, "node_modules/through": { "version": "2.3.8", @@ -2216,13 +2212,13 @@ "dev": true }, "@types/node": { - "version": "22.7.6", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.6.tgz", - "integrity": "sha512-/d7Rnj0/ExXDMcioS78/kf1lMzYk4BZV8MZGTBKzTGZ6/406ukkbYlIsZmMPhcR5KlkunDHQLrtAVmSq7r+mSw==", + "version": "22.8.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.8.2.tgz", + "integrity": "sha512-NzaRNFV+FZkvK/KLCsNdTvID0SThyrs5SHB6tsD/lajr22FGC73N2QeDPM2wHtVde8mgcXuSsHQkH5cX1pbPLw==", "dev": true, "optional": true, "requires": { - "undici-types": "~6.19.2" + "undici-types": "~6.19.8" } }, "@types/yauzl": { @@ -2335,13 +2331,12 @@ } }, "bare-stream": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.3.0.tgz", - "integrity": "sha512-pVRWciewGUeCyKEuRxwv06M079r+fRjAQjBEK2P6OYGrO43O+Z0LrPZZEjlc4mB6C2RpZ9AxJ1s7NLEtOHO6eA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.3.2.tgz", + "integrity": "sha512-EFZHSIBkDgSHIwj2l2QZfP4U5OcD4xFAOwhSb/vlr9PIqyGJGvB/nfClJbcnh3EY4jtPE4zsb5ztae96bVF79A==", "dev": true, "optional": true, "requires": { - "b4a": "^1.6.6", "streamx": "^2.20.0" } }, @@ -3128,23 +3123,23 @@ "dev": true }, "puppeteer": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.6.0.tgz", - "integrity": "sha512-l+Fgo8SVFSd51STtq2crz8t1Y3VXowsuR4zfR64qDOn6oggz7YIZauWiNR4IJjczQ6nvFs3S4cgng55/nesxTQ==", + "version": "23.6.1", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-23.6.1.tgz", + "integrity": "sha512-8+ALGQgwXd3P/tGcuSsxTPGDaOQIjcDIm04I5hpWZv/PiN5q8bQNHRUyfYrifT+flnM9aTWCP7tLEzuB6SlIgA==", "dev": true, "requires": { "@puppeteer/browsers": "2.4.0", "chromium-bidi": "0.8.0", "cosmiconfig": "^9.0.0", "devtools-protocol": "0.0.1354347", - "puppeteer-core": "23.6.0", + "puppeteer-core": "23.6.1", "typed-query-selector": "^2.12.0" } }, "puppeteer-core": { - "version": "23.6.0", - "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.6.0.tgz", - "integrity": "sha512-se1bhgUpR9C529SgHGr/eyT92mYyQPAhA2S9pGtGrVG2xob9qE6Pbp7TlqiSPlnnY1lINqhn6/67EwkdzOmKqQ==", + "version": "23.6.1", + "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-23.6.1.tgz", + "integrity": "sha512-DoNLAzQfGklPauEn33N4h9cM9GubJSINEn+AUMwAXwW159Y9JLk5y34Jsbv4c7kG8P0puOYWV9leu2siMZ/QpQ==", "dev": true, "requires": { "@puppeteer/browsers": "2.4.0", @@ -3356,13 +3351,10 @@ } }, "text-decoder": { - "version": "1.2.0", - "resolved": 
"https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.0.tgz", - "integrity": "sha512-n1yg1mOj9DNpk3NeZOx7T6jchTbyJS3i3cucbNN6FcdPriMZx7NsgrGpWWdWZZGxD7ES1XB+3uoqHMgOKaN+fg==", - "dev": true, - "requires": { - "b4a": "^1.6.4" - } + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.1.tgz", + "integrity": "sha512-x9v3H/lTKIJKQQe7RPQkLfKAnc9lUTkWDypIQgTzPJAq+5/GCDHonmshfvlsNSj58yyshbIJJDLmU15qNERrXQ==", + "dev": true }, "through": { "version": "2.3.8", diff --git a/src/package.json b/src/package.json index 8769de4cbeb..a0daef3a346 100644 --- a/src/package.json +++ b/src/package.json @@ -41,7 +41,7 @@ "node-fetch": "3.3.2", "node-watch": "0.7.4", "prettier": "3.3.3", - "puppeteer": "23.6.0", + "puppeteer": "23.6.1", "rainbow-code": "2.1.7", "recursive-readdir": "2.2.3", "run-script-os": "1.1.6", From 05a096c3b19339c8081d2e28b8fad3976f63d270 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 29 Oct 2024 22:54:53 +0000 Subject: [PATCH 05/15] Revert image workarounds (#3821) * Revert figure workarounds * Add comment --- src/static/js/almanac.js | 5 ----- src/tools/generate/generate_figure_images.js | 8 +++----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/static/js/almanac.js b/src/static/js/almanac.js index 7fe87ea24af..499be49d7ca 100644 --- a/src/static/js/almanac.js +++ b/src/static/js/almanac.js @@ -388,11 +388,6 @@ function googleSheetsPixelNotLoaded() { //We use Google Sheets for detailed visualisations //Check for support and switch out images if supported function upgradeInteractiveFigures() { - - // Temporarily disable charts since embedding is currently broken in Sheets - // See: https://github.com/HTTPArchive/almanac.httparchive.org/issues/3803 - return; - try { if (!isInPrintMode() && bigEnoughForInteractiveFigures() && !dataSaverEnabled() && highBandwidthConnection() && highResolutionCanvasSupported()) { diff --git a/src/tools/generate/generate_figure_images.js b/src/tools/generate/generate_figure_images.js index 8b297291466..d2c450e8ae4 100644 --- a/src/tools/generate/generate_figure_images.js +++ b/src/tools/generate/generate_figure_images.js @@ -7,8 +7,7 @@ const take_single_screenshot = async (graphUrl, filename) => { const sheets_chart = graphUrl.startsWith('https://docs.google.com/spreadsheets') ? true : false; - // Temporarily replace `&format=interactive` with `&format=image` - const chartUrl = sheets_chart ? graphUrl.replaceAll('&format=interactive', '&format=image') : 'http://localhost:8080/' + graphUrl; + const chartUrl = sheets_chart ? graphUrl : 'http://localhost:8080/' + graphUrl; const browser = await puppeteer.launch(); const page = await browser.newPage(); await page.setViewport({ @@ -21,9 +20,8 @@ const take_single_screenshot = async (graphUrl, filename) => { }); - // Temporarily handle `&format=image` instead of `&format=interactive` - // const el = sheets_chart ? await page.$('#embed_chart') : await page.$('main'); - const el = sheets_chart ? await page.$('img') : await page.$('main'); + // Charts are in #embed_chart, maps are in #c div div + const el = sheets_chart ? 
await page.$('#embed_chart, #c div div') : await page.$('main'); await el.screenshot({ path: filename }); await browser.close(); } From 89f88019c2d6825d7bbd42267e22d44c2db619ff Mon Sep 17 00:00:00 2001 From: Ivan Ukhov Date: Wed, 30 Oct 2024 11:47:02 +0100 Subject: [PATCH 06/15] Fonts 2024 Queries (#3696) * Update font_format_usage_without_services.sql * Update font_size_quantiles.sql * Update font_size_quantiles_by_opentype_table.sql * Update font_size_quantiles_without_services.sql * Update font_usage_by_service.sql * Update font_usage_over_time.sql * Update font_variant_values.sql * Update outline_formats.sql * Make a few cosmetic adjustments to design * Update resource_hints_usage.sql * Make a few cosmetic adjustments to performance * Update usage_by_country.sql * Update usage_of_font_loading_api.sql * Rename the queries in design and performance * Add headers to design * Change sizes_self_hosted.sql to sizes_by_service.sql * Change formats_file_self_hosted.sql to formats_file_by_service.sql * Move pages_style_font_variant.sql to development * Add headers to performance * Remove requests_over_time.sql * Switch to using requests instead of pages * Capitalize functions in design and performance * Update color_font_usage.sql * Update color_fonts.sql * Update color_fonts_emoji_usage.sql * Rename the queries about color fonts * Update font_palettes.sql * Update font_palettes_colors.sql * Make a cosmetic adjustment * Rename another color query * Make a cosmetic adjustment * Update font_feature_settings_tags_usage.sql * Update font_feature_settings_vs_font_variant.sql * Add a few headers * Make a cosmetic adjustment * Update font_kerning.sql * Update pages_style_font_variant.sql * Update font_metric_override_usage.sql * Rename font_metric_override_usage.sql * Make a few cosmetic adjustments * Rename a few queries * Update font_smoothing_usage.sql * Update font_opentype_support.sql * Update font_opentype_features_usage.sql * Update types_of_variable_font.sql * Update variable_font_animation.sql * Update variable_font_axes.sql * Rename a few queries * Rename a few queries * Fix a few typos * Update variable_font_axes_css.sql * Simplify fonts_variable_by_format.sql * Update variable_font_usage.sql * Update variable_font_googlefonts_vs_other.sql * Fix styles in JavaScript * Make a few cosmetic adjustments to performance * Revisit COUNT(0) in design * Revisit COUNT(0) in performance * Revisit COUNT(0) in development * Make a cosmetic adjustment * Rename a few queries * Order by client in design * Add performance/fonts_service_by_family.sql * Add design/fonts_designer.sql * Add fonts/design/pages_script.sql * Update performance/fonts_size_by_country.sql to the latest CrUX * Add design/fonts_license.sql * Update pages_link_relationship.sql to use pages * Add design/fonts_metric.sql * Rename a query * Adjust the query comments * Add design/fonts_family_by_script.sql * Fix a typo * Add development/fonts_compiler.sql * Adjust a few comments * Rename a few queries * Use NAME in two other queries * Add development/fonts_color_family.sql * Introduce VARIABLE_FORMATS and IS_VARIABLE * Add development/fonts_variable_family.sql * Extract common functions into a separate file * Make a few cosmetic adjustments * Add development/fonts_variable_range.sql * Annotate the common functions * Update the readme * Change the date to 2024-07-01 * Revert the definition of Edge Fonts * Add a query for the color distribution * Limit performance/fonts_family_by_service.sql * Adjust the ordering in 
performance/fonts_size_by_country.sql * Add rounding to the size-related queries * Filter out in performance/fonts_size_by_table.sql * Make a few cosmetic adjustments * Simplify performance/pages_link_relationship.sql * Order in performance/styles_font_display.sql * Adjust the question in performance/fonts_family_by_service.sql * Filter out outliers in performance/fonts_size.sql * Filter out outliers in performance/fonts_size_by_service.sql * Filter out outliers in performance/fonts_size_by_table.sql * Refine FILE_FORMAT * Rebase FILE_FORMAT on summary * Rebase the file size on summary * Change performance/fonts_service.sql to show trends over time * Change performance/fonts_service.sql to show all services used * Debug performance/fonts_service.sql * Update performance/scripts_font_face.sql to show the trend * Check href and as in performance/pages_link_relationship.sql * Update performance/styles_font_display.sql to show the trend * Debug performance/pages_link_relationship.sql * Make a cosmetic adjustment * Adjust FOUNDRY * Adjust the percentiles in design/fonts_metric.sql * Limit to 100 in design/styles_family.sql * Polish JavaScript * Correct the normalization in design/fonts_script.sql * Correct the ranking in performance/fonts_family_by_service.sql * Correct the ranking in design/styles_family.sql * Limit to 100 in design/fonts_license.sql * Make use of FOUNDRY in design/fonts_foundry.sql * Limit to 100 in design/fonts_foundry.sql * Adjust FOUNDRY * Fix the ranking in design/fonts_foundry.sql * Limit to 100 in design/fonts_designer.sql * Adjust FOUNDRY * Adjust FOUNDRY * Fix a typo in development/fonts_variable_family.sql * Limit to the 99th in development/fonts_variable_range.sql * Limit to 100 and 10 in design/fonts_family_by_foundry.sql * Limit to 100 in development/fonts_variable_family.sql * Debug design/fonts_family_by_foundry.sql * Debug development/fonts_opentype.sql * Debug development/fonts_feature_kerning.sql * Rename development/fonts_feature_kerning.sql * Make a few cosmetic adjustments * Limit to 100 in development/fonts_compiler.sql * Debug design/styles_family.sql * Simplify design/fonts_family_by_foundry.sql * Make a cosmetic adjustment * Adjust the sorting in development/styles_font_variant.sql * Adjust the sorting in development/styles_font_feature_settings.sql * Adjust the sorting in development/styles_metric_override.sql * Debug development/styles_feature_control.sql * Extend design/fonts_license.sql * Adjust the sorting in performance/fonts_size_by_country.sql * Limit to 10 in performance/fonts_format_file.sql * Limit to 10 in performance/fonts_format_file_by_service.sql * Remove data from performance/styles_font_display.sql and limit to 10 * Adjust a question * Adjust the sorting in development/styles_feature_control.sql * Break down performance/fonts_size.sql by format * Break down performance/fonts_size_by_service.sql by format * Break down performance/fonts_size_by_country.sql by format * Refine development/fonts_compiler.sql * Refine and clarify the normalization in design * Refine and clarify the normalization in development * Refine and clarify the normalization in performance * Switch performance/fonts_format_file.sql to fonts on pages * Add Pages to performance/fonts_format_file.sql * Reduce design/fonts_family_by_script.sql in half * Introduce performance/fonts_service.sql and performance/fonts_services.sql * Constrain the performance queries to is_root_page * Constrain the design queries to is_root_page * Remove design/pages_script.sql * Constrain 
the development queries to is_root_page * Limit to 10 in development/styles_smoothing.sql * Rename pages to sites * Rewrite performance/pages_link_relationship.sql to join with requests * Introduce performance/fonts_size_by_format.sql * Sync performance/fonts_format_file_by_service.sql with performance/fonts_format_file.sql * Change the normalization in development/fonts_color_emoji.sql * Add dates to development/fonts_color.sql * Add dates to development/fonts_color_emoji.sql * Add dates to development/fonts_variable.sql * Remove the format from performance/fonts_size_by_country.sql * Rename Fonts on sites to Links * Add more dates to development/fonts_variable_service.sql * Fix the percentage calculation with dates * Clean up performance/styles_font_display.sql * Add more dates to development/fonts_feature_opentype.sql * Add more dates to development/fonts_feature_kerning.sql * Add more dates to development/fonts_variable_format.sql * Correct the normalization in performance/fonts_service.sql * Rename requests to links * Correct the normalization in performance/fonts_format_outline.sql * Rename fonts to links in performance/fonts_format_file_by_service.sql * Correct the normalization in design/fonts_script.sql * Correct the normalization in design/fonts_license.sql * Make a cosmetic adjustment in performance/fonts_size_by_table.sql * Move the totals in performance/fonts_format_file_by_service.sql * Move is_root_page * Move the totals in performance/fonts_format_file.sql * Move the totals in performance/fonts_family_by_service.sql * Correct the normalization in design/fonts_license.sql * Move the totals in design/fonts_family_by_foundry.sql * Move the totals in design/fonts_family_by_script.sql * Change the normalization to Fonts in development/fonts_variable_service.sql * Make a cosmetic adjustment * Move the totals in development/fonts_variable_format.sql * Move the totals in development/fonts_variable_family.sql * Correct the normalization in development/fonts_variable_axis.sql * Move the totals in development/fonts_feature_opentype.sql * Move the totals in development/fonts_feature_kerning.sql * Move the totals in development/fonts_feature.sql * Move the totals in development/fonts_compiler.sql * Move the totals in development/fonts_color_palette.sql * Move the totals in development/fonts_color_format.sql * Move the totals in development/fonts_color_family.sql * Move the totals in development/fonts_color_entry.sql * Move the totals in development/fonts_color_emoji.sql * Move the totals in development/fonts_color_color.sql * Make a few cosmetic adjustments * Make a few cosmetic adjustments * Make a cosmetic adjustment * Exclude non-parseable fonts from the size calculations * Exclude non-parseable fonts in performance/fonts_family_by_service.sql and performance/fonts_format_outline.sql * Exclude non-parseable fonts in design/fonts_*.sql * Remove baseline_at_zero * Normalize metrics by the granularity * Add (parsed only) * Use SAFE_DIVIDE in design/fonts_metric.sql * Exclude non-parseable fonts in development/fonts_*.sql * Add 2022-06-01 to performance/scripts_font_face.sql * Attempt to remove styles in family names * Add a secondary normalization to development/fonts_variable_service.sql * Update development/styles_font_variable_settings_axis.sql to variable sites * Increase to top 100 in performance/fonts_family_by_service.sql * Refine the FAMILY function * Refine the FAMILY function * Clean up design/styles_family.sql * Address a lint * Add granularity to design/fonts_metric.sql * 
Add performance/fonts.sql * Clean up the foundry names * Update the variable-font animation detection * Add development/fonts_hinting.sql * Drop the downsampling in design/fonts_family_by_script.sql * Add more dates to development/fonts_color_format.sql * Fix the totals in development/fonts_color_format.sql * Provide the full path to the common script * Introduce development/fonts_color_family_by_format.sql * Clean up performance/styles_font_display.sql * Introduce performance/styles_font_display_by_family.sql * Do not limit in development/fonts_color_format_by_family.sql * Refine the definition of COLOR_FORMATS and IS_COLOR * Add more dates to performance/pages_link_relationship.sql * Rename sites and links to websites and requests, respectively * Rename websites to pages * Move fonts_format_outline.sql to development * Move styles_family_system.sql to development * Add examples to development/fonts_color_format_by_family.sql --- sql/2024/fonts/README.md | 60 +++++-- sql/2024/fonts/common.sql | 151 ++++++++++++++++++ sql/2024/fonts/design/fonts_designer.sql | 52 ++++++ .../fonts/design/fonts_family_by_foundry.sql | 42 +++++ .../fonts/design/fonts_family_by_script.sql | 46 ++++++ sql/2024/fonts/design/fonts_foundry.sql | 52 ++++++ sql/2024/fonts/design/fonts_license.sql | 56 +++++++ sql/2024/fonts/design/fonts_metric.sql | 133 +++++++++++++++ sql/2024/fonts/design/fonts_script.sql | 41 +++++ sql/2024/fonts/design/styles_family.sql | 71 ++++++++ sql/2024/fonts/development/fonts_color.sql | 51 ++++++ .../fonts/development/fonts_color_color.sql | 63 ++++++++ .../fonts/development/fonts_color_emoji.sql | 65 ++++++++ .../fonts/development/fonts_color_entry.sql | 45 ++++++ .../fonts/development/fonts_color_family.sql | 39 +++++ .../fonts/development/fonts_color_format.sql | 46 ++++++ .../fonts_color_format_by_family.sql | 46 ++++++ .../fonts/development/fonts_color_palette.sql | 45 ++++++ sql/2024/fonts/development/fonts_compiler.sql | 54 +++++++ sql/2024/fonts/development/fonts_feature.sql | 64 ++++++++ .../development/fonts_feature_kerning.sql | 74 +++++++++ .../development/fonts_feature_opentype.sql | 48 ++++++ .../development/fonts_format_outline.sql | 49 ++++++ sql/2024/fonts/development/fonts_hinting.sql | 58 +++++++ sql/2024/fonts/development/fonts_variable.sql | 52 ++++++ .../fonts/development/fonts_variable_axis.sql | 52 ++++++ .../development/fonts_variable_family.sql | 39 +++++ .../development/fonts_variable_format.sql | 46 ++++++ .../development/fonts_variable_range.sql | 67 ++++++++ .../development/fonts_variable_service.sql | 46 ++++++ .../development/styles_family_system.sql | 92 +++++++++++ .../development/styles_feature_control.sql | 81 ++++++++++ .../styles_font_feature_settings.sql | 81 ++++++++++ .../styles_font_variable_settings_axis.sql | 67 ++++++++ .../fonts/development/styles_font_variant.sql | 74 +++++++++ .../development/styles_metric_override.sql | 66 ++++++++ .../fonts/development/styles_smoothing.sql | 68 ++++++++ .../development/styles_variable_animation.sql | 78 +++++++++ sql/2024/fonts/performance/fonts.sql | 21 +++ .../performance/fonts_family_by_service.sql | 43 +++++ .../fonts/performance/fonts_format_file.sql | 47 ++++++ .../fonts_format_file_by_service.sql | 51 ++++++ sql/2024/fonts/performance/fonts_service.sql | 46 ++++++ sql/2024/fonts/performance/fonts_services.sql | 67 ++++++++ sql/2024/fonts/performance/fonts_size.sql | 38 +++++ .../performance/fonts_size_by_country.sql | 50 ++++++ .../performance/fonts_size_by_format.sql | 61 +++++++ 
.../performance/fonts_size_by_service.sql | 67 ++++++++ .../fonts/performance/fonts_size_by_table.sql | 56 +++++++ .../performance/pages_link_relationship.sql | 94 +++++++++++ .../fonts/performance/scripts_font_face.sql | 49 ++++++ .../fonts/performance/styles_font_display.sql | 68 ++++++++ .../styles_font_display_by_family.sql | 94 +++++++++++ 53 files changed, 3199 insertions(+), 13 deletions(-) create mode 100644 sql/2024/fonts/common.sql create mode 100644 sql/2024/fonts/design/fonts_designer.sql create mode 100644 sql/2024/fonts/design/fonts_family_by_foundry.sql create mode 100644 sql/2024/fonts/design/fonts_family_by_script.sql create mode 100644 sql/2024/fonts/design/fonts_foundry.sql create mode 100644 sql/2024/fonts/design/fonts_license.sql create mode 100644 sql/2024/fonts/design/fonts_metric.sql create mode 100644 sql/2024/fonts/design/fonts_script.sql create mode 100644 sql/2024/fonts/design/styles_family.sql create mode 100644 sql/2024/fonts/development/fonts_color.sql create mode 100644 sql/2024/fonts/development/fonts_color_color.sql create mode 100644 sql/2024/fonts/development/fonts_color_emoji.sql create mode 100644 sql/2024/fonts/development/fonts_color_entry.sql create mode 100644 sql/2024/fonts/development/fonts_color_family.sql create mode 100644 sql/2024/fonts/development/fonts_color_format.sql create mode 100644 sql/2024/fonts/development/fonts_color_format_by_family.sql create mode 100644 sql/2024/fonts/development/fonts_color_palette.sql create mode 100644 sql/2024/fonts/development/fonts_compiler.sql create mode 100644 sql/2024/fonts/development/fonts_feature.sql create mode 100644 sql/2024/fonts/development/fonts_feature_kerning.sql create mode 100644 sql/2024/fonts/development/fonts_feature_opentype.sql create mode 100644 sql/2024/fonts/development/fonts_format_outline.sql create mode 100644 sql/2024/fonts/development/fonts_hinting.sql create mode 100644 sql/2024/fonts/development/fonts_variable.sql create mode 100644 sql/2024/fonts/development/fonts_variable_axis.sql create mode 100644 sql/2024/fonts/development/fonts_variable_family.sql create mode 100644 sql/2024/fonts/development/fonts_variable_format.sql create mode 100644 sql/2024/fonts/development/fonts_variable_range.sql create mode 100644 sql/2024/fonts/development/fonts_variable_service.sql create mode 100644 sql/2024/fonts/development/styles_family_system.sql create mode 100644 sql/2024/fonts/development/styles_feature_control.sql create mode 100644 sql/2024/fonts/development/styles_font_feature_settings.sql create mode 100644 sql/2024/fonts/development/styles_font_variable_settings_axis.sql create mode 100644 sql/2024/fonts/development/styles_font_variant.sql create mode 100644 sql/2024/fonts/development/styles_metric_override.sql create mode 100644 sql/2024/fonts/development/styles_smoothing.sql create mode 100644 sql/2024/fonts/development/styles_variable_animation.sql create mode 100644 sql/2024/fonts/performance/fonts.sql create mode 100644 sql/2024/fonts/performance/fonts_family_by_service.sql create mode 100644 sql/2024/fonts/performance/fonts_format_file.sql create mode 100644 sql/2024/fonts/performance/fonts_format_file_by_service.sql create mode 100644 sql/2024/fonts/performance/fonts_service.sql create mode 100644 sql/2024/fonts/performance/fonts_services.sql create mode 100644 sql/2024/fonts/performance/fonts_size.sql create mode 100644 sql/2024/fonts/performance/fonts_size_by_country.sql create mode 100644 sql/2024/fonts/performance/fonts_size_by_format.sql create mode 100644 
sql/2024/fonts/performance/fonts_size_by_service.sql create mode 100644 sql/2024/fonts/performance/fonts_size_by_table.sql create mode 100644 sql/2024/fonts/performance/pages_link_relationship.sql create mode 100644 sql/2024/fonts/performance/scripts_font_face.sql create mode 100644 sql/2024/fonts/performance/styles_font_display.sql create mode 100644 sql/2024/fonts/performance/styles_font_display_by_family.sql diff --git a/sql/2024/fonts/README.md b/sql/2024/fonts/README.md index 5919175e6c3..f3cc5ae3a9f 100644 --- a/sql/2024/fonts/README.md +++ b/sql/2024/fonts/README.md @@ -1,20 +1,54 @@ # 2024 Fonts queries - +## Structure -## Resources +The queries are split by the section where they are used: + +* `design/` is about foundries and families, +* `development/` is about tools and technologies, and +* `performance/` is about hosting and serving. + +Each file name starts with one of the following prefixes indicating the primary +subject of the corresponding analysis: + +* `fonts_` is about font files, +* `pages_` is about HTML pages, +* `scripts_` is about JavaScript scripts, and +* `styles_` is about CSS style sheets. + +The prefix is followed by the property studied, given in the singular and potentially +extended by one or more suffixes narrowing down the scope, as in +`fonts_size_by_table.sql` and `pages_link_relationship.sql`. + +## Contents + +Each query starts with a preamble indicating the section, question, and +normalization type: + +```sql +-- Section: Performance +-- Question: What is the distribution of the file size broken down by table? +-- Normalization: Pages +``` + +Many queries rely on temporary functions for convenience and clarity. The +functions that appear in several queries are extracted into a common file: +`common.sql`. Whenever any of the functions defined in `common.sql` is used by a +query, the query has the following line at the top: + +```sql +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql +``` -- [📄 Planning doc][~google-doc] -- [📊 Results sheet][~google-sheets] -- [📝 Markdown file][~chapter-markdown] +It signals that `common.sql` has to be inlined before the query is executed. -[~google-doc]: https://docs.google.com/document/d/1ljEHbDvXComXnW5s_EXZ0nM3_JCLnYr28Xrcf0YYtP8/edit#heading=h.vp0ukb2pxxzp -[~google-sheets]: https://docs.google.com/spreadsheets/d/1EkdvJ8e0B9Rr42evC2Ds5Ekwq6gF9oLBW0BA5cmSUT4/edit#gid=1778117656 -[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/fonts.md +[Planning document]: https://docs.google.com/document/d/1ljEHbDvXComXnW5s_EXZ0nM3_JCLnYr28Xrcf0YYtP8/edit +[Results sheet]: https://docs.google.com/spreadsheets/d/1EkdvJ8e0B9Rr42evC2Ds5Ekwq6gF9oLBW0BA5cmSUT4/edit +[Chapter content]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/fonts.md diff --git a/sql/2024/fonts/common.sql b/sql/2024/fonts/common.sql new file mode 100644 index 00000000000..17f36b06db0 --- /dev/null +++ b/sql/2024/fonts/common.sql @@ -0,0 +1,151 @@ +-- Normalize a family name. Used in FAMILY_INNER. +CREATE TEMPORARY FUNCTION FAMILY_INNER_INNER(name STRING) AS ( + CASE + WHEN REGEXP_CONTAINS(name, r'(?i)font\s?awesome') THEN 'Font Awesome' + ELSE IF(LENGTH(TRIM(name)) < 3, NULL, NULLIF(TRIM(name), '')) + END +); + +-- Normalize a family name. Used in FAMILY.
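+-- For example, 'Roboto Condensed Bold' and 'Roboto-Light' should both normalize to 'Roboto': the regular expression below strips trailing style, weight, and width tokens before FAMILY_INNER_INNER cleans up the result.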
+CREATE TEMPORARY FUNCTION FAMILY_INNER(name STRING) AS ( + FAMILY_INNER_INNER( + REGEXP_REPLACE( + name, + r'(?i)([\s-]?(black|bold|book|cond(ensed)?|demi|ex(tra)?|heavy|italic|light|medium|narrow|regular|semi|thin|ultra|wide|\d00|\d+pt))+$', + '' + ) + ) +); + +-- Extract the family name from a payload. +CREATE TEMPORARY FUNCTION FAMILY(payload STRING) AS ( + FAMILY_INNER( + COALESCE( + JSON_EXTRACT_SCALAR(payload, '$._font_details.names[16]'), + JSON_EXTRACT_SCALAR(payload, '$._font_details.names[1]') + ) + ) +); + +-- Extract the file format from an extension and a MIME type. +CREATE TEMPORARY FUNCTION FILE_FORMAT(extension STRING, type STRING) AS ( + LOWER(IFNULL(REGEXP_EXTRACT(type, '/(?:x-)?(?:font-)?(.*)'), extension)) +); + +-- Normalize a foundry name. Used in FOUNDRY. +CREATE TEMPORARY FUNCTION FOUNDRY_INNER(name STRING) AS ( + CASE UPPER(name) + WHEN 'ADBO' THEN 'ADBE' + WHEN 'PFED' THEN 'AWSM' + ELSE NULLIF(TRIM(REGEXP_REPLACE(name, r'[[:cntrl:]]+', '')), '') + END +); + +-- Extract the foundry name from a payload. +CREATE TEMPORARY FUNCTION FOUNDRY(payload STRING) AS ( + FOUNDRY_INNER(JSON_EXTRACT_SCALAR(payload, '$._font_details.OS2.achVendID')) +); + +-- Infer scripts from codepoints. Used in SCRIPTS. +CREATE TEMPORARY FUNCTION SCRIPTS_INNER(codepoints ARRAY) +RETURNS ARRAY +LANGUAGE js +OPTIONS (library = ["gs://httparchive/lib/text-utils.js"]) +AS r""" +if (codepoints && codepoints.length) { + return detectWritingScript(codepoints.map((character) => parseInt(character, 10)), 0.05); +} else { + return []; +} +"""; + +-- Infer scripts from a payload. +CREATE TEMPORARY FUNCTION SCRIPTS(payload STRING) AS ( + SCRIPTS_INNER(JSON_EXTRACT_STRING_ARRAY(payload, '$._font_details.cmap.codepoints')) +); + +-- Infer the service from a URL. +CREATE TEMPORARY FUNCTION SERVICE(url STRING) AS ( + CASE + WHEN REGEXP_CONTAINS(url, r'(fonts|use)\.typekit\.(net|com)') THEN 'Adobe' + WHEN REGEXP_CONTAINS(url, r'cloud\.typenetwork\.com') THEN 'typenetwork.com' + WHEN REGEXP_CONTAINS(url, r'cloud\.typography\.com') THEN 'typography.com' + WHEN REGEXP_CONTAINS(url, r'cloud\.webtype\.com') THEN 'webtype.com' + WHEN REGEXP_CONTAINS(url, r'f\.fontdeck\.com') THEN 'fontdeck.com' + WHEN REGEXP_CONTAINS(url, r'fast\.fonts\.(com|net)\/(jsapi|cssapi)') THEN 'fonts.com' + WHEN REGEXP_CONTAINS(url, r'fnt\.webink\.com') THEN 'webink.com' + WHEN REGEXP_CONTAINS(url, r'fontawesome\.com') THEN 'fontawesome.com' + WHEN REGEXP_CONTAINS(url, r'fonts\.(gstatic|googleapis)\.com|themes.googleusercontent.com/static/fonts|ssl.gstatic.com/fonts') THEN 'Google' + WHEN REGEXP_CONTAINS(url, r'fonts\.typonine\.com') THEN 'typonine.com' + WHEN REGEXP_CONTAINS(url, r'fonts\.typotheque\.com') THEN 'typotheque.com' + WHEN REGEXP_CONTAINS(url, r'kernest\.com') THEN 'kernest.com' + WHEN REGEXP_CONTAINS(url, r'typefront\.com') THEN 'typefront.com' + WHEN REGEXP_CONTAINS(url, r'typesquare\.com') THEN 'typesquare.com' + WHEN REGEXP_CONTAINS(url, r'use\.edgefonts\.net|webfonts\.creativecloud\.com') THEN 'edgefonts.net' + WHEN REGEXP_CONTAINS(url, r'webfont\.fontplus\.jp') THEN 'fontplus.jp' + WHEN REGEXP_CONTAINS(url, r'webfonts\.fontslive\.com') THEN 'fontslive.com' + WHEN REGEXP_CONTAINS(url, r'webfonts\.fontstand\.com') THEN 'fontstand.com' + WHEN REGEXP_CONTAINS(url, r'webfonts\.justanotherfoundry\.com') THEN 'justanotherfoundry.com' + ELSE 'self-hosted' + END +); + +-- Extract the color formats from a formats payload and remove spurious entries +-- via a table-sizes payload. 
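+-- For example, a font whose formats payload lists `COLR` but whose `COLR` table is only a few bytes long would be treated as spurious and dropped.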
+-- +-- When nonempty, it is expected that +-- +-- * `CBDT` is larger than 2 + 2 bytes, +-- * `COLR` is larger than 2 + 2 + 4 + 4 + 2 (+ 4 + 4 + 4 + 4 + 4) bytes, +-- * `SVG ` is larger than 2 + 4 + 4 + 2 bytes, and +-- * `sbix` is larger than 2 + 2 + 4 + 4 bytes. +-- +-- For simplicity, the threshold is set to 50 bytes. +CREATE TEMPORARY FUNCTION COLOR_FORMATS_INNER(jsonFormats STRING, jsonSizes STRING) +RETURNS ARRAY +LANGUAGE js AS ''' +try { + const formats = JSON.parse(jsonFormats); + const sizes = JSON.parse(jsonSizes); + return formats.filter((format) => { + const table = `${format} `.slice(0, 4); + return sizes[table] > 50; + }); +} catch (e) { + return []; +} +'''; + +-- Extract the color formats from a payload. +CREATE TEMPORARY FUNCTION COLOR_FORMATS(payload STRING) AS ( + COLOR_FORMATS_INNER( + JSON_EXTRACT(payload, '$._font_details.color.formats'), + JSON_EXTRACT(payload, '$._font_details.table_sizes') + ) +); + +-- Check if the font is a color font given its payload. +CREATE TEMPORARY FUNCTION IS_COLOR(payload STRING) AS ( + ARRAY_LENGTH(COLOR_FORMATS(payload)) > 0 +); + +-- Check if the font was successfully parsed given its payload. +CREATE TEMPORARY FUNCTION IS_PARSED(payload STRING) AS ( + JSON_EXTRACT(payload, '$._font_details.table_sizes') IS NOT NULL +); + +-- Check if the font is a variable font given its payload. +CREATE TEMPORARY FUNCTION IS_VARIABLE(payload STRING) AS ( + REGEXP_CONTAINS( + JSON_EXTRACT(payload, '$._font_details.table_sizes'), + '(?i)gvar|CFF2' + ) +); + +-- Extract the variable formats from a payload. +CREATE TEMPORARY FUNCTION VARIABLE_FORMATS(payload STRING) AS ( + REGEXP_EXTRACT_ALL( + JSON_EXTRACT(payload, '$._font_details.table_sizes'), + '(?i)glyf|CFF2' + ) +); diff --git a/sql/2024/fonts/design/fonts_designer.sql b/sql/2024/fonts/design/fonts_designer.sql new file mode 100644 index 00000000000..35527cd41f2 --- /dev/null +++ b/sql/2024/fonts/design/fonts_designer.sql @@ -0,0 +1,52 @@ +-- Section: Design +-- Question: Which designers are popular? +-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +designers AS ( + SELECT + client, + NULLIF(TRIM(JSON_EXTRACT_SCALAR(payload, '$._font_details.names[9]')), '') AS designer, + COUNT(DISTINCT page) AS count, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS rank + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + client, + designer + QUALIFY + rank <= 100 +), +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + designer, + count, + total, + count / total AS proportion +FROM + designers +JOIN + pages USING (client) +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/design/fonts_family_by_foundry.sql b/sql/2024/fonts/design/fonts_family_by_foundry.sql new file mode 100644 index 00000000000..34df3e438b2 --- /dev/null +++ b/sql/2024/fonts/design/fonts_family_by_foundry.sql @@ -0,0 +1,42 @@ +-- Section: Design +-- Question: Which families are used broken down by foundry? 
+-- Normalization: Requests (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + FOUNDRY(payload) AS foundry, + FAMILY(payload) AS family, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + IS_PARSED(payload) AND + is_root_page +) + +SELECT + client, + foundry, + family, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(0) DESC) AS rank +FROM + requests +GROUP BY + client, + foundry, + family, + total +QUALIFY + rank <= 100 +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/design/fonts_family_by_script.sql b/sql/2024/fonts/design/fonts_family_by_script.sql new file mode 100644 index 00000000000..f19a9069234 --- /dev/null +++ b/sql/2024/fonts/design/fonts_family_by_script.sql @@ -0,0 +1,46 @@ +-- Section: Design +-- Question: Which families are used broken down by script? +-- Normalization: Requests (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + SCRIPTS(payload) AS scripts, + FAMILY(payload) AS family, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) +) + +SELECT + client, + script, + family, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client, script ORDER BY COUNT(0) DESC) AS rank +FROM + requests, + UNNEST(scripts) AS script +WHERE + family != 'Adobe Blank' +GROUP BY + client, + script, + family, + requests.total +QUALIFY + rank <= 10 +ORDER BY + client, + script, + proportion DESC diff --git a/sql/2024/fonts/design/fonts_foundry.sql b/sql/2024/fonts/design/fonts_foundry.sql new file mode 100644 index 00000000000..0f591df5e3a --- /dev/null +++ b/sql/2024/fonts/design/fonts_foundry.sql @@ -0,0 +1,52 @@ +-- Section: Design +-- Question: Which foundries are popular? +-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +foundries AS ( + SELECT + client, + FOUNDRY(payload) AS foundry, + COUNT(DISTINCT page) AS count, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS rank + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + client, + foundry + QUALIFY + rank <= 100 +), +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + foundry, + count, + total, + count / total AS proportion +FROM + foundries +JOIN + pages USING (client) +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/design/fonts_license.sql b/sql/2024/fonts/design/fonts_license.sql new file mode 100644 index 00000000000..e6637c4dda4 --- /dev/null +++ b/sql/2024/fonts/design/fonts_license.sql @@ -0,0 +1,56 @@ +-- Section: Design +-- Question: Which licenses are used? 
+-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +CREATE TEMPORARY FUNCTION LICENSE(value STRING) AS ( + CASE + WHEN REGEXP_CONTAINS(value, 'adobe|typekit') THEN 'Adobe' + WHEN REGEXP_CONTAINS(value, 'apache') THEN 'Apache' + WHEN REGEXP_CONTAINS(value, 'fontawesome') THEN 'Font Awesome' + WHEN REGEXP_CONTAINS(value, 'linotype|monotype|myfonts') THEN 'Monotype' + WHEN REGEXP_CONTAINS(value, r'(?i)(ofl|open\s?font\s?license|sil\.org)') THEN 'OFL' + ELSE NULLIF(NULLIF(TRIM(value), ''), '-') + END +); + +WITH +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + LICENSE(JSON_EXTRACT_SCALAR(payload, '$._font_details.names[14]')) AS license, + COUNT(DISTINCT page) AS count, + total, + COUNT(DISTINCT page) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS rank +FROM + `httparchive.all.requests` +INNER JOIN + pages USING (client) +WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) +GROUP BY + client, + license, + total +QUALIFY + rank <= 100 +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/design/fonts_metric.sql b/sql/2024/fonts/design/fonts_metric.sql new file mode 100644 index 00000000000..0017f1bd7ed --- /dev/null +++ b/sql/2024/fonts/design/fonts_metric.sql @@ -0,0 +1,133 @@ +-- Section: Performance +-- Question: What is the distribution of metrics? +-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + client, + url, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.head.unitsPerEm' + ) AS INTEGER + ) AS granularity, + [ + STRUCT( + 'granularity' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.head.unitsPerEm' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'clipping_ascender' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.usWinAscent' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'ascender' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.sTypoAscender' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'cap_height' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.sCapHeight' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'x_height' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.sxHeight' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'descender' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.sTypoDescender' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'clipping_descender' AS name, + -SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.usWinDescent' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'line_gap' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.sTypoLineGap' + ) AS INTEGER + ) AS value + ), + STRUCT( + 'use_typographic_metrics' AS name, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.OS2.fsSelection' + ) AS INTEGER + ) & 128 AS value + ) + ] AS metrics + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + 
GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  name AS metric,
+  percentile,
+  COUNT(0) AS count,
+  APPROX_QUANTILES(
+    IF(name = 'granularity', value, SAFE_DIVIDE(value, granularity)),
+    1000
+  )[OFFSET(percentile * 10)] AS value
+FROM
+  fonts,
+  UNNEST(metrics) AS metric,
+  UNNEST([10, 25, 50, 75, 90, 99]) AS percentile
+GROUP BY
+  client,
+  name,
+  percentile
+ORDER BY
+  client,
+  name,
+  percentile
diff --git a/sql/2024/fonts/design/fonts_script.sql b/sql/2024/fonts/design/fonts_script.sql
new file mode 100644
index 00000000000..2c020754348
--- /dev/null
+++ b/sql/2024/fonts/design/fonts_script.sql
@@ -0,0 +1,41 @@
+-- Section: Design
+-- Question: Which scripts does one design for?
+-- Normalization: Fonts (parsed only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    SCRIPTS(ANY_VALUE(payload)) AS scripts,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_PARSED(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  script,
+  COUNT(DISTINCT url) AS count,
+  total,
+  COUNT(DISTINCT url) / total AS proportion
+FROM
+  fonts,
+  UNNEST(scripts) AS script
+GROUP BY
+  client,
+  script,
+  total
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/design/styles_family.sql b/sql/2024/fonts/design/styles_family.sql
new file mode 100644
index 00000000000..a900df45e1e
--- /dev/null
+++ b/sql/2024/fonts/design/styles_family.sql
@@ -0,0 +1,71 @@
+-- Section: Design
+-- Question: Which families are popular in CSS?
+-- Normalization: Pages
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION FAMILIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = ["gs://httparchive/lib/css-font-parser.js", "gs://httparchive/lib/css-utils.js"])
+AS '''
+try {
+  const $ = JSON.parse(json);
+  const result = [];
+  walkDeclarations($, (declaration) => {
+    result.push(parseFontFamilyProperty(declaration.value)[0]);
+  }, {
+    properties: 'font-family',
+    rules: (rule) => rule.type.toLowerCase() === 'font-face'
+  });
+  return result;
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+families AS (
+  SELECT
+    client,
+    FAMILY_INNER(family) AS family,
+    COUNT(DISTINCT page) AS count,
+    ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS rank
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(FAMILIES(css)) AS family
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    family
+  QUALIFY
+    rank <= 100
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  family,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  families
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_color.sql b/sql/2024/fonts/development/fonts_color.sql
new file mode 100644
index 00000000000..0e93a985a00
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_color.sql
@@ -0,0 +1,51 @@
+-- Section: Development
+-- Question: How popular are color fonts?
+-- Normalization: Pages
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+fonts AS (
+  SELECT
+    date,
+    client,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND
+    type = 'font' AND
+    is_root_page AND
+    IS_COLOR(payload)
+  GROUP BY
+    date,
+    client
+),
+pages AS (
+  SELECT
+    date,
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND
+    is_root_page
+  GROUP BY
+    date,
+    client
+)
+
+SELECT
+  date,
+  client,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  fonts
+JOIN
+  pages USING (date, client)
+ORDER BY
+  date,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_color_color.sql b/sql/2024/fonts/development/fonts_color_color.sql
new file mode 100644
index 00000000000..9f9add02520
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_color_color.sql
@@ -0,0 +1,63 @@
+-- Section: Development
+-- Question: What is the distribution of color palettes?
+-- Normalization: Fonts (color only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION COLORS(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+AS '''
+function toHex(value) {
+  return ('0' + (value & 0xFF).toString(16)).slice(-2);
+}
+
+try {
+  const $ = JSON.parse(json);
+  const result = new Set();
+  for (const palette of $) {
+    for (const [blue, green, red, alpha] of palette) {
+      result.add(`#${toHex(red)}${toHex(green)}${toHex(blue)}${toHex(alpha)}`);
+    }
+  }
+  return Array.from(result);
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    COLORS(JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.color.palettes')) AS colors,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_COLOR(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  color,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  fonts,
+  UNNEST(colors) AS color
+GROUP BY
+  client,
+  color,
+  total
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_color_emoji.sql b/sql/2024/fonts/development/fonts_color_emoji.sql
new file mode 100644
index 00000000000..62ca275a016
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_color_emoji.sql
@@ -0,0 +1,65 @@
+-- Section: Development
+-- Question: Are color fonts used for the sake of emojis?
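+-- Sketch (assumes detectWritingScript from text-utils.js tags emoji
+-- codepoints with an Emoji* script): a cmap dominated by codepoints such
+-- as '128512' (U+1F600) should make HAS_EMOJI below return TRUE.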
+-- Normalization: Requests (color only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION HAS_EMOJI(codepoints ARRAY<STRING>)
+RETURNS BOOL
+LANGUAGE js
+OPTIONS (library = ["gs://httparchive/lib/text-utils.js"])
+AS r"""
+if (codepoints && codepoints.length) {
+  const detected = detectWritingScript(codepoints.map((character) => parseInt(character, 10)), 0.1);
+  const scripts = [
+    'Emoji',
+    'Emoji_Component',
+    'Emoji_Modifier',
+    'Emoji_Modifier_Base',
+    'Emoji_Presentation'
+  ];
+  for (script of scripts) {
+    if (detected.includes(script)) {
+      return true;
+    }
+  }
+  return false;
+} else {
+  return false;
+}
+""";
+
+WITH
+requests AS (
+  SELECT
+    date,
+    client,
+    HAS_EMOJI(JSON_EXTRACT_STRING_ARRAY(payload, '$._font_details.cmap.codepoints')) AS emoji,
+    COUNT(0) OVER (PARTITION BY date, client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND
+    type = 'font' AND
+    is_root_page AND
+    IS_COLOR(payload)
+)
+
+SELECT
+  date,
+  client,
+  emoji,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  requests
+GROUP BY
+  date,
+  client,
+  emoji,
+  total
+ORDER BY
+  date,
+  client,
+  emoji
diff --git a/sql/2024/fonts/development/fonts_color_entry.sql b/sql/2024/fonts/development/fonts_color_entry.sql
new file mode 100644
index 00000000000..f2c78aa22ae
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_color_entry.sql
@@ -0,0 +1,45 @@
+-- Section: Development
+-- Question: How many entries are there in color palettes?
+-- Normalization: Fonts (color only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    SAFE_CAST(
+      JSON_EXTRACT_SCALAR(
+        ANY_VALUE(payload),
+        '$._font_details.color.numPaletteEntries'
+      ) AS INT64
+    ) AS entries,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_COLOR(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  entries,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  fonts
+GROUP BY
+  client,
+  entries,
+  total
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_color_family.sql b/sql/2024/fonts/development/fonts_color_family.sql
new file mode 100644
index 00000000000..4ae551a1ed6
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_color_family.sql
@@ -0,0 +1,39 @@
+-- Section: Development
+-- Question: Which color families are used?
+-- Normalization: Requests (color only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + FAMILY(payload) AS family, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_COLOR(payload) +) + +SELECT + client, + family, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(0) DESC) AS rank +FROM + requests +GROUP BY + client, + family, + total +QUALIFY + rank <= 100 +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_color_format.sql b/sql/2024/fonts/development/fonts_color_format.sql new file mode 100644 index 00000000000..52f7e89729c --- /dev/null +++ b/sql/2024/fonts/development/fonts_color_format.sql @@ -0,0 +1,46 @@ +-- Section: Development +-- Question: Which color formats are used? +-- Normalization: Fonts (color only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + date, + client, + url, + COLOR_FORMATS(ANY_VALUE(payload)) AS formats, + COUNT(0) OVER (PARTITION BY date, client) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page AND + IS_COLOR(payload) + GROUP BY + date, + client, + url +) + +SELECT + date, + client, + format, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion +FROM + fonts, + UNNEST(formats) AS format +GROUP BY + date, + client, + format, + total +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_color_format_by_family.sql b/sql/2024/fonts/development/fonts_color_format_by_family.sql new file mode 100644 index 00000000000..1c106e6a58b --- /dev/null +++ b/sql/2024/fonts/development/fonts_color_format_by_family.sql @@ -0,0 +1,46 @@ +-- Section: Development +-- Question: Which color formats are used broken down by family? +-- Normalization: Fonts (color only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + client, + url, + COLOR_FORMATS(ANY_VALUE(payload)) AS formats, + FAMILY(ANY_VALUE(payload)) AS family, + COUNT(DISTINCT url) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_COLOR(payload) + GROUP BY + client, + url +) + +SELECT + client, + format, + family, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion, + ANY_VALUE(url) AS example +FROM + fonts, + UNNEST(formats) AS format +GROUP BY + client, + format, + family, + total +ORDER BY + client, + format, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_color_palette.sql b/sql/2024/fonts/development/fonts_color_palette.sql new file mode 100644 index 00000000000..80f717229bb --- /dev/null +++ b/sql/2024/fonts/development/fonts_color_palette.sql @@ -0,0 +1,45 @@ +-- Section: Development +-- Question: How many palettes are there in color fonts? 
+-- Normalization: Fonts (color only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + client, + url, + SAFE_CAST( + JSON_EXTRACT_SCALAR( + ANY_VALUE(payload), + '$._font_details.color.numPalettes' + ) AS INT64 + ) AS entries, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_COLOR(payload) + GROUP BY + client, + url +) + +SELECT + client, + entries, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion +FROM + fonts +GROUP BY + client, + entries, + total +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_compiler.sql b/sql/2024/fonts/development/fonts_compiler.sql new file mode 100644 index 00000000000..b98db06bdfd --- /dev/null +++ b/sql/2024/fonts/development/fonts_compiler.sql @@ -0,0 +1,54 @@ +-- Section: Development +-- Question: Which compilers are used? +-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +CREATE TEMPORARY FUNCTION COMPILER(version STRING) AS ( + CASE + WHEN REGEXP_CONTAINS(version, r'(?i)(Core \d|PS \d|hotconv|makeotf)') THEN 'makeotf' + WHEN REGEXP_CONTAINS(version, r'(?i)FontCreator') THEN 'FontCreator' + WHEN REGEXP_CONTAINS(version, r'(?i)Fontself') THEN 'Fontself Maker' + WHEN REGEXP_CONTAINS(version, r'(?i)(FEAKit|Glyphs)') THEN 'Glyphs.app' + WHEN REGEXP_CONTAINS(version, r'(?i)gftools') THEN 'fontmake' + ELSE TRIM(REGEXP_EXTRACT(version, ';(.*)')) + END +); + +WITH +fonts AS ( + SELECT + client, + url, + COMPILER(JSON_EXTRACT_SCALAR(ANY_VALUE(payload), '$._font_details.names[5]')) AS compiler, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + client, + url +) + +SELECT + client, + compiler, + COUNT(DISTINCT url) AS count, + total, + COUNT(DISTINCT url) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT url) DESC) AS rank +FROM + fonts +GROUP BY + client, + compiler, + total +QUALIFY + rank <= 100 +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_feature.sql b/sql/2024/fonts/development/fonts_feature.sql new file mode 100644 index 00000000000..3d5c769d028 --- /dev/null +++ b/sql/2024/fonts/development/fonts_feature.sql @@ -0,0 +1,64 @@ +-- Section: Development +-- Question: Which features are used in fonts? 
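+-- Illustrative call of the FEATURES helper defined below (made-up JSON):
+--   FEATURES('{"GSUB": {"DFLT": {"dflt": ["liga", "kern"]}}}')
+--   -- yields ['liga', 'kern']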
+-- Normalization: Fonts (parsed only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION FEATURES(data STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+AS '''
+try {
+  const $ = JSON.parse(data);
+  const result = new Set();
+  for (const [table, scripts] of Object.entries($)) {
+    for (const [script, languages] of Object.entries(scripts)) {
+      for (const [language, features] of Object.entries(languages)) {
+        features.forEach(feature => result.add(feature));
+      }
+    }
+  }
+  return Array.from(result);
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    FEATURES(JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.features')) AS features,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_PARSED(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  feature,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion,
+  ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(0) DESC) AS rank
+FROM
+  fonts,
+  UNNEST(features) AS feature
+GROUP BY
+  client,
+  feature,
+  total
+QUALIFY
+  rank <= 100
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_feature_kerning.sql b/sql/2024/fonts/development/fonts_feature_kerning.sql
new file mode 100644
index 00000000000..f9bfa80868c
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_feature_kerning.sql
@@ -0,0 +1,74 @@
+-- Section: Development
+-- Question: How prevalent is kerning support?
+-- Normalization: Fonts (parsed only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION HAS_KERNING(data STRING)
+RETURNS BOOL
+LANGUAGE js
+AS '''
+try {
+  const $ = JSON.parse(data);
+  const result = new Set();
+  for (const [table, scripts] of Object.entries($)) {
+    for (const [script, languages] of Object.entries(scripts)) {
+      for (const [language, features] of Object.entries(languages)) {
+        features.forEach(feature => result.add(feature));
+      }
+    }
+  }
+  return Array.from(result).includes('kern');
+} catch (e) {
+  return false;
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    date,
+    client,
+    url,
+    (
+      HAS_KERNING(JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.features')) OR
+      IFNULL(
+        REGEXP_CONTAINS(
+          JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.table_sizes'),
+          '(?i)kern'
+        ),
+        FALSE
+      )
+    ) AS support,
+    COUNT(0) OVER (PARTITION BY date, client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND
+    type = 'font' AND
+    is_root_page AND
+    IS_PARSED(payload)
+  GROUP BY
+    date,
+    client,
+    url
+)
+
+SELECT
+  date,
+  client,
+  support,
+  COUNT(DISTINCT url) AS count,
+  total,
+  COUNT(DISTINCT url) / total AS proportion
+FROM
+  fonts
+GROUP BY
+  date,
+  client,
+  support,
+  total
+ORDER BY
+  date,
+  client,
+  support
diff --git a/sql/2024/fonts/development/fonts_feature_opentype.sql b/sql/2024/fonts/development/fonts_feature_opentype.sql
new file mode 100644
index 00000000000..8dc1427126a
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_feature_opentype.sql
@@ -0,0 +1,48 @@
+-- Section: Development
+-- Question: How prevalent is OpenType support?
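+-- Example (hypothetical payload): a table_sizes value such as
+-- '{"GSUB": 1024, "glyf": 2048}' matches the '(?i)GPOS|GSUB' test below,
+-- so the font counts as having OpenType layout support.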
+-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + date, + client, + url, + REGEXP_CONTAINS( + JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.table_sizes'), + '(?i)GPOS|GSUB' + ) AS support, + COUNT(0) OVER (PARTITION BY date, client) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + date, + client, + url +) + +SELECT + date, + client, + support, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion +FROM + fonts +GROUP BY + date, + client, + support, + total +ORDER BY + date, + client, + support diff --git a/sql/2024/fonts/development/fonts_format_outline.sql b/sql/2024/fonts/development/fonts_format_outline.sql new file mode 100644 index 00000000000..a521c96ce92 --- /dev/null +++ b/sql/2024/fonts/development/fonts_format_outline.sql @@ -0,0 +1,49 @@ +-- Section: Development +-- Question: Which outline formats are used? +-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + date, + client, + url, + REGEXP_EXTRACT_ALL( + JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.table_sizes'), + '(?i)(CFF |glyf|SVG|CFF2)' + ) AS formats, + COUNT(0) OVER (PARTITION BY date, client) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + date, + client, + url +) + +SELECT + date, + client, + format, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion +FROM + fonts, + UNNEST(formats) AS format +GROUP BY + date, + client, + format, + total +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_hinting.sql b/sql/2024/fonts/development/fonts_hinting.sql new file mode 100644 index 00000000000..48f21c598e3 --- /dev/null +++ b/sql/2024/fonts/development/fonts_hinting.sql @@ -0,0 +1,58 @@ +-- Section: Development +-- Question: How prevalent is autohinting? 
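+-- Sketch (made-up payloads): table_sizes '{"fpgm": 10, "prep": 4}' makes
+-- IS_HINTED below TRUE, and a names[5] entry such as
+-- 'Version 1.0; ttfautohint (v1.8)' makes IS_AUTOHINTED below TRUE.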
+-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +CREATE TEMPORARY FUNCTION IS_HINTED(payload STRING) AS ( + REGEXP_CONTAINS( + JSON_EXTRACT(payload, '$._font_details.table_sizes'), + '(?i)fpgm|prep' + ) +); + + +CREATE TEMPORARY FUNCTION IS_AUTOHINTED(payload STRING) AS ( + REGEXP_CONTAINS( + JSON_EXTRACT_SCALAR(payload, '$._font_details.names[5]'), + 'autohint' + ) +); + +WITH +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + IS_AUTOHINTED(payload) AS autohinted, + COUNT(DISTINCT page) AS count, + total, + COUNT(DISTINCT page) / total AS proportion +FROM + `httparchive.all.requests` +INNER JOIN + pages USING (client) +WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) AND + IS_HINTED(payload) +GROUP BY + client, + autohinted, + total +ORDER BY + client, + autohinted diff --git a/sql/2024/fonts/development/fonts_variable.sql b/sql/2024/fonts/development/fonts_variable.sql new file mode 100644 index 00000000000..44ff5cd27a4 --- /dev/null +++ b/sql/2024/fonts/development/fonts_variable.sql @@ -0,0 +1,52 @@ +-- Section: Development +-- Question: How popular are variable fonts? +-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS count + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page AND + IS_VARIABLE(payload) + GROUP BY + date, + client +), +pages AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + is_root_page + GROUP BY + date, + client +) + +SELECT + date, + client, + count, + total, + count / total AS proportion +FROM + fonts +JOIN + pages USING (date, client) +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/development/fonts_variable_axis.sql b/sql/2024/fonts/development/fonts_variable_axis.sql new file mode 100644 index 00000000000..293bb614613 --- /dev/null +++ b/sql/2024/fonts/development/fonts_variable_axis.sql @@ -0,0 +1,52 @@ +-- Section: Development +-- Question: Which axes are used in variable fonts? 
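+-- Illustrative call of the AXES helper defined below (made-up fvar JSON):
+--   AXES('{"wght": {"min": 100, "default": 400, "max": 900}}')
+--   -- yields ['wght']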
+-- Normalization: Fonts (variable only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION AXES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+AS '''
+try {
+  return Object.keys(JSON.parse(json));
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    AXES(JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.fvar')) AS axes,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_VARIABLE(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  axis,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  fonts,
+  UNNEST(axes) AS axis
+GROUP BY
+  client,
+  axis,
+  total
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_variable_family.sql b/sql/2024/fonts/development/fonts_variable_family.sql
new file mode 100644
index 00000000000..8e1718a947e
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_variable_family.sql
@@ -0,0 +1,39 @@
+-- Section: Development
+-- Question: Which variable families are used?
+-- Normalization: Requests (variable only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+requests AS (
+  SELECT
+    client,
+    FAMILY(payload) AS family,
+    COUNT(0) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_VARIABLE(payload)
+)
+
+SELECT
+  client,
+  family,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion,
+  ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(0) DESC) AS rank
+FROM
+  requests
+GROUP BY
+  client,
+  family,
+  total
+QUALIFY
+  rank <= 100
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_variable_format.sql b/sql/2024/fonts/development/fonts_variable_format.sql
new file mode 100644
index 00000000000..5d34201aa82
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_variable_format.sql
@@ -0,0 +1,46 @@
+-- Section: Development
+-- Question: Which outline formats are used in variable fonts?
+-- Normalization: Fonts (variable only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+fonts AS (
+  SELECT
+    date,
+    client,
+    url,
+    VARIABLE_FORMATS(ANY_VALUE(payload)) AS formats,
+    COUNT(0) OVER (PARTITION BY date, client) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND
+    type = 'font' AND
+    is_root_page AND
+    IS_VARIABLE(payload)
+  GROUP BY
+    date,
+    client,
+    url
+)
+
+SELECT
+  date,
+  client,
+  format,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  fonts,
+  UNNEST(formats) AS format
+GROUP BY
+  date,
+  client,
+  format,
+  total
+ORDER BY
+  date,
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/fonts_variable_range.sql b/sql/2024/fonts/development/fonts_variable_range.sql
new file mode 100644
index 00000000000..19fca517f2f
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_variable_range.sql
@@ -0,0 +1,67 @@
+-- Section: Development
+-- Question: What are the distributions of axes?
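+-- Illustrative row (made-up fvar JSON): for
+-- '{"wght": {"min": 100, "default": 400, "max": 900}}' the AXES helper
+-- below emits (name 'wght', minimum 100, medium 400, maximum 900), and
+-- the query then takes per-axis quantiles of those values.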
+-- Normalization: Fonts (variable only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION AXES(json STRING)
+RETURNS ARRAY<STRUCT<name STRING, minimum FLOAT64, medium FLOAT64, maximum FLOAT64>>
+LANGUAGE js
+AS '''
+try {
+  const axes = JSON.parse(json);
+  return Object.keys(axes).map((name) => {
+    return {
+      name: name,
+      minimum: axes[name].min,
+      medium: axes[name].default,
+      maximum: axes[name].max
+    };
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    axis.name,
+    ANY_VALUE(axis.minimum) AS minimum,
+    ANY_VALUE(axis.medium) AS medium,
+    ANY_VALUE(axis.maximum) AS maximum
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(AXES(JSON_EXTRACT(payload, '$._font_details.fvar'))) AS axis
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_VARIABLE(payload)
+  GROUP BY
+    client,
+    url,
+    name
+)
+
+SELECT
+  client,
+  name,
+  percentile,
+  COUNT(0) AS count,
+  APPROX_QUANTILES(minimum, 1000)[OFFSET(percentile * 10)] AS minimum,
+  APPROX_QUANTILES(medium, 1000)[OFFSET(percentile * 10)] AS medium,
+  APPROX_QUANTILES(maximum, 1000)[OFFSET(percentile * 10)] AS maximum
+FROM
+  fonts,
+  UNNEST([10, 25, 50, 75, 90, 99]) AS percentile
+GROUP BY
+  client,
+  name,
+  percentile
+ORDER BY
+  client,
+  name,
+  percentile
diff --git a/sql/2024/fonts/development/fonts_variable_service.sql b/sql/2024/fonts/development/fonts_variable_service.sql
new file mode 100644
index 00000000000..aa41632fbc9
--- /dev/null
+++ b/sql/2024/fonts/development/fonts_variable_service.sql
@@ -0,0 +1,46 @@
+-- Section: Development
+-- Question: Who is serving variable fonts?
+-- Normalization: Requests (variable only) and fonts (variable only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+requests AS (
+  SELECT
+    date,
+    client,
+    url,
+    SERVICE(url) AS service,
+    COUNT(0) OVER (PARTITION BY date, client) AS total,
+    COUNT(DISTINCT url) OVER (PARTITION BY date, client) AS total_secondary
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND
+    type = 'font' AND
+    is_root_page AND
+    IS_VARIABLE(payload)
+)
+
+SELECT
+  date,
+  client,
+  service,
+  COUNT(0) AS count,
+  COUNT(DISTINCT url) AS count_secondary,
+  total,
+  total_secondary,
+  COUNT(0) / total AS proportion,
+  COUNT(DISTINCT url) / total_secondary AS proportion_secondary
+FROM
+  requests
+GROUP BY
+  date,
+  client,
+  service,
+  total,
+  total_secondary
+ORDER BY
+  date,
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_family_system.sql b/sql/2024/fonts/development/styles_family_system.sql
new file mode 100644
index 00000000000..d3b3f655ff9
--- /dev/null
+++ b/sql/2024/fonts/development/styles_family_system.sql
@@ -0,0 +1,92 @@
+-- Section: Development
+-- Question: Which system families are popular?
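+-- Example (made-up CSS): `font-family: Arial, sans-serif` contributes
+-- only 'sans-serif' here, because the FAMILIES helper below keeps generic
+-- and system families and drops named ones.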
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION FAMILIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = ["gs://httparchive/lib/css-font-parser.js", "gs://httparchive/lib/css-utils.js"])
+AS '''
+const system = [
+  'cursive',
+  'emoji',
+  'fangsong',
+  'fantasy',
+  'math',
+  'monospace',
+  'sans-serif',
+  'serif',
+  'system-ui',
+  'ui-monospace',
+  'ui-rounded',
+  'ui-sans-serif',
+  'ui-serif'
+];
+
+try {
+  const $ = JSON.parse(json);
+  const result = [];
+  walkDeclarations($, (declaration) => {
+    if (declaration.property.toLowerCase() === 'font-family') {
+      const fonts = parseFontFamilyProperty(declaration.value);
+      if (fonts) {
+        fonts.forEach(font => result.push(font));
+      }
+    } else if (declaration.property.toLowerCase() === 'font') {
+      const value = parseFontProperty(declaration.value);
+      if (value) {
+        value['font-family'].forEach((font) => result.push(font));
+      }
+    }
+  }, {
+    properties: ['font-family', 'font'],
+    rules: (rule) => rule.type.toLowerCase() !== 'font-face'
+  });
+  return result.filter((font) => system.includes(font));
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+families AS (
+  SELECT
+    client,
+    family,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(FAMILIES(css)) AS family
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    family
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  family,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  families
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_feature_control.sql b/sql/2024/fonts/development/styles_feature_control.sql
new file mode 100644
index 00000000000..ce6b236dfb2
--- /dev/null
+++ b/sql/2024/fonts/development/styles_feature_control.sql
@@ -0,0 +1,81 @@
+-- Section: Development
+-- Question: How are features used in CSS?
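+-- Example (illustrative declarations): `font-variant-caps: small-caps`
+-- counts once toward 'font-variant' and `font-feature-settings: "liga" 0`
+-- once toward 'font-feature-settings'; `none` and `normal` values are
+-- ignored by the helper below.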
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION PROPERTIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+function compute(tree) {
+  const result = {};
+  walkDeclarations(tree, ({ property, value }) => {
+    const name = property.toLowerCase();
+    if (
+      name.startsWith('font-variant-') &&
+      value.toLowerCase() !== 'none' &&
+      value.toLowerCase() !== 'normal'
+    ) {
+      incrementByKey(result, 'font-variant');
+    } else if (
+      name === 'font-feature-settings' &&
+      value.toLowerCase() !== 'normal'
+    ) {
+      incrementByKey(result, 'font-feature-settings');
+    }
+  });
+  return sortObject(result);
+}
+
+try {
+  const properties = compute(JSON.parse(json));
+  return Object.entries(properties).flatMap(([name, count]) => {
+    return Array(count).fill(name);
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+properties AS (
+  SELECT
+    client,
+    property,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(PROPERTIES(css)) AS property
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    property
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  property,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  properties
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_font_feature_settings.sql b/sql/2024/fonts/development/styles_font_feature_settings.sql
new file mode 100644
index 00000000000..dfb46330689
--- /dev/null
+++ b/sql/2024/fonts/development/styles_font_feature_settings.sql
@@ -0,0 +1,81 @@
+-- Section: Development
+-- Question: Which features are used via font-feature-settings in CSS?
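+-- Illustrative parse (made-up declaration): outside of @font-face rules,
+--   font-feature-settings: "liga" 1, "frac" on
+-- is turned into the tags ['liga', 'frac'] by the helper below.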
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION FEATURES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS(library = "gs://httparchive/lib/css-utils.js")
+AS '''
+function parseFontFeatureSettings(value) {
+  const features = (value || '').split(/\\s*,\\s*/);
+  const result = []
+  for (let i = 0; i < features.length; i++) {
+    const match = /^"([\u0020-\u007e]{1,4})"(?:\\s+(\\d+|on|off))?$/i.exec(features[i]);
+    if (match) {
+      result.push(match[1]);
+    }
+  }
+  return result;
+}
+
+try {
+  const $ = JSON.parse(json);
+  const result = [];
+  walkDeclarations($, (declaration) => {
+    const tags = parseFontFeatureSettings(declaration.value);
+    if (tags && tags.length) {
+      tags.forEach((tag) => result.push(tag));
+    }
+  }, {
+    properties: 'font-feature-settings',
+    rules: (rule) => rule.type.toLowerCase() !== 'font-face'
+  });
+  return result;
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+features AS (
+  SELECT
+    client,
+    feature,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(FEATURES(css)) AS feature
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    feature
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  feature,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  features
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_font_variable_settings_axis.sql b/sql/2024/fonts/development/styles_font_variable_settings_axis.sql
new file mode 100644
index 00000000000..b87c9dbcc67
--- /dev/null
+++ b/sql/2024/fonts/development/styles_font_variable_settings_axis.sql
@@ -0,0 +1,67 @@
+-- Section: Development
+-- Question: Which axes are used in CSS?
+-- Normalization: Pages (variable only)
+
+CREATE TEMPORARY FUNCTION PROPERTIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+AS '''
+function compute(values, rule) {
+  if ('rules' in rule) {
+    return rule.rules.reduce(compute, values);
+  }
+  if (!('declarations' in rule)) {
+    return values;
+  }
+  return values.concat(
+    rule.declarations
+      .filter((declaration) => declaration.property.toLowerCase() === 'font-variation-settings')
+      .map((declaration) => declaration.value)
+  );
+}
+
+try {
+  const $ = JSON.parse(json);
+  return $.stylesheet.rules.reduce(compute, []);
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+pages AS (
+  SELECT
+    client,
+    page,
+    REGEXP_EXTRACT(chunk, r'''['"]([\w]{4})['"]''') AS axis,
+    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(PROPERTIES(css)) AS property,
+    UNNEST(SPLIT(property, ',')) AS chunk
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    page,
+    axis
+  HAVING
+    axis IS NOT NULL
+)
+
+SELECT
+  client,
+  axis,
+  COUNT(0) AS count,
+  total,
+  COUNT(0) / total AS proportion
+FROM
+  pages
+GROUP BY
+  client,
+  axis,
+  total
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_font_variant.sql b/sql/2024/fonts/development/styles_font_variant.sql
new file mode 100644
index 00000000000..82f35f66055
--- /dev/null
+++ b/sql/2024/fonts/development/styles_font_variant.sql
@@ -0,0 +1,74 @@
+-- Section: Development
+-- Question: Which features are used via font-variant in CSS?
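+-- Example (illustrative): `font-variant: small-caps` is tallied as
+-- 'font-variant: small-caps' and `font-variant-numeric: oldstyle-nums` as
+-- 'font-variant-numeric: oldstyle-nums' by the helper below.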
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION PROPERTIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+function compute(tree) {
+  const result = {};
+  walkDeclarations(tree, ({ property, value }) => {
+    const name = property.toLowerCase();
+    if (name === 'font-variant') {
+      incrementByKey(result, 'font-variant: ' + value)
+    } else if (name.startsWith('font-variant-')) {
+      incrementByKey(result, name + ': ' + value);
+    }
+  });
+  return sortObject(result);
+}
+
+try {
+  const properties = compute(JSON.parse(json));
+  return Object.entries(properties).flatMap(([name, count]) => {
+    return Array(count).fill(name);
+  });
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+properties AS (
+  SELECT
+    client,
+    property,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(PROPERTIES(css)) AS property
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    property
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  property,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  properties
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_metric_override.sql b/sql/2024/fonts/development/styles_metric_override.sql
new file mode 100644
index 00000000000..c4d9fbafa84
--- /dev/null
+++ b/sql/2024/fonts/development/styles_metric_override.sql
@@ -0,0 +1,66 @@
+-- Section: Development
+-- Question: How and how often is metrics override used in CSS?
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION PROPERTIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  const $ = JSON.parse(json);
+  const result = [];
+  walkDeclarations($, (declaration) => {
+    result.push(declaration.property);
+  }, {
+    properties: ['size-adjust', 'ascent-override', 'descent-override', 'line-gap-override'],
+    rules: (rule) => rule.type.toLowerCase() === 'font-face'
+  });
+  return result;
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+properties AS (
+  SELECT
+    client,
+    property,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(PROPERTIES(css)) AS property
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    property
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  property,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  properties
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_smoothing.sql b/sql/2024/fonts/development/styles_smoothing.sql
new file mode 100644
index 00000000000..b2dc5288785
--- /dev/null
+++ b/sql/2024/fonts/development/styles_smoothing.sql
@@ -0,0 +1,68 @@
+-- Section: Development
+-- Question: How and how often is smoothing used in CSS?
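+-- Example (illustrative): a page declaring
+--   -webkit-font-smoothing: antialiased;
+-- is counted under the string '-webkit-font-smoothing: antialiased'.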
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION PROPERTIES(json STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  const $ = JSON.parse(json);
+  const result = [];
+  walkDeclarations($, (declaration) => {
+    result.push(`${declaration.property}: ${declaration.value}`);
+  }, {
+    properties: ['-webkit-font-smoothing', '-moz-osx-font-smoothing', 'font-smooth']
+  });
+  return result;
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+properties AS (
+  SELECT
+    client,
+    property,
+    COUNT(DISTINCT page) AS count,
+    ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS rank
+  FROM
+    `httparchive.all.parsed_css`,
+    UNNEST(PROPERTIES(css)) AS property
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    property
+  QUALIFY
+    rank <= 10
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  property,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  properties
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/development/styles_variable_animation.sql b/sql/2024/fonts/development/styles_variable_animation.sql
new file mode 100644
index 00000000000..109698a6040
--- /dev/null
+++ b/sql/2024/fonts/development/styles_variable_animation.sql
@@ -0,0 +1,78 @@
+-- Section: Development
+-- Question: How popular is variable-font animation in CSS?
+-- Normalization: Pages
+
+CREATE TEMPORARY FUNCTION HAS_ANIMATION(json STRING)
+RETURNS BOOLEAN
+LANGUAGE js
+OPTIONS(library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  const $ = JSON.parse(json);
+  let count = 0;
+  walkRules($, (rule) => {
+    rule.keyframes.forEach((frame) => {
+      count += countDeclarations(
+        frame,
+        {
+          properties: [
+            'font-stretch',
+            'font-style',
+            'font-variation-settings',
+            'font-weight'
+          ]
+        }
+      );
+    });
+  }, {
+    type: 'keyframes'
+  });
+  count += countDeclarations($.stylesheet.rules, {
+    properties: 'transition',
+    values: /font-stretch|font-style|font-variation-settings|font-weight/
+  });
+  return count > 0;
+} catch (e) {
+  return false;
+}
+''';
+
+WITH
+properties AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS count
+  FROM
+    `httparchive.all.parsed_css`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page AND
+    HAS_ANIMATION(css)
+  GROUP BY
+    client
+),
+pages AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  count,
+  total,
+  count / total AS proportion
+FROM
+  properties
+JOIN
+  pages USING (client)
+ORDER BY
+  client,
+  proportion DESC
diff --git a/sql/2024/fonts/performance/fonts.sql b/sql/2024/fonts/performance/fonts.sql
new file mode 100644
index 00000000000..636b5a994e3
--- /dev/null
+++ b/sql/2024/fonts/performance/fonts.sql
@@ -0,0 +1,21 @@
+-- Section: Performance
+-- Question: What is the font usage over time?
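+-- Reading the output (hypothetical numbers): count 9,000,000 font-using
+-- pages out of total 12,000,000 gives proportion 0.75 for that date and
+-- client.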
+-- Normalization: Pages + +SELECT + date, + client, + COUNT(DISTINCT IF(type = 'font', page, NULL)) AS count, + COUNT(DISTINCT page) AS total, + COUNT(DISTINCT IF(type = 'font', page, NULL)) / COUNT(DISTINCT page) AS proportion +FROM + `httparchive.all.requests` +WHERE + date IS NOT NULL AND + is_root_page +GROUP BY + client, + date +ORDER BY + date, + client diff --git a/sql/2024/fonts/performance/fonts_family_by_service.sql b/sql/2024/fonts/performance/fonts_family_by_service.sql new file mode 100644 index 00000000000..3c5f32cb6eb --- /dev/null +++ b/sql/2024/fonts/performance/fonts_family_by_service.sql @@ -0,0 +1,43 @@ +-- Section: Performance +-- Question: Which families are used broken down by service? +-- Normalization: Requests (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + SERVICE(url) AS service, + FAMILY(payload) AS family, + COUNT(0) OVER (PARTITION BY client) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) +) + +SELECT + client, + service, + family, + COUNT(0) AS count, + total, + COUNT(0) / total AS proportion, + ROW_NUMBER() OVER (PARTITION BY client, service ORDER BY COUNT(0) DESC) AS rank +FROM + requests +GROUP BY + client, + service, + family, + total +QUALIFY + rank <= 100 +ORDER BY + client, + service, + proportion DESC diff --git a/sql/2024/fonts/performance/fonts_format_file.sql b/sql/2024/fonts/performance/fonts_format_file.sql new file mode 100644 index 00000000000..7e3421e86e3 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_format_file.sql @@ -0,0 +1,47 @@ +-- Section: Performance +-- Question: Which file formats are used? +-- Normalization: Requests and fonts + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + url, + FILE_FORMAT( + JSON_EXTRACT_SCALAR(summary, '$.ext'), + JSON_EXTRACT_SCALAR(summary, '$.mimeType') + ) AS format, + COUNT(0) OVER (PARTITION BY client) AS total, + COUNT(DISTINCT url) OVER (PARTITION BY client) AS total_secondary + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page +) + +SELECT + client, + format, + COUNT(0) AS count, + COUNT(DISTINCT url) AS count_secondary, + total, + total_secondary, + COUNT(0) / total AS proportion, + COUNT(DISTINCT url) / total_secondary AS proportion_secondary, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(0) DESC) AS rank +FROM + requests +GROUP BY + client, + format, + total, + total_secondary +QUALIFY + rank <= 10 +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/performance/fonts_format_file_by_service.sql b/sql/2024/fonts/performance/fonts_format_file_by_service.sql new file mode 100644 index 00000000000..ab6b61d0369 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_format_file_by_service.sql @@ -0,0 +1,51 @@ +-- Section: Performance +-- Question: Which file formats are used broken down by service? 
+-- Normalization: Requests and fonts + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +requests AS ( + SELECT + client, + url, + SERVICE(url) AS service, + FILE_FORMAT( + JSON_EXTRACT_SCALAR(summary, '$.ext'), + JSON_EXTRACT_SCALAR(summary, '$.mimeType') + ) AS format, + COUNT(0) OVER (PARTITION BY client) AS total, + COUNT(DISTINCT url) OVER (PARTITION BY client) AS total_secondary + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page +) + +SELECT + client, + service, + format, + COUNT(0) AS count, + COUNT(DISTINCT url) AS count_secondary, + total, + total_secondary, + COUNT(0) / total AS proportion, + COUNT(DISTINCT url) / total_secondary AS proportion_secondary, + ROW_NUMBER() OVER (PARTITION BY client, service ORDER BY COUNT(0) DESC) AS rank +FROM + requests +GROUP BY + client, + service, + format, + total, + total_secondary +QUALIFY + rank <= 10 +ORDER BY + client, + service, + proportion DESC diff --git a/sql/2024/fonts/performance/fonts_service.sql b/sql/2024/fonts/performance/fonts_service.sql new file mode 100644 index 00000000000..eedfe4fd99c --- /dev/null +++ b/sql/2024/fonts/performance/fonts_service.sql @@ -0,0 +1,46 @@ +-- Section: Performance +-- Question: Which services are popular? +-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +pages AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + is_root_page + GROUP BY + date, + client +) + +SELECT + date, + client, + SERVICE(url) AS service, + COUNT(DISTINCT page) AS count, + total, + COUNT(DISTINCT page) / total AS proportion +FROM + `httparchive.all.requests` +INNER JOIN + pages USING (date, client) +WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page +GROUP BY + date, + client, + service, + total +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/performance/fonts_services.sql b/sql/2024/fonts/performance/fonts_services.sql new file mode 100644 index 00000000000..b64e28f7ac5 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_services.sql @@ -0,0 +1,67 @@ +-- Section: Performance +-- Question: Which service combinations are popular? 
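+-- Example (illustrative, assuming SERVICE from common.sql labels the
+-- origins 'Google Fonts' and 'self-hosted'): a page using both is counted
+-- once under the combination string 'Google Fonts, self-hosted'.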
+-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +services_1 AS ( + SELECT + date, + client, + page, + STRING_AGG(DISTINCT SERVICE(url), ', ' ORDER BY SERVICE(url)) AS services + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'font' AND + is_root_page + GROUP BY + date, + client, + page +), +services_2 AS ( + SELECT + date, + client, + services, + COUNT(DISTINCT page) AS count + FROM + services_1 + GROUP BY + date, + client, + services +), +pages AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-07-01', '2023-07-01', '2024-07-01') AND + is_root_page + GROUP BY + date, + client +) + +SELECT + date, + client, + services, + count, + total, + count / total AS proportion +FROM + services_2 +JOIN + pages USING (date, client) +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/performance/fonts_size.sql b/sql/2024/fonts/performance/fonts_size.sql new file mode 100644 index 00000000000..061d7b75043 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_size.sql @@ -0,0 +1,38 @@ +-- Section: Performance +-- Question: What is the distribution of the file size? +-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + client, + url, + PARSE_NUMERIC(JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.respBodySize')) AS size + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + client, + url +) + +SELECT + client, + percentile, + COUNT(0) AS count, + ROUND(APPROX_QUANTILES(size, 1000)[OFFSET(percentile * 10)]) AS size +FROM + fonts, + UNNEST([10, 25, 50, 75, 90, 99]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/fonts/performance/fonts_size_by_country.sql b/sql/2024/fonts/performance/fonts_size_by_country.sql new file mode 100644 index 00000000000..1291662ad54 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_size_by_country.sql @@ -0,0 +1,50 @@ +-- Section: Performance +-- Question: What is the distribution of the file size broken down by country? 
+-- Normalization: Requests (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +countries AS ( + SELECT + IF(device = 'desktop', 'desktop', 'mobile') AS client, + NET.HOST(origin) AS domain, + `chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS country + FROM + `chrome-ux-report.materialized.country_summary` + WHERE + yyyymm = 202407 + GROUP BY + client, + domain, + country +), +requests AS ( + SELECT + client, + NET.HOST(page) AS domain, + PARSE_NUMERIC(JSON_EXTRACT_SCALAR(summary, '$.respBodySize')) AS size + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) +) + +SELECT + client, + country, + COUNT(0) AS count, + ROUND(APPROX_QUANTILES(size, 1000)[OFFSET(500)]) AS size +FROM + requests +INNER JOIN + countries USING (client, domain) +GROUP BY + client, + country +ORDER BY + client, + country diff --git a/sql/2024/fonts/performance/fonts_size_by_format.sql b/sql/2024/fonts/performance/fonts_size_by_format.sql new file mode 100644 index 00000000000..d527b738221 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_size_by_format.sql @@ -0,0 +1,61 @@ +-- Section: Performance +-- Question: What is the distribution of the file size broken down by format? +-- Normalization: Fonts (parsed only) + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +WITH +fonts AS ( + SELECT + client, + url, + FILE_FORMAT( + JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.ext'), + JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.mimeType') + ) AS format, + PARSE_NUMERIC(JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.respBodySize')) AS size + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + type = 'font' AND + is_root_page AND + IS_PARSED(payload) + GROUP BY + client, + url +), +formats AS ( + SELECT + client, + format, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT url) DESC) AS rank + FROM + fonts + GROUP BY + client, + format +) + +SELECT + client, + format, + percentile, + COUNT(DISTINCT url) AS count, + ROUND(APPROX_QUANTILES(size, 1000)[OFFSET(percentile * 10)]) AS size +FROM + fonts, + UNNEST([10, 25, 50, 75, 90, 99]) AS percentile +INNER JOIN + formats USING (client, format) +WHERE + rank <= 10 +GROUP BY + client, + format, + rank, + percentile +ORDER BY + client, + rank, + percentile diff --git a/sql/2024/fonts/performance/fonts_size_by_service.sql b/sql/2024/fonts/performance/fonts_size_by_service.sql new file mode 100644 index 00000000000..8f253dbfec6 --- /dev/null +++ b/sql/2024/fonts/performance/fonts_size_by_service.sql @@ -0,0 +1,67 @@ +-- Section: Performance +-- Question: What is the distribution of the file size broken down by service? 
+-- Normalization: Fonts (parsed only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    SERVICE(url) AS service,
+    FILE_FORMAT(
+      JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.ext'),
+      JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.mimeType')
+    ) AS format,
+    PARSE_NUMERIC(JSON_EXTRACT_SCALAR(ANY_VALUE(summary), '$.respBodySize')) AS size
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_PARSED(payload)
+  GROUP BY
+    client,
+    url
+),
+formats AS (
+  SELECT
+    client,
+    service,
+    format,
+    ROW_NUMBER() OVER (PARTITION BY client, service ORDER BY COUNT(DISTINCT url) DESC) AS rank
+  FROM
+    fonts
+  GROUP BY
+    client,
+    service,
+    format
+)
+
+SELECT
+  client,
+  service,
+  format,
+  percentile,
+  COUNT(DISTINCT url) AS count,
+  ROUND(APPROX_QUANTILES(size, 1000)[OFFSET(percentile * 10)]) AS size
+FROM
+  fonts,
+  UNNEST([10, 25, 50, 75, 90, 99]) AS percentile
+INNER JOIN
+  formats USING (client, service, format)
+WHERE
+  rank <= 10
+GROUP BY
+  client,
+  service,
+  format,
+  rank,
+  percentile
+ORDER BY
+  client,
+  service,
+  rank,
+  percentile
diff --git a/sql/2024/fonts/performance/fonts_size_by_table.sql b/sql/2024/fonts/performance/fonts_size_by_table.sql
new file mode 100644
index 00000000000..13c67de7d87
--- /dev/null
+++ b/sql/2024/fonts/performance/fonts_size_by_table.sql
@@ -0,0 +1,56 @@
+-- Section: Performance
+-- Question: What is the distribution of the file size broken down by table?
+-- Normalization: Fonts (parsed only)
+
+-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql
+
+CREATE TEMPORARY FUNCTION TABLES(json STRING)
+RETURNS ARRAY<STRUCT<name STRING, size FLOAT64>>
+LANGUAGE js AS '''
+try {
+  const $ = JSON.parse(json);
+  return Object.entries($).map(([name, size]) => ({ name, size }));
+} catch (e) {
+  return [];
+}
+''';
+
+WITH
+fonts AS (
+  SELECT
+    client,
+    url,
+    TABLES(JSON_EXTRACT(ANY_VALUE(payload), '$._font_details.table_sizes')) AS tables
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-07-01' AND
+    type = 'font' AND
+    is_root_page AND
+    IS_PARSED(payload)
+  GROUP BY
+    client,
+    url
+)
+
+SELECT
+  client,
+  table.name AS table,
+  percentile,
+  COUNT(0) AS count,
+  ROUND(APPROX_QUANTILES(size, 1000)[OFFSET(percentile * 10)]) AS size
+FROM
+  fonts,
+  UNNEST(tables) AS table,
+  UNNEST([10, 25, 50, 75, 90, 99]) AS percentile
+GROUP BY
+  client,
+  table,
+  percentile
+HAVING
+  -- Filter out spurious tables.
+  count > 1000
+ORDER BY
+  client,
+  table,
+  percentile
diff --git a/sql/2024/fonts/performance/pages_link_relationship.sql b/sql/2024/fonts/performance/pages_link_relationship.sql
new file mode 100644
index 00000000000..838d07af0d2
--- /dev/null
+++ b/sql/2024/fonts/performance/pages_link_relationship.sql
@@ -0,0 +1,94 @@
+-- Section: Performance
+-- Question: What is the usage of link relationship in HTML?
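+-- Illustrative node (made-up): a link-nodes entry
+--   { "rel": "preload", "as": "font", "href": "/f.woff2" }
+-- becomes (name 'preload', type 'font', url '/f.woff2') via the HINTS
+-- helper below.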
+-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +CREATE TEMPORARY FUNCTION HINTS(json STRING) +RETURNS ARRAY> +LANGUAGE js AS ''' +const names = new Set([ + 'dns-prefetch', + 'preconnect', + 'prefetch', + 'preload', +]); +try { + const $ = JSON.parse(json); + return $.almanac['link-nodes'].nodes.reduce((results, node) => { + const name = node.rel.toLowerCase(); + if (names.has(name)) { + results.push({ + 'name': name, + 'type': node.as, + 'url': node.href + }); + } + return results; + }, []); +} catch (e) { + return []; +} +'''; + +WITH +hints AS ( + SELECT + pages.date, + pages.client, + hint.name AS hint, + COUNT(DISTINCT pages.page) AS count + FROM + `httparchive.all.pages` AS pages, + UNNEST(HINTS(custom_metrics)) AS hint + LEFT JOIN + `httparchive.all.requests` AS requests + ON + requests.date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND + requests.type = 'font' AND + requests.is_root_page AND + pages.page = requests.page AND + hint.url = requests.url + WHERE + pages.date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND + pages.is_root_page AND + ( + requests.url IS NOT NULL OR + LOWER(hint.type) = 'font' OR + SERVICE(hint.url) != 'self-hosted' + ) + GROUP BY + date, + client, + hint +), +pages AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.pages` + WHERE + date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND + is_root_page + GROUP BY + date, + client +) + +SELECT + date, + client, + hint, + count, + total, + count / total AS proportion +FROM + hints +LEFT JOIN + pages USING (date, client) +ORDER BY + date, + client, + proportion DESC diff --git a/sql/2024/fonts/performance/scripts_font_face.sql b/sql/2024/fonts/performance/scripts_font_face.sql new file mode 100644 index 00000000000..0e7bc6998e8 --- /dev/null +++ b/sql/2024/fonts/performance/scripts_font_face.sql @@ -0,0 +1,49 @@ +-- Section: Performance +-- Question: What is the usage of FontFace in JavaScript? +-- Normalization: Pages + +WITH +scripts AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS count + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND + type = 'script' AND + is_root_page AND + REGEXP_CONTAINS(response_body, r'new FontFace\(') + GROUP BY + date, + client +), +pages AS ( + SELECT + date, + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date IN ('2022-06-01', '2022-07-01', '2023-07-01', '2024-07-01') AND + is_root_page + GROUP BY + client, + date +) + +SELECT + date, + client, + count, + total, + count / total AS proportion +FROM + scripts +JOIN + pages USING (date, client) +ORDER BY + date, + client diff --git a/sql/2024/fonts/performance/styles_font_display.sql b/sql/2024/fonts/performance/styles_font_display.sql new file mode 100644 index 00000000000..9823ea1b54f --- /dev/null +++ b/sql/2024/fonts/performance/styles_font_display.sql @@ -0,0 +1,68 @@ +-- Section: Performance +-- Question: What is the usage of font-display in CSS? 
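+--
+-- Note: scripts_font_face.sql above detects the CSS Font Loading API by
+-- scanning raw script bodies with REGEXP_CONTAINS(response_body,
+-- r'new FontFace\('). This is a plain source-text heuristic, so calls made
+-- through an alias (common after minification) are not counted. A quick,
+-- illustrative check of the pattern itself:
+--
+--   SELECT s, REGEXP_CONTAINS(s, r'new FontFace\(') AS matched
+--   FROM UNNEST(["const f = new FontFace('x', u);",   -- matched: true
+--                "const F = FontFace; new F('x', u);"  -- matched: false
+--   ]) AS s
+--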
+-- Normalization: Pages + +CREATE TEMPORARY FUNCTION PROPERTIES(json STRING) +RETURNS ARRAY +LANGUAGE js +OPTIONS(library = "gs://httparchive/lib/css-utils.js") +AS ''' +try { + const values = ['auto', 'block', 'fallback', 'optional', 'swap']; + const $ = JSON.parse(json); + const result = []; + walkDeclarations($, (declaration) => { + const value = declaration.value.toLowerCase(); + result.push(values.find((other) => value.includes(other)) || 'other'); + }, { + properties: 'font-display', + rules: (rule) => rule.type.toLowerCase() === 'font-face' + }); + return result; +} catch (e) { + return []; +} +'''; + +WITH +properties AS ( + SELECT + client, + NULLIF(property, 'other') AS property, + COUNT(DISTINCT page) AS count + FROM + `httparchive.all.parsed_css`, + UNNEST(PROPERTIES(css)) AS property + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client, + property +), +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + property, + count, + total, + count / total AS proportion +FROM + properties +JOIN + pages USING (client) +ORDER BY + client, + proportion DESC diff --git a/sql/2024/fonts/performance/styles_font_display_by_family.sql b/sql/2024/fonts/performance/styles_font_display_by_family.sql new file mode 100644 index 00000000000..4363e5201bb --- /dev/null +++ b/sql/2024/fonts/performance/styles_font_display_by_family.sql @@ -0,0 +1,94 @@ +-- Section: Performance +-- Question: What is the usage of font-display in CSS broken down by family? +-- Normalization: Pages + +-- INCLUDE https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/sql/2024/fonts/common.sql + +CREATE TEMPORARY FUNCTION PROPERTIES(json STRING) +RETURNS ARRAY> +LANGUAGE js +OPTIONS (library = ["gs://httparchive/lib/css-font-parser.js", "gs://httparchive/lib/css-utils.js"]) +AS ''' +try { + const values = ['auto', 'block', 'fallback', 'optional', 'swap']; + const $ = JSON.parse(json); + const result = []; + walkRules($, (rule) => { + let found = false; + let family = undefined; + let display = undefined; + for (const declaration of rule.declarations) { + const name = declaration.property.toLowerCase(); + if (name === 'font-family') { + family = parseFontFamilyProperty(declaration.value)[0]; + } + if (name === 'font-display') { + found = true; + const value = declaration.value.toLowerCase(); + display = values.find((other) => value.includes(other)); + } + if (family && display) { + break; + } + } + if (found) { + result.push({ family, display }); + } + }, { + type: 'font-face' + }); + return result; +} catch (e) { + return []; +} +'''; + +WITH +properties AS ( + SELECT + client, + display AS property, + FAMILY_INNER(family) AS family, + COUNT(DISTINCT page) AS count, + ROW_NUMBER() OVER (PARTITION BY client, display ORDER BY COUNT(DISTINCT page) DESC) AS rank + FROM + `httparchive.all.parsed_css`, + UNNEST(PROPERTIES(css)) AS property + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client, + property, + family + QUALIFY + rank <= 10 +), +pages AS ( + SELECT + client, + COUNT(DISTINCT page) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-07-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + property, + family, + count, + total, + count / total AS proportion +FROM + properties +JOIN + pages USING (client) +ORDER BY + client, + property, + proportion DESC From 5bdb5233278c4df287dd05ff2a693fa2713b52d8 Mon Sep 17 
00:00:00 2001 From: Mike Gifford Date: Wed, 30 Oct 2024 07:30:11 -0400 Subject: [PATCH 07/15] Accessibility 2024: Queries (#3749) * Create a11y_overall_tech_usage_by_domain_rank.sql This code was developed by Barry Pollard while he was showing me the ropes. * Update a11y_overall_tech_usage_by_domain_rank.sql Attempting to fix lint errors * Create a11y_technology_usage.sql This seems to work. * Create a11y_technology_usage_by_domain_rank.sql Seems to work.... * Update a11y_technology_usage.sql Remove white space * Create alt_ending_in_image_extension.sql Seems to work.... * Create anchors_with_role_button.sql Looks good against rank 1000 * Create audio_track_usage.sql Produces results.... * Create button_name_sources.sql Data looks good * Create captcha_usage.sql Working... * Create color_contrast.sql I'm guessing that the JSON_VALUE is the same as JSON_EXTRACT_SCALAR * Create common_alt_text_length.sql Seems to work.. * Create common_aria_role.sql * Create common_element_attributes.sql Looks good.. * Create form_input_name_sources.sql Seems to work. * Create form_required_controls.sql Seems to work. * Update a11y_overall_tech_usage_by_domain_rank.sql Incorrect spacing. * Update alt_ending_in_image_extension.sql Spacing... * Update button_name_sources.sql Spacing. * Create landmark_elements_and_roles.sql Seems to work * Create page_title.sql Seems to work * Create pages_with_search_input.sql working... * Create placeholder_but_no_label.sql Working * Create sites_using_role.sql updated.... * Create skip_links.sql updated.. * Create sr_only_classes.sql I think this probably could be extended to include other screen reader classes which are common. * Create tabindex_usage_and_values.sql Updated. * Create table_stats.sql Updated * Create valid_html_lang.sql working... * Create video_track_usage.sql * Create viewport_zoom_scale.sql working.. * Create viewport_zoom_scale_by_domain_rank.sql done * Update sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql Thanks! Co-authored-by: Barry Pollard * Update sql/2024/accessibility/alt_ending_in_image_extension.sql Thanks. Co-authored-by: Barry Pollard * Update viewport_zoom_scale_by_domain_rank.sql removing white space * Create lighthouse_a11y_audits.sql Thanks to @tunetheweb in Slack. Modified to make it not for testing purposes. * Create lighthouse_a11y_score.sql I think this works... * Update video_track_usage.sql Fixing spacing * Update a11y_overall_tech_usage_by_domain_rank.sql Including is_root_page in query. 
* Update a11y_technology_usage.sql adding is_root_page * Update a11y_technology_usage_by_domain_rank.sql * Update alt_ending_in_image_extension.sql adding is_root_page * Update anchors_with_role_button.sql * Update audio_track_usage.sql is_root_page * Update button_name_sources.sql is_root_page, * Update captcha_usage.sql is_root_page * Update color_contrast.sql is_root_page * Update common_alt_text_length.sql is_root_page * Update common_aria_role.sql is_root_page * Update common_element_attributes.sql is_root_page * Update form_input_name_sources.sql is_root_page * Update form_required_controls.sql is_root_page * Update landmark_elements_and_roles.sql is_root_page * Update lighthouse_a11y_score.sql is_root_page * Update page_title.sql is_root_page * Update pages_with_search_input.sql is_root_page * Update placeholder_but_no_label.sql is_root_page * Update sites_using_role.sql is_root_page * Update skip_links.sql is_root_page * Update sr_only_classes.sql is_root_page * Update tabindex_usage_and_values.sql is_root_page * Update table_stats.sql is_root_page * Update valid_html_lang.sql is_root_page * Update video_track_usage.sql is_root_page * Update viewport_zoom_scale.sql is_root_page * Update viewport_zoom_scale_by_domain_rank.sql is_root_page * Update viewport_zoom_scale.sql removing spaces * Update valid_html_lang.sql removing spaces * Update placeholder_but_no_label.sql Spaces * Update lighthouse_a11y_score.sql spaces * Update a11y_technology_usage.sql spaces * Create a11y_frontend_technology.sql * Create focus_outline_0.sql This seems to work. * Update a11y_frontend_technology.sql white spaces * Update a11y_frontend_technology.sql Linting * Create focus_visible.sql Adding focus visible check * Update focus_visible.sql Updating the year. * Update focus_visible.sql this seems to work better. * Update focus_visible.sql Fixing linting * Create media_query_features.sql Seems to be working. * Create units_properties.sql Seems to be working * Create lighthouse_per_cms.sql * Rename lighthouse_per_cms.sql to lighthouse_score_by_cms.sql title change * Update lighthouse_score_by_cms.sql Adding back in best practices. * Update lighthouse_score_by_cms.sql * Update lighthouse_score_by_cms.sql removing rounding and commented out code * Update lighthouse_score_by_cms.sql removing round and fixing best-practices * Create lighthouse_score_by_government.sql Adding country specific scans. * Rename lighthouse_score_by_country.sql to lighthouse_score_by_government.sql for consistency * Update lighthouse_score_by_government.sql Adding more governments. * Update lighthouse_score_by_government.sql Adding US States * Create lighthouse_by_frontend_framework.sql * Rename lighthouse_by_frontend_framework.sql to lighthouse_score_by_frontend.sql title update * Update lighthouse_score_by_government.sql white space * Update lighthouse_score_by_frontend.sql white space & linting * Update lighthouse_score_by_cms.sql white space * Update focus_outline_0.sql expanding the returned results * Update lighthouse_score_by_frontend.sql linting * Update lighthouse_score_by_government.sql Linting * Update lighthouse_score_by_government.sql Linting * Update landmark_elements_and_roles.sql Updated query * Update landmark_elements_and_roles.sql linting - white space * Update lighthouse_score_by_government.sql Trying to standardize count to address linting issue. * Update common_aria_role.sql Update to make sure we're getting percentages. 
* Update common_element_attributes.sql updating for percentages
* Update a11y_technology_usage_by_domain_rank.sql Query that is producing better results.
* Update a11y_technology_usage_by_domain_rank.sql Linting
* Update landmark_elements_and_roles.sql Fixed query - Thanks to @tunetheweb
* Update a11y_technology_usage_by_domain_rank.sql spaces
* Create lighthouse_score_by_country.sql Adding country references
* Create lighthouse_score_by_tld.sql Adding query by TLD
* Update lighthouse_score_by_government.sql Making query case insensitive
* Update lighthouse_score_by_government.sql Adding Mass.gov
* Update lighthouse_a11y_score.sql The value was still rounded so lost valuable info
* Update lighthouse_a11y_score.sql Avoiding approximate values for the return.
* Update lighthouse_score_by_tld.sql Linting
* Update lighthouse_a11y_score.sql Linting
* Create lighthouse_a11y_audits_by_cms.sql Query to highlight the types of errors produced by popular CMSs.
* Update lighthouse_a11y_score.sql Linting
* Update a11y_technology_usage_by_domain_rank.sql Linting
* Update sql/2024/accessibility/lighthouse_score_by_government.sql Double quotes... Co-authored-by: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com>
* Update viewport_zoom_scale_by_domain_rank.sql Updating SQL to get rank right.
* Update lighthouse_score_by_government.sql Some simple name changes.
* Update lighthouse_score_by_government.sql Including secondary pages.
* Create lighthouse_a11y_by_government_with_urls.sql Adding new SQL to check results of query
* Update and rename lighthouse_a11y_by_government_with_urls.sql to lighthouse_a11y_score_by_government_with_urls.sql Finding the missing countries...
* Rename lighthouse_a11y_score_by_government_with_urls.sql to lighthouse_score_by_government_with_urls.sql renaming
* Update lighthouse_score_by_government.sql Updating query
* Update lighthouse_score_by_government.sql Reducing errors like https://www.stats.gov.nl.ca/ landing in the Netherlands rather than Canada.
* Update lighthouse_score_by_government.sql Correcting Belgium's domains
* Update lighthouse_score_by_government.sql Updating Finland.
* Update lighthouse_score_by_government.sql Adding Lithuania
* Update lighthouse_score_by_government.sql Updating the Netherlands
* Update lighthouse_score_by_government.sql Adding Swedish domains.
* Update lighthouse_score_by_government.sql Updating where statement.
* Update lighthouse_score_by_government.sql Adding more UN organizations.
* Update lighthouse_score_by_government.sql Optimized WHERE clause.
* Update lighthouse_score_by_government.sql Updating where
* Update lighthouse_score_by_government.sql Update to where.
* Update lighthouse_score_by_government.sql More flexible query.
* Update lighthouse_score_by_government.sql Spacing for linting
* Update lighthouse_score_by_government_with_urls.sql Standardizing government queries.
* Update viewport_zoom_scale_by_domain_rank.sql Linting problem
* Update lighthouse_score_by_government_with_urls.sql Checking linter..
* Update lighthouse_score_by_government.sql Correcting parse error
* Update viewport_zoom_scale_by_domain_rank.sql linting
* Update viewport_zoom_scale_by_domain_rank.sql Linting
* Update lighthouse_score_by_government.sql More consistency
* Update lighthouse_score_by_government_with_urls.sql Consistency in domains...
* Update lighthouse_score_by_government_with_urls.sql adding more countries
* Update lighthouse_score_by_government.sql Adding other countries
* Update lighthouse_score_by_government_with_urls.sql Trying to further maximize the number of government domains returned and sorted.
* Update lighthouse_score_by_government_with_urls.sql Linting
* Update lighthouse_score_by_government.sql Updating URL to keep up with maximizing URLs.
* Update lighthouse_score_by_government_with_urls.sql Updating list of government sites.
* Update lighthouse_score_by_government.sql Updating lists of government sites.
* Update lighthouse_score_by_government_with_urls.sql More Finnish governments
* Update lighthouse_score_by_government_with_urls.sql More Finland domains.
* Update lighthouse_score_by_government_with_urls.sql adding more European country exceptions.
* Update lighthouse_score_by_government_with_urls.sql Updating country listings.
* Update lighthouse_score_by_government_with_urls.sql Updates for Canadian provinces
* Update lighthouse_score_by_government_with_urls.sql Improvements in the categorization.
* Update lighthouse_score_by_government_with_urls.sql Adding Luxembourg domain
* Update lighthouse_score_by_government_with_urls.sql Adding Andorra
* Update lighthouse_score_by_government_with_urls.sql Linting issues
* Update lighthouse_score_by_government.sql Updating domains for consistency
* Update lighthouse_score_by_government.sql Including unique identifier for the URL/test
* Update lighthouse_score_by_government.sql Not useful in an average
* Update lighthouse_score_by_government_with_urls.sql Adding wptid Including correlations for all of the world's countries.
* Update lighthouse_score_by_government_with_urls.sql Making the query more consistent
* Update lighthouse_score_by_government_with_urls.sql removing duplicates
* Update lighthouse_score_by_government_with_urls.sql Cleaner URLs
* Update lighthouse_score_by_government_with_urls.sql Updated countries....
* Update lighthouse_score_by_government_with_urls.sql Finally with the "Other" not showing up for all of the domains.... REGEX problems...
* Update lighthouse_score_by_government.sql Updates to improve country coverage
* Update lighthouse_score_by_government.sql Adding Norway and Greenland
* Update lighthouse_score_by_government_with_urls.sql Adding Norway and Greenland
* Update lighthouse_score_by_government_with_urls.sql Adding Iran and note about https://state.mn.us/ coverage too.
* Update lighthouse_score_by_government_with_urls.sql Linting issues
* Update lighthouse_score_by_government.sql Linting
* Update lighthouse_score_by_government.sql Linting
* Update lighthouse_score_by_government_with_urls.sql Linting
* Update lighthouse_score_by_government_with_urls.sql Linting...
* Update lighthouse_score_by_government_with_urls.sql spacing (hopefully the last)
* Update lighthouse_score_by_government.sql Final space hopefully
* Update button_name_sources.sql Removing is_root_page so that it is easier to calculate a comparable value.
* Update alt_ending_in_image_extension.sql Thanks to @tunetheweb for this.
* Update common_alt_text_length.sql --------- Co-authored-by: Barry Pollard Co-authored-by: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> --- .../a11y_frontend_technology.sql | 50 ++ ...a11y_overall_tech_usage_by_domain_rank.sql | 56 ++ .../accessibility/a11y_technology_usage.sql | 21 + .../a11y_technology_usage_by_domain_rank.sql | 71 +++ .../alt_ending_in_image_extension.sql | 76 +++ .../anchors_with_role_button.sql | 25 + sql/2024/accessibility/audio_track_usage.sql | 27 + .../accessibility/button_name_sources.sql | 73 +++ sql/2024/accessibility/captcha_usage.sql | 24 + sql/2024/accessibility/color_contrast.sql | 26 + .../accessibility/common_alt_text_length.sql | 43 ++ sql/2024/accessibility/common_aria_role.sql | 64 ++ .../common_element_attributes.sql | 62 ++ sql/2024/accessibility/focus_outline_0.sql | 78 +++ sql/2024/accessibility/focus_visible.sql | 85 +++ .../accessibility/form_input_name_sources.sql | 71 +++ .../accessibility/form_required_controls.sql | 113 ++++ .../landmark_elements_and_roles.sql | 123 ++++ .../accessibility/lighthouse_a11y_audits.sql | 52 ++ .../lighthouse_a11y_audits_by_cms.sql | 95 +++ .../accessibility/lighthouse_a11y_score.sql | 55 ++ .../accessibility/lighthouse_score_by_cms.sql | 38 ++ .../lighthouse_score_by_country.sql | 47 ++ .../lighthouse_score_by_frontend.sql | 51 ++ .../lighthouse_score_by_government.sql | 571 +++++++++++++++++ ...ghthouse_score_by_government_with_urls.sql | 580 ++++++++++++++++++ .../accessibility/lighthouse_score_by_tld.sql | 32 + .../accessibility/media_query_features.sql | 83 +++ sql/2024/accessibility/page_title.sql | 26 + .../accessibility/pages_with_search_input.sql | 51 ++ .../placeholder_but_no_label.sql | 31 + sql/2024/accessibility/sites_using_role.sql | 30 + sql/2024/accessibility/skip_links.sql | 26 + sql/2024/accessibility/sr_only_classes.sql | 22 + .../tabindex_usage_and_values.sql | 55 ++ sql/2024/accessibility/table_stats.sql | 37 ++ sql/2024/accessibility/units_properties.sql | 138 +++++ sql/2024/accessibility/valid_html_lang.sql | 28 + sql/2024/accessibility/video_track_usage.sql | 27 + .../accessibility/viewport_zoom_scale.sql | 37 ++ .../viewport_zoom_scale_by_domain_rank.sql | 69 +++ 41 files changed, 3269 insertions(+) create mode 100644 sql/2024/accessibility/a11y_frontend_technology.sql create mode 100644 sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql create mode 100644 sql/2024/accessibility/a11y_technology_usage.sql create mode 100644 sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql create mode 100644 sql/2024/accessibility/alt_ending_in_image_extension.sql create mode 100644 sql/2024/accessibility/anchors_with_role_button.sql create mode 100644 sql/2024/accessibility/audio_track_usage.sql create mode 100644 sql/2024/accessibility/button_name_sources.sql create mode 100644 sql/2024/accessibility/captcha_usage.sql create mode 100644 sql/2024/accessibility/color_contrast.sql create mode 100644 sql/2024/accessibility/common_alt_text_length.sql create mode 100644 sql/2024/accessibility/common_aria_role.sql create mode 100644 sql/2024/accessibility/common_element_attributes.sql create mode 100644 sql/2024/accessibility/focus_outline_0.sql create mode 100644 sql/2024/accessibility/focus_visible.sql create mode 100644 sql/2024/accessibility/form_input_name_sources.sql create mode 100644 sql/2024/accessibility/form_required_controls.sql create mode 100644 sql/2024/accessibility/landmark_elements_and_roles.sql create mode 100644 
sql/2024/accessibility/lighthouse_a11y_audits.sql create mode 100644 sql/2024/accessibility/lighthouse_a11y_audits_by_cms.sql create mode 100644 sql/2024/accessibility/lighthouse_a11y_score.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_cms.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_country.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_frontend.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_government.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_government_with_urls.sql create mode 100644 sql/2024/accessibility/lighthouse_score_by_tld.sql create mode 100644 sql/2024/accessibility/media_query_features.sql create mode 100644 sql/2024/accessibility/page_title.sql create mode 100644 sql/2024/accessibility/pages_with_search_input.sql create mode 100644 sql/2024/accessibility/placeholder_but_no_label.sql create mode 100644 sql/2024/accessibility/sites_using_role.sql create mode 100644 sql/2024/accessibility/skip_links.sql create mode 100644 sql/2024/accessibility/sr_only_classes.sql create mode 100644 sql/2024/accessibility/tabindex_usage_and_values.sql create mode 100644 sql/2024/accessibility/table_stats.sql create mode 100644 sql/2024/accessibility/units_properties.sql create mode 100644 sql/2024/accessibility/valid_html_lang.sql create mode 100644 sql/2024/accessibility/video_track_usage.sql create mode 100644 sql/2024/accessibility/viewport_zoom_scale.sql create mode 100644 sql/2024/accessibility/viewport_zoom_scale_by_domain_rank.sql diff --git a/sql/2024/accessibility/a11y_frontend_technology.sql b/sql/2024/accessibility/a11y_frontend_technology.sql new file mode 100644 index 00000000000..7c197f56fe9 --- /dev/null +++ b/sql/2024/accessibility/a11y_frontend_technology.sql @@ -0,0 +1,50 @@ +WITH score_data AS ( + SELECT + client, + page, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score, + t.technology AS framework + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' AND + is_root_page = TRUE AND + ('Web frameworks' IN UNNEST(t.categories) OR 'JavaScript libraries' IN UNNEST(t.categories) OR 'Frontend frameworks' IN UNNEST(t.categories) OR 'JavaScript frameworks' IN UNNEST(t.categories)) AND + t.technology IS NOT NULL +) + +SELECT + client, + framework, + AVG(performance_score) AS avg_performance_score, + AVG(accessibility_score) AS avg_accessibility_score, + AVG(best_practices_score) AS avg_best_practices_score, + AVG(seo_score) AS avg_seo_score, + COUNT(DISTINCT page) AS total_pages +FROM ( + SELECT + client, + page, + framework, + AVG(performance_score) AS performance_score, # All scores are the same for one page (we have multiple rows due to unnest), we could also take the first instead of the average + AVG(accessibility_score) AS accessibility_score, + AVG(best_practices_score) AS best_practices_score, + AVG(seo_score) AS seo_score + FROM + score_data + GROUP BY + client, + page, + framework + ) +GROUP BY + client, + framework +ORDER BY + total_pages DESC; diff --git 
a/sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql b/sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql new file mode 100644 index 00000000000..ffa4a24d47c --- /dev/null +++ b/sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql @@ -0,0 +1,56 @@ +#standardSQL +# Overall Accessibility (A11y) technology, ie. Overlays, usage by domain rank + +# Main SELECT statement to aggregate results by client and rank grouping. +SELECT + client, + is_root_page, + rank_grouping, # Grouping of domains by their rank (e.g., top 1000, top 10000, etc.) + total_in_rank, # Total number of sites within the rank grouping + COUNT(DISTINCT page) AS sites_with_a11y_tech, # Number of unique sites that use accessibility technology + COUNT(DISTINCT page) / total_in_rank AS pct_sites_with_a11y_tech # Percentage of sites using accessibility technology within the rank grouping +FROM + ( + # Subquery to filter and extract relevant pages with A11Y technology + SELECT DISTINCT + client, + is_root_page, + page, + rank_grouping, + category + FROM + `httparchive.all.pages`, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping, # Expand rank_grouping to cover different rank categories + UNNEST(technologies) AS tech, + UNNEST(categories) AS category + WHERE + date = '2024-06-01' AND + category = 'Accessibility' AND + rank <= rank_grouping # Include only sites within the specified rank grouping + ) +JOIN + ( + # Subquery to count total sites in each rank grouping for each client + SELECT + client, + rank_grouping, + COUNT(0) AS total_in_rank + FROM + `httparchive.all.pages`, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping + WHERE + date = '2024-06-01' AND + rank <= rank_grouping + GROUP BY + client, + rank_grouping + ) USING (client, rank_grouping) +GROUP BY + client, + is_root_page, + rank_grouping, + total_in_rank +ORDER BY + client, + is_root_page, + rank_grouping diff --git a/sql/2024/accessibility/a11y_technology_usage.sql b/sql/2024/accessibility/a11y_technology_usage.sql new file mode 100644 index 00000000000..81a73a4abd7 --- /dev/null +++ b/sql/2024/accessibility/a11y_technology_usage.sql @@ -0,0 +1,21 @@ +#standardSQL +# Accessibility (A11y) technology, ie. 
Overlays, usage by client + +SELECT + client, # Client domain + is_root_page, + COUNT(DISTINCT page) AS total_sites, # Total number of unique sites for the client + COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) AS sites_with_a11y_tech, # Number of unique sites that use accessibility technology + COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) / COUNT(DISTINCT page) AS pct_sites_with_a11y_tech # Percentage of sites using accessibility technology +FROM + `httparchive.all.pages`, + UNNEST(technologies) AS tech, + UNNEST(categories) AS category +WHERE + date = '2024-06-01' # Specific date for data extraction +GROUP BY + client, + is_root_page +ORDER BY + client, + is_root_page; diff --git a/sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql b/sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql new file mode 100644 index 00000000000..7ebd4b594c3 --- /dev/null +++ b/sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql @@ -0,0 +1,71 @@ +#standardSQL +# A11Y technology usage by domain rank +WITH ranked_sites AS ( + -- Get the total number of sites within each rank grouping + SELECT + client, + is_root_page, + page, + rank, + technologies, -- Include technologies field here + CASE + WHEN rank <= 1000 THEN 1000 + WHEN rank <= 10000 THEN 10000 + WHEN rank <= 100000 THEN 100000 + WHEN rank <= 1000000 THEN 1000000 + WHEN rank <= 10000000 THEN 10000000 + WHEN rank <= 100000000 THEN 100000000 + END AS rank_grouping + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' -- Use the relevant date for analysis +), + +rank_totals AS ( + -- Calculate total sites in each rank grouping + SELECT + client, + is_root_page, + rank_grouping, + COUNT(DISTINCT page) AS total_in_rank + FROM + ranked_sites + GROUP BY + client, + is_root_page, + rank_grouping +) + +SELECT + r.client, + r.is_root_page, + r.rank_grouping, + rt.total_in_rank, -- Total number of unique sites within the rank grouping + tech.technology AS app, -- Accessibility technology used + COUNT(DISTINCT r.page) AS sites_with_app, -- Number of sites using the specific accessibility technology + SAFE_DIVIDE(COUNT(DISTINCT r.page), rt.total_in_rank) AS pct_sites_with_app -- Percentage of sites using the accessibility technology +FROM + ranked_sites r +JOIN + UNNEST(r.technologies) AS tech -- Expand technologies array to individual rows +JOIN + rank_totals rt -- Join to get the total number of sites per rank grouping +ON r.client = rt.client AND + r.is_root_page = rt.is_root_page AND + r.rank_grouping = rt.rank_grouping +JOIN + UNNEST(tech.categories) AS category -- Unnest the categories array to filter for accessibility +WHERE + category = 'Accessibility' -- Filter to include only accessibility-related technologies +GROUP BY + r.client, + r.is_root_page, + r.rank_grouping, + rt.total_in_rank, + tech.technology +ORDER BY + tech.technology, -- Order results by technology (app) + r.rank_grouping, -- Order results by rank grouping + r.client, + r.is_root_page; diff --git a/sql/2024/accessibility/alt_ending_in_image_extension.sql b/sql/2024/accessibility/alt_ending_in_image_extension.sql new file mode 100644 index 00000000000..6400ebc2787 --- /dev/null +++ b/sql/2024/accessibility/alt_ending_in_image_extension.sql @@ -0,0 +1,76 @@ +#standardSQL +# Alt text ending in an image extension +CREATE TEMPORARY FUNCTION getUsedExtensions(payload STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + const a11y = JSON.parse(payload); + + return 
Object.entries(a11y.file_extension_alts.file_extensions).map(([extension, total]) => {
+    return {extension, total};
+  });
+} catch (e) {
+  return [];
+}
+''';
+SELECT
+  client,
+  is_root_page,
+  sites_with_non_empty_alt,
+  sites_with_file_extension_alt,
+  total_alts_with_file_extensions,
+
+  # Of sites with a non-empty alt, what % have an alt with a file extension
+  sites_with_file_extension_alt / sites_with_non_empty_alt AS pct_sites_with_file_extension_alt,
+  # Given a random alt, how often will it end in a file extension
+  total_alts_with_file_extensions / total_non_empty_alts AS pct_alts_with_file_extension,
+
+  extension_stat.extension AS extension,
+  COUNT(0) AS total_sites_using,
+  # Of sites with a non-empty alt, what % have an alt with this file extension
+  COUNT(0) / sites_with_non_empty_alt AS pct_applicable_sites_using,
+
+  # Total number of alt texts ending in this file extension
+  SUM(extension_stat.total) AS total_occurrences,
+  # Given a random alt ending in a file extension, how often will it end in this file extension
+  SUM(extension_stat.total) / total_alts_with_file_extensions AS pct_total_occurrences
+FROM
+  `httparchive.all.pages`,
+  UNNEST(getUsedExtensions(JSON_EXTRACT(custom_metrics, '$.a11y'))) AS extension_stat
+LEFT JOIN (
+  SELECT
+    client,
+    is_root_page,
+    COUNTIF(total_non_empty_alt > 0) AS sites_with_non_empty_alt,
+    COUNTIF(total_with_file_extension > 0) AS sites_with_file_extension_alt,
+
+    SUM(total_non_empty_alt) AS total_non_empty_alts,
+    SUM(total_with_file_extension) AS total_alts_with_file_extensions
+  FROM (
+    SELECT
+      client,
+      is_root_page,
+      CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.markup.images.img.alt.present') AS INT64) AS total_non_empty_alt,
+      CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.a11y.file_extension_alts.total_with_file_extension') AS INT64) AS total_with_file_extension
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01'
+  )
+  GROUP BY
+    client,
+    is_root_page
+) USING (client, is_root_page)
+WHERE
+  date = '2024-06-01'
+GROUP BY
+  client,
+  is_root_page,
+  sites_with_non_empty_alt,
+  sites_with_file_extension_alt,
+  total_non_empty_alts,
+  total_alts_with_file_extensions,
+  extension
+ORDER BY
+  client,
+  is_root_page,
+  total_occurrences DESC
diff --git a/sql/2024/accessibility/anchors_with_role_button.sql b/sql/2024/accessibility/anchors_with_role_button.sql
new file mode 100644
index 00000000000..c8ba67780c1
--- /dev/null
+++ b/sql/2024/accessibility/anchors_with_role_button.sql
@@ -0,0 +1,25 @@
+#standardSQL
+# Anchors with role='button'
+SELECT
+  client,
+  is_root_page,
+  COUNTIF(total_anchors > 0) AS sites_with_anchors,
+  COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button,
+
+  # Of sites that have anchors... how many have an anchor with a role='button'
+  COUNTIF(total_anchors_with_role_button > 0) / COUNTIF(total_anchors > 0) AS pct_sites_with_anchor_role_button
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    date,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.total_anchors_with_role_button') AS INT64) AS total_anchors_with_role_button,
+    IFNULL(CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._element_count'), '$.a') AS INT64), 0) AS total_anchors
+  FROM
+    `httparchive.all.pages`
+)
+WHERE
+  date = '2024-06-01'
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/audio_track_usage.sql b/sql/2024/accessibility/audio_track_usage.sql
new file mode 100644
index 00000000000..35b22d34884
--- /dev/null
+++ b/sql/2024/accessibility/audio_track_usage.sql
@@ -0,0 +1,27 @@
+#standardSQL
+# Audio elements track usage
+SELECT
+  client,
+  is_root_page,
+  COUNT(0) AS total_sites,
+  COUNTIF(total_audios > 0) AS total_with_audio,
+  COUNTIF(total_with_track > 0) AS total_with_tracks,
+
+  SUM(total_with_track) / SUM(total_audios) AS pct_audios_with_tracks,
+  COUNTIF(total_audios > 0) / COUNT(0) AS pct_sites_with_audios,
+  COUNTIF(total_with_track > 0) / COUNTIF(total_audios > 0) AS pct_audio_sites_with_tracks
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    date,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.audios.total') AS INT64) AS total_audios,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.audios.total_with_track') AS INT64) AS total_with_track
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/button_name_sources.sql b/sql/2024/accessibility/button_name_sources.sql
new file mode 100644
index 00000000000..886fbc3c14e
--- /dev/null
+++ b/sql/2024/accessibility/button_name_sources.sql
@@ -0,0 +1,73 @@
+#standardSQL
+# Where button elements get their A11Y names from
+CREATE TEMPORARY FUNCTION a11yButtonNameSources(payload STRING)
+RETURNS ARRAY<STRING> LANGUAGE js AS '''
+  try {
+    const a11y = JSON.parse(payload);
+
+    const accessible_name_sources = [];
+    for (const tree_node of a11y.form_control_a11y_tree) {
+      const is_button_type = tree_node.type === "button";
+      const is_submit_input = tree_node.type === "input" && tree_node.attributes.type === "submit";
+      if (!is_button_type && !is_submit_input) {
+        continue;
+      }
+
+      if (tree_node.accessible_name.length === 0) {
+        // No A11Y name given
+        accessible_name_sources.push("No accessible name");
+        continue;
+      }
+
+      if (tree_node.accessible_name_sources.length <= 0) {
+        continue;
+      }
+
+      const name_source = tree_node.accessible_name_sources[0];
+      let pretty_name_source = name_source.type;
+      if (name_source.type === "attribute") {
+        pretty_name_source = `${name_source.type}: ${name_source.attribute}`;
+      } else if (name_source.type === "relatedElement") {
+        if (name_source.attribute) {
+          pretty_name_source = `${name_source.type}: ${name_source.attribute}`;
+        } else {
+          pretty_name_source = `${name_source.type}: label`;
+        }
+      }
+
+      accessible_name_sources.push(pretty_name_source);
+    }
+
+    return accessible_name_sources;
+  } catch (e) {
+    return [];
+  }
+''';
+
+SELECT
+  client,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_buttons,
+
+  button_name_source,
+  COUNT(0) AS total_with_this_source,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS perc_of_all_buttons
+FROM (
+  SELECT
+    client,
+    date,
+    button_name_source
+  FROM
+    `httparchive.all.pages`,
+    UNNEST(
+      a11yButtonNameSources(JSON_EXTRACT_SCALAR(payload, '$._a11y'))
+    ) AS button_name_source
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+)
+GROUP BY
+  client,
+  button_name_source
+ORDER BY
+  perc_of_all_buttons DESC
diff --git a/sql/2024/accessibility/captcha_usage.sql b/sql/2024/accessibility/captcha_usage.sql
new file mode 100644
index 00000000000..643fa8726f6
--- /dev/null
+++ b/sql/2024/accessibility/captcha_usage.sql
@@ -0,0 +1,24 @@
+#standardSQL
+# Captcha usage analysis
+
+SELECT
+  client,
+  is_root_page,
+  date, # Date of the analysis
+  COUNT(DISTINCT page) AS total_sites, # Total number of unique sites for the client
+  COUNT(DISTINCT IF(app.technology IN ('reCAPTCHA', 'hCaptcha'), page, NULL)) AS sites_with_captcha, # Number of sites using reCAPTCHA or hCaptcha
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(app.technology IN ('reCAPTCHA', 'hCaptcha'), page, NULL)),
+    COUNT(DISTINCT page)
+  ) AS perc_sites_with_captcha # Percentage of sites using reCAPTCHA or hCaptcha
+FROM
+  `httparchive.all.pages`,
+  UNNEST(technologies) AS app # Unnest the technologies array to get individual apps
+WHERE
+  date = '2024-06-01'
+GROUP BY
+  client,
+  is_root_page,
+  date
+ORDER BY
+  client; # Order results by client
diff --git a/sql/2024/accessibility/color_contrast.sql b/sql/2024/accessibility/color_contrast.sql
new file mode 100644
index 00000000000..f45d9812416
--- /dev/null
+++ b/sql/2024/accessibility/color_contrast.sql
@@ -0,0 +1,26 @@
+#standardSQL
+# % of pages with sufficient text color contrast with its background
+SELECT
+  client,
+  is_root_page,
+  COUNTIF(color_contrast_score IS NOT NULL) AS total_applicable,
+  COUNTIF(CAST(color_contrast_score AS NUMERIC) = 1) AS total_good_contrast,
+  COUNTIF(CAST(color_contrast_score AS NUMERIC) = 1) / COUNTIF(color_contrast_score IS NOT NULL) AS perc_good_contrast
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    date,
+    JSON_VALUE(lighthouse, '$.audits.color-contrast.score') AS color_contrast_score
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page,
+  date
+ORDER BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/common_alt_text_length.sql b/sql/2024/accessibility/common_alt_text_length.sql
new file mode 100644
index 00000000000..b2e0aa6cc92
--- /dev/null
+++ b/sql/2024/accessibility/common_alt_text_length.sql
@@ -0,0 +1,43 @@
+#standardSQL
+# Most common lengths of alt text
+# Note: A value of -1 means there is no alt tag.
0 means it is empty +# Note: Lengths of 2000+ characters are grouped together +SELECT + client, + is_root_page, + SUM(COUNT(0)) OVER (PARTITION BY client, is_root_page) AS total_images, + SUM(COUNTIF(alt_length_clipped >= 0)) OVER (PARTITION BY client, is_root_page) AS total_alt_tags, + + alt_length_clipped AS alt_length, + COUNT(0) AS occurrences, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, is_root_page) AS pct_all_occurrences, + COUNT(0) / NULLIF(SUM(COUNTIF(alt_length_clipped >= 0)) OVER (PARTITION BY client, is_root_page), 0) AS pct_of_alt_tags + +FROM ( + SELECT + client, + is_root_page, + LEAST(alt_length, 2000) AS alt_length_clipped + FROM ( + SELECT + client, + is_root_page, + date, + SAFE_CAST(alt_length_string AS INT64) AS alt_length + FROM + `httparchive.all.pages`, + UNNEST( + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.images.alt_lengths') + ) AS alt_length_string + ) + WHERE + date = '2024-06-01' AND + alt_length IS NOT NULL AND + is_root_page IS TRUE +) +GROUP BY + client, + is_root_page, + alt_length +ORDER BY + client, is_root_page, alt_length ASC; diff --git a/sql/2024/accessibility/common_aria_role.sql b/sql/2024/accessibility/common_aria_role.sql new file mode 100644 index 00000000000..43f5ac92d8a --- /dev/null +++ b/sql/2024/accessibility/common_aria_role.sql @@ -0,0 +1,64 @@ +#standardSQL +# % of sites using each type of ARIA role + +# Temporary function to extract ARIA roles used on a site from the JSON payload +CREATE TEMPORARY FUNCTION getUsedRoles(payload STRING) +RETURNS ARRAY LANGUAGE js AS ''' +try { + const almanac = JSON.parse(payload); + return Object.keys(almanac.nodes_using_role.usage_and_count); // Extract ARIA roles from the JSON structure +} catch (e) { + return []; +} +'''; + +# Extract ARIA role usage per page +WITH role_usage AS ( + SELECT + client, + is_root_page, + page, # Distinct pages + role # The ARIA role being analyzed + FROM + `httparchive.all.pages`, + UNNEST(getUsedRoles(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS role + WHERE + date = '2024-06-01' # Filter for the specific date +), + +# Calculate total sites for each client and is_root_page combination +total_sites_per_group AS ( + SELECT + client, + is_root_page, + COUNT(DISTINCT page) AS total_sites # Total number of unique sites + FROM + role_usage + GROUP BY + client, + is_root_page +) + +# Aggregate the results to compute totals and percentages +SELECT + r.client, + r.is_root_page, + t.total_sites, # Total number of unique sites + r.role, # The ARIA role being analyzed + COUNT(DISTINCT r.page) AS total_sites_using, # Number of unique sites using this ARIA role + SAFE_DIVIDE(COUNT(DISTINCT r.page), t.total_sites) AS pct_sites_using # Correct percentage of sites using this ARIA role +FROM + role_usage r +JOIN + total_sites_per_group t # Join to get the total number of unique sites for each group +ON + r.client = t.client AND r.is_root_page = t.is_root_page +GROUP BY + r.client, + r.is_root_page, + t.total_sites, + r.role +HAVING + total_sites_using >= 100 # Filter to include only roles used by 100 or more sites +ORDER BY + pct_sites_using DESC; # Order results by percentage of sites using each ARIA role diff --git a/sql/2024/accessibility/common_element_attributes.sql b/sql/2024/accessibility/common_element_attributes.sql new file mode 100644 index 00000000000..3231e3e45d5 --- /dev/null +++ b/sql/2024/accessibility/common_element_attributes.sql @@ -0,0 +1,62 @@ +#standardSQL +# How often pages contain an element with a given attribute + +# Temporary 
function to extract attributes used on elements from the JSON payload +CREATE TEMPORARY FUNCTION getUsedAttributes(payload STRING) +RETURNS ARRAY LANGUAGE js AS ''' +try { + const almanac = JSON.parse(payload); + return Object.keys(almanac.attributes_used_on_elements); // Extract attributes from the JSON structure +} catch (e) { + return []; +} +'''; + +# Main query to analyze the usage of specific attributes across sites +WITH attribute_usage AS ( + SELECT + client, + is_root_page, + page, # Distinct pages + attribute # The attribute being analyzed + FROM + `httparchive.all.pages`, + UNNEST(getUsedAttributes(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS attribute + WHERE + date = '2024-06-01' +) + +# Aggregate results to compute totals and percentages +SELECT + a.client, + a.is_root_page, + t.total_sites, # Total number of unique sites + a.attribute, # The attribute being analyzed + COUNT(DISTINCT a.page) AS total_sites_using, # Number of unique sites using this attribute + SAFE_DIVIDE(COUNT(DISTINCT a.page), t.total_sites) AS pct_sites_using # Percentage of sites using this attribute +FROM + attribute_usage a +JOIN ( + # Subquery to calculate total unique sites per client and is_root_page + SELECT + client, + is_root_page, + COUNT(DISTINCT page) AS total_sites + FROM + attribute_usage + GROUP BY + client, + is_root_page +) t +ON + a.client = t.client AND a.is_root_page = t.is_root_page +GROUP BY + a.client, + a.is_root_page, + a.attribute, + t.total_sites +HAVING + STARTS_WITH(a.attribute, 'aria-') OR # Include attributes that start with 'aria-' + SAFE_DIVIDE(COUNT(DISTINCT a.page), t.total_sites) >= 0.01 # Or include attributes used by 1% or more of sites +ORDER BY + pct_sites_using DESC; # Order by percentage in descending order diff --git a/sql/2024/accessibility/focus_outline_0.sql b/sql/2024/accessibility/focus_outline_0.sql new file mode 100644 index 00000000000..c0b4fda9ce5 --- /dev/null +++ b/sql/2024/accessibility/focus_outline_0.sql @@ -0,0 +1,78 @@ +#standardSQL +# Adoption of :focus pseudoclass and outline: 0 style + +CREATE TEMPORARY FUNCTION getFocusStylesOutline0(css STRING) RETURNS ARRAY> LANGUAGE js +OPTIONS (library = "gs://httparchive/lib/css-utils.js") AS ''' +try { + var reduceValues = (values, rule) => { + if ('rules' in rule) { + return rule.rules.reduce(reduceValues, values); + } + if (!('declarations' in rule)) { + return values; + } + var focusRegEx = /:focus/; + var fastFocusCheck = rule.selectors.some(selector => focusRegEx.test(selector)); + if (!fastFocusCheck) { + return values; + } + var hasFocusPseudoClass = rule.selectors.some(selector => { + var tokens = parsel.tokenize(selector); + return tokens.some(token => token.type == 'pseudo-class' && token.name == 'focus'); + }); + if (!hasFocusPseudoClass) { + return values; + } + var setsOutline0 = rule.declarations.some(d => d.property.toLowerCase() == 'outline' && d.value == '0'); + return values.concat({sets_focus_style: true, sets_outline_0: setsOutline0}); + }; + var $ = JSON.parse(css); + return $.stylesheet.rules.reduce(reduceValues, []); +} catch (e) { + return [{sets_focus_style: false, sets_outline_0: false}]; +} +'''; + +WITH focus_data AS ( + SELECT + client, + page, + MAX(IFNULL(focus_style.sets_focus_style, FALSE)) AS sets_focus_style, + MAX(IFNULL(focus_style.sets_outline_0, FALSE)) AS sets_focus_outline_0 + FROM + `httparchive.all.parsed_css`, + UNNEST(getFocusStylesOutline0(css)) AS focus_style + WHERE + date = '2024-06-01' + GROUP BY + client, page +), +total_pages_data AS ( + SELECT + 
client, + COUNT(0) AS total_pages + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' + GROUP BY + client +) + +SELECT + f.client, + COUNTIF(f.sets_focus_style) AS pages_focus, + COUNTIF(f.sets_focus_outline_0) AS pages_focus_outline_0, + tp.total_pages, + COUNTIF(f.sets_focus_style) / tp.total_pages AS pct_pages_focus, + COUNTIF(f.sets_focus_outline_0) / tp.total_pages AS pct_pages_focus_outline_0 +FROM + focus_data AS f +JOIN + total_pages_data AS tp +ON + f.client = tp.client +GROUP BY + f.client, tp.total_pages +ORDER BY + pct_pages_focus DESC diff --git a/sql/2024/accessibility/focus_visible.sql b/sql/2024/accessibility/focus_visible.sql new file mode 100644 index 00000000000..3fc2b16c89d --- /dev/null +++ b/sql/2024/accessibility/focus_visible.sql @@ -0,0 +1,85 @@ +#standardSQL + +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> +LANGUAGE js +OPTIONS (library = "gs://httparchive/lib/css-utils.js") +AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +'''; + +SELECT + client, + COUNTIF(num_focus_visible > 0) AS pages_with_focus_visible, + COUNT(0) AS total_pages, + SAFE_DIVIDE(COUNTIF(num_focus_visible > 0), COUNT(0)) AS pct_pages_focus_visible +FROM ( + SELECT + client, + page, + COUNTIF(pseudo_class = 'focus-visible') AS num_focus_visible + FROM + `httparchive.all.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).pseudo_class) AS pseudo_class + WHERE + date = '2024-06-01' AND + LENGTH(css) < 0.1 * 1024 * 1024 -- Limit the size of the CSS to avoid OOM crashes + GROUP BY + client, + page +) +GROUP BY + client +ORDER BY + pct_pages_focus_visible DESC; diff --git a/sql/2024/accessibility/form_input_name_sources.sql b/sql/2024/accessibility/form_input_name_sources.sql new file mode 100644 index 00000000000..6a2417b85b8 --- /dev/null +++ b/sql/2024/accessibility/form_input_name_sources.sql @@ -0,0 +1,71 @@ +#standardSQL +# Where input elements get their A11Y names from +CREATE TEMPORARY FUNCTION a11yInputNameSources(payload STRING) +RETURNS ARRAY LANGUAGE js AS ''' + try { + const a11y = JSON.parse(payload); + + const accessible_name_sources = []; + for (const tree_node of a11y.form_control_a11y_tree) { + if (tree_node.type === "button") { + continue; + } + if (tree_node.type === "input" && tree_node.attributes.type === "submit") { + continue; + } + + if (tree_node.accessible_name.length === 0) { + // No A11Y name given + accessible_name_sources.push("No accessible name"); + continue; + } + + if (tree_node.accessible_name_sources.length <= 0) { + continue; + } + + const name_source = tree_node.accessible_name_sources[0]; + let 
pretty_name_source = name_source.type; + if (name_source.type === "attribute") { + pretty_name_source = `${name_source.type}: ${name_source.attribute}`; + } else if (name_source.type === "relatedElement") { + if (name_source.attribute) { + pretty_name_source = `${name_source.type}: ${name_source.attribute}`; + } else { + pretty_name_source = `${name_source.type}: label`; + } + } + + accessible_name_sources.push(pretty_name_source); + } + + return accessible_name_sources; + } catch (e) { + return []; + } +'''; + +SELECT + client, + is_root_page, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_inputs, + input_name_source, + COUNT(0) AS total_with_this_source, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS perc_of_all_inputs +FROM ( + SELECT + client, + is_root_page, + input_name_source + FROM + `httparchive.all.pages`, + UNNEST( + a11yInputNameSources(JSON_EXTRACT_SCALAR(payload, '$._a11y')) + ) AS input_name_source + WHERE + date = '2024-06-01' +) +GROUP BY + client, + is_root_page, + input_name_source; diff --git a/sql/2024/accessibility/form_required_controls.sql b/sql/2024/accessibility/form_required_controls.sql new file mode 100644 index 00000000000..136506a9fa5 --- /dev/null +++ b/sql/2024/accessibility/form_required_controls.sql @@ -0,0 +1,113 @@ +#standardSQL +# Various stats for required form controls (form controls being: input, select, textarea) +CREATE TEMPORARY FUNCTION requiredControls(payload STRING) +RETURNS STRUCT LANGUAGE js AS ''' + try { + const a11y = JSON.parse(payload); + const required_form_controls = a11y.required_form_controls + + const total = required_form_controls.length; + let asterisk = 0; + let required_attribute = 0; + let aria_required = 0; + + let all_three = 0; + let asterisk_required = 0; + let asterisk_aria = 0; + let required_with_aria = 0; + for (const form_control of required_form_controls) { + if (form_control.has_visible_required_asterisk) { + asterisk++; + } + if (form_control.has_visible_required_asterisk && form_control.has_required) { + asterisk_required++; + } + if (form_control.has_visible_required_asterisk && form_control.has_aria_required) { + asterisk_aria++; + } + + if (form_control.has_required) { + required_attribute++; + } + if (form_control.has_required && form_control.has_aria_required) { + required_with_aria++; + } + + if (form_control.has_aria_required) { + aria_required++; + } + + + if (form_control.has_visible_required_asterisk && + form_control.has_required && + form_control.has_aria_required) { + all_three++; + } + } + + return { + total, + asterisk, + required_attribute, + aria_required, + + all_three, + asterisk_required, + asterisk_aria, + required_with_aria, + }; + } catch (e) { + return { + total: 0, + asterisk: 0, + required_attribute: 0, + aria_required: 0, + + all_three: 0, + asterisk_required: 0, + asterisk_aria: 0, + required_with_aria: 0, + }; + } +'''; + +SELECT + client, + is_root_page, + COUNT(0) AS total_sites, + COUNTIF(stats.total > 0) AS total_sites_with_required_controls, + SUM(stats.total) AS total_required_controls, + + SUM(stats.asterisk) AS total_asterisk, + SUM(stats.asterisk) / SUM(stats.total) AS perc_asterisk, + + SUM(stats.required_attribute) AS total_required_attribute, + SUM(stats.required_attribute) / SUM(stats.total) AS perc_required_attribute, + + SUM(stats.aria_required) AS total_aria_required, + SUM(stats.aria_required) / SUM(stats.total) AS perc_aria_required, + + SUM(stats.all_three) AS total_all_three, + SUM(stats.all_three) / SUM(stats.total) AS perc_all_three, + + 
SUM(stats.asterisk_required) AS total_asterisk_required, + SUM(stats.asterisk_required) / SUM(stats.total) AS perc_asterisk_required, + + SUM(stats.asterisk_aria) AS total_asterisk_aria, + SUM(stats.asterisk_aria) / SUM(stats.total) AS perc_asterisk_aria, + + SUM(stats.required_with_aria) AS total_required_with_aria, + SUM(stats.required_with_aria) / SUM(stats.total) AS perc_required_with_aria +FROM ( + SELECT + client, + is_root_page, + requiredControls(JSON_EXTRACT_SCALAR(payload, '$._a11y')) AS stats + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' +) +GROUP BY + client, + is_root_page; diff --git a/sql/2024/accessibility/landmark_elements_and_roles.sql b/sql/2024/accessibility/landmark_elements_and_roles.sql new file mode 100644 index 00000000000..afa16b1d82e --- /dev/null +++ b/sql/2024/accessibility/landmark_elements_and_roles.sql @@ -0,0 +1,123 @@ +#standardSQL +# percentage/count of pages that contain common elements and roles + +CREATE TEMPORARY FUNCTION getUsedRoles(payload STRING) +RETURNS ARRAY LANGUAGE js AS ''' +try { + const almanac = JSON.parse(payload); + return Object.keys(almanac.nodes_using_role.usage_and_count); +} catch (e) { + return []; +} +'''; + +CREATE TEMPORARY FUNCTION get_element_types(element_count_string STRING) +RETURNS ARRAY LANGUAGE js AS ''' +try { + if (!element_count_string) return []; // 2019 had a few cases + + var element_count = JSON.parse(element_count_string); // should be an object with element type properties with values of how often they are present + + if (Array.isArray(element_count)) return []; + if (typeof element_count != 'object') return []; + + return Object.keys(element_count); +} catch (e) { + return []; +} +'''; + +WITH mappings AS ( + SELECT 1 AS mapping_id, 'main' AS element_type, 'main' AS role_type + UNION ALL + SELECT 2 AS mapping_id, 'header' AS element_type, 'banner' AS role_type + UNION ALL + SELECT 3 AS mapping_id, 'nav' AS element_type, 'navigation' AS role_type + UNION ALL + SELECT 4 AS mapping_id, 'footer' AS element_type, 'contentinfo' AS role_type +), + +elements AS ( + SELECT + client, + page, + element_type + FROM + `httparchive.all.pages`, + UNNEST(get_element_types(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element_type + JOIN + mappings + USING (element_type) + WHERE + date = '2024-06-01' AND + is_root_page +), + +roles AS ( + SELECT + client, + page, + role_type + FROM + `httparchive.all.pages`, + UNNEST(getUsedRoles(JSON_EXTRACT(custom_metrics, '$.almanac'))) AS role_type + JOIN + mappings + USING (role_type) + WHERE + date = '2024-06-01' AND + is_root_page +), + +base AS ( + SELECT + client, + page, + mapping_id, + element_type, + role_type, + COUNTIF(e.element_type IS NOT NULL) AS element_usage, + COUNTIF(r.role_type IS NOT NULL) AS role_usage + FROM + `httparchive.all.pages` + INNER JOIN mappings ON (TRUE) + LEFT OUTER JOIN + elements e + USING (client, page, element_type) + LEFT OUTER JOIN + roles r + USING (client, page, role_type) + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client, + page, + mapping_id, + element_type, + role_type +) + +SELECT + client, + mapping_id, + element_type, + role_type, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(element_usage > 0) AS element_usage, + COUNTIF(role_usage > 0) AS role_usage, + COUNTIF(element_usage > 0 OR role_usage > 0) AS both_usage, + COUNTIF(element_usage > 0) / COUNT(DISTINCT page) AS element_pct, + COUNTIF(role_usage > 0) / COUNT(DISTINCT page) AS role_pct, + COUNTIF(element_usage > 0 OR role_usage > 0) / 
+  COUNTIF(element_usage > 0 OR role_usage > 0) / COUNT(DISTINCT page) AS both_pct
+FROM
+  base
+GROUP BY
+  client,
+  mapping_id,
+  element_type,
+  role_type
+ORDER BY
+  client,
+  mapping_id,
+  element_type
diff --git a/sql/2024/accessibility/lighthouse_a11y_audits.sql b/sql/2024/accessibility/lighthouse_a11y_audits.sql
new file mode 100644
index 00000000000..abbcfb8546d
--- /dev/null
+++ b/sql/2024/accessibility/lighthouse_a11y_audits.sql
@@ -0,0 +1,52 @@
+#standardSQL
+# Get summary of all Lighthouse scores for a category
+CREATE TEMPORARY FUNCTION getAudits(report STRING, category STRING)
+RETURNS ARRAY<STRUCT<id STRING, weight INT64, audit_group STRING, description STRING, score FLOAT64>> LANGUAGE js AS '''
+try {
+  var $ = JSON.parse(report);
+  var auditrefs = $.categories[category].auditRefs;
+  var audits = $.audits;
+  $ = null;
+  var results = [];
+  for (auditref of auditrefs) {
+    results.push({
+      id: auditref.id,
+      weight: auditref.weight,
+      audit_group: auditref.group,
+      description: audits[auditref.id].description,
+      score: audits[auditref.id].score
+    });
+  }
+  return results;
+} catch (e) {
+  return [{}];
+}
+''';
+SELECT
+  client,
+  is_root_page,
+  audits.id AS id,
+  COUNTIF(audits.score > 0) AS num_pages,
+  COUNT(0) AS total,
+  COUNTIF(audits.score IS NOT NULL) AS total_applicable,
+  SAFE_DIVIDE(COUNTIF(audits.score > 0), COUNTIF(audits.score IS NOT NULL)) AS pct,
+  APPROX_QUANTILES(audits.weight, 100)[OFFSET(50)] AS median_weight,
+  MAX(audits.audit_group) AS audit_group,
+  MAX(audits.description) AS description
+FROM
+  `httparchive.all.pages`,
+  UNNEST(getAudits(lighthouse, 'accessibility')) AS audits
+WHERE
+  date = '2024-06-01' AND
+  lighthouse IS NOT NULL AND
+  lighthouse != '{}'
+GROUP BY
+  client,
+  is_root_page,
+  audits.id,
+  date
+ORDER BY
+  client,
+  is_root_page,
+  median_weight DESC,
+  audits.id;
diff --git a/sql/2024/accessibility/lighthouse_a11y_audits_by_cms.sql b/sql/2024/accessibility/lighthouse_a11y_audits_by_cms.sql
new file mode 100644
index 00000000000..ad80fac1a49
--- /dev/null
+++ b/sql/2024/accessibility/lighthouse_a11y_audits_by_cms.sql
@@ -0,0 +1,95 @@
+#standardSQL
+# Define the function outside the WITH clause
+CREATE TEMPORARY FUNCTION getAudits(report STRING, category STRING)
+RETURNS ARRAY<STRUCT<id STRING, weight INT64, audit_group STRING, title STRING, score FLOAT64>> LANGUAGE js AS '''
+  try {
+    var $ = JSON.parse(report);
+    var auditrefs = $.categories[category].auditRefs;
+    var audits = $.audits;
+    $ = null;
+    var results = [];
+    for (auditref of auditrefs) {
+      results.push({
+        id: auditref.id,
+        weight: auditref.weight,
+        audit_group: auditref.group,
+        title: audits[auditref.id].title, // Keep title, not description
+        score: audits[auditref.id].score
+      });
+    }
+    return results;
+  } catch (e) {
+    return [{}];
+  }
+''';
+
+# Step 1: Sample the data once and store in a temporary table, then combine with accessibility issues
+WITH sampled_data AS (
+  SELECT *
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    lighthouse IS NOT NULL AND
+    lighthouse != '{}' AND
+    is_root_page
+),
+
+top_cms AS (
+  SELECT
+    t.technology AS cms,
+    AVG(CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64)) AS avg_accessibility_score,
+    COUNT(DISTINCT page) AS total_pages,
+    RANK() OVER (ORDER BY AVG(CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64)) DESC) AS rank
+  FROM
+    sampled_data,
+    UNNEST(technologies) AS t,
+    UNNEST(t.categories) AS category
+  WHERE
+    category = 'CMS'
+  GROUP BY
+    cms
+  ORDER BY
+    avg_accessibility_score DESC
+  LIMIT 1000 -- Limit to top 1000 CMS platforms
+),
+
+accessibility_issues AS (
+  SELECT
+    t.technology AS cms,
+    audits.id AS audit_id,
+    COUNTIF(audits.score < 1)
AS num_pages_with_issue, + COUNT(0) AS total_pages, -- Total number of pages + COUNTIF(audits.score IS NOT NULL) AS total_applicable, -- Total number of applicable pages + SAFE_DIVIDE(COUNTIF(audits.score < 1), COUNT(0)) AS pct_pages_with_issue, -- Revised calculation based on total pages + APPROX_QUANTILES(audits.weight, 100)[OFFSET(50)] AS median_weight, + MAX(audits.audit_group) AS audit_group + FROM + sampled_data, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category, + UNNEST(getAudits(lighthouse, 'accessibility')) AS audits + WHERE + category = 'CMS' AND + t.technology IN (SELECT cms FROM top_cms) -- Filter by top CMS platforms + GROUP BY + t.technology, + audits.id +) + +# Step 4: Combine and select top 10 issues per CMS +SELECT + cms, + audit_id, + num_pages_with_issue, + total_pages, + total_applicable, + pct_pages_with_issue, + median_weight, + audit_group +FROM + accessibility_issues +ORDER BY + cms, + pct_pages_with_issue DESC, + median_weight DESC diff --git a/sql/2024/accessibility/lighthouse_a11y_score.sql b/sql/2024/accessibility/lighthouse_a11y_score.sql new file mode 100644 index 00000000000..f354cd93ed6 --- /dev/null +++ b/sql/2024/accessibility/lighthouse_a11y_score.sql @@ -0,0 +1,55 @@ +#standardSQL +# Percentiles of Lighthouse accessibility scores using exact sorting + +WITH score_data AS ( + SELECT + client, + is_root_page, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS score + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' +), +sorted_scores AS ( + SELECT + client, + is_root_page, + score, + ROW_NUMBER() OVER (PARTITION BY client, is_root_page ORDER BY score ASC) AS rank, + COUNT(0) OVER (PARTITION BY client, is_root_page) AS total_count + FROM + score_data +), +percentiles AS ( + SELECT + client, + is_root_page, + rank, + total_count, + (rank - 1) / total_count AS percentile_value, + score + FROM + sorted_scores +) +SELECT + client, + is_root_page, + '2024_06_01' AS date, + percentile, + AVG(score) AS exact_score -- Average score for each percentile +FROM + percentiles, + UNNEST([0.1, 0.25, 0.5, 0.75, 0.9]) AS percentile -- Target percentiles (e.g., 10th, 25th, 50th, etc.) 
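+-- The filter below keeps only rows whose rank-based percentile_value lands
+-- within 0.01 of a target percentile; averaging their scores approximates the
+-- exact-sort percentile. As a rough cross-check (a sketch, not part of this
+-- query), BigQuery's approximate quantiles could compute the median directly:
+--
+--   SELECT client, is_root_page,
+--     APPROX_QUANTILES(score, 100)[OFFSET(50)] AS approx_median_score
+--   FROM score_data
+--   GROUP BY client, is_root_page;
+--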
+WHERE + ABS(percentile_value - percentile) < 0.01 -- Match scores around the percentile value +GROUP BY + client, + is_root_page, + percentile +ORDER BY + client, + is_root_page, + percentile; diff --git a/sql/2024/accessibility/lighthouse_score_by_cms.sql b/sql/2024/accessibility/lighthouse_score_by_cms.sql new file mode 100644 index 00000000000..a46ebc0df26 --- /dev/null +++ b/sql/2024/accessibility/lighthouse_score_by_cms.sql @@ -0,0 +1,38 @@ +#standardSQL +# Average Lighthouse scores (performance, accessibility, best-practices, SEO) for top CMS platforms within `httparchive.all.pages` + +WITH score_data AS ( + SELECT + client, + page, + t.technology AS cms, + rank, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' AND + is_root_page AND + category = 'CMS' +) + +SELECT + cms, + AVG(performance_score) AS avg_performance_score, + AVG(accessibility_score) AS avg_accessibility_score, + AVG(best_practices_score) AS avg_best_practices_score, + AVG(seo_score) AS avg_seo_score, + COUNT(DISTINCT page) AS total_pages +FROM + score_data +GROUP BY + cms +ORDER BY + avg_performance_score DESC; diff --git a/sql/2024/accessibility/lighthouse_score_by_country.sql b/sql/2024/accessibility/lighthouse_score_by_country.sql new file mode 100644 index 00000000000..6ad7c1d1c05 --- /dev/null +++ b/sql/2024/accessibility/lighthouse_score_by_country.sql @@ -0,0 +1,47 @@ +#standardSQL +# Average Lighthouse scores per geo (by country and device) + +WITH geo_summary AS ( + SELECT + `chrome-ux-report`.experimental.GET_COUNTRY(country_code) AS geo, + IF(device = 'desktop', 'desktop', 'mobile') AS client, + origin, + COUNT(DISTINCT origin) OVER (PARTITION BY country_code, IF(device = 'desktop', 'desktop', 'mobile')) AS total + FROM + `chrome-ux-report.materialized.country_summary` + WHERE + yyyymm = 202406 -- Use June 2024 dataset +), + +score_data AS ( + SELECT + REGEXP_EXTRACT(page, r'://([^/]+)') AS domain, -- Extract top-level domain from page + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' AND + is_root_page +) + +SELECT + geo_summary.geo, + geo_summary.client, + AVG(score_data.performance_score) AS avg_performance_score, + AVG(score_data.accessibility_score) AS avg_accessibility_score, + AVG(score_data.best_practices_score) AS avg_best_practices_score, + AVG(score_data.seo_score) AS avg_seo_score, + COUNT(DISTINCT score_data.domain) AS total_domains +FROM + geo_summary +JOIN + score_data ON REGEXP_EXTRACT(geo_summary.origin, r'://([^/]+)') = score_data.domain +GROUP BY + geo_summary.geo, 
geo_summary.client +ORDER BY + avg_accessibility_score DESC; diff --git a/sql/2024/accessibility/lighthouse_score_by_frontend.sql b/sql/2024/accessibility/lighthouse_score_by_frontend.sql new file mode 100644 index 00000000000..7812e9d4c14 --- /dev/null +++ b/sql/2024/accessibility/lighthouse_score_by_frontend.sql @@ -0,0 +1,51 @@ +WITH score_data AS ( + SELECT + client, + page, + rank, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score, + t.technology AS framework + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' AND + is_root_page = TRUE AND + ('Web frameworks' IN UNNEST(t.categories) OR 'JavaScript libraries' IN UNNEST(t.categories) OR 'Frontend frameworks' IN UNNEST(t.categories) OR 'JavaScript frameworks' IN UNNEST(t.categories)) AND + t.technology IS NOT NULL +) + +SELECT + client, + framework, + AVG(performance_score) AS avg_performance_score, + AVG(accessibility_score) AS avg_accessibility_score, + AVG(best_practices_score) AS avg_best_practices_score, + AVG(seo_score) AS avg_seo_score, + COUNT(DISTINCT page) AS total_pages +FROM ( + SELECT + client, + page, + framework, + AVG(performance_score) AS performance_score, # All scores are the same for one page (we have multiple rows due to unnest), we could also take the first instead of the average + AVG(accessibility_score) AS accessibility_score, + AVG(best_practices_score) AS best_practices_score, + AVG(seo_score) AS seo_score + FROM + score_data + GROUP BY + client, + page, + framework + ) +GROUP BY + client, + framework +ORDER BY + total_pages DESC; diff --git a/sql/2024/accessibility/lighthouse_score_by_government.sql b/sql/2024/accessibility/lighthouse_score_by_government.sql new file mode 100644 index 00000000000..f8bbc95ad8a --- /dev/null +++ b/sql/2024/accessibility/lighthouse_score_by_government.sql @@ -0,0 +1,571 @@ +#standardSQL +# Calculate average Lighthouse scores and the number of pages for government-related domains + +WITH score_data AS ( + SELECT + client, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score, + page, + is_root_page + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' +), + +domain_scores AS ( + SELECT + page, + CASE + -- United Nations + WHEN REGEXP_CONTAINS(page, r'\.un\.org/|\.worldbank\.org/|\.undp\.org/|\.reliefweb.int/|\.who.int/|\.unfccc.int/|\.unccd.int/|\.unesco.org/') THEN 'United Nations' + WHEN REGEXP_CONTAINS(page, r'\.europa\.eu/') THEN 'European Union' + + -- North American States and Provinces + WHEN REGEXP_CONTAINS(page, r'\.(alabama|al)\.gov/') THEN 'Alabama' + WHEN REGEXP_CONTAINS(page, r'\.(alaska|ak)\.gov/') THEN 'Alaska' + WHEN REGEXP_CONTAINS(page, r'\.(arizona|az)\.gov/') THEN 'Arizona' + WHEN 
REGEXP_CONTAINS(page, r'\.(arkansas|ar)\.gov/') THEN 'Arkansas'
+      WHEN REGEXP_CONTAINS(page, r'\.(california|ca)\.gov/') THEN 'California'
+      WHEN REGEXP_CONTAINS(page, r'\.(colorado|co)\.gov/') THEN 'Colorado'
+      WHEN REGEXP_CONTAINS(page, r'\.(connecticut|ct)\.gov/') THEN 'Connecticut'
+      WHEN REGEXP_CONTAINS(page, r'\.(delaware|de)\.gov/') THEN 'Delaware'
+      WHEN REGEXP_CONTAINS(page, r'\.(florida|fl)\.gov/|\.myflorida\.com/') THEN 'Florida'
+      WHEN REGEXP_CONTAINS(page, r'\.(georgia|ga)\.gov/') THEN 'Georgia State' -- To avoid confusion with the country
+      WHEN REGEXP_CONTAINS(page, r'\.(hawaii|hi|ehawaii)\.gov/') THEN 'Hawaii'
+      WHEN REGEXP_CONTAINS(page, r'\.(idaho|id)\.gov/') THEN 'Idaho'
+      WHEN REGEXP_CONTAINS(page, r'\.(illinois|il)\.gov/') THEN 'Illinois'
+      WHEN REGEXP_CONTAINS(page, r'\.(indiana|in)\.gov/') THEN 'Indiana'
+      WHEN REGEXP_CONTAINS(page, r'\.(iowa|ia)\.gov/') THEN 'Iowa'
+      WHEN REGEXP_CONTAINS(page, r'\.(kansas|ks)\.gov/') THEN 'Kansas'
+      WHEN REGEXP_CONTAINS(page, r'\.(kentucky|ky)\.gov/') THEN 'Kentucky'
+      WHEN REGEXP_CONTAINS(page, r'\.(louisiana|la)\.gov/') THEN 'Louisiana'
+      WHEN REGEXP_CONTAINS(page, r'\.(maine|me)\.gov/') THEN 'Maine'
+      WHEN REGEXP_CONTAINS(page, r'\.(maryland|md)\.gov/') THEN 'Maryland'
+      WHEN REGEXP_CONTAINS(page, r'\.(massachusetts|ma|mass)\.gov/') THEN 'Massachusetts'
+      WHEN REGEXP_CONTAINS(page, r'\.(michigan|mi)\.gov/') THEN 'Michigan'
+      WHEN REGEXP_CONTAINS(page, r'\.(minnesota|mn)\.gov/') THEN 'Minnesota'
+      WHEN REGEXP_CONTAINS(page, r'\.(mississippi|ms)\.gov/') THEN 'Mississippi'
+      WHEN REGEXP_CONTAINS(page, r'\.(missouri|mo)\.gov/') THEN 'Missouri'
+      WHEN REGEXP_CONTAINS(page, r'\.(montana|mt)\.gov/') THEN 'Montana'
+      WHEN REGEXP_CONTAINS(page, r'\.(nebraska|ne)\.gov/') THEN 'Nebraska'
+      WHEN REGEXP_CONTAINS(page, r'\.(nevada|nv)\.gov/') THEN 'Nevada'
+      WHEN REGEXP_CONTAINS(page, r'\.(newhampshire|nh)\.gov/') THEN 'New Hampshire'
+      WHEN REGEXP_CONTAINS(page, r'\.(newjersey|nj)\.gov/') THEN 'New Jersey'
+      WHEN REGEXP_CONTAINS(page, r'\.(newmexico|nm)\.gov/') THEN 'New Mexico'
+      WHEN REGEXP_CONTAINS(page, r'\.(newyork|ny)\.gov/') THEN 'New York'
+      WHEN REGEXP_CONTAINS(page, r'\.(northcarolina|nc)\.gov/') THEN 'North Carolina'
+      WHEN REGEXP_CONTAINS(page, r'\.(northdakota|nd)\.gov/') THEN 'North Dakota'
+      WHEN REGEXP_CONTAINS(page, r'\.(ohio|oh)\.gov/') THEN 'Ohio'
+      WHEN REGEXP_CONTAINS(page, r'\.(oklahoma|ok)\.gov/') THEN 'Oklahoma'
+      WHEN REGEXP_CONTAINS(page, r'\.(oregon|or)\.gov/') THEN 'Oregon'
+      WHEN REGEXP_CONTAINS(page, r'\.(pennsylvania|pa)\.gov/') THEN 'Pennsylvania'
+      WHEN REGEXP_CONTAINS(page, r'\.(rhodeisland|ri)\.gov/') THEN 'Rhode Island'
+      WHEN REGEXP_CONTAINS(page, r'\.(southcarolina|sc)\.gov/') THEN 'South Carolina'
+      WHEN REGEXP_CONTAINS(page, r'\.(southdakota|sd)\.gov/') THEN 'South Dakota'
+      WHEN REGEXP_CONTAINS(page, r'\.(tennessee|tn)\.gov/') THEN 'Tennessee'
+      WHEN REGEXP_CONTAINS(page, r'\.(texas|tx)\.gov/') THEN 'Texas'
+      WHEN REGEXP_CONTAINS(page, r'\.(utah|ut)\.gov/') THEN 'Utah'
+      WHEN REGEXP_CONTAINS(page, r'\.(vermont|vt)\.gov/') THEN 'Vermont'
+      WHEN REGEXP_CONTAINS(page, r'\.(virginia)\.gov/') THEN 'Virginia'
+      WHEN REGEXP_CONTAINS(page, r'\.(washington|wa)\.gov/') THEN 'Washington'
+      WHEN REGEXP_CONTAINS(page, r'\.(westvirginia|wv)\.gov/') THEN 'West Virginia'
+      WHEN REGEXP_CONTAINS(page, r'\.(wisconsin|wi)\.gov/') THEN 'Wisconsin'
+      WHEN REGEXP_CONTAINS(page, r'\.(wyoming|wy)\.gov/') THEN 'Wyoming'
+      WHEN REGEXP_CONTAINS(page,
r'\.dc\.gov/') THEN 'DC' + WHEN REGEXP_CONTAINS(page, r'\.pr\.gov/') THEN 'Puerto Rico' + WHEN REGEXP_CONTAINS(page, r'\.guam\.gov/') THEN 'Guam' + WHEN REGEXP_CONTAINS(page, r'\.americansamoa\.gov/') THEN 'American Samoa' + -- USA .gov domains need to be at the very end so that all other instances catch them. + WHEN REGEXP_CONTAINS(page, r'\.gob\.mx/') THEN 'Mexico' + WHEN REGEXP_CONTAINS(page, r'\.(gc\.ca|canada\.ca|alberta\.ca|gov\.ab\.ca|gov\.bc\.ca|manitoba\.ca|gov\.mb\.ca|gnb\.ca|gov\.nb\.ca|gov\.nl\.ca|novascotia\.ca|gov\.ns\.ca|ontario\.ca|gov\.on\.ca|gov\.pe\.ca|quebec\.ca|gouv\.qc\.ca|revenuquebec\.ca|saskatchewan\.ca|gov\.sk\.ca|gov\.nt\.ca|gov\.nu\.ca|yukon\.ca|gov\.yk\.ca)/') THEN 'Canada' + + -- European Countries + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.al/') THEN 'Albania' + WHEN REGEXP_CONTAINS(page, r'\.ax/') THEN 'Åland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ad/|\.govern\.ad/|\.exteriors\.ad/|\.consellgeneral\.ad/') THEN 'Andorra' + WHEN REGEXP_CONTAINS(page, r'\.am/') THEN 'Armenia' + WHEN REGEXP_CONTAINS(page, r'\.gv\.at/') THEN 'Austria' + WHEN REGEXP_CONTAINS(page, r'\.az/') THEN 'Azerbaijan' + WHEN REGEXP_CONTAINS(page, r'\.eus/') THEN 'Basque Country' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.by/') THEN 'Belarus' + WHEN REGEXP_CONTAINS(page, r'\.belgium\.be/|\.(gov|mil)\.be/|\.fgov\.be/|\.vlaanderen\.be/|\.wallonie\.be/|\.brussels\.be/|\.mil\.be/') THEN 'Belgium' + WHEN REGEXP_CONTAINS(page, r'\.ba/') THEN 'Bosnia and Herzegovina' + WHEN REGEXP_CONTAINS(page, r'\.government\.bg/') THEN 'Bulgaria' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.hr/') THEN 'Croatia' + WHEN REGEXP_CONTAINS(page, r'\.cy/') THEN 'Cyprus' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.cz/') THEN 'Czechia (Czech Republic)' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.dk/|\.ft\.dk/|\.nemkonto\.dk/|\.nemlog-in\.dk/|\.mitid\.dk/|\.digst\.dk/|\.sikkerdigital\.dk/|\.forsvaret\.dk/|\.skat\.dk/|\.stps\.dk/|\.ufm\.dk/|\.urm\.dk/|\.uvm\.dk/|\.politi\.dk/|\.dataetiskraad\.dk/|\.at\.dk/|\.kum\.dk/') THEN 'Denmark' + WHEN REGEXP_CONTAINS(page, r'\.riik\.ee/|\.riigiteataja\.ee/|\.eesti\.ee/|\.valitsus\.ee/') THEN 'Estonia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.fi/|\.valtioneuvosto\.fi/|\.minedu\.fi/|\.formin\.fi/|\.intermin\.fi/|\.suomi\.fi/|\.ym\.fi/|\.stm\.fi/|\.tem\.fi/|\.lvm\.fi/|\.mmm\.fi/|\.okm\.fi/|\.vm\.fi/|\.defmin\.fi/|\.oikeusministerio\.fi/|\.um\.fi/|\.vero\.fi/|\.kela\.fi/') THEN 'Finland' + WHEN REGEXP_CONTAINS(page, r'\.gouv\.fr/') THEN 'France' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ge/') THEN 'Georgia Country' + WHEN REGEXP_CONTAINS(page, r'\.bund\.de/') THEN 'Germany' + WHEN REGEXP_CONTAINS(page, r'\.gi/') THEN 'Gibraltar' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.gr/') THEN 'Greece' + WHEN REGEXP_CONTAINS(page, r'\.gov\.gl/|\.naalakkersuisut\.gl/|\.stat\.gl/|\.oqaasileriffik\.gl/|\.sullissivik\.gl/|\.sisimiut\.gl/|\.kalaallitnunaata\.gl/|\.inatsisartut\.gl/|\.politi\.gl/|\.visitgreenland\.gl/|\.energitjenesten\.gl/|\.nusuka\.gl/|\.telepost\.gl/|\.kujalleq\.gl/|\.sermersooq\.gl/|\.aviisi\.gl/|\.anjuma\.gl/|\.kni\.gl/|\.greenlandinstitute\.gl/|\.mhs\.gl/|\.iluarsartuiffik\.gl/|\.royalgroenland\.gl/|\.gux\.gl/|\.univiseyisarti\.gl/|\.arcticcommand\.gl(/|\.$)') THEN 'Greenland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.hu/') THEN 'Hungary' + WHEN REGEXP_CONTAINS(page, r'\.is/') THEN 'Iceland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ie/') THEN 'Ireland' + WHEN REGEXP_CONTAINS(page, r'\.im/') THEN 'Isle of Man' + WHEN REGEXP_CONTAINS(page, 
r'\.(gov|mil)\.it/|\.governo\.it/') THEN 'Italy' + WHEN REGEXP_CONTAINS(page, r'\.kz/') THEN 'Kazakhstan' + WHEN REGEXP_CONTAINS(page, r'\.lv/') THEN 'Latvia' + WHEN REGEXP_CONTAINS(page, r'\.li/') THEN 'Liechtenstein' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.lt/|\.vrm\.lt/|\.sam\.lt/|\.ukmin\.lt/|\.lrv\.lt/|\.uzt\.lt/|\.migracija\.lt/|\.kam\.lt/|\.lrs\.lt/|\.urm\.lt/') THEN 'Lithuania' + WHEN REGEXP_CONTAINS(page, r'\.public\.lu/|\.etat\.lu/') THEN 'Luxembourg' + WHEN REGEXP_CONTAINS(page, r'\.mt/') THEN 'Malta' + WHEN REGEXP_CONTAINS(page, r'\.md/') THEN 'Moldova' + WHEN REGEXP_CONTAINS(page, r'\.mc/') THEN 'Monaco' + WHEN REGEXP_CONTAINS(page, r'\.me/') THEN 'Montenegro' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.nl/|\.overheid\.nl/|\.mijnoverheid\.nl/') THEN 'Netherlands' + WHEN REGEXP_CONTAINS(page, r'\.mk/') THEN 'Macedonia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.no/|\.regjeringen\.no/|\.stortinget\.no/|\.nav\.no/|\.helsenorge\.no/|\.udir\.no/|\.udi\.no/|\.politi\.no/|\.nve\.no/|\.ssb\.no/|\.norges-bank\.no/|\.miljodirektoratet\.no/|\.arbeidstilsynet\.no/|\.forsvaret\.no/|\.skatteetaten\.no/|\.brreg\.no/|\.vegvesen\.no/|\.mattilsynet\.no/|\.lovdata\.no/|\.altinn\.no/|\.nkom\.no/|\.fhi\.no/|\.dsa\.no/|\.kystverket\.no/|\.bufdir\.no/|\.nupi\.no(/|\.$)') THEN 'Norway' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.pl/') THEN 'Poland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.pt/') THEN 'Portugal' + WHEN REGEXP_CONTAINS(page, r'\.ro/') THEN 'Romania' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ru/|\.govvrn\.ru/') THEN 'Russia' + WHEN REGEXP_CONTAINS(page, r'\.sm/') THEN 'San Marino' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.rs/') THEN 'Serbia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.sk/') THEN 'Slovakia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.si/') THEN 'Slovenia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.es|gob\.es|ine\.es|boe\.es/') THEN 'Spain' + WHEN REGEXP_CONTAINS(page, r'\.sj/') THEN 'Svalbard and Jan Mayen Islands' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.se/|\.1177\.se/|\.funktionstjanster\.se/|\.hemnet\.se/|\.smhi\.se/|\.sverigesradio\.se/|\.klart\.se/|\.bankid\.com/|\.synonymer\.se/|\.arbetsformedlingen\.se/|\.skatteverket\.se/|\.schoolsoft\.se/|\.postnord\.se/|\.grandid\.com/|\.viaplay\.se/|\.skola24\.se/|\.forsakringskassan\.se/|\.vklass\.se|sl\.se/|\.familjeliv\.se(/|\.$)') THEN 'Sweden' + WHEN REGEXP_CONTAINS(page, r'\.admin\.ch/') THEN 'Switzerland' + WHEN REGEXP_CONTAINS(page, r'\.gv\.ua/') THEN 'Ukraine' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.uk/') THEN 'United Kingdom (UK)' + + -- Other Countries + WHEN REGEXP_CONTAINS(page, r'\.af/') THEN 'Afghanistan' + WHEN REGEXP_CONTAINS(page, r'\.dz/') THEN 'Algeria' + WHEN REGEXP_CONTAINS(page, r'\.as/') THEN 'American Samoa' + WHEN REGEXP_CONTAINS(page, r'\.ao/') THEN 'Angola' + WHEN REGEXP_CONTAINS(page, r'\.ai/') THEN 'Anguilla' + WHEN REGEXP_CONTAINS(page, r'\.aq/') THEN 'Antarctica' + WHEN REGEXP_CONTAINS(page, r'\.ag/') THEN 'Antigua and Barbuda' + WHEN REGEXP_CONTAINS(page, r'\.(mil|gov|gob|gub)\.ar/') THEN 'Argentina' + WHEN REGEXP_CONTAINS(page, r'\.aw/') THEN 'Aruba' + WHEN REGEXP_CONTAINS(page, r'\.ac/') THEN 'Ascension Island' + WHEN REGEXP_CONTAINS(page, r'\.au/') THEN 'Australia' + WHEN REGEXP_CONTAINS(page, r'\.bs/') THEN 'Bahamas' + WHEN REGEXP_CONTAINS(page, r'\.bh/') THEN 'Bahrain' + WHEN REGEXP_CONTAINS(page, r'\.bd/') THEN 'Bangladesh' + WHEN REGEXP_CONTAINS(page, r'\.bb/') THEN 'Barbados' + WHEN REGEXP_CONTAINS(page, r'\.bz/') THEN 'Belize' + WHEN REGEXP_CONTAINS(page, r'\.bj/') 
THEN 'Benin' + WHEN REGEXP_CONTAINS(page, r'\.bm/') THEN 'Bermuda' + WHEN REGEXP_CONTAINS(page, r'\.bt/') THEN 'Bhutan' + WHEN REGEXP_CONTAINS(page, r'\.bo/') THEN 'Bolivia' + WHEN REGEXP_CONTAINS(page, r'\.bq/') THEN 'Bonaire' + WHEN REGEXP_CONTAINS(page, r'\.bw/') THEN 'Botswana' + WHEN REGEXP_CONTAINS(page, r'\.bv/') THEN 'Bouvet Island' + WHEN REGEXP_CONTAINS(page, r'\.br/') THEN 'Brazil' + WHEN REGEXP_CONTAINS(page, r'\.io/') THEN 'British Indian Ocean Territory' + WHEN REGEXP_CONTAINS(page, r'\.vg/') THEN 'British Virgin Islands' + WHEN REGEXP_CONTAINS(page, r'\.bn/') THEN 'Brunei' + WHEN REGEXP_CONTAINS(page, r'\.bf/') THEN 'Burkina Faso' + WHEN REGEXP_CONTAINS(page, r'\.mm/') THEN 'Burma (officially: Myanmar)' + WHEN REGEXP_CONTAINS(page, r'\.bi/') THEN 'Burundi' + WHEN REGEXP_CONTAINS(page, r'\.kh/') THEN 'Cambodia' + WHEN REGEXP_CONTAINS(page, r'\.cm/') THEN 'Cameroon' + WHEN REGEXP_CONTAINS(page, r'\.ca/') THEN 'Canada' + WHEN REGEXP_CONTAINS(page, r'\.cv/') THEN 'Cape Verde (in Portuguese: Cabo Verde)' + WHEN REGEXP_CONTAINS(page, r'\.cat/') THEN 'Catalonia' + WHEN REGEXP_CONTAINS(page, r'\.ky/') THEN 'Cayman Islands' + WHEN REGEXP_CONTAINS(page, r'\.cf/') THEN 'Central African Republic' + WHEN REGEXP_CONTAINS(page, r'\.td/') THEN 'Chad' + WHEN REGEXP_CONTAINS(page, r'\.cl/') THEN 'Chile' + WHEN REGEXP_CONTAINS(page, r'\.cn/') THEN 'China' + WHEN REGEXP_CONTAINS(page, r'\.cx/') THEN 'Christmas Island' + WHEN REGEXP_CONTAINS(page, r'\.cc/') THEN 'Cocos (Keeling) Islands' + WHEN REGEXP_CONTAINS(page, r'\.co/') THEN 'Colombia' + WHEN REGEXP_CONTAINS(page, r'\.km/') THEN 'Comoros' + WHEN REGEXP_CONTAINS(page, r'\.cd/') THEN 'Congo (Congo-Kinshasa)' + WHEN REGEXP_CONTAINS(page, r'\.cg/') THEN 'Congo (Congo-Brazzaville)' + WHEN REGEXP_CONTAINS(page, r'\.ck/') THEN 'Cook Islands' + WHEN REGEXP_CONTAINS(page, r'\.cr/') THEN 'Costa Rica' + WHEN REGEXP_CONTAINS(page, r'\.ci/') THEN 'Ivory Coast' + WHEN REGEXP_CONTAINS(page, r'\.cu/') THEN 'Cuba' + WHEN REGEXP_CONTAINS(page, r'\.cw/') THEN 'Curaçao' + WHEN REGEXP_CONTAINS(page, r'\.dj/') THEN 'Djibouti' + WHEN REGEXP_CONTAINS(page, r'\.dm/') THEN 'Dominica' + WHEN REGEXP_CONTAINS(page, r'\.do/') THEN 'Dominican Republic' + WHEN REGEXP_CONTAINS(page, r'\.tl/') THEN 'East Timor (Timor-Leste)' + WHEN REGEXP_CONTAINS(page, r'\.tp/') THEN 'East Timor (Timor-Leste)' + WHEN REGEXP_CONTAINS(page, r'\.ec/') THEN 'Ecuador' + WHEN REGEXP_CONTAINS(page, r'\.eg/') THEN 'Egypt' + WHEN REGEXP_CONTAINS(page, r'\.sv/') THEN 'El Salvador' + WHEN REGEXP_CONTAINS(page, r'\.gq/') THEN 'Equatorial Guinea' + WHEN REGEXP_CONTAINS(page, r'\.er/') THEN 'Eritrea' + WHEN REGEXP_CONTAINS(page, r'\.et/') THEN 'Ethiopia' + WHEN REGEXP_CONTAINS(page, r'\.eu/') THEN 'European Union' + WHEN REGEXP_CONTAINS(page, r'\.fk/') THEN 'Falkland Islands' + WHEN REGEXP_CONTAINS(page, r'\.fo/') THEN 'Faeroe Islands' + WHEN REGEXP_CONTAINS(page, r'\.fm/') THEN 'Federated States of Micronesia' + WHEN REGEXP_CONTAINS(page, r'\.fj/') THEN 'Fiji' + WHEN REGEXP_CONTAINS(page, r'\.gf/') THEN 'French Guiana' + WHEN REGEXP_CONTAINS(page, r'\.pf/') THEN 'French Polynesia' + WHEN REGEXP_CONTAINS(page, r'\.tf/') THEN 'French Southern and Antarctic Lands' + WHEN REGEXP_CONTAINS(page, r'\.ga/') THEN 'Gabon' + WHEN REGEXP_CONTAINS(page, r'\.gal/') THEN 'Galicia' + WHEN REGEXP_CONTAINS(page, r'\.gm/') THEN 'Gambia' + WHEN REGEXP_CONTAINS(page, r'\.ps/') THEN 'Gaza' + WHEN REGEXP_CONTAINS(page, r'\.gh/') THEN 'Ghana' + WHEN REGEXP_CONTAINS(page, r'\.gd/') THEN 'Grenada' + WHEN REGEXP_CONTAINS(page, 
r'\.gp/') THEN 'Guadeloupe' + WHEN REGEXP_CONTAINS(page, r'\.gu/') THEN 'Guam' + WHEN REGEXP_CONTAINS(page, r'\.gt/') THEN 'Guatemala' + WHEN REGEXP_CONTAINS(page, r'\.gg/') THEN 'Guernsey' + WHEN REGEXP_CONTAINS(page, r'\.gn/') THEN 'Guinea' + WHEN REGEXP_CONTAINS(page, r'\.gw/') THEN 'Guinea-Bissau' + WHEN REGEXP_CONTAINS(page, r'\.gy/') THEN 'Guyana' + WHEN REGEXP_CONTAINS(page, r'\.ht/') THEN 'Haiti' + WHEN REGEXP_CONTAINS(page, r'\.hm/') THEN 'Heard Island' + WHEN REGEXP_CONTAINS(page, r'\.hn/') THEN 'Honduras' + WHEN REGEXP_CONTAINS(page, r'\.hk/') THEN 'Hong Kong' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.in/|\.nic\.in/') THEN 'India' + WHEN REGEXP_CONTAINS(page, r'\.id/') THEN 'Indonesia' + WHEN REGEXP_CONTAINS(page, r'\.ir/') THEN 'Iran' + WHEN REGEXP_CONTAINS(page, r'\.iq/') THEN 'Iraq' + WHEN REGEXP_CONTAINS(page, r'\.il/') THEN 'Israel' + WHEN REGEXP_CONTAINS(page, r'\.it/') THEN 'Italy' + WHEN REGEXP_CONTAINS(page, r'\.jm/') THEN 'Jamaica' + WHEN REGEXP_CONTAINS(page, r'\.jp/') THEN 'Japan' + WHEN REGEXP_CONTAINS(page, r'\.je/') THEN 'Jersey' + WHEN REGEXP_CONTAINS(page, r'\.jo/') THEN 'Jordan' + WHEN REGEXP_CONTAINS(page, r'\.ke/') THEN 'Kenya' + WHEN REGEXP_CONTAINS(page, r'\.ki/') THEN 'Kiribati' + WHEN REGEXP_CONTAINS(page, r'\.kw/') THEN 'Kuwait' + WHEN REGEXP_CONTAINS(page, r'\.kg/') THEN 'Kyrgyzstan' + WHEN REGEXP_CONTAINS(page, r'\.la/') THEN 'Laos' + WHEN REGEXP_CONTAINS(page, r'\.lb/') THEN 'Lebanon' + WHEN REGEXP_CONTAINS(page, r'\.ls/') THEN 'Lesotho' + WHEN REGEXP_CONTAINS(page, r'\.lr/') THEN 'Liberia' + WHEN REGEXP_CONTAINS(page, r'\.ly/') THEN 'Libya' + WHEN REGEXP_CONTAINS(page, r'\.mo/') THEN 'Macau' + WHEN REGEXP_CONTAINS(page, r'\.mg/') THEN 'Madagascar' + WHEN REGEXP_CONTAINS(page, r'\.mw/') THEN 'Malawi' + WHEN REGEXP_CONTAINS(page, r'\.my/') THEN 'Malaysia' + WHEN REGEXP_CONTAINS(page, r'\.mv/') THEN 'Maldives' + WHEN REGEXP_CONTAINS(page, r'\.ml/') THEN 'Mali' + WHEN REGEXP_CONTAINS(page, r'\.mh/') THEN 'Marshall Islands' + WHEN REGEXP_CONTAINS(page, r'\.mq/') THEN 'Martinique' + WHEN REGEXP_CONTAINS(page, r'\.mr/') THEN 'Mauritania' + WHEN REGEXP_CONTAINS(page, r'\.mu/') THEN 'Mauritius' + WHEN REGEXP_CONTAINS(page, r'\.yt/') THEN 'Mayotte' + WHEN REGEXP_CONTAINS(page, r'\.mx/') THEN 'Mexico' + WHEN REGEXP_CONTAINS(page, r'\.mn/') THEN 'Mongolia' + WHEN REGEXP_CONTAINS(page, r'\.ms/') THEN 'Montserrat' + WHEN REGEXP_CONTAINS(page, r'\.ma/') THEN 'Morocco' + WHEN REGEXP_CONTAINS(page, r'\.mz/') THEN 'Mozambique' + WHEN REGEXP_CONTAINS(page, r'\.mm/') THEN 'Myanmar' + WHEN REGEXP_CONTAINS(page, r'\.na/') THEN 'Namibia' + WHEN REGEXP_CONTAINS(page, r'\.nr/') THEN 'Nauru' + WHEN REGEXP_CONTAINS(page, r'\.np/') THEN 'Nepal' + WHEN REGEXP_CONTAINS(page, r'\.nl/') THEN 'Netherlands' + WHEN REGEXP_CONTAINS(page, r'\.nc/') THEN 'New Caledonia' + WHEN REGEXP_CONTAINS(page, r'\.nz/') THEN 'New Zealand' + WHEN REGEXP_CONTAINS(page, r'\.ni/') THEN 'Nicaragua' + WHEN REGEXP_CONTAINS(page, r'\.ne/') THEN 'Niger' + WHEN REGEXP_CONTAINS(page, r'\.ng/') THEN 'Nigeria' + WHEN REGEXP_CONTAINS(page, r'\.nu/') THEN 'Niue' + WHEN REGEXP_CONTAINS(page, r'\.nf/') THEN 'Norfolk Island' + WHEN REGEXP_CONTAINS(page, r'\.kp/') THEN 'North Korea' + WHEN REGEXP_CONTAINS(page, r'\.mp/') THEN 'Northern Mariana Islands' + WHEN REGEXP_CONTAINS(page, r'\.om/') THEN 'Oman' + WHEN REGEXP_CONTAINS(page, r'\.pk/') THEN 'Pakistan' + WHEN REGEXP_CONTAINS(page, r'\.pw/') THEN 'Palau' + WHEN REGEXP_CONTAINS(page, r'\.ps/') THEN 'Palestine' + WHEN REGEXP_CONTAINS(page, r'\.pa/') THEN 'Panama' + 
WHEN REGEXP_CONTAINS(page, r'\.pg/') THEN 'Papua New Guinea' + WHEN REGEXP_CONTAINS(page, r'\.py/') THEN 'Paraguay' + WHEN REGEXP_CONTAINS(page, r'\.pe/') THEN 'Peru' + WHEN REGEXP_CONTAINS(page, r'\.ph/') THEN 'Philippines' + WHEN REGEXP_CONTAINS(page, r'\.pn/') THEN 'Pitcairn Islands' + WHEN REGEXP_CONTAINS(page, r'\.pr/') THEN 'Puerto Rico' + WHEN REGEXP_CONTAINS(page, r'\.qa/') THEN 'Qatar' + WHEN REGEXP_CONTAINS(page, r'\.rw/') THEN 'Rwanda' + WHEN REGEXP_CONTAINS(page, r'\.re/') THEN 'Réunion Island' + WHEN REGEXP_CONTAINS(page, r'\.sh/') THEN 'Saint Helena' + WHEN REGEXP_CONTAINS(page, r'\.kn/') THEN 'Saint Kitts and Nevis' + WHEN REGEXP_CONTAINS(page, r'\.lc/') THEN 'Saint Lucia' + WHEN REGEXP_CONTAINS(page, r'\.pm/') THEN 'Saint-Pierre and Miquelon' + WHEN REGEXP_CONTAINS(page, r'\.vc/') THEN 'Saint Vincent and the Grenadines' + WHEN REGEXP_CONTAINS(page, r'\.ws/') THEN 'Samoa' + WHEN REGEXP_CONTAINS(page, r'\.st/') THEN 'São Tomé and Príncipe' + WHEN REGEXP_CONTAINS(page, r'\.sa/') THEN 'Saudi Arabia' + WHEN REGEXP_CONTAINS(page, r'\.sn/') THEN 'Senegal' + WHEN REGEXP_CONTAINS(page, r'\.sc/') THEN 'Seychelles' + WHEN REGEXP_CONTAINS(page, r'\.sl/') THEN 'Sierra Leone' + WHEN REGEXP_CONTAINS(page, r'\.sg/') THEN 'Singapore' + WHEN REGEXP_CONTAINS(page, r'\.sx/') THEN 'Sint Maarten' + WHEN REGEXP_CONTAINS(page, r'\.sb/') THEN 'Solomon Islands' + WHEN REGEXP_CONTAINS(page, r'\.so/') THEN 'Somalia' + WHEN REGEXP_CONTAINS(page, r'\.za/') THEN 'South Africa' + WHEN REGEXP_CONTAINS(page, r'\.gs/') THEN 'South Georgia and the South Sandwich Islands' + WHEN REGEXP_CONTAINS(page, r'\.kr/') THEN 'South Korea' + WHEN REGEXP_CONTAINS(page, r'\.ss/') THEN 'South Sudan' + WHEN REGEXP_CONTAINS(page, r'\.es/') THEN 'Spain' + WHEN REGEXP_CONTAINS(page, r'\.lk/') THEN 'Sri Lanka' + WHEN REGEXP_CONTAINS(page, r'\.sd/') THEN 'Sudan' + WHEN REGEXP_CONTAINS(page, r'\.sr/') THEN 'Suriname' + WHEN REGEXP_CONTAINS(page, r'\.sz/') THEN 'Swaziland' + WHEN REGEXP_CONTAINS(page, r'\.se/') THEN 'Sweden' + WHEN REGEXP_CONTAINS(page, r'\.sy/') THEN 'Syria' + WHEN REGEXP_CONTAINS(page, r'\.gov\.tw/|\.taipei/') THEN 'Taiwan' + WHEN REGEXP_CONTAINS(page, r'\.tj/') THEN 'Tajikistan' + WHEN REGEXP_CONTAINS(page, r'\.tz/') THEN 'Tanzania' + WHEN REGEXP_CONTAINS(page, r'\.th/') THEN 'Thailand' + WHEN REGEXP_CONTAINS(page, r'\.tg/') THEN 'Togo' + WHEN REGEXP_CONTAINS(page, r'\.tk/') THEN 'Tokelau' + WHEN REGEXP_CONTAINS(page, r'\.to/') THEN 'Tonga' + WHEN REGEXP_CONTAINS(page, r'\.tt/') THEN 'Trinidad & Tobago' + WHEN REGEXP_CONTAINS(page, r'\.tn/') THEN 'Tunisia' + WHEN REGEXP_CONTAINS(page, r'\.tr/') THEN 'Turkey' + WHEN REGEXP_CONTAINS(page, r'\.tm/') THEN 'Turkmenistan' + WHEN REGEXP_CONTAINS(page, r'\.tc/') THEN 'Turks and Caicos Islands' + WHEN REGEXP_CONTAINS(page, r'\.tv/') THEN 'Tuvalu' + WHEN REGEXP_CONTAINS(page, r'\.ug/') THEN 'Uganda' + WHEN REGEXP_CONTAINS(page, r'\.ua/') THEN 'Ukraine' + WHEN REGEXP_CONTAINS(page, r'\.ae/') THEN 'United Arab Emirates (UAE)' + WHEN REGEXP_CONTAINS(page, r'\.vi/') THEN 'United States Virgin Islands' + WHEN REGEXP_CONTAINS(page, r'\.uy/') THEN 'Uruguay' + WHEN REGEXP_CONTAINS(page, r'\.uz/') THEN 'Uzbekistan' + WHEN REGEXP_CONTAINS(page, r'\.vu/') THEN 'Vanuatu' + WHEN REGEXP_CONTAINS(page, r'\.va/') THEN 'Vatican City' + WHEN REGEXP_CONTAINS(page, r'\.ve/') THEN 'Venezuela' + WHEN REGEXP_CONTAINS(page, r'\.vn/') THEN 'Vietnam' + WHEN REGEXP_CONTAINS(page, r'\.wf/') THEN 'Wallis and Futuna' + WHEN REGEXP_CONTAINS(page, r'\.eh/') THEN 'Western Sahara' + WHEN 
REGEXP_CONTAINS(page, r'\.ye/') THEN 'Yemen'
+      WHEN REGEXP_CONTAINS(page, r'\.zm/') THEN 'Zambia'
+      WHEN REGEXP_CONTAINS(page, r'\.zw/') THEN 'Zimbabwe'
+
+      -- All other .gov definitions will be American
+      WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)/') THEN 'United States (USA)'
+
+      ELSE 'Other'
+    END AS gov_domain,
+    is_root_page,
+    performance_score,
+    accessibility_score,
+    best_practices_score,
+    seo_score
+  FROM
+    score_data
+  WHERE
+    REGEXP_CONTAINS(page, r'('
+      '\\.un\\.org/' -- United Nations and International Organizations
+      '|\\.worldbank\\.org/'
+      '|\\.undp\\.org/'
+      '|\\.reliefweb\\.int/'
+      '|\\.who\\.int/'
+      '|\\.unfccc\\.int/'
+      '|\\.unccd\\.int/'
+      '|\\.unesco\\.org/'
+
+      '|\\.europa\\.eu/' -- European Union
+
+      '|\\.gov/' -- US Government
+      '|\\.mil/' -- US Military
+
+      '|\\.myflorida\\.com/' -- Florida
+
+      '|\\.(gov|mil|gouv|gob|gub|go|govt|gv|nic|government)\\.(taipei|[a-z]{2,3})/' -- Other generic government formats (e.g., gouv.fr, gob.mx, go.jp)
+
+      '|\\.gc\\.ca/' -- Canada and provinces
+      '|\\.canada\\.ca/'
+      '|\\.alberta\\.ca/'
+      '|\\.gov\\.ab\\.ca/'
+      '|\\.gov\\.bc\\.ca/'
+      '|\\.manitoba\\.ca/'
+      '|\\.gov\\.mb\\.ca/'
+      '|\\.gnb\\.ca/'
+      '|\\.gov\\.nb\\.ca/'
+      '|\\.gov\\.nl\\.ca/'
+      '|\\.novascotia\\.ca/'
+      '|\\.gov\\.ns\\.ca/'
+      '|\\.ontario\\.ca/'
+      '|\\.gov\\.on\\.ca/'
+      '|\\.gov\\.pe\\.ca/'
+      '|\\.quebec\\.ca/'
+      '|\\.gouv\\.qc\\.ca/'
+      '|\\.revenuquebec\\.ca/'
+      '|\\.saskatchewan\\.ca/'
+      '|\\.gov\\.sk\\.ca/'
+      '|\\.gov\\.nt\\.ca/'
+      '|\\.gov\\.nu\\.ca/'
+      '|\\.yukon\\.ca/'
+      '|\\.gov\\.yk\\.ca/'
+
+      '|\\.bund\\.de/' -- Germany
+
+      '|\\.belgium\\.be/' -- Belgium
+      '|\\.fgov\\.be/'
+      '|\\.vlaanderen\\.be/'
+      '|\\.wallonie\\.be/'
+      '|\\.brussels\\.be/'
+      '|\\.mil\\.be/'
+
+      '|\\.gov\\.se/' -- Sweden
+      '|\\.1177\\.se/'
+      '|\\.funktionstjanster\\.se/'
+      '|\\.hemnet\\.se/'
+      '|\\.smhi\\.se/'
+      '|\\.sverigesradio\\.se/'
+      '|\\.klart\\.se/'
+      '|\\.bankid\\.com/'
+      '|\\.synonymer\\.se/'
+      '|\\.arbetsformedlingen\\.se/'
+      '|\\.skatteverket\\.se/'
+      '|\\.schoolsoft\\.se/'
+      '|\\.postnord\\.se/'
+      '|\\.grandid\\.com/'
+      '|\\.viaplay\\.se/'
+      '|\\.skola24\\.se/'
+      '|\\.forsakringskassan\\.se/'
+      '|\\.vklass\\.se/'
+      '|\\.sl\\.se/'
+      '|\\.familjeliv\\.se/'
+
+      '|\\.regjeringen\\.no/' -- Norway
+      '|\\.stortinget\\.no/'
+      '|\\.nav\\.no/'
+      '|\\.helsenorge\\.no/'
+      '|\\.udir\\.no/'
+      '|\\.udi\\.no/'
+      '|\\.politi\\.no/'
+      '|\\.nve\\.no/'
+      '|\\.ssb\\.no/'
+      '|\\.miljodirektoratet\\.no/'
+      '|\\.arbeidstilsynet\\.no/'
+      '|\\.forsvaret\\.no/'
+      '|\\.skatteetaten\\.no/'
+      '|\\.brreg\\.no/'
+      '|\\.vegvesen\\.no/'
+      '|\\.mattilsynet\\.no/'
+      '|\\.lovdata\\.no/'
+      '|\\.altinn\\.no/'
+      '|\\.nkom\\.no/'
+      '|\\.fhi\\.no/'
+      '|\\.dsa\\.no/'
+      '|\\.kystverket\\.no/'
+      '|\\.bufdir\\.no/'
+      '|\\.nupi\\.no/'
+
+      '|\\.gov\\.gl/' -- Greenland
+      '|\\.naalakkersuisut\\.gl/'
+      '|\\.stat\\.gl/'
+      '|\\.oqaasileriffik\\.gl/'
+      '|\\.sullissivik\\.gl/'
+      '|\\.sisimiut\\.gl/'
+      '|\\.kalaallitnunaata\\.gl/'
+      '|\\.inatsisartut\\.gl/'
+      '|\\.politi\\.gl/'
+      '|\\.visitgreenland\\.gl/'
+      '|\\.energitjenesten\\.gl/'
+      '|\\.nusuka\\.gl/'
+      '|\\.telepost\\.gl/'
+      '|\\.kujalleq\\.gl/'
+      '|\\.sermersooq\\.gl/'
+      '|\\.aviisi\\.gl/'
+      '|\\.anjuma\\.gl/'
+      '|\\.kni\\.gl/'
+      '|\\.greenlandinstitute\\.gl/'
+      '|\\.mhs\\.gl/'
+      '|\\.iluarsartuiffik\\.gl/'
+      '|\\.royalgroenland\\.gl/'
+      '|\\.gux\\.gl/'
+      '|\\.univiseyisarti\\.gl/'
+      '|\\.arcticcommand\\.gl/'
+
+      '|\\.valtioneuvosto\\.fi/' -- Finland
+      '|\\.minedu\\.fi/'
+      '|\\.formin\\.fi/'
+      '|\\.intermin\\.fi/'
+      '|\\.suomi\\.fi/'
+      '|\\.ym\\.fi/'
+      '|\\.stm\\.fi/'
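+      -- The fragments above and below are adjacent string literals, which
+      -- BigQuery implicitly concatenates into a single regex alternation before
+      -- REGEXP_CONTAINS runs. A minimal, self-contained illustration of the same
+      -- technique (toy pattern and URL, not from this query):
+      --
+      --   SELECT REGEXP_CONTAINS('https://example.gov/', r'(' '\\.gov/' '|\\.mil/' ')');  -- TRUE
+      --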
'|\\.tem\\.fi/' + '|\\.lvm\\.fi/' + '|\\.mmm\\.fi/' + '|\\.okm\\.fi/' + '|\\.vm\\.fi/' + '|\\.defmin\\.fi/' + '|\\.oikeusministerio\\.fi/' + '|\\.um\\.fi/' + '|\\.vero\\.fi/' + '|\\.kela\\.fi/' + + '|\\.lrv\\.lt/' -- Lithuania + '|\\.uzt\\.lt/' + '|\\.migracija\\.lt/' + '|\\.kam\\.lt/' + '|\\.lrs\\.lt/' + '|\\.urm\\.lt/' + + '|\\.riik\\.ee/' -- Estonia + '|\\.riigiteataja\\.ee/' + '|\\.eesti\\.ee/' + '|\\.valitsus\\.ee/' + + '|\\.admin\\.ch/' -- Switzerland + + '|\\.seg-social\\.es/' -- Spain + '|\\.ine\\.es/' + '|\\.boe\\.es/' + + '|\\.ft\\.dk/' -- Denmark + '|\\.nemkonto\\.dk/' + '|\\.nemlog-in\\.dk/' + '|\\.mitid\\.dk/' + '|\\.digst\\.dk/' + '|\\.sikkerdigital\\.dk/' + '|\\.forsvaret\\.dk/' + '|\\.skat\\.dk/' + '|\\.stps\\.dk/' + '|\\.ufm\\.dk/' + '|\\.urm\\.dk/' + '|\\.uvm\\.dk/' + '|\\.politi\\.dk/' + '|\\.dataetiskraad\\.dk/' + '|\\.at\\.dk/' + '|\\.kum\\.dk/' + + '|\\.govvrn\\.ru/' -- Russia + + '|\\.public\\.lu/' -- Luxembourg + '|\\.etat\\.lu/' + + '|\\.governo\\.it/' -- Italy + + '|\\.overheid\\.nl/' -- Netherlands + '|\\.mijnoverheid\\.nl/' + + '|\\.govern\\.ad/' -- Andorra + '|\\.exteriors\\.ad/' + '|\\.consellgeneral\\.ad/' + + ')') +) + +SELECT + gov_domain, + AVG(performance_score) AS average_performance_score, + AVG(accessibility_score) AS average_accessibility_score, + AVG(best_practices_score) AS average_best_practices_score, + AVG(seo_score) AS average_seo_score, + COUNT(0) AS total_domains +FROM + domain_scores +GROUP BY + gov_domain +ORDER BY + average_accessibility_score DESC; diff --git a/sql/2024/accessibility/lighthouse_score_by_government_with_urls.sql b/sql/2024/accessibility/lighthouse_score_by_government_with_urls.sql new file mode 100644 index 00000000000..0141bf4dc5c --- /dev/null +++ b/sql/2024/accessibility/lighthouse_score_by_government_with_urls.sql @@ -0,0 +1,580 @@ +#standardSQL +# List all included government domains along with scores + +WITH score_data AS ( + SELECT + client, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score, + CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score, + page, + is_root_page, + wptid + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + lighthouse IS NOT NULL AND + lighthouse != '{}' +), + +domain_scores AS ( + SELECT + page, + CASE + -- United Nations + WHEN REGEXP_CONTAINS(page, r'\.un\.org/|\.worldbank\.org/|\.undp\.org/|\.reliefweb.int/|\.who.int/|\.unfccc.int/|\.unccd.int/|\.unesco.org/') THEN 'United Nations' + WHEN REGEXP_CONTAINS(page, r'\.europa\.eu/') THEN 'European Union' + + -- North American States and Provinces + WHEN REGEXP_CONTAINS(page, r'\.(alabama|al)\.gov/') THEN 'Alabama' + WHEN REGEXP_CONTAINS(page, r'\.(alaska|ak)\.gov/') THEN 'Alaska' + WHEN REGEXP_CONTAINS(page, r'\.(arizona|az)\.gov/') THEN 'Arizona' + WHEN REGEXP_CONTAINS(page, r'\.(arkansas|ar)\.gov/') THEN 'Arkansas' + WHEN REGEXP_CONTAINS(page, r'\.(california|ca)\.gov/') THEN 'California' + WHEN REGEXP_CONTAINS(page, r'\.(colorado|co)\.gov/') THEN 'Colorado' + WHEN REGEXP_CONTAINS(page, r'\.(connecticut|ct)\.gov/') THEN 'Connecticut' + WHEN REGEXP_CONTAINS(page, r'\.(delaware|de)\.gov/') THEN 'Delaware' + WHEN REGEXP_CONTAINS(page, r'\.(florida|fl)\.gov/|\.myflorida\.com/') THEN 'Florida' + WHEN REGEXP_CONTAINS(page, 
r'\.(georgia|ga)\.gov/') THEN 'Georgia State' -- To avoid confusion with the country
+      WHEN REGEXP_CONTAINS(page, r'\.(hawaii|hi|ehawaii)\.gov/') THEN 'Hawaii'
+      WHEN REGEXP_CONTAINS(page, r'\.(idaho|id)\.gov/') THEN 'Idaho'
+      WHEN REGEXP_CONTAINS(page, r'\.(illinois|il)\.gov/') THEN 'Illinois'
+      WHEN REGEXP_CONTAINS(page, r'\.(indiana|in)\.gov/') THEN 'Indiana'
+      WHEN REGEXP_CONTAINS(page, r'\.(iowa|ia)\.gov/') THEN 'Iowa'
+      WHEN REGEXP_CONTAINS(page, r'\.(kansas|ks)\.gov/') THEN 'Kansas'
+      WHEN REGEXP_CONTAINS(page, r'\.(kentucky|ky)\.gov/') THEN 'Kentucky'
+      WHEN REGEXP_CONTAINS(page, r'\.(louisiana|la)\.gov/') THEN 'Louisiana'
+      WHEN REGEXP_CONTAINS(page, r'\.(maine|me)\.gov/') THEN 'Maine'
+      WHEN REGEXP_CONTAINS(page, r'\.(maryland|md)\.gov/') THEN 'Maryland'
+      WHEN REGEXP_CONTAINS(page, r'\.(massachusetts|ma|mass)\.gov/') THEN 'Massachusetts'
+      WHEN REGEXP_CONTAINS(page, r'\.(michigan|mi)\.gov/') THEN 'Michigan'
+      WHEN REGEXP_CONTAINS(page, r'\.(minnesota|mn)\.gov/') THEN 'Minnesota' -- Also should factor in state.mn.us
+      WHEN REGEXP_CONTAINS(page, r'\.(mississippi|ms)\.gov/') THEN 'Mississippi'
+      WHEN REGEXP_CONTAINS(page, r'\.(missouri|mo)\.gov/') THEN 'Missouri'
+      WHEN REGEXP_CONTAINS(page, r'\.(montana|mt)\.gov/') THEN 'Montana'
+      WHEN REGEXP_CONTAINS(page, r'\.(nebraska|ne)\.gov/') THEN 'Nebraska'
+      WHEN REGEXP_CONTAINS(page, r'\.(nevada|nv)\.gov/') THEN 'Nevada'
+      WHEN REGEXP_CONTAINS(page, r'\.(newhampshire|nh)\.gov/') THEN 'New Hampshire'
+      WHEN REGEXP_CONTAINS(page, r'\.(newjersey|nj)\.gov/') THEN 'New Jersey'
+      WHEN REGEXP_CONTAINS(page, r'\.(newmexico|nm)\.gov/') THEN 'New Mexico'
+      WHEN REGEXP_CONTAINS(page, r'\.(newyork|ny)\.gov/') THEN 'New York'
+      WHEN REGEXP_CONTAINS(page, r'\.(northcarolina|nc)\.gov/') THEN 'North Carolina'
+      WHEN REGEXP_CONTAINS(page, r'\.(northdakota|nd)\.gov/') THEN 'North Dakota'
+      WHEN REGEXP_CONTAINS(page, r'\.(ohio|oh)\.gov/') THEN 'Ohio'
+      WHEN REGEXP_CONTAINS(page, r'\.(oklahoma|ok)\.gov/') THEN 'Oklahoma'
+      WHEN REGEXP_CONTAINS(page, r'\.(oregon|or)\.gov/') THEN 'Oregon'
+      WHEN REGEXP_CONTAINS(page, r'\.(pennsylvania|pa)\.gov/') THEN 'Pennsylvania'
+      WHEN REGEXP_CONTAINS(page, r'\.(rhodeisland|ri)\.gov/') THEN 'Rhode Island'
+      WHEN REGEXP_CONTAINS(page, r'\.(southcarolina|sc)\.gov/') THEN 'South Carolina'
+      WHEN REGEXP_CONTAINS(page, r'\.(southdakota|sd)\.gov/') THEN 'South Dakota'
+      WHEN REGEXP_CONTAINS(page, r'\.(tennessee|tn)\.gov/') THEN 'Tennessee'
+      WHEN REGEXP_CONTAINS(page, r'\.(texas|tx)\.gov/') THEN 'Texas'
+      WHEN REGEXP_CONTAINS(page, r'\.(utah|ut)\.gov/') THEN 'Utah'
+      WHEN REGEXP_CONTAINS(page, r'\.(vermont|vt)\.gov/') THEN 'Vermont'
+      WHEN REGEXP_CONTAINS(page, r'\.(virginia)\.gov/') THEN 'Virginia'
+      WHEN REGEXP_CONTAINS(page, r'\.(washington|wa)\.gov/') THEN 'Washington'
+      WHEN REGEXP_CONTAINS(page, r'\.(westvirginia|wv)\.gov/') THEN 'West Virginia'
+      WHEN REGEXP_CONTAINS(page, r'\.(wisconsin|wi)\.gov/') THEN 'Wisconsin'
+      WHEN REGEXP_CONTAINS(page, r'\.(wyoming|wy)\.gov/') THEN 'Wyoming'
+      WHEN REGEXP_CONTAINS(page, r'\.dc\.gov/') THEN 'DC'
+      WHEN REGEXP_CONTAINS(page, r'\.pr\.gov/') THEN 'Puerto Rico'
+      WHEN REGEXP_CONTAINS(page, r'\.guam\.gov/') THEN 'Guam'
+      WHEN REGEXP_CONTAINS(page, r'\.americansamoa\.gov/') THEN 'American Samoa'
+      -- USA .gov domains need to be at the very end so that all other instances catch them.
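+      -- CASE takes the first WHEN that matches, so specific patterns such as
+      -- state .gov domains must precede the generic US catch-all. A minimal
+      -- sketch of the ordering rule (toy URL, not from the dataset):
+      --
+      --   SELECT CASE
+      --     WHEN REGEXP_CONTAINS('https://www.texas.gov/', r'\.(texas|tx)\.gov/') THEN 'Texas'
+      --     WHEN REGEXP_CONTAINS('https://www.texas.gov/', r'\.(gov|mil)/') THEN 'United States (USA)'
+      --   END;  -- 'Texas', because the specific branch matches first
+      --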
+ WHEN REGEXP_CONTAINS(page, r'\.gob\.mx/') THEN 'Mexico' + WHEN REGEXP_CONTAINS(page, r'\.(gc\.ca|canada\.ca|alberta\.ca|gov\.ab\.ca|gov\.bc\.ca|manitoba\.ca|gov\.mb\.ca|gnb\.ca|gov\.nb\.ca|gov\.nl\.ca|novascotia\.ca|gov\.ns\.ca|ontario\.ca|gov\.on\.ca|gov\.pe\.ca|quebec\.ca|gouv\.qc\.ca|revenuquebec\.ca|saskatchewan\.ca|gov\.sk\.ca|gov\.nt\.ca|gov\.nu\.ca|yukon\.ca|gov\.yk\.ca)/') THEN 'Canada' + + -- European Countries + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.al/') THEN 'Albania' + WHEN REGEXP_CONTAINS(page, r'\.ax/') THEN 'Åland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ad/|\.govern\.ad/|\.exteriors\.ad/|\.consellgeneral\.ad/') THEN 'Andorra' + WHEN REGEXP_CONTAINS(page, r'\.am/') THEN 'Armenia' + WHEN REGEXP_CONTAINS(page, r'\.gv\.at/') THEN 'Austria' + WHEN REGEXP_CONTAINS(page, r'\.az/') THEN 'Azerbaijan' + WHEN REGEXP_CONTAINS(page, r'\.eus/') THEN 'Basque Country' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.by/') THEN 'Belarus' + WHEN REGEXP_CONTAINS(page, r'\.belgium\.be/|\.(gov|mil)\.be/|\.fgov\.be/|\.vlaanderen\.be/|\.wallonie\.be/|\.brussels\.be/|\.mil\.be/') THEN 'Belgium' + WHEN REGEXP_CONTAINS(page, r'\.ba/') THEN 'Bosnia and Herzegovina' + WHEN REGEXP_CONTAINS(page, r'\.government\.bg/') THEN 'Bulgaria' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.hr/') THEN 'Croatia' + WHEN REGEXP_CONTAINS(page, r'\.cy/') THEN 'Cyprus' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.cz/') THEN 'Czechia (Czech Republic)' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.dk/|\.ft\.dk/|\.nemkonto\.dk/|\.nemlog-in\.dk/|\.mitid\.dk/|\.digst\.dk/|\.sikkerdigital\.dk/|\.forsvaret\.dk/|\.skat\.dk/|\.stps\.dk/|\.ufm\.dk/|\.urm\.dk/|\.uvm\.dk/|\.politi\.dk/|\.dataetiskraad\.dk/|\.at\.dk/|\.kum\.dk/') THEN 'Denmark' + WHEN REGEXP_CONTAINS(page, r'\.riik\.ee/|\.riigiteataja\.ee/|\.eesti\.ee/|\.valitsus\.ee/') THEN 'Estonia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.fi/|\.valtioneuvosto\.fi/|\.minedu\.fi/|\.formin\.fi/|\.intermin\.fi/|\.suomi\.fi/|\.ym\.fi/|\.stm\.fi/|\.tem\.fi/|\.lvm\.fi/|\.mmm\.fi/|\.okm\.fi/|\.vm\.fi/|\.defmin\.fi/|\.oikeusministerio\.fi/|\.um\.fi/|\.vero\.fi/|\.kela\.fi/') THEN 'Finland' + WHEN REGEXP_CONTAINS(page, r'\.gouv\.fr/') THEN 'France' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ge/') THEN 'Georgia Country' + WHEN REGEXP_CONTAINS(page, r'\.bund\.de/') THEN 'Germany' + WHEN REGEXP_CONTAINS(page, r'\.gi/') THEN 'Gibraltar' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.gr/') THEN 'Greece' + WHEN REGEXP_CONTAINS(page, r'\.gov\.gl/|\.naalakkersuisut\.gl/|\.stat\.gl/|\.oqaasileriffik\.gl/|\.sullissivik\.gl/|\.sisimiut\.gl/|\.kalaallitnunaata\.gl/|\.inatsisartut\.gl/|\.politi\.gl/|\.visitgreenland\.gl/|\.energitjenesten\.gl/|\.nusuka\.gl/|\.telepost\.gl/|\.kujalleq\.gl/|\.sermersooq\.gl/|\.aviisi\.gl/|\.anjuma\.gl/|\.kni\.gl/|\.greenlandinstitute\.gl/|\.mhs\.gl/|\.iluarsartuiffik\.gl/|\.royalgroenland\.gl/|\.gux\.gl/|\.univiseyisarti\.gl/|\.arcticcommand\.gl(/|\.$)') THEN 'Greenland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.hu/') THEN 'Hungary' + WHEN REGEXP_CONTAINS(page, r'\.is/') THEN 'Iceland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ie/') THEN 'Ireland' + WHEN REGEXP_CONTAINS(page, r'\.im/') THEN 'Isle of Man' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.it/|\.governo\.it/') THEN 'Italy' + WHEN REGEXP_CONTAINS(page, r'\.kz/') THEN 'Kazakhstan' + WHEN REGEXP_CONTAINS(page, r'\.lv/') THEN 'Latvia' + WHEN REGEXP_CONTAINS(page, r'\.li/') THEN 'Liechtenstein' + WHEN REGEXP_CONTAINS(page, 
r'\.(gov|mil)\.lt/|\.vrm\.lt/|\.sam\.lt/|\.ukmin\.lt/|\.lrv\.lt/|\.uzt\.lt/|\.migracija\.lt/|\.kam\.lt/|\.lrs\.lt/|\.urm\.lt/') THEN 'Lithuania' + WHEN REGEXP_CONTAINS(page, r'\.public\.lu/|\.etat\.lu/') THEN 'Luxembourg' + WHEN REGEXP_CONTAINS(page, r'\.mt/') THEN 'Malta' + WHEN REGEXP_CONTAINS(page, r'\.md/') THEN 'Moldova' + WHEN REGEXP_CONTAINS(page, r'\.mc/') THEN 'Monaco' + WHEN REGEXP_CONTAINS(page, r'\.me/') THEN 'Montenegro' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.nl/|\.overheid\.nl/|\.mijnoverheid\.nl/') THEN 'Netherlands' + WHEN REGEXP_CONTAINS(page, r'\.mk/') THEN 'Macedonia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.no/|\.regjeringen\.no/|\.stortinget\.no/|\.nav\.no/|\.helsenorge\.no/|\.udir\.no/|\.udi\.no/|\.politi\.no/|\.nve\.no/|\.ssb\.no/|\.norges-bank\.no/|\.miljodirektoratet\.no/|\.arbeidstilsynet\.no/|\.forsvaret\.no/|\.skatteetaten\.no/|\.brreg\.no/|\.vegvesen\.no/|\.mattilsynet\.no/|\.lovdata\.no/|\.altinn\.no/|\.nkom\.no/|\.fhi\.no/|\.dsa\.no/|\.kystverket\.no/|\.bufdir\.no/|\.nupi\.no(/|\.$)') THEN 'Norway' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.pl/') THEN 'Poland' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.pt/') THEN 'Portugal' + WHEN REGEXP_CONTAINS(page, r'\.ro/') THEN 'Romania' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.ru/|\.govvrn\.ru/') THEN 'Russia' + WHEN REGEXP_CONTAINS(page, r'\.sm/') THEN 'San Marino' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.rs/') THEN 'Serbia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.sk/') THEN 'Slovakia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.si/') THEN 'Slovenia' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.es|gob\.es|ine\.es|boe\.es/') THEN 'Spain' + WHEN REGEXP_CONTAINS(page, r'\.sj/') THEN 'Svalbard and Jan Mayen Islands' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.se/|\.1177\.se/|\.funktionstjanster\.se/|\.hemnet\.se/|\.smhi\.se/|\.sverigesradio\.se/|\.klart\.se/|\.bankid\.com/|\.synonymer\.se/|\.arbetsformedlingen\.se/|\.skatteverket\.se/|\.schoolsoft\.se/|\.postnord\.se/|\.grandid\.com/|\.viaplay\.se/|\.skola24\.se/|\.forsakringskassan\.se/|\.vklass\.se|sl\.se/|\.familjeliv\.se(/|\.$)') THEN 'Sweden' + WHEN REGEXP_CONTAINS(page, r'\.admin\.ch/') THEN 'Switzerland' + WHEN REGEXP_CONTAINS(page, r'\.gv\.ua/') THEN 'Ukraine' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.uk/') THEN 'United Kingdom (UK)' + + -- Other Countries + WHEN REGEXP_CONTAINS(page, r'\.af/') THEN 'Afghanistan' + WHEN REGEXP_CONTAINS(page, r'\.dz/') THEN 'Algeria' + WHEN REGEXP_CONTAINS(page, r'\.as/') THEN 'American Samoa' + WHEN REGEXP_CONTAINS(page, r'\.ao/') THEN 'Angola' + WHEN REGEXP_CONTAINS(page, r'\.ai/') THEN 'Anguilla' + WHEN REGEXP_CONTAINS(page, r'\.aq/') THEN 'Antarctica' + WHEN REGEXP_CONTAINS(page, r'\.ag/') THEN 'Antigua and Barbuda' + WHEN REGEXP_CONTAINS(page, r'\.(mil|gov|gob|gub)\.ar/') THEN 'Argentina' + WHEN REGEXP_CONTAINS(page, r'\.aw/') THEN 'Aruba' + WHEN REGEXP_CONTAINS(page, r'\.ac/') THEN 'Ascension Island' + WHEN REGEXP_CONTAINS(page, r'\.au/') THEN 'Australia' + WHEN REGEXP_CONTAINS(page, r'\.bs/') THEN 'Bahamas' + WHEN REGEXP_CONTAINS(page, r'\.bh/') THEN 'Bahrain' + WHEN REGEXP_CONTAINS(page, r'\.bd/') THEN 'Bangladesh' + WHEN REGEXP_CONTAINS(page, r'\.bb/') THEN 'Barbados' + WHEN REGEXP_CONTAINS(page, r'\.bz/') THEN 'Belize' + WHEN REGEXP_CONTAINS(page, r'\.bj/') THEN 'Benin' + WHEN REGEXP_CONTAINS(page, r'\.bm/') THEN 'Bermuda' + WHEN REGEXP_CONTAINS(page, r'\.bt/') THEN 'Bhutan' + WHEN REGEXP_CONTAINS(page, r'\.bo/') THEN 'Bolivia' + WHEN REGEXP_CONTAINS(page, r'\.bq/') THEN 'Bonaire' + WHEN 
REGEXP_CONTAINS(page, r'\.bw/') THEN 'Botswana' + WHEN REGEXP_CONTAINS(page, r'\.bv/') THEN 'Bouvet Island' + WHEN REGEXP_CONTAINS(page, r'\.br/') THEN 'Brazil' + WHEN REGEXP_CONTAINS(page, r'\.io/') THEN 'British Indian Ocean Territory' + WHEN REGEXP_CONTAINS(page, r'\.vg/') THEN 'British Virgin Islands' + WHEN REGEXP_CONTAINS(page, r'\.bn/') THEN 'Brunei' + WHEN REGEXP_CONTAINS(page, r'\.bf/') THEN 'Burkina Faso' + WHEN REGEXP_CONTAINS(page, r'\.mm/') THEN 'Burma (officially: Myanmar)' + WHEN REGEXP_CONTAINS(page, r'\.bi/') THEN 'Burundi' + WHEN REGEXP_CONTAINS(page, r'\.kh/') THEN 'Cambodia' + WHEN REGEXP_CONTAINS(page, r'\.cm/') THEN 'Cameroon' + WHEN REGEXP_CONTAINS(page, r'\.ca/') THEN 'Canada' + WHEN REGEXP_CONTAINS(page, r'\.cv/') THEN 'Cape Verde (in Portuguese: Cabo Verde)' + WHEN REGEXP_CONTAINS(page, r'\.cat/') THEN 'Catalonia' + WHEN REGEXP_CONTAINS(page, r'\.ky/') THEN 'Cayman Islands' + WHEN REGEXP_CONTAINS(page, r'\.cf/') THEN 'Central African Republic' + WHEN REGEXP_CONTAINS(page, r'\.td/') THEN 'Chad' + WHEN REGEXP_CONTAINS(page, r'\.cl/') THEN 'Chile' + WHEN REGEXP_CONTAINS(page, r'\.cn/') THEN 'China' + WHEN REGEXP_CONTAINS(page, r'\.cx/') THEN 'Christmas Island' + WHEN REGEXP_CONTAINS(page, r'\.cc/') THEN 'Cocos (Keeling) Islands' + WHEN REGEXP_CONTAINS(page, r'\.co/') THEN 'Colombia' + WHEN REGEXP_CONTAINS(page, r'\.km/') THEN 'Comoros' + WHEN REGEXP_CONTAINS(page, r'\.cd/') THEN 'Congo (Congo-Kinshasa)' + WHEN REGEXP_CONTAINS(page, r'\.cg/') THEN 'Congo (Congo-Brazzaville)' + WHEN REGEXP_CONTAINS(page, r'\.ck/') THEN 'Cook Islands' + WHEN REGEXP_CONTAINS(page, r'\.cr/') THEN 'Costa Rica' + WHEN REGEXP_CONTAINS(page, r'\.ci/') THEN 'Ivory Coast' + WHEN REGEXP_CONTAINS(page, r'\.cu/') THEN 'Cuba' + WHEN REGEXP_CONTAINS(page, r'\.cw/') THEN 'Curaçao' + WHEN REGEXP_CONTAINS(page, r'\.dj/') THEN 'Djibouti' + WHEN REGEXP_CONTAINS(page, r'\.dm/') THEN 'Dominica' + WHEN REGEXP_CONTAINS(page, r'\.do/') THEN 'Dominican Republic' + WHEN REGEXP_CONTAINS(page, r'\.tl/') THEN 'East Timor (Timor-Leste)' + WHEN REGEXP_CONTAINS(page, r'\.tp/') THEN 'East Timor (Timor-Leste)' + WHEN REGEXP_CONTAINS(page, r'\.ec/') THEN 'Ecuador' + WHEN REGEXP_CONTAINS(page, r'\.eg/') THEN 'Egypt' + WHEN REGEXP_CONTAINS(page, r'\.sv/') THEN 'El Salvador' + WHEN REGEXP_CONTAINS(page, r'\.gq/') THEN 'Equatorial Guinea' + WHEN REGEXP_CONTAINS(page, r'\.er/') THEN 'Eritrea' + WHEN REGEXP_CONTAINS(page, r'\.et/') THEN 'Ethiopia' + WHEN REGEXP_CONTAINS(page, r'\.eu/') THEN 'European Union' + WHEN REGEXP_CONTAINS(page, r'\.fk/') THEN 'Falkland Islands' + WHEN REGEXP_CONTAINS(page, r'\.fo/') THEN 'Faeroe Islands' + WHEN REGEXP_CONTAINS(page, r'\.fm/') THEN 'Federated States of Micronesia' + WHEN REGEXP_CONTAINS(page, r'\.fj/') THEN 'Fiji' + WHEN REGEXP_CONTAINS(page, r'\.gf/') THEN 'French Guiana' + WHEN REGEXP_CONTAINS(page, r'\.pf/') THEN 'French Polynesia' + WHEN REGEXP_CONTAINS(page, r'\.tf/') THEN 'French Southern and Antarctic Lands' + WHEN REGEXP_CONTAINS(page, r'\.ga/') THEN 'Gabon' + WHEN REGEXP_CONTAINS(page, r'\.gal/') THEN 'Galicia' + WHEN REGEXP_CONTAINS(page, r'\.gm/') THEN 'Gambia' + WHEN REGEXP_CONTAINS(page, r'\.ps/') THEN 'Gaza' + WHEN REGEXP_CONTAINS(page, r'\.gh/') THEN 'Ghana' + WHEN REGEXP_CONTAINS(page, r'\.gl/') THEN 'Greenland' + WHEN REGEXP_CONTAINS(page, r'\.gd/') THEN 'Grenada' + WHEN REGEXP_CONTAINS(page, r'\.gp/') THEN 'Guadeloupe' + WHEN REGEXP_CONTAINS(page, r'\.gu/') THEN 'Guam' + WHEN REGEXP_CONTAINS(page, r'\.gt/') THEN 'Guatemala' + WHEN REGEXP_CONTAINS(page, r'\.gg/') THEN 
'Guernsey' + WHEN REGEXP_CONTAINS(page, r'\.gn/') THEN 'Guinea' + WHEN REGEXP_CONTAINS(page, r'\.gw/') THEN 'Guinea-Bissau' + WHEN REGEXP_CONTAINS(page, r'\.gy/') THEN 'Guyana' + WHEN REGEXP_CONTAINS(page, r'\.ht/') THEN 'Haiti' + WHEN REGEXP_CONTAINS(page, r'\.hm/') THEN 'Heard Island' + WHEN REGEXP_CONTAINS(page, r'\.hn/') THEN 'Honduras' + WHEN REGEXP_CONTAINS(page, r'\.hk/') THEN 'Hong Kong' + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)\.in/|\.nic\.in/') THEN 'India' + WHEN REGEXP_CONTAINS(page, r'\.id/') THEN 'Indonesia' + WHEN REGEXP_CONTAINS(page, r'\.ir/') THEN 'Iran' + WHEN REGEXP_CONTAINS(page, r'\.iq/') THEN 'Iraq' + WHEN REGEXP_CONTAINS(page, r'\.il/') THEN 'Israel' + WHEN REGEXP_CONTAINS(page, r'\.it/') THEN 'Italy' + WHEN REGEXP_CONTAINS(page, r'\.jm/') THEN 'Jamaica' + WHEN REGEXP_CONTAINS(page, r'\.jp/') THEN 'Japan' + WHEN REGEXP_CONTAINS(page, r'\.je/') THEN 'Jersey' + WHEN REGEXP_CONTAINS(page, r'\.jo/') THEN 'Jordan' + WHEN REGEXP_CONTAINS(page, r'\.ke/') THEN 'Kenya' + WHEN REGEXP_CONTAINS(page, r'\.ki/') THEN 'Kiribati' + WHEN REGEXP_CONTAINS(page, r'\.kw/') THEN 'Kuwait' + WHEN REGEXP_CONTAINS(page, r'\.kg/') THEN 'Kyrgyzstan' + WHEN REGEXP_CONTAINS(page, r'\.la/') THEN 'Laos' + WHEN REGEXP_CONTAINS(page, r'\.lb/') THEN 'Lebanon' + WHEN REGEXP_CONTAINS(page, r'\.ls/') THEN 'Lesotho' + WHEN REGEXP_CONTAINS(page, r'\.lr/') THEN 'Liberia' + WHEN REGEXP_CONTAINS(page, r'\.ly/') THEN 'Libya' + WHEN REGEXP_CONTAINS(page, r'\.mo/') THEN 'Macau' + WHEN REGEXP_CONTAINS(page, r'\.mg/') THEN 'Madagascar' + WHEN REGEXP_CONTAINS(page, r'\.mw/') THEN 'Malawi' + WHEN REGEXP_CONTAINS(page, r'\.my/') THEN 'Malaysia' + WHEN REGEXP_CONTAINS(page, r'\.mv/') THEN 'Maldives' + WHEN REGEXP_CONTAINS(page, r'\.ml/') THEN 'Mali' + WHEN REGEXP_CONTAINS(page, r'\.mh/') THEN 'Marshall Islands' + WHEN REGEXP_CONTAINS(page, r'\.mq/') THEN 'Martinique' + WHEN REGEXP_CONTAINS(page, r'\.mr/') THEN 'Mauritania' + WHEN REGEXP_CONTAINS(page, r'\.mu/') THEN 'Mauritius' + WHEN REGEXP_CONTAINS(page, r'\.yt/') THEN 'Mayotte' + WHEN REGEXP_CONTAINS(page, r'\.mx/') THEN 'Mexico' + WHEN REGEXP_CONTAINS(page, r'\.mn/') THEN 'Mongolia' + WHEN REGEXP_CONTAINS(page, r'\.ms/') THEN 'Montserrat' + WHEN REGEXP_CONTAINS(page, r'\.ma/') THEN 'Morocco' + WHEN REGEXP_CONTAINS(page, r'\.mz/') THEN 'Mozambique' + WHEN REGEXP_CONTAINS(page, r'\.mm/') THEN 'Myanmar' + WHEN REGEXP_CONTAINS(page, r'\.na/') THEN 'Namibia' + WHEN REGEXP_CONTAINS(page, r'\.nr/') THEN 'Nauru' + WHEN REGEXP_CONTAINS(page, r'\.np/') THEN 'Nepal' + WHEN REGEXP_CONTAINS(page, r'\.nl/') THEN 'Netherlands' + WHEN REGEXP_CONTAINS(page, r'\.nc/') THEN 'New Caledonia' + WHEN REGEXP_CONTAINS(page, r'\.nz/') THEN 'New Zealand' + WHEN REGEXP_CONTAINS(page, r'\.ni/') THEN 'Nicaragua' + WHEN REGEXP_CONTAINS(page, r'\.ne/') THEN 'Niger' + WHEN REGEXP_CONTAINS(page, r'\.ng/') THEN 'Nigeria' + WHEN REGEXP_CONTAINS(page, r'\.nu/') THEN 'Niue' + WHEN REGEXP_CONTAINS(page, r'\.nf/') THEN 'Norfolk Island' + WHEN REGEXP_CONTAINS(page, r'\.kp/') THEN 'North Korea' + WHEN REGEXP_CONTAINS(page, r'\.mp/') THEN 'Northern Mariana Islands' + WHEN REGEXP_CONTAINS(page, r'\.om/') THEN 'Oman' + WHEN REGEXP_CONTAINS(page, r'\.pk/') THEN 'Pakistan' + WHEN REGEXP_CONTAINS(page, r'\.pw/') THEN 'Palau' + WHEN REGEXP_CONTAINS(page, r'\.ps/') THEN 'Palestine' + WHEN REGEXP_CONTAINS(page, r'\.pa/') THEN 'Panama' + WHEN REGEXP_CONTAINS(page, r'\.pg/') THEN 'Papua New Guinea' + WHEN REGEXP_CONTAINS(page, r'\.py/') THEN 'Paraguay' + WHEN REGEXP_CONTAINS(page, r'\.pe/') THEN 'Peru' + WHEN 
REGEXP_CONTAINS(page, r'\.ph/') THEN 'Philippines'
+    WHEN REGEXP_CONTAINS(page, r'\.pn/') THEN 'Pitcairn Islands'
+    WHEN REGEXP_CONTAINS(page, r'\.pr/') THEN 'Puerto Rico'
+    WHEN REGEXP_CONTAINS(page, r'\.qa/') THEN 'Qatar'
+    WHEN REGEXP_CONTAINS(page, r'\.rw/') THEN 'Rwanda'
+    WHEN REGEXP_CONTAINS(page, r'\.re/') THEN 'Réunion Island'
+    WHEN REGEXP_CONTAINS(page, r'\.sh/') THEN 'Saint Helena'
+    WHEN REGEXP_CONTAINS(page, r'\.kn/') THEN 'Saint Kitts and Nevis'
+    WHEN REGEXP_CONTAINS(page, r'\.lc/') THEN 'Saint Lucia'
+    WHEN REGEXP_CONTAINS(page, r'\.pm/') THEN 'Saint-Pierre and Miquelon'
+    WHEN REGEXP_CONTAINS(page, r'\.vc/') THEN 'Saint Vincent and the Grenadines'
+    WHEN REGEXP_CONTAINS(page, r'\.ws/') THEN 'Samoa'
+    WHEN REGEXP_CONTAINS(page, r'\.st/') THEN 'São Tomé and Príncipe'
+    WHEN REGEXP_CONTAINS(page, r'\.sa/') THEN 'Saudi Arabia'
+    WHEN REGEXP_CONTAINS(page, r'\.sn/') THEN 'Senegal'
+    WHEN REGEXP_CONTAINS(page, r'\.sc/') THEN 'Seychelles'
+    WHEN REGEXP_CONTAINS(page, r'\.sl/') THEN 'Sierra Leone'
+    WHEN REGEXP_CONTAINS(page, r'\.sg/') THEN 'Singapore'
+    WHEN REGEXP_CONTAINS(page, r'\.sx/') THEN 'Sint Maarten'
+    WHEN REGEXP_CONTAINS(page, r'\.sb/') THEN 'Solomon Islands'
+    WHEN REGEXP_CONTAINS(page, r'\.so/') THEN 'Somalia'
+    WHEN REGEXP_CONTAINS(page, r'\.za/') THEN 'South Africa'
+    WHEN REGEXP_CONTAINS(page, r'\.gs/') THEN 'South Georgia and the South Sandwich Islands'
+    WHEN REGEXP_CONTAINS(page, r'\.kr/') THEN 'South Korea'
+    WHEN REGEXP_CONTAINS(page, r'\.ss/') THEN 'South Sudan'
+    WHEN REGEXP_CONTAINS(page, r'\.es/') THEN 'Spain'
+    WHEN REGEXP_CONTAINS(page, r'\.lk/') THEN 'Sri Lanka'
+    WHEN REGEXP_CONTAINS(page, r'\.sd/') THEN 'Sudan'
+    WHEN REGEXP_CONTAINS(page, r'\.sr/') THEN 'Suriname'
+    WHEN REGEXP_CONTAINS(page, r'\.sz/') THEN 'Swaziland'
+    WHEN REGEXP_CONTAINS(page, r'\.se/') THEN 'Sweden'
+    WHEN REGEXP_CONTAINS(page, r'\.sy/') THEN 'Syria'
+    WHEN REGEXP_CONTAINS(page, r'\.gov\.tw/|\.taipei/') THEN 'Taiwan'
+    WHEN REGEXP_CONTAINS(page, r'\.tj/') THEN 'Tajikistan'
+    WHEN REGEXP_CONTAINS(page, r'\.tz/') THEN 'Tanzania'
+    WHEN REGEXP_CONTAINS(page, r'\.th/') THEN 'Thailand'
+    WHEN REGEXP_CONTAINS(page, r'\.tg/') THEN 'Togo'
+    WHEN REGEXP_CONTAINS(page, r'\.tk/') THEN 'Tokelau'
+    WHEN REGEXP_CONTAINS(page, r'\.to/') THEN 'Tonga'
+    WHEN REGEXP_CONTAINS(page, r'\.tt/') THEN 'Trinidad & Tobago'
+    WHEN REGEXP_CONTAINS(page, r'\.tn/') THEN 'Tunisia'
+    WHEN REGEXP_CONTAINS(page, r'\.tr/') THEN 'Turkey'
+    WHEN REGEXP_CONTAINS(page, r'\.tm/') THEN 'Turkmenistan'
+    WHEN REGEXP_CONTAINS(page, r'\.tc/') THEN 'Turks and Caicos Islands'
+    WHEN REGEXP_CONTAINS(page, r'\.tv/') THEN 'Tuvalu'
+    WHEN REGEXP_CONTAINS(page, r'\.ug/') THEN 'Uganda'
+    WHEN REGEXP_CONTAINS(page, r'\.ua/') THEN 'Ukraine'
+    WHEN REGEXP_CONTAINS(page, r'\.ae/') THEN 'United Arab Emirates (UAE)'
+    WHEN REGEXP_CONTAINS(page, r'\.vi/') THEN 'United States Virgin Islands'
+    WHEN REGEXP_CONTAINS(page, r'\.uy/') THEN 'Uruguay'
+    WHEN REGEXP_CONTAINS(page, r'\.uz/') THEN 'Uzbekistan'
+    WHEN REGEXP_CONTAINS(page, r'\.vu/') THEN 'Vanuatu'
+    WHEN REGEXP_CONTAINS(page, r'\.va/') THEN 'Vatican City'
+    WHEN REGEXP_CONTAINS(page, r'\.ve/') THEN 'Venezuela'
+    WHEN REGEXP_CONTAINS(page, r'\.vn/') THEN 'Vietnam'
+    WHEN REGEXP_CONTAINS(page, r'\.wf/') THEN 'Wallis and Futuna'
+    WHEN REGEXP_CONTAINS(page, r'\.eh/') THEN 'Western Sahara'
+    WHEN REGEXP_CONTAINS(page, r'\.ye/') THEN 'Yemen'
+    WHEN REGEXP_CONTAINS(page, r'\.zm/') THEN 'Zambia'
+    WHEN REGEXP_CONTAINS(page, r'\.zw/') THEN 'Zimbabwe'
+
+    -- All other .gov definitions will
be American + WHEN REGEXP_CONTAINS(page, r'\.(gov|mil)/') THEN 'United States (USA)' + + ELSE 'Other' + END AS gov_domain, + is_root_page, + performance_score, + accessibility_score, + best_practices_score, + seo_score, + wptid + FROM + score_data + WHERE + REGEXP_CONTAINS(page, r'(' + '\\.un\\.org/' -- United Nations and International Organizations + '|\\.worldbank\\.org/' + '|\\.undp\\.org/' + '|\\.reliefweb\\.int/' + '|\\.who\\.int/' + '|\\.unfccc\\.int/' + '|\\.unccd\\.int/' + '|\\.unesco\\.org/' + + '|\\.europa\\.eu/' -- European Union + + '|\\.gov/' -- US Government + '|\\.mil/' -- US Military + + '|\\.myflorida\\.com/' -- Florida + + '|\\.(gov|mil|gouv|gob|gub|go|govt|gv|nic|government)\\.(taipei|[a-z]{2,3})/' -- Other generic government formats (e.g., gouv.fr, gob.mx, go.jp) + + + '|\\.gc\\.ca/' -- Canada and provinces + '|\\.canada\\.ca/' + '|\\.alberta\\.ca/' + '|\\.gov\\.ab\\.ca/' + '|\\.gov\\.bc\\.ca/' + '|\\.manitoba\\.ca/' + '|\\.gov\\.mb\\.ca/' + '|\\.gnb\\.ca/' + '|\\.gov\\.nb\\.ca/' + '|\\.gov\\.nl\\.ca/' + '|\\.novascotia\\.ca/' + '|\\.gov\\.ns\\.ca/' + '|\\.ontario\\.ca/' + '|\\.gov\\.on\\.ca/' + '|\\.gov\\.pe\\.ca/' + '|\\.quebec\\.ca/' + '|\\.gouv\\.qc\\.ca/' + '|\\.revenuquebec\\.ca/' + '|\\.saskatchewan\\.ca/' + '|\\.gov\\.sk\\.ca/' + '|\\.gov\\.nt\\.ca/' + '|\\.gov\\.nu\\.ca/' + '|\\.yukon\\.ca/' + '|\\.gov\\.yk\\.ca/' + + '|\\.bund\\.de/' -- Germany + + '|\\.belgium\\.be/' -- Belgium + '|\\.fgov\\.be/' + '|\\.vlaanderen\\.be/' + '|\\.wallonie\\.be/' + '|\\.brussels\\.be/' + '|\\.mil\\.be/' + + '|\\.gov\\.se/' -- Sweden + '|\\.1177\\.se/' + '|\\.funktionstjanster\\.se/' + '|\\.hemnet\\.se/' + '|\\.smhi\\.se/' + '|\\.sverigesradio\\.se/' + '|\\.klart\\.se/' + '|\\.bankid\\.com/' + '|\\.synonymer\\.se/' + '|\\.arbetsformedlingen\\.se/' + '|\\.skatteverket\\.se/' + '|\\.schoolsoft\\.se/' + '|\\.postnord\\.se/' + '|\\.grandid\\.com/' + '|\\.viaplay\\.se/' + '|\\.skola24\\.se/' + '|\\.forsakringskassan\\.se/' + '|\\.vklass\\.se/' + '|\\.sl\\.se/' + '|\\.familjeliv\\.se/' + + '|\\.regjeringen\\.no/' -- Norway + '|\\.stortinget\\.no/' + '|\\.nav\\.no/' + '|\\.helsenorge\\.no/' + '|\\.udir\\.no/' + '|\\.udi\\.no/' + '|\\.politi\\.no/' + '|\\.nve\\.no/' + '|\\.ssb\\.no/' + '|\\.miljodirektoratet\\.no/' + '|\\.arbeidstilsynet\\.no/' + '|\\.forsvaret\\.no/' + '|\\.skatteetaten\\.no/' + '|\\.brreg\\.no/' + '|\\.vegvesen\\.no/' + '|\\.mattilsynet\\.no/' + '|\\.lovdata\\.no/' + '|\\.altinn\\.no/' + '|\\.nkom\\.no/' + '|\\.fhi\\.no/' + '|\\.dsa\\.no/' + '|\\.kystverket\\.no/' + '|\\.bufdir\\.no/' + '|\\.nupi\\.no/' + + '|\\.gov\\.gl/' -- Greenland + '|\\.naalakkersuisut\\.gl/' + '|\\.stat\\.gl/' + '|\\.oqaasileriffik\\.gl/' + '|\\.sullissivik\\.gl/' + '|\\.sisimiut\\.gl/' + '|\\.kalaallitnunaata\\.gl/' + '|\\.inatsisartut\\.gl/' + '|\\.politi\\.gl/' + '|\\.visitgreenland\\.gl/' + '|\\.energitjenesten\\.gl/' + '|\\.nusuka\\.gl/' + '|\\.telepost\\.gl/' + '|\\.kujalleq\\.gl/' + '|\\.sermersooq\\.gl/' + '|\\.aviisi\\.gl/' + '|\\.anjuma\\.gl/' + '|\\.kni\\.gl/' + '|\\.greenlandinstitute\\.gl/' + '|\\.mhs\\.gl/' + '|\\.iluarsartuiffik\\.gl/' + '|\\.royalgroenland\\.gl/' + '|\\.gux\\.gl/' + '|\\.univiseyisarti\\.gl/' + '|\\.arcticcommand\\.gl/' + + '|\\.valtioneuvosto\\.fi/' -- Finland + '|\\.minedu\\.fi/' + '|\\.formin\\.fi/' + '|\\.intermin\\.fi/' + '|\\.suomi\\.fi/' + '|\\.ym\\.fi/' + '|\\.stm\\.fi/' + '|\\.tem\\.fi/' + '|\\.lvm\\.fi/' + '|\\.mmm\\.fi/' + '|\\.okm\\.fi/' + '|\\.vm\\.fi/' + '|\\.defmin\\.fi/' + '|\\.oikeusministerio\\.fi/' + '|\\.um\\.fi/' + '|\\.vero\\.fi/' + 
'|\\.kela\\.fi/'
+
+    '|\\.lrv\\.lt/' -- Lithuania
+    '|\\.uzt\\.lt/'
+    '|\\.migracija\\.lt/'
+    '|\\.kam\\.lt/'
+    '|\\.lrs\\.lt/'
+    '|\\.urm\\.lt/'
+
+    '|\\.riik\\.ee/' -- Estonia
+    '|\\.riigiteataja\\.ee/'
+    '|\\.eesti\\.ee/'
+    '|\\.valitsus\\.ee/'
+
+    '|\\.admin\\.ch/' -- Switzerland
+
+    '|\\.seg-social\\.es/' -- Spain
+    '|\\.ine\\.es/'
+    '|\\.boe\\.es/'
+
+    '|\\.ft\\.dk/' -- Denmark
+    '|\\.nemkonto\\.dk/'
+    '|\\.nemlog-in\\.dk/'
+    '|\\.mitid\\.dk/'
+    '|\\.digst\\.dk/'
+    '|\\.sikkerdigital\\.dk/'
+    '|\\.forsvaret\\.dk/'
+    '|\\.skat\\.dk/'
+    '|\\.stps\\.dk/'
+    '|\\.ufm\\.dk/'
+    '|\\.urm\\.dk/'
+    '|\\.uvm\\.dk/'
+    '|\\.politi\\.dk/'
+    '|\\.dataetiskraad\\.dk/'
+    '|\\.at\\.dk/'
+    '|\\.kum\\.dk/'
+
+    '|\\.govvrn\\.ru/' -- Russia
+
+    '|\\.public\\.lu/' -- Luxembourg
+    '|\\.etat\\.lu/'
+
+    '|\\.governo\\.it/' -- Italy
+
+    '|\\.overheid\\.nl/' -- Netherlands
+    '|\\.mijnoverheid\\.nl/'
+
+    '|\\.govern\\.ad/' -- Andorra
+    '|\\.exteriors\\.ad/'
+    '|\\.consellgeneral\\.ad/'
+
+    '|\\.irangov\\.ir' -- Iran
+    '|\\.irna\\.ir'
+    '|\\.razavi\\.ir'
+    '|\\.gholhak\\.ir'
+
+  ')')
+)
+
+SELECT
+  gov_domain,
+  page,
+  is_root_page,
+  performance_score,
+  accessibility_score,
+  best_practices_score,
+  seo_score,
+  wptid
+FROM
+  domain_scores
+WHERE gov_domain IS NOT NULL
+ORDER BY gov_domain;
diff --git a/sql/2024/accessibility/lighthouse_score_by_tld.sql b/sql/2024/accessibility/lighthouse_score_by_tld.sql
new file mode 100644
index 00000000000..c9d2f4f5338
--- /dev/null
+++ b/sql/2024/accessibility/lighthouse_score_by_tld.sql
@@ -0,0 +1,32 @@
+#standardSQL
+# Group Lighthouse scores by top-level domain (TLD)
+
+WITH tld_score_data AS (
+  SELECT
+    REGEXP_EXTRACT(page, r'://[^/]+\.(\w{2,})/') AS tld, -- Extract the top-level domain (TLD)
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
+    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    lighthouse IS NOT NULL AND
+    lighthouse != '{}' AND
+    is_root_page
+)
+
+SELECT
+  tld, -- Group by top-level domain
+  AVG(performance_score) AS avg_performance_score,
+  AVG(accessibility_score) AS avg_accessibility_score,
+  AVG(best_practices_score) AS avg_best_practices_score,
+  AVG(seo_score) AS avg_seo_score,
+  COUNT(0) AS total_pages
+FROM
+  tld_score_data
+GROUP BY
+  tld
+ORDER BY
+  avg_accessibility_score DESC; -- Sort by accessibility score, or another metric of interest
diff --git a/sql/2024/accessibility/media_query_features.sql b/sql/2024/accessibility/media_query_features.sql
new file mode 100644
index 00000000000..d85b6828d99
--- /dev/null
+++ b/sql/2024/accessibility/media_query_features.sql
@@ -0,0 +1,83 @@
+#standardSQL
+
+CREATE TEMPORARY FUNCTION getMediaQueryFeatures(css STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  function compute(ast) {
+    let ret = {};
+
+    walkRules(ast, rule => {
+      let features = rule.media
+        .replace(/\\s+/g, "")
+        .match(/\\([\\w-]+(?=[:\\)])/g);
+
+      if (features) {
+        features = features.map(s => s.slice(1));
+
+        for (let feature of features) {
+          incrementByKey(ret, feature);
+        }
+      }
+    }, {type: "media"});
+
+    return ret;
+  }
+
+  const ast = JSON.parse(css);
+  let features = compute(ast);
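+  // (illustrative) `features` maps each media-feature name to its usage count,
+  // e.g. {"max-width": 12, "prefers-reduced-motion": 1}; only the feature names are returned below.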
return Object.keys(features); +} catch (e) { + return []; +} +'''; + +WITH media_query_data AS ( + -- Extracting media query features from the CSS data + SELECT + client, + page, + LOWER(feature) AS feature + FROM + `httparchive.all.parsed_css`, + UNNEST(getMediaQueryFeatures(css)) AS feature + WHERE + date = '2024-06-01' AND + feature IS NOT NULL + GROUP BY + client, page, feature +), + +total_pages_data AS ( + -- Calculating the total number of pages per client + SELECT + client, + COUNT(0) AS total_pages + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' + GROUP BY + client +) + +SELECT + m.client, + m.feature, + COUNT(DISTINCT m.page) AS pages, + tp.total_pages, + SAFE_DIVIDE(COUNT(DISTINCT m.page), tp.total_pages) AS pct_pages_with_feature +FROM + media_query_data AS m +JOIN + total_pages_data AS tp +ON + m.client = tp.client +GROUP BY + m.client, m.feature, tp.total_pages +HAVING + pages >= 100 +ORDER BY + pct_pages_with_feature DESC; diff --git a/sql/2024/accessibility/page_title.sql b/sql/2024/accessibility/page_title.sql new file mode 100644 index 00000000000..b6d92776876 --- /dev/null +++ b/sql/2024/accessibility/page_title.sql @@ -0,0 +1,26 @@ +#standardSQL +# Page title stats (usage, descriptive, changed on render) +SELECT + client, + is_root_page, + COUNT(0) AS total_sites, + COUNTIF(total_title_words > 0) AS total_has_title, + COUNTIF(total_title_words > 3) AS total_title_with_four_or_more_words, + COUNTIF(title_changed_on_render) AS total_title_changed, + COUNTIF(total_title_words > 0) / COUNT(0) AS pct_with_title, + COUNTIF(total_title_words > 3) / COUNTIF(total_title_words > 0) AS pct_titles_four_or_more_words, + COUNTIF(title_changed_on_render) / COUNTIF(total_title_words > 0) AS pct_titles_changed_on_render +FROM ( + SELECT + client, + is_root_page, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies'), '$.title.title_changed_on_render') AS BOOL) AS title_changed_on_render, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._wpt_bodies'), '$.title.rendered.primary.words') AS INT64) AS total_title_words + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' +) +GROUP BY + client, + is_root_page; diff --git a/sql/2024/accessibility/pages_with_search_input.sql b/sql/2024/accessibility/pages_with_search_input.sql new file mode 100644 index 00000000000..53936b8a742 --- /dev/null +++ b/sql/2024/accessibility/pages_with_search_input.sql @@ -0,0 +1,51 @@ +#standardSQL +# Pages with search input +CREATE TEMPORARY FUNCTION hasSearchInput(payload STRING) +RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const almanac = JSON.parse(payload); + return almanac.input_elements.nodes.some((node) => { + if (node.type.toLowerCase() === "search") { + return true; + } + + // Detect regular inputs of type text and the first word being "search" + if (node.type.toLowerCase() === "text" && + /^\\s*search(\\s|$)/i.test(node.placeholder || '')) { + return true; + } + + return false; + }); + + } catch (e) { + return false; + } +'''; + +SELECT + client, + is_root_page, + COUNT(0) AS total_sites, + COUNTIF(has_inputs) AS total_with_inputs, + COUNTIF(has_search_input) AS total_with_search_input, + + # Perc of all sites which have a search input + COUNTIF(has_search_input) / COUNT(0) AS perc_sites_with_search_input, + # Of sites that have at least 1 input element, how many have a search input + COUNTIF(has_search_input) / COUNTIF(has_inputs) AS perc_input_sites_with_search_input +FROM + ( + SELECT + client, + is_root_page, + 
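+      # (illustrative) has_inputs is TRUE when the custom '_almanac' metric counted at least one input element on the page, e.g. '$.input_elements.total' = "3"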
SAFE_CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.input_elements.total') AS INT64) > 0 AS has_inputs,
+      hasSearchInput(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS has_search_input
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01'
+  )
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/placeholder_but_no_label.sql b/sql/2024/accessibility/placeholder_but_no_label.sql
new file mode 100644
index 00000000000..a182eeab118
--- /dev/null
+++ b/sql/2024/accessibility/placeholder_but_no_label.sql
@@ -0,0 +1,31 @@
+#standardSQL
+# Form controls with placeholder but no label
+
+SELECT
+  client,
+  is_root_page,
+  COUNT(0) AS total_sites,
+  COUNTIF(total_placeholder > 0) AS sites_with_placeholder,
+  COUNTIF(total_no_label > 0) AS sites_with_no_label, # Has placeholder but no label
+
+  COUNTIF(total_placeholder > 0) / COUNT(0) AS pct_sites_with_placeholder,
+  # Sites with placeholders that don't always use labels alongside them
+  COUNTIF(total_no_label > 0) / COUNTIF(total_placeholder > 0) AS pct_placeholder_sites_with_no_label,
+
+  SUM(total_placeholder) AS total_placeholders,
+  SUM(total_no_label) AS total_placeholder_with_no_label,
+  SUM(total_no_label) / SUM(total_placeholder) AS pct_placeholders_with_no_label
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.placeholder_but_no_label.total_placeholder') AS INT64) AS total_placeholder,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.placeholder_but_no_label.total_no_label') AS INT64) AS total_no_label
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/sites_using_role.sql b/sql/2024/accessibility/sites_using_role.sql
new file mode 100644
index 00000000000..5b29ca7da16
--- /dev/null
+++ b/sql/2024/accessibility/sites_using_role.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Sites using the role attribute
+
+SELECT
+  client,
+  is_root_page,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_sites,
+  SUM(COUNTIF(total_role_attributes > 0)) OVER (PARTITION BY client) AS total_using_role,
+  SUM(COUNTIF(total_role_attributes > 0)) OVER (PARTITION BY client) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_using_role,
+  percentile,
+  APPROX_QUANTILES(total_role_attributes, 1000)[OFFSET(percentile * 10)] AS total_role_usages
+FROM (
+    SELECT
+      client,
+      is_root_page,
+      CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.nodes_using_role.total') AS INT64) AS total_role_attributes
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01'
+  ),
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  percentile,
+  client,
+  is_root_page
+ORDER BY
+  percentile,
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/skip_links.sql b/sql/2024/accessibility/skip_links.sql
new file mode 100644
index 00000000000..2873333a3f9
--- /dev/null
+++ b/sql/2024/accessibility/skip_links.sql
@@ -0,0 +1,26 @@
+#standardSQL
+# % of pages having skip links
+
+CREATE TEMPORARY FUNCTION getEarlyHash(payload STRING)
+RETURNS INT64 LANGUAGE js AS '''
+try {
+  const almanac = JSON.parse(payload);
+  return almanac['seo-anchor-elements'].earlyHash;
+} catch (e) {
+  return 0;
+}
+''';
+
+SELECT
+  client,
+  is_root_page,
+  COUNTIF(getEarlyHash(JSON_EXTRACT_SCALAR(payload, '$._almanac')) > 0) AS pages,
+  COUNT(0) AS total,
+  COUNTIF(getEarlyHash(JSON_EXTRACT_SCALAR(payload, '$._almanac')) > 0) / COUNT(0) AS pct
+FROM
+  `httparchive.all.pages`
+WHERE
+  date = '2024-06-01'
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/sr_only_classes.sql b/sql/2024/accessibility/sr_only_classes.sql
new file mode 100644
index 00000000000..f42aabdaf96
--- /dev/null
+++ b/sql/2024/accessibility/sr_only_classes.sql
@@ -0,0 +1,22 @@
+#standardSQL
+# Sites using sr-only or visually-hidden classes
+
+SELECT
+  client,
+  is_root_page,
+  COUNT(0) AS total_sites,
+  COUNTIF(uses_sr_only) AS sites_with_sr_only,
+  COUNTIF(uses_sr_only) / COUNT(0) AS pct_sites_with_sr_only
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.screen_reader_classes') AS BOOL) AS uses_sr_only
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/tabindex_usage_and_values.sql b/sql/2024/accessibility/tabindex_usage_and_values.sql
new file mode 100644
index 00000000000..354afadcfb9
--- /dev/null
+++ b/sql/2024/accessibility/tabindex_usage_and_values.sql
@@ -0,0 +1,55 @@
+#standardSQL
+# Positive tabindex value occurrences
+
+CREATE TEMPORARY FUNCTION getTotalPositiveTabIndexes(payload STRING)
+RETURNS STRUCT<total INT64, total_positive INT64, total_negative INT64, total_zero INT64> LANGUAGE js AS '''
+try {
+  const almanac = JSON.parse(payload);
+
+  let total = 0;
+  let total_positive = 0;
+  let total_negative = 0;
+  let total_zero = 0;
+  for (const node of almanac['09.27'].nodes) {
+    total++;
+    const int = parseInt(node.tabindex, 10);
+    if (int > 0) {
+      total_positive++;
+    } else if (int < 0) {
+      total_negative++;
+    } else if (int === 0) {
+      total_zero++;
+    }
+  }
+
+  return {total, total_positive, total_negative, total_zero};
+} catch (e) {
+  return {total: 0, total_positive: 0, total_negative: 0, total_zero: 0};
+}
+''';
+
+SELECT
+  client,
+  is_root_page,
+  COUNT(0) AS total_sites,
+  COUNTIF(tab_index_stats.total > 0) AS total_with_tab_indexes,
+  COUNTIF(tab_index_stats.total_positive > 0) AS total_with_positive_tab_indexes,
+  COUNTIF(tab_index_stats.total_negative > 0) AS total_with_negative_tab_indexes,
+  COUNTIF(tab_index_stats.total_zero > 0) AS total_with_zero_tab_indexes,
+  COUNTIF(tab_index_stats.total_negative > 0 OR tab_index_stats.total_zero > 0) AS total_with_negative_or_zero,
+  COUNTIF(tab_index_stats.total > 0) / COUNT(0) AS pct_with_tab_indexes,
+  COUNTIF(tab_index_stats.total_positive > 0) / COUNT(0) AS pct_with_positive_tab_indexes,
+  COUNTIF(tab_index_stats.total_positive > 0) / COUNTIF(tab_index_stats.total > 0) AS pct_positive_in_sites_with_tab_indexes
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    getTotalPositiveTabIndexes(JSON_EXTRACT_SCALAR(payload, '$._almanac')) AS tab_index_stats
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/table_stats.sql b/sql/2024/accessibility/table_stats.sql
new file mode 100644
index 00000000000..d9ed9b74da1
--- /dev/null
+++ b/sql/2024/accessibility/table_stats.sql
@@ -0,0 +1,37 @@
+#standardSQL
+# Table stats. Total all, captioned and presentational
+
+SELECT
+  client,
+  is_root_page,
+  COUNT(0) AS total_sites,
+
+  COUNTIF(total_tables > 0) AS sites_with_table,
+  COUNTIF(total_captioned > 0) AS sites_with_captions,
+  COUNTIF(total_presentational > 0) AS sites_with_presentational,
+
+  COUNTIF(total_tables > 0) / COUNT(0) AS pct_sites_with_table,
+  COUNTIF(total_captioned > 0) / COUNTIF(total_tables > 0) AS pct_table_sites_with_captioned,
+  COUNTIF(total_presentational > 0) / COUNTIF(total_tables > 0) AS pct_table_sites_with_presentational,
+
+  SUM(total_tables) AS total_tables,
+  SUM(total_captioned) AS total_captioned,
+  SUM(total_presentational) AS total_presentational,
+
+  SUM(total_captioned) / SUM(total_tables) AS pct_captioned,
+  SUM(total_presentational) / SUM(total_tables) AS pct_presentational
+FROM (
+  SELECT
+    client,
+    is_root_page,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.tables.total') AS INT64) AS total_tables,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.tables.total_with_caption') AS INT64) AS total_captioned,
+    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.tables.total_with_presentational') AS INT64) AS total_presentational
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+GROUP BY
+  client,
+  is_root_page;
diff --git a/sql/2024/accessibility/units_properties.sql b/sql/2024/accessibility/units_properties.sql
new file mode 100644
index 00000000000..7d2ee57f69c
--- /dev/null
+++ b/sql/2024/accessibility/units_properties.sql
@@ -0,0 +1,138 @@
+#standardSQL
+
+CREATE TEMPORARY FUNCTION getPropertyUnits(css STRING)
+RETURNS ARRAY<STRUCT<property STRING, unit STRING, freq INT64>>
+LANGUAGE js
+OPTIONS (library = "gs://httparchive/lib/css-utils.js")
+AS '''
+try {
+  function compute(ast) {
+    let ret = {
+      zeroes: {},
+      by_property: {}
+    };
+
+    const lengths = /(?<number>-?\\d*\\.?\\d+)(?<unit>%|[a-z]{1,4}\\b|(?=\\s|$|,|\\*|\\/)\\b)/gi;
+
+    walkDeclarations(ast, ({property, value}) => {
+      value = removeFunctionCalls(value, {names: ["rgb", "rgba", "hsl", "hsla"]});
+
+      for (let length of value.matchAll(lengths)) {
+        let {number, unit} = length.groups;
+        ret.by_property[property] = ret.by_property[property] || {};
+
+        if (number === "0") {
+          incrementByKey(ret.zeroes, unit || "");
+        }
+
+        if (unit) {
+          incrementByKey(ret, unit);
+          incrementByKey(ret.by_property[property], unit);
+        } else {
+          incrementByKey(ret, "");
+          incrementByKey(ret.by_property[property], "");
+        }
+
+        incrementByKey(ret, "total");
+        incrementByKey(ret.by_property[property], "total");
+      }
+    }, {
+      properties: [
+        "baseline-shift",
+        "box-shadow",
+        "vertical-align",
+        "clip-path",
+        /^column[s-]|^inset\b/g,
+        "contain-intrinsic-size",
+        "cx",
+        "cy",
+        "flex-basis",
+        "letter-spacing",
+        "perspective",
+        "perspective-origin",
+        "r",
+        "row-gap",
+        "rx",
+        "ry",
+        "tab-size",
+        "text-indent",
+        "text-shadow",
+        "translate",
+        "vertical-align",
+        "word-spacing",
+        "x",
+        "y",
+        /\\b(?:width|height|thickness|offset|origin|padding|border|margin|outline|top|right|bottom|left|(inline|block)-(start|end)|gap|size|position)\\b/g
+      ],
+      not: {
+        properties: /^-|-color$/
+      }
+    });
+
+    ret = sortObject(ret);
+
+    for (let property in ret.by_property) {
+      ret.by_property[property] = sortObject(ret.by_property[property]);
+    }
+
+    return ret;
+  }
+  var ast = JSON.parse(css);
+  var units = compute(ast);
+  return Object.entries(units.by_property).flatMap(([property, units]) => {
+    return Object.entries(units).filter(([unit]) => {
+      return unit != 'total';
+    }).map(([unit, freq]) => {
+      return {property,
unit, freq}; + }); + }); +} catch (e) { + return []; +} +'''; + +WITH property_units_data AS ( + -- Extracting property units data from CSS + SELECT + client, + unit.property, + unit.unit, + unit.freq + FROM + `httparchive.all.parsed_css`, + UNNEST(getPropertyUnits(css)) AS unit + WHERE + date = '2024-06-01' AND + LENGTH(css) < 0.1 * 1024 * 1024 -- Limit the size of the CSS to avoid OOM crashes +), + +aggregated_data AS ( + -- Aggregating frequency data per client and property + SELECT + client, + property, + unit, + SUM(freq) AS freq, + SUM(SUM(freq)) OVER (PARTITION BY client, property) AS total, + SAFE_DIVIDE(SUM(freq), SUM(SUM(freq)) OVER (PARTITION BY client, property)) AS pct + FROM + property_units_data + GROUP BY + client, property, unit +) + +SELECT + client, + property, + unit, + freq, + total, + pct +FROM + aggregated_data +WHERE + total >= 1000 AND + pct >= 0.01 +ORDER BY + total DESC, + pct DESC; diff --git a/sql/2024/accessibility/valid_html_lang.sql b/sql/2024/accessibility/valid_html_lang.sql new file mode 100644 index 00000000000..6f478cfb8a3 --- /dev/null +++ b/sql/2024/accessibility/valid_html_lang.sql @@ -0,0 +1,28 @@ +#standardSQL +# % of pages with a valid html lang attribute + +SELECT + client, + is_root_page, + COUNT(0) AS total, + COUNTIF(valid_lang) AS valid_lang, + COUNTIF(has_lang) AS has_lang, + COUNTIF(has_lang) / COUNT(0) AS pct_has_of_total, + COUNTIF(valid_lang) / COUNT(0) AS pct_valid_of_total +FROM ( + SELECT + client, + is_root_page, + JSON_EXTRACT_SCALAR(lighthouse, "$.audits['html-has-lang'].score") = '1' AS has_lang, + JSON_EXTRACT_SCALAR(lighthouse, "$.audits['html-lang-valid'].score") = '1' AS valid_lang + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' + ) +GROUP BY + client, + is_root_page +ORDER BY + client, + is_root_page; diff --git a/sql/2024/accessibility/video_track_usage.sql b/sql/2024/accessibility/video_track_usage.sql new file mode 100644 index 00000000000..d9e0621cebb --- /dev/null +++ b/sql/2024/accessibility/video_track_usage.sql @@ -0,0 +1,27 @@ +#standardSQL +# Video elements track usage + +SELECT + client, + is_root_page, + COUNT(0) AS total_sites, + COUNTIF(total_videos > 0) AS total_with_video, + COUNTIF(total_with_track > 0) AS total_with_tracks, + + SUM(total_with_track) / SUM(total_videos) AS pct_videos_with_tracks, + COUNTIF(total_videos > 0) / COUNT(0) AS pct_sites_with_videos, + COUNTIF(total_with_track > 0) / COUNTIF(total_videos > 0) AS pct_video_sites_with_tracks +FROM ( + SELECT + client, + is_root_page, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.videos.total') AS INT64) AS total_videos, + CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.videos.total_with_track') AS INT64) AS total_with_track + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' +) +GROUP BY + client, + is_root_page; diff --git a/sql/2024/accessibility/viewport_zoom_scale.sql b/sql/2024/accessibility/viewport_zoom_scale.sql new file mode 100644 index 00000000000..df3398ff31a --- /dev/null +++ b/sql/2024/accessibility/viewport_zoom_scale.sql @@ -0,0 +1,37 @@ +#standardSQL +# Disabled zooming and scaling via the viewport tag +# Copy of sql/2022/mobile-web/viewport_zoom_scale.sql + +SELECT + client, + is_root_page, + COUNT(0) AS total_pages, + COUNTIF(has_meta_viewport) AS total_viewports, + COUNTIF(not_scalable) AS total_no_scale, + COUNTIF(max_scale_1_or_less) AS total_locked_max_scale, + COUNTIF(not_scalable OR max_scale_1_or_less) AS total_either, + + COUNTIF(not_scalable) / 
COUNT(0) AS pct_pages_no_scale, + COUNTIF(max_scale_1_or_less) / COUNT(0) AS pct_pages_locked_max_scale, + COUNTIF(not_scalable OR max_scale_1_or_less) / COUNT(0) AS pct_pages_either +FROM ( + SELECT + client, + is_root_page, + meta_viewport IS NOT NULL AS has_meta_viewport, + REGEXP_EXTRACT(meta_viewport, r'(?i)user-scalable\s*=\s*(no|0)') IS NOT NULL AS not_scalable, + SAFE_CAST(REGEXP_EXTRACT(meta_viewport, r'(?i)maximum-scale\s*=\s*([0-9]*\.[0-9]+|[0-9]+)') AS FLOAT64) <= 1 AS max_scale_1_or_less + FROM ( + SELECT + client, + is_root_page, + JSON_EXTRACT_SCALAR(payload, '$._meta_viewport') AS meta_viewport + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' + ) +) +GROUP BY + client, + is_root_page; diff --git a/sql/2024/accessibility/viewport_zoom_scale_by_domain_rank.sql b/sql/2024/accessibility/viewport_zoom_scale_by_domain_rank.sql new file mode 100644 index 00000000000..70eb10c096a --- /dev/null +++ b/sql/2024/accessibility/viewport_zoom_scale_by_domain_rank.sql @@ -0,0 +1,69 @@ +#standardSQL +# Analyze the usage of viewport tags and scaling behavior by domain rank + +WITH RankedPages AS ( + SELECT + client, + is_root_page, + JSON_EXTRACT_SCALAR(payload, '$._meta_viewport') AS meta_viewport, + rank + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' +), +RankGroups AS ( + SELECT + client, + is_root_page, + rank, + CASE + WHEN rank <= 1000 THEN 1000 + WHEN rank <= 10000 THEN 10000 + WHEN rank <= 100000 THEN 100000 + WHEN rank <= 1000000 THEN 1000000 + WHEN rank <= 10000000 THEN 10000000 + ELSE 100000000 + END AS rank_grouping + FROM + RankedPages +), +AggregatedData AS ( + SELECT + client, + is_root_page, + rank_grouping, + COUNT(0) AS total_pages, + COUNTIF(meta_viewport IS NOT NULL) AS total_viewports, + COUNTIF(REGEXP_EXTRACT(meta_viewport, r'(?i)user-scalable\s*=\s*(no|0)') IS NOT NULL) AS total_no_scale, + COUNTIF(SAFE_CAST(REGEXP_EXTRACT(meta_viewport, r'(?i)maximum-scale\s*=\s*([0-9]*\.[0-9]+|[0-9]+)') AS FLOAT64) <= 1) AS total_locked_max_scale, + COUNTIF(REGEXP_EXTRACT(meta_viewport, r'(?i)user-scalable\s*=\s*(no|0)') IS NOT NULL OR + SAFE_CAST(REGEXP_EXTRACT(meta_viewport, r'(?i)maximum-scale\s*=\s*([0-9]*\.[0-9]+|[0-9]+)') AS FLOAT64) <= 1) AS total_either + FROM + RankGroups + LEFT JOIN + RankedPages + USING (client, is_root_page, rank) + GROUP BY + client, + is_root_page, + rank_grouping +) +SELECT + client, + is_root_page, + rank_grouping, + total_pages, + total_viewports, + total_no_scale, + total_locked_max_scale, + total_either, + SAFE_DIVIDE(total_no_scale, total_pages) AS pct_pages_no_scale, + SAFE_DIVIDE(total_locked_max_scale, total_pages) AS pct_pages_locked_max_scale, + SAFE_DIVIDE(total_either, total_pages) AS pct_pages_either +FROM + AggregatedData +ORDER BY + client, + is_root_page, + rank_grouping; From f610a485f4b6996783bc96d55fa6f690d957e8ae Mon Sep 17 00:00:00 2001 From: Jannis Rautenstrauch <33023300+JannisBush@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:52:35 +0100 Subject: [PATCH 08/15] Security 2024 Queries (#3671) * Add 2022 queries * Add updated bot detection query (issue HSTS technology) * Query update * Update Cryptominer + Crypto query * Port all cookie queries * CSP + CORP porting * More porting * Many updated queries * Update many queries * Update all remaining queries * Remove 2024 prefix * Add header value distributions (some headers) * Add Timing-Allow-Origin Header Usage No real security header as it can only worsen security (MDN classifies it as CORS header) thus not included in all queries, e.g., not 
included in feature_adoption_by_country.sql) * Add OAC queries * Add document.domain query * HTML Sanitization query * Security Features by category * Add server-timing query * Update security.txt query * Improve security.text query * Only consider "real" security.txt files for query * Add some insights into security.txt data (FPs vs FNs) * Fix typo * SQLfluff fix * Fix typo * Add a limit * Improve Note document.domain * Update cryptominer time period * Add missing lowercasing * Update note * Remove trailing whitespace * Add two more queries * Fix typo * Add WSS CSP query * Iframes attributes: change to custom_metrics + add more dates * Update server-timing parsing * FIx st query + add sampling data * Add Server-Timing overview query * Lint * Add query to measure use of disallowed CSP directives in * Fix linting issue --------- Co-authored-by: Gertjan Franken --- sql/2024/security/README.md | 4 +- sql/2024/security/bot_detection.sql | 38 +++ .../clear-site-data_value_prevalence.sql | 30 ++ sql/2024/security/coep_header_prevalence.sql | 30 ++ sql/2024/security/cookie_age_negative.sql | 130 ++++++++ sql/2024/security/cookie_age_percentiles.sql | 118 +++++++ sql/2024/security/cookie_attributes.sql | 45 +++ .../cookie_max_age_expires_top_values.sql | 128 ++++++++ sql/2024/security/coop_header_prevalence.sql | 30 ++ sql/2024/security/corp_header_prevalence.sql | 29 ++ sql/2024/security/cryptominer_share.sql | 23 ++ sql/2024/security/cryptominer_usage.sql | 26 ++ .../security/csp_allowed_host_frequency.sql | 50 +++ .../csp_allowed_host_frequency_wss.sql | 50 +++ sql/2024/security/csp_directives_usage.sql | 30 ++ sql/2024/security/csp_most_common_header.sql | 31 ++ .../security/csp_number_of_allowed_hosts.sql | 36 +++ .../csp_script_source_list_keywords.sql | 49 +++ sql/2024/security/documentdomain_usage.sql | 19 ++ .../security/feature_adoption_by_category.sql | 53 ++++ .../security/feature_adoption_by_country.sql | 57 ++++ .../feature_adoption_by_other_features.sql | 53 ++++ .../feature_adoption_by_technology.sql | 83 +++++ .../feature_adoption_by_topN_technologies.sql | 91 ++++++ sql/2024/security/fp_header_prevalence.sql | 30 ++ sql/2024/security/home_page_https_usage.sql | 18 ++ sql/2024/security/hsts_attributes.sql | 25 ++ .../security/hsts_max_age_percentiles.sql | 26 ++ sql/2024/security/html_sanitization_usage.sql | 19 ++ sql/2024/security/https_request_over_time.sql | 20 ++ sql/2024/security/https_server_redirects.sql | 23 ++ .../iframe_allow_directive_values.sql | 63 ++++ sql/2024/security/iframe_allow_directives.sql | 51 +++ .../iframe_attribute_popular_hosts.sql | 59 ++++ sql/2024/security/iframe_attributes_usage.sql | 45 +++ .../security/iframe_sandbox_directives.sql | 49 +++ .../meta_csp_disallowed_directives.sql | 28 ++ .../meta_policies_allowed_vs_disallowed.sql | 30 ++ .../mimetype_file_extension_mismatch.sql | 51 +++ sql/2024/security/mixed_content.sql | 24 ++ sql/2024/security/oac_header_prevalence.sql | 30 ++ sql/2024/security/pp_header_prevalence.sql | 30 ++ .../robot_header_and_meta_tag_prevalence.sql | 78 +++++ .../security/robot_txt_sensitive_disallow.sql | 53 ++++ .../security_adoption_by_category.sql | 34 ++ .../security/security_adoption_by_rank.sql | 32 ++ .../security/security_headers_prevalence.sql | 35 +++ .../server_header_value_prevalence.sql | 63 ++++ .../server_information_header_prevalence.sql | 32 ++ .../security/server_timing_usage_values.sql | 295 ++++++++++++++++++ sql/2024/security/sri_coverage_per_page.sql | 31 ++ sql/2024/security/sri_hash_functions.sql 
| 45 +++ sql/2024/security/sri_popular_hosts.sql | 51 +++ sql/2024/security/sri_usage.sql | 34 ++ sql/2024/security/tao_header_prevalence.sql | 29 ++ sql/2024/security/tls_ca_issuers_pages.sql | 31 ++ sql/2024/security/tls_ca_issuers_requests.sql | 28 ++ sql/2024/security/tls_cipher_suite.sql | 27 ++ sql/2024/security/tls_forward_secrecy.sql | 27 ++ sql/2024/security/tls_versions_pages.sql | 28 ++ sql/2024/security/tls_versions_requests.sql | 30 ++ .../version-evolution-top-technologies.sql | 71 +++++ sql/2024/security/web_cryptography_api.sql | 18 ++ .../security/well-known_change-password.sql | 30 ++ .../well-known_resource-not-be-200.sql | 27 ++ sql/2024/security/well-known_security.sql | 272 ++++++++++++++++ sql/2024/security/xfo_header_prevalence.sql | 30 ++ 67 files changed, 3283 insertions(+), 2 deletions(-) create mode 100644 sql/2024/security/bot_detection.sql create mode 100644 sql/2024/security/clear-site-data_value_prevalence.sql create mode 100644 sql/2024/security/coep_header_prevalence.sql create mode 100644 sql/2024/security/cookie_age_negative.sql create mode 100644 sql/2024/security/cookie_age_percentiles.sql create mode 100644 sql/2024/security/cookie_attributes.sql create mode 100644 sql/2024/security/cookie_max_age_expires_top_values.sql create mode 100644 sql/2024/security/coop_header_prevalence.sql create mode 100644 sql/2024/security/corp_header_prevalence.sql create mode 100644 sql/2024/security/cryptominer_share.sql create mode 100644 sql/2024/security/cryptominer_usage.sql create mode 100644 sql/2024/security/csp_allowed_host_frequency.sql create mode 100644 sql/2024/security/csp_allowed_host_frequency_wss.sql create mode 100644 sql/2024/security/csp_directives_usage.sql create mode 100644 sql/2024/security/csp_most_common_header.sql create mode 100644 sql/2024/security/csp_number_of_allowed_hosts.sql create mode 100644 sql/2024/security/csp_script_source_list_keywords.sql create mode 100644 sql/2024/security/documentdomain_usage.sql create mode 100644 sql/2024/security/feature_adoption_by_category.sql create mode 100644 sql/2024/security/feature_adoption_by_country.sql create mode 100644 sql/2024/security/feature_adoption_by_other_features.sql create mode 100644 sql/2024/security/feature_adoption_by_technology.sql create mode 100644 sql/2024/security/feature_adoption_by_topN_technologies.sql create mode 100644 sql/2024/security/fp_header_prevalence.sql create mode 100644 sql/2024/security/home_page_https_usage.sql create mode 100644 sql/2024/security/hsts_attributes.sql create mode 100644 sql/2024/security/hsts_max_age_percentiles.sql create mode 100644 sql/2024/security/html_sanitization_usage.sql create mode 100644 sql/2024/security/https_request_over_time.sql create mode 100644 sql/2024/security/https_server_redirects.sql create mode 100644 sql/2024/security/iframe_allow_directive_values.sql create mode 100644 sql/2024/security/iframe_allow_directives.sql create mode 100644 sql/2024/security/iframe_attribute_popular_hosts.sql create mode 100644 sql/2024/security/iframe_attributes_usage.sql create mode 100644 sql/2024/security/iframe_sandbox_directives.sql create mode 100644 sql/2024/security/meta_csp_disallowed_directives.sql create mode 100644 sql/2024/security/meta_policies_allowed_vs_disallowed.sql create mode 100644 sql/2024/security/mimetype_file_extension_mismatch.sql create mode 100644 sql/2024/security/mixed_content.sql create mode 100644 sql/2024/security/oac_header_prevalence.sql create mode 100644 sql/2024/security/pp_header_prevalence.sql create 
mode 100644 sql/2024/security/robot_header_and_meta_tag_prevalence.sql create mode 100644 sql/2024/security/robot_txt_sensitive_disallow.sql create mode 100644 sql/2024/security/security_adoption_by_category.sql create mode 100644 sql/2024/security/security_adoption_by_rank.sql create mode 100644 sql/2024/security/security_headers_prevalence.sql create mode 100644 sql/2024/security/server_header_value_prevalence.sql create mode 100644 sql/2024/security/server_information_header_prevalence.sql create mode 100644 sql/2024/security/server_timing_usage_values.sql create mode 100644 sql/2024/security/sri_coverage_per_page.sql create mode 100644 sql/2024/security/sri_hash_functions.sql create mode 100644 sql/2024/security/sri_popular_hosts.sql create mode 100644 sql/2024/security/sri_usage.sql create mode 100644 sql/2024/security/tao_header_prevalence.sql create mode 100644 sql/2024/security/tls_ca_issuers_pages.sql create mode 100644 sql/2024/security/tls_ca_issuers_requests.sql create mode 100644 sql/2024/security/tls_cipher_suite.sql create mode 100644 sql/2024/security/tls_forward_secrecy.sql create mode 100644 sql/2024/security/tls_versions_pages.sql create mode 100644 sql/2024/security/tls_versions_requests.sql create mode 100644 sql/2024/security/version-evolution-top-technologies.sql create mode 100644 sql/2024/security/web_cryptography_api.sql create mode 100644 sql/2024/security/well-known_change-password.sql create mode 100644 sql/2024/security/well-known_resource-not-be-200.sql create mode 100644 sql/2024/security/well-known_security.sql create mode 100644 sql/2024/security/xfo_header_prevalence.sql diff --git a/sql/2024/security/README.md b/sql/2024/security/README.md index 1cdda292864..3967054b8e1 100644 --- a/sql/2024/security/README.md +++ b/sql/2024/security/README.md @@ -14,9 +14,9 @@ - [📄 Planning doc][~google-doc] - [📊 Results sheet][~google-sheets] - [📝 Markdown file][~chapter-markdown] -- [:book: 2021 chapter][~2021-chapter] +- [:book: 2022 chapter][~2022-chapter] [~google-doc]: https://docs.google.com/document/d/1jBGxgkBDIi9nDQ-n2eVFkwDZXk_9rQkLiKfnHkknsAs/edit [~google-sheets]: https://docs.google.com/spreadsheets/d/1b9IEGbfQjKCEaTBmcv_zyCyWEsq35StCa-dVOe6V1Cs/edit#gid=1778117656 [~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/security.md -[~2021-chapter]: https://almanac.httparchive.org/en/2021/security +[~2022-chapter]: https://almanac.httparchive.org/en/2022/security diff --git a/sql/2024/security/bot_detection.sql b/sql/2024/security/bot_detection.sql new file mode 100644 index 00000000000..bebef01cac1 --- /dev/null +++ b/sql/2024/security/bot_detection.sql @@ -0,0 +1,38 @@ +#standardSQL +# Section: Attack Preventions - Bot protection services +# Question: Which bot protection services are used most often on mobile and desktop sites? +# Notes: The Wappalyzer 'Security' category mostly contains bot protection services such as reCAPTCHA and Cloudflare Bot Management +# Issue: Due to some updates to wappalyzer the 'Security' category now also contains 'HSTS' (security header) and 'Really Simple SSL & Security' in significant numbers. Do we want to filter them out? 
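+# Example (illustrative): one way to exclude the mis-categorised entries mentioned above would be to add
+#   AND t.technology NOT IN ('HSTS', 'Really Simple SSL & Security')
+# to the WHERE clause below.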
+SELECT + client, + t.technology, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category +JOIN ( + SELECT + client, + COUNT(0) AS total + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client) +USING + (client) +WHERE + date = '2024-06-01' AND + category = 'Security' AND + is_root_page +GROUP BY + client, + total, + t.technology +ORDER BY + pct DESC diff --git a/sql/2024/security/clear-site-data_value_prevalence.sql b/sql/2024/security/clear-site-data_value_prevalence.sql new file mode 100644 index 00000000000..6aca991d8e7 --- /dev/null +++ b/sql/2024/security/clear-site-data_value_prevalence.sql @@ -0,0 +1,30 @@ +#standardSQL +# Section: Attack preventions - Preventing attacks using Clear-Site-Data +# Question: Which Clear-Site-Data header values are most prevalent? +# Notes: Many used values are still invalid (without quotes). We only count each host-value pair once. +SELECT + client, + csd_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_csd_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS csd_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + # AND is_main_document # (Uncomment to only run on the main document response; majority of CSD headers are set on them) + LOWER(response_headers.name) = 'clear-site-data') +GROUP BY + client, + csd_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/coep_header_prevalence.sql b/sql/2024/security/coep_header_prevalence.sql new file mode 100644 index 00000000000..bf07e619a70 --- /dev/null +++ b/sql/2024/security/coep_header_prevalence.sql @@ -0,0 +1,30 @@ +#standardSQL +# Section: Attack Preventions - Preventing attacks using Cross-Origin policies +# Question: Which are the most common COEP values? +# Note: Considers headers of main document responses +SELECT + client, + coep_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_coep_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS coep_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'cross-origin-embedder-policy') +GROUP BY + client, + coep_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/cookie_age_negative.sql b/sql/2024/security/cookie_age_negative.sql new file mode 100644 index 00000000000..a8cf1782b31 --- /dev/null +++ b/sql/2024/security/cookie_age_negative.sql @@ -0,0 +1,130 @@ +#standardSQL +# Section: Cookies - Cookie Age +# Question: How many cookies (total, hosts, pages) have negative Max-Age, Expires and real age (Max-Age has precedence over Expires) attributes? +# Note: Query is expensive and slow (14TB). Query is inefficient (We create a result array of length 1 for each cookie-attribute for each cookie and then unnest it again; We could instead not use arrays and skip the unnesting). 
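+# Example (illustrative): for `Set-Cookie: id=abc; Max-Age=-1; Expires=Thu, 01 Jan 2026 00:00:00 GMT`,
+# maxAge is -1 and the real age is -1 as well, because Max-Age takes precedence over Expires.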
+# Note: Some of the percentages are quite different to the old query; one of the two might be broken (difficult to compare as both cannot operate on a shared dataset)
+CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
+RETURNS STRING DETERMINISTIC
+LANGUAGE js AS '''
+  const regexMaxAge = new RegExp(/max-age\\s*=\\s*(?<maxAge>-*[0-9]+)/i);
+  const regexExpires = new RegExp(/expires\\s*=\\s*(?<expires>.*?)(;|$)/i);
+  const cookieValues = [cookie_value];
+  const result = {
+    "maxAge": [],
+    "expires": [],
+    "realAge": []
+  };
+  cookieValues.forEach(cookie => {
+    let maxAge = null;
+    let expires = null;
+    if (regexMaxAge.exec(cookie)) {
+      maxAge = Number(regexMaxAge.exec(cookie)[1]);
+      result["maxAge"].push(maxAge);
+    }
+    if (regexExpires.exec(cookie)) {
+      expires = Math.round(Number(new Date(regexExpires.exec(cookie)[1])) / 1000) - epochOfRequest;
+      result["expires"].push(Number.isSafeInteger(expires) ? expires : null);
+    }
+    if (maxAge) {
+      result["realAge"].push(maxAge);
+    } else if (expires) {
+      result["realAge"].push(expires);
+    }
+  });
+  return JSON.stringify(result);
+''';
+
+WITH age_values AS (
+  SELECT
+    client,
+    page,
+    NET.HOST(url) AS host,
+    getCookieAgeValues(response_headers.value, CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)) AS values
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(response_headers) AS response_headers
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    LOWER(response_headers.name) = 'set-cookie'
+),
+
+max_age_values AS (
+  SELECT
+    client,
+    COUNTIF(SAFE_CAST(max_age_value AS NUMERIC) <= 0) AS count_negative_max_age,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_max_age_cookies,
+    COUNT(DISTINCT IF(SAFE_CAST(max_age_value AS NUMERIC) <= 0, page, NULL)) AS num_max_age_pages,
+    COUNT(DISTINCT page) AS total_max_age_pages,
+    COUNT(DISTINCT IF(SAFE_CAST(max_age_value AS NUMERIC) <= 0, host, NULL)) AS num_max_age_hosts,
+    COUNT(DISTINCT host) AS total_max_age_hosts
+  FROM age_values,
+    UNNEST(JSON_QUERY_ARRAY(values, '$.maxAge')) AS max_age_value
+  GROUP BY
+    client
+  ORDER BY
+    client
+),
+
+expires_values AS (
+  SELECT
+    client,
+    COUNTIF(SAFE_CAST(expires_value AS NUMERIC) <= 0) AS count_negative_expires,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_expires_cookies,
+    COUNT(DISTINCT IF(SAFE_CAST(expires_value AS NUMERIC) <= 0, page, NULL)) AS num_expires_pages,
+    COUNT(DISTINCT page) AS total_expires_pages,
+    COUNT(DISTINCT IF(SAFE_CAST(expires_value AS NUMERIC) <= 0, host, NULL)) AS num_expires_hosts,
+    COUNT(DISTINCT host) AS total_expires_hosts
+  FROM age_values,
+    UNNEST(JSON_QUERY_ARRAY(values, '$.expires')) AS expires_value
+  GROUP BY
+    client
+  ORDER BY
+    client
+),
+
+real_age_values AS (
+  SELECT
+    client,
+    COUNTIF(SAFE_CAST(real_age_value AS NUMERIC) <= 0) AS count_negative_real_age,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_real_age_cookies,
+    COUNT(DISTINCT IF(SAFE_CAST(real_age_value AS NUMERIC) <= 0, page, NULL)) AS num_real_age_pages,
+    COUNT(DISTINCT page) AS total_real_age_pages,
+    COUNT(DISTINCT IF(SAFE_CAST(real_age_value AS NUMERIC) <= 0, host, NULL)) AS num_real_age_hosts,
+    COUNT(DISTINCT host) AS total_real_age_hosts
+  FROM age_values,
+    UNNEST(JSON_QUERY_ARRAY(values, '$.realAge')) AS real_age_value
+  GROUP BY
+    client
+  ORDER BY
+    client
+)
+
+SELECT
+  client,
+  count_negative_max_age,
+  count_negative_max_age / total_max_age_cookies AS pct_negative_max_age,
+  num_max_age_pages,
+  num_max_age_pages / total_max_age_pages AS pct_max_age_pages,
+  num_max_age_hosts,
+  num_max_age_hosts / total_max_age_hosts AS pct_max_age_hosts,
+  count_negative_expires,
+  count_negative_expires / total_expires_cookies AS pct_negative_expires,
+  num_expires_pages,
+  num_expires_pages / total_expires_pages AS pct_expires_pages,
+  num_expires_hosts,
+  num_expires_hosts / total_expires_hosts AS pct_expires_hosts,
+  count_negative_real_age,
+  count_negative_real_age / total_real_age_cookies AS pct_negative_real_age,
+  num_real_age_pages,
+  num_real_age_pages / total_real_age_pages AS pct_real_age_pages,
+  num_real_age_hosts,
+  num_real_age_hosts / total_real_age_hosts AS pct_real_age_hosts
+FROM
+  max_age_values
+JOIN expires_values
+USING (client)
+JOIN real_age_values
+USING (client)
+ORDER BY
+  client
diff --git a/sql/2024/security/cookie_age_percentiles.sql b/sql/2024/security/cookie_age_percentiles.sql
new file mode 100644
index 00000000000..c544075b1b9
--- /dev/null
+++ b/sql/2024/security/cookie_age_percentiles.sql
@@ -0,0 +1,118 @@
+#standardSQL
+# Section: Cookies - Cookie Age
+# Question: How long are cookies valid? (Max-Age, Expires, Real Age)
+# Note: Only incorporates values that are larger than 0; cookies set over all requests on the root_page
+# Note: Could be combined with the other cookie queries to run the expensive response_header unnesting only once?
+CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
+RETURNS STRING DETERMINISTIC
+LANGUAGE js AS '''
+  const regexMaxAge = new RegExp(/max-age\\s*=\\s*(?<maxAge>-*[0-9]+)/i);
+  const regexExpires = new RegExp(/expires\\s*=\\s*(?<expires>.*?)(;|$)/i);
+  const cookieValues = [cookie_value];
+  const result = {
+    "maxAge": [],
+    "expires": [],
+    "realAge": []
+  };
+  cookieValues.forEach(cookie => {
+    let maxAge = null;
+    let expires = null;
+    if (regexMaxAge.exec(cookie)) {
+      maxAge = Number(regexMaxAge.exec(cookie)[1]);
+      result["maxAge"].push(maxAge);
+    }
+    if (regexExpires.exec(cookie)) {
+      expires = Math.round(Number(new Date(regexExpires.exec(cookie)[1])) / 1000) - epochOfRequest;
+      result["expires"].push(Number.isSafeInteger(expires) ?
expires : null); + } + if (maxAge) { + result["realAge"].push(maxAge); + } else if (expires) { + result["realAge"].push(expires); + } + }); + return JSON.stringify(result); +'''; + +WITH age_values AS ( + SELECT + client, + getCookieAgeValues(response_headers.value, CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)) AS values + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + LOWER(response_headers.name) = 'set-cookie' +), + +max_age_values AS ( + SELECT + client, + percentile, + APPROX_QUANTILES(SAFE_CAST(max_age_value AS NUMERIC), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS max_age + FROM age_values, + UNNEST(JSON_QUERY_ARRAY(values, '$.maxAge')) AS max_age_value, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + WHERE + SAFE_CAST(max_age_value AS NUMERIC) > 0 + GROUP BY + percentile, + client + ORDER BY + percentile, + client +), + +expires_values AS ( + SELECT + client, + percentile, + APPROX_QUANTILES(SAFE_CAST(expires_value AS NUMERIC), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS expires + FROM age_values, + UNNEST(JSON_QUERY_ARRAY(values, '$.expires')) AS expires_value, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + WHERE + SAFE_CAST(expires_value AS NUMERIC) > 0 + GROUP BY + percentile, + client + ORDER BY + percentile, + client +), + +real_age_values AS ( + SELECT + client, + percentile, + APPROX_QUANTILES(SAFE_CAST(real_age_value AS NUMERIC), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS real_age + FROM age_values, + UNNEST(JSON_QUERY_ARRAY(values, '$.realAge')) AS real_age_value, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + WHERE + SAFE_CAST(real_age_value AS NUMERIC) > 0 + GROUP BY + percentile, + client + ORDER BY + percentile, + client +) + +SELECT + client, + percentile, + max_age, + expires, + real_age +FROM + max_age_values +JOIN expires_values +USING (client, percentile) +JOIN real_age_values +USING (client, percentile) +ORDER BY + client, + percentile diff --git a/sql/2024/security/cookie_attributes.sql b/sql/2024/security/cookie_attributes.sql new file mode 100644 index 00000000000..66b0f7588d1 --- /dev/null +++ b/sql/2024/security/cookie_attributes.sql @@ -0,0 +1,45 @@ +#standardSQL +# Section: Cookies - Cookie Attributes +# Question: What is the prevalence of cookie attributes (HttpOnly, Secure, SameSite, __Secure-, __Host- prefixes, ...) for cookies set on first-party and third-party requests? 
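+# Example (illustrative): for page https://www.example.com/, a cookie set by static.example.com is first-party
+# (party = 1, same registrable domain example.com), while one set by cdn.example.net is third-party (party = 3).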
+# Note: NET.REG_DOMAIN does not use the private section of publicsuffix.org (e.g., all *.github.io sites are counted as the same party) +SELECT + client, + IF(NET.REG_DOMAIN(url) = NET.REG_DOMAIN(page), 1, 3) AS party, + COUNT(0) AS total_cookies, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*httponly')) AS count_httponly, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*httponly')) / COUNT(0) AS pct_httponly, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*secure')) AS count_secure, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*secure')) / COUNT(0) AS pct_secure, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=')) AS count_samesite, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=')) / COUNT(0) AS pct_samesite, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*lax')) AS count_samesite_lax, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*lax')) / COUNT(0) AS pct_samesite_lax, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*strict')) AS count_samesite_strict, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*strict')) / COUNT(0) AS pct_samesite_strict, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*none')) AS count_samesite_none, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*samesite\s*=\s*none')) / COUNT(0) AS pct_samesite_none, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*sameparty')) AS count_sameparty, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*sameparty')) / COUNT(0) AS pct_sameparty, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*max-age\s*=\s*.+')) AS count_max_age, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*max-age\s*=\s*.+')) / COUNT(0) AS pct_max_age, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*expires\s*=\s*.+')) AS count_expires, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i);.*expires\s*=\s*.+')) / COUNT(0) AS pct_expires, + COUNTIF(NOT(REGEXP_CONTAINS(response_headers.value, r'(?i);.*max-age\s*=\s*.+') OR REGEXP_CONTAINS(response_headers.value, r'(?i);.*expires\s*=\s*.+'))) AS count_session, + COUNTIF(NOT(REGEXP_CONTAINS(response_headers.value, r'(?i);.*max-age\s*=\s*.+') OR REGEXP_CONTAINS(response_headers.value, r'(?i);.*expires\s*=\s*.+'))) / COUNT(0) AS pct_session, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i)^\s*__Secure-')) AS count_secure_prefix, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i)^\s*__Secure-')) / COUNT(0) AS pct_secure_prefix, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i)^\s*__Host-')) AS count_host_prefix, + COUNTIF(REGEXP_CONTAINS(response_headers.value, r'(?i)^\s*__Host-')) / COUNT(0) AS pct_host_prefix +FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers +WHERE + date = '2024-06-01' AND + is_root_page AND + LOWER(response_headers.name) = 'set-cookie' +GROUP BY + client, + party +ORDER BY + client, + party diff --git a/sql/2024/security/cookie_max_age_expires_top_values.sql b/sql/2024/security/cookie_max_age_expires_top_values.sql new file mode 100644 index 00000000000..e7c7e43e584 --- /dev/null +++ b/sql/2024/security/cookie_max_age_expires_top_values.sql @@ -0,0 +1,128 @@ +#standardSQL +# Section: Cookies - Cookie Age +# Question: Which are the most common Max-Age and Expires cookie attribute values? 
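+# Example (illustrative): `Set-Cookie: id=abc; Max-Age=3600; Expires=Thu, 01 Jan 2026 00:00:00 GMT` contributes
+# the value '3600' to the max-age ranking and the raw string 'Thu, 01 Jan 2026 00:00:00 GMT' to the expires ranking.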
+# Note: Expensive query could be combined with the other cookie queries to only go over the cookie headers once.
+CREATE TEMPORARY FUNCTION getCookieAgeValues(cookie_value STRING, epochOfRequest NUMERIC)
+RETURNS STRING DETERMINISTIC
+LANGUAGE js AS '''
+  const regexMaxAge = new RegExp(/max-age\\s*=\\s*(?<maxAge>-*[0-9]+)/i);
+  const regexExpires = new RegExp(/expires\\s*=\\s*(?<expires>.*?)(;|$)/i);
+  const cookieValues = [cookie_value];
+  const result = {
+    "maxAge": [],
+    "expires": []
+  };
+  cookieValues.forEach(cookie => {
+    let maxAge = null;
+    let expires = null;
+    if (regexMaxAge.exec(cookie)) {
+      maxAge = Number(regexMaxAge.exec(cookie)[1]);
+      result["maxAge"].push(maxAge);
+    }
+    if (regexExpires.exec(cookie)) {
+      expires = regexExpires.exec(cookie)[1];
+      result["expires"].push(expires);
+    }
+  });
+  return JSON.stringify(result);
+''';
+
+WITH max_age_values AS (
+  SELECT
+    client,
+    max_age_value
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(response_headers) AS rh,
+    UNNEST(JSON_QUERY_ARRAY(getCookieAgeValues(rh.value, CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)), '$.maxAge')) AS max_age_value
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    LOWER(rh.name) = 'set-cookie'
+),
+
+expires_values AS (
+  SELECT
+    client,
+    expires_value
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(response_headers) AS rh,
+    UNNEST(JSON_QUERY_ARRAY(getCookieAgeValues(rh.value, CAST(JSON_QUERY(summary, '$.startedDateTime') AS NUMERIC)), '$.expires')) AS expires_value
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    LOWER(rh.name) = 'set-cookie'
+),
+
+max_age AS (
+  SELECT
+    client,
+    'max-age' AS type,
+    total_cookies_with_max_age AS total,
+    COUNT(0) AS freq,
+    COUNT(0) / total_cookies_with_max_age AS pct,
+    max_age_value AS attribute_value
+  FROM
+    max_age_values
+  JOIN
+    (
+      SELECT
+        client,
+        COUNT(0) AS total_cookies_with_max_age
+      FROM
+        max_age_values
+      GROUP BY
+        client
+    )
+  USING (client)
+  GROUP BY
+    client,
+    total,
+    attribute_value
+  ORDER BY
+    freq DESC
+  LIMIT 50
+),
+
+expires AS (
+  SELECT
+    client,
+    'expires' AS type,
+    total_cookies_with_expires AS total,
+    COUNT(0) AS freq,
+    COUNT(0) / total_cookies_with_expires AS pct,
+    expires_value AS attribute_value
+  FROM
+    expires_values
+  JOIN
+    (
+      SELECT
+        client,
+        COUNT(0) AS total_cookies_with_expires
+      FROM
+        expires_values
+      GROUP BY
+        client
+    )
+  USING (client)
+  GROUP BY
+    client,
+    total,
+    attribute_value
+  ORDER BY
+    freq DESC
+  LIMIT 50
+)
+
+SELECT *
+FROM
+  max_age
+UNION ALL
+SELECT *
+FROM
+  expires
+ORDER BY
+  client,
+  type,
+  freq DESC
diff --git a/sql/2024/security/coop_header_prevalence.sql b/sql/2024/security/coop_header_prevalence.sql
new file mode 100644
index 00000000000..33f76fdd793
--- /dev/null
+++ b/sql/2024/security/coop_header_prevalence.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Attack Preventions - Preventing attacks using Cross-Origin policies
+# Question: Which are the most common COOP values?
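+# Note: The values defined by the spec are `unsafe-none` (the default), `same-origin-allow-popups` and `same-origin`.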
+# Note: Considers headers of main document responses +SELECT + client, + coop_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_coop_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS coop_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'cross-origin-opener-policy') +GROUP BY + client, + coop_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/corp_header_prevalence.sql b/sql/2024/security/corp_header_prevalence.sql new file mode 100644 index 00000000000..27a4e851ccc --- /dev/null +++ b/sql/2024/security/corp_header_prevalence.sql @@ -0,0 +1,29 @@ +#standardSQL +# Section: Attack Preventions - Preventing attacks using Cross-Origin policies +# Question: Which are the most common CORP values? +# Note: Considers headers of all responses including all subresources (header is used for script and img resources) +SELECT + client, + corp_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_corp_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS corp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + LOWER(response_headers.name) = 'cross-origin-resource-policy') +GROUP BY + client, + corp_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/cryptominer_share.sql b/sql/2024/security/cryptominer_share.sql new file mode 100644 index 00000000000..2f4d8ed0766 --- /dev/null +++ b/sql/2024/security/cryptominer_share.sql @@ -0,0 +1,23 @@ +#standardSQL + # Section: Malpractices on the web + # Question: Which cryptominers have the largest market share? +SELECT + client, + t.technology, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_cryptominers, + COUNT(0) AS freq, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category +WHERE + date = '2024-06-01' AND + category = 'Cryptominers' AND + is_root_page +GROUP BY + client, + t.technology +ORDER BY + client, + pct DESC diff --git a/sql/2024/security/cryptominer_usage.sql b/sql/2024/security/cryptominer_usage.sql new file mode 100644 index 00000000000..f05781eea29 --- /dev/null +++ b/sql/2024/security/cryptominer_usage.sql @@ -0,0 +1,26 @@ +#standardSQL + # Section: Malpractices on the web + # Question: How many sites used cryptominers over time? + # Note: The usage is very low, so maybe we want to drop this query. 
Also unclear which starting date we want +SELECT + DATE_TRUNC(date, MONTH) AS month, + client, + COUNT(DISTINCT + IF(category = 'Cryptominers', page, NULL)) AS freq, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT + IF(category = 'Cryptominers', page, NULL)) / COUNT(DISTINCT page) AS pct +FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category +WHERE + date >= '2022-05-01' AND + is_root_page +GROUP BY + date, + client +ORDER BY + client, + month, + pct DESC diff --git a/sql/2024/security/csp_allowed_host_frequency.sql b/sql/2024/security/csp_allowed_host_frequency.sql new file mode 100644 index 00000000000..b7611af8f6e --- /dev/null +++ b/sql/2024/security/csp_allowed_host_frequency.sql @@ -0,0 +1,50 @@ +#standardSQL +# Section: Attack preventions - Preventing attacks using CSP +# Question: Which are the most common "allowed host" values in CSPs on home pages? +WITH totals AS ( + SELECT + client, + COUNT(0) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document + GROUP BY + client +) + +SELECT + client, + csp_allowed_host, + total AS total_pages, + COUNT(DISTINCT page) AS freq, + COUNT(DISTINCT page) / total AS pct +FROM ( + SELECT + client, + page, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy' +) +JOIN + totals +USING (client), + UNNEST(REGEXP_EXTRACT_ALL(csp_header, r'(?i)(https*://[^\s;]+)[\s;]')) AS csp_allowed_host +WHERE + csp_header IS NOT NULL +GROUP BY + client, + total, + csp_allowed_host +ORDER BY + pct DESC +LIMIT 100 diff --git a/sql/2024/security/csp_allowed_host_frequency_wss.sql b/sql/2024/security/csp_allowed_host_frequency_wss.sql new file mode 100644 index 00000000000..bdd4f2900f6 --- /dev/null +++ b/sql/2024/security/csp_allowed_host_frequency_wss.sql @@ -0,0 +1,50 @@ +#standardSQL +# Section: Attack preventions - Preventing attacks using CSP +# Question: Which are the most common WSS "allowed host" values in CSPs on home pages? +WITH totals AS ( + SELECT + client, + COUNT(0) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document + GROUP BY + client +) + +SELECT + client, + csp_allowed_host, + total AS total_pages, + COUNT(DISTINCT page) AS freq, + COUNT(DISTINCT page) / total AS pct +FROM ( + SELECT + client, + page, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy' +) +JOIN + totals +USING (client), + UNNEST(REGEXP_EXTRACT_ALL(csp_header, r'(?i)(wss*://[^\s;]+)[\s;]')) AS csp_allowed_host +WHERE + csp_header IS NOT NULL +GROUP BY + client, + total, + csp_allowed_host +ORDER BY + pct DESC +LIMIT 100 diff --git a/sql/2024/security/csp_directives_usage.sql b/sql/2024/security/csp_directives_usage.sql new file mode 100644 index 00000000000..b49ac2e1dc3 --- /dev/null +++ b/sql/2024/security/csp_directives_usage.sql @@ -0,0 +1,30 @@ +#standardSQL +# Section: Attack preventions - Preventing attacks using CSP +# Question: Which are the most common CSP directives on home pages? 
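+# Example (illustrative): a header like
+#   Content-Security-Policy: default-src 'self'; script-src 'self' 'unsafe-inline'
+# is counted once for the default-src directive and once for script-src by the regex below.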
+SELECT + client, + directive, + COUNT(0) AS total_csp_headers, + COUNTIF(REGEXP_CONTAINS(CONCAT(' ', csp_header, ' '), CONCAT(r'(?i)\W', directive, r'\W'))) AS num_with_directive, + COUNTIF(REGEXP_CONTAINS(CONCAT(' ', csp_header, ' '), CONCAT(r'(?i)\W', directive, r'\W'))) / COUNT(0) AS pct_with_directive +FROM ( + SELECT + client, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy'), + UNNEST(['child-src', 'connect-src', 'default-src', 'font-src', 'frame-src', 'img-src', 'manifest-src', 'media-src', 'object-src', 'prefetch-src', 'script-src', 'script-src-elem', 'script-src-attr', 'style-src', 'style-src-elem', 'style-src-attr', 'worker-src', 'base-uri', 'plugin-types', 'sandbox', 'form-action', 'frame-ancestors', 'navigate-to', 'report-uri', 'report-to', 'block-all-mixed-content', 'referrer', 'require-sri-for', 'require-trusted-types-for', 'trusted-types', 'upgrade-insecure-requests', 'input-protection']) AS directive +WHERE + csp_header IS NOT NULL +GROUP BY + client, + directive +ORDER BY + client, + pct_with_directive DESC diff --git a/sql/2024/security/csp_most_common_header.sql b/sql/2024/security/csp_most_common_header.sql new file mode 100644 index 00000000000..a0a3b0e85d8 --- /dev/null +++ b/sql/2024/security/csp_most_common_header.sql @@ -0,0 +1,31 @@ +#standardSQL +# Section: Attack Preventions - Preventing attacks using CSP +# Question: Which are the most common CSP values on home pages? +# Note: Only considers CSPs of the home page and not of embedded resources +SELECT + client, + csp_header, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_csp_headers, + COUNT(0) AS freq, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy') +WHERE + csp_header IS NOT NULL +GROUP BY + client, + csp_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/csp_number_of_allowed_hosts.sql b/sql/2024/security/csp_number_of_allowed_hosts.sql new file mode 100644 index 00000000000..3ca4f68b5e9 --- /dev/null +++ b/sql/2024/security/csp_number_of_allowed_hosts.sql @@ -0,0 +1,36 @@ +#standardSQL +# Section: Attack Preventions - Preventing attacks using CSP +# Question: CSP on home pages: number of unique headers, header length and number of allowed HTTP(S) hosts in all directives +CREATE TEMP FUNCTION getNumUniqueHosts(str STRING) AS ( + (SELECT COUNT(DISTINCT x) FROM UNNEST(REGEXP_EXTRACT_ALL(str, r'(?i)(https*://[^\s;]+)[\s;]')) AS x) +); + +SELECT + client, + percentile, + COUNT(0) AS total_requests, + COUNTIF(csp_header IS NOT NULL) AS total_csp_headers, + COUNTIF(csp_header IS NOT NULL) / COUNT(0) AS pct_csp_headers, + COUNT(DISTINCT csp_header) AS num_unique_csp_headers, + APPROX_QUANTILES(LENGTH(csp_header), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS csp_header_length, + APPROX_QUANTILES(getNumUniqueHosts(csp_header), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS unique_allowed_hosts +FROM ( + SELECT + client, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page 
AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy' +), +UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/security/csp_script_source_list_keywords.sql b/sql/2024/security/csp_script_source_list_keywords.sql new file mode 100644 index 00000000000..c1bf47b19ee --- /dev/null +++ b/sql/2024/security/csp_script_source_list_keywords.sql @@ -0,0 +1,49 @@ +#standardSQL +# Section: Attack preventions - Preventing attacks using CSP +# Question: usage of default/script-src, and within the directive usage of strict-dynamic, nonce values, unsafe-inline and unsafe-eval +SELECT + client, + total_pages, + freq_csp, + freq_default_script_src, + SAFE_DIVIDE(freq_default_script_src, freq_csp) AS pct_default_script_src_over_csp, + freq_strict_dynamic, + SAFE_DIVIDE(freq_strict_dynamic, freq_csp) AS pct_strict_dynamic_over_csp, + SAFE_DIVIDE(freq_strict_dynamic, freq_default_script_src) AS pct_strict_dynamic_over_csp_with_src, + freq_nonce, + SAFE_DIVIDE(freq_nonce, freq_csp) AS pct_nonce_over_csp, + SAFE_DIVIDE(freq_nonce, freq_default_script_src) AS pct_nonce_over_csp_with_src, + freq_unsafe_inline, + SAFE_DIVIDE(freq_unsafe_inline, freq_csp) AS pct_unsafe_inline_over_csp, + SAFE_DIVIDE(freq_unsafe_inline, freq_default_script_src) AS pct_unsafe_inline_over_csp_with_src, + freq_unsafe_eval, + SAFE_DIVIDE(freq_unsafe_eval, freq_csp) AS pct_unsafe_eval_over_csp, + SAFE_DIVIDE(freq_unsafe_eval, freq_default_script_src) AS pct_unsafe_eval_over_csp_with_src +FROM ( + SELECT + client, + COUNT(0) AS total_pages, + COUNTIF(csp_header IS NOT NULL) AS freq_csp, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src')) AS freq_default_script_src, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+strict-dynamic')) AS freq_strict_dynamic, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+nonce-')) AS freq_nonce, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+unsafe-inline')) AS freq_script_unsafe_inline, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)(default|script)-src[^;]+unsafe-eval')) AS freq_script_unsafe_eval, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)unsafe-inline')) AS freq_unsafe_inline, + COUNTIF(REGEXP_CONTAINS(csp_header, '(?i)unsafe-eval')) AS freq_unsafe_eval + FROM ( + SELECT + client, + response_headers.value AS csp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'content-security-policy') + GROUP BY + client) +ORDER BY + client diff --git a/sql/2024/security/documentdomain_usage.sql b/sql/2024/security/documentdomain_usage.sql new file mode 100644 index 00000000000..e9223e7e4a9 --- /dev/null +++ b/sql/2024/security/documentdomain_usage.sql @@ -0,0 +1,19 @@ +#standardSQL + # Section: Attack preventions - Security Headers? (document.domain feature) + # Question: How often is document.domain still used even though deprecated? 
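+ # Example (illustrative): a script on a.example.com running
+ #   document.domain = 'example.com';
+ # to enable cross-origin access to frames on b.example.com would show up in the DocumentSetDomain counter.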
+ # Note: Possible to port to httparchive.all.pages, however would require to recreate num_urls, total_urls, and pct_urls
+ # Note: Features here: https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/core/dom/document.cc?q=DocumentSetDomain
+ # Note: DocumentDomainSettingWithoutOriginAgentClusterHeader seems broken as the OAC header is very rare and yet the difference between DocumentSetDomain and DocumentDomainSettingWithoutOriginAgentClusterHeader is large (Explanation: from '20230201' they count a missing header as if the header exists.)
+SELECT
+  client,
+  feature,
+  num_urls,
+  total_urls,
+  pct_urls
+FROM
+  `httparchive.blink_features.usage`
+WHERE
+  yyyymmdd = '20240601' AND
+  feature IN UNNEST(['DocumentSetDomain', 'DocumentDomainSettingWithoutOriginAgentClusterHeader', 'DocumentDomainSetWithDefaultPort', 'DocumentDomainSetWithNonDefaultPort', 'CrossOriginAccessBasedOnDocumentDomain', 'DocumentDomainEnabledCrossOriginAccess', 'DocumentDomainBlockedCrossOriginAccess', 'DocumentOpenAliasedOriginDocumentDomain'])
+ORDER BY
+  pct_urls DESC
diff --git a/sql/2024/security/feature_adoption_by_category.sql b/sql/2024/security/feature_adoption_by_category.sql
new file mode 100644
index 00000000000..97676bc78a6
--- /dev/null
+++ b/sql/2024/security/feature_adoption_by_category.sql
@@ -0,0 +1,53 @@
+#standardSQL
+# Section: Drivers of security mechanism adoption - Website Category
+# Question: How are security feature adoption and the category of a website related?
+# Note: Not all headers have their individual percentages
+# Note: Currently uses regex search on respOtherHeaders that can have false positives if a header name is used as a value of a header; could use the new response_header struct instead
+# Note: Only on the main document (is_main_document)
+CREATE TEMP FUNCTION getNumSecurityHeaders(headers STRING) AS (
+  (
+    SELECT
+      COUNTIF(REGEXP_CONTAINS(headers, CONCAT('(?i)', headername, ' ')))
+    FROM
+      UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy',
+        'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To',
+        'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection']) AS headername
+  )
+);
+
+SELECT
+  client,
+  category,
+  COUNT(0) AS total_pages_for_category,
+  COUNTIF(STARTS_WITH(url, 'https')) AS freq_https,
+  SAFE_DIVIDE(COUNTIF(STARTS_WITH(url, 'https')), COUNT(0)) AS pct_https,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-Frame-Options ')), COUNT(0)) AS pct_xfo,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Strict-Transport-Security ')), COUNT(0)) AS pct_hsts,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-Content-Type-Options ')), COUNT(0)) AS pct_xcto,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Expect-CT ')), COUNT(0)) AS pct_expectct,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Content-Security-Policy ')), COUNT(0)) AS pct_csp,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Content-Security-Policy-Report-Only ')), COUNT(0)) AS pct_csp_report_only,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-XSS-Protection ')), COUNT(0)) AS pct_xss,
+  AVG(getNumSecurityHeaders(respOtherHeaders)) AS avg_security_headers,
+  APPROX_QUANTILES(getNumSecurityHeaders(respOtherHeaders), 1000)[OFFSET(500)] AS median_security_headers
+FROM (
+  SELECT
+    client,
+    SPLIT(parent_category, '/')[1] AS category,
+    JSON_VALUE(r.summary, '$.respOtherHeaders') AS respOtherHeaders,
+    url
+  FROM
+    `httparchive.all.requests` AS r
+  INNER JOIN
+    UNNEST(`httparchive.fn.GET_HOST_CATEGORIES`(url))
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    is_main_document
+)
+GROUP BY
+  client,
+  category
+ORDER BY
+  client,
+  total_pages_for_category DESC
diff --git a/sql/2024/security/feature_adoption_by_country.sql b/sql/2024/security/feature_adoption_by_country.sql
new file mode 100644
index 00000000000..0a40cb55a85
--- /dev/null
+++ b/sql/2024/security/feature_adoption_by_country.sql
@@ -0,0 +1,57 @@
+#standardSQL
+# Section: Drivers of security mechanism adoption - Location of a website
+# Question: How are security feature adoption and the location of a website related (i.e. which is the most common country visiting that website)?
+# Note: Security feature adoption grouped by sites frequently visited from different countries
+# Note: Not all headers have their individual percentages
+# Note: Currently uses regex search on respOtherHeaders that can have false positives if a header name is used as a value of a header; could use the new response_header struct instead
+# Note: Only on the main document (is_main_document)
+CREATE TEMP FUNCTION getNumSecurityHeaders(headers STRING) AS (
+  (
+    SELECT
+      COUNTIF(REGEXP_CONTAINS(headers, CONCAT('(?i)', headername, ' ')))
+    FROM
+      UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy',
+        'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To',
+        'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection']) AS headername
+  )
+);
+
+SELECT
+  client,
+  country,
+  COUNT(0) AS total_pages_for_country,
+  COUNTIF(STARTS_WITH(url, 'https')) AS freq_https,
+  SAFE_DIVIDE(COUNTIF(STARTS_WITH(url, 'https')), COUNT(0)) AS pct_https,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-Frame-Options ')), COUNT(0)) AS pct_xfo,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Strict-Transport-Security ')), COUNT(0)) AS pct_hsts,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-Content-Type-Options ')), COUNT(0)) AS pct_xcto,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Expect-CT ')), COUNT(0)) AS pct_expectct,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Content-Security-Policy ')), COUNT(0)) AS pct_csp,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)Content-Security-Policy-Report-Only ')), COUNT(0)) AS pct_csp_report_only,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(respOtherHeaders, '(?i)X-XSS-Protection ')), COUNT(0)) AS pct_xss,
+  AVG(getNumSecurityHeaders(respOtherHeaders)) AS avg_security_headers,
+  APPROX_QUANTILES(getNumSecurityHeaders(respOtherHeaders), 1000)[OFFSET(500)] AS median_security_headers
+FROM (
+  SELECT
+    client,
+    `chrome-ux-report.experimental`.GET_COUNTRY(country_code) AS country,
+    JSON_VALUE(r.summary, '$.respOtherHeaders') AS respOtherHeaders,
+    url
+  FROM
+    `httparchive.all.requests` AS r
+  INNER JOIN
+    `chrome-ux-report.experimental.country` AS c
+  ON
+    url = CONCAT(c.origin, '/')
+  WHERE
+    date = '2024-06-01' AND
+    yyyymm = 202406 AND
+    is_root_page AND
+    is_main_document
+)
+GROUP BY
+  client,
+  country
+ORDER BY
+  client,
+  total_pages_for_country DESC
diff --git a/sql/2024/security/feature_adoption_by_other_features.sql b/sql/2024/security/feature_adoption_by_other_features.sql
new file mode 100644
index
00000000000..31ab2a21013 --- /dev/null +++ b/sql/2024/security/feature_adoption_by_other_features.sql @@ -0,0 +1,53 @@ +#standardSQL +# Section: Drivers of security mechanism adoption - Other Headers? +# Question: Which features (mostly security headers) influence the adoption of other features? +# Note: Query seems unnatural after the port; Add other (new) features? +SELECT + client, + headername, + COUNT(0) AS total_pages, + COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' '))) AS total_with_header, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' '))), COUNT(0)) AS pct, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + STARTS_WITH(url, 'https')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_https, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Content-Security-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_csp, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Content-Security-Policy-Report-Only ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_csp_report_only, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Cross-Origin-Embedder-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_coep, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Cross-Origin-Opener-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_coop, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Cross-Origin-Resource-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_corp, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Expect-CT ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_expectct, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Feature-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_featurep, + SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND + REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), 
'(?i)Permissions-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_permissionsp,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Referrer-Policy ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_referrerp,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Report-To ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_reportto,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)Strict-Transport-Security ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_hsts,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)X-Content-Type-Options ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_xcto,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)X-Frame-Options ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_xfo,
+  SAFE_DIVIDE(COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')) AND
+    REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), '(?i)X-XSS-Protection ')), COUNTIF(REGEXP_CONTAINS(JSON_VALUE(summary, '$.respOtherHeaders'), CONCAT('(?i)', headername, ' ')))) AS pct_header_and_xss
+FROM
+  `httparchive.all.requests`,
+  UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To', 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection']) AS headername
+WHERE
+  date = '2024-06-01' AND
+  is_root_page AND
+  is_main_document
+GROUP BY
+  client,
+  headername
+ORDER BY
+  client,
+  headername
diff --git a/sql/2024/security/feature_adoption_by_technology.sql b/sql/2024/security/feature_adoption_by_technology.sql
new file mode 100644
index 00000000000..d86792989bb
--- /dev/null
+++ b/sql/2024/security/feature_adoption_by_technology.sql
@@ -0,0 +1,83 @@
+#standardSQL
+# Section: Drivers of security mechanism adoption - Technology stack
+# Question: How are security features and used technology correlated?
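+# Example (hypothetical result row): client='mobile', category='Ecommerce', technology='Shopify',
+# headername='Strict-Transport-Security', pct=0.95 would suggest platform-level adoption of the
+# header rather than individual site owners opting in.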
+# Note: Adoption of features based on the technology that is used +WITH +totals AS ( + SELECT + client, + category, + t.technology AS technology, + COUNT(page) AS total_pages_with_technology, + COUNT(DISTINCT + IF(STARTS_WITH(page, 'https'), page, NULL)) AS total_https_pages + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client, + category, + technology +) +SELECT + client, + category, + technology, + headername, + total_pages_with_technology, + total_https_pages, + COUNT(DISTINCT + IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')), url, NULL)) AS freq, + SAFE_DIVIDE(COUNT(DISTINCT + IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')), url, NULL)), total_pages_with_technology) AS pct, + SAFE_DIVIDE(COUNT(DISTINCT + IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')) AND + STARTS_WITH(url, 'https'), url, NULL)), total_https_pages) AS pct_https +FROM ( + SELECT + client, + technologies, + JSON_VALUE(r.summary, '$.respOtherHeaders') AS respOtherHeaders, + url + FROM + `httparchive.all.requests` AS r + INNER JOIN + `httparchive.all.pages` + USING + (client, + page, + date, + is_root_page) + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document), + UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To', 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection', 'Timing-Allow-Origin', 'Origin-Agent-Cluster']) AS headername, + UNNEST(technologies) AS t, + UNNEST(t.categories) AS category +INNER JOIN + totals +USING + (client, + category, + technology) +GROUP BY + client, + category, + technology, + headername, + total_pages_with_technology, + total_https_pages +HAVING + total_pages_with_technology >= 1000 AND + category IN UNNEST(['Blogs', 'CDN', 'Web frameworks', 'Programming languages', 'CMS', 'Ecommerce', 'PaaS', 'Security']) AND + pct >= 0.50 +ORDER BY + client, + category, + technology, + headername diff --git a/sql/2024/security/feature_adoption_by_topN_technologies.sql b/sql/2024/security/feature_adoption_by_topN_technologies.sql new file mode 100644 index 00000000000..1a07c7ed605 --- /dev/null +++ b/sql/2024/security/feature_adoption_by_topN_technologies.sql @@ -0,0 +1,91 @@ +#standardSQL +# Section: Drivers of security mechanism adoption - Technology stack +# Question: Determines to what extent the top-N technology drivers are responsible for the global adoption of different security features +# Note: Not sure if this query makes sense +WITH app_headers AS ( + SELECT + client, + headername, + category, + t.technology AS technology, + JSON_VALUE(r.summary, '$.respOtherHeaders') AS respOtherHeaders, + url + FROM + `httparchive.all.requests` AS r + INNER JOIN + `httparchive.all.pages` + USING + (client, page, date, is_root_page), + UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', + 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To', + 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection', 'Timing-Allow-Origin', 'Origin-Agent-Cluster']) AS headername, + UNNEST(technologies) AS t, + 
UNNEST(t.categories) AS category
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    is_main_document AND
+    category IN UNNEST(['Blogs', 'CDN', 'Web frameworks', 'Programming languages', 'CMS', 'Ecommerce', 'PaaS', 'Security'])
+)
+
+SELECT
+  client,
+  headername,
+  topN,
+  ARRAY_TO_STRING(array_slice(top_apps, 0, topN - 1), ', ', 'NULL') AS topN_apps,
+  COUNT(DISTINCT IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')) AND CONCAT(category, '_', technology) IN UNNEST(array_slice(top_apps, 0, topN - 1)), url, NULL)) AS freq_in_topN,
+  SAFE_DIVIDE(COUNT(DISTINCT IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')) AND CONCAT(category, '_', technology) IN UNNEST(array_slice(top_apps, 0, topN - 1)), url, NULL)), global_freq) AS pct_overall
+FROM
+  app_headers
+INNER JOIN (
+  SELECT
+    headername,
+    client,
+    ARRAY_AGG(CONCAT(category, '_', technology) ORDER BY freq DESC) AS top_apps
+  FROM (
+    SELECT
+      headername,
+      client,
+      category,
+      technology,
+      COUNT(DISTINCT IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')), url, NULL)) AS freq,
+      SAFE_DIVIDE(COUNT(DISTINCT IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')), url, NULL)), COUNT(DISTINCT url)) AS pct
+    FROM
+      app_headers
+    GROUP BY
+      headername,
+      client,
+      category,
+      technology
+    HAVING
+      pct > 0.8 AND
+      freq > 1000
+  )
+  GROUP BY
+    client,
+    headername)
+USING
+  (client, headername)
+INNER JOIN (
+  SELECT
+    client,
+    headername,
+    COUNT(DISTINCT IF(REGEXP_CONTAINS(respOtherHeaders, CONCAT('(?i)', headername, ' ')), url, NULL)) AS global_freq
+  FROM
+    app_headers
+  GROUP BY
+    client,
+    headername)
+USING
+  (client, headername),
+  UNNEST(GENERATE_ARRAY(1, 10)) AS topN
+GROUP BY
+  client,
+  topN,
+  topN_apps,
+  headername,
+  global_freq
+ORDER BY
+  client,
+  headername,
+  topN
diff --git a/sql/2024/security/fp_header_prevalence.sql b/sql/2024/security/fp_header_prevalence.sql
new file mode 100644
index 00000000000..33498c4a027
--- /dev/null
+++ b/sql/2024/security/fp_header_prevalence.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: ? (Permissions)
+# Question: Which are the most common FP values?
+# Note: Considers headers of main document responses
+SELECT
+  client,
+  fp_header,
+  SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_fp_headers,
+  COUNT(DISTINCT host) AS freq,
+  COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct
+FROM (
+  SELECT
+    client,
+    NET.HOST(url) AS host,
+    response_headers.value AS fp_header
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(response_headers) AS response_headers
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    is_main_document AND
+    LOWER(response_headers.name) = 'feature-policy')
+GROUP BY
+  client,
+  fp_header
+ORDER BY
+  pct DESC
+LIMIT
+  100
diff --git a/sql/2024/security/home_page_https_usage.sql b/sql/2024/security/home_page_https_usage.sql
new file mode 100644
index 00000000000..a3531471e81
--- /dev/null
+++ b/sql/2024/security/home_page_https_usage.sql
@@ -0,0 +1,18 @@
+#standardSQL
+# Section: Transport Security - Protocol versions
+# Question: How many websites (home pages only) use HTTP vs HTTPS?
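+# Example (illustrative): STARTS_WITH(page, 'https') splits home pages into two buckets, e.g.
+#   'https://example.com/' counts toward https = TRUE and 'http://example.com/' toward https = FALSE;
+# pct is then each bucket's share of all home pages per client.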
+SELECT + client, + STARTS_WITH(page, 'https') AS https, + COUNT(0) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + `httparchive.all.requests` +WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document +GROUP BY + client, + https diff --git a/sql/2024/security/hsts_attributes.sql b/sql/2024/security/hsts_attributes.sql new file mode 100644 index 00000000000..b52d4e28566 --- /dev/null +++ b/sql/2024/security/hsts_attributes.sql @@ -0,0 +1,25 @@ +#standardSQL +# Section: Transport Security - HTTP Strict Transport Security +# Question: How many websites use HSTS includeSubDomains and preload? +SELECT + client, + COUNT(0) AS total_requests, + COUNTIF(hsts_header_val IS NOT NULL) AS total_hsts_headers, + COUNTIF(hsts_header_val IS NOT NULL) / COUNT(0) AS pct_hsts_requests, + COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)max-age\s*=\s*\d+') AND NOT REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_valid_max_age, + COUNTIF(REGEXP_CONTAINS(CONCAT(hsts_header_val, ' '), r'(?i)max-age\s*=\s*0\W')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_zero_max_age, + COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)includeSubDomains')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_include_subdomains, + COUNTIF(REGEXP_CONTAINS(hsts_header_val, r'(?i)preload')) / COUNTIF(hsts_header_val IS NOT NULL) AS pct_preload +FROM ( + SELECT + client, + REGEXP_EXTRACT(JSON_VALUE(summary, '$.respOtherHeaders'), r'(?i)strict-transport-security =([^,]+)') AS hsts_header_val + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document +) +GROUP BY + client diff --git a/sql/2024/security/hsts_max_age_percentiles.sql b/sql/2024/security/hsts_max_age_percentiles.sql new file mode 100644 index 00000000000..ac402c4f69a --- /dev/null +++ b/sql/2024/security/hsts_max_age_percentiles.sql @@ -0,0 +1,26 @@ +#standardSQL +# Section: Transport Security - HTTP Strict Transport Security +# Question: What is the distribution of max-age values for HSTS? +SELECT + client, + percentile, + APPROX_QUANTILES(max_age, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS max_age +FROM ( + SELECT + client, + SAFE_CAST(REGEXP_EXTRACT(response_headers.value, r'(?i)max-age=\s*(-?\d+)') AS NUMERIC) AS max_age + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + LOWER(response_headers.name) = 'strict-transport-security' +), +UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/security/html_sanitization_usage.sql b/sql/2024/security/html_sanitization_usage.sql new file mode 100644 index 00000000000..6d67686b9f7 --- /dev/null +++ b/sql/2024/security/html_sanitization_usage.sql @@ -0,0 +1,19 @@ +#standardSQL + # Section: Attack preventions - HTML Sanitization + # Question: How often is setHTMLUnsafe and parseHTMLUnsafe used? + # Note: Possible to port to httparchive.all.pages, however would require to recreate num_urls, total_urls, and pct_urls + # Note: https://chromestatus.com/feature/6560361081995264 + # Note: very rare! 
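+ # Example (illustrative): the two counters fire on calls such as
+ #   element.setHTMLUnsafe('<div>...</div>') and Document.parseHTMLUnsafe('<p>...</p>'),
+ # which parse markup without sanitizing it (unlike the Sanitizer API's safe setHTML variant).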
+SELECT + client, + feature, + num_urls, + total_urls, + pct_urls +FROM + `httparchive.blink_features.usage` +WHERE + yyyymmdd = '20240601' AND + feature IN UNNEST(['SetHTMLUnsafe', 'ParseHTMLUnsafe']) +ORDER BY + pct_urls DESC diff --git a/sql/2024/security/https_request_over_time.sql b/sql/2024/security/https_request_over_time.sql new file mode 100644 index 00000000000..3958f1656e7 --- /dev/null +++ b/sql/2024/security/https_request_over_time.sql @@ -0,0 +1,20 @@ +#standardSQL +# Section: Transport Security - ? +# Question: How many requests are made via HTTPS over time? +# Note: Currently all requests on the landing page; could be restricted to top-level requests only (is_main_document) +SELECT + date, + client, + SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) / COUNT(0) AS percent +FROM + `httparchive.all.requests` +WHERE + date >= '2022-06-01' AND + is_root_page +# AND is_main_document +GROUP BY + date, + client +ORDER BY + date DESC, + client diff --git a/sql/2024/security/https_server_redirects.sql b/sql/2024/security/https_server_redirects.sql new file mode 100644 index 00000000000..cb01dfa4784 --- /dev/null +++ b/sql/2024/security/https_server_redirects.sql @@ -0,0 +1,23 @@ +#standardSQL +# Section: Transport Security - Unused? +# Question: How many HTTP requests exist on pages and how many of them server-side redirect to HTTPS +# Note: Does not distinguish between whether the main Page URL is HTTP or HTTPS +SELECT + client, + date, + COUNT(DISTINCT url) AS total_urls_on_page, + COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) AS count_http_urls_on_page, + COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) / COUNT(DISTINCT url) AS pct_http_urls_on_page, + COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND JSON_VALUE(summary, '$.resp_location') LIKE 'https://%' AND CAST(JSON_VALUE(summary, '$.status') AS INT) BETWEEN 300 AND 399 THEN url END)) AS count_http_urls_with_https_redirect_on_page, + COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' AND JSON_VALUE(summary, '$.resp_location') LIKE 'https://%' AND CAST(JSON_VALUE(summary, '$.status') AS INT) BETWEEN 300 AND 399 THEN url END)) / COUNT(DISTINCT(CASE WHEN url LIKE 'http://%' THEN url END)) AS pct_http_urls_with_https_redirect_on_page +FROM + `httparchive.all.requests` +WHERE + date = '2024-06-01' AND + is_root_page +GROUP BY + client, + date +ORDER BY + client, + date diff --git a/sql/2024/security/iframe_allow_directive_values.sql b/sql/2024/security/iframe_allow_directive_values.sql new file mode 100644 index 00000000000..d92184993cd --- /dev/null +++ b/sql/2024/security/iframe_allow_directive_values.sql @@ -0,0 +1,63 @@ +#standardSQL +# Section: Content Inclusion - Permissions Policy +# Question: Which are the most prominent directives/directive-value pairs for the allow attributes on iframes? 
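+# Example (illustrative): an element like
+#   <iframe src="https://maps.example.com" allow="geolocation https://maps.example.com; fullscreen">
+# yields the pairs ('geolocation', 'https://maps.example.com') and ('fullscreen', ''),
+# where the empty origin marks a directive without an explicit origin list.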
+CREATE TEMP FUNCTION getNumWithAllowAttribute(payload STRING) AS (( + SELECT + COUNT(0) + FROM + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox')) AS iframeAttr + WHERE + JSON_EXTRACT_SCALAR(iframeAttr, '$.allow') IS NOT NULL +)); + +SELECT + client, + SPLIT(TRIM(allow_attr), ' ')[OFFSET(0)] AS directive, + TRIM(origin) AS origin, + total_iframes_with_allow, + COUNT(0) AS freq, + COUNT(0) / total_iframes_with_allow AS pct +FROM ( + SELECT + client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + ), + UNNEST(iframeAttrs) AS iframeAttr, + UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT_SCALAR(iframeAttr, '$.allow'), r'(?i)([^,;]+)')) AS allow_attr +JOIN ( + SELECT + client, + SUM(getNumWithAllowAttribute(payload)) AS total_iframes_with_allow + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client +) USING (client), +UNNEST( -- Directive may specify explicit origins or not. + IF( + ARRAY_LENGTH(SPLIT(TRIM(allow_attr), ' ')) = 1, -- test if any explicit origin is provided + [TRIM(allow_attr), ''], -- if not, add a dummy empty origin to make the query work + SPLIT(TRIM(allow_attr), ' ' -- if it is, split the different origins + ) + ) +) AS origin WITH OFFSET AS offset +WHERE + offset > 0 -- do not retain the first part of the directive (as this is the directive name) +GROUP BY + client, + directive, + origin, + total_iframes_with_allow +HAVING + pct > 0.001 +ORDER BY + client, + pct DESC diff --git a/sql/2024/security/iframe_allow_directives.sql b/sql/2024/security/iframe_allow_directives.sql new file mode 100644 index 00000000000..287a037c0e5 --- /dev/null +++ b/sql/2024/security/iframe_allow_directives.sql @@ -0,0 +1,51 @@ +#standardSQL +# Section: Content Inclusion - Permissions Policy +# Question: Which are the most prominent directives for the allow attributes on iframes? 
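+# Example (illustrative): allow="camera 'none'; fullscreen" contributes the directive names
+# 'camera' and 'fullscreen' to the counts below; any origins following the name are ignored here.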
+CREATE TEMP FUNCTION getNumWithAllowAttribute(payload STRING) AS ((
+  SELECT
+    COUNT(0)
+  FROM
+    UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox')) AS iframeAttr
+  WHERE
+    JSON_EXTRACT_SCALAR(iframeAttr, '$.allow') IS NOT NULL
+));
+
+SELECT
+  client,
+  SPLIT(TRIM(allow_attr), ' ')[OFFSET(0)] AS directive,
+  total_iframes_with_allow,
+  COUNT(0) AS freq,
+  COUNT(0) / total_iframes_with_allow AS pct
+FROM (
+  SELECT
+    client,
+    JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  ),
+  UNNEST(iframeAttrs) AS iframeAttr,
+  UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT_SCALAR(iframeAttr, '$.allow'), r'(?i)([^,;]+)')) AS allow_attr
+JOIN (
+  SELECT
+    client,
+    SUM(getNumWithAllowAttribute(payload)) AS total_iframes_with_allow
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  GROUP BY
+    client
+) USING (client)
+GROUP BY
+  client,
+  directive,
+  total_iframes_with_allow
+HAVING
+  pct > 0.001
+ORDER BY
+  client,
+  pct DESC
diff --git a/sql/2024/security/iframe_attribute_popular_hosts.sql b/sql/2024/security/iframe_attribute_popular_hosts.sql
new file mode 100644
index 00000000000..68d416bf98d
--- /dev/null
+++ b/sql/2024/security/iframe_attribute_popular_hosts.sql
@@ -0,0 +1,59 @@
+#standardSQL
+# Section: Content Inclusion - Iframe Sandbox/Permissions Policy
+# Question: Which are the most common hostnames of iframes that have an allow or sandbox attribute?
+CREATE TEMP FUNCTION hasPolicy(attr STRING, policy_type STRING)
+RETURNS BOOL DETERMINISTIC
+LANGUAGE js AS '''
+  const $ = JSON.parse(attr);
+  return $[policy_type] !== null;
+''';
+
+SELECT
+  client,
+  policy_type,
+  hostname,
+  total_iframes,
+  COUNTIF(has_policy) AS freq,
+  COUNTIF(has_policy) / total_iframes AS pct
+FROM (
+  SELECT
+    client,
+    policy_type,
+    JSON_EXTRACT_SCALAR(iframeAttr, '$.hostname') AS hostname,
+    hasPolicy(iframeAttr, policy_type) AS has_policy
+  FROM (
+    SELECT
+      client,
+      JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  ),
+  UNNEST(iframeAttrs) AS iframeAttr,
+  UNNEST(['allow', 'sandbox']) AS policy_type
+)
+JOIN (
+  SELECT
+    client,
+    SUM(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox'))) AS total_iframes
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  GROUP BY
+    client)
+USING
+  (client)
+GROUP BY
+  client,
+  total_iframes,
+  policy_type,
+  hostname
+HAVING
+  pct > 0.001
+ORDER BY
+  client,
+  pct DESC
diff --git a/sql/2024/security/iframe_attributes_usage.sql b/sql/2024/security/iframe_attributes_usage.sql
new file mode 100644
index 00000000000..9b64593ba90
--- /dev/null
+++ b/sql/2024/security/iframe_attributes_usage.sql
@@ -0,0 +1,45 @@
+#standardSQL
+# Section: Content Inclusion - Iframe Sandbox/Permissions Policy
+# Question: How often are the allow and sandbox attributes used on iframes? Both per page and over all iframe elements
+SELECT
+  client,
+  date,
+  COUNT(0) AS total_iframes,
+  COUNTIF(allow IS NOT NULL) AS freq_allow,
+  COUNTIF(allow IS NOT NULL) / COUNT(0) AS pct_allow_frames,
+  COUNTIF(sandbox IS NOT NULL) AS freq_sandbox,
+  COUNTIF(sandbox IS NOT NULL) / COUNT(0) AS pct_sandbox_frames,
+  COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) AS freq_both_frames,
+  COUNTIF(allow IS NOT NULL AND sandbox IS NOT NULL) / COUNT(0) AS pct_both_frames,
+  COUNT(DISTINCT url) AS total_urls,
+  COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)) AS allow_freq_urls,
+  COUNT(DISTINCT IF(allow IS NOT NULL, url, NULL)) / COUNT(DISTINCT url) AS allow_pct_urls,
+  COUNT(DISTINCT IF(sandbox IS NOT NULL, url, NULL)) AS sandbox_freq_urls,
+  COUNT(DISTINCT IF(sandbox IS NOT NULL, url, NULL)) / COUNT(DISTINCT url) AS sandbox_pct_urls
+FROM (
+  SELECT
+    client,
+    date,
+    url,
+    JSON_EXTRACT_SCALAR(iframeAttr, '$.allow') AS allow,
+    JSON_EXTRACT_SCALAR(iframeAttr, '$.sandbox') AS sandbox
+  FROM (
+    SELECT
+      client,
+      date,
+      page AS url,
+      JSON_EXTRACT_ARRAY(JSON_QUERY(custom_metrics, '$.security'), '$.iframe-allow-sandbox') AS iframeAttrs
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      (date = '2022-06-01' OR date = '2023-06-01' OR date = '2023-12-01' OR date = '2024-03-01' OR date = '2024-04-01' OR date = '2024-05-01' OR date = '2024-06-01') AND
+      is_root_page
+  )
+  LEFT JOIN UNNEST(iframeAttrs) AS iframeAttr
+  )
+GROUP BY
+  client,
+  date
+ORDER BY
+  date,
+  client
diff --git a/sql/2024/security/iframe_sandbox_directives.sql b/sql/2024/security/iframe_sandbox_directives.sql
new file mode 100644
index 00000000000..832df88bb2f
--- /dev/null
+++ b/sql/2024/security/iframe_sandbox_directives.sql
@@ -0,0 +1,49 @@
+#standardSQL
+# Section: Content Inclusion - Iframe Sandbox
+# Question: Which are the most common directives for the sandbox attribute on iframes?
+CREATE TEMP FUNCTION getNumWithSandboxAttribute(payload STRING) AS ((
+  SELECT
+    COUNT(0)
+  FROM
+    UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox')) AS iframeAttr
+  WHERE
+    JSON_EXTRACT_SCALAR(iframeAttr, '$.sandbox') IS NOT NULL
+));
+
+SELECT
+  client,
+  TRIM(sandbox_attr) AS directive,
+  total_iframes_with_sandbox,
+  COUNT(0) AS freq,
+  COUNT(0) / total_iframes_with_sandbox AS pct
+FROM (
+  SELECT
+    client,
+    JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  ),
+  UNNEST(iframeAttrs) AS iframeAttr,
+  UNNEST(SPLIT(JSON_EXTRACT_SCALAR(iframeAttr, '$.sandbox'), ' ')) AS sandbox_attr
+JOIN (
+  SELECT
+    client,
+    SUM(getNumWithSandboxAttribute(payload)) AS total_iframes_with_sandbox
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  GROUP BY
+    client
+) USING (client)
+GROUP BY
+  client,
+  directive,
+  total_iframes_with_sandbox
+ORDER BY
+  client,
+  pct DESC
diff --git a/sql/2024/security/meta_csp_disallowed_directives.sql b/sql/2024/security/meta_csp_disallowed_directives.sql
new file mode 100644
index 00000000000..7816a584cd9
--- /dev/null
+++ b/sql/2024/security/meta_csp_disallowed_directives.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Section: Security misconfigurations - CSP directives that are ignored in <meta> tags
+# Question: How many pages use invalid CSP directives in <meta>?
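+# Example (illustrative): a tag such as
+#   <meta http-equiv="Content-Security-Policy" content="frame-ancestors 'none'">
+# is counted here because browsers ignore frame-ancestors (and sandbox) when CSP is delivered via <meta>.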
+# Note: uses the old payload._almanac metric location instead of custom_metrics.almanac (also the meta-nodes metric is in the generic almanac.js custom metric)
+SELECT
+  client,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)frame-ancestors') THEN page END) AS count_frame_ancestors,
+  COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)frame-ancestors') THEN page END) / COUNT(DISTINCT page) AS pct_frame_ancestors,
+  COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)sandbox( allow-[a-z]+)*;') THEN page END) AS count_sandbox,
+  COUNT(CASE WHEN REGEXP_CONTAINS(LOWER(JSON_VALUE(meta_node, '$.content')), r'(?i)sandbox( allow-[a-z]+)*;') THEN page END) / COUNT(DISTINCT page) AS pct_sandbox
+FROM (
+  SELECT
+    client,
+    page,
+    JSON_VALUE(payload, '$._almanac') AS metrics
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+),
+UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node,
+UNNEST(['Content-Security-Policy']) AS policy
+WHERE
+  LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = 'content-security-policy' OR LOWER(JSON_VALUE(meta_node, '$.name')) = 'content-security-policy'
+GROUP BY
+  client
diff --git a/sql/2024/security/meta_policies_allowed_vs_disallowed.sql b/sql/2024/security/meta_policies_allowed_vs_disallowed.sql
new file mode 100644
index 00000000000..c1955cab234
--- /dev/null
+++ b/sql/2024/security/meta_policies_allowed_vs_disallowed.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Attack preventions - Preventing attacks using <meta> tags
+# Question: How many pages use security policies in meta tags (both allowed and ignored ones)?
+# Note: uses the old payload._almanac metric location instead of custom_metrics.almanac (also the meta-nodes metric is in the generic almanac.js custom metric)
+SELECT
+  client,
+  policy,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(DISTINCT(CASE WHEN LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = LOWER(policy) OR LOWER(JSON_VALUE(meta_node, '$.name')) = LOWER(policy) THEN page END)) AS count_policy,
+  COUNT(DISTINCT(CASE WHEN LOWER(JSON_VALUE(meta_node, '$.http-equiv')) = LOWER(policy) OR LOWER(JSON_VALUE(meta_node, '$.name')) = LOWER(policy) THEN page END)) / COUNT(DISTINCT page) AS pct_policy,
+  policy IN ('Content-Security-Policy', 'referrer') AS is_allowed_as_meta
+FROM (
+  SELECT
+    client,
+    page,
+    JSON_VALUE(payload, '$._almanac') AS metrics
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+),
+UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node,
+UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'referrer', 'Report-To', 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection']) AS policy
+GROUP BY
+  client,
+  policy
+ORDER BY
+  client,
+  count_policy DESC
diff --git a/sql/2024/security/mimetype_file_extension_mismatch.sql b/sql/2024/security/mimetype_file_extension_mismatch.sql
new file mode 100644
index 00000000000..ce5e349b6b5
--- /dev/null
+++ b/sql/2024/security/mimetype_file_extension_mismatch.sql
@@ -0,0 +1,51 @@
+#standardSQL
+# Section: unclear
+# Question: How often does the mimetype of a request and the file extension mismatch across all requests?
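+# Example (illustrative): a response served with mimeType 'text/plain' but file extension 'js'
+# counts as a mismatch, whereas 'application/javascript' with 'js' is excluded by the allowlist below.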
+# Note: Non-SVG images are ignored
+WITH mimetype_file_ext_pairs AS (
+  SELECT
+    client,
+    LOWER(JSON_VALUE(summary, '$.mimeType')) AS mimetype,
+    LOWER(JSON_VALUE(summary, '$.ext')) AS file_extension,
+    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests,
+    COUNT(0) AS count_pair
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  GROUP BY
+    client,
+    mimetype,
+    file_extension
+)
+
+SELECT
+  client,
+  mimetype,
+  file_extension,
+  total_requests,
+  SUM(MIN(count_pair)) OVER (PARTITION BY client) AS count_mismatches,
+  SUM(MIN(count_pair)) OVER (PARTITION BY client) / total_requests AS pct_mismatches,
+  MIN(count_pair) AS count_pair,
+  MIN(count_pair) / SUM(MIN(count_pair)) OVER (PARTITION BY client) AS pct_pair
+FROM
+  mimetype_file_ext_pairs
+WHERE
+  mimetype IS NOT NULL AND
+  mimetype != '' AND
+  file_extension IS NOT NULL AND
+  file_extension != '' AND
+  mimetype NOT LIKE CONCAT('%', file_extension) AND
+  NOT (REGEXP_CONTAINS(mimetype, '(application|text)/(x-)*javascript') AND REGEXP_CONTAINS(file_extension, r'(?i)^m?js$')) AND
+  NOT (mimetype = 'image/svg+xml' AND REGEXP_CONTAINS(file_extension, r'(?i)^svg$')) AND
+  NOT (mimetype = 'audio/mpeg' AND REGEXP_CONTAINS(file_extension, r'(?i)^mp3$')) AND
+  NOT (STARTS_WITH(mimetype, 'image/') AND REGEXP_CONTAINS(file_extension, r'(?i)^(apng|avif|bmp|cur|gif|jpeg|jpg|jfif|ico|pjpeg|pjp|png|tif|tiff|webp)$'))
+GROUP BY
+  client,
+  total_requests,
+  mimetype,
+  file_extension
+ORDER BY
+  count_pair DESC
+LIMIT 100
diff --git a/sql/2024/security/mixed_content.sql b/sql/2024/security/mixed_content.sql
new file mode 100644
index 00000000000..d46576950f4
--- /dev/null
+++ b/sql/2024/security/mixed_content.sql
@@ -0,0 +1,24 @@
+#standardSQL
+# Section: Transport Security - ?
+# Question: How many landing pages that load over HTTPS have at least one reference over HTTP? (Distributed across ranking)
+# Note: Each rank bucket does not include lower buckets;
+# Prevalence of landing pages over HTTPS that include at least one reference over HTTP, and distribution over ranking
+SELECT
+  client,
+  rank AS rank_grouping,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(page, r'https://.*') THEN page END)) AS total_pages_over_https,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(page, r'https://.*') AND REGEXP_CONTAINS(url, r'http://.*') THEN page END)) AS count_pages_over_https_with_http_reference,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(page, r'https://.*') AND REGEXP_CONTAINS(url, r'http://.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(page, r'https://.*') THEN page END)) AS pct_pages_over_https_with_http_reference
+FROM
+  `httparchive.all.requests`
+JOIN
+  `httparchive.all.pages` USING (client, page, date, is_root_page)
+WHERE
+  date = '2024-06-01' AND
+  is_root_page
+GROUP BY
+  client,
+  rank
+ORDER BY
+  client,
+  rank_grouping
diff --git a/sql/2024/security/oac_header_prevalence.sql b/sql/2024/security/oac_header_prevalence.sql
new file mode 100644
index 00000000000..d29727232d8
--- /dev/null
+++ b/sql/2024/security/oac_header_prevalence.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Attack Preventions - Security Header Adoptions?
+# Question: Which are the most common OAC values?
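+# Example (illustrative): Origin-Agent-Cluster takes structured-header boolean values, so the
+# values surfaced here should mostly be '?1' (request origin-keyed agent clusters) or '?0' (opt out).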
+# Note: Considers headers of all main document responses +SELECT + client, + oac_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_oac_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS oac_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'origin-agent-cluster') +GROUP BY + client, + oac_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/pp_header_prevalence.sql b/sql/2024/security/pp_header_prevalence.sql new file mode 100644 index 00000000000..9ad66adc92e --- /dev/null +++ b/sql/2024/security/pp_header_prevalence.sql @@ -0,0 +1,30 @@ +#standardSQL +# Section: ? (Permissions) +# Question: Which are the most common PP values? +# Note: Considers headers of main document responses +SELECT + client, + pp_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_pp_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS pp_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'permissions-policy') +GROUP BY + client, + pp_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/robot_header_and_meta_tag_prevalence.sql b/sql/2024/security/robot_header_and_meta_tag_prevalence.sql new file mode 100644 index 00000000000..e95aee2eed6 --- /dev/null +++ b/sql/2024/security/robot_header_and_meta_tag_prevalence.sql @@ -0,0 +1,78 @@ +#standardSQL +# Section: Unclear? +# Question: What is the prevalence of robots meta tag values and the X-robots-tag header? 
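+# Example (illustrative): both <meta name="robots" content="noindex, nofollow"> and a response
+# header X-Robots-Tag: noindex would be picked up by the noindex counters below.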
+WITH meta_tags AS (
+  SELECT
+    client,
+    page,
+    LOWER(JSON_VALUE(meta_node, '$.content')) AS robots_content
+  FROM (
+    SELECT
+      client,
+      page,
+      JSON_VALUE(payload, '$._almanac') AS metrics
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  ),
+  UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node
+  WHERE LOWER(JSON_VALUE(meta_node, '$.name')) = 'robots'
+),
+
+robot_headers AS (
+  SELECT
+    client,
+    url AS page,
+    LOWER(response_headers.value) AS robot_header_value
+  FROM
+    `httparchive.all.requests`,
+    UNNEST(response_headers) AS response_headers
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    is_main_document AND
+    LOWER(response_headers.name) = 'x-robots-tag'
+),
+
+totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT page) AS total_nb_pages
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  total_nb_pages AS total,
+  COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) AS count_robots,
+  COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) / total_nb_pages AS pct_robots,
+  COUNT(robots_content) AS count_robots_content,
+  COUNT(robots_content) / total_nb_pages AS pct_robots_content,
+  COUNT(robot_header_value) AS count_robot_header_value,
+  COUNT(robot_header_value) / total_nb_pages AS pct_robot_header_value,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*noindex.*') OR REGEXP_CONTAINS(robot_header_value, r'.*noindex.*')) AS count_noindex,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*noindex.*') OR REGEXP_CONTAINS(robot_header_value, r'.*noindex.*')) / COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) AS pct_noindex,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*nofollow.*') OR REGEXP_CONTAINS(robot_header_value, r'.*nofollow.*')) AS count_nofollow,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*nofollow.*') OR REGEXP_CONTAINS(robot_header_value, r'.*nofollow.*')) / COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) AS pct_nofollow,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*nosnippet.*') OR REGEXP_CONTAINS(robot_header_value, r'.*nosnippet.*')) AS count_nosnippet,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*nosnippet.*') OR REGEXP_CONTAINS(robot_header_value, r'.*nosnippet.*')) / COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) AS pct_nosnippet,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*noarchive.*') OR REGEXP_CONTAINS(robot_header_value, r'.*noarchive.*')) AS count_noarchive,
+  COUNTIF(REGEXP_CONTAINS(robots_content, r'.*noarchive.*') OR REGEXP_CONTAINS(robot_header_value, r'.*noarchive.*')) / COUNTIF(robots_content IS NOT NULL OR robot_header_value IS NOT NULL) AS pct_noarchive
+FROM
+  meta_tags FULL OUTER JOIN robot_headers USING (client, page)
+JOIN
+  totals
+USING (client)
+GROUP BY
+  client,
+  total_nb_pages
+ORDER BY
+  client
diff --git a/sql/2024/security/robot_txt_sensitive_disallow.sql b/sql/2024/security/robot_txt_sensitive_disallow.sql
new file mode 100644
index 00000000000..dd2686fd024
--- /dev/null
+++ b/sql/2024/security/robot_txt_sensitive_disallow.sql
@@ -0,0 +1,53 @@
+#standardSQL
+# Section: Well-known URIs - robots.txt (?)
+# Question: What is the prevalence of /robots.txt and what is the prevalence of potentially sensitive endpoints in disallow directives ('login', 'log-in', 'signin', 'sign-in', 'admin', 'auth', 'sso', 'account')
+CREATE TEMPORARY FUNCTION getAllDisallowedEndpoints(data STRING)
+RETURNS ARRAY<STRING> DETERMINISTIC
+LANGUAGE js AS '''
+  let parsed_data;
+  try {
+    parsed_data = JSON.parse(data);
+  } catch (e) {
+    return [];
+  }
+  if (parsed_data == null || parsed_data["/robots.txt"] == undefined || !parsed_data["/robots.txt"]["found"]) {
+    return [];
+  }
+  const parsed_endpoints = parsed_data["/robots.txt"]["data"]["matched_disallows"];
+  const endpoints_list = Object.keys(parsed_endpoints).map(key => parsed_endpoints[key]).flat();
+  return Array.from(new Set(endpoints_list));
+''';
+
+SELECT
+  client,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS count_robots_txt,
+  COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) / COUNT(DISTINCT page) AS pct_robots_txt,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/admin/.*') THEN page END)) AS count_disallow_admin,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/admin/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_admin,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/log-*in/.*') THEN page END)) AS count_disallow_login,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/log-*in/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_login,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/sign-*in/.*') THEN page END)) AS count_disallow_signin,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/sign-*in/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_signin,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/auth/.*') THEN page END)) AS count_disallow_auth,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/auth/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_auth,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/sso/.*') THEN page END)) AS count_disallow_sso,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/sso/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_sso,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/account/.*') THEN page END)) AS count_disallow_account,
+  COUNT(DISTINCT(CASE WHEN REGEXP_CONTAINS(disallowed_endpoint, r'.*/account/.*') THEN page END)) / COUNT(DISTINCT(CASE WHEN has_robots_txt = 'true' THEN page END)) AS pct_disallow_account
+FROM
+  (
+    SELECT
+      client,
+      page,
+      JSON_VALUE(JSON_VALUE(payload, '$._well-known'), '$."/robots.txt".found') AS has_robots_txt,
+      getAllDisallowedEndpoints(JSON_VALUE(payload, '$._well-known')) AS disallowed_endpoints
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  )
+LEFT JOIN UNNEST(disallowed_endpoints) AS disallowed_endpoint
+GROUP BY
+  client
diff --git a/sql/2024/security/security_adoption_by_category.sql b/sql/2024/security/security_adoption_by_category.sql
new file mode 100644
index 00000000000..40657aed791
--- /dev/null
+++ b/sql/2024/security/security_adoption_by_category.sql
@@ -0,0 +1,34 @@
+#standardSQL
+# Section: Drivers of security mechanism adoption - Website category
+# Question: How prevalent are the various security headers on first-party resources? (per category)
+# Note: Instead of the parent_category, we could use full_category or subcategory (https://har.fyi/reference/functions/get_host_categories/)
+# Note: Instead of using every "first-party" resource via url.host = page.host, we could only look at top-level documents (is_main_document) or responses of type document (top-level + iframes)?
+SELECT
+  client,
+  headername,
+  SPLIT(parent_category, '/')[1] AS category,
+  COUNT(DISTINCT NET.HOST(url)) AS total_hosts,
+  COUNT(DISTINCT IF(LOWER(rh.name) = LOWER(headername), NET.HOST(url), NULL)) AS num_with_header,
+  COUNT(DISTINCT IF(LOWER(rh.name) = LOWER(headername), NET.HOST(url), NULL)) / COUNT(DISTINCT NET.HOST(url)) AS pct_with_header
+FROM
+  `httparchive.all.requests`,
+  UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy',
+    'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To',
+    'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection', 'Clear-Site-Data', 'Timing-Allow-Origin', 'Origin-Agent-Cluster']) AS headername,
+  UNNEST(response_headers) AS rh
+JOIN UNNEST(`httparchive.fn.GET_HOST_CATEGORIES`(url))
+WHERE
+  date = '2024-06-01' AND
+  is_root_page AND
+  is_main_document AND
+  NET.HOST(url) = NET.HOST(page)
+
+GROUP BY
+  client,
+  headername,
+  category
+ORDER BY
+  pct_with_header DESC,
+  client,
+  headername,
+  category
diff --git a/sql/2024/security/security_adoption_by_rank.sql b/sql/2024/security/security_adoption_by_rank.sql
new file mode 100644
index 00000000000..681622bc460
--- /dev/null
+++ b/sql/2024/security/security_adoption_by_rank.sql
@@ -0,0 +1,32 @@
+#standardSQL
+# Section: Drivers of security mechanism adoption - Website popularity
+# Question: How prevalent are the various security headers on first-party resources? (per rank grouping 1K, 5K, 10K, 100K, 500K, 1M, ...)
+# Note: Buckets do not include prior ranks
+# Note: Instead of using every "first-party" resource via url.host = page.host, we could only look at top-level documents (is_main_document) or responses of type document (top-level + iframes)?
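+# Illustrative note (editor's addition, assuming the usual HTTP Archive rank
+# buckets): `rank` holds the upper bound of a site's popularity bucket
+# (1000, 5000, 10000, ...), so grouping by it yields non-cumulative buckets.
+# A cumulative variant would instead cross join against the thresholds, e.g.
+#   UNNEST([1000, 10000, 100000]) AS _rank ... WHERE rank <= _rank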
+SELECT + client, + headername, + rank AS rank_grouping, + COUNT(DISTINCT NET.HOST(url)) AS total_hosts, + COUNT(DISTINCT IF(LOWER(rh.name) = LOWER(headername), NET.HOST(url), NULL)) AS num_with_header, + COUNT(DISTINCT IF(LOWER(rh.name) = LOWER(headername), NET.HOST(url), NULL)) / COUNT(DISTINCT NET.HOST(url)) AS pct_with_header +FROM + `httparchive.all.requests`, + UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', + 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To', + 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection', 'Clear-Site-Data', 'Timing-Allow-Origin', 'Origin-Agent-Cluster']) AS headername, + UNNEST(response_headers) AS rh +JOIN `httparchive.all.pages` USING (client, page, date, is_root_page) +WHERE + date = '2024-06-01' AND + is_root_page AND + NET.HOST(url) = NET.HOST(page) + +GROUP BY + client, + headername, + rank_grouping +ORDER BY + client, + headername, + rank_grouping diff --git a/sql/2024/security/security_headers_prevalence.sql b/sql/2024/security/security_headers_prevalence.sql new file mode 100644 index 00000000000..1fa1c09c21f --- /dev/null +++ b/sql/2024/security/security_headers_prevalence.sql @@ -0,0 +1,35 @@ +#standardSQL +# Section: Attack Preventions - Security header adoptions +# Question: How prevalent are security headers in a first-party context? (count by number of hosts) +# Note: Instead we could only look at top-level responses (is_main_document)? +SELECT + date, + client, + headername, + COUNT(DISTINCT host) AS total_hosts, + COUNT(DISTINCT IF(LOWER(response_headers.name) = LOWER(headername), host, NULL)) AS count_with_header, + COUNT(DISTINCT IF(LOWER(response_headers.name) = LOWER(headername), host, NULL)) / COUNT(DISTINCT host) AS pct_with_header +FROM ( + SELECT + date, + client, + NET.HOST(url) AS host, + response_headers + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + (date = '2022-06-09' OR date = '2023-06-01' OR date = '2024-06-01') AND + NET.HOST(url) = NET.HOST(page) +), +UNNEST(['Content-Security-Policy', 'Content-Security-Policy-Report-Only', 'Cross-Origin-Embedder-Policy', 'Cross-Origin-Opener-Policy', + 'Cross-Origin-Resource-Policy', 'Expect-CT', 'Feature-Policy', 'Permissions-Policy', 'Referrer-Policy', 'Report-To', + 'Strict-Transport-Security', 'X-Content-Type-Options', 'X-Frame-Options', 'X-XSS-Protection', 'Clear-Site-Data', 'Timing-Allow-Origin', 'Origin-Agent-Cluster']) AS headername +GROUP BY + date, + client, + headername +ORDER BY + date, + client, + headername diff --git a/sql/2024/security/server_header_value_prevalence.sql b/sql/2024/security/server_header_value_prevalence.sql new file mode 100644 index 00000000000..263da44a259 --- /dev/null +++ b/sql/2024/security/server_header_value_prevalence.sql @@ -0,0 +1,63 @@ +#standardSQL +# Section: ? +# Question: Which are the most common Server and X-Powered-By headers? (count by number of hosts) +# Note: Different dates taken together; Is it correct to take the host of the page instead of the URL? Maybe only take is_main_document? 
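+# Illustrative note (editor's addition, not part of the original query):
+# typical values tallied below look like
+#   Server: nginx
+#   Server: cloudflare
+#   X-Powered-By: PHP/7.4.33
+# i.e. they frequently leak software names and sometimes exact versions.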
+SELECT + client, + type, + resp_value, + total, + freq, + pct +FROM + ( + SELECT + client, + 'server' AS type, + response_header.value AS resp_value, + SUM(COUNT(DISTINCT NET.HOST(page))) OVER (PARTITION BY client) AS total, + COUNT(DISTINCT NET.HOST(page)) AS freq, + COUNT(DISTINCT NET.HOST(page)) / SUM(COUNT(DISTINCT NET.HOST(page))) OVER (PARTITION BY client) AS pct + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + (date = '2022-06-09' OR date = '2023-06-01' OR date = '2024-06-01') AND + is_root_page AND + LOWER(response_header.name) = 'server' + GROUP BY + client, + type, + resp_value + ORDER BY + freq DESC + LIMIT 40 + ) +UNION ALL +( + SELECT + client, + 'x-powered-by' AS type, + response_header.value AS resp_value, + SUM(COUNT(DISTINCT NET.HOST(page))) OVER (PARTITION BY client) AS total, + COUNT(DISTINCT NET.HOST(page)) AS freq, + COUNT(DISTINCT NET.HOST(page)) / SUM(COUNT(DISTINCT NET.HOST(page))) OVER (PARTITION BY client) AS pct + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + (date = '2022-06-09' OR date = '2023-06-01' OR date = '2024-06-01') AND + is_root_page AND + LOWER(response_header.name) = 'x-powered-by' + GROUP BY + client, + type, + resp_value + ORDER BY + freq DESC + LIMIT 40 +) +ORDER BY + client, + type, + freq DESC diff --git a/sql/2024/security/server_information_header_prevalence.sql b/sql/2024/security/server_information_header_prevalence.sql new file mode 100644 index 00000000000..719a463d8a5 --- /dev/null +++ b/sql/2024/security/server_information_header_prevalence.sql @@ -0,0 +1,32 @@ +#standardSQL +# Section: ? +# Question: How prevalent are headers leaking server information? (count by number of hosts) +SELECT + date, + client, + headername, + COUNT(DISTINCT host) AS total_hosts, + COUNT(DISTINCT IF(LOWER(response_header.name) = LOWER(headername), host, NULL)) AS count_with_header, + COUNT(DISTINCT IF(LOWER(response_header.name) = LOWER(headername), host, NULL)) / COUNT(DISTINCT host) AS pct_with_header +FROM ( + SELECT + date, + client, + NET.HOST(url) AS host, + response_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_header + WHERE + (date = '2022-06-09' OR date = '2023-06-01' OR date = '2024-06-01') AND + is_root_page +), +UNNEST(['Server', 'X-Server', 'X-Backend-Server', 'X-Powered-By', 'X-Aspnet-Version']) AS headername +GROUP BY + date, + client, + headername +ORDER BY + date, + client, + count_with_header DESC diff --git a/sql/2024/security/server_timing_usage_values.sql b/sql/2024/security/server_timing_usage_values.sql new file mode 100644 index 00000000000..9da649bcd5e --- /dev/null +++ b/sql/2024/security/server_timing_usage_values.sql @@ -0,0 +1,295 @@ +#standardSQL +# Section: Security misconfigurations and oversights - (Missing) suppression of 'Server-Timing' header +# Question: Which are the most common server-timing headers and how often are they used in total? +# Note: Probably better to split some of the things up to make the interpretation of the results easier +# Note: Server-Timing sent to same-origin/top-level requests is not an issue, maybe only look at non first-party requests? 
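+# Illustrative worked example (editor's addition, not part of the original
+# file): given a response header such as
+#   Server-Timing: cache;desc="HIT";dur=0.3, db;dur=53
+# the parser below yields metric_names = ["cache", "db"],
+# dur = ["0.3", "53"] and desc = ["\"HIT\"", null]
+# (note the desc value keeps its surrounding quotes).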
+CREATE TEMPORARY FUNCTION parseServerTiming(server_timing STRING) +RETURNS STRING DETERMINISTIC +LANGUAGE js AS ''' + if (!server_timing) { + return server_timing; + } + const result = { + "metric_names": [], + "dur": [], + "desc": [], + }; + + server_timing.split(",").forEach(st => { + let name = null; + let dur = null; + let desc = null; + + st.split(";").forEach(prop => { + let [key, ...valueParts] = prop.split("="); + key = key.trim(); + const value = valueParts.join("="); // Join the remaining parts to handle multiple '=' signs + + if (!name) { + name = key; // The first property is assumed to be the name + } else if (key === "dur") { + dur = value; + } else if (key === "desc") { + desc = value; + } else { + // Invalid or unhandled property + } + }); + + result["metric_names"].push(name); + result["dur"].push(dur); + result["desc"].push(desc); + }); + + return JSON.stringify(result); + +'''; + +WITH parsed_server_timing AS ( + SELECT + client, + NET.HOST(url) AS host, + COUNT(0) OVER (PARTITION BY CLIENT) AS total_st, # Total number of ST header, if a response has more than one ST header each of them is counted + COUNT(DISTINCT NET.HOST(url)) OVER (PARTITION BY CLIENT) AS total_st_hosts, + response_headers.value AS server_timing_header, + JSON_EXTRACT(parseServerTiming(response_headers.value), '$.metric_names') AS metric_names, + JSON_EXTRACT(parseServerTiming(response_headers.value), '$.dur') AS dur, + JSON_EXTRACT(parseServerTiming(response_headers.value), '$.desc') AS desc1 + FROM + `httparchive.all.requests`, + # `httparchive.sample_data.requests_1k`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + # date = '2024-08-01' AND + is_root_page AND + LOWER(response_headers.name) = 'server-timing' +), st_details AS ( + SELECT + client, + host, + server_timing_header, + total_st, + total_st_hosts, + metric_name, + dur, + desc1 + FROM + parsed_server_timing, + UNNEST(JSON_EXTRACT_ARRAY(metric_names)) AS metric_name WITH OFFSET idx, + UNNEST(JSON_EXTRACT_ARRAY(dur)) AS dur WITH OFFSET idx_dur, + UNNEST(JSON_EXTRACT_ARRAY(desc1)) AS desc1 WITH OFFSET idx_desc + WHERE + idx = idx_dur AND + idx = idx_desc +), totals AS ( + SELECT + client, + COUNT(0) AS total, + COUNT(DISTINCT NET.HOST(url)) AS total_hosts + FROM + `httparchive.all.requests` + # `httparchive.sample_data.requests_1k` + WHERE + date = '2024-06-01' AND + # date = '2024-08-01' AND + is_root_page + GROUP BY + client +) + +# Most common headers +# (Data already has one row for each metric (header split at comma) thus some headers occur multiple times) +/* +SELECT + client, + server_timing_header, + metric_name, + dur, + desc1, + total AS total_responses, + total_hosts, + total_st, + total_st_hosts, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_st_metrics, # Counts the total number of metrics + total_st / total AS pct_server_timing, + total_st_hosts / total_hosts AS pct_server_timing_hosts, + COUNT(DISTINCT host) AS freq_host, + COUNT(0) AS freq_req, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_value, # How common this exact "row" (i.e., server_timing_header or metric_name or dur or desc) is + COUNT(DISTINCT host) / total_st_hosts AS pct_hosts, # What is the percentage of hosts of all hosts that have a ST header that belong to this row's group +FROM st_details + JOIN totals USING (client) +GROUP BY + client, + total, + total_st, + total_st_hosts, + total_hosts, + server_timing_header, + metric_name, + dur, + desc1 +ORDER BY + pct_value DESC +LIMIT + 100 +*/ + +# Most common 
metric names +/* +SELECT + client, + metric_name, + total AS total_responses, + total_hosts, + total_st, + total_st_hosts, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_st_metrics, # Counts the total number of metrics + total_st / total AS pct_server_timing, + total_st_hosts / total_hosts AS pct_server_timing_hosts, + COUNT(DISTINCT host) AS freq_host, + COUNT(0) AS freq_req, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_value, # How common this exact "row" (i.e., server_timing_header or metric_name or dur or desc) is + COUNT(DISTINCT host) / total_st_hosts AS pct_hosts, # What is the percentage of hosts of all hosts that have a ST header that belong to this row's group +FROM st_details + JOIN totals USING (client) +GROUP BY + client, + total, + total_st, + total_st_hosts, + total_hosts, + metric_name +ORDER BY + pct_value DESC +LIMIT + 100 +*/ + +# Most common dur properties +/* +SELECT + client, + dur, + total AS total_responses, + total_hosts, + total_st, + total_st_hosts, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_st_metrics, # Counts the total number of metrics + total_st / total AS pct_server_timing, + total_st_hosts / total_hosts AS pct_server_timing_hosts, + COUNT(DISTINCT host) AS freq_host, + COUNT(0) AS freq_req, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_value, # How common this exact "row" (i.e., server_timing_header or metric_name or dur or desc) is + COUNT(DISTINCT host) / total_st_hosts AS pct_hosts, # What is the percentage of hosts of all hosts that have a ST header that belong to this row's group +FROM st_details + JOIN totals USING (client) +GROUP BY + client, + total, + total_st, + total_st_hosts, + total_hosts, + dur +ORDER BY + pct_value DESC +LIMIT + 100 +*/ + +# Most common descriptions +/* +SELECT + client, + desc1, + total AS total_responses, + total_hosts, + total_st, + total_st_hosts, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_st_metrics, # Counts the total number of metrics + total_st / total AS pct_server_timing, + total_st_hosts / total_hosts AS pct_server_timing_hosts, + COUNT(DISTINCT host) AS freq_host, + COUNT(0) AS freq_req, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_value, # How common this exact "row" (i.e., server_timing_header or metric_name or dur or desc) is + COUNT(DISTINCT host) / total_st_hosts AS pct_hosts, # What is the percentage of hosts of all hosts that have a ST header that belong to this row's group +FROM st_details + JOIN totals USING (client) +GROUP BY + client, + total, + total_st, + total_st_hosts, + total_hosts, + desc1 +ORDER BY + pct_value DESC +LIMIT + 100 +*/ + +# General query: Total #Responses (Requests), #With header, #With any dur, #With 1xdur, #With 2x dur, #With >=3x dur, #With any desc, #Average number of metrics, #.... 
+SELECT
+  client,
+  total AS total_responses,
+  total_hosts,
+  total_st,
+  total_st_hosts,
+  COUNT(0) AS total_st_metrics, # Counts the total number of metrics
+  total_st / total AS pct_server_timing,
+  total_st_hosts / total_hosts AS pct_server_timing_hosts,
+  COUNTIF(dur != 'null') AS freq_dur,
+  COUNTIF(desc1 != 'null') AS freq_desc,
+  COUNTIF(dur != 'null') / COUNT(0) AS pct_dur,
+  COUNTIF(desc1 != 'null') / COUNT(0) AS pct_desc,
+  hosts_at_least_1_dur,
+  hosts_1_durs,
+  hosts_2_durs,
+  hosts_more_than_2_durs,
+  hosts_avg_distinct_durs,
+  hosts_avg_durs,
+  hosts_avg_descs,
+  hosts_avg_metrics,
+  hosts_at_least_1_desc
+FROM (
+  SELECT
+    client,
+    COUNT(DISTINCT CASE WHEN dur_count >= 1 THEN host ELSE NULL END) AS hosts_at_least_1_dur,
+    COUNT(DISTINCT CASE WHEN dur_count = 1 THEN host ELSE NULL END) AS hosts_1_durs,
+    COUNT(DISTINCT CASE WHEN dur_count = 2 THEN host ELSE NULL END) AS hosts_2_durs,
+    COUNT(DISTINCT CASE WHEN dur_count >= 3 THEN host ELSE NULL END) AS hosts_more_than_2_durs,
+    AVG(dur_distinct) AS hosts_avg_distinct_durs,
+    AVG(dur_count) AS hosts_avg_durs, # Average over all hosts that have at least one ST header; they might have 0 ST metrics with a dur value
+    AVG(row_count) AS hosts_avg_metrics,
+    AVG(desc_count) AS hosts_avg_descs,
+    COUNT(DISTINCT CASE WHEN desc_count >= 1 THEN host ELSE NULL END) AS hosts_at_least_1_desc
+  FROM (
+    SELECT
+      client,
+      host,
+      COUNTIF(dur != 'null') AS dur_count,
+      COUNT(DISTINCT dur) AS dur_distinct, # Counts 'null' as one distinct value?
+      COUNTIF(desc1 != 'null') AS desc_count,
+      COUNT(0) AS row_count
+    FROM st_details
+    GROUP BY client, host
+  ) GROUP BY client
+)
+JOIN st_details USING (client)
+JOIN totals USING (client)
+GROUP BY
+  client,
+  total,
+  total_st,
+  total_st_hosts,
+  total_hosts,
+  hosts_at_least_1_dur,
+  hosts_1_durs,
+  hosts_2_durs,
+  hosts_more_than_2_durs,
+  hosts_avg_distinct_durs,
+  hosts_avg_durs,
+  hosts_avg_descs,
+  hosts_avg_metrics,
+  hosts_at_least_1_desc
diff --git a/sql/2024/security/sri_coverage_per_page.sql b/sql/2024/security/sri_coverage_per_page.sql
new file mode 100644
index 00000000000..504c4df97d9
--- /dev/null
+++ b/sql/2024/security/sri_coverage_per_page.sql
@@ -0,0 +1,31 @@
+#standardSQL
+# Section: Content Inclusion - Subresource Integrity
+# Question: How many scripts on a page have the integrity attribute? (percentage)
+CREATE TEMP FUNCTION getNumScriptElements(sris ARRAY<STRING>) AS (
+  (SELECT COUNT(0) FROM UNNEST(sris) AS sri WHERE JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script')
+);
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(getNumScriptElements(sris) / num_scripts, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS integrity_pct
+FROM (
+    SELECT
+      client,
+      JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity') AS sris,
+      SAFE_CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._element_count'), '$.script') AS INT64) AS num_scripts
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  ),
+  UNNEST([10, 25, 50, 75, 90]) AS percentile
+WHERE
+  getNumScriptElements(sris) > 0
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2024/security/sri_hash_functions.sql b/sql/2024/security/sri_hash_functions.sql
new file mode 100644
index 00000000000..e066009bb5d
--- /dev/null
+++ b/sql/2024/security/sri_hash_functions.sql
@@ -0,0 +1,45 @@
+#standardSQL
+# Section: Content Inclusion - Subresource Integrity
+# Question: Which are the most common SRI hash functions used?
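+# Illustrative worked example (editor's addition, not part of the original
+# file): for an element such as
+#   <script src="app.js" integrity="sha384-AbC... sha256-XyZ..."></script>
+# REGEXP_EXTRACT_ALL(..., r'(sha[^-]+)-') returns ['sha384', 'sha256'], so an
+# element listing multiple hashes contributes once per hash function.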
+WITH totals AS ( + SELECT + client, + COUNT(0) AS total_sri_elements + FROM + `httparchive.all.pages`, + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity')) AS sri + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client +) + +SELECT + client, + hash_function, + total_sri_elements, + COUNT(0) AS freq, + COUNT(0) / total_sri_elements AS pct +FROM ( + SELECT + client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity') AS sris + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + ), + UNNEST(sris) AS sri, + UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT_SCALAR(sri, '$.integrity'), r'(sha[^-]+)-')) AS hash_function +JOIN totals USING (client) +WHERE + sri IS NOT NULL +GROUP BY + client, + total_sri_elements, + hash_function +ORDER BY + client, + pct DESC diff --git a/sql/2024/security/sri_popular_hosts.sql b/sql/2024/security/sri_popular_hosts.sql new file mode 100644 index 00000000000..e374c16c68b --- /dev/null +++ b/sql/2024/security/sri_popular_hosts.sql @@ -0,0 +1,51 @@ +#standardSQL +# Section: Content Inclusion - Subresource Integrity +# Question: Which are the most popular hosts for which SRI is used on script tags? +WITH totals AS ( + SELECT + client, + COUNT(0) AS total_sri_scripts + FROM + `httparchive.all.pages`, + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity')) AS sri + WHERE + date = '2024-06-01' AND + is_root_page AND + sri IS NOT NULL AND + JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script' + GROUP BY + client +) + +SELECT + client, + NET.HOST(JSON_EXTRACT_SCALAR(sri, '$.src')) AS host, + total_sri_scripts, + COUNT(0) AS freq, + COUNT(0) / total_sri_scripts AS pct, + SUM(COUNT(DISTINCT url)) OVER (PARTITION BY client) AS total_urls, + COUNT(DISTINCT url) AS freq_urls, + COUNT(DISTINCT url) / SUM(COUNT(DISTINCT url)) OVER (PARTITION BY client) AS pct_urls +FROM ( + SELECT + client, + page AS url, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity') AS sris + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + ), + UNNEST(sris) AS sri +JOIN totals USING (client) +WHERE + sri IS NOT NULL AND + JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script' +GROUP BY + client, + total_sri_scripts, + host +ORDER BY + pct DESC +LIMIT 1000 diff --git a/sql/2024/security/sri_usage.sql b/sql/2024/security/sri_usage.sql new file mode 100644 index 00000000000..94e24ae8c71 --- /dev/null +++ b/sql/2024/security/sri_usage.sql @@ -0,0 +1,34 @@ +#standardSQL +# Section: Content Inclusion - Subresource Integrity +# Question: How many pages use SRI (per tagname) and what is the tagname usage for all SRI elements? 
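+# Illustrative note (editor's addition; field names as used below, values
+# invented): each entry of the sri-integrity custom metric is a JSON object
+# along the lines of
+#   {"tagname": "script", "src": "https://example.com/app.js", "integrity": "sha384-..."}
+# and pages without any such elements produce a NULL sri after the LEFT JOIN.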
+SELECT + client, + COUNTIF(sri IS NOT NULL) AS total_sris, + COUNT(DISTINCT url) AS total_urls, + COUNT(DISTINCT IF(sri IS NOT NULL, url, NULL)) AS freq, + COUNT(DISTINCT IF(sri IS NOT NULL, url, NULL)) / COUNT(DISTINCT url) AS pct, + COUNTIF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script') AS freq_script_sris, + COUNTIF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script') / COUNTIF(sri IS NOT NULL) AS pct_script_sris, + COUNTIF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'link') AS freq_link_sris, + COUNTIF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'link') / COUNTIF(sri IS NOT NULL) AS pct_link_sris, + COUNT(DISTINCT IF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script', url, NULL)) AS freq_script_urls, + COUNT(DISTINCT IF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'script', url, NULL)) / COUNT(DISTINCT url) AS pct_script_urls, + COUNT(DISTINCT IF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'link', url, NULL)) AS freq_link_urls, + COUNT(DISTINCT IF(JSON_EXTRACT_SCALAR(sri, '$.tagname') = 'link', url, NULL)) / COUNT(DISTINCT url) AS pct_link_urls +FROM ( + SELECT + client, + page AS url, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.sri-integrity') AS sris + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + ) +LEFT JOIN UNNEST(sris) AS sri +GROUP BY + client +ORDER BY + client, + pct DESC diff --git a/sql/2024/security/tao_header_prevalence.sql b/sql/2024/security/tao_header_prevalence.sql new file mode 100644 index 00000000000..78f4a5a5455 --- /dev/null +++ b/sql/2024/security/tao_header_prevalence.sql @@ -0,0 +1,29 @@ +#standardSQL +# Section: Attack Preventions - Security Header Adoptions? +# Question: Which are the most common TAO values? +# Note: Considers headers of all responses including all subresources (header is used for script and img resources) +SELECT + client, + tao_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_tao_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS tao_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + LOWER(response_headers.name) = 'timing-allow-origin') +GROUP BY + client, + tao_header +ORDER BY + pct DESC +LIMIT + 100 diff --git a/sql/2024/security/tls_ca_issuers_pages.sql b/sql/2024/security/tls_ca_issuers_pages.sql new file mode 100644 index 00000000000..3450aa4d060 --- /dev/null +++ b/sql/2024/security/tls_ca_issuers_pages.sql @@ -0,0 +1,31 @@ +#standardSQL +# Section: Transport Security - Certificate Authority +# Question: What is the distribution of CA issuers for all pages? 
+# Note: currently includes HTTP (i.e., pages with no issuer)
+SELECT
+  client,
+  issuer,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_https_pages,
+  COUNT(0) AS freq,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct
+FROM (
+    SELECT
+      client,
+      NET.HOST(url) AS request_host,
+      JSON_VALUE(payload, '$._securityDetails.issuer') AS issuer
+    FROM
+      `httparchive.all.requests`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page AND
+      is_main_document
+    GROUP BY
+      client,
+      request_host,
+      issuer
+  )
+GROUP BY
+  client,
+  issuer
+ORDER BY
+  pct DESC
diff --git a/sql/2024/security/tls_ca_issuers_requests.sql b/sql/2024/security/tls_ca_issuers_requests.sql
new file mode 100644
index 00000000000..66239ba3ad6
--- /dev/null
+++ b/sql/2024/security/tls_ca_issuers_requests.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Section: Transport Security - Certificate Authority
+# Question: What is the distribution of CA issuers for all (top-level) requests?
+# Note: original query was without is_main_document but due to the port to the tables it suddenly took 50+TB instead of 20GB?!
+SELECT
+  client,
+  issuer,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_https_requests,
+  COUNT(0) AS freq,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct
+FROM (
+  SELECT
+    client,
+    JSON_VALUE(payload, '$._securityDetails.issuer') AS issuer
+  FROM
+    `httparchive.all.requests`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page AND
+    is_main_document
+)
+WHERE
+  issuer IS NOT NULL
+GROUP BY
+  client,
+  issuer
+ORDER BY
+  pct DESC
diff --git a/sql/2024/security/tls_cipher_suite.sql b/sql/2024/security/tls_cipher_suite.sql
new file mode 100644
index 00000000000..07d9fd913b4
--- /dev/null
+++ b/sql/2024/security/tls_cipher_suite.sql
@@ -0,0 +1,27 @@
+#standardSQL
+# Section: Transport Security - Cipher suites
+# Question: What is the distribution of all ciphers for all requests?
+# Note: Query is large (43TB)
+SELECT
+  client,
+  cipher,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_https_requests,
+  COUNT(0) AS freq,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct
+FROM (
+    SELECT
+      client,
+      JSON_VALUE(payload, '$._securityDetails.cipher') AS cipher
+    FROM
+      `httparchive.all.requests`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  )
+WHERE
+  cipher IS NOT NULL
+GROUP BY
+  client,
+  cipher
+ORDER BY
+  pct DESC
diff --git a/sql/2024/security/tls_forward_secrecy.sql b/sql/2024/security/tls_forward_secrecy.sql
new file mode 100644
index 00000000000..428ffb76f87
--- /dev/null
+++ b/sql/2024/security/tls_forward_secrecy.sql
@@ -0,0 +1,27 @@
+#standardSQL
+# Section: Transport Security - Cipher Suites
+# Question: How many used cipher suites support forward secrecy for all requests?
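+# Illustrative note (editor's addition, not part of the original file):
+# keyExchange values such as 'ECDHE_RSA' or 'ECDHE_ECDSA' match r'(?i)DHE'
+# and count as forward secret; TLS 1.3 is counted unconditionally because all
+# of its cipher suites use ephemeral key exchange (DevTools often reports an
+# empty keyExchange for TLS 1.3).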
+# Note: Large query (40+TB)
+SELECT
+  client,
+  COUNT(0) AS total_requests,
+  COUNTIF(REGEXP_CONTAINS(key_exchange, r'(?i)DHE') OR protocol = 'TLS 1.3') AS forward_secrecy_count,
+  COUNTIF(REGEXP_CONTAINS(key_exchange, r'(?i)DHE') OR protocol = 'TLS 1.3') / COUNT(0) AS pct
+FROM (
+    SELECT
+      client,
+      JSON_VALUE(payload, '$._securityDetails.keyExchange') AS key_exchange,
+      JSON_VALUE(payload, '$._securityDetails.protocol') AS protocol
+    FROM
+      `httparchive.all.requests`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+  )
+WHERE
+  protocol IS NOT NULL
+GROUP BY
+  client
+ORDER BY
+  client,
+  pct DESC
diff --git a/sql/2024/security/tls_versions_pages.sql b/sql/2024/security/tls_versions_pages.sql
new file mode 100644
index 00000000000..f6241c4994f
--- /dev/null
+++ b/sql/2024/security/tls_versions_pages.sql
@@ -0,0 +1,28 @@
+#standardSQL
+# Section: Transport Security - Protocol Versions
+# Question: Which TLS versions are most common on all TLS-enabled web pages?
+SELECT
+  client,
+  tls_version,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_https_pages,
+  COUNT(0) AS freq,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct
+FROM (
+    SELECT
+      client,
+      IFNULL(JSON_VALUE(payload, '$._tls_version'), JSON_VALUE(payload, '$._securityDetails.protocol')) AS tls_version
+    FROM
+      `httparchive.all.requests`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page AND
+      is_main_document AND
+      STARTS_WITH(url, 'https')
+  )
+WHERE
+  tls_version IS NOT NULL
+GROUP BY
+  client,
+  tls_version
+ORDER BY
+  pct DESC
diff --git a/sql/2024/security/tls_versions_requests.sql b/sql/2024/security/tls_versions_requests.sql
new file mode 100644
index 00000000000..60bec6ccf23
--- /dev/null
+++ b/sql/2024/security/tls_versions_requests.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Transport Security - Protocol Versions
+# Question: What is the distribution of TLS versions on all TLS-enabled requests?
+# Note: Query is large (40TB)
+SELECT
+  client,
+  tls_version,
+  SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_https_hosts,
+  COUNT(DISTINCT host) AS freq,
+  COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct
+FROM (
+    SELECT
+      client,
+      NET.HOST(url) AS host,
+      IFNULL(JSON_VALUE(payload, '$._tls_version'), JSON_VALUE(payload, '$._securityDetails.protocol')) AS tls_version
+    FROM
+      `httparchive.all.requests`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page AND
+      STARTS_WITH(url, 'https')
+  )
+WHERE
+  tls_version IS NOT NULL
+GROUP BY
+  client,
+  tls_version
+ORDER BY
+  client,
+  pct DESC
diff --git a/sql/2024/security/version-evolution-top-technologies.sql b/sql/2024/security/version-evolution-top-technologies.sql
new file mode 100644
index 00000000000..3a83c432438
--- /dev/null
+++ b/sql/2024/security/version-evolution-top-technologies.sql
@@ -0,0 +1,71 @@
+#standardSQL
+# Section: Drivers of security mechanisms - Technology stack
+# Question: Distribution of the different versions of the top 20 technologies used on the web.
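+# Illustrative worked example (editor's addition; the technology name is
+# invented): a detection like technology = 'WordPress' with info = '6.5.3'
+# passes REGEXP_CONTAINS(info, r'\d+\.\d+') and is kept, while a detection
+# whose info carries no version string is dropped.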
+SELECT
+  category,
+  technology,
+  info,
+  date,
+  client,
+  freq,
+  pct
+FROM (
+  SELECT
+    info,
+    tech.category_lower AS category,
+    tech.technology_lower AS technology,
+    date,
+    client,
+    COUNT(0) AS freq,
+    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, date, tech.category_lower, tech.technology_lower) AS pct
+  FROM (
+    SELECT
+      info,
+      TRIM(LOWER(category)) AS category_lower,
+      TRIM(LOWER(t.technology)) AS technology_lower,
+      date,
+      client
+    FROM
+      `httparchive.all.pages`,
+      UNNEST(technologies) AS t,
+      UNNEST(t.categories) AS category,
+      UNNEST(t.info) AS info
+    WHERE
+      DATE >= '2022-06-01' AND
+      is_root_page AND
+      REGEXP_CONTAINS(info, r'\d+\.\d+')
+  ) AS tech
+  INNER JOIN (
+    SELECT
+      TRIM(LOWER(category)) AS category_lower,
+      TRIM(LOWER(technology)) AS technology_lower,
+      COUNT(0) AS num
+    FROM
+      `httparchive.all.pages`,
+      UNNEST(technologies) AS t,
+      UNNEST(t.categories) AS category,
+      UNNEST(t.info) AS info
+    WHERE
+      DATE >= '2022-06-01' AND
+      is_root_page
+    GROUP BY
+      category_lower,
+      technology_lower
+    ORDER BY
+      num DESC
+    LIMIT 20
+  ) AS top ON (tech.category_lower = top.category_lower AND tech.technology_lower = top.technology_lower)
+  GROUP BY
+    tech.category_lower,
+    tech.technology_lower,
+    date,
+    info,
+    client)
+WHERE
+  pct > 0.01
+ORDER BY
+  client,
+  category,
+  technology,
+  date,
+  pct DESC
diff --git a/sql/2024/security/web_cryptography_api.sql b/sql/2024/security/web_cryptography_api.sql
new file mode 100644
index 00000000000..66dd5576628
--- /dev/null
+++ b/sql/2024/security/web_cryptography_api.sql
@@ -0,0 +1,18 @@
+#standardSQL
+# Section: Attack preventions - Web Cryptography API
+# Question: Which Web Cryptography APIs are used the most?
+# Note: Possible to port to httparchive.all.pages, however would require to recreate num_urls, total_urls, and pct_urls
+SELECT
+  client,
+  feature,
+  num_urls,
+  total_urls,
+  pct_urls
+FROM
+  `httparchive.blink_features.usage`
+WHERE
+  (feature LIKE 'Crypto%' OR
+    feature LIKE 'Subtle%') AND
+  yyyymmdd = '20240601'
+ORDER BY
+  pct_urls DESC
diff --git a/sql/2024/security/well-known_change-password.sql b/sql/2024/security/well-known_change-password.sql
new file mode 100644
index 00000000000..10c3f7cb48e
--- /dev/null
+++ b/sql/2024/security/well-known_change-password.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Well-known URIs - change-password
+# Question: What is the prevalence of correctly configured /.well-known/change-password endpoints?
+# Note: SAFE_CAST is required because `.data.status` is not always an INT for some reason
+
+# Prevalence of correctly configured /.well-known/change-password endpoints:
+# defined as `change-password` redirecting and having an 'ok' HTTP status code (https://fetch.spec.whatwg.org/#ok-status),
+# while `resource-that-should-not-exist-whose-status-code-should-not-be-200` indeed does not have status code 200,
+# as this would indicate that the server is badly configured, and that the redirect to `change-password` can't be trusted.
+# `status` reflects the status code after redirection, so checking only for the status code afterwards is fine.
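+# Illustrative worked example (editor's addition, not part of the original
+# file): a well-configured origin might report
+#   change_password_redirected = 'true', change_password_status = 200,
+#   resource_status = 404
+# and is counted below, whereas a server that answers 200 for every path has
+# resource_status = 200 and is excluded as unreliable.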
+SELECT
+  client,
+  COUNT(DISTINCT page) AS total_pages,
+  COUNTIF(change_password_redirected = 'true' AND (change_password_status BETWEEN 200 AND 299) AND (resource_status NOT BETWEEN 200 AND 299)) AS count_change_password_did_redirect_and_ok,
+  COUNTIF(change_password_redirected = 'true' AND (change_password_status BETWEEN 200 AND 299) AND (resource_status NOT BETWEEN 200 AND 299)) / COUNT(DISTINCT page) AS pct_change_password_did_redirect_and_ok
+FROM (
+  SELECT
+    client,
+    page,
+    JSON_QUERY(JSON_VALUE(payload, '$._well-known'), '$."/.well-known/change-password".data.redirected') AS change_password_redirected,
+    SAFE_CAST(JSON_QUERY(JSON_VALUE(payload, '$._well-known'), '$."/.well-known/change-password".data.status') AS INT64) AS change_password_status,
+    SAFE_CAST(JSON_QUERY(JSON_VALUE(payload, '$._well-known'), '$."/.well-known/resource-that-should-not-exist-whose-status-code-should-not-be-200/".data.status') AS INT64) AS resource_status
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+)
+GROUP BY
+  client
diff --git a/sql/2024/security/well-known_resource-not-be-200.sql b/sql/2024/security/well-known_resource-not-be-200.sql
new file mode 100644
index 00000000000..2a227133e27
--- /dev/null
+++ b/sql/2024/security/well-known_resource-not-be-200.sql
@@ -0,0 +1,27 @@
+#standardSQL
+# Section: Well-known URIs - Detecting Status Code Reliability
+# Question: What is the prevalence of servers that return a 200 status code where they should not?
+# Prevalence of /.well-known/resource-that-should-not-exist-whose-status-code-should-not-be-200 counts status codes
+# "We can see if a web server's statuses are reliable by fetching a URL that should never result in an ok status." (https://w3c.github.io/webappsec-change-password-url/response-code-reliability.html)
+SELECT
+  client,
+  COUNT(DISTINCT page) AS total_pages,
+  # `status` reflects the status code after redirection, so checking only for 200 is fine.
+  COUNTIF(status BETWEEN 200 AND 299) AS count_status_200,
+  SAFE_DIVIDE(COUNTIF(status BETWEEN 200 AND 299), COUNT(DISTINCT page)) AS pct_status_200,
+  COUNTIF(status NOT BETWEEN 200 AND 299) AS count_status_not_ok,
+  SAFE_DIVIDE(COUNTIF(status NOT BETWEEN 200 AND 299), COUNT(DISTINCT page)) AS pct_status_not_ok
+FROM (
+  SELECT
+    client,
+    page,
+    JSON_QUERY(JSON_VALUE(payload, '$._well-known'), '$."/.well-known/resource-that-should-not-exist-whose-status-code-should-not-be-200/".data.redirected') AS redirected,
+    SAFE_CAST(JSON_VALUE(JSON_VALUE(payload, '$._well-known'), '$."/.well-known/resource-that-should-not-exist-whose-status-code-should-not-be-200/".data.status') AS INT64) AS status
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+)
+GROUP BY
+  client
diff --git a/sql/2024/security/well-known_security.sql b/sql/2024/security/well-known_security.sql
new file mode 100644
index 00000000000..ac1691c26e2
--- /dev/null
+++ b/sql/2024/security/well-known_security.sql
@@ -0,0 +1,272 @@
+#standardSQL
+# Section: Well-known URIs - security.txt
+# Question: What is the prevalence of (signed) /.well-known/security.txt endpoints and prevalence of included attributes (canonical, encryption, expires, policy)?
+# Note: Query is huge (60TB) and computationally expensive (slow)
+# Note: We require that the final status code for /.well-known/security.txt is 200 (found) and that the content-type starts with text/plain.
+# This can lead to a very small number of false negatives but is much better than false positives using other approaches
+# Note: all_required_exist = contact & expires are mandatory; only_one_requirement_broken = expires & preferred_languages are not allowed to occur multiple times; valid = all_required_exist && !only_one_requirement_broken
+# Note: The custom metric only has an entry for a directive if it is not empty, thus we can assume that a non-null value cannot be an empty list
+# Note: Each directive (except signed) is saved as a list, however currently we do not really check the content
+WITH
+security_txt_data AS (
+  SELECT
+    client,
+    page,
+    # Bools
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.found'))) AS found,
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.data.redirected'))) AS redirected,
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.data.valid'))) AS valid,
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.data.all_required_exist'))) AS all_required_exist,
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.data.only_one_requirement_broken'))) AS only_one_requirement_broken,
+    # Meta Info
+    JSON_VALUE(sec_txt, '$.data.status') AS status,
+    JSON_VALUE(sec_txt, '$.data.content_type') AS content_type,
+    # Directives
+    LAX_BOOL(TO_JSON(JSON_VALUE(sec_txt, '$.data.signed'))) AS signed,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.contact') AS contact,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.expires') AS expires,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.encryption') AS encryption,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.acknowledgments') AS acknowledgments,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.preferred_languages') AS preferred_languages,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.canonical') AS canonical,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.policy') AS POLICY,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.hiring') AS hiring,
+    JSON_VALUE_ARRAY(sec_txt, '$.data.csaf') AS csaf,
+    # Other has a structure of [("key": value)] and thus needs QUERY_ARRAY
+    JSON_QUERY_ARRAY(sec_txt, '$.data.other') AS other
+  FROM (
+    SELECT
+      client,
+      page,
+      JSON_QUERY(custom_metrics, '$.well-known."/.well-known/security.txt"') AS sec_txt
+    FROM
+      `httparchive.all.pages`
+    WHERE
+      date = '2024-06-01' AND
+      is_root_page
+      # AND rank <= 1000
+  )
+),
+totals AS (
+  SELECT
+    client,
+    # High Level stats
+    COUNT(DISTINCT page) AS total_pages,
+    # Request to .well-known/security.txt failed or did not even start
+    COUNTIF(found IS NULL) AS count_failed,
+    # Found == final status code is 200
+    COUNTIF(found) AS count_found,
+    COUNTIF(found) / COUNT(DISTINCT page) AS pct_found,
+    # Redirected == response redirected at least once
+    COUNTIF(redirected) AS count_redirected_all,
+    COUNTIF(redirected) / COUNT(DISTINCT page) AS pct_redirected_all,
+    # Redirected found == response redirected and final status code is 200 (some redirect and then answer with 500 or 426; Note that some also use a redirect status code such as 307 but as there is no location header, do not actually redirect)
+    COUNTIF(redirected AND found) AS count_redirected_found
+  FROM
+    security_txt_data
+  GROUP BY
+    client
+)
+
+
+SELECT
+  client,
+  # High Level stats for all pages
+  total_pages,
+  count_failed,
+  count_found,
+  pct_found,
+  count_redirected_all,
+  pct_redirected_all,
+  count_redirected_found,
+
+  # High level stats on real security.txt files (i.e., found + content-type startswith text/plain)
+  # Real security.txt files
+  COUNT(0) AS has_security_txt,
+  COUNT(0) / total_pages AS pct_security_txt,
+  # Redirected and real security.txt file
+  COUNTIF(redirected) AS count_redirected_security_txt,
+  # Redirected valid == response redirected, final status code is 200 and file is a "valid" security.txt file
+  COUNTIF(redirected AND valid) AS count_redirected_valid,
+  # Valid == all_required_exist && !only_one_requirement_broken
+  COUNTIF(valid) AS count_valid,
+  COUNTIF(valid) / COUNT(0) AS pct_valid,
+  # All required exist == expires && contact
+  COUNTIF(all_required_exist) AS count_all_required_exist,
+  COUNTIF(all_required_exist) / COUNT(0) AS pct_all_required_exist,
+  # Only one requirement broken == expires & preferred_languages are not allowed to occur multiple times
+  COUNTIF(only_one_requirement_broken) AS count_only_one_requirement_broken,
+  COUNTIF(only_one_requirement_broken) / COUNT(0) AS pct_only_one_requirement_broken,
+
+  # Individual values
+  COUNTIF(signed) AS count_signed,
+  COUNTIF(signed) / COUNT(0) AS pct_signed,
+  COUNTIF(contact IS NOT NULL) AS contact,
+  COUNTIF(contact IS NOT NULL) / COUNT(0) AS pct_contact,
+  COUNTIF(expires IS NOT NULL) AS expires,
+  COUNTIF(expires IS NOT NULL) / COUNT(0) AS pct_expires,
+  COUNTIF(encryption IS NOT NULL) AS encryption,
+  COUNTIF(encryption IS NOT NULL) / COUNT(0) AS pct_encryption,
+  COUNTIF(acknowledgments IS NOT NULL) AS acknowledgments,
+  COUNTIF(acknowledgments IS NOT NULL) / COUNT(0) AS pct_acknowledgments,
+  COUNTIF(preferred_languages IS NOT NULL) AS preferred_languages,
+  COUNTIF(preferred_languages IS NOT NULL) / COUNT(0) AS pct_preferred_languages,
+  COUNTIF(canonical IS NOT NULL) AS canonical,
+  COUNTIF(canonical IS NOT NULL) / COUNT(0) AS pct_canonical,
+  COUNTIF(POLICY IS NOT NULL) AS POLICY,
+  COUNTIF(POLICY IS NOT NULL) / COUNT(0) AS pct_policy,
+  COUNTIF(hiring IS NOT NULL) AS hiring,
+  COUNTIF(hiring IS NOT NULL) / COUNT(0) AS pct_hiring,
+  COUNTIF(csaf IS NOT NULL) AS csaf,
+  COUNTIF(csaf IS NOT NULL) / COUNT(0) AS pct_csaf,
+  COUNTIF(other IS NOT NULL) AS other,
+  COUNTIF(other IS NOT NULL) / COUNT(0) AS pct_other,
+
+  # Other values relative to only valid files (as other can be garbage if the file is not actually a security.txt file)
+  COUNTIF(other IS NOT NULL AND
+    valid) AS other_valid,
+  COUNTIF(other IS NOT NULL AND
+    valid) / COUNTIF(valid) AS pct_other_valid,
+
+  # Average counts of directives (only non-null values are counted; i.e., min is 1, might be better to count the average of all "found" files, i.e., including 0) (COALESCE 0)
+  AVG(ARRAY_LENGTH(contact)) AS avg_contact_count,
+  AVG(ARRAY_LENGTH(expires)) AS avg_expires_count,
+  AVG(ARRAY_LENGTH(encryption)) AS avg_encryption_count,
+  AVG(ARRAY_LENGTH(acknowledgments)) AS avg_acknowledgments_count,
+  AVG(ARRAY_LENGTH(preferred_languages)) AS avg_preferred_language_count,
+  AVG(ARRAY_LENGTH(canonical)) AS avg_canonical_count,
+  AVG(ARRAY_LENGTH(policy)) AS avg_policy_count,
+  AVG(ARRAY_LENGTH(hiring)) AS avg_hiring_count,
+  AVG(ARRAY_LENGTH(csaf)) AS avg_csaf_count,
+  AVG(ARRAY_LENGTH(other)) AS avg_other_count
+FROM
+  security_txt_data
+JOIN totals USING (client)
+WHERE
+  found AND
+  STARTS_WITH(content_type, 'text/plain')
+GROUP BY
+  client,
+  total_pages,
+  count_failed,
+  count_found,
+  pct_found,
+  count_redirected_all,
+  pct_redirected_all,
+  count_redirected_found
+
+/*
+# Quite some pages do not have any security.txt data, i.e., the custom metric collection (for well-known) failed; more info could be in the "error" and "message"
+SELECT
+  redirected,
+  found,
+  COUNT(0) as ct
+FROM
+  security_txt_data
+GROUP BY
+  redirected,
+  found
+ORDER BY
+  ct DESC
+*/
+
+/*
+# Most status codes are 404, however, 403 or 503 also exist
+# Some sites also use a redirect status code without a location header (no redirect occurs!)
+SELECT
+  status,
+  redirected,
+  found,
+  COUNT(0) as ct
+FROM
+  security_txt_data
+GROUP BY
+  status,
+  redirected,
+  found
+ORDER BY
+  ct DESC
+*/
+
+/*
+# Most found/valid files use content-type text/plain.* We use a filter on the content-type to remove all other files (e.g., HTML files with status code 200 at /.well-known/security.txt)
+# Responses without any content-type are quite rare
+# E.g., in the Top 100K: 2311 with found false and valid==null (i.e., 404 or similar without content), 170 with found true and valid==false (mostly HTML pages), 92 with found false and valid==false (mostly HTML pages with status code != 200 but okay type), and 12 with found true and valid true (false negatives!) (compared to text/plain true, true 1756 and true, false 1430)
+SELECT
+  content_type,
+  found,
+  valid,
+  COUNT(0) as ct
+FROM
+  security_txt_data
+GROUP BY
+  content_type,
+  found,
+  valid
+ORDER BY
+  ct DESC
+*/
+
+/*
+# Do any of the non text/plain files have anything resembling a security.txt file at all?
+# They only have "other" values that appear to be mostly css that we accidentally match as they return status code 200 at /.well-known/security.txt
+# Small number of FNs, e.g., 40 sites with text/html and contact in Top100K (the same would cause ~3545 false positives if not filtering on text/plain)
+SELECT
+  content_type as ct,
+  COUNT(page) as total,
+  COUNTIF(signed) as signed,
+  COUNTIF(contact IS NOT NULL) as contact,
+  COUNTIF(expires IS NOT NULL) as expires,
+  COUNTIF(encryption IS NOT NULL) as encryption,
+  COUNTIF(acknowledgments IS NOT NULL) as ack,
+  COUNTIF(preferred_languages IS NOT NULL) as pref_lang,
+  COUNTIF(canonical IS NOT NULL) as canonical,
+  COUNTIF(POLICY IS NOT NULL) as policy,
+  COUNTIF(hiring IS NOT NULL) as hiring,
+  COUNTIF(csaf IS NOT NULL) as csaf,
+  COUNTIF(other IS NOT NULL) as other
+FROM
+  security_txt_data
+WHERE
+  found
+  AND NOT STARTS_WITH(content_type, "text/plain")
+GROUP BY
+  ct
+*/
+
+/*
+# Valid (other) values seem very rare for non text/plain responses!
+# For text/plain they seem to be Acknowledgements (typo/AE vs BE), Info, Tips, Hash, ...
SELECT
+  contact,
+  expires,
+  preferred_languages,
+  other
+FROM
+  security_txt_data
+WHERE
+  found
+  AND other is not NULL
+  #AND STARTS_WITH(content_type, "text/html;charset=utf-8")
+  #AND NOT STARTS_WITH(content_type, "text/plain")
+  AND STARTS_WITH(content_type, "text/plain")
+*/
+
+/*
+# Value distribution of other values!
+# Common values (only text/plain otherwise it will be HTML stuff)
+# Top 100K: Acknowledgements (268), Hash (368), OpenBugBounty (196), Signature (52), Bug Bountry program (22), Contact (16; with space in front), Expires (14; with space in front), ...
+SELECT
+  JSON_VALUE_ARRAY(other_val)[offset(0)] as directive_name,
+  COUNT(0) as cnt
+FROM
+  security_txt_data,
+  UNNEST(other) as other_val
+WHERE
+  found
+  AND other IS NOT NULL
+  AND STARTS_WITH(content_type, "text/plain")
+GROUP BY
+  directive_name
+ORDER BY
+  cnt DESC
+*/
diff --git a/sql/2024/security/xfo_header_prevalence.sql b/sql/2024/security/xfo_header_prevalence.sql
new file mode 100644
index 00000000000..c4cbad2093e
--- /dev/null
+++ b/sql/2024/security/xfo_header_prevalence.sql
@@ -0,0 +1,30 @@
+#standardSQL
+# Section: Attack Preventions - Security Header Adoptions?
+# Question: Which are the most common XFO values?
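+# Illustrative note (editor's addition, not part of the original query): the
+# only standardized values are 'DENY' and 'SAMEORIGIN'; the obsolete
+# 'ALLOW-FROM <uri>' form and assorted typos also show up in the long tail
+# tallied below.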
+# Note: Considers headers of main document responses +SELECT + client, + xfo_header, + SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS total_xfo_headers, + COUNT(DISTINCT host) AS freq, + COUNT(DISTINCT host) / SUM(COUNT(DISTINCT host)) OVER (PARTITION BY client) AS pct +FROM ( + SELECT + client, + NET.HOST(url) AS host, + response_headers.value AS xfo_header + FROM + `httparchive.all.requests`, + UNNEST(response_headers) AS response_headers + WHERE + date = '2024-06-01' AND + is_root_page AND + is_main_document AND + LOWER(response_headers.name) = 'x-frame-options') +GROUP BY + client, + xfo_header +ORDER BY + pct DESC +LIMIT + 100 From 3d31ca370a92f048c995fdf63db87cd8c9feedaf Mon Sep 17 00:00:00 2001 From: Kevin Farrugia Date: Wed, 30 Oct 2024 14:05:25 +0100 Subject: [PATCH 09/15] Performance 2024 Queries (#3712) * Migrated queries from previous editions * Removed TABLESAMPLE; Resolved lint errors * Updated sqlfluff and re-run linter * prefer_count_0 * Reverted .sqlfluff * Add performance queries * Update web_vitals_by_technology.sql Fixing Linter error - Files must end with a single trailing newline. * Update monthly_cls_lcp.sql Fixing linter error * Update font_usage_mobile.sql Fixing linter * Update render_blocking_resources.sql Fixing linter error * Update render_blocking_resources.sql adding spaces. * Add font_resource_hints_usage query * Updated bfcache_unload due to changes in Lighthouse audits structure * Removed sampling * Added render_blocking_savings_* * Added query for web_vitals root and secondary page * Updated secondary pages CWV * Update performance queries * Added resource hint queries * Resolved lint issues * Reverted change to lcp_preload_discoverable * Added queries for client-side generated content * Updated PR following review * Removed TABLESAMPLE --------- Co-authored-by: guaca Co-authored-by: Mike Gifford Co-authored-by: Barry Pollard --- .../bfcache_cachecontrol_nostore.sql | 29 ++ sql/2024/performance/bfcache_unload.sql | 40 +++ sql/2024/performance/cls_animations.sql | 28 ++ .../performance/cls_unsized_image_height.sql | 27 ++ sql/2024/performance/cls_unsized_images.sql | 28 ++ .../performance/font_resource_hints_usage.sql | 66 +++++ .../font_resource_hints_usage_trends.sql | 96 +++++++ sql/2024/performance/font_usage_mobile.sql | 12 + sql/2024/performance/generated_content.sql | 74 ++++++ .../generated_content_web_vitals.sql | 133 ++++++++++ sql/2024/performance/inp_long_tasks.sql | 39 +++ sql/2024/performance/inp_tbt.sql | 16 ++ sql/2024/performance/js_bytes_rank.sql | 17 ++ .../performance/lcp_bytes_distribution.sql | 42 +++ sql/2024/performance/lcp_bytes_histogram.sql | 43 +++ sql/2024/performance/lcp_element_data.sql | 130 +++++++++ sql/2024/performance/lcp_format.sql | 42 +++ sql/2024/performance/lcp_host.sql | 29 ++ sql/2024/performance/lcp_host_3p.sql | 31 +++ sql/2024/performance/lcp_initiator_type.sql | 43 +++ sql/2024/performance/lcp_lazy.sql | 47 ++++ .../performance/lcp_lazy_secondary_pages.sql | 47 ++++ .../performance/lcp_lazy_technologies.sql | 75 ++++++ .../performance/lcp_preload_discoverable.sql | 28 ++ .../performance/lcp_resource_load_delay.sql | 53 ++++ sql/2024/performance/lcp_resource_type.sql | 33 +++ sql/2024/performance/lcp_responsive_data.sql | 59 +++++ sql/2024/performance/lcp_wasted_bytes.sql | 63 +++++ sql/2024/performance/monthly_cls_lcp.sql | 16 ++ .../performance/render_blocking_resources.sql | 14 + .../render_blocking_savings_fcp.sql | 26 ++ .../render_blocking_savings_lcp.sql | 41 +++ 
sql/2024/performance/resource_hints_usage.sql | 74 ++++++ .../performance/resource_hints_usage_2021.sql | 62 +++++ sql/2024/performance/rtt_distribution.sql | 16 ++ .../viewport_meta_zoom_disable.sql | 12 + sql/2024/performance/web_vitals_by_device.sql | 242 +++++++++++++++++ .../web_vitals_by_device_secondary_pages.sql | 65 +++++ .../web_vitals_by_rank_and_device.sql | 250 ++++++++++++++++++ .../performance/web_vitals_by_technology.sql | 215 +++++++++++++++ 40 files changed, 2403 insertions(+) create mode 100644 sql/2024/performance/bfcache_cachecontrol_nostore.sql create mode 100644 sql/2024/performance/bfcache_unload.sql create mode 100644 sql/2024/performance/cls_animations.sql create mode 100644 sql/2024/performance/cls_unsized_image_height.sql create mode 100644 sql/2024/performance/cls_unsized_images.sql create mode 100644 sql/2024/performance/font_resource_hints_usage.sql create mode 100644 sql/2024/performance/font_resource_hints_usage_trends.sql create mode 100644 sql/2024/performance/font_usage_mobile.sql create mode 100644 sql/2024/performance/generated_content.sql create mode 100644 sql/2024/performance/generated_content_web_vitals.sql create mode 100644 sql/2024/performance/inp_long_tasks.sql create mode 100644 sql/2024/performance/inp_tbt.sql create mode 100644 sql/2024/performance/js_bytes_rank.sql create mode 100644 sql/2024/performance/lcp_bytes_distribution.sql create mode 100644 sql/2024/performance/lcp_bytes_histogram.sql create mode 100644 sql/2024/performance/lcp_element_data.sql create mode 100644 sql/2024/performance/lcp_format.sql create mode 100644 sql/2024/performance/lcp_host.sql create mode 100644 sql/2024/performance/lcp_host_3p.sql create mode 100644 sql/2024/performance/lcp_initiator_type.sql create mode 100644 sql/2024/performance/lcp_lazy.sql create mode 100644 sql/2024/performance/lcp_lazy_secondary_pages.sql create mode 100644 sql/2024/performance/lcp_lazy_technologies.sql create mode 100644 sql/2024/performance/lcp_preload_discoverable.sql create mode 100644 sql/2024/performance/lcp_resource_load_delay.sql create mode 100644 sql/2024/performance/lcp_resource_type.sql create mode 100644 sql/2024/performance/lcp_responsive_data.sql create mode 100644 sql/2024/performance/lcp_wasted_bytes.sql create mode 100644 sql/2024/performance/monthly_cls_lcp.sql create mode 100644 sql/2024/performance/render_blocking_resources.sql create mode 100644 sql/2024/performance/render_blocking_savings_fcp.sql create mode 100644 sql/2024/performance/render_blocking_savings_lcp.sql create mode 100644 sql/2024/performance/resource_hints_usage.sql create mode 100644 sql/2024/performance/resource_hints_usage_2021.sql create mode 100644 sql/2024/performance/rtt_distribution.sql create mode 100644 sql/2024/performance/viewport_meta_zoom_disable.sql create mode 100644 sql/2024/performance/web_vitals_by_device.sql create mode 100644 sql/2024/performance/web_vitals_by_device_secondary_pages.sql create mode 100644 sql/2024/performance/web_vitals_by_rank_and_device.sql create mode 100644 sql/2024/performance/web_vitals_by_technology.sql diff --git a/sql/2024/performance/bfcache_cachecontrol_nostore.sql b/sql/2024/performance/bfcache_cachecontrol_nostore.sql new file mode 100644 index 00000000000..6541ab25ea8 --- /dev/null +++ b/sql/2024/performance/bfcache_cachecontrol_nostore.sql @@ -0,0 +1,29 @@ +CREATE TEMP FUNCTION HAS_NO_STORE_DIRECTIVE(cache_control STRING) RETURNS BOOL AS ( + REGEXP_CONTAINS(cache_control, r'(?i)\bno-store\b') +); + +WITH requests AS ( + SELECT + client, + 
LOGICAL_OR(HAS_NO_STORE_DIRECTIVE(JSON_VALUE(payload, '$._cacheControl'))) AS includes_ccns + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_main_document + GROUP BY + client, + page +) + +SELECT + client, + COUNTIF(includes_ccns) AS pages, + COUNT(0) AS total, + COUNTIF(includes_ccns) / COUNT(0) AS pct +FROM + requests +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/performance/bfcache_unload.sql b/sql/2024/performance/bfcache_unload.sql new file mode 100644 index 00000000000..09aa40e75f9 --- /dev/null +++ b/sql/2024/performance/bfcache_unload.sql @@ -0,0 +1,40 @@ +CREATE TEMPORARY FUNCTION getUnloadHandler(audit STRING) +RETURNS BOOL LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details?.items?.some(n => n.value?.toLowerCase() === "unloadhandler"); +} catch (e) { + return false; +} +'''; + +WITH lh AS ( + SELECT + client, + page, + rank, + getUnloadHandler(JSON_EXTRACT(lighthouse, '$.audits.deprecations')) AS has_unload + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' +) + + +SELECT + client, + _rank AS rank, + COUNTIF(has_unload) AS pages, + COUNT(0) AS total, + COUNTIF(has_unload) / COUNT(0) AS pct +FROM + lh, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank +WHERE + rank <= _rank +GROUP BY + client, + rank +ORDER BY + rank, + client diff --git a/sql/2024/performance/cls_animations.sql b/sql/2024/performance/cls_animations.sql new file mode 100644 index 00000000000..f07c337c944 --- /dev/null +++ b/sql/2024/performance/cls_animations.sql @@ -0,0 +1,28 @@ +WITH lh AS ( + SELECT + client, + ARRAY_LENGTH(JSON_QUERY_ARRAY(lighthouse, '$.audits.non-composited-animations.details.items')) AS num_animations + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + percentile, + client, + APPROX_QUANTILES(num_animations, 1000)[OFFSET(percentile * 10)] AS num_animations, + COUNTIF(num_animations > 0) AS pages, + COUNT(0) AS total, + COUNTIF(num_animations > 0) / COUNT(0) AS pct +FROM + lh, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/cls_unsized_image_height.sql b/sql/2024/performance/cls_unsized_image_height.sql new file mode 100644 index 00000000000..74afcaca23b --- /dev/null +++ b/sql/2024/performance/cls_unsized_image_height.sql @@ -0,0 +1,27 @@ +WITH lh AS ( + SELECT + client, + CAST(JSON_VALUE(unsized_image, '$.node.boundingRect.height') AS INT64) AS height + FROM + `httparchive.all.pages`, + UNNEST(JSON_QUERY_ARRAY(lighthouse, '$.audits.unsized-images.details.items')) AS unsized_image + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + percentile, + client, + APPROX_QUANTILES(height, 1000)[OFFSET(percentile * 10)] AS height, + COUNT(0) AS unsized_images +FROM + lh, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/cls_unsized_images.sql b/sql/2024/performance/cls_unsized_images.sql new file mode 100644 index 00000000000..43633c9d197 --- /dev/null +++ b/sql/2024/performance/cls_unsized_images.sql @@ -0,0 +1,28 @@ +WITH lh AS ( + SELECT + client, + ARRAY_LENGTH(JSON_QUERY_ARRAY(lighthouse, '$.audits.unsized-images.details.items')) AS num_unsized_images + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + percentile, + client, + APPROX_QUANTILES(num_unsized_images, 1000)[OFFSET(percentile * 10)] AS 
num_unsized_images, + COUNTIF(num_unsized_images > 0) AS pages, + COUNT(0) AS total, + COUNTIF(num_unsized_images > 0) / COUNT(0) AS pct +FROM + lh, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/font_resource_hints_usage.sql b/sql/2024/performance/font_resource_hints_usage.sql new file mode 100644 index 00000000000..bb5ad1ad756 --- /dev/null +++ b/sql/2024/performance/font_resource_hints_usage.sql @@ -0,0 +1,66 @@ +CREATE TEMPORARY FUNCTION getResourceHints(payload STRING) +RETURNS ARRAY < STRUCT < name STRING, href STRING >> +LANGUAGE js AS ''' +var hints = new Set(['preload', 'prefetch', 'preconnect', 'prerender', 'dns-prefetch']); +try { + var $ = JSON.parse(payload); + var almanac = JSON.parse($._almanac); + return almanac['link-nodes'].nodes.reduce((results, link) => { + var hint = link.rel.toLowerCase(); + if (!hints.has(hint)) { + return results; + } + results.push({ + name: hint, + href: link.href + }); + return results; + }, []); +} catch (e) { + return []; +} +'''; + +WITH resource_hints AS ( + SELECT DISTINCT + client, + page, + hint.name + FROM + `httparchive.all.pages` + LEFT JOIN + UNNEST(getResourceHints(payload)) AS hint + WHERE + date = '2024-06-01' AND + is_root_page +), + +font_requests AS ( + SELECT + client, + page, + type + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + type = 'font' AND + is_root_page +) + +SELECT + client, + name, + COUNT(DISTINCT page) AS pages, + SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client) AS total, + COUNT(DISTINCT page) / SUM(COUNT(DISTINCT page)) OVER (PARTITION BY client) AS pct_hints +FROM + resource_hints +LEFT JOIN + font_requests +USING + (client, page) +GROUP BY + client, name, type +ORDER BY + pct_hints DESC; diff --git a/sql/2024/performance/font_resource_hints_usage_trends.sql b/sql/2024/performance/font_resource_hints_usage_trends.sql new file mode 100644 index 00000000000..a63242c686f --- /dev/null +++ b/sql/2024/performance/font_resource_hints_usage_trends.sql @@ -0,0 +1,96 @@ +CREATE TEMPORARY FUNCTION getResourceHints(payload STRING) +RETURNS ARRAY < STRUCT < name STRING, href STRING >> +LANGUAGE js AS ''' +var hints = new Set(['preload', 'prefetch', 'preconnect', 'prerender', 'dns-prefetch']); +try { + var $ = JSON.parse(payload); + var almanac = JSON.parse($._almanac); + return almanac['link-nodes'].nodes.reduce((results, link) => { + var hint = link.rel.toLowerCase(); + if (!hints.has(hint)) { + return results; + } + results.push({ + name: hint, + href: link.href + }); + return results; + }, []); +} catch (e) { + return []; +} +'''; + +WITH resource_hints AS ( + SELECT DISTINCT + client, + date, + page, + hint.name AS name + FROM + `httparchive.all.pages` + LEFT JOIN + UNNEST(getResourceHints(payload)) AS hint + WHERE + (date = '2024-06-01' OR date = '2023-06-01' OR date = '2022-06-01') AND + is_root_page +), + +font_requests AS ( + SELECT + client, + date, + page, + type + FROM + `httparchive.all.requests` + WHERE + (date = '2024-06-01' OR date = '2023-06-01' OR date = '2022-06-01') AND + type = 'font' AND + is_root_page +), + +totals AS ( + SELECT + client, + date, + COUNT(0) AS total_pages + FROM + `httparchive.all.pages` + WHERE + (date = '2024-06-01' OR date = '2023-06-01' OR date = '2022-06-01') AND + is_root_page + GROUP BY + client, + date +) + +SELECT + client, + date, + name, + type, + COUNT(DISTINCT page) AS pages, + ANY_VALUE(total_pages) AS total, + COUNT(DISTINCT page) / 
ANY_VALUE(total_pages) AS pct
+FROM
+  resource_hints
+LEFT JOIN
+  font_requests
+USING
+  (client, date, page)
+JOIN
+  totals
+USING
+  (client, date)
+GROUP BY
+  client,
+  date,
+  name,
+  type
+HAVING
+  type IS NOT NULL
+ORDER BY
+  client,
+  date,
+  name DESC
diff --git a/sql/2024/performance/font_usage_mobile.sql b/sql/2024/performance/font_usage_mobile.sql
new file mode 100644
index 00000000000..0f198b08693
--- /dev/null
+++ b/sql/2024/performance/font_usage_mobile.sql
@@ -0,0 +1,12 @@
+SELECT
+  COUNTIF(SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFont') AS INT64) > 0) AS freq_fonts,
+  COUNT(0) AS total,
+  COUNTIF(SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFont') AS INT64) > 0) / COUNT(0) AS pct_fonts
+FROM
+  `httparchive.all.pages`
+WHERE
+  date = '2024-06-01' AND
+  client = 'mobile' AND
+  is_root_page AND
+  SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFont') AS INT64) IS NOT NULL AND
+  SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesFont') AS INT64) IS NOT NULL
diff --git a/sql/2024/performance/generated_content.sql b/sql/2024/performance/generated_content.sql
new file mode 100644
index 00000000000..9155fa312ca
--- /dev/null
+++ b/sql/2024/performance/generated_content.sql
@@ -0,0 +1,74 @@
+#standardSQL
+CREATE TEMPORARY FUNCTION getGeneratedContent(payload STRING)
+RETURNS STRUCT<percent FLOAT64, sizeInKB FLOAT64> LANGUAGE js AS '''
+try {
+  const data = JSON.parse(payload);
+  const generatedData = data["_generated-content"];
+
+  const percent = parseFloat(generatedData.percent);
+  const sizeInKB = parseFloat(generatedData.sizeInKB);
+
+  return {
+    percent: percent > 0 ? percent : 0,
+    sizeInKB: sizeInKB > 0 ? sizeInKB : 0
+  };
+} catch (e) {
+  return null;
+}
+''';
+
+WITH crux AS (
+  SELECT
+    CONCAT(origin, '/') AS page,
+    CASE
+      WHEN device = 'phone' THEN 'mobile'
+      ELSE device
+    END AS client
+  FROM
+    `chrome-ux-report.materialized.device_summary`
+  WHERE
+    device IN ('desktop', 'phone') AND
+    date IN ('2024-06-01')
+),
+
+pages AS (
+  SELECT
+    client,
+    page,
+    getGeneratedContent(payload) AS generated_content
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(generated_content_percent, 1000)[OFFSET(percentile * 10)] AS generated_content_percent,
+  APPROX_QUANTILES(generated_content_sizeInKB, 1000)[OFFSET(percentile * 10)] AS generated_content_sizeInKB,
+  COUNT(0) AS total
+FROM (
+    SELECT
+      client,
+      page,
+      generated_content.percent AS generated_content_percent,
+      generated_content.sizeInKB AS generated_content_sizeInKB
+    FROM
+      pages
+    JOIN
+      crux
+    USING
+      (client, page)
+  ),
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+WHERE
+  generated_content_percent IS NOT NULL AND
+  generated_content_sizeInKB IS NOT NULL
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile
diff --git a/sql/2024/performance/generated_content_web_vitals.sql b/sql/2024/performance/generated_content_web_vitals.sql
new file mode 100644
index 00000000000..58a71c73b9c
--- /dev/null
+++ b/sql/2024/performance/generated_content_web_vitals.sql
@@ -0,0 +1,133 @@
+#standardSQL
+CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
+  good + needs_improvement + poor > 0
+);
+
+CREATE TEMPORARY FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS (
+  SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75
+);
+
+CREATE TEMPORARY FUNCTION getGeneratedContent(payload STRING)
+RETURNS STRUCT<percent FLOAT64, sizeInKB FLOAT64> LANGUAGE js AS '''
+try {
+  const data = JSON.parse(payload);
+  const
generatedData = data["_generated-content"]; + + const percent = parseFloat(generatedData.percent); + const sizeInKB = parseFloat(generatedData.sizeInKB); + + return { + percent: percent > 0 ? percent : 0, + sizeInKB: sizeInKB > 0 ? sizeInKB : 0 + }; +} catch (e) { + return null; +} +'''; + +WITH crux AS ( + SELECT + CONCAT(origin, '/') AS page, + origin, + CASE + WHEN device = 'phone' THEN 'mobile' + ELSE device + END AS client, + + fast_inp, + avg_inp, + slow_inp, + + fast_lcp, + avg_lcp, + slow_lcp, + + small_cls, + medium_cls, + large_cls + + FROM + `chrome-ux-report.materialized.device_summary` + WHERE + device IN ('desktop', 'phone') AND + date IN ('2024-06-01') +), + +pages AS ( + SELECT + client, + page, + getGeneratedContent(payload) AS generated_content + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + client, + generated_content_percent, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_lcp_good, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))) AS pct_inp_good, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cls_good, + COUNT(DISTINCT origin) AS total_origins +FROM ( + SELECT + client, + page, + origin, + + fast_inp, + avg_inp, + slow_inp, + + fast_lcp, + avg_lcp, + slow_lcp, + + small_cls, + medium_cls, + large_cls, + + CASE + WHEN generated_content.percent >= 0.9 THEN '0.9-1.0' + WHEN generated_content.percent >= 0.8 THEN '0.8-0.9' + WHEN generated_content.percent >= 0.7 THEN '0.7-0.8' + WHEN generated_content.percent >= 0.6 THEN '0.6-0.7' + WHEN generated_content.percent >= 0.5 THEN '0.5-0.6' + WHEN generated_content.percent >= 0.4 THEN '0.4-0.5' + WHEN generated_content.percent >= 0.3 THEN '0.3-0.4' + WHEN generated_content.percent >= 0.2 THEN '0.2-0.3' + WHEN generated_content.percent >= 0.1 THEN '0.1-0.2' + ELSE '0.0-0.1' + END AS generated_content_percent + FROM + pages + JOIN + crux + USING + (client, page) + ) +WHERE + generated_content_percent IS NOT NULL +GROUP BY + client, + generated_content_percent +ORDER BY + client, + generated_content_percent diff --git a/sql/2024/performance/inp_long_tasks.sql b/sql/2024/performance/inp_long_tasks.sql new file mode 100644 index 00000000000..f39c43573d3 --- /dev/null +++ b/sql/2024/performance/inp_long_tasks.sql @@ -0,0 +1,39 @@ +WITH long_tasks AS ( + SELECT + client, + page, + ANY_VALUE(httparchive.core_web_vitals.GET_CRUX_INP(payload)) AS inp, + SUM(CAST(JSON_QUERY(item, '$.duration') AS FLOAT64)) AS long_tasks + FROM + `httparchive.all.pages`, + UNNEST(JSON_QUERY_ARRAY(lighthouse, '$.audits.long-tasks.details.items')) AS item + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client, + page +), + +meta AS ( + SELECT + *, + COUNT(0) OVER (PARTITION BY client) AS n, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY inp) AS row + FROM + long_tasks + WHERE + inp IS NOT NULL +) + +SELECT + client, + long_tasks, + inp +FROM + meta +WHERE + MOD(row, CAST(FLOOR(n / 1000) AS INT64)) = 0 +ORDER BY + client, + long_tasks diff --git a/sql/2024/performance/inp_tbt.sql b/sql/2024/performance/inp_tbt.sql new file mode 100644 index 00000000000..8790f0781de --- /dev/null +++ 
b/sql/2024/performance/inp_tbt.sql @@ -0,0 +1,16 @@ +SELECT + percentile, + client, + APPROX_QUANTILES(CAST(JSON_QUERY(lighthouse, '$.audits.total-blocking-time.numericValue') AS FLOAT64), 1000)[OFFSET(percentile * 10)] AS tbt +FROM + `httparchive.all.pages`, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +WHERE + date = '2024-06-01' AND + is_root_page +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/js_bytes_rank.sql b/sql/2024/performance/js_bytes_rank.sql new file mode 100644 index 00000000000..ff5ec48cd86 --- /dev/null +++ b/sql/2024/performance/js_bytes_rank.sql @@ -0,0 +1,17 @@ +SELECT + IF(_rank < 100000000, CAST(_rank AS STRING), 'all') AS rank, + client, + APPROX_QUANTILES(CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64), 1000)[OFFSET(500)] / 1024 AS js_kbytes +FROM + `httparchive.all.pages`, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS _rank +WHERE + date = '2024-06-01' AND + is_root_page AND + rank <= _rank +GROUP BY + rank, + client +ORDER BY + rank, + client diff --git a/sql/2024/performance/lcp_bytes_distribution.sql b/sql/2024/performance/lcp_bytes_distribution.sql new file mode 100644 index 00000000000..ffb6a9490fe --- /dev/null +++ b/sql/2024/performance/lcp_bytes_distribution.sql @@ -0,0 +1,42 @@ +WITH pages AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +), + +requests AS ( + SELECT + client, + page, + url, + CAST(JSON_VALUE(summary, '$.respSize') AS INT64) / 1024 AS kbytes + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + percentile, + client, + APPROX_QUANTILES(kbytes, 1000)[OFFSET(percentile * 10)] AS kbytes +FROM + pages +JOIN + requests +USING + (client, page, url), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/lcp_bytes_histogram.sql b/sql/2024/performance/lcp_bytes_histogram.sql new file mode 100644 index 00000000000..aceee9411df --- /dev/null +++ b/sql/2024/performance/lcp_bytes_histogram.sql @@ -0,0 +1,43 @@ +WITH pages AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +), + +requests AS ( + SELECT + client, + page, + url, + CAST(JSON_VALUE(summary, '$.respSize') AS INT64) / 1024 AS kbytes + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + client, + IF(CEILING(kbytes / 100) * 100 < 1000, CAST(CEILING(kbytes / 100) * 100 AS STRING), '1000+') AS kbytes, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + pages +JOIN + requests +USING + (client, page, url) +GROUP BY + client, + kbytes +ORDER BY + client, + kbytes diff --git a/sql/2024/performance/lcp_element_data.sql b/sql/2024/performance/lcp_element_data.sql new file mode 100644 index 00000000000..bcef866c665 --- /dev/null +++ b/sql/2024/performance/lcp_element_data.sql @@ -0,0 +1,130 @@ +CREATE TEMP FUNCTION getLoadingAttr(attributes STRING) RETURNS STRING LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const loadingAttr = data.find(attr => attr["name"] === "loading") + return loadingAttr.value + } catch (e) { + return ""; + } +'''; + +CREATE TEMP FUNCTION 
getDecodingAttr(attributes STRING) RETURNS STRING LANGUAGE js AS '''
+  try {
+    const data = JSON.parse(attributes);
+    const decodingAttr = data.find(attr => attr["name"] === "decoding")
+    return decodingAttr.value
+  } catch (e) {
+    return "";
+  }
+''';
+
+CREATE TEMP FUNCTION getFetchPriorityAttr(attributes STRING) RETURNS STRING LANGUAGE js AS '''
+  try {
+    const data = JSON.parse(attributes);
+    const fetchPriorityAttr = data.find(attr => attr["name"] === "fetchpriority")
+    return fetchPriorityAttr.value
+  } catch (e) {
+    return "";
+  }
+''';
+
+CREATE TEMP FUNCTION getLoadingClasses(attributes STRING) RETURNS STRING LANGUAGE js AS '''
+  try {
+    const data = JSON.parse(attributes);
+    const classes = data.find(attr => attr["name"] === "class").value
+    if (classes.indexOf('lazyload') !== -1) {
+      return classes
+    } else {
+      return ""
+    }
+  } catch (e) {
+    return "";
+  }
+''';
+
+CREATE TEMPORARY FUNCTION getResourceHints(payload STRING)
+RETURNS STRUCT<preload BOOLEAN, prefetch BOOLEAN, preconnect BOOLEAN, prerender BOOLEAN, `dns-prefetch` BOOLEAN, modulepreload BOOLEAN>
+LANGUAGE js AS '''
+var hints = ['preload', 'prefetch', 'preconnect', 'prerender', 'dns-prefetch', 'modulepreload'];
+try {
+  var $ = JSON.parse(payload);
+  var almanac = JSON.parse($.almanac);
+  return hints.reduce((results, hint) => {
+    results[hint] = !!almanac['link-nodes'].nodes.find(link => link.rel.toLowerCase() == hint);
+    return results;
+  }, {});
+} catch (e) {
+  return hints.reduce((results, hint) => {
+    results[hint] = false;
+    return results;
+  }, {});
+}
+''';
+
+
+WITH lcp_stats AS (
+  SELECT
+    client,
+    page,
+    JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.nodeName') AS nodeName,
+    JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.url') AS elementUrl,
+    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.size') AS INT64) AS size,
+    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.loadTime') AS FLOAT64) AS loadTime,
+    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.startTime') AS FLOAT64) AS startTime,
+    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.renderTime') AS FLOAT64) AS renderTime,
+    JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes') AS attributes,
+    getLoadingAttr(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS loading,
+    getDecodingAttr(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS decoding,
+    getLoadingClasses(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS classWithLazyload,
+    getFetchPriorityAttr(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS fetchPriority,
+    getResourceHints(custom_metrics) AS hints
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01' AND
+    is_root_page
+)
+
+SELECT
+  client,
+  nodeName,
+  COUNT(DISTINCT page) AS pages,
+  ANY_VALUE(total) AS total,
+  COUNT(DISTINCT page) / ANY_VALUE(total) AS pct,
+  COUNTIF(elementUrl != '') AS haveImages,
+  COUNTIF(elementUrl != '') / COUNT(DISTINCT page) AS pct_haveImages,
+  COUNTIF(loading = 'eager') AS native_eagerload,
+  COUNTIF(loading = 'lazy') AS native_lazyload,
+  COUNTIF(classWithLazyload != '') AS lazyload_class,
+  COUNTIF(classWithLazyload != '' OR loading = 'lazy') AS probably_lazyLoaded,
+  COUNTIF(classWithLazyload != '' OR loading = 'lazy') / COUNT(DISTINCT page) AS pct_prob_lazyloaded,
+  COUNTIF(decoding = 'async') AS async_decoding,
+  COUNTIF(decoding = 'sync') AS sync_decoding,
+  COUNTIF(decoding = 'auto') AS auto_decoding,
+  COUNTIF(fetchPriority = 'low') AS priority_low,
COUNTIF(fetchPriority = 'high') AS priority_high, + COUNTIF(hints.preload) AS preload, + COUNTIF(hints.preload) / COUNT(0) AS pct_preload +FROM + lcp_stats +JOIN ( + SELECT + client, + COUNT(0) AS total + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page + GROUP BY + client +) +USING + (client) +GROUP BY + client, + nodeName +HAVING + pages > 1000 +ORDER BY + pct DESC diff --git a/sql/2024/performance/lcp_format.sql b/sql/2024/performance/lcp_format.sql new file mode 100644 index 00000000000..e9ddcc5707d --- /dev/null +++ b/sql/2024/performance/lcp_format.sql @@ -0,0 +1,42 @@ +WITH pages AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +), + +requests AS ( + SELECT + client, + page, + url, + JSON_VALUE(summary, '$.format') AS format + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + client, + format, + COUNT(0) AS freq, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + pages +JOIN + requests +USING + (client, page, url) +GROUP BY + client, + format +ORDER BY + pct DESC diff --git a/sql/2024/performance/lcp_host.sql b/sql/2024/performance/lcp_host.sql new file mode 100644 index 00000000000..19e98361b1b --- /dev/null +++ b/sql/2024/performance/lcp_host.sql @@ -0,0 +1,29 @@ +WITH lcp AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + client, + CASE + WHEN NET.HOST(url) = 'data' THEN 'other content' + WHEN NET.HOST(url) IS NULL THEN 'other content' + WHEN NET.HOST(page) = NET.HOST(url) THEN 'same host' + ELSE 'cross host' + END AS lcp_same_host, + COUNT(0) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + lcp +GROUP BY + client, + lcp_same_host diff --git a/sql/2024/performance/lcp_host_3p.sql b/sql/2024/performance/lcp_host_3p.sql new file mode 100644 index 00000000000..d33be5d46a1 --- /dev/null +++ b/sql/2024/performance/lcp_host_3p.sql @@ -0,0 +1,31 @@ +WITH lcp AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + client, + NET.REG_DOMAIN(url) AS lcp_domain, + COUNT(0) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + lcp +WHERE + NET.HOST(page) != NET.HOST(url) AND + NET.HOST(url) != 'data' +GROUP BY + client, + lcp_domain +ORDER BY + pct DESC +LIMIT + 25 diff --git a/sql/2024/performance/lcp_initiator_type.sql b/sql/2024/performance/lcp_initiator_type.sql new file mode 100644 index 00000000000..5ce46436508 --- /dev/null +++ b/sql/2024/performance/lcp_initiator_type.sql @@ -0,0 +1,43 @@ +WITH lcp AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_resource.initiator.url') AS url, + JSON_VALUE(custom_metrics, '$.performance.is_lcp_statically_discoverable') = 'false' AS not_discoverable + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +), + +requests AS ( + SELECT + client, + page, + url, + type + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' +) + + +SELECT + client, + 
IFNULL(type, 'unknown') AS lcp_initiator_type, + COUNTIF(not_discoverable) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNTIF(not_discoverable) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + lcp +LEFT JOIN + requests +USING + (client, page, url) +GROUP BY + client, + type +ORDER BY + pct DESC diff --git a/sql/2024/performance/lcp_lazy.sql b/sql/2024/performance/lcp_lazy.sql new file mode 100644 index 00000000000..cb96c62df7d --- /dev/null +++ b/sql/2024/performance/lcp_lazy.sql @@ -0,0 +1,47 @@ +CREATE TEMP FUNCTION isLazyLoaded(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const loadingAttr = data.find(attr => attr["name"] === "loading") + return loadingAttr.value == 'lazy' + } catch (e) { + return null; + } +'''; + +CREATE TEMP FUNCTION hasLazyHeuristics(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const classes = data.find(attr => attr["name"] === "class").value; + const hasLazyClasses = classes.indexOf('lazyload') !== -1; + const hasLazySrc = data.includes(attr => attr["name"] === "data-src"); + + return hasLazyClasses || hasLazySrc; + } catch (e) { + return false; + } +'''; + +WITH lcp_stats AS ( + SELECT + client, + isLazyLoaded(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS native_lazy, + hasLazyHeuristics(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS custom_lazy + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page AND + JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.nodeName') = 'IMG' +) + +SELECT + client, + COUNT(0) AS total, + COUNTIF(native_lazy) / COUNT(0) AS pct_native_lazy, + COUNTIF(custom_lazy) / COUNT(0) AS pct_custom_lazy, + COUNTIF(custom_lazy OR native_lazy) / COUNT(0) AS pct_either_lazy, + COUNTIF(custom_lazy AND native_lazy) / COUNT(0) AS pct_both_lazy +FROM + lcp_stats +GROUP BY + client diff --git a/sql/2024/performance/lcp_lazy_secondary_pages.sql b/sql/2024/performance/lcp_lazy_secondary_pages.sql new file mode 100644 index 00000000000..081934d1a7d --- /dev/null +++ b/sql/2024/performance/lcp_lazy_secondary_pages.sql @@ -0,0 +1,47 @@ +CREATE TEMP FUNCTION isLazyLoaded(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const loadingAttr = data.find(attr => attr["name"] === "loading") + return loadingAttr.value == 'lazy' + } catch (e) { + return null; + } +'''; + +CREATE TEMP FUNCTION hasLazyHeuristics(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const classes = data.find(attr => attr["name"] === "class").value; + const hasLazyClasses = classes.indexOf('lazyload') !== -1; + const hasLazySrc = data.includes(attr => attr["name"] === "data-src"); + + return hasLazyClasses || hasLazySrc; + } catch (e) { + return false; + } +'''; + +WITH lcp_stats AS ( + SELECT + client, + isLazyLoaded(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS native_lazy, + hasLazyHeuristics(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS custom_lazy + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + NOT is_root_page AND + JSON_EXTRACT_SCALAR(custom_metrics, '$.performance.lcp_elem_stats.nodeName') = 'IMG' +) + +SELECT + client, + COUNT(0) AS total, + COUNTIF(native_lazy) / COUNT(0) AS pct_native_lazy, + COUNTIF(custom_lazy) / COUNT(0) AS pct_custom_lazy, + 
COUNTIF(custom_lazy OR native_lazy) / COUNT(0) AS pct_either_lazy, + COUNTIF(custom_lazy AND native_lazy) / COUNT(0) AS pct_both_lazy +FROM + lcp_stats +GROUP BY + client diff --git a/sql/2024/performance/lcp_lazy_technologies.sql b/sql/2024/performance/lcp_lazy_technologies.sql new file mode 100644 index 00000000000..3567ac9fb1d --- /dev/null +++ b/sql/2024/performance/lcp_lazy_technologies.sql @@ -0,0 +1,75 @@ +CREATE TEMP FUNCTION isLazyLoaded(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const loadingAttr = data.find(attr => attr["name"] === "loading") + return loadingAttr.value == 'lazy' + } catch (e) { + return null; + } +'''; + +CREATE TEMP FUNCTION hasLazyHeuristics(attributes STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + const data = JSON.parse(attributes); + const classes = data.find(attr => attr["name"] === "class").value; + const hasLazyClasses = classes.indexOf('lazyload') !== -1; + const hasLazySrc = data.includes(attr => attr["name"] === "data-src"); + + return hasLazyClasses || hasLazySrc; + } catch (e) { + return false; + } +'''; + +WITH lazy_tech AS ( + SELECT + client, + page, + isLazyLoaded(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS native_lazy, + hasLazyHeuristics(JSON_EXTRACT(custom_metrics, '$.performance.lcp_elem_stats.attributes')) AS custom_lazy, + t.technology + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2024-06-01' AND + is_root_page +), + +tech_totals AS ( + SELECT + client, + technology, + COUNT(0) AS pages_per_technology + FROM + lazy_tech + GROUP BY + client, + technology +) + + +SELECT + client, + technology, + COUNTIF(native_lazy) AS native_lazy, + COUNTIF(custom_lazy) AS custom_lazy, + COUNTIF(native_lazy OR custom_lazy) AS either_lazy, + COUNT(0) AS pages, + COUNTIF(native_lazy) / COUNT(0) AS pct_native_lazy, + COUNTIF(custom_lazy) / COUNT(0) AS pct_custom_lazy, + COUNTIF(native_lazy OR custom_lazy) / COUNT(0) AS pct_either_lazy +FROM + lazy_tech +JOIN + tech_totals +USING + (client, technology) +GROUP BY + client, + technology +HAVING + pages > 1000 AND + pct_either_lazy > 0.1 +ORDER BY + either_lazy DESC diff --git a/sql/2024/performance/lcp_preload_discoverable.sql b/sql/2024/performance/lcp_preload_discoverable.sql new file mode 100644 index 00000000000..3dd5699973a --- /dev/null +++ b/sql/2024/performance/lcp_preload_discoverable.sql @@ -0,0 +1,28 @@ +WITH lcp AS ( + SELECT + client, + JSON_VALUE(custom_metrics, '$.performance.is_lcp_statically_discoverable') = 'true' AS discoverable, + JSON_VALUE(custom_metrics, '$.performance.is_lcp_preloaded') = 'true' AS preloaded + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + client, + discoverable, + preloaded, + COUNT(0) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + lcp +GROUP BY + client, + discoverable, + preloaded +ORDER BY + pct DESC diff --git a/sql/2024/performance/lcp_resource_load_delay.sql b/sql/2024/performance/lcp_resource_load_delay.sql new file mode 100644 index 00000000000..97b51cd44dd --- /dev/null +++ b/sql/2024/performance/lcp_resource_load_delay.sql @@ -0,0 +1,53 @@ +WITH pages AS ( + SELECT + client, + page, + JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url') AS url, + httparchive.core_web_vitals.GET_LAB_TTFB(payload) AS ttfb + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +), + 
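+# A rough illustration of the delay computed below (timings assumed for this
+# example, not taken from the dataset): with a lab TTFB of 800 ms and the LCP
+# image request created at 2,300 ms, lcp_resource_load_delay = 2300 - 800 =
+# 1500 ms, i.e. the time spent before the browser even started fetching the
+# LCP resource.
+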
+requests AS ( + SELECT + client, + page, + url, + CAST(JSON_QUERY(payload, '$._created') AS FLOAT64) AS lcp_req_time + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_root_page +), + +delays AS ( + SELECT + client, + CAST(lcp_req_time - ttfb AS INT64) AS lcp_resource_load_delay + FROM + pages + JOIN + requests + USING + (client, page, url) + WHERE + lcp_req_time > ttfb +) + +SELECT + percentile, + client, + APPROX_QUANTILES(lcp_resource_load_delay, 1000)[OFFSET(percentile * 10)] AS lcp_resource_load_delay +FROM + delays, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/lcp_resource_type.sql b/sql/2024/performance/lcp_resource_type.sql new file mode 100644 index 00000000000..6eec129ea52 --- /dev/null +++ b/sql/2024/performance/lcp_resource_type.sql @@ -0,0 +1,33 @@ +# We are unable to track LCP for video elements: https://issues.chromium.org/issues/364860066 + +WITH lcp AS ( + SELECT + client, + page, + # Parse anchors out of LCP URLs. + REGEXP_EXTRACT(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url'), r'([^#]*)') AS url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + + +SELECT + client, + CASE + WHEN lcp.url = '' THEN 'text' + WHEN STARTS_WITH(lcp.url, 'data:') THEN 'inline image' + ELSE 'image' + END AS lcp_type, + COUNT(0) AS pages, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + lcp +GROUP BY + client, + lcp_type +ORDER BY + pct DESC diff --git a/sql/2024/performance/lcp_responsive_data.sql b/sql/2024/performance/lcp_responsive_data.sql new file mode 100644 index 00000000000..db4e34ca564 --- /dev/null +++ b/sql/2024/performance/lcp_responsive_data.sql @@ -0,0 +1,59 @@ +CREATE TEMP FUNCTION checkResponsiveImages(responsivelist STRING, lcpImgUrl STRING, nodePath STRING) RETURNS BOOLEAN LANGUAGE js AS ''' + try { + //we will check lcp elment is img + const lastSegment = (nodePath.split(",").reverse())[0]; + let lastNodeImg = false + if(lastSegment == 'IMG'){ + lastNodeImg = true + } + if(lcpImgUrl != null && lastNodeImg){ + const listJson = JSON.parse(responsivelist); + if(listJson.length > 0){ + for(let i=0;i= 0.75) / COUNTIF(fast_lcp IS NOT NULL) AS pct_good_lcp, + COUNTIF(small_cls / (small_cls + medium_cls + large_cls) >= 0.75) / COUNTIF(small_cls IS NOT NULL) AS pct_good_cls +FROM + `chrome-ux-report.materialized.device_summary` +WHERE + date BETWEEN '2022-01-01' AND '2024-06-01' AND + device IN ('desktop', 'phone') +GROUP BY + date, + device +ORDER BY + date, + device diff --git a/sql/2024/performance/render_blocking_resources.sql b/sql/2024/performance/render_blocking_resources.sql new file mode 100644 index 00000000000..84b1c4081bd --- /dev/null +++ b/sql/2024/performance/render_blocking_resources.sql @@ -0,0 +1,14 @@ +SELECT + date, + client, + COUNTIF(SAFE_CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.audits.render-blocking-resources.score') AS FLOAT64) >= 0.9) AS is_passing, + COUNT(0) AS total, + COUNTIF(SAFE_CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.audits.render-blocking-resources.score') AS FLOAT64) >= 0.9) / COUNT(0) AS pct_passing +FROM + `httparchive.all.pages` +WHERE + date IN ('2022-06-01', '2023-06-01', '2024-06-01') AND + is_root_page +GROUP BY + date, + client diff --git a/sql/2024/performance/render_blocking_savings_fcp.sql b/sql/2024/performance/render_blocking_savings_fcp.sql new file mode 100644 index 00000000000..932db060017 
--- /dev/null +++ b/sql/2024/performance/render_blocking_savings_fcp.sql @@ -0,0 +1,26 @@ +WITH lh AS ( + SELECT + client, + page, + rank, + CAST(JSON_VALUE(lighthouse, '$.audits.render-blocking-resources.metricSavings.FCP') AS INT64) AS metricSavings_fcp + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + percentile, + client, + APPROX_QUANTILES(metricSavings_fcp, 1000)[OFFSET(percentile * 10)] AS metricSavings_fcp +FROM + lh, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/render_blocking_savings_lcp.sql b/sql/2024/performance/render_blocking_savings_lcp.sql new file mode 100644 index 00000000000..3ae2d2fe66c --- /dev/null +++ b/sql/2024/performance/render_blocking_savings_lcp.sql @@ -0,0 +1,41 @@ +WITH data AS ( + SELECT + client, + page, + rank, + CAST(JSON_VALUE(lighthouse, '$.audits.render-blocking-resources.metricSavings.LCP') AS INT64) AS metricSavings_lcp, + REGEXP_EXTRACT(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.url'), r'([^#]*)') AS lcp_url + FROM + `httparchive.all.pages` + WHERE + date = '2024-06-01' AND + is_root_page +) + +SELECT + percentile, + client, + APPROX_QUANTILES(metricSavings_lcp, 1000)[OFFSET(percentile * 10)] AS metricSavings_lcp, + lcp_type +FROM ( + SELECT + client, + metricSavings_lcp, + CASE + WHEN lcp_url = '' THEN 'text' + ELSE 'image' + END AS lcp_type + FROM + data +), +UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +WHERE + lcp_type = 'text' +GROUP BY + percentile, + lcp_type, + client +ORDER BY + percentile, + lcp_type, + client diff --git a/sql/2024/performance/resource_hints_usage.sql b/sql/2024/performance/resource_hints_usage.sql new file mode 100644 index 00000000000..90facfa568c --- /dev/null +++ b/sql/2024/performance/resource_hints_usage.sql @@ -0,0 +1,74 @@ +CREATE TEMPORARY FUNCTION getResourceHints(payload STRING) +RETURNS ARRAY < STRUCT < name STRING, href STRING >> +LANGUAGE js AS ''' +var hints = new Set(['preload', 'prefetch', 'preconnect', 'prerender', 'dns-prefetch']); +try { + var $ = JSON.parse(payload); + var almanac = JSON.parse($._almanac); + return almanac['link-nodes'].nodes.reduce((results, link) => { + var hint = link.rel.toLowerCase(); + if (!hints.has(hint)) { + return results; + } + results.push({ + name: hint, + href: link.href + }); + return results; + }, []); +} catch (e) { + return []; +} +'''; + +WITH resource_hints AS ( + SELECT DISTINCT + client, + page, + date, + hint.name AS name + FROM + `httparchive.all.pages` + LEFT JOIN + UNNEST(getResourceHints(payload)) AS hint + WHERE + (date = '2024-06-01' OR date = '2023-06-01' OR date = '2022-06-01') AND + is_root_page +), + +totals AS ( + SELECT + client, + date, + COUNT(0) AS total_pages + FROM + `httparchive.all.pages` + WHERE + (date = '2024-06-01' OR date = '2023-06-01' OR date = '2022-06-01') AND + is_root_page + GROUP BY + client, + date +) + +SELECT + client, + date, + name, + COUNT(DISTINCT page) AS pages, + ANY_VALUE(total_pages) AS total, + COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct +FROM + resource_hints +JOIN + totals +USING + (client, date) +GROUP BY + client, + date, + name +ORDER BY + client, + date, + name DESC diff --git a/sql/2024/performance/resource_hints_usage_2021.sql b/sql/2024/performance/resource_hints_usage_2021.sql new file mode 100644 index 00000000000..624b2d07c29 --- /dev/null +++ b/sql/2024/performance/resource_hints_usage_2021.sql @@ -0,0 +1,62 @@ +CREATE TEMPORARY 
FUNCTION getResourceHints(payload STRING) +RETURNS ARRAY < STRUCT < name STRING, href STRING >> +LANGUAGE js AS ''' +var hints = new Set(['preload', 'prefetch', 'preconnect', 'prerender', 'dns-prefetch']); +try { + var $ = JSON.parse(payload); + var almanac = JSON.parse($._almanac); + return almanac['link-nodes'].nodes.reduce((results, link) => { + var hint = link.rel.toLowerCase(); + if (!hints.has(hint)) { + return results; + } + results.push({ + name: hint, + href: link.href + }); + return results; + }, []); +} catch (e) { + return []; +} +'''; + +WITH resource_hints AS ( + SELECT DISTINCT + _TABLE_SUFFIX AS client, + url, + hint.name AS name + FROM + `httparchive.pages.2021_07_01_*` + LEFT JOIN + UNNEST(getResourceHints(payload)) AS hint +), + +totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + client +) + +SELECT + client, + name, + COUNT(DISTINCT url) AS pages, + ANY_VALUE(total_pages) AS total, + COUNT(DISTINCT url) / ANY_VALUE(total_pages) AS pct +FROM + resource_hints +JOIN + totals +USING + (client) +GROUP BY + client, + name +ORDER BY + client, + name diff --git a/sql/2024/performance/rtt_distribution.sql b/sql/2024/performance/rtt_distribution.sql new file mode 100644 index 00000000000..4aefe003198 --- /dev/null +++ b/sql/2024/performance/rtt_distribution.sql @@ -0,0 +1,16 @@ +SELECT + client, + percentile, + APPROX_QUANTILES(JSON_QUERY(payload, '$._CrUX.metrics.round_trip_time.percentiles.p75'), 1000)[OFFSET(percentile * 10)] AS rtt_p75 +FROM + `httparchive.all.pages`, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +WHERE + date = '2024-08-01' AND + is_root_page +GROUP BY + percentile, + client +ORDER BY + percentile, + client diff --git a/sql/2024/performance/viewport_meta_zoom_disable.sql b/sql/2024/performance/viewport_meta_zoom_disable.sql new file mode 100644 index 00000000000..18508fee717 --- /dev/null +++ b/sql/2024/performance/viewport_meta_zoom_disable.sql @@ -0,0 +1,12 @@ +SELECT + client, + COUNTIF(JSON_VALUE(lighthouse, '$.audits.viewport.score') = '0') AS viewport_failed, + COUNT(0) AS total, + COUNTIF(JSON_VALUE(lighthouse, '$.audits.viewport.score') = '0') / COUNT(0) AS pct_failed +FROM + `httparchive.all.pages` +WHERE + date = '2024-06-01' AND + is_root_page +GROUP BY + client diff --git a/sql/2024/performance/web_vitals_by_device.sql b/sql/2024/performance/web_vitals_by_device.sql new file mode 100644 index 00000000000..c589d0fa27a --- /dev/null +++ b/sql/2024/performance/web_vitals_by_device.sql @@ -0,0 +1,242 @@ +CREATE TEMP FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75 +); + +CREATE TEMP FUNCTION IS_NI (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(good, (good + needs_improvement + poor)) < 0.75 AND + SAFE_DIVIDE(poor, (good + needs_improvement + poor)) < 0.25 +); + +CREATE TEMP FUNCTION IS_POOR (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(poor, (good + needs_improvement + poor)) >= 0.25 +); + +CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + good + needs_improvement + poor > 0 +); + +WITH +base AS ( + SELECT + date, + origin, + device, + + fast_fid, + avg_fid, + slow_fid, + + fast_inp, + avg_inp, + slow_inp, + + fast_lcp, + avg_lcp, + slow_lcp, + + small_cls, + medium_cls, + large_cls, + + fast_fcp, + avg_fcp, + slow_fcp, + + 
fast_ttfb, + avg_ttfb, + slow_ttfb + + FROM + `chrome-ux-report.materialized.device_summary` + WHERE + device IN ('desktop', 'phone') AND + date IN ('2020-08-01', '2021-07-01', '2022-06-01', '2023-09-01', '2024-06-01') +) + +SELECT + date, + device, + + COUNT(DISTINCT origin) AS total_origins, + + # Good CWV with optional FID + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cwv23_good, + + # Good CWV with optional INP + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cwv24_good, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fcp, avg_fcp, 
slow_fcp), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL
+    ))
+  ) AS pct_fcp_good,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_NI(fast_fcp, avg_fcp, slow_fcp), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL
+    ))
+  ) AS pct_fcp_ni,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL
+    ))
+  ) AS pct_fcp_poor,
+
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    ))
+  ) AS pct_ttfb_good,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_NI(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    ))
+  ) AS pct_ttfb_ni,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    )),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL
+    ))
+  ) AS pct_ttfb_poor
+
+FROM
+  base
+GROUP BY
+  date,
+  device
diff --git a/sql/2024/performance/web_vitals_by_device_secondary_pages.sql b/sql/2024/performance/web_vitals_by_device_secondary_pages.sql
new file mode 100644
index 00000000000..e4da671ce39
--- /dev/null
+++ b/sql/2024/performance/web_vitals_by_device_secondary_pages.sql
@@ -0,0 +1,65 @@
+CREATE TEMPORARY FUNCTION getGoodCwv(payload STRING)
+RETURNS STRUCT<largest_contentful_paint BOOL, interaction_to_next_paint BOOL, cumulative_layout_shift BOOL, first_contentful_paint BOOL>
+LANGUAGE js AS '''
+try {
+  var $ = JSON.parse(payload);
+  var crux = $._CrUX;
+
+  if (crux) {
+    return Object.keys(crux.metrics).reduce((acc, n) => ({
+      ...acc,
+      [n]: crux.metrics[n].histogram[0].density >= 0.75
+    }), {})
+  }
+
+  return null;
+} catch (e) {
+  return null;
+}
+''';
+
+SELECT
+  client,
+  is_root_page,
+
+  COUNT(0) AS freq,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
+
+  COUNTIF(CrUX.largest_contentful_paint) AS lcp_good,
+  COUNTIF(CrUX.largest_contentful_paint IS NOT NULL) AS any_lcp,
+  COUNTIF(CrUX.largest_contentful_paint) / COUNTIF(CrUX.largest_contentful_paint IS NOT NULL) AS pct_lcp_good,
+
+  COUNTIF(CrUX.interaction_to_next_paint) AS inp_good,
+  COUNTIF(CrUX.interaction_to_next_paint IS NOT NULL) AS any_inp,
+  COUNTIF(CrUX.interaction_to_next_paint) / COUNTIF(CrUX.interaction_to_next_paint IS NOT NULL) AS pct_inp_good,
+
+  COUNTIF(CrUX.cumulative_layout_shift) AS cls_good,
+  COUNTIF(CrUX.cumulative_layout_shift IS NOT NULL) AS any_cls,
+  COUNTIF(CrUX.cumulative_layout_shift) / COUNTIF(CrUX.cumulative_layout_shift IS NOT NULL) AS pct_cls_good,
+
+  COUNTIF(CrUX.first_contentful_paint) AS fcp_good,
+  COUNTIF(CrUX.first_contentful_paint IS NOT NULL) AS any_fcp,
+  COUNTIF(CrUX.first_contentful_paint) / COUNTIF(CrUX.first_contentful_paint IS NOT NULL) AS pct_fcp_good,
+
+  COUNTIF(CrUX.largest_contentful_paint AND CrUX.interaction_to_next_paint IS NOT FALSE AND CrUX.cumulative_layout_shift) AS cwv_good,
+  COUNTIF(CrUX.largest_contentful_paint IS NOT NULL AND CrUX.cumulative_layout_shift IS NOT NULL) AS eligible_cwv,
+  COUNTIF(CrUX.largest_contentful_paint AND CrUX.interaction_to_next_paint IS NOT FALSE AND CrUX.cumulative_layout_shift) / COUNTIF(CrUX.largest_contentful_paint IS NOT NULL AND CrUX.cumulative_layout_shift IS NOT NULL) AS pct_cwv_good
+FROM (
+  SELECT
+    client,
+    getGoodCwv(payload) AS CrUX,
+    is_root_page
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+WHERE
+  CrUX IS NOT NULL
+GROUP BY
+  client,
+  is_root_page
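+# Reading getGoodCwv above (densities assumed for this example): a page whose
+# CrUX LCP histogram is [0.78, 0.12, 0.10] has histogram[0].density = 0.78,
+# which is >= 0.75, so its LCP is counted as good; [0.60, 0.25, 0.15] is not.
+# Metrics absent from CrUX stay NULL and drop out of the any_* denominators.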
+ORDER BY + client, + is_root_page diff --git a/sql/2024/performance/web_vitals_by_rank_and_device.sql b/sql/2024/performance/web_vitals_by_rank_and_device.sql new file mode 100644 index 00000000000..e669d240025 --- /dev/null +++ b/sql/2024/performance/web_vitals_by_rank_and_device.sql @@ -0,0 +1,250 @@ +CREATE TEMP FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75 +); + +CREATE TEMP FUNCTION IS_POOR (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(poor, (good + needs_improvement + poor)) >= 0.25 +); + +CREATE TEMP FUNCTION IS_NI (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + NOT IS_GOOD(good, needs_improvement, poor) AND + NOT IS_POOR(good, needs_improvement, poor) +); + +CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + good + needs_improvement + poor > 0 +); + +WITH +base AS ( + SELECT + date, + origin, + device, + rank, + + fast_fid, + avg_fid, + slow_fid, + + fast_inp, + avg_inp, + slow_inp, + + fast_lcp, + avg_lcp, + slow_lcp, + + small_cls, + medium_cls, + large_cls, + + fast_fcp, + avg_fcp, + slow_fcp, + + fast_ttfb, + avg_ttfb, + slow_ttfb + + FROM + `chrome-ux-report.materialized.device_summary` + WHERE + device IN ('desktop', 'phone') AND + date IN ('2022-06-01', '2023-09-01', '2024-06-01') +) + +SELECT + date, + device, + rank_grouping AS ranking, + + COUNT(DISTINCT origin) AS total_origins, + + # Good CWV with optional FID + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cwv23_good, + + # Good CWV with optional INP + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cwv24_good, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL + )) + ) AS pct_lcp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_fid, avg_fid, slow_fid), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL + )) + ) AS pct_fid_poor, + + SAFE_DIVIDE( + 
COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL + )) + ) AS pct_inp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(small_cls, medium_cls, large_cls), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL + )) + ) AS pct_cls_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )) + ) AS pct_fcp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )) + ) AS pct_fcp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL + )) + ) AS pct_fcp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )) + ) AS pct_ttfb_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )) + ) AS pct_ttfb_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL + )) + ) AS pct_ttfb_poor + +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + date, + device, + rank_grouping +ORDER BY + rank_grouping diff --git a/sql/2024/performance/web_vitals_by_technology.sql b/sql/2024/performance/web_vitals_by_technology.sql new file mode 100644 index 00000000000..3b601fab47d --- /dev/null +++ b/sql/2024/performance/web_vitals_by_technology.sql @@ -0,0 +1,215 @@ +CREATE TEMP FUNCTION IS_GOOD (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(good, (good + needs_improvement + poor)) >= 0.75 +); + +CREATE TEMP FUNCTION IS_POOR (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + SAFE_DIVIDE(poor, (good + needs_improvement + poor)) >= 0.25 +); + +CREATE TEMP FUNCTION IS_NI (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + NOT IS_GOOD(good, needs_improvement, poor) AND + NOT IS_POOR(good, needs_improvement, poor) +); + +CREATE TEMP FUNCTION IS_NON_ZERO (good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( + good + needs_improvement + poor > 
0 +); + +WITH base AS ( + SELECT + date, + origin, + CONCAT(origin, '/') AS page, + CASE + WHEN device = 'phone' THEN 'mobile' + ELSE device + END AS client, + + fast_fid, + avg_fid, + slow_fid, + + fast_inp, + avg_inp, + slow_inp, + + fast_lcp, + avg_lcp, + slow_lcp, + + small_cls, + medium_cls, + large_cls, + + fast_fcp, + avg_fcp, + slow_fcp, + + fast_ttfb, + avg_ttfb, + slow_ttfb + FROM + `chrome-ux-report.materialized.device_summary` + WHERE + device IN ('desktop', 'phone') AND + date IN ('2024-06-01') +), + +tech AS ( + SELECT + client, + t.categories, + t.technology, + page + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2024-06-01' AND + is_root_page AND + EXISTS ( + SELECT 1 + FROM UNNEST(t.categories) AS category + WHERE category IN ('CMS', 'Ecommerce', 'JavaScript frameworks') + ) +) + +SELECT + date, + client, + categories, + technology, + + COUNT(DISTINCT origin) AS total_origins, + + # Good CWV with optional FID + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cwv_good, + + # Good CWV with optional INP (hypothetical!) + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp) IS NOT FALSE AND + IS_GOOD(fast_lcp, avg_lcp, slow_lcp) AND + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp) AND + IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cwv_inp_good, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_lcp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_lcp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_lcp, avg_lcp, slow_lcp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_lcp, avg_lcp, slow_lcp), origin, NULL))) AS pct_lcp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))) AS pct_fid_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_fid, avg_fid, slow_fid), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))) AS pct_fid_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_fid, avg_fid, slow_fid), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL))) AS pct_fid_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(fast_inp, avg_inp, slow_inp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))) AS pct_inp_good, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_NI(fast_inp, avg_inp, slow_inp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))) AS pct_inp_ni, + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_POOR(fast_inp, avg_inp, slow_inp), origin, NULL)), + COUNT(DISTINCT IF( + IS_NON_ZERO(fast_inp, avg_inp, slow_inp), origin, NULL))) AS pct_inp_poor, + + SAFE_DIVIDE( + COUNT(DISTINCT IF( + IS_GOOD(small_cls, medium_cls, large_cls), origin, NULL)), + COUNT(DISTINCT IF( + 
IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cls_good,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_NI(small_cls, medium_cls, large_cls), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cls_ni,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_POOR(small_cls, medium_cls, large_cls), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(small_cls, medium_cls, large_cls), origin, NULL))) AS pct_cls_poor,
+
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_GOOD(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))) AS pct_fcp_good,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_NI(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))) AS pct_fcp_ni,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_POOR(fast_fcp, avg_fcp, slow_fcp), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_fcp, avg_fcp, slow_fcp), origin, NULL))) AS pct_fcp_poor,
+
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_GOOD(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))) AS pct_ttfb_good,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_NI(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))) AS pct_ttfb_ni,
+  SAFE_DIVIDE(
+    COUNT(DISTINCT IF(
+      IS_POOR(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL)),
+    COUNT(DISTINCT IF(
+      IS_NON_ZERO(fast_ttfb, avg_ttfb, slow_ttfb), origin, NULL))) AS pct_ttfb_poor
+
+FROM
+  base
+JOIN
+  tech
+USING
+  (client, page)
+GROUP BY
+  date,
+  client,
+  categories,
+  technology
+HAVING
+  total_origins >= 1000
+ORDER BY
+  total_origins DESC

From ad562c6e4fd3008e38f33431be475b48f67ffe6e Mon Sep 17 00:00:00 2001
From: Mike Gifford
Date: Wed, 30 Oct 2024 11:31:09 -0400
Subject: [PATCH 10/15] Accessibility 2024 Chapter (#3809)

* Update accessibility.md - In Progress Initial commit to ensure I capture what I have done so far. Working on the alt text now.
* Update accessibility.md removing the encapsulated images.
* Update accessibility.md Adding an example chart
* Update accessibility.md removing legacy link
* Update accessibility.md Updating overlay content
* Update accessibility.md Highlighted numbers
* Update accessibility.md Filling out all of the remaining big numbers
* Update accessibility.md adding in more images and setting defaults for others
* Update accessibility.md Filling in more content. Up to user preferences.
* Update accessibility.md Adding more graphs
* Update accessibility.md Up to alt-attribute-lengths
* Update accessibility.md Getting to the States
* Update accessibility.md All of the images and alt text.
* Update contributors.json Adding myself to the contributors.json file along with the article.
* Update contributors.json Adding other contributors.
* Update accessibility.md Making some minor text changes and filling in the featured metadata.
* Update accessibility.md removing trailing slashes.
* Update accessibility.md Updating authors, reviewers and analysts.
* Update accessibility.md Removing remaining "\." and fixing white spaces.
* Update contributors.json fixing a linting error
* Update contributors.json Removing commas for linter
* Update contributors.json Adding Jonathan's website.
* Update accessibility.md (#3814) Minor typos * Fixes to allow chapter to generate * Add images * Optimised images with calibre/image-actions * Markup fixes * Smart quotes * Table markup * Add image * Optimised images with calibre/image-actions * Fix links * Formatting * Formatting * Accessible images * Optimised images with calibre/image-actions * Retake images * Map images * Final edits * Final, final edits --------- Co-authored-by: Jonathan Pagel <63317370+jcmpagel@users.noreply.github.com> Co-authored-by: Barry Pollard Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/config/2024.json | 3 +- src/config/contributors.json | 61 + src/content/en/2024/accessibility.md | 1122 ++++++++++++++++- src/server/config.py | 2 +- .../US-state-governments-map.png | Bin 0 -> 29325 bytes .../accessibility/US-state-governments.png | Bin 0 -> 46001 bytes .../accessibility/a11y-app-usage-by-rank.png | Bin 0 -> 21318 bytes .../accessible-governments-world.png | Bin 0 -> 60415 bytes .../accessibility/accessible-governments.png | Bin 0 -> 34717 bytes .../accessibility/alt-attribute-lengths.png | Bin 0 -> 21802 bytes .../accessibility/button-name-sources.png | Bin 0 -> 27041 bytes ...olor-contrast-2019-2020-2021-2022-2024.png | Bin 0 -> 19716 bytes .../common-file-extensions-in-alt-text.png | Bin 0 -> 19679 bytes .../2024/accessibility/country-by-geoid.png | Bin 0 -> 46041 bytes .../accessibility/country-by-tld-globe.png | Bin 0 -> 60163 bytes .../2024/accessibility/country-by-tld.png | Bin 0 -> 60222 bytes .../2024/accessibility/font-unit-usage.png | Bin 0 -> 18260 bytes .../accessibility/form-input-name-sources.png | Bin 0 -> 27876 bytes .../accessibility/form-required-controls.png | Bin 0 -> 23076 bytes .../accessibility/javascript-frontend-ui.png | Bin 0 -> 32546 bytes .../javascript-meta-frameworks.png | Bin 0 -> 31833 bytes .../lighthouse-audit-markup-improvements.png | Bin 0 -> 29660 bytes .../lighthouse-audit-median-score-yoy.png | Bin 0 -> 21101 bytes .../microsoft-inclusivity-gudelines.png | Bin 0 -> 19869 bytes .../accessibility/page_title-information.png | Bin 0 -> 22050 bytes .../pages-overriding-focus-styles.png | Bin 0 -> 21426 bytes .../pages-using-a11y-apps-by-rank.png | Bin 0 -> 23591 bytes .../accessibility/pages-using-a11y-apps.png | Bin 0 -> 22350 bytes .../pages-with-element-role-yty.png | Bin 0 -> 18489 bytes .../pages-zooming-scaling-disabled.png | Bin 0 -> 22908 bytes .../placeholder-but-no-label.png | Bin 0 -> 22662 bytes .../2024/accessibility/platform-cms.png | Bin 0 -> 32931 bytes .../tabindex-usage-and-values.png | Bin 0 -> 20211 bytes .../2024/accessibility/top-10-aria-roles.png | Bin 0 -> 42942 bytes .../accessibility/top10-aria-attributes.png | Bin 0 -> 47240 bytes .../2024/accessibility/traditional-cms.png | Bin 0 -> 30494 bytes .../userpreference-media-queries.png | Bin 0 -> 27889 bytes src/tools/test/test_status_codes.js | 2 +- 38 files changed, 1175 insertions(+), 15 deletions(-) create mode 100644 src/static/images/2024/accessibility/US-state-governments-map.png create mode 100644 src/static/images/2024/accessibility/US-state-governments.png create mode 100644 src/static/images/2024/accessibility/a11y-app-usage-by-rank.png create mode 100644 src/static/images/2024/accessibility/accessible-governments-world.png create mode 100644 src/static/images/2024/accessibility/accessible-governments.png create mode 100644 src/static/images/2024/accessibility/alt-attribute-lengths.png create mode 100644 
src/static/images/2024/accessibility/button-name-sources.png create mode 100644 src/static/images/2024/accessibility/color-contrast-2019-2020-2021-2022-2024.png create mode 100644 src/static/images/2024/accessibility/common-file-extensions-in-alt-text.png create mode 100644 src/static/images/2024/accessibility/country-by-geoid.png create mode 100644 src/static/images/2024/accessibility/country-by-tld-globe.png create mode 100644 src/static/images/2024/accessibility/country-by-tld.png create mode 100644 src/static/images/2024/accessibility/font-unit-usage.png create mode 100644 src/static/images/2024/accessibility/form-input-name-sources.png create mode 100644 src/static/images/2024/accessibility/form-required-controls.png create mode 100644 src/static/images/2024/accessibility/javascript-frontend-ui.png create mode 100644 src/static/images/2024/accessibility/javascript-meta-frameworks.png create mode 100644 src/static/images/2024/accessibility/lighthouse-audit-markup-improvements.png create mode 100644 src/static/images/2024/accessibility/lighthouse-audit-median-score-yoy.png create mode 100644 src/static/images/2024/accessibility/microsoft-inclusivity-gudelines.png create mode 100644 src/static/images/2024/accessibility/page_title-information.png create mode 100644 src/static/images/2024/accessibility/pages-overriding-focus-styles.png create mode 100644 src/static/images/2024/accessibility/pages-using-a11y-apps-by-rank.png create mode 100644 src/static/images/2024/accessibility/pages-using-a11y-apps.png create mode 100644 src/static/images/2024/accessibility/pages-with-element-role-yty.png create mode 100644 src/static/images/2024/accessibility/pages-zooming-scaling-disabled.png create mode 100644 src/static/images/2024/accessibility/placeholder-but-no-label.png create mode 100644 src/static/images/2024/accessibility/platform-cms.png create mode 100644 src/static/images/2024/accessibility/tabindex-usage-and-values.png create mode 100644 src/static/images/2024/accessibility/top-10-aria-roles.png create mode 100644 src/static/images/2024/accessibility/top10-aria-attributes.png create mode 100644 src/static/images/2024/accessibility/traditional-cms.png create mode 100644 src/static/images/2024/accessibility/userpreference-media-queries.png diff --git a/src/config/2024.json b/src/config/2024.json index 0d4659e1eda..a1abf227497 100644 --- a/src/config/2024.json +++ b/src/config/2024.json @@ -86,8 +86,7 @@ "part": "II", "chapter_number": "10", "title": "Accessibility", - "slug": "accessibility", - "todo": true + "slug": "accessibility" }, { "part": "II", diff --git a/src/config/contributors.json b/src/config/contributors.json index ec56a948c3d..291fb9526d0 100644 --- a/src/config/contributors.json +++ b/src/config/contributors.json @@ -571,6 +571,16 @@ "twitter": "LoukilAymen", "website": "http://www.aymen-loukil.com/en/" }, + "b_atish": { + "name": "Beatriz González Mellídez", + "teams": { + "2024": [ + "reviewers" + ] + }, + "twitter": "b_atish", + "website": "https://medium.com/@b_atish" + }, "tunetheweb": { "avatar_url": "10931297", "github": "tunetheweb", @@ -1638,6 +1648,18 @@ }, "twitter": "HenriHelvetica" }, + "hidde": { + "avatar_url": "178782", + "github": "hidde", + "name": "Hidde de Vries", + "teams": { + "2024": [ + "reviewers" + ] + }, + "mastodon": "@hdv@front-end.social", + "website": "https://hidde.blog/" + }, "housseindjirdeh": { "avatar_url": "12476932", "github": "housseindjirdeh", @@ -2022,6 +2044,29 @@ "twitter": "jtteag", "website": "https://gemservers.com" }, + 
"JonathanAvila": { + "avatar_url": "5640755", + "github": "mraccess77", + "name": "Jonathan Avila", + "linkedin": "jonathan-avila-cpwa-2a964a7", + "teams": { + "2024": [ + "editors" + ] + } + }, + "JonathanPagel": { + "avatar_url": "63317370", + "github": "jcmpagel", + "linkedin": "jonathan-pagel", + "name": "Jonathan Pagel", + "teams": { + "2024": [ + "editors" + ] + }, + "website": "https://jonathanpagel.com" + }, "sirjonathan": { "avatar_url": "104149", "github": "sirjonathan", @@ -2708,6 +2753,19 @@ ] } }, + "mgifford": { + "avatar_url": "116832", + "github": "mgifford", + "linkedin": "mgifford", + "mastodon": "https://mastodon.social/@mgifford", + "name": "Mike Gifford", + "teams": { + "2024": [ + "authors", + "analysts" + ] + } + }, "mikegeyser": { "avatar_url": "105242", "github": "mikegeyser", @@ -3620,6 +3678,9 @@ "designers", "editors", "reviewers" + ], + "2024": [ + "editors" ] } }, diff --git a/src/content/en/2024/accessibility.md b/src/content/en/2024/accessibility.md index 09a45973ede..50b59a61795 100644 --- a/src/content/en/2024/accessibility.md +++ b/src/content/en/2024/accessibility.md @@ -2,17 +2,1117 @@ #See https://github.com/HTTPArchive/almanac.httparchive.org/wiki/Authors'-Guide#metadata-to-add-at-the-top-of-your-chapters title: Accessibility description: Accessibility chapter of the 2024 Web Almanac covering ease of reading, navigation, forms, media, ARIA, and accessibility apps. -authors: [] -reviewers: [] -editors: [] -analysts: [] +authors: [mgifford] +reviewers: [hidde, b_atish] +analysts: [mgifford] +editors: [JonathanPagel, JonathanAvila, shantsis] translators: [] +discuss: results: https://docs.google.com/spreadsheets/d/1btB1r9QpdgTyToPhn7glcGAdMFs7eq4UcQSVIHBqiYQ/ -featured_quote: -featured_stat_1: -featured_stat_label_1: -featured_stat_2: -featured_stat_label_2: -featured_stat_3: -featured_stat_label_3: +mgifford_bio: Mike Gifford is CivicActions' Open Standards & Practices Lead. He is also a thought leader on open government, digital accessibility and sustainability. He has served as a Drupal Core Accessibility Maintainer and also a W3C Invited Expert. He is a recognized authoring tool accessibility expert and contributor to the W3C's Draft Web Sustainability Guidelines (WSG) 1.0. +featured_quote: Most modern governments have committed to either WCAG 2.0 AA or WCAG 2.1 AA. It is clear that the implementation of these policies isn't being equally delivered. +featured_stat_1: 40% +featured_stat_label_1: Of desktop sites and 39% of mobile sites have at least one `role="presentation"`. +featured_stat_2: 0.1% +featured_stat_label_2: Sites with `