Skip to content

Commit

Permalink
Merge branch 'main' into http2024-markdown
Browse files Browse the repository at this point in the history
  • Loading branch information
tunetheweb committed Nov 10, 2024
2 parents a0220da + ef9e375 commit d551cf2
Show file tree
Hide file tree
Showing 376 changed files with 19,444 additions and 1,809 deletions.
20 changes: 0 additions & 20 deletions sql/2024/caching/README.md

This file was deleted.

20 changes: 0 additions & 20 deletions sql/2024/capabilities/README.md

This file was deleted.

60 changes: 0 additions & 60 deletions sql/2024/capabilities/fugu.sql

This file was deleted.

30 changes: 0 additions & 30 deletions sql/2024/capabilities/top.sql

This file was deleted.

16 changes: 10 additions & 6 deletions sql/2024/mobile-web/README.md → sql/2024/cdn/README copy.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
# 2024 Mobile Web queries

# 2024 CDN queries

<!--
This directory contains all of the 2024 Mobile Web chapter queries.
This directory contains all of the 2024 CDN chapter queries.
Each query should have a corresponding `metric_name.sql` file.
Note that readers are linked to this directory, so try to make the SQL file names descriptive for easy browsing.
Analysts: if helpful, you can use this README to give additional info about the queries.
-->

Query updates:
- Dates have been updated




## Resources

- [📄 Planning doc][~google-doc]
- [📊 Results sheet][~google-sheets]
- [📝 Markdown file][~chapter-markdown]

[~google-doc]: https://docs.google.com/document/d/1EfA723C8h9tTojvwCJ8dqPITJPvzBOosyqGu3_WGD5A/edit
[~google-sheets]: https://docs.google.com/spreadsheets/d/183HhK6E_kygGbIpOVGIGsQvGzLBQSzjvRzabVC6e2-4/edit#gid=1778117656
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/mobile-web.md
[~google-doc]: https://docs.google.com/document/d/11Yz8S-e3ltbYQPdzKX1E3oexfA2PwWLdA5tToDv98BI/edit
[~google-sheets]: https://docs.google.com/spreadsheets/d/15YXQQjyoQ0Bnfw9KNSz_YuGDiCfW978_WKEHvDXjdm4/edit#gid=745368492
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cdn.md
43 changes: 43 additions & 0 deletions sql/2024/cdn/cdn_usage_by_site_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#standardSQL
# cdn_usage_by_site_rank.sql : Share of main-document (HTML) requests served by a CDN vs. the origin, per site-rank bucket

-- One row per main-document request, labelled CDN or ORIGIN and carrying the rank of its page.
-- For a quicker trial run, swap the two tables for `httparchive.sample_data.requests_1k`
-- and `httparchive.sample_data.pages_1k`.
WITH main_documents AS (
  SELECT
    client,
    rank,
    -- `_cdn_provider` is read from the request `summary` JSON; only the text before the
    -- first comma is considered. A missing or empty provider is reported as 'ORIGIN'.
    IF(
      IFNULL(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(resp.summary, '$._cdn_provider'), r'^([^,]*).*'), '') = '',
      'ORIGIN',
      'CDN'
    ) AS cdn
  FROM
    `httparchive.all.requests` AS resp
  INNER JOIN
    `httparchive.all.pages`
  USING (page, client, date)
  WHERE
    is_main_document AND -- only the main document request of each page
    date = '2024-06-01'
)

SELECT
  client,
  nested_rank,
  cdn,
  COUNT(*) AS num_requests,
  SUM(COUNT(*)) OVER rank_bucket AS total,
  COUNT(*) / SUM(COUNT(*)) OVER rank_bucket AS pct_requests
FROM
  main_documents
CROSS JOIN
  -- Rank buckets are cumulative: a request is counted in every bucket its rank fits into.
  -- Note the extra (largest) rank bucket compared with the 2022 query.
  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS nested_rank
WHERE
  rank <= nested_rank
GROUP BY
  client,
  nested_rank,
  cdn
WINDOW
  rank_bucket AS (PARTITION BY client, nested_rank)
ORDER BY
  client,
  nested_rank,
  cdn
16 changes: 16 additions & 0 deletions sql/2024/cdn/client_hints_cdn_vs_origin.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#standardSQL
# client_hints_cdn_vs_origin.sql : Requests whose response carries an Accept-CH (Client Hints) header, CDN vs origin
#
# Emits up to two rows per (client, cdn): one with metric = 'Total' (all requests) and one with
# metric = 'ClientHints' (requests with an Accept-CH response header). UNION ALL takes its column
# names from the first branch, so the count column is called `Total` for both kinds of row;
# use `metric` to tell them apart.

SELECT
  COUNT(0) AS Total,
  client,
  -- A missing or empty `_cdn_provider` (only the text before the first comma is considered) means ORIGIN
  IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), '') = '', 'ORIGIN', 'CDN') AS cdn,
  'Total' AS metric
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01'
GROUP BY
  cdn,
  client
UNION ALL
SELECT
  COUNT(0) AS ClientHints,
  client,
  IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), '') = '', 'ORIGIN', 'CDN') AS cdn,
  'ClientHints' AS metric
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01' AND
  -- EXISTS counts each request once even if the header is repeated, keeping the figure
  -- comparable with the per-request totals above. Header names are case-insensitive.
  EXISTS (
    SELECT 1
    FROM UNNEST(response_headers) AS header
    WHERE
      LOWER(header.name) = 'accept-ch' AND
      header.value IS NOT NULL
  )
GROUP BY
  cdn,
  client
35 changes: 35 additions & 0 deletions sql/2024/cdn/distribution_of_compression_types_by_cdn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#standardSQL
# distribution_of_compression_types_by_cdn.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources served by CDNs
#
# Only response headers named Content-Encoding are counted (one row per such header), so
# `total_compressed` and `pct` are relative to those rows within each client and CDN.

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM (
  SELECT
    client,
    # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
    # Content-coding names are case-insensitive, so normalise before bucketing.
    # A NULL value falls through to 'other'.
    CASE LOWER(a.value)
      WHEN 'gzip' THEN 'Gzip'
      WHEN 'br' THEN 'Brotli'
      WHEN '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS a
  WHERE
    date = '2024-06-01' AND
    # Header field names are case-insensitive; match regardless of how the server cased it.
    LOWER(a.name) = 'content-encoding'
)
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
35 changes: 35 additions & 0 deletions sql/2024/cdn/distribution_of_compression_types_cdn_vs_origin.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#standardSQL
# distribution_of_compression_types_cdn_vs_origin.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources, CDN vs origin
#
# Only response headers named Content-Encoding are counted (one row per such header), so
# `total_compressed` and `pct` are relative to those rows within each client and CDN/ORIGIN group.

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM (
  SELECT
    client,
    # A missing or empty `_cdn_provider` (only the text before the first comma is considered) means ORIGIN; anything else is CDN
    IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn,
    # Content-coding names are case-insensitive, so normalise before bucketing.
    # A NULL value falls through to 'other'.
    CASE LOWER(a.value)
      WHEN 'gzip' THEN 'Gzip'
      WHEN 'br' THEN 'Brotli'
      WHEN '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS a
  WHERE
    date = '2024-06-01' AND
    # Header field names are case-insensitive; match regardless of how the server cased it.
    LOWER(a.name) = 'content-encoding'
)
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
77 changes: 77 additions & 0 deletions sql/2024/cdn/distribution_of_http_versions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#standardSQL
# distribution_of_http_versions: Percentage of HTTPS responses by protocol
#
# Output: one row per (client, cdn, is_main_document) with request counts and shares per
# HTTP version, plus the count/share of requests considered to be served over TLS.
# Requests whose protocol cannot be determined are included in `total` but in no version bucket.

WITH request_details AS (
  # Per-request fields pulled out of the HAR payload for the crawl date.
  SELECT
    client,
    page,
    is_main_document,
    # WPT is inconsistent with protocol population, so fall back from `_protocol`
    # to `_tls_next_proto` and finally to `response.httpVersion`.
    UPPER(COALESCE(
      JSON_EXTRACT_SCALAR(payload, '$._protocol'),
      NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'),
      NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')
    )) AS protocol,
    JSON_EXTRACT_SCALAR(payload, '$._tls_version') AS tls_version,

    # WPT joins CDN detection but we bias to the DNS detection which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,

    # isSecure reports what the browser thought it was going to use, but it can get upgraded with STS OR UpgradeInsecure: 1
    IFNULL(
      STARTS_WITH(url, 'https') OR
      JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL OR
      CAST(JSON_EXTRACT(payload, '$._is_secure') AS INT64) = 1,
      FALSE
    ) AS is_secure,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,

    # TRUE when the response carries a non-empty Location header (names are case-insensitive).
    EXISTS (
      SELECT 1
      FROM UNNEST(response_headers) AS header
      WHERE
        LOWER(header.name) = 'location' AND
        header.value IS NOT NULL AND
        header.value != ''
    ) AS is_redirect
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
),

socket_protocols AS (
  # One known protocol per (client, page, socket), taken from requests on that socket that
  # report both a TLS version and a protocol. Used to fill in requests with no protocol.
  SELECT
    client,
    page,
    socket,
    ANY_VALUE(protocol) AS protocol
  FROM
    request_details
  WHERE
    tls_version IS NOT NULL AND
    protocol IS NOT NULL AND
    socket IS NOT NULL
  GROUP BY
    client,
    page,
    socket
),

resolved_requests AS (
  SELECT
    r.client,
    r.cdn,
    r.is_main_document,
    r.is_secure,
    IFNULL(r.protocol, s.protocol) AS protocol
  FROM
    request_details AS r
  LEFT JOIN
    socket_protocols AS s
  ON
    r.client = s.client AND
    r.page = s.page AND
    r.socket = s.socket
  WHERE
    # WPT changes the response fields based on a redirect (url becomes the Location path instead of the original)
    # causing inconsistencies in the counts, so we ignore them. The exclusion is a per-request flag rather than a
    # join against UNNEST(response_headers), which would drop every response that has no Location header at all.
    NOT r.is_redirect
),

classified_requests AS (
  SELECT
    client,
    cdn,
    is_main_document,
    is_secure,
    # Normalise protocol tokens to one label per HTTP version; NULL stays NULL (unknown).
    CASE
      WHEN protocol IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1') THEN protocol
      WHEN protocol IN ('HTTP/2', 'H2') THEN 'HTTP/2'
      # Covers the final 'h3' token as well as draft tokens such as H3-29 and H3-Q050.
      WHEN protocol = 'HTTP/3' OR STARTS_WITH(protocol, 'H3') THEN 'HTTP/3'
      WHEN protocol IS NOT NULL THEN 'other'
    END AS http_version
  FROM
    resolved_requests
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNTIF(http_version = 'HTTP/0.9') AS http09,
  COUNTIF(http_version = 'HTTP/1.0') AS http10,
  COUNTIF(http_version = 'HTTP/1.1') AS http11,
  COUNTIF(http_version = 'HTTP/2') AS http2,
  COUNTIF(http_version = 'HTTP/3') AS http3,
  COUNTIF(http_version = 'other') AS http_other,
  COUNTIF(is_secure OR http_version = 'HTTP/2') AS tls_total,
  COUNTIF(http_version = 'HTTP/0.9') / COUNT(0) AS http09_pct,
  COUNTIF(http_version = 'HTTP/1.0') / COUNT(0) AS http10_pct,
  COUNTIF(http_version = 'HTTP/1.1') / COUNT(0) AS http11_pct,
  COUNTIF(http_version = 'HTTP/2') / COUNT(0) AS http2_pct,
  COUNTIF(http_version = 'HTTP/3') / COUNT(0) AS http3_pct,
  COUNTIF(http_version = 'other') / COUNT(0) AS http_other_pct,
  COUNTIF(is_secure OR http_version = 'HTTP/2') / COUNT(0) AS tls_pct,
  COUNT(0) AS total
FROM
  classified_requests
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  client DESC,
  total DESC
Loading

0 comments on commit d551cf2

Please sign in to comment.