-
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into http2024-markdown
- Loading branch information
Showing
376 changed files
with
19,444 additions
and
1,809 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
16 changes: 10 additions & 6 deletions
16
sql/2024/mobile-web/README.md → sql/2024/cdn/README copy.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,26 @@ | ||
# 2024 Mobile Web queries | ||
|
||
# 2024 CDN queries | ||
|
||
<!-- | ||
This directory contains all of the 2024 Mobile Web chapter queries. | ||
This directory contains all of the 2024 CDN chapter queries. | ||
Each query should have a corresponding `metric_name.sql` file. | ||
Note that readers are linked to this directory, so try to make the SQL file names descriptive for easy browsing. | ||
Analysts: if helpful, you can use this README to give additional info about the queries. | ||
--> | ||
|
||
Query updates: | ||
- Dates have been updated | ||
|
||
|
||
|
||
|
||
## Resources | ||
|
||
- [📄 Planning doc][~google-doc] | ||
- [📊 Results sheet][~google-sheets] | ||
- [📝 Markdown file][~chapter-markdown] | ||
|
||
[~google-doc]: https://docs.google.com/document/d/1EfA723C8h9tTojvwCJ8dqPITJPvzBOosyqGu3_WGD5A/edit | ||
[~google-sheets]: https://docs.google.com/spreadsheets/d/183HhK6E_kygGbIpOVGIGsQvGzLBQSzjvRzabVC6e2-4/edit#gid=1778117656 | ||
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/mobile-web.md | ||
[~google-doc]: https://docs.google.com/document/d/11Yz8S-e3ltbYQPdzKX1E3oexfA2PwWLdA5tToDv98BI/edit | ||
[~google-sheets]: https://docs.google.com/spreadsheets/d/15YXQQjyoQ0Bnfw9KNSz_YuGDiCfW978_WKEHvDXjdm4/edit#gid=745368492 | ||
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cdn.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#standardSQL
# cdn_usage_by_site_rank.sql : Distribution of HTML pages served by CDN vs Origin by rank

WITH main_documents AS (
  -- One row per main-document request of the 2024-06-01 crawl, labelled
  -- 'CDN' or 'ORIGIN' and carrying the rank of the page it belongs to.
  SELECT
    client,
    rank,
    -- _cdn_provider lives in the requests `summary` JSON. It may list several
    -- comma-separated providers (only the first entry is inspected) and is an
    -- empty string, rather than 'ORIGIN', when no CDN was detected.
    CASE
      WHEN COALESCE(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(resp.summary, '$._cdn_provider'), r'^([^,]*).*'), '') = '' THEN 'ORIGIN'
      ELSE 'CDN'
    END AS cdn
  FROM
    -- For a quicker trial run, swap in `httparchive.sample_data.requests_1k`
    -- and `httparchive.sample_data.pages_1k`.
    `httparchive.all.requests` AS resp
  INNER JOIN
    `httparchive.all.pages`
  USING (page, client, date)
  WHERE
    is_main_document AND -- new name for firstHtml
    date = '2024-06-01'
)

SELECT
  client,
  nested_rank,
  cdn,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, nested_rank) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, nested_rank) AS pct_requests
FROM
  main_documents
CROSS JOIN
  -- Cumulative rank buckets: a page is counted in every bucket it fits in.
  -- Note extra rank since 2022.
  UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS nested_rank
WHERE
  rank <= nested_rank
GROUP BY
  client,
  cdn,
  nested_rank
ORDER BY
  client,
  nested_rank,
  cdn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#standardSQL
# Requests served by CDN vs origin: the overall request count, and the count of
# requests whose response carries a non-NULL Accept-CH (client hints) header.
#
# Both result sets share the columns (Total, client, cdn). UNION ALL takes its
# column names from the first branch only, so the `metric` column is what tells
# the 'Total' rows apart from the 'ClientHints' rows.

-- All requests per client and CDN/origin.
SELECT
  COUNT(0) AS Total,
  client,
  -- An empty or missing first _cdn_provider entry means no CDN was detected.
  IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), '') = '', 'ORIGIN', 'CDN') AS cdn,
  'Total' AS metric
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01'
GROUP BY
  cdn,
  client

UNION ALL

-- Requests per client and CDN/origin that returned an Accept-CH header.
SELECT
  COUNT(0) AS Total,
  client,
  IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), '') = '', 'ORIGIN', 'CDN') AS cdn,
  'ClientHints' AS metric
FROM
  `httparchive.all.requests`
WHERE
  date = '2024-06-01' AND
  -- EXISTS counts each request once even if the header is repeated, and the
  -- LOWER() makes the match independent of header-name casing.
  EXISTS (
    SELECT 1
    FROM UNNEST(response_headers) AS header
    WHERE
      LOWER(header.name) = 'accept-ch' AND
      header.value IS NOT NULL
  )
GROUP BY
  cdn,
  client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#standardSQL
# distribution_of_compression_types_by_cdn.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources served by CDNs

WITH encoded_responses AS (
  -- One row per Content-Encoding response header of the 2024-06-01 crawl.
  -- Requests without that header are not represented.
  SELECT
    client,
    # sometimes _cdn provider detection includes multiple entries. we bias for the DNS detected entry which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,
    CASE
      WHEN a.value = 'gzip' THEN 'Gzip'
      WHEN a.value = 'br' THEN 'Brotli'
      WHEN a.value = '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS a
  WHERE
    date = '2024-06-01' AND
    -- Match the header name case-insensitively so that 'Content-Encoding'
    -- and 'content-encoding' are both picked up.
    LOWER(a.name) = 'content-encoding'
)

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM
  encoded_responses
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
35 changes: 35 additions & 0 deletions
35
sql/2024/cdn/distribution_of_compression_types_cdn_vs_origin.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#standardSQL
# distribution_of_compression_types_cdn_vs_origin.sql : What compression formats are being used (gzip, brotli, etc) for compressed resources, split by CDN vs origin

WITH encoded_responses AS (
  -- One row per Content-Encoding response header of the 2024-06-01 crawl.
  -- Requests without that header are not represented.
  SELECT
    client,
    -- Collapse the first detected _cdn_provider entry into CDN vs ORIGIN;
    -- an empty or missing entry means ORIGIN.
    IF(IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') = 'ORIGIN', 'ORIGIN', 'CDN') AS cdn,
    CASE
      WHEN a.value = 'gzip' THEN 'Gzip'
      WHEN a.value = 'br' THEN 'Brotli'
      WHEN a.value = '' THEN 'no text compression'
      ELSE 'other'
    END AS compression_type
  FROM
    `httparchive.all.requests`
  CROSS JOIN
    UNNEST(response_headers) AS a
  WHERE
    date = '2024-06-01' AND
    -- Match the header name case-insensitively so that 'Content-Encoding'
    -- and 'content-encoding' are both picked up.
    LOWER(a.name) = 'content-encoding'
)

SELECT
  client,
  cdn,
  compression_type,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS total_compressed,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, cdn) AS pct
FROM
  encoded_responses
GROUP BY
  client,
  cdn,
  compression_type
ORDER BY
  client,
  cdn,
  compression_type
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#standardSQL
# distribution_of_http_versions: Percentage of responses by HTTP protocol version (and TLS usage) per client, CDN and main-document flag

WITH non_redirect_requests AS (
  -- One row per non-redirect request of the 2024-06-01 crawl.
  SELECT
    client,
    page,
    is_main_document,
    # WPT is inconsistent with protocol population, so try _protocol, then _tls_next_proto, then response.httpVersion.
    UPPER(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/')))) AS protocol,

    # WPT joins CDN detection but we bias to the DNS detection which is the first entry
    IFNULL(NULLIF(REGEXP_EXTRACT(JSON_EXTRACT_SCALAR(summary, '$._cdn_provider'), r'^([^,]*).*'), ''), 'ORIGIN') AS cdn,

    # isSecure reports what the browser thought it was going to use, but it can get upgraded with STS OR UpgradeInsecure: 1
    IF(STARTS_WITH(url, 'https') OR JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL OR CAST(JSON_EXTRACT(payload, '$._is_secure') AS INT64) = 1, TRUE, FALSE) AS isSecure,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    # WPT changes the response fields based on a redirect (url becomes the Location path instead of the original) causing inconsistencies in the counts, so we ignore them.
    # A request is kept unless it has a non-empty Location header. Cross-joining the headers and
    # filtering on an empty Location value would instead drop every response without that header.
    NOT EXISTS (
      SELECT 1
      FROM UNNEST(response_headers) AS r
      WHERE
        LOWER(r.name) = 'location' AND
        IFNULL(r.value, '') != ''
    )
),

socket_protocols AS (
  -- One row per (client, page, socket): a protocol/TLS version observed on that
  -- socket, used as a fallback for requests that did not record their own protocol.
  SELECT
    client,
    page,
    CAST(JSON_EXTRACT(payload, '$._socket') AS INT64) AS socket,
    ANY_VALUE(UPPER(IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))))) AS protocol,
    ANY_VALUE(JSON_EXTRACT_SCALAR(payload, '$._tls_version')) AS tlsVersion
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    JSON_EXTRACT_SCALAR(payload, '$._tls_version') IS NOT NULL AND
    IFNULL(JSON_EXTRACT_SCALAR(payload, '$._protocol'), IFNULL(NULLIF(JSON_EXTRACT_SCALAR(payload, '$._tls_next_proto'), 'unknown'), NULLIF(CONCAT('HTTP/', JSON_EXTRACT_SCALAR(payload, '$.response.httpVersion')), 'HTTP/'))) IS NOT NULL AND
    JSON_EXTRACT(payload, '$._socket') IS NOT NULL
  GROUP BY
    client,
    page,
    socket
),

resolved AS (
  -- Prefer the request's own protocol, else the one seen on its socket.
  SELECT
    req.client,
    req.cdn,
    req.is_main_document,
    req.isSecure,
    IFNULL(req.protocol, sock.protocol) AS protocol
  FROM
    non_redirect_requests AS req
  LEFT JOIN
    socket_protocols AS sock
  ON
    req.client = sock.client AND
    req.page = sock.page AND
    req.socket = sock.socket
),

classified AS (
  SELECT
    client,
    cdn,
    is_main_document,
    isSecure,
    protocol,
    -- HTTP/3 is reported as the final 'h3' ALPN token as well as older draft
    -- tokens such as 'h3-29' / 'h3-Q050' (protocol is already upper-cased).
    (protocol IN ('H3', 'HTTP/3') OR STARTS_WITH(protocol, 'H3-')) AS is_http3
  FROM
    resolved
)

SELECT
  client,
  cdn,
  is_main_document,
  COUNTIF(protocol = 'HTTP/0.9') AS http09,
  COUNTIF(protocol = 'HTTP/1.0') AS http10,
  COUNTIF(protocol = 'HTTP/1.1') AS http11,
  COUNTIF(protocol = 'HTTP/2') AS http2,
  COUNTIF(is_http3) AS http3,
  -- Rows whose protocol is NULL fall in none of the buckets, http_other included.
  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2') AND NOT is_http3) AS http_other,
  COUNTIF(isSecure OR protocol = 'HTTP/2') AS tls_total,
  COUNTIF(protocol = 'HTTP/0.9') / COUNT(0) AS http09_pct,
  COUNTIF(protocol = 'HTTP/1.0') / COUNT(0) AS http10_pct,
  COUNTIF(protocol = 'HTTP/1.1') / COUNT(0) AS http11_pct,
  COUNTIF(protocol = 'HTTP/2') / COUNT(0) AS http2_pct,
  COUNTIF(is_http3) / COUNT(0) AS http3_pct,
  COUNTIF(protocol NOT IN ('HTTP/0.9', 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2') AND NOT is_http3) / COUNT(0) AS http_other_pct,
  COUNTIF(isSecure OR protocol = 'HTTP/2') / COUNT(0) AS tls_pct,
  COUNT(0) AS total
FROM
  classified
GROUP BY
  client,
  cdn,
  is_main_document
ORDER BY
  client DESC,
  total DESC
Oops, something went wrong.