Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cookies sql 2024 #3741

Merged
merged 25 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions sql/2024/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
-- Extract to the `httparchive.almanac.cookies `table the cookies that were set
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
-- more efficiently in consecutive queries without having to reextract it every
-- time


-- Code used by @tunetheweb to create the table
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262

-- CREATE TABLE `httparchive.almanac.cookies`
-- (
-- date DATE,
-- client STRING,
-- page STRING,
-- root_page STRING,
-- rank INTEGER,
-- startedDateTime STRING,
-- firstPartyCookie BOOL,
-- name STRING,
-- domain STRING,
-- path STRING,
-- expires STRING,
-- size STRING,
-- httpOnly STRING,
-- secure STRING,
-- session STRING,
-- sameSite STRING,
-- sameParty STRING,
-- partitionKey STRING,
-- partitionKeyOpaque STRING
-- )
-- PARTITION BY date
-- CLUSTER BY
-- client, rank, page
-- AS
-- ...


INSERT INTO `httparchive.almanac.cookies`
WITH intermediate_cookie AS (
SELECT
date,
client,
page,
root_page,
rank,
JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
cookie
FROM
`httparchive.all.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
WHERE
date = '2024-06-01'
)

SELECT
date,
client,
page,
root_page,
rank,
startedDateTime,
ENDS_WITH(NET.HOST(page), NET.REG_DOMAIN(JSON_VALUE(cookie, '$.domain'))) AS firstPartyCookie,
JSON_VALUE(cookie, '$.name') AS name,
JSON_VALUE(cookie, '$.domain') AS domain,
JSON_VALUE(cookie, '$.path') AS path,
JSON_VALUE(cookie, '$.expires') AS expires,
JSON_VALUE(cookie, '$.size') AS size,
JSON_VALUE(cookie, '$.httpOnly') AS httpOnly,
JSON_VALUE(cookie, '$.secure') AS secure,
JSON_VALUE(cookie, '$.session') AS session,
JSON_VALUE(cookie, '$.sameSite') AS sameSite,
JSON_VALUE(cookie, '$.sameParty') AS sameParty,
JSON_VALUE(cookie, '$.partitionKey') AS partitionKey,
JSON_VALUE(cookie, '$.partitionKeyOpaque') AS partitionKeyOpaque
FROM intermediate_cookie
18 changes: 18 additions & 0 deletions sql/2024/cookies/CHIPS_top_20_first_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client

SELECT
name,
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie = TRUE AND
partitionKey IS NOT NULL
GROUP BY name
ORDER BY percentWebsites DESC
LIMIT 20
19 changes: 19 additions & 0 deletions sql/2024/cookies/CHIPS_top_20_third_party_cookies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Extract the top 20 first party cookies seen across websites that are
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
-- partitioned, as CHIPS is meant for a 3rd party context...
-- Before running query: edit date and client

SELECT
name,
domain,
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie = FALSE AND
partitionKey IS NOT NULL
GROUP BY name, domain
ORDER BY percentWebsites DESC
LIMIT 20
6 changes: 6 additions & 0 deletions sql/2024/cookies/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,9 @@
[~google-doc]: https://docs.google.com/document/d/1o2AgdsDq_x3OvthZF7Kb50rUKMVLn7UANT9Stz7ku2I/edit#heading=h.ymg495uvm3yx
[~google-sheets]: https://docs.google.com/spreadsheets/d/1wDGnUkO0rgcU5_V6hmUrhm1pq60VU2XbeMHgYJEEaSM/edit#gid=454016814
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cookies.md

## Note about SQL queries

First execute [`0_create_cookies.sql`](0_create_cookies.sql) to export the
results of the <DATE> crawl specified into the `httparchive.almanac.cookies`
table that will then be used in other SQL queries.
28 changes: 28 additions & 0 deletions sql/2024/cookies/age_expire_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
firstPartyCookie,
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
firstPartyCookie,
MIN(age) AS min,
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
MAX(age) AS max
FROM cookies_age
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2024/cookies/age_expire_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats around the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
MIN(age) AS min,
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
MAX(age) AS max
FROM cookies_age
22 changes: 22 additions & 0 deletions sql/2024/cookies/age_expires_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- Extract the expire date (age rounded to closest number of days)
-- (only positive values, non session cookies)
-- Before running query: edit date and client

WITH cookies_age AS (
SELECT
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL AND
CAST(expires AS FLOAT64) >= 0
)

SELECT
age,
COUNT(0) AS nbCookies
FROM cookies_age
GROUP BY age
ORDER BY age ASC
24 changes: 24 additions & 0 deletions sql/2024/cookies/nb_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Extract the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
firstPartyCookie,
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
firstPartyCookie,
distinctNbCookies,
COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
FROM nb_cookies_per_website
GROUP BY firstPartyCookie, distinctNbCookies
ORDER BY firstPartyCookie, distinctNbCookies ASC
28 changes: 28 additions & 0 deletions sql/2024/cookies/nb_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
firstPartyCookie,
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie, pageFirstPartyHost
)

SELECT
firstPartyCookie,
MIN(distinctNbCookies) AS min,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
GROUP BY firstPartyCookie
25 changes: 25 additions & 0 deletions sql/2024/cookies/nb_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Extract stats about the nb of cookies
-- Before running query: edit date and client

WITH nb_cookies_per_website AS (
SELECT
NET.HOST(page) AS pageFirstPartyHost,
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY pageFirstPartyHost
)

SELECT
MIN(distinctNbCookies) AS min,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
MAX(distinctNbCookies) AS max
FROM nb_cookies_per_website
25 changes: 25 additions & 0 deletions sql/2024/cookies/prevalence_attributes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- Prevalence of cookies type and attributes per type of cookie (1st/3rd party)
-- Before running query: edit date and client

SELECT
firstPartyCookie,
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque,
SUM(IF(STARTS_WITH(name, '__Host-'), 1, 0)) / COUNT(0) AS hostPrefix,
SUM(IF(STARTS_WITH(name, '__Secure-'), 1, 0)) / COUNT(0) AS securePrefix
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL -- just in case
GROUP BY
firstPartyCookie
28 changes: 28 additions & 0 deletions sql/2024/cookies/prevalence_type_attributes_per_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Prevalence of cookies type and attributes per rank grouping
-- Before running query: edit date and client

SELECT
rank_grouping,
SUM(IF(firstPartyCookie = TRUE, 1, 0)) / COUNT(0) AS firstParty,
SUM(IF(firstPartyCookie = FALSE, 1, 0)) / COUNT(0) AS thirdParty,
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque
FROM `httparchive.almanac.cookies`,
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE
rank <= rank_grouping AND
date = '2024-06-01' AND
client = 'desktop' AND
firstPartyCookie IS NOT NULL -- just in case
GROUP BY
rank_grouping
ORDER BY
rank_grouping
14 changes: 14 additions & 0 deletions sql/2024/cookies/size_cookies_cdf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Extract the size of the cookies
-- Before running query: edit date and client

SELECT
CAST(size AS INT) AS sizeCookie,
COUNT(0) AS nbCookies
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY sizeCookie
ORDER BY sizeCookie ASC
19 changes: 19 additions & 0 deletions sql/2024/cookies/size_cookies_per_type_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Extract stats around the size of the cookies
-- Before running query: edit date and client

SELECT
firstPartyCookie,
MIN(CAST(size AS INT)) AS min,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
MAX(CAST(size AS INT)) AS max
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
GROUP BY firstPartyCookie
17 changes: 17 additions & 0 deletions sql/2024/cookies/size_cookies_quantiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Extract stats around the size of the cookies
-- Before running query: edit date and client

SELECT
MIN(CAST(size AS INT)) AS min,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
MAX(CAST(size AS INT)) AS max
FROM `httparchive.almanac.cookies`
WHERE
date = '2024-06-01' AND
client = 'desktop' AND
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
firstPartyCookie IS NOT NULL
Loading