Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into privacy-markdown-2024
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Nov 4, 2024
2 parents 805ea9c + 324d22b commit fae1fc5
Show file tree
Hide file tree
Showing 11 changed files with 225 additions and 49 deletions.
32 changes: 29 additions & 3 deletions sql/2024/privacy/ccpa_most_common_phrases.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,31 @@
WITH pages_with_phrase AS (
SELECT client, rank_grouping, page, count(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group, JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT)
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping WHERE date = '2024-06-01' AND is_root_page = true AND rank <= rank_grouping AND array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0
SELECT
client,
rank_grouping,
page,
COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group,
JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases
FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT)
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE date = '2024-06-01' AND
is_root_page = true AND
rank <= rank_grouping AND
array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0
)
SELECT client, rank_grouping, link_phrase, count(DISTINCT page) AS num_pages, count(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages FROM pages_with_phrase, unnest(ccpa_link_phrases) link_phrase GROUP BY link_phrase, rank_grouping, client ORDER BY rank_grouping, client, num_pages DESC

SELECT
client,
rank_grouping,
link_phrase,
COUNT(DISTINCT page) AS num_pages,
COUNT(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages
FROM pages_with_phrase,
UNNEST(ccpa_link_phrases) AS link_phrase
GROUP BY
link_phrase,
rank_grouping,
client
ORDER BY
rank_grouping,
client,
num_pages DESC
29 changes: 25 additions & 4 deletions sql/2024/privacy/ccpa_prevalence.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,27 @@
WITH pages AS (
SELECT client, rank_grouping, page, JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link FROM `httparchive.all.pages`,
-- TABLESAMPLE SYSTEM (0.0025 PERCENT)
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping WHERE date = '2024-06-01' AND is_root_page = true AND rank <= rank_grouping
SELECT
client,
rank_grouping,
page,
JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link
FROM `httparchive.all.pages`, -- TABLESAMPLE SYSTEM (0.0025 PERCENT)
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
WHERE date = '2024-06-01' AND
is_root_page = true AND
rank <= rank_grouping
)
SELECT client, rank_grouping, has_ccpa_link, count(DISTINCT page) AS num_pages FROM pages GROUP BY has_ccpa_link, rank_grouping, client ORDER BY rank_grouping, client, has_ccpa_link

SELECT
client,
rank_grouping,
has_ccpa_link,
COUNT(DISTINCT page) AS num_pages
FROM pages
GROUP BY
has_ccpa_link,
rank_grouping,
client
ORDER BY
rank_grouping,
client,
has_ccpa_link
35 changes: 30 additions & 5 deletions sql/2024/privacy/cookies_top_first_party_names.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,35 @@
-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites.

WITH pages AS (
SELECT client, root_page, custom_metrics, count(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
SELECT
client,
root_page,
custom_metrics,
COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
),
cookies AS (
SELECT client, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_domains FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie
), cookies AS (
SELECT
client,
cookie,
NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
NET.HOST(root_page) AS firstparty_host,
total_domains
FROM pages,
UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)
SELECT client, count(DISTINCT firstparty_host) AS domain_count, count(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, JSON_VALUE(cookie, '$.name') AS cookie_name FROM cookies WHERE firstparty_host LIKE '%' || cookie_host GROUP BY client, cookie_name ORDER BY domain_count DESC, client DESC LIMIT 500

SELECT
client,
COUNT(DISTINCT firstparty_host) AS domain_count,
COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains,
JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
WHERE firstparty_host LIKE '%' || cookie_host
GROUP BY
client,
cookie_name
ORDER BY
domain_count DESC,
client DESC
LIMIT 500
37 changes: 32 additions & 5 deletions sql/2024/privacy/cookies_top_third_party_domains.sql
Original file line number Diff line number Diff line change
@@ -1,8 +1,35 @@
WITH pages AS (
SELECT page, client, root_page, custom_metrics, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.1 PERCENT)
SELECT
page,
client,
root_page,
custom_metrics,
COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
),
cookies AS (
SELECT client, page, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_pages FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie
), cookies AS (
SELECT
client,
page,
cookie,
NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
NET.HOST(root_page) AS firstparty_host,
total_pages
FROM pages,
UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)
SELECT client, cookie_host, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM cookies WHERE firstparty_host NOT LIKE '%' || cookie_host GROUP BY client, cookie_host ORDER BY page_count DESC, client LIMIT 500

SELECT
client,
cookie_host,
COUNT(DISTINCT page) AS page_count,
COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages
FROM cookies
WHERE firstparty_host NOT LIKE '%' || cookie_host
GROUP BY
client,
cookie_host
ORDER BY
page_count DESC,
client
LIMIT 500
31 changes: 28 additions & 3 deletions sql/2024/privacy/cookies_top_third_party_names.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,35 @@
-- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that set cookies using many domains.

WITH pages AS (
SELECT client, root_page, custom_metrics, count(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
SELECT
client,
root_page,
custom_metrics,
COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
),
cookies AS (
SELECT client, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_domains FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie
SELECT
client,
cookie,
NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
NET.HOST(root_page) AS firstparty_host,
total_domains
FROM pages,
UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)
SELECT client, count(DISTINCT firstparty_host) AS domain_count, count(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, JSON_VALUE(cookie, '$.name') AS cookie_name FROM cookies WHERE firstparty_host NOT LIKE '%' || cookie_host GROUP BY client, cookie_name ORDER BY domain_count DESC, client DESC LIMIT 500
SELECT
client,
COUNT(DISTINCT firstparty_host) AS domain_count,
COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains,
JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
WHERE firstparty_host NOT LIKE '%' || cookie_host
GROUP BY
client,
cookie_name
ORDER BY
domain_count DESC,
client DESC
LIMIT 500
22 changes: 19 additions & 3 deletions sql/2024/privacy/fingerprinting_most_common_apis.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,23 @@ if (input) {
""";

WITH pages AS (
SELECT client, page, fingerprinting_type, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT)
unnest(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type WHERE date = '2024-06-01'
SELECT
client,
page,
fingerprinting_type,
COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
FROM `httparchive.all.pages`,
UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type
WHERE date = '2024-06-01'
)
SELECT client, fingerprinting_type, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages GROUP BY client, fingerprinting_type ORDER BY page_count DESC
SELECT
client,
fingerprinting_type,
COUNT(DISTINCT page) AS page_count,
COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages
FROM pages
GROUP BY
client,
fingerprinting_type
ORDER BY
page_count DESC
23 changes: 20 additions & 3 deletions sql/2024/privacy/fingerprinting_most_common_scripts.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
WITH pages AS (
SELECT page, client, custom_metrics, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` --TABLESAMPLE SYSTEM (0.001 PERCENT)
SELECT
page,
client,
custom_metrics,
COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
)
SELECT client, script, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages,
unnest(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script GROUP BY client, script ORDER BY page_count DESC LIMIT 100;

SELECT
client,
script,
COUNT(DISTINCT page) AS page_count,
COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages
FROM pages,
UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script
GROUP BY
client,
script
ORDER BY
page_count DESC
LIMIT 100;
20 changes: 18 additions & 2 deletions sql/2024/privacy/fingerprinting_script_count.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
WITH pages AS (
SELECT page, client, array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` --TABLESAMPLE SYSTEM (0.01 PERCENT)
SELECT
page,
client,
ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count,
COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
)
SELECT script_count, client, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages GROUP BY script_count, client ORDER BY script_count ASC;

SELECT
script_count,
client,
COUNT(DISTINCT page) AS page_count,
COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages
FROM pages
GROUP BY
script_count,
client
ORDER BY
script_count ASC;
14 changes: 12 additions & 2 deletions sql/2024/privacy/number_of_websites_with_bounce_tracking.sql
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
-- Detection logic explained:
-- https://github.com/privacycg/proposals/issues/6
-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md
WITH redirect_requests AS (
SELECT
client,
url,
index,
response_headers,
page
FROM `httparchive.all.requests`
WHERE
date = '2024-06-01' AND
client = 'mobile' AND
is_root_page = TRUE AND
type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
LEFT(JSON_VALUE(summary, '$.status'), 1) = '3' AND
index <= 2
), navigation_redirect AS (
-- Find the first navigation redirect
SELECT
client,
url,
page,
headers.value AS navigation_redirect_location
Expand All @@ -27,6 +31,7 @@ WITH redirect_requests AS (
), bounce_redirect AS (
-- Find the second navigation redirect
SELECT
client,
url,
page,
headers.value AS bounce_redirect_location,
Expand All @@ -40,6 +45,7 @@ WITH redirect_requests AS (
), bounce_redirect_with_cookies AS (
-- Find the cookies set during the second navigation redirect
SELECT
client,
url,
page,
bounce_redirect_location
Expand All @@ -51,6 +57,7 @@ WITH redirect_requests AS (
), bounce_sequences AS (
-- Combine the first and second navigation redirects
SELECT
nav.client,
nav.page,
nav.url AS navigation_url,
nav.navigation_redirect_location,
Expand All @@ -59,10 +66,12 @@ WITH redirect_requests AS (
FROM navigation_redirect AS nav
LEFT JOIN bounce_redirect_with_cookies AS bounce
ON
nav.client = bounce.client AND
nav.page = bounce.page AND
nav.navigation_redirect_location = bounce.url
WHERE bounce_redirect_location IS NOT NULL
GROUP BY
nav.client,
page,
navigation_url,
navigation_redirect_location,
Expand All @@ -71,10 +80,11 @@ WITH redirect_requests AS (

-- Count the number of websites with bounce tracking per bounce hostname
SELECT
client,
NET.HOST(navigation_redirect_location) AS bounce_hostname,
COUNT(DISTINCT page) AS number_of_pages
--ARRAY_AGG(page LIMIT 2) AS page_examples
FROM bounce_sequences
GROUP BY bounce_hostname
GROUP BY client, bounce_hostname
ORDER BY number_of_pages DESC
LIMIT 100
18 changes: 9 additions & 9 deletions sql/2024/privacy/top_direct_ads_demand.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
WITH RECURSIVE pages AS (
SELECT
CASE page -- publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain
CASE page -- Publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain. CASE needs to be replaced with a more robust solution from HTTPArchive/custom-metrics#136.
WHEN 'https://www.chunkbase.com/' THEN 'cafemedia.com'
ELSE NET.REG_DOMAIN(page)
END AS page,
JSON_QUERY(custom_metrics, '$.ads') AS ads_metrics
END AS page_domain,
JSON_QUERY(ANY_VALUE(custom_metrics), '$.ads') AS ads_metrics
FROM `httparchive.all.pages`
WHERE date = '2024-06-01' AND
client = 'mobile' AND
is_root_page = TRUE
GROUP BY page_domain
), ads AS (
SELECT
page,
page_domain,
JSON_QUERY(ads_metrics, '$.ads.account_types') AS ad_accounts
FROM pages
WHERE
JSON_VALUE(ads_metrics, '$.ads.account_count') != '0'
), sellers AS (
SELECT
page,
page_domain,
JSON_QUERY(ads_metrics, '$.sellers.seller_types') AS ad_sellers
FROM pages
WHERE
Expand All @@ -31,18 +31,18 @@ WITH RECURSIVE pages AS (
FROM (
SELECT
NET.REG_DOMAIN(domain) AS demand,
page AS publisher
page_domain AS publisher
FROM ads,
UNNEST(JSON_VALUE_ARRAY(ad_accounts, '$.direct.domains')) AS domain
UNION ALL
SELECT
page AS demand,
page_domain AS demand,
NET.REG_DOMAIN(domain) AS publisher
FROM sellers,
UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.publisher.domains')) AS domain
UNION ALL
SELECT
page AS demand,
page_domain AS demand,
NET.REG_DOMAIN(domain) AS publisher
FROM sellers,
UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.both.domains')) AS domain
Expand Down
Loading

0 comments on commit fae1fc5

Please sign in to comment.