diff --git a/sql/2024/privacy/ccpa_most_common_phrases.sql b/sql/2024/privacy/ccpa_most_common_phrases.sql index 94d3075a5fd..f7ef9daf7ae 100644 --- a/sql/2024/privacy/ccpa_most_common_phrases.sql +++ b/sql/2024/privacy/ccpa_most_common_phrases.sql @@ -1,5 +1,31 @@ WITH pages_with_phrase AS ( - SELECT client, rank_grouping, page, count(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group, JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT) - UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping WHERE date = '2024-06-01' AND is_root_page = true AND rank <= rank_grouping AND array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0 + SELECT + client, + rank_grouping, + page, + COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group, + JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases + FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT) + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping + WHERE date = '2024-06-01' AND + is_root_page = true AND + rank <= rank_grouping AND + array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0 ) -SELECT client, rank_grouping, link_phrase, count(DISTINCT page) AS num_pages, count(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages FROM pages_with_phrase, unnest(ccpa_link_phrases) link_phrase GROUP BY link_phrase, rank_grouping, client ORDER BY rank_grouping, client, num_pages DESC + +SELECT + client, + rank_grouping, + link_phrase, + COUNT(DISTINCT page) AS num_pages, + COUNT(DISTINCT page) / any_value(total_pages_with_phrase_in_rank_group) AS pct_pages +FROM pages_with_phrase, + UNNEST(ccpa_link_phrases) AS link_phrase +GROUP BY + link_phrase, + rank_grouping, + client +ORDER BY + rank_grouping, + client, + num_pages DESC diff --git a/sql/2024/privacy/ccpa_prevalence.sql b/sql/2024/privacy/ccpa_prevalence.sql index 29453778b97..e3ac8946e9d 100644 --- a/sql/2024/privacy/ccpa_prevalence.sql +++ b/sql/2024/privacy/ccpa_prevalence.sql @@ -1,6 +1,27 @@ WITH pages AS ( - SELECT client, rank_grouping, page, JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link FROM `httparchive.all.pages`, - -- TABLESAMPLE SYSTEM (0.0025 PERCENT) - UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping WHERE date = '2024-06-01' AND is_root_page = true AND rank <= rank_grouping + SELECT + client, + rank_grouping, + page, + JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link + FROM `httparchive.all.pages`, -- TABLESAMPLE SYSTEM (0.0025 PERCENT) + UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping + WHERE date = '2024-06-01' AND + is_root_page = true AND + rank <= rank_grouping ) -SELECT client, rank_grouping, has_ccpa_link, count(DISTINCT page) AS num_pages FROM pages GROUP BY has_ccpa_link, rank_grouping, client ORDER BY rank_grouping, client, has_ccpa_link + +SELECT + client, + rank_grouping, + has_ccpa_link, + COUNT(DISTINCT page) AS num_pages +FROM pages +GROUP BY + has_ccpa_link, + rank_grouping, + client +ORDER BY + rank_grouping, + client, + has_ccpa_link diff --git a/sql/2024/privacy/cookies_top_first_party_names.sql b/sql/2024/privacy/cookies_top_first_party_names.sql index c65aefc029b..1073d24c33a 100644 --- a/sql/2024/privacy/cookies_top_first_party_names.sql +++ b/sql/2024/privacy/cookies_top_first_party_names.sql @@ -1,10 +1,35 @@ -- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that use first-party cookies across sites. WITH pages AS ( - SELECT client, root_page, custom_metrics, count(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT) + SELECT + client, + root_page, + custom_metrics, + COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains + FROM `httparchive.all.pages` WHERE date = '2024-06-01' -), -cookies AS ( - SELECT client, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_domains FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie +), cookies AS ( + SELECT + client, + cookie, + NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_domains + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie ) -SELECT client, count(DISTINCT firstparty_host) AS domain_count, count(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, JSON_VALUE(cookie, '$.name') AS cookie_name FROM cookies WHERE firstparty_host LIKE '%' || cookie_host GROUP BY client, cookie_name ORDER BY domain_count DESC, client DESC LIMIT 500 + +SELECT + client, + COUNT(DISTINCT firstparty_host) AS domain_count, + COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, + JSON_VALUE(cookie, '$.name') AS cookie_name +FROM cookies +WHERE firstparty_host LIKE '%' || cookie_host +GROUP BY + client, + cookie_name +ORDER BY + domain_count DESC, + client DESC +LIMIT 500 diff --git a/sql/2024/privacy/cookies_top_third_party_domains.sql b/sql/2024/privacy/cookies_top_third_party_domains.sql index 6e6ff0c690b..ff5236f3bff 100644 --- a/sql/2024/privacy/cookies_top_third_party_domains.sql +++ b/sql/2024/privacy/cookies_top_third_party_domains.sql @@ -1,8 +1,35 @@ WITH pages AS ( - SELECT page, client, root_page, custom_metrics, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.1 PERCENT) + SELECT + page, + client, + root_page, + custom_metrics, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` WHERE date = '2024-06-01' -), -cookies AS ( - SELECT client, page, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_pages FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie +), cookies AS ( + SELECT + client, + page, + cookie, + NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_pages + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie ) -SELECT client, cookie_host, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM cookies WHERE firstparty_host NOT LIKE '%' || cookie_host GROUP BY client, cookie_host ORDER BY page_count DESC, client LIMIT 500 + +SELECT + client, + cookie_host, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM cookies +WHERE firstparty_host NOT LIKE '%' || cookie_host +GROUP BY + client, + cookie_host +ORDER BY + page_count DESC, + client +LIMIT 500 diff --git a/sql/2024/privacy/cookies_top_third_party_names.sql b/sql/2024/privacy/cookies_top_third_party_names.sql index 42b42218e78..4f37892974f 100644 --- a/sql/2024/privacy/cookies_top_third_party_names.sql +++ b/sql/2024/privacy/cookies_top_third_party_names.sql @@ -1,10 +1,35 @@ -- Most common cookie names, by number of domains on which they appear. Goal is to identify common trackers that set cookies using many domains. WITH pages AS ( - SELECT client, root_page, custom_metrics, count(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT) + SELECT + client, + root_page, + custom_metrics, + COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains + FROM `httparchive.all.pages` WHERE date = '2024-06-01' ), cookies AS ( - SELECT client, cookie, net.host(JSON_VALUE(cookie, '$.domain')) AS cookie_host, net.host(root_page) AS firstparty_host, total_domains FROM pages, UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) cookie + SELECT + client, + cookie, + NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host, + NET.HOST(root_page) AS firstparty_host, + total_domains + FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie ) -SELECT client, count(DISTINCT firstparty_host) AS domain_count, count(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, JSON_VALUE(cookie, '$.name') AS cookie_name FROM cookies WHERE firstparty_host NOT LIKE '%' || cookie_host GROUP BY client, cookie_name ORDER BY domain_count DESC, client DESC LIMIT 500 +SELECT + client, + COUNT(DISTINCT firstparty_host) AS domain_count, + COUNT(DISTINCT firstparty_host) / any_value(total_domains) AS pct_domains, + JSON_VALUE(cookie, '$.name') AS cookie_name +FROM cookies +WHERE firstparty_host NOT LIKE '%' || cookie_host +GROUP BY + client, + cookie_name +ORDER BY + domain_count DESC, + client DESC +LIMIT 500 diff --git a/sql/2024/privacy/fingerprinting_most_common_apis.sql b/sql/2024/privacy/fingerprinting_most_common_apis.sql index e955816b0e2..f4ba43c5cb7 100644 --- a/sql/2024/privacy/fingerprinting_most_common_apis.sql +++ b/sql/2024/privacy/fingerprinting_most_common_apis.sql @@ -13,7 +13,23 @@ if (input) { """; WITH pages AS ( - SELECT client, page, fingerprinting_type, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages`, --TABLESAMPLE SYSTEM (0.01 PERCENT) - unnest(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type WHERE date = '2024-06-01' + SELECT + client, + page, + fingerprinting_type, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages`, + UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type + WHERE date = '2024-06-01' ) -SELECT client, fingerprinting_type, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages GROUP BY client, fingerprinting_type ORDER BY page_count DESC +SELECT + client, + fingerprinting_type, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages +GROUP BY + client, + fingerprinting_type +ORDER BY + page_count DESC diff --git a/sql/2024/privacy/fingerprinting_most_common_scripts.sql b/sql/2024/privacy/fingerprinting_most_common_scripts.sql index bd91b1c43df..84d3ea02e4d 100644 --- a/sql/2024/privacy/fingerprinting_most_common_scripts.sql +++ b/sql/2024/privacy/fingerprinting_most_common_scripts.sql @@ -1,6 +1,23 @@ WITH pages AS ( - SELECT page, client, custom_metrics, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` --TABLESAMPLE SYSTEM (0.001 PERCENT) + SELECT + page, + client, + custom_metrics, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` WHERE date = '2024-06-01' ) -SELECT client, script, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages, - unnest(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script GROUP BY client, script ORDER BY page_count DESC LIMIT 100; + +SELECT + client, + script, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages, + UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script +GROUP BY + client, + script +ORDER BY + page_count DESC +LIMIT 100; diff --git a/sql/2024/privacy/fingerprinting_script_count.sql b/sql/2024/privacy/fingerprinting_script_count.sql index 85128dd3c98..07a6038e03a 100644 --- a/sql/2024/privacy/fingerprinting_script_count.sql +++ b/sql/2024/privacy/fingerprinting_script_count.sql @@ -1,5 +1,21 @@ WITH pages AS ( - SELECT page, client, array_length(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count, count(DISTINCT page) OVER (PARTITION BY client) AS total_pages FROM `httparchive.all.pages` --TABLESAMPLE SYSTEM (0.01 PERCENT) + SELECT + page, + client, + ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count, + COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages + FROM `httparchive.all.pages` WHERE date = '2024-06-01' ) -SELECT script_count, client, count(DISTINCT page) AS page_count, count(DISTINCT page) / any_value(total_pages) AS pct_pages FROM pages GROUP BY script_count, client ORDER BY script_count ASC; + +SELECT + script_count, + client, + COUNT(DISTINCT page) AS page_count, + COUNT(DISTINCT page) / any_value(total_pages) AS pct_pages +FROM pages +GROUP BY + script_count, + client +ORDER BY + script_count ASC; diff --git a/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql b/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql index 3c8625a7865..eeaebeeda1e 100644 --- a/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql +++ b/sql/2024/privacy/number_of_websites_with_bounce_tracking.sql @@ -1,5 +1,9 @@ +-- Detection logic explained: +-- https://github.com/privacycg/proposals/issues/6 +-- https://github.com/privacycg/nav-tracking-mitigations/blob/main/bounce-tracking-explainer.md WITH redirect_requests AS ( SELECT + client, url, index, response_headers, @@ -7,7 +11,6 @@ WITH redirect_requests AS ( FROM `httparchive.all.requests` WHERE date = '2024-06-01' AND - client = 'mobile' AND is_root_page = TRUE AND type NOT IN ('css', 'image', 'font', 'video', 'audio') AND LEFT(JSON_VALUE(summary, '$.status'), 1) = '3' AND @@ -15,6 +18,7 @@ WITH redirect_requests AS ( ), navigation_redirect AS ( -- Find the first navigation redirect SELECT + client, url, page, headers.value AS navigation_redirect_location @@ -27,6 +31,7 @@ WITH redirect_requests AS ( ), bounce_redirect AS ( -- Find the second navigation redirect SELECT + client, url, page, headers.value AS bounce_redirect_location, @@ -40,6 +45,7 @@ WITH redirect_requests AS ( ), bounce_redirect_with_cookies AS ( -- Find the cookies set during the second navigation redirect SELECT + client, url, page, bounce_redirect_location @@ -51,6 +57,7 @@ WITH redirect_requests AS ( ), bounce_sequences AS ( -- Combine the first and second navigation redirects SELECT + nav.client, nav.page, nav.url AS navigation_url, nav.navigation_redirect_location, @@ -59,10 +66,12 @@ WITH redirect_requests AS ( FROM navigation_redirect AS nav LEFT JOIN bounce_redirect_with_cookies AS bounce ON + nav.client = bounce.client AND nav.page = bounce.page AND nav.navigation_redirect_location = bounce.url WHERE bounce_redirect_location IS NOT NULL GROUP BY + nav.client, page, navigation_url, navigation_redirect_location, @@ -71,10 +80,11 @@ WITH redirect_requests AS ( -- Count the number of websites with bounce tracking per bounce hostname SELECT + client, NET.HOST(navigation_redirect_location) AS bounce_hostname, COUNT(DISTINCT page) AS number_of_pages --ARRAY_AGG(page LIMIT 2) AS page_examples FROM bounce_sequences -GROUP BY bounce_hostname +GROUP BY client, bounce_hostname ORDER BY number_of_pages DESC LIMIT 100 diff --git a/sql/2024/privacy/top_direct_ads_demand.sql b/sql/2024/privacy/top_direct_ads_demand.sql index 6df1a23c729..ee96dbe9e38 100644 --- a/sql/2024/privacy/top_direct_ads_demand.sql +++ b/sql/2024/privacy/top_direct_ads_demand.sql @@ -1,24 +1,24 @@ WITH RECURSIVE pages AS ( SELECT - CASE page -- publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain + CASE page -- Publisher websites may redirect to an SSP domain, and need to use redirected domain instead of page domain. CASE needs to be replaced with a more robust solution from HTTPArchive/custom-metrics#136. WHEN 'https://www.chunkbase.com/' THEN 'cafemedia.com' ELSE NET.REG_DOMAIN(page) - END AS page, - JSON_QUERY(custom_metrics, '$.ads') AS ads_metrics + END AS page_domain, + JSON_QUERY(ANY_VALUE(custom_metrics), '$.ads') AS ads_metrics FROM `httparchive.all.pages` WHERE date = '2024-06-01' AND - client = 'mobile' AND is_root_page = TRUE + GROUP BY page_domain ), ads AS ( SELECT - page, + page_domain, JSON_QUERY(ads_metrics, '$.ads.account_types') AS ad_accounts FROM pages WHERE JSON_VALUE(ads_metrics, '$.ads.account_count') != '0' ), sellers AS ( SELECT - page, + page_domain, JSON_QUERY(ads_metrics, '$.sellers.seller_types') AS ad_sellers FROM pages WHERE @@ -31,18 +31,18 @@ WITH RECURSIVE pages AS ( FROM ( SELECT NET.REG_DOMAIN(domain) AS demand, - page AS publisher + page_domain AS publisher FROM ads, UNNEST(JSON_VALUE_ARRAY(ad_accounts, '$.direct.domains')) AS domain UNION ALL SELECT - page AS demand, + page_domain AS demand, NET.REG_DOMAIN(domain) AS publisher FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.publisher.domains')) AS domain UNION ALL SELECT - page AS demand, + page_domain AS demand, NET.REG_DOMAIN(domain) AS publisher FROM sellers, UNNEST(JSON_VALUE_ARRAY(ad_sellers, '$.both.domains')) AS domain diff --git a/sql/util/whotracksme_trackers.py b/sql/util/whotracksme_trackers.py index 06fbe1362eb..ec68f922e17 100644 --- a/sql/util/whotracksme_trackers.py +++ b/sql/util/whotracksme_trackers.py @@ -2,20 +2,13 @@ This module retrieves and extracts trackers as identified by WhoTracks.me and appends them to the httparchive.almanac.whotracksme BigQuery table. """ - # pylint: disable=import-error - -from datetime import datetime as DateTime import sqlite3 import pandas import requests from bq_writer import write_to_bq, bigquery - -# get current year -year = DateTime.now().year - # Retrieve and extract trackers as identified by WhoTracks.me. # https://github.com/ghostery/whotracks.me/blob/master/blog/generating_adblocker_filters.md#loading-the-data tracker_db = requests.get( @@ -23,9 +16,9 @@ timeout=10, ).text -trackers_query = f""" +TRACKERS_QUERY = """ SELECT - '{year}-06-01' AS date, + '2024-06-01' AS date, categories.name as category, tracker, domain @@ -40,7 +33,7 @@ """ connection = sqlite3.connect(":memory:") connection.executescript(tracker_db) -trackers_df = pandas.read_sql(trackers_query, connection) +trackers_df = pandas.read_sql(TRACKERS_QUERY, connection) connection.close() # Append to almanac.whotracksme BQ table