-
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/main' into privacy-markdown-2024
- Loading branch information
Showing
11 changed files
with
225 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,31 @@ | ||
-- CCPA link phrases by rank bucket.
-- For every client and rank bucket, reports how many root pages contain each
-- detected CCPA link phrase, and that count as a share of the root pages in the
-- bucket that have at least one detected phrase.
WITH pages_with_phrase AS (
  -- One row per (page, rank bucket). A page is repeated in every bucket whose
  -- upper bound is greater than or equal to its rank.
  SELECT
    client,
    rank_grouping,
    page,
    -- Denominator for pct_pages: distinct pages that survive the WHERE clause
    -- (non-empty phrase list) within the same client and rank bucket.
    COUNT(DISTINCT page) OVER (PARTITION BY client, rank_grouping) AS total_pages_with_phrase_in_rank_group,
    JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases
  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.01 PERCENT)
  CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2024-06-01'
    AND is_root_page = true
    AND rank <= rank_grouping
    AND ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases')) > 0
)

SELECT
  client,
  rank_grouping,
  link_phrase,
  COUNT(DISTINCT page) AS num_pages,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages_with_phrase_in_rank_group) AS pct_pages
FROM pages_with_phrase
-- Expand the phrase array so each phrase on a page becomes its own row.
CROSS JOIN UNNEST(ccpa_link_phrases) AS link_phrase
GROUP BY
  link_phrase,
  rank_grouping,
  client
ORDER BY
  rank_grouping,
  client,
  num_pages DESC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,27 @@ | ||
-- Root pages per client and rank bucket, broken down by the value of the
-- hasCCPALink custom metric.
WITH pages AS (
  -- One row per (page, rank bucket). A page is repeated in every bucket whose
  -- upper bound is greater than or equal to its rank.
  SELECT
    client,
    rank_grouping,
    page,
    -- Extracted as a string by JSON_VALUE; kept as-is so that pages where the
    -- value could not be extracted form their own group in the output.
    JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link
  FROM `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.0025 PERCENT)
  CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2024-06-01'
    AND is_root_page = true
    AND rank <= rank_grouping
)

SELECT
  client,
  rank_grouping,
  has_ccpa_link,
  COUNT(DISTINCT page) AS num_pages
FROM pages
GROUP BY
  has_ccpa_link,
  rank_grouping,
  client
ORDER BY
  rank_grouping,
  client,
  has_ccpa_link
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,35 @@ | ||
-- Most common cookie names, by number of domains on which they appear.
-- Goal is to identify common trackers that use first-party cookies across sites.
--
-- A cookie is kept when the host of the page's root URL ends with the host of
-- the cookie's `domain` attribute.
WITH pages AS (
  SELECT
    client,
    root_page,
    custom_metrics,
    -- Denominator for pct_domains: distinct root-page hosts per client.
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.all.pages`
  WHERE date = '2024-06-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom-metric array.
  SELECT
    client,
    cookie,
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
-- Literal suffix match. The earlier `LIKE '%' || cookie_host` form built a LIKE
-- pattern from data, so any `_` or `%` inside cookie_host acted as a wildcard.
-- Rows with a NULL host are still dropped, as before.
WHERE ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC
LIMIT 500
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,35 @@ | ||
-- Cookie hosts that do not match the page's own host, ranked by the number of
-- pages on which they set a cookie.
--
-- A cookie is kept when the host of the page's root URL does NOT end with the
-- host of the cookie's `domain` attribute.
WITH pages AS (
  SELECT
    page,
    client,
    root_page,
    custom_metrics,
    -- Denominator for pct_pages: distinct pages per client.
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.all.pages`
  WHERE date = '2024-06-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom-metric array.
  SELECT
    client,
    page,
    cookie,
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_pages
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  cookie_host,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM cookies
-- Literal suffix match. The earlier `NOT LIKE '%' || cookie_host` form built a
-- LIKE pattern from data, so any `_` or `%` inside cookie_host acted as a
-- wildcard. Rows with a NULL host are still dropped, as before.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_host
ORDER BY
  page_count DESC,
  client
LIMIT 500
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,35 @@ | ||
-- Most common cookie names, by number of domains on which they appear.
-- Goal is to identify common trackers that set cookies using many domains.
--
-- A cookie is kept when the host of the page's root URL does NOT end with the
-- host of the cookie's `domain` attribute.
WITH pages AS (
  SELECT
    client,
    root_page,
    custom_metrics,
    -- Denominator for pct_domains: distinct root-page hosts per client.
    COUNT(DISTINCT NET.HOST(root_page)) OVER (PARTITION BY client) AS total_domains
  FROM `httparchive.all.pages`
  WHERE date = '2024-06-01'
),

cookies AS (
  -- One row per entry of the page's `cookies` custom-metric array.
  SELECT
    client,
    cookie,
    NET.HOST(JSON_VALUE(cookie, '$.domain')) AS cookie_host,
    NET.HOST(root_page) AS firstparty_host,
    total_domains
  FROM pages
  CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  COUNT(DISTINCT firstparty_host) / ANY_VALUE(total_domains) AS pct_domains,
  JSON_VALUE(cookie, '$.name') AS cookie_name
FROM cookies
-- Literal suffix match. The earlier `NOT LIKE '%' || cookie_host` form built a
-- LIKE pattern from data, so any `_` or `%` inside cookie_host acted as a
-- wildcard. Rows with a NULL host are still dropped, as before.
WHERE NOT ENDS_WITH(firstparty_host, cookie_host)
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC
LIMIT 500
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,23 @@ | ||
-- Scripts flagged as likely fingerprinting, ranked by the number of distinct
-- pages on which each one appears.
WITH pages AS (
  -- No is_root_page filter here: every page row for the crawl date is included.
  SELECT
    page,
    client,
    custom_metrics,
    -- Denominator for pct_pages: distinct pages per client.
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.all.pages`
  WHERE date = '2024-06-01'
)

SELECT
  client,
  script,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM pages
-- One row per entry of the likelyFingerprintingScripts array; pages whose array
-- is missing or empty produce no rows.
CROSS JOIN UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script
GROUP BY
  script,
  client
-- The limit applies to the combined result, not per client.
ORDER BY
  page_count DESC
LIMIT 100;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,21 @@ | ||
-- Distribution of pages by how many likely fingerprinting scripts they load,
-- per client.
WITH pages AS (
  -- No is_root_page filter here: every page row for the crawl date is included.
  SELECT
    page,
    client,
    -- Length of the likelyFingerprintingScripts array for the page.
    ARRAY_LENGTH(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script_count,
    -- Denominator for pct_pages: distinct pages per client.
    COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
  FROM `httparchive.all.pages`
  WHERE date = '2024-06-01'
)

SELECT
  script_count,
  client,
  COUNT(DISTINCT page) AS page_count,
  COUNT(DISTINCT page) / ANY_VALUE(total_pages) AS pct_pages
FROM pages
GROUP BY
  client,
  script_count
ORDER BY
  script_count;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.