-
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'privacy-markdown-2024' of https://github.com/HTTPArchiv…
…e/almanac.httparchive.org into privacy-markdown-2024
- Loading branch information
Showing
520 changed files
with
17,778 additions
and
491 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Average Lighthouse category scores per detected framework / JavaScript library, split by client.
# Only root pages of the 2024-06-01 crawl that carry a non-empty Lighthouse report are considered.
WITH framework_pages AS (
  # One row per page and matching technology, carrying the page's four Lighthouse category scores.
  SELECT
    client,
    page,
    tech.technology AS framework,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score
  FROM
    `httparchive.all.pages`
  CROSS JOIN
    UNNEST(technologies) AS tech
  WHERE
    date = '2024-06-01' AND
    is_root_page = TRUE AND
    lighthouse IS NOT NULL AND
    lighthouse != '{}' AND
    tech.technology IS NOT NULL AND
    # Keep the technology when at least one of its categories is a framework/library category.
    EXISTS (
      SELECT 1
      FROM UNNEST(tech.categories) AS category
      WHERE category IN ('Web frameworks', 'JavaScript libraries', 'Frontend frameworks', 'JavaScript frameworks')
    )
),

page_framework_scores AS (
  # Collapse to exactly one row per (client, page, framework). A page's scores are identical on every
  # one of its rows, so AVG simply returns that score; taking any single row would work as well.
  SELECT
    client,
    page,
    framework,
    AVG(performance_score) AS performance_score,
    AVG(accessibility_score) AS accessibility_score,
    AVG(best_practices_score) AS best_practices_score,
    AVG(seo_score) AS seo_score
  FROM
    framework_pages
  GROUP BY
    client,
    page,
    framework
)

# Average the per-page scores for every client/framework pair, most widely used frameworks first.
SELECT
  client,
  framework,
  AVG(performance_score) AS avg_performance_score,
  AVG(accessibility_score) AS avg_accessibility_score,
  AVG(best_practices_score) AS avg_best_practices_score,
  AVG(seo_score) AS avg_seo_score,
  COUNT(DISTINCT page) AS total_pages
FROM
  page_framework_scores
GROUP BY
  client,
  framework
ORDER BY
  total_pages DESC;
56 changes: 56 additions & 0 deletions
56
sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#standardSQL
# Overall Accessibility (A11y) technology, ie. Overlays, usage by domain rank
#
# For every client, page type (is_root_page) and cumulative rank grouping, reports how many pages use
# at least one technology categorised as 'Accessibility' and which share of all pages in that same
# client / page type / rank grouping this represents.

SELECT
  client,
  is_root_page,
  rank_grouping,  # Cumulative rank bucket: a page belongs to every grouping its rank is <= to
  total_in_rank,  # All pages of this client and page type within the rank grouping
  COUNT(DISTINCT page) AS sites_with_a11y_tech,  # Pages using at least one accessibility technology
  COUNT(DISTINCT page) / total_in_rank AS pct_sites_with_a11y_tech  # Share of pages in the grouping using accessibility technology
FROM
  (
    # Pages that use an 'Accessibility' technology, repeated once per rank grouping they fall into.
    SELECT DISTINCT
      client,
      is_root_page,
      page,
      rank_grouping
    FROM
      `httparchive.all.pages`,
      UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping,
      UNNEST(technologies) AS tech,
      UNNEST(tech.categories) AS category
    WHERE
      date = '2024-06-01' AND
      category = 'Accessibility' AND
      rank <= rank_grouping
  )
INNER JOIN
  (
    # Denominator: all pages per client, page type and rank grouping.
    # It must be split by is_root_page as well; otherwise the numerator (which is split by page type)
    # would be divided by a total that mixes root and secondary pages.
    SELECT
      client,
      is_root_page,
      rank_grouping,
      COUNT(0) AS total_in_rank
    FROM
      `httparchive.all.pages`,
      UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
    WHERE
      date = '2024-06-01' AND
      rank <= rank_grouping
    GROUP BY
      client,
      is_root_page,
      rank_grouping
  )
USING (client, is_root_page, rank_grouping)
GROUP BY
  client,
  is_root_page,
  rank_grouping,
  total_in_rank
ORDER BY
  client,
  is_root_page,
  rank_grouping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#standardSQL
# Accessibility (A11y) technology, ie. Overlays, usage by client
#
# Per client and page type (is_root_page): how many distinct pages exist, how many of them use at
# least one technology categorised as 'Accessibility', and the resulting share.

SELECT
  client,
  is_root_page,
  COUNT(DISTINCT page) AS total_sites,  # All distinct pages, including pages with no detected technology
  COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) AS sites_with_a11y_tech,  # Distinct pages using accessibility technology
  COUNT(DISTINCT IF(category = 'Accessibility', page, NULL)) / COUNT(DISTINCT page) AS pct_sites_with_a11y_tech  # Share of pages using accessibility technology
FROM
  `httparchive.all.pages`
# LEFT JOINs (instead of comma cross joins) keep pages whose technologies or categories arrays are
# empty. Such pages get a single row with a NULL category, so they still count towards total_sites
# and the percentage is not inflated.
LEFT JOIN
  UNNEST(technologies) AS tech
LEFT JOIN
  UNNEST(tech.categories) AS category
WHERE
  date = '2024-06-01'  # Crawl date analysed
GROUP BY
  client,
  is_root_page
ORDER BY
  client,
  is_root_page;
71 changes: 71 additions & 0 deletions
71
sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#standardSQL
# A11Y technology usage by domain rank
WITH ranked_sites AS (
  -- Every page of the 2024-06-01 crawl, tagged with the smallest rank bucket that contains its rank.
  -- rank_grouping is NULL when rank is NULL or larger than the biggest bucket.
  -- NOTE(review): these buckets are mutually exclusive (a page with rank <= 1000 is only placed in the
  -- 1000 bucket), whereas the overall-usage-by-rank query uses cumulative `rank <= rank_grouping`
  -- groupings. Confirm the difference is intended.
  SELECT
    client,
    is_root_page,
    page,
    rank,
    technologies,  -- Needed later to list the technologies detected on the page
    (
      SELECT MIN(bucket)
      FROM UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS bucket
      WHERE rank <= bucket
    ) AS rank_grouping
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

rank_totals AS (
  -- Denominator: distinct pages per client, page type and rank bucket.
  SELECT
    client,
    is_root_page,
    rank_grouping,
    COUNT(DISTINCT page) AS total_in_rank
  FROM
    ranked_sites
  GROUP BY
    client,
    is_root_page,
    rank_grouping
)

-- One output row per client, page type, rank bucket and accessibility technology.
SELECT
  site.client,
  site.is_root_page,
  site.rank_grouping,
  totals.total_in_rank,  -- Distinct pages within the rank bucket
  tech.technology AS app,  -- Accessibility technology detected
  COUNT(DISTINCT site.page) AS sites_with_app,  -- Distinct pages using this technology
  SAFE_DIVIDE(COUNT(DISTINCT site.page), totals.total_in_rank) AS pct_sites_with_app  -- Share of the bucket's pages using it
FROM
  ranked_sites AS site
CROSS JOIN
  UNNEST(site.technologies) AS tech  -- One row per technology detected on the page
INNER JOIN
  rank_totals AS totals
ON
  site.client = totals.client AND
  site.is_root_page = totals.is_root_page AND
  site.rank_grouping = totals.rank_grouping  -- Rows with a NULL rank_grouping never match and are dropped
WHERE
  'Accessibility' IN UNNEST(tech.categories)  -- Keep only technologies categorised as accessibility
GROUP BY
  site.client,
  site.is_root_page,
  site.rank_grouping,
  totals.total_in_rank,
  tech.technology
ORDER BY
  app,
  site.rank_grouping,
  site.client,
  site.is_root_page;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#standardSQL
# Alt text ending in an image extension

# Parses the a11y custom metric JSON and returns one {extension, total} struct per entry of
# file_extension_alts.file_extensions. Returns an empty array when the payload cannot be parsed or
# the expected object is missing (any exception is caught).
CREATE TEMPORARY FUNCTION getUsedExtensions(payload STRING)
RETURNS ARRAY<STRUCT<extension STRING, total INT64>> LANGUAGE js AS '''
try {
  const a11y = JSON.parse(payload);
  return Object.entries(a11y.file_extension_alts.file_extensions).map(([extension, total]) => {
    return {extension, total};
  });
} catch (e) {
  return [];
}
''';

# One output row per client, page type and file extension.
# SAFE_DIVIDE is used for every ratio so that an empty denominator yields NULL instead of aborting
# the whole query with a division-by-zero error.
SELECT
  client,
  is_root_page,
  sites_with_non_empty_alt,
  sites_with_file_extension_alt,
  total_alts_with_file_extensions,

  # Of sites with a non-empty alt, what % have an alt with a file extension
  SAFE_DIVIDE(sites_with_file_extension_alt, sites_with_non_empty_alt) AS pct_sites_with_file_extension_alt,
  # Given a random alt, how often will it end in a file extension
  SAFE_DIVIDE(total_alts_with_file_extensions, total_non_empty_alts) AS pct_alts_with_file_extension,

  extension_stat.extension AS extension,
  # Number of sites that have at least one alt ending in this file extension
  COUNT(0) AS total_sites_using,
  # Of sites with a non-empty alt, what % have an alt with this file extension
  SAFE_DIVIDE(COUNT(0), sites_with_non_empty_alt) AS pct_applicable_sites_using,

  # Total number of alts ending in this file extension, summed over all sites
  SUM(extension_stat.total) AS total_occurances,
  # Given a random alt ending in a file extension, how often will it end in this file extension
  SAFE_DIVIDE(SUM(extension_stat.total), total_alts_with_file_extensions) AS pct_total_occurances
FROM
  `httparchive.all.pages`,
  UNNEST(getUsedExtensions(JSON_EXTRACT(custom_metrics, '$.a11y'))) AS extension_stat
LEFT JOIN (
  # Per client and page type totals, used as denominators above.
  SELECT
    client,
    is_root_page,
    COUNTIF(total_non_empty_alt > 0) AS sites_with_non_empty_alt,
    COUNTIF(total_with_file_extension > 0) AS sites_with_file_extension_alt,

    SUM(total_non_empty_alt) AS total_non_empty_alts,
    SUM(total_with_file_extension) AS total_alts_with_file_extensions
  FROM (
    SELECT
      client,
      is_root_page,
      CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.markup.images.img.alt.present') AS INT64) AS total_non_empty_alt,
      CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.a11y.file_extension_alts.total_with_file_extension') AS INT64) AS total_with_file_extension
    FROM
      `httparchive.all.pages`
    WHERE
      date = '2024-06-01'
  )
  GROUP BY
    client,
    is_root_page
) USING (client, is_root_page)
WHERE
  date = '2024-06-01'
GROUP BY
  client,
  is_root_page,
  sites_with_non_empty_alt,
  sites_with_file_extension_alt,
  total_non_empty_alts,
  total_alts_with_file_extensions,
  extension
ORDER BY
  client,
  is_root_page,
  total_occurances DESC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#standardSQL
# Anchors with role='button'
#
# Per client and page type (is_root_page): how many pages contain anchors, how many contain at least
# one anchor with role='button', and the ratio of the two.
SELECT
  client,
  is_root_page,
  COUNTIF(total_anchors > 0) AS sites_with_anchors,
  COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button,

  # Of sites that have anchors... how many have an anchor with a role='button'
  COUNTIF(total_anchors_with_role_button > 0) / COUNTIF(total_anchors > 0) AS pct_sites_with_anchor_role_button
FROM (
  SELECT
    client,
    is_root_page,
    # The inner JSON_EXTRACT_SCALAR returns the `_a11y` / `_element_count` value as a string, which
    # the outer call then parses as JSON again to read the individual counter.
    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.total_anchors_with_role_button') AS INT64) AS total_anchors_with_role_button,
    # A missing anchor count is treated as zero anchors.
    IFNULL(CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._element_count'), '$.a') AS INT64), 0) AS total_anchors
  FROM
    `httparchive.all.pages`
  WHERE
    # Filter on the date directly where the table is read, so the restriction does not depend on the
    # optimizer pushing an outer predicate down and `date` need not be projected.
    date = '2024-06-01'
)
GROUP BY
  client,
  is_root_page;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#standardSQL
# Audio elements track usage
#
# Per client and page type (is_root_page): how many pages contain audio elements, how many of those
# contain audio elements with a track, and the corresponding ratios.
WITH audio_counts AS (
  # One row per page of the 2024-06-01 crawl with its audio counters.
  # The inner JSON_EXTRACT_SCALAR returns the `_almanac` value as a string, which the outer call
  # parses as JSON again to read the counter.
  SELECT
    client,
    is_root_page,
    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.audios.total') AS INT64) AS audio_count,
    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._almanac'), '$.audios.total_with_track') AS INT64) AS audio_with_track_count
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  is_root_page,
  COUNT(0) AS total_sites,
  COUNTIF(audio_count > 0) AS total_with_audio,
  COUNTIF(audio_with_track_count > 0) AS total_with_tracks,

  # Of all audio elements, what share has a track
  SUM(audio_with_track_count) / SUM(audio_count) AS pct_audios_with_tracks,
  # Of all sites, what share has at least one audio element
  COUNTIF(audio_count > 0) / COUNT(0) AS pct_sites_with_audios,
  # Of sites with audio elements, what share has at least one audio element with a track
  COUNTIF(audio_with_track_count > 0) / COUNTIF(audio_count > 0) AS pct_audio_sites_with_tracks
FROM
  audio_counts
GROUP BY
  client,
  is_root_page;
Oops, something went wrong.