Skip to content

Commit

Permalink
Merge branch 'privacy-markdown-2024' of https://github.com/HTTPArchiv…
Browse files Browse the repository at this point in the history
…e/almanac.httparchive.org into privacy-markdown-2024
  • Loading branch information
max-ostapenko committed Nov 4, 2024
2 parents 781d0af + 4a19dc1 commit 805ea9c
Show file tree
Hide file tree
Showing 520 changed files with 17,778 additions and 491 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-static-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
uses: actions/checkout@v4
- name: Set up Python 3.12
if: ${{ matrix.language == 'python' }}
uses: actions/setup-python@v5.2.0
uses: actions/setup-python@v5.3.0
with:
python-version: '3.12'
- name: Install dependencies
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lintsql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
# Full git history is needed to get a proper list of changed files within `super-linter`
fetch-depth: 0
- name: Set up Python 3.12
uses: actions/setup-python@v5.2.0
uses: actions/setup-python@v5.3.0
with:
python-version: '3.12'
- name: Lint SQL code
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/predeploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
with:
node-version: '20'
- name: Set up Python 3.12
uses: actions/setup-python@v5.2.0
uses: actions/setup-python@v5.3.0
with:
python-version: '3.12'
- name: Install Asian Fonts
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_website.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
with:
node-version: '20'
- name: Set up Python 3.12
uses: actions/setup-python@v5.2.0
uses: actions/setup-python@v5.3.0
with:
python-version: '3.12'
- name: Run the website
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ We do almost all of our project planning here on GitHub. Browse the open [issues

We also have a [`#web-almanac`](https://join.slack.com/t/httparchive/shared_invite/zt-45sgwmnb-eDEatOhqssqNAKxxOSLAaA) channel on the HTTP Archive Slack where we chat about project updates.

For news and announcements, follow [@HTTPArchive](https://twitter.com/HTTPArchive) on Twitter.
For news and announcements, follow [@HTTPArchive](https://x.com/HTTPArchive) on X (formerly Twitter).

## [License](https://github.com/HTTPArchive/almanac.httparchive.org/blob/main/LICENSE)

Expand Down
50 changes: 50 additions & 0 deletions sql/2024/accessibility/a11y_frontend_technology.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Average Lighthouse category scores per frontend technology and client.

# One row per (page, detected technology): the four Lighthouse category scores
# of each root page, repeated for every technology that belongs to at least one
# of the frontend-related categories listed below.
WITH page_frameworks AS (
  SELECT
    client,
    page,
    tech.technology AS framework,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.performance.score') AS FLOAT64) AS performance_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.accessibility.score') AS FLOAT64) AS accessibility_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.best-practices.score') AS FLOAT64) AS best_practices_score,
    CAST(JSON_EXTRACT_SCALAR(lighthouse, '$.categories.seo.score') AS FLOAT64) AS seo_score
  FROM
    `httparchive.all.pages`
  CROSS JOIN
    UNNEST(technologies) AS tech
  WHERE
    date = '2024-06-01' AND
    is_root_page AND
    # Skip pages without a usable Lighthouse report
    lighthouse IS NOT NULL AND
    lighthouse != '{}' AND
    tech.technology IS NOT NULL AND
    EXISTS (
      SELECT 1
      FROM UNNEST(tech.categories) AS category
      WHERE category IN ('Web frameworks', 'JavaScript libraries', 'Frontend frameworks', 'JavaScript frameworks')
    )
),

# Collapse to exactly one row per (client, page, framework).
# All scores are the same for one page (multiple rows only come from the unnest),
# so the average is just a way of picking that single value.
per_page_scores AS (
  SELECT
    client,
    page,
    framework,
    AVG(performance_score) AS performance_score,
    AVG(accessibility_score) AS accessibility_score,
    AVG(best_practices_score) AS best_practices_score,
    AVG(seo_score) AS seo_score
  FROM
    page_frameworks
  GROUP BY
    client,
    page,
    framework
)

# Average the per-page scores over all pages using each framework.
SELECT
  client,
  framework,
  AVG(performance_score) AS avg_performance_score,
  AVG(accessibility_score) AS avg_accessibility_score,
  AVG(best_practices_score) AS avg_best_practices_score,
  AVG(seo_score) AS avg_seo_score,
  COUNT(DISTINCT page) AS total_pages # Number of pages on which the framework was detected
FROM
  per_page_scores
GROUP BY
  client,
  framework
ORDER BY
  total_pages DESC;
56 changes: 56 additions & 0 deletions sql/2024/accessibility/a11y_overall_tech_usage_by_domain_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#standardSQL
# Overall Accessibility (A11y) technology, ie. Overlays, usage by domain rank

# Aggregates by client, root/secondary page and rank grouping.
# Rank groupings are cumulative: a page is counted in every grouping whose
# threshold is >= its rank (e.g. a rank-1000 page is also in the 10000 grouping).
SELECT
  client,
  is_root_page,
  rank_grouping, # Grouping of domains by their rank (e.g., top 1000, top 10000, etc.)
  total_in_rank, # Total number of pages of the same kind (root / non-root) within the rank grouping
  COUNT(DISTINCT page) AS sites_with_a11y_tech, # Number of unique pages that use accessibility technology
  COUNT(DISTINCT page) / total_in_rank AS pct_sites_with_a11y_tech # Share of pages using accessibility technology within the rank grouping
FROM
  (
    # Pages on which at least one technology in the 'Accessibility' category was detected,
    # one row per (page, rank grouping the page falls into)
    SELECT DISTINCT
      client,
      is_root_page,
      page,
      rank_grouping
    FROM
      `httparchive.all.pages`,
      UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping, # Expand each page to every candidate rank grouping
      UNNEST(technologies) AS tech,
      UNNEST(tech.categories) AS category
    WHERE
      date = '2024-06-01' AND
      category = 'Accessibility' AND
      rank <= rank_grouping # Keep only the groupings the page actually belongs to
  )
JOIN
  (
    # Denominator: total pages in each rank grouping.
    # It must be split by is_root_page exactly like the numerator; otherwise every
    # percentage would be divided by the combined root + secondary page count.
    SELECT
      client,
      is_root_page,
      rank_grouping,
      COUNT(0) AS total_in_rank
    FROM
      `httparchive.all.pages`,
      UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
    WHERE
      date = '2024-06-01' AND
      rank <= rank_grouping
    GROUP BY
      client,
      is_root_page,
      rank_grouping
  ) USING (client, is_root_page, rank_grouping)
GROUP BY
  client,
  is_root_page,
  rank_grouping,
  total_in_rank
ORDER BY
  client,
  is_root_page,
  rank_grouping;
21 changes: 21 additions & 0 deletions sql/2024/accessibility/a11y_technology_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#standardSQL
# Accessibility (A11y) technology, ie. Overlays, usage by client

# One row per crawled page with a flag telling whether any detected technology
# is in the 'Accessibility' category.
# The flag is computed with EXISTS instead of joining against
# UNNEST(technologies) / UNNEST(categories): a comma (cross) join with UNNEST
# drops every page whose array is empty, which would remove pages without any
# detected technology from the total_sites denominator and inflate the percentage.
WITH page_flags AS (
  SELECT
    client,
    is_root_page,
    page,
    EXISTS (
      SELECT 1
      FROM UNNEST(technologies) AS tech
      WHERE 'Accessibility' IN UNNEST(tech.categories)
    ) AS has_a11y_tech
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01' # Specific date for data extraction
)

SELECT
  client, # Crawl client the page was tested with (presumably desktop / mobile - confirm against the table schema)
  is_root_page,
  COUNT(DISTINCT page) AS total_sites, # Total number of unique pages, with or without detected technologies
  COUNT(DISTINCT IF(has_a11y_tech, page, NULL)) AS sites_with_a11y_tech, # Number of unique pages that use accessibility technology
  COUNT(DISTINCT IF(has_a11y_tech, page, NULL)) / COUNT(DISTINCT page) AS pct_sites_with_a11y_tech # Share of pages using accessibility technology
FROM
  page_flags
GROUP BY
  client,
  is_root_page
ORDER BY
  client,
  is_root_page;
71 changes: 71 additions & 0 deletions sql/2024/accessibility/a11y_technology_usage_by_domain_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#standardSQL
# A11Y technology usage by domain rank

WITH bucketed_pages AS (
  -- Label every page of the crawl with its rank bucket.
  -- The CASE stops at the first matching branch, so buckets are disjoint:
  -- a page with rank <= 1000 is only in bucket 1000, not also in 10000.
  -- Pages with a NULL rank or a rank above 100000000 get a NULL bucket.
  -- NOTE(review): other rank queries may use cumulative groupings - confirm this difference is intended.
  SELECT
    client,
    is_root_page,
    page,
    technologies,
    CASE
      WHEN rank <= 1000 THEN 1000
      WHEN rank <= 10000 THEN 10000
      WHEN rank <= 100000 THEN 100000
      WHEN rank <= 1000000 THEN 1000000
      WHEN rank <= 10000000 THEN 10000000
      WHEN rank <= 100000000 THEN 100000000
    END AS rank_grouping
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

bucket_sizes AS (
  -- Denominator: number of unique pages per client / page kind / rank bucket
  SELECT
    client,
    is_root_page,
    rank_grouping,
    COUNT(DISTINCT page) AS total_in_rank
  FROM
    bucketed_pages
  GROUP BY
    client,
    is_root_page,
    rank_grouping
),

a11y_apps AS (
  -- One row per (page, technology) for technologies categorised as 'Accessibility'
  SELECT
    client,
    is_root_page,
    rank_grouping,
    page,
    tech.technology AS app
  FROM
    bucketed_pages
  CROSS JOIN
    UNNEST(technologies) AS tech
  WHERE
    'Accessibility' IN UNNEST(tech.categories)
)

SELECT
  apps.client,
  apps.is_root_page,
  apps.rank_grouping,
  sizes.total_in_rank, -- Total number of unique pages within the rank bucket
  apps.app, -- Accessibility technology used
  COUNT(DISTINCT apps.page) AS sites_with_app, -- Number of pages using this accessibility technology
  SAFE_DIVIDE(COUNT(DISTINCT apps.page), sizes.total_in_rank) AS pct_sites_with_app -- Share of the bucket using this technology
FROM
  a11y_apps AS apps
INNER JOIN
  bucket_sizes AS sizes -- Equality join: rows with a NULL rank bucket never match and are dropped
ON
  apps.client = sizes.client AND
  apps.is_root_page = sizes.is_root_page AND
  apps.rank_grouping = sizes.rank_grouping
GROUP BY
  apps.client,
  apps.is_root_page,
  apps.rank_grouping,
  sizes.total_in_rank,
  apps.app
ORDER BY
  apps.app,
  apps.rank_grouping,
  apps.client,
  apps.is_root_page;
76 changes: 76 additions & 0 deletions sql/2024/accessibility/alt_ending_in_image_extension.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#standardSQL
# Alt text ending in an image extension

# Turns the `file_extension_alts.file_extensions` object of the a11y custom metric
# (extension -> number of alts ending in it) into an array of structs.
# Any parse failure or missing key yields an empty array.
CREATE TEMPORARY FUNCTION getUsedExtensions(payload STRING)
RETURNS ARRAY<STRUCT<extension STRING, total INT64>> LANGUAGE js AS '''
try {
  const extensions = JSON.parse(payload).file_extension_alts.file_extensions;
  return Object.entries(extensions).map(([extension, total]) => ({extension, total}));
} catch (e) {
  return [];
}
''';

WITH alt_counts AS (
  # Per page: number of non-empty alts and number of alts ending in a file extension
  SELECT
    client,
    is_root_page,
    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.markup.images.img.alt.present') AS INT64) AS total_non_empty_alt,
    CAST(JSON_EXTRACT_SCALAR(custom_metrics, '$.a11y.file_extension_alts.total_with_file_extension') AS INT64) AS total_with_file_extension
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

alt_totals AS (
  # Per client / page kind: the denominators shared by every extension row
  SELECT
    client,
    is_root_page,
    COUNTIF(total_non_empty_alt > 0) AS sites_with_non_empty_alt,
    COUNTIF(total_with_file_extension > 0) AS sites_with_file_extension_alt,
    SUM(total_non_empty_alt) AS total_non_empty_alts,
    SUM(total_with_file_extension) AS total_alts_with_file_extensions
  FROM
    alt_counts
  GROUP BY
    client,
    is_root_page
),

extension_usage AS (
  # One row per (page, file extension found at the end of one of its alts)
  SELECT
    client,
    is_root_page,
    extension_stat.extension AS extension,
    extension_stat.total AS total
  FROM
    `httparchive.all.pages`
  CROSS JOIN
    UNNEST(getUsedExtensions(JSON_EXTRACT(custom_metrics, '$.a11y'))) AS extension_stat
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  is_root_page,
  sites_with_non_empty_alt,
  sites_with_file_extension_alt,
  total_alts_with_file_extensions,

  # Of sites with a non-empty alt, what % have an alt with a file extension
  sites_with_file_extension_alt / sites_with_non_empty_alt AS pct_sites_with_file_extension_alt,
  # Given a random alt, how often will it end in a file extension
  total_alts_with_file_extensions / total_non_empty_alts AS pct_alts_with_file_extension,

  extension,
  # Number of sites with at least one alt ending in this file extension
  COUNT(0) AS total_sites_using,
  # Of sites with a non-empty alt, what % have an alt with this file extension
  COUNT(0) / sites_with_non_empty_alt AS pct_applicable_sites_using,

  # Total number of alts ending in this file extension
  SUM(total) AS total_occurances,
  # Given a random alt ending in a file extension, how often will it end in this file extension
  SUM(total) / total_alts_with_file_extensions AS pct_total_occurances
FROM
  extension_usage
LEFT JOIN
  alt_totals
USING (client, is_root_page)
GROUP BY
  client,
  is_root_page,
  sites_with_non_empty_alt,
  sites_with_file_extension_alt,
  total_non_empty_alts,
  total_alts_with_file_extensions,
  extension
ORDER BY
  client,
  is_root_page,
  total_occurances DESC
25 changes: 25 additions & 0 deletions sql/2024/accessibility/anchors_with_role_button.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#standardSQL
# Anchors with role='button'

# Per page: number of anchors and number of anchors carrying role='button'.
# Both metrics are read with two nested JSON_EXTRACT_SCALAR calls, i.e. `_a11y`
# and `_element_count` are treated as JSON-encoded strings inside `payload`.
# The date filter sits directly on the table scan so the query does not depend
# on the predicate being pushed down through the subquery, and no `date`
# column has to be carried along just to be filtered on.
WITH anchor_counts AS (
  SELECT
    client,
    is_root_page,
    # NULL when the metric is missing; COUNTIF below treats NULL as "no such anchors"
    CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._a11y'), '$.total_anchors_with_role_button') AS INT64) AS total_anchors_with_role_button,
    IFNULL(CAST(JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(payload, '$._element_count'), '$.a') AS INT64), 0) AS total_anchors
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  is_root_page,
  COUNTIF(total_anchors > 0) AS sites_with_anchors,
  COUNTIF(total_anchors_with_role_button > 0) AS sites_with_anchor_role_button,

  # Of sites that have anchors... how many have an anchor with a role='button'
  COUNTIF(total_anchors_with_role_button > 0) / COUNTIF(total_anchors > 0) AS pct_sites_with_anchor_role_button
FROM
  anchor_counts
GROUP BY
  client,
  is_root_page;
27 changes: 27 additions & 0 deletions sql/2024/accessibility/audio_track_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#standardSQL
# Audio elements track usage

# `_almanac` is pulled out of `payload` as a scalar and then queried as JSON again,
# i.e. it is treated as a JSON-encoded string inside the payload.
WITH almanac_metrics AS (
  SELECT
    client,
    is_root_page,
    JSON_EXTRACT_SCALAR(payload, '$._almanac') AS almanac
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
),

# Per page: number of <audio> elements and how many of them have a track
audio_counts AS (
  SELECT
    client,
    is_root_page,
    CAST(JSON_EXTRACT_SCALAR(almanac, '$.audios.total') AS INT64) AS total_audios,
    CAST(JSON_EXTRACT_SCALAR(almanac, '$.audios.total_with_track') AS INT64) AS total_with_track
  FROM
    almanac_metrics
)

SELECT
  client,
  is_root_page,
  COUNT(0) AS total_sites,
  COUNTIF(total_audios > 0) AS total_with_audio,
  COUNTIF(total_with_track > 0) AS total_with_tracks,

  # Of all audio elements, what share has a track
  SUM(total_with_track) / SUM(total_audios) AS pct_audios_with_tracks,
  # Of all sites, what share has at least one audio element
  COUNTIF(total_audios > 0) / COUNT(0) AS pct_sites_with_audios,
  # Of sites with audio, what share has at least one audio element with a track
  COUNTIF(total_with_track > 0) / COUNTIF(total_audios > 0) AS pct_audio_sites_with_tracks
FROM
  audio_counts
GROUP BY
  client,
  is_root_page;
Loading

0 comments on commit 805ea9c

Please sign in to comment.