Skip to content

Commit

Permalink
Sustainability 2024: Queries (#3736)
Browse files Browse the repository at this point in the history
* Update unminified_css bytes query

* Update unused_css_bytes query

* Update unused_js_bytes query

* Update unminified_js_bytes query

* Update cache_header_usage query

* Update cdn_adoption query

* Update cms_bytes_per_type query

* Update ssg_bytes_per_type query

* Update ecommerce_bytes_per_type query

* Add use of prefers_dark_mode query

* Update script usage query

* Update stylesheet_count query

* Updated for new CO2 calculation

* Update ecommerce with new co2 calc

* Update Cms with new co2 calc

* Add green hosting query

* Create favicons.sql

Adding based on Laurent Devernay comment in Slack.

* Add green third party query

* Is root page updates

* Update stylesheet count query with root_page filter

* Filter root page for comparability

* Create query_run_size.sql

Adding in a query to track the size of the query.

* Update query_run_size.sql

Updating the docs

* Create global_emissions_per_page.sql, page_byte_pre_type.sql, responsive_images.sql, text_compression.sql

* Fix linter issues for recently added SQL queries

* run sqlfluff fix

* add video_autoplay_values.sql, video_preload_values.sql

* fix linter errors

* Remove Tablesample mistake

* add 2022 queries

* Apply suggestions from code review

---------

Co-authored-by: Mike Gifford <[email protected]>
Co-authored-by: Burak Güneli <[email protected]>
Co-authored-by: Rafael Bonalume Lebre <[email protected]>
Co-authored-by: Barry Pollard <[email protected]>
  • Loading branch information
5 people authored Nov 4, 2024
1 parent 324d22b commit 6f4be9c
Show file tree
Hide file tree
Showing 26 changed files with 1,942 additions and 0 deletions.
50 changes: 50 additions & 0 deletions sql/2024/sustainability/cache_header_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#standardSQL
# The distribution of cache header adoption on websites by client.
#
# For every request in the 2024-06-01 crawl, flags whether the response carried
# a non-blank `Expires` header, a non-blank `Cache-Control` header, and a
# `Cache-Control` header containing a `max-age=<digits>` directive, then reports
# per-client counts and shares of each combination.

WITH request_cache_headers AS (
  SELECT
    client,

    -- A header counts as "used" only when it is present and not blank.
    JSON_EXTRACT_SCALAR(summary, '$.resp_expires') IS NOT NULL AND TRIM(JSON_EXTRACT_SCALAR(summary, '$.resp_expires')) != '' AS uses_expires,
    JSON_EXTRACT_SCALAR(summary, '$.resp_cache_control') IS NOT NULL AND TRIM(JSON_EXTRACT_SCALAR(summary, '$.resp_cache_control')) != '' AS uses_cache_control,

    -- NULL when the header is absent; COUNTIF treats NULL like FALSE, so such
    -- requests are simply not counted as using max-age.
    REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(summary, '$.resp_cache_control'), r'(?i)max-age\s*=\s*[0-9]+') AS uses_max_age
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  COUNT(0) AS total_requests,

  -- Absolute counts
  COUNTIF(uses_cache_control) AS total_using_cache_control,
  COUNTIF(uses_max_age) AS total_using_max_age,
  COUNTIF(uses_expires) AS total_using_expires,
  COUNTIF(uses_max_age AND uses_expires) AS total_using_max_age_and_expires,
  COUNTIF(uses_cache_control AND uses_expires) AS total_using_both_cc_and_expires,
  -- Column name kept as-is ("neither ... and") so existing consumers of the result keep working.
  COUNTIF(NOT uses_cache_control AND NOT uses_expires) AS total_using_neither_cc_and_expires,
  COUNTIF(uses_cache_control AND NOT uses_expires) AS total_using_only_cache_control,
  COUNTIF(NOT uses_cache_control AND uses_expires) AS total_using_only_expires,

  -- Shares of all requests for the client
  COUNTIF(uses_cache_control) / COUNT(0) AS pct_cache_control,
  COUNTIF(uses_max_age) / COUNT(0) AS pct_using_max_age,
  COUNTIF(uses_expires) / COUNT(0) AS pct_using_expires,
  COUNTIF(uses_max_age AND uses_expires) / COUNT(0) AS pct_using_max_age_and_expires,
  COUNTIF(uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_both_cc_and_expires,
  COUNTIF(NOT uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_neither_cc_nor_expires,
  COUNTIF(uses_cache_control AND NOT uses_expires) / COUNT(0) AS pct_using_only_cache_control,
  COUNTIF(NOT uses_cache_control AND uses_expires) / COUNT(0) AS pct_using_only_expires
FROM
  request_cache_headers
GROUP BY
  client
ORDER BY
  client;
31 changes: 31 additions & 0 deletions sql/2024/sustainability/cdn_adoption.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#standardSQL
# The distribution of CDN adoption on websites by client.
#
# `pct` is the number of CDN entries with a given name divided by the number of
# root pages for that client. The `$.cdn` summary field is split on ', ', so a
# page listing several CDNs contributes one entry per CDN.

WITH client_cdn_lists AS (
  -- Per client: root-page count plus the concatenation of every page's CDN list.
  SELECT
    client,
    COUNT(0) AS total,
    ARRAY_CONCAT_AGG(SPLIT(JSON_EXTRACT_SCALAR(summary, '$.cdn'), ', ')) AS cdn_list
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01' AND
    is_root_page = TRUE
  GROUP BY
    client
)

SELECT
  client,
  -- An empty CDN name is reported under the label 'No CDN'.
  IF(cdn = '', 'No CDN', cdn) AS cdn,
  COUNT(0) AS freq,
  total,
  COUNT(0) / total AS pct
FROM
  client_cdn_lists
CROSS JOIN
  UNNEST(cdn_list) AS cdn
GROUP BY
  client,
  cdn,
  total
ORDER BY
  pct DESC,
  client,
  cdn;
162 changes: 162 additions & 0 deletions sql/2024/sustainability/cms_bytes_per_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#standardSQL
# Median resource weights and estimated emissions by CMS
#
# Declare variables to calculate the carbon emissions of one byte
# Source: https://sustainablewebdesign.org/calculating-digital-emissions/
# The implementation below does not make the assumptions about returning visitors or caching that are present in the Sustainable Web Design model.

# Multiplied into every segment below (presumably gCO2e per kWh - confirm against the source above).
DECLARE grid_intensity NUMERIC DEFAULT 494;
# Per-segment factors applied per GB transferred (presumably kWh per GB - confirm against the source above).
DECLARE embodied_emissions_data_centers NUMERIC DEFAULT 0.012;
DECLARE embodied_emissions_network NUMERIC DEFAULT 0.013;
DECLARE embodied_emissions_user_devices NUMERIC DEFAULT 0.081;
DECLARE operational_emissions_data_centers NUMERIC DEFAULT 0.055;
DECLARE operational_emissions_network NUMERIC DEFAULT 0.059;
DECLARE operational_emissions_user_devices NUMERIC DEFAULT 0.080;

WITH page_bytes AS (
  -- One row per root page and CMS technology detected on it.
  -- Byte counters are parsed out of `summary` here so later steps can reuse them.
  SELECT
    client,
    page,
    tech.technology AS cms,
    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytes_total,
    CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) / 1024 / 1024 / 1024 AS total_gb,
    CAST(JSON_VALUE(summary, '$.bytesHtml') AS INT64) AS bytes_html,
    CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytes_js,
    CAST(JSON_VALUE(summary, '$.bytesCss') AS INT64) AS bytes_css,
    CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytes_img,
    CAST(JSON_VALUE(summary, '$.bytesFont') AS INT64) AS bytes_font
  FROM
    `httparchive.all.pages`,
    UNNEST(technologies) AS tech
  WHERE
    date = '2024-06-01' AND
    is_root_page = TRUE AND
    'CMS' IN UNNEST(tech.categories)
),

segment_emissions AS (
  -- Page-level emissions per system segment: GB transferred * segment factor * grid intensity.
  SELECT
    client,
    page,
    cms,
    bytes_total,
    bytes_html,
    bytes_js,
    bytes_css,
    bytes_img,
    bytes_font,
    total_gb,

    -- Operational emissions calculations
    total_gb * operational_emissions_data_centers * grid_intensity AS op_emissions_dc,
    total_gb * operational_emissions_network * grid_intensity AS op_emissions_networks,
    total_gb * operational_emissions_user_devices * grid_intensity AS op_emissions_devices,

    -- Embodied emissions calculations
    total_gb * embodied_emissions_data_centers * grid_intensity AS em_emissions_dc,
    total_gb * embodied_emissions_network * grid_intensity AS em_emissions_networks,
    total_gb * embodied_emissions_user_devices * grid_intensity AS em_emissions_devices,

    -- Combined factor (all six segments) applied to one GB; used for the per-resource split.
    (
      operational_emissions_data_centers * grid_intensity +
      operational_emissions_network * grid_intensity +
      operational_emissions_user_devices * grid_intensity +
      embodied_emissions_data_centers * grid_intensity +
      embodied_emissions_network * grid_intensity +
      embodied_emissions_user_devices * grid_intensity
    ) AS emissions_per_gb
  FROM
    page_bytes
),

cms_data AS (
  SELECT
    client,
    page,
    cms,
    bytes_total / 1024 AS total_kb,

    op_emissions_dc,
    op_emissions_networks,
    op_emissions_devices,
    em_emissions_dc,
    em_emissions_networks,
    em_emissions_devices,

    -- Total emissions (operational + embodied)
    op_emissions_dc + op_emissions_networks + op_emissions_devices AS total_operational_emissions,
    em_emissions_dc + em_emissions_networks + em_emissions_devices AS total_embodied_emissions,
    op_emissions_dc + op_emissions_networks + op_emissions_devices +
    em_emissions_dc + em_emissions_networks + em_emissions_devices AS total_emissions,

    -- Proportions of each resource type relative to total bytes.
    -- SAFE_DIVIDE returns NULL instead of failing the query when a page reports 0 total bytes.
    SAFE_DIVIDE(bytes_html, bytes_total) AS html_proportion,
    SAFE_DIVIDE(bytes_js, bytes_total) AS js_proportion,
    SAFE_DIVIDE(bytes_css, bytes_total) AS css_proportion,
    SAFE_DIVIDE(bytes_img, bytes_total) AS img_proportion,
    SAFE_DIVIDE(bytes_font, bytes_total) AS font_proportion,

    -- Resource-specific emissions: the resource's share of bytes times the page's total emissions
    SAFE_DIVIDE(bytes_html, bytes_total) * (total_gb * emissions_per_gb) AS total_html_emissions,
    SAFE_DIVIDE(bytes_js, bytes_total) * (total_gb * emissions_per_gb) AS total_js_emissions,
    SAFE_DIVIDE(bytes_css, bytes_total) * (total_gb * emissions_per_gb) AS total_css_emissions,
    SAFE_DIVIDE(bytes_img, bytes_total) * (total_gb * emissions_per_gb) AS total_img_emissions,
    SAFE_DIVIDE(bytes_font, bytes_total) * (total_gb * emissions_per_gb) AS total_font_emissions,

    -- Resource-specific size in KB
    bytes_html / 1024 AS html_kb,
    bytes_js / 1024 AS js_kb,
    bytes_css / 1024 AS css_kb,
    bytes_img / 1024 AS img_kb,
    bytes_font / 1024 AS font_kb
  FROM
    segment_emissions
)

SELECT
  client,
  cms,
  COUNT(0) AS pages,

  -- Median resource weights and emissions (element 500 of 1000 approximate quantiles)
  APPROX_QUANTILES(total_kb, 1000)[OFFSET(500)] AS median_total_kb,
  APPROX_QUANTILES(total_operational_emissions, 1000)[OFFSET(500)] AS median_operational_emissions,
  APPROX_QUANTILES(total_embodied_emissions, 1000)[OFFSET(500)] AS median_embodied_emissions,
  APPROX_QUANTILES(total_emissions, 1000)[OFFSET(500)] AS median_total_emissions,

  -- Resource-specific medians
  APPROX_QUANTILES(html_kb, 1000)[OFFSET(500)] AS median_html_kb,
  APPROX_QUANTILES(total_html_emissions, 1000)[OFFSET(500)] AS median_total_html_emissions,
  APPROX_QUANTILES(js_kb, 1000)[OFFSET(500)] AS median_js_kb,
  APPROX_QUANTILES(total_js_emissions, 1000)[OFFSET(500)] AS median_total_js_emissions,
  APPROX_QUANTILES(css_kb, 1000)[OFFSET(500)] AS median_css_kb,
  APPROX_QUANTILES(total_css_emissions, 1000)[OFFSET(500)] AS median_total_css_emissions,
  APPROX_QUANTILES(img_kb, 1000)[OFFSET(500)] AS median_img_kb,
  APPROX_QUANTILES(total_img_emissions, 1000)[OFFSET(500)] AS median_total_img_emissions,
  APPROX_QUANTILES(font_kb, 1000)[OFFSET(500)] AS median_font_kb,
  APPROX_QUANTILES(total_font_emissions, 1000)[OFFSET(500)] AS median_total_font_emissions
FROM
  cms_data
GROUP BY
  client,
  cms
ORDER BY
  pages DESC,
  cms,
  client;
Loading

0 comments on commit 6f4be9c

Please sign in to comment.