Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to new datamodel #209

Merged
merged 26 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sql/.sqlfluff
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,18 @@ templater = jinja
## Comma separated list of rules to check, or None for all
rules = None
## Comma separated list of rules to exclude, or None
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
# AL04 - Asks for unique table aliases, meaning it complains when selecting from two 2021_07_01 tables, because the implicit alias is the table name (not fully qualified) and so is the same for both.
# AL07 - Avoid aliases in from and join - why?
# AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all.
# AM05 - INNER JOIN must be fully qualified. Probably should use this but not our style.
# CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case
# CP03 - Function names will be mixed case so don't enforce case
# CV02 - Use COALESCE instead of IFNULL or NVL. We think IFNULL is clearer.
# CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558
# LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future?
# LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future?
# LT14 - Keywords on newline. We have some simple, single line joins
# RF01 - BigQuery uses STRUCTS which can look like incorrect table references
# RF02 - Asks for qualified columns for ambiguous ones, but we do not qualify our columns, and they are not really ambiguous (or BigQuery would complain)
# RF03 - Insists on references in column names even if not ambiguous. Bit OTT.
Expand Down
3 changes: 3 additions & 0 deletions sql/.sqlfluffignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
/lens/*/crux_histograms.sql
/lens/*/crux_timeseries.sql
/lens/*/histograms.sql
/lens/*/timeseries.sql
193 changes: 95 additions & 98 deletions sql/generate_reports.sh

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions sql/histograms/bootupJs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')) AS FLOAT64) / 100) / 10 AS bin
FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin
FROM
`httparchive.lighthouse.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesCss.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesFont.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesFont / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesHtml.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesHtml / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesImg.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesImg / 102400) * 100 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesJs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesJS / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesOther.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesOther / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesTotal.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesTotal / 102400) * 100 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/bytesVideo.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(bytesVideo / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/compileJs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(JSON_EXTRACT(payload, "$['_cpu.v8.compile']") AS INT64) AS bin
INT64(payload['_cpu.v8.compile']) AS bin
FROM
`httparchive.pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
22 changes: 15 additions & 7 deletions sql/histograms/cruxShopifyThemes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,27 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F
good + needs_improvement + poor > 0
);

-- Test CrUX data exists
WITH crux_test AS ( -- noqa: ST03
SELECT
1
FROM
`chrome-ux-report.all.${YYYYMM}`
),

-- All Shopify shops in HTTPArchive
WITH archive_pages AS (
archive_pages AS (
SELECT
client,
page AS url,
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') AS theme_name,
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.theme_store_id') AS theme_store_id
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name,
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id
FROM
`httparchive.all.pages`
`httparchive.crawl.pages`
WHERE
date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
date = '${YYYY-MM-DD}' AND
is_root_page AND
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') IS NOT NULL --first grab all shops for market share
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share
)

SELECT
Expand Down Expand Up @@ -176,7 +184,7 @@ JOIN (
-- Include null theme store ids so that we can get full market share within CrUX
ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A')
WHERE
date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
date = '${YYYY-MM-DD}' AND
theme_names.rank = 1
GROUP BY
client,
Expand Down
10 changes: 6 additions & 4 deletions sql/histograms/dcl.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
FLOOR(onContentLoaded / 1000) AS bin
FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin
FROM
`httparchive.summary_pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
onContentLoaded > 0
date = '${YYYY-MM-DD}' AND
is_root_page AND
FLOAT64(summary.onContentLoaded) > 0
GROUP BY
bin,
client
Expand Down
12 changes: 9 additions & 3 deletions sql/histograms/evalJs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(CAST(JSON_EXTRACT(payload, "$['_cpu.EvaluateScript']") AS FLOAT64) / 20 AS INT64) * 20 AS bin
CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
FROM
`httparchive.requests.${YYYY_MM_DD}_*`
`httparchive.crawl.requests` r
INNER JOIN
`httparchive.crawl.pages`
USING (date, client, is_root_page, rank, page)
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/fcp.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64) / 1000) AS INT64) AS bin
CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin
FROM
`httparchive.pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/gzipSavings.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64) / (1024 * 2)) * 2 AS INT64) AS bin
CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin
FROM
`httparchive.pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
11 changes: 5 additions & 6 deletions sql/histograms/htmlElementPopularity.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,25 @@ SELECT
COUNT(DISTINCT root_page) / total AS pct,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls
FROM
`httparchive.all.pages`
`httparchive.crawl.pages`
JOIN
(
SELECT
date,
client,
COUNT(DISTINCT root_page) AS total
FROM
`httparchive.all.pages`
`httparchive.crawl.pages`
WHERE
date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
rank = 1000
date = '${YYYY-MM-DD}'
GROUP BY
date,
client
)
USING (date, client),
UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element
UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element
WHERE
date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}')
date = '${YYYY-MM-DD}'
GROUP BY
client,
total,
Expand Down
9 changes: 6 additions & 3 deletions sql/histograms/imgSavings.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64) / (1024 * 10)) * 10 AS INT64) AS bin
CAST(FLOOR(FLOAT64(payload._image_savings) / (1024 * 10)) * 10 AS INT64) AS bin
FROM
`httparchive.pages.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
13 changes: 10 additions & 3 deletions sql/histograms/offscreenImages.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,18 @@ FROM (
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
FROM (
SELECT
_TABLE_SUFFIX AS client,
client,
COUNT(0) AS volume,
CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin
CAST(FLOOR(IFNULL(
INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes),
INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024
) / 10240) * 10 AS INT64) AS bin
FROM
`httparchive.lighthouse.${YYYY_MM_DD}_*`
`httparchive.crawl.pages`
WHERE
date >= '2022-03-01' AND
date = '${YYYY-MM-DD}' AND
is_root_page
GROUP BY
bin,
client
Expand Down
Loading