From a1f3cddc4f653d2cd3b9392aa87b313c24dbc25f Mon Sep 17 00:00:00 2001
From: Barry Pollard
Date: Mon, 13 Jan 2025 11:38:21 +0000
Subject: [PATCH 01/25] Upgrade to new datamodel

---
 sql/addDate.js                           |  4 +-
 sql/delete_date_from_reports.sh          | 16 ++---
 sql/generate_reports.sh                  | 79 ++++++++++++++----------
 sql/histograms/bootupJs.sql              | 13 ++--
 sql/histograms/bytesCss.sql              |  9 ++-
 sql/histograms/bytesFont.sql             |  9 ++-
 sql/histograms/bytesHtml.sql             |  9 ++-
 sql/histograms/bytesImg.sql              |  9 ++-
 sql/histograms/bytesJs.sql               |  9 ++-
 sql/histograms/bytesOther.sql            |  9 ++-
 sql/histograms/bytesTotal.sql            |  9 ++-
 sql/histograms/bytesVideo.sql            |  9 ++-
 sql/histograms/compileJs.sql             |  9 ++-
 sql/histograms/dcl.sql                   | 10 +--
 sql/histograms/evalJs.sql                |  9 ++-
 sql/histograms/fcp.sql                   |  9 ++-
 sql/histograms/gzipSavings.sql           |  9 ++-
 sql/histograms/htmlElementPopularity.sql | 11 ++--
 sql/histograms/imgSavings.sql            |  9 ++-
 sql/histograms/offscreenImages.sql       | 10 ++-
 sql/histograms/ol.sql                    | 10 +--
 sql/histograms/optimizedImages.sql       | 31 ++++++----
 sql/histograms/reqCss.sql                |  9 ++-
 sql/histograms/reqFont.sql               |  9 ++-
 sql/histograms/reqHtml.sql               |  7 ++-
 sql/histograms/reqImg.sql                |  7 ++-
 sql/histograms/reqJs.sql                 |  9 ++-
 sql/histograms/reqOther.sql              |  9 ++-
 sql/histograms/reqTotal.sql              |  9 ++-
 sql/histograms/reqVideo.sql              |  9 ++-
 sql/histograms/speedIndex.sql            |  9 ++-
 sql/histograms/tcp.sql                   | 10 +--
 sql/histograms/ttci.sql                  | 10 ++-
 sql/histograms/vulnJs.sql                | 39 ------------
 sql/lens/drupal/crux_histograms.sql      | 14 ++---
 sql/lens/drupal/histograms.sql           |  9 +--
 sql/lens/magento/crux_histograms.sql     | 12 ++--
 sql/lens/magento/histograms.sql          |  9 +--
 sql/lens/top100k/histograms.sql          |  6 +-
 sql/lens/top10k/histograms.sql           |  6 +-
 sql/lens/top10k/timeseries.sql           | 11 ++--
 sql/lens/top1k/histograms.sql            |  6 +-
 sql/lens/top1k/timeseries.sql            | 11 ++--
 sql/lens/top1m/histograms.sql            |  6 +-
 sql/lens/top1m/timeseries.sql            | 11 ++--
 sql/lens/wordpress/crux_histograms.sql   | 12 ++--
 sql/lens/wordpress/histograms.sql        |  9 +--
 sql/timeseries/a11yButtonName.sql        | 17 +++--
 sql/timeseries/a11yColorContrast.sql     | 17 +++--
 sql/timeseries/a11yImageAlt.sql          | 17 +++--
 sql/timeseries/a11yLabel.sql             | 17 +++--
 sql/timeseries/a11yLinkName.sql          | 17 +++--
 sql/timeseries/a11yScores.sql            | 20 +++---
 sql/timeseries/bootupJs.sql              | 25 +++++---
 sql/timeseries/bytesCss.sql              | 22 ++++---
 sql/timeseries/bytesFont.sql             | 22 ++++---
 sql/timeseries/bytesHtml.sql             | 22 ++++---
 sql/timeseries/bytesImg.sql              | 22 ++++---
 sql/timeseries/bytesJs.sql               | 22 ++++---
 sql/timeseries/bytesOther.sql            | 22 ++++---
 sql/timeseries/bytesTotal.sql            | 22 ++++---
 sql/timeseries/bytesVideo.sql            | 22 ++++---
 sql/timeseries/canonical.sql             | 16 ++---
 sql/timeseries/dcl.sql                   | 22 ++++---
 sql/timeseries/fcp.sql                   | 21 ++++---
 sql/timeseries/fontDisplay.sql           | 17 ++---
 sql/timeseries/gzipSavings.sql           | 20 +++---
 sql/timeseries/h2.sql                    | 13 ++--
 sql/timeseries/h3.sql                    | 20 +++---
 sql/timeseries/hreflang.sql              | 17 ++---
 sql/timeseries/imgLazy.sql               | 15 +++--
 sql/timeseries/imgSavings.sql            | 21 ++++---
 sql/timeseries/legible.sql               | 16 ++---
 sql/timeseries/linkText.sql              | 16 ++---
 sql/timeseries/numUrls.sql               | 11 ++--
 sql/timeseries/offscreenImages.sql       | 21 ++++---
 sql/timeseries/ol.sql                    | 22 ++++---
 sql/timeseries/optimizedImages.sql       | 21 ++++---
 sql/timeseries/pctHttps.sql              |  8 +--
 sql/timeseries/pctVuln.sql               | 18 ------
 sql/timeseries/pwaScores.sql             | 41 ------------
 sql/timeseries/reqCss.sql                | 22 ++++---
 sql/timeseries/reqFont.sql               | 22 ++++---
 sql/timeseries/reqHtml.sql               | 22 ++++---
 sql/timeseries/reqImg.sql                | 22 ++++---
 sql/timeseries/reqJs.sql                 | 22 ++++---
 sql/timeseries/reqOther.sql              | 22 ++++---
 sql/timeseries/reqTotal.sql              | 22 ++++---
 sql/timeseries/reqVideo.sql              | 22 ++++---
 sql/timeseries/speedIndex.sql            | 21 ++++---
 sql/timeseries/tcp.sql                   | 22 ++++---
 sql/timeseries/ttci.sql                  |  2 +
 92 files changed, 786 insertions(+), 656 deletions(-)
 delete mode 100644 sql/histograms/vulnJs.sql
 delete mode 100644 sql/timeseries/pctVuln.sql
 delete mode 100644 sql/timeseries/pwaScores.sql

diff --git a/sql/addDate.js b/sql/addDate.js
index 540dd45..d2b4b63 100755
--- a/sql/addDate.js
+++ b/sql/addDate.js
@@ -13,8 +13,8 @@ const fs = require('fs');
 const date = process.argv[2];
 if (!date) {
-  console.error(`You must pass a YYYY_MM_DD-formatted date as input. For example:
-    sql/addDate.js 2017_09_01`);
+  console.error(`You must pass a YYYY-MM-DD-formatted date as input. For example:
+    sql/addDate.js 2017-09-01`);
   process.exit(1);
 }

diff --git a/sql/delete_date_from_reports.sh b/sql/delete_date_from_reports.sh
index 596dd9f..e36f4f4 100755
--- a/sql/delete_date_from_reports.sh
+++ b/sql/delete_date_from_reports.sh
@@ -4,9 +4,9 @@
 #
 # Usage:
 #
-#   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD
-#   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k
-#   $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k -r "*crux*"
+#   $ sql/delete_date_from_reports.sh -d YYYY-MM-DD
+#   $ sql/delete_date_from_reports.sh -d YYYY-MM-DD -l top1k
+#   $ sql/delete_date_from_reports.sh -d YYYY-MM-DD -l top1k -r "*crux*"
 #
 # Flags:
 #
@@ -26,7 +26,7 @@ NO_CHANGES=0
 while getopts ":nvd:l:r:" opt; do
   case "${opt}" in
     d)
-      YYYY_MM_DD=${OPTARG}
+      DATE=${OPTARG}
       ;;
     v)
       VERBOSE=1
       ;;
   esac
 done

-if [[ "${YYYY_MM_DD}" == "" ]]; then
-  echo "Usage $0 -d 2021_12_01"
+if [[ "${DATE}" == "" ]]; then
+  echo "Usage $0 -d 2021-12-01"
   exit 1
 fi

-echo "${YYYY_MM_DD}"
+echo "${DATE}"

 # Run all timeseries queries.
 for query in sql/timeseries/$REPORTS.sql; do
@@ -96,7 +96,7 @@ for query in sql/timeseries/$REPORTS.sql; do
     echo "${current_contents}\n"
   fi

-  new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${YYYY_MM_DD}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g')
+  new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${DATE}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g')

   if [ ${VERBOSE} -eq 1 ]; then
     echo "New JSON:"

diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh
index 6737b61..2e2fdec 100755
--- a/sql/generate_reports.sh
+++ b/sql/generate_reports.sh
@@ -4,7 +4,7 @@
 #
 # Usage:
 #
-#   $ sql/generateReports.sh -t -h YYYY_MM_DD
+#   $ sql/generateReports.sh -t -h YYYY-MM-DD
 #
 # Flags:
 #
@@ -37,6 +37,7 @@ while getopts ":ftvh:l:r:" opt; do
       YYYY_MM_DD=${OPTARG}
       dateParts=(`echo ${OPTARG} | tr "_" "\n"`)
       YYYYMM=${dateParts[0]}${dateParts[1]}
+      DATE=${dateParts[0]}-${dateParts[1]}-${dateParts[2]}
       ;;
     t)
       GENERATE_TIMESERIES=1
       ;;
   esac
 done

-# Exit early if there's nothing to do.
+# Exit early if there is nothing to do.
 if [ $GENERATE_HISTOGRAM -eq 0 -a $GENERATE_TIMESERIES -eq 0 ]; then
   echo -e "You must provide one or both -t or -h flags." >&2
   echo -e "For example: sql/generateReports.sh -t -h 2017_08_01" >&2
 fi

 # Check if all tables for the given date are available in BigQuery.
 # Tables representing desktop/mobile and HAR/CSV data sources must exist.
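+# Note: the legacy per-date tables could be checked with `bq show`, but the new
+# crawl.pages/crawl.requests tables are date-partitioned, so readiness is
+# probed with small LIMIT 1 queries per client / root-page / table combination.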
-(bq show "httparchive:pages.${YYYY_MM_DD}_desktop" && \
-  bq show "httparchive:pages.${YYYY_MM_DD}_mobile" && \
-  bq show "httparchive:summary_pages.${YYYY_MM_DD}_desktop" && \
-  bq show "httparchive:summary_pages.${YYYY_MM_DD}_mobile") &> /dev/null
-if [ $GENERATE_HISTOGRAM -ne 0 -a $? -ne 0 ]; then
-  echo -e "The BigQuery tables for $YYYY_MM_DD are not available." >&2
+DATED_TABLES_READY=0
+if [ -n "$YYYY_MM_DD" ]; then
+  DESKTOP_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
+  DESKTOP_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1)
+  MOBILE_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1)
+  MOBILE_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
+  DESKTOP_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1)
+  DESKTOP_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1)
+  MOBILE_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1)
+  MOBILE_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1)
+  if [[ "$DESKTOP_ROOT_PAGES_EXIST" == true && "$DESKTOP_NON_ROOT_PAGES_EXIST" == true && "$MOBILE_ROOT_PAGES_EXIST" == true && "$MOBILE_NON_ROOT_PAGES_EXIST" == true && "$DESKTOP_ROOT_REQUESTS_EXIST" == true && "$DESKTOP_NON_ROOT_REQUESTS_EXIST" == true && "$MOBILE_ROOT_REQUESTS_EXIST" == true && "$MOBILE_NON_ROOT_REQUESTS_EXIST" == true ]]; then
+    DATED_TABLES_READY=1
+  fi
+fi
+if [ $GENERATE_HISTOGRAM -ne 0 -a $DATED_TABLES_READY -ne 1 ]; then
+  echo -e "The BigQuery tables for $DATE are not available." >&2

   # List table data for debugging
   echo $(date)
-  bq show "httparchive:pages.${YYYY_MM_DD}_desktop" | head -5
-  bq show "httparchive:pages.${YYYY_MM_DD}_mobile" | head -5
-  bq show "httparchive:summary_pages.${YYYY_MM_DD}_desktop" | head -5
-  bq show "httparchive:summary_pages.${YYYY_MM_DD}_mobile" | head -5
+  echo "Desktop root pages ready: ${DESKTOP_ROOT_PAGES_EXIST}"
+  echo "Desktop non-root pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}"
+  echo "Mobile root pages ready: ${MOBILE_ROOT_PAGES_EXIST}"
+  echo "Mobile non-root pages ready: ${MOBILE_NON_ROOT_PAGES_EXIST}"
+  echo "Desktop root requests ready: ${DESKTOP_ROOT_REQUESTS_EXIST}"
+  echo "Desktop non-root requests ready: ${DESKTOP_NON_ROOT_REQUESTS_EXIST}"
+  echo "Mobile root requests ready: ${MOBILE_ROOT_REQUESTS_EXIST}"
+  echo "Mobile non-root requests ready: ${MOBILE_NON_ROOT_REQUESTS_EXIST}"

   exit 1
 fi

 if [ $GENERATE_HISTOGRAM -eq 0 ]; then
   echo -e "Skipping histograms"
 else
-  echo -e "Generating histograms for date $YYYY_MM_DD"
+  echo -e "Generating histograms for date $DATE"

   # Run all histogram queries.
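+  # $REPORTS is a shell glob: "*" matches every histogram query, while a
+  # filter such as -r "*crux*" restricts the run to a subset of reports.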
   for query in sql/histograms/$REPORTS.sql; do
@@ -145,17 +156,17 @@ else
       fi

       sql=$(sed -e "s/\(\`chrome-ux-report[^\`]*\`\)/\1 $lens_join/" $query \
-        | sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/g" \
+        | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \
         | sed -e "s/\${YYYYMM}/$YYYYMM/g")
       else
       sql=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query \
-        | sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/g" \
+        | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \
         | sed -e "s/\${YYYYMM}/$YYYYMM/g")
       fi
     else
       echo -e "Generating ${metric} report for base (no lens)"
-      sql=$(sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/" $query \
-        | sed -e "s/\${YYYYMM}/$YYYYMM/")
+      sql=$(sed -e "s/\${YYYY-MM-DD}/$DATE/g" $query \
+        | sed -e "s/\${YYYYMM}/$YYYYMM/g")
     fi

     if [ ${VERBOSE} -eq 1 ]; then
@@ -240,7 +251,7 @@ else
       if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses
         date_join="yyyymmdd > REPLACE(\"$max_date\",\"_\",\"\")"
         if [[ -n "$YYYY_MM_DD" ]]; then
-          # If a date is given, then only run up until then (in case next month is mid-run as don't wanna get just desktop data)
+          # If a date is given, then only run up until then (in case next month is mid-run, as we do not want to get just desktop data)
           date_join="${date_join} AND yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")"
         fi
       elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses
@@ -248,7 +259,7 @@ else
         # Skip 2022_05_12 tables
         date_join="${date_join} AND yyyymmdd != \"2022-05-12\""
         if [[ -n "$YYYY_MM_DD" ]]; then
-          # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data)
+          # If a date is given, then only run up until then (in case next month is mid run, as we do not want to get just desktop data)
           date_join="${date_join} AND yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)"
         fi
       elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
@@ -256,7 +267,7 @@ else
         # Skip 2022_05_12 tables
         date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\""
         if [[ -n "$YYYY_MM_DD" ]]; then
-          # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data)
+          # If a date is given, then only run up until then (in case next month is mid run, as we do not want to get just desktop data)
           date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\""
         fi
       fi
@@ -269,34 +280,34 @@ else
       fi
     elif [[ -n "$YYYY_MM_DD" ]]; then
-      # Even if doing a force run we only wanna run up until date given in case next month is mid-run as don't wanna get just desktop data
+      # Even if doing a force run we only want to run up until the date given, in case next month is mid-run, as we do not want to get just desktop data
       if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses
-        date_join="yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")"
+        date_join="yyyymmdd <= \"$DATE\""
      elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses
-        date_join="yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)"
+        date_join="yyyymmdd <= \"$DATE\""
         # Skip 2022_05_12 tables
         date_join="${date_join} AND yyyymmdd != \"2022-05-12\""
      elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
-        # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data)
-        date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\""
+        # If a date is given, then only run up until then (in case next month is mid run, as we do not want to get just desktop data)
+        date_join="date <= \"$DATE\""
         # Skip 2022_05_12 tables
-        date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\""
+        date_join="${date_join} AND date != \"2022-05-12\""
      fi

      echo -e "Force Mode=${FORCE}. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}."
     fi
   elif [[ -n "$YYYY_MM_DD" ]]; then
-    # Even if the file doesn't exist we only wanna run up until date given in case next month is mid-run as don't wanna get just desktop data
+    # Even if the file does not exist we only want to run up until the date given, in case next month is mid-run, as we do not want to get just desktop data
     if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses
-      date_join="yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")"
+      date_join="yyyymmdd <= \"$DATE\""
     elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses
-      date_join="yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)"
+      date_join="yyyymmdd <= \"$DATE\""
       # Skip 2022_05_12 tables
       date_join="${date_join} AND yyyymmdd != \"2022-05-12\""
     elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that
-      date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\""
+      date_join="date <= \"$DATE\""
       # Skip 2022_05_12 tables
-      date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\""
+      date_join="${date_join} AND date != \"2022-05-12\""
     fi

     echo -e "Timeseries does not exist. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}"
@@ -339,7 +350,7 @@ else
         sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND/" $query \
           | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
       else
-        # If WHERE clause doesn't exists then add it, before GROUP BY
+        # If WHERE clause does not exist then add it, before GROUP BY
         sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query \
           | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/")
       fi
@@ -354,7 +365,7 @@ else
         # If WHERE clause already exists then add to it, before GROUP BY
         sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND /" $query)
       else
-        # If WHERE clause doesn't exists then add it, before GROUP BY
+        # If WHERE clause does not exist then add it, before GROUP BY
         sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query)
       fi
     else
@@ -380,8 +391,8 @@ else
       echo "$metric took $ELAPSED_TIME seconds"
     fi

-    # If it's a partial run, then combine with the current results.
-    if [[ $FORCE -eq 0 && -n "${current_contents}" ]]; then
+    # If it is a partial run, then combine with the current results.
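+    # CrUX metrics are excluded from the merge: they are always regenerated in
+    # full above, so combining with previous results would duplicate entries.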
+    if [[ $FORCE -eq 0 && -n "${current_contents}" && $metric != crux* ]]; then
       result=$(echo ${result} ${current_contents} | jq '.+= input')
     fi

diff --git a/sql/histograms/bootupJs.sql b/sql/histograms/bootupJs.sql
index c289489..07cbd71 100644
--- a/sql/histograms/bootupJs.sql
+++ b/sql/histograms/bootupJs.sql
@@ -3,16 +3,15 @@ SELECT
   *,
   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
 FROM (
-  SELECT
-    *,
-    volume / SUM(volume) OVER (PARTITION BY client) AS pdf
-  FROM (
   SELECT
-    _TABLE_SUFFIX AS client,
+    client,
     COUNT(0) AS volume,
-    FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')) AS FLOAT64) / 100) / 10 AS bin
+    FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin
   FROM
-    `httparchive.lighthouse.${YYYY_MM_DD}_*`
+    `httparchive.crawl.pages`
+  WHERE
+    date = '${YYYY-MM-DD}' AND
+    is_root_page
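+    /* The legacy tables only contained root pages, so filtering on
+       is_root_page keeps results comparable under the new data model. */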
   GROUP BY
     bin,
     client

diff --git a/sql/histograms/bytesCss.sql b/sql/histograms/bytesCss.sql
index c2a3bbb..97c16bc 100644
--- a/sql/histograms/bytesCss.sql
+++ b/sql/histograms/bytesCss.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesFont.sql b/sql/histograms/bytesFont.sql
index 495b687..2b7548e 100644
--- a/sql/histograms/bytesFont.sql
+++ b/sql/histograms/bytesFont.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesFont / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesHtml.sql b/sql/histograms/bytesHtml.sql
index 430d80a..0be97a9 100644
--- a/sql/histograms/bytesHtml.sql
+++ b/sql/histograms/bytesHtml.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesHtml / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesImg.sql b/sql/histograms/bytesImg.sql
index b232d54..7aa5b7d 100644
--- a/sql/histograms/bytesImg.sql
+++ b/sql/histograms/bytesImg.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesImg / 102400) * 100 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesJs.sql b/sql/histograms/bytesJs.sql
index 911756b..6d2662f 100644
--- a/sql/histograms/bytesJs.sql
+++ b/sql/histograms/bytesJs.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesJS / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesOther.sql b/sql/histograms/bytesOther.sql
index 9ddfca0..8ceff61 100644
--- a/sql/histograms/bytesOther.sql
+++ b/sql/histograms/bytesOther.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesOther / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesTotal.sql b/sql/histograms/bytesTotal.sql
index 8d5407d..86ebb02 100644
--- a/sql/histograms/bytesTotal.sql
+++ b/sql/histograms/bytesTotal.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesTotal / 102400) * 100 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/bytesVideo.sql b/sql/histograms/bytesVideo.sql
index 39b8958..af3b9d3 100644
--- a/sql/histograms/bytesVideo.sql
+++ b/sql/histograms/bytesVideo.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(bytesVideo / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/compileJs.sql b/sql/histograms/compileJs.sql
index 6aeb46c..eddc382 100644
--- a/sql/histograms/compileJs.sql
+++ b/sql/histograms/compileJs.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(JSON_EXTRACT(payload, "$['_cpu.v8.compile']") AS INT64) AS bin
+      INT64(payload['_cpu.v8.compile']) AS bin
     FROM
-      `httparchive.pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/dcl.sql b/sql/histograms/dcl.sql
index 120ccff..4e587ee 100644
--- a/sql/histograms/dcl.sql
+++ b/sql/histograms/dcl.sql
@@ -8,13 +8,15 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      FLOOR(onContentLoaded / 1000) AS bin
+      FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
     WHERE
-      onContentLoaded > 0
+      date = '${YYYY-MM-DD}' AND
+      is_root_page AND
+      FLOAT64(summary.onContentLoaded) > 0
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/evalJs.sql b/sql/histograms/evalJs.sql
index 4d4e0eb..00ac184 100644
--- a/sql/histograms/evalJs.sql
+++ b/sql/histograms/evalJs.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(CAST(JSON_EXTRACT(payload, "$['_cpu.EvaluateScript']") AS FLOAT64) / 20 AS INT64) * 20 AS bin
+      CAST(FLOAT64(payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
     FROM
-      `httparchive.requests.${YYYY_MM_DD}_*`
+      `httparchive.crawl.requests`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/fcp.sql b/sql/histograms/fcp.sql
index 0c3b380..015426d 100644
--- a/sql/histograms/fcp.sql
+++ b/sql/histograms/fcp.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64) / 1000) AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin
     FROM
-      `httparchive.pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/gzipSavings.sql b/sql/histograms/gzipSavings.sql
index c239b10..81b173e 100644
--- a/sql/histograms/gzipSavings.sql
+++ b/sql/histograms/gzipSavings.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64) / (1024 * 2)) * 2 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin
     FROM
-      `httparchive.pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/htmlElementPopularity.sql b/sql/histograms/htmlElementPopularity.sql
index 773dc40..4e6618c 100644
--- a/sql/histograms/htmlElementPopularity.sql
+++ b/sql/histograms/htmlElementPopularity.sql
@@ -18,7 +18,7 @@ SELECT
   COUNT(DISTINCT root_page) / total AS pct,
   ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls
 FROM
-  `httparchive.all.pages`
+  `httparchive.crawl.pages`
 JOIN
 (
   SELECT
@@ -26,18 +26,17 @@ JOIN
     client,
     COUNT(DISTINCT root_page) AS total
   FROM
-    `httparchive.all.pages`
+    `httparchive.crawl.pages`
   WHERE
-    date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND
-    rank = 1000
+    date = '${YYYY-MM-DD}'
   GROUP BY
     date,
     client
 )
 USING
   (date, client),
-  UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element
+  UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element
 WHERE
-  date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}')
+  date = '${YYYY-MM-DD}'
 GROUP BY
   client,
   total,

diff --git a/sql/histograms/imgSavings.sql b/sql/histograms/imgSavings.sql
index 70023c3..df93fe4 100644
--- a/sql/histograms/imgSavings.sql
+++ b/sql/histograms/imgSavings.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64) / (1024 * 10)) * 10 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(payload._image_savings) / (1024 * 10)) * 10 AS INT64) AS bin
     FROM
-      `httparchive.pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/offscreenImages.sql b/sql/histograms/offscreenImages.sql
index 8f571a6..f77e2a0 100644
--- a/sql/histograms/offscreenImages.sql
+++ b/sql/histograms/offscreenImages.sql
@@ -8,11 +8,15 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin
+      CAST(FLOOR(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024) / 10240) * 10 AS INT64) AS bin
     FROM
-      `httparchive.lighthouse.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date >= '2022-03-01' AND
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/ol.sql b/sql/histograms/ol.sql
index 4aa6390..825943a 100644
--- a/sql/histograms/ol.sql
+++ b/sql/histograms/ol.sql
@@ -8,13 +8,15 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      FLOOR(onLoad / 1000) AS bin
+      FLOOR(FLOAT64(summary.onLoad) / 1000) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
     WHERE
-      onLoad > 0
+      date = '${YYYY-MM-DD}' AND
+      is_root_page AND
+      FLOAT64(summary.onLoad) > 0
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/optimizedImages.sql b/sql/histograms/optimizedImages.sql
index c9b5f9b..3b79333 100644
--- a/sql/histograms/optimizedImages.sql
+++ b/sql/histograms/optimizedImages.sql
@@ -6,18 +6,25 @@ FROM (
   SELECT
     *,
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
-  FROM (
-    SELECT
-      _TABLE_SUFFIX AS client,
-      COUNT(0) AS volume,
-      CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin
-    FROM
-      `httparchive.lighthouse.${YYYY_MM_DD}_*`
-    GROUP BY
-      bin,
-      client
-    HAVING
-      bin IS NOT NULL
+  FROM (
+    SELECT
+      client,
+      COUNT(0) AS volume,
+      CAST(FLOOR(IFNULL(
+        INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes),
+        INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb)
+        * 1024) / 10240) * 10 AS INT64) AS bin
+    FROM
+      `httparchive.crawl.pages`
+    WHERE
+      date >= '2022-03-01' AND
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
+    GROUP BY
+      bin,
+      client
+    HAVING
+      bin IS NOT NULL
   )
 )
 ORDER BY

diff --git a/sql/histograms/reqCss.sql b/sql/histograms/reqCss.sql
index 8bc6cdc..861fe81 100644
--- a/sql/histograms/reqCss.sql
+++ b/sql/histograms/reqCss.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqCSS AS bin
+      FLOAT64(summary.reqCss) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client
diff --git a/sql/histograms/reqFont.sql b/sql/histograms/reqFont.sql
index d414b22..145e2b6 100644
--- a/sql/histograms/reqFont.sql
+++ b/sql/histograms/reqFont.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqFont AS bin
+      FLOAT64(summary.reqFont) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqHtml.sql b/sql/histograms/reqHtml.sql
index d6631e3..f18d596 100644
--- a/sql/histograms/reqHtml.sql
+++ b/sql/histograms/reqHtml.sql
@@ -10,9 +10,12 @@ FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqHtml AS bin
+      FLOAT64(summary.reqHtml) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqImg.sql b/sql/histograms/reqImg.sql
index 9d075bc..a5fc334 100644
--- a/sql/histograms/reqImg.sql
+++ b/sql/histograms/reqImg.sql
@@ -10,9 +10,12 @@ FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqImg AS bin
+      FLOAT64(summary.reqImg) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqJs.sql b/sql/histograms/reqJs.sql
index 503dbaf..8dfc12e 100644
--- a/sql/histograms/reqJs.sql
+++ b/sql/histograms/reqJs.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqJS AS bin
+      FLOAT64(summary.reqJS) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqOther.sql b/sql/histograms/reqOther.sql
index ee54ef9..caa9944 100644
--- a/sql/histograms/reqOther.sql
+++ b/sql/histograms/reqOther.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqOther AS bin
+      FLOAT64(summary.reqOther) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqTotal.sql b/sql/histograms/reqTotal.sql
index 5c712a1..df0987f 100644
--- a/sql/histograms/reqTotal.sql
+++ b/sql/histograms/reqTotal.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      FLOOR(reqTotal / 10) * 10 AS bin
+      FLOOR(FLOAT64(summary.reqTotal) / 10) * 10 AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/reqVideo.sql b/sql/histograms/reqVideo.sql
index 74bf7b6..3f83c74 100644
--- a/sql/histograms/reqVideo.sql
+++ b/sql/histograms/reqVideo.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      reqVideo AS bin
+      FLOAT64(summary.reqVideo) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client
diff --git a/sql/histograms/speedIndex.sql b/sql/histograms/speedIndex.sql
index 8bb993a..6740961 100644
--- a/sql/histograms/speedIndex.sql
+++ b/sql/histograms/speedIndex.sql
@@ -8,11 +8,14 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64) / (1000)) * 1000 AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(payload._SpeedIndex) / (1000)) * 1000 AS INT64) AS bin
     FROM
-      `httparchive.pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/tcp.sql b/sql/histograms/tcp.sql
index aeb80bf..ea80501 100644
--- a/sql/histograms/tcp.sql
+++ b/sql/histograms/tcp.sql
@@ -8,13 +8,15 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      _connections AS bin
+      INT64(summary._connections) AS bin
     FROM
-      `httparchive.summary_pages.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
     WHERE
-      _connections > 0
+      date = '${YYYY-MM-DD}' AND
+      is_root_page AND
+      INT64(summary._connections) > 0
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/ttci.sql b/sql/histograms/ttci.sql
index db62d3a..c69d0f2 100644
--- a/sql/histograms/ttci.sql
+++ b/sql/histograms/ttci.sql
@@ -8,11 +8,15 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      _TABLE_SUFFIX AS client,
+      client,
       COUNT(0) AS volume,
-      CAST(FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.interactive.numericValue'), IFNULL(JSON_EXTRACT(report, '$.audits.consistently-interactive.rawValue'), JSON_EXTRACT(report, '$.audits.interactive.rawValue'))) AS FLOAT64) / 1000) AS INT64) AS bin
+      CAST(FLOOR(FLOAT64(IFNULL(lighthouse.audits.interactive.numericValue, IFNULL(lighthouse.audits['consistently-interactive'].rawValue, lighthouse.audits.interactive.rawValue))) / 1000) AS INT64) AS bin
     FROM
-      `httparchive.lighthouse.${YYYY_MM_DD}_*`
+      `httparchive.crawl.pages`
+    WHERE
+      date >= '2022-03-01' AND
+      date = '${YYYY-MM-DD}' AND
+      is_root_page
     GROUP BY
       bin,
       client

diff --git a/sql/histograms/vulnJs.sql b/sql/histograms/vulnJs.sql
deleted file mode 100644
index 87f3f93..0000000
--- a/sql/histograms/vulnJs.sql
+++ /dev/null
@@ -1,39 +0,0 @@
-#standardSQL
-CREATE TEMPORARY FUNCTION countVulnerabilities(report STRING)
-RETURNS INT64 LANGUAGE js AS """
-  try {
-    const $ = JSON.parse(report);
-    const audit = $.audits['no-vulnerable-libraries'];
-    if (audit.extendedInfo && audit.extendedInfo.vulnerabilities) {
-      return audit.extendedInfo.vulnerabilities.length;
-    }
-    return +audit.displayValue.match(/\\d+/)[0];
-  } catch (e) {
-    return 0;
-  }
-""";
-
-SELECT
-  *,
-  SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
-FROM (
-  SELECT
-    *,
-    volume / SUM(volume) OVER (PARTITION BY client) AS pdf
-  FROM (
-    SELECT
-      _TABLE_SUFFIX AS client,
-      COUNT(0) AS volume,
-      countVulnerabilities(report) AS bin
-    FROM
-      `httparchive.lighthouse.${YYYY_MM_DD}_*`
-    WHERE
-      report IS NOT NULL
-    GROUP BY
-      bin,
-      client
-  )
-)
-ORDER BY
-  bin,
-  client

diff --git a/sql/lens/drupal/crux_histograms.sql b/sql/lens/drupal/crux_histograms.sql
index 03c404e..682012e 100644
--- a/sql/lens/drupal/crux_histograms.sql
+++ b/sql/lens/drupal/crux_histograms.sql
@@ -1,15 +1,15 @@
-JOIN
+INNER JOIN
 (
   SELECT
-    url,
-    _TABLE_SUFFIX AS _TABLE_SUFFIX
+    page,
+    client
   FROM
-    `httparchive.technologies.${YYYY_MM_DD}_*`
+    `httparchive.crawl.pages`
   WHERE
-    app = 'Drupal' and
-    LENGTH(url) > 0
+    date = '${YYYY-MM-DD}' AND
+    'Drupal' IN (SELECT technology FROM UNNEST(technologies))
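+    /* technologies is an ARRAY of STRUCTs, so membership is tested with a
+       subquery over UNNEST rather than direct field access. */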
   GROUP BY
     1,
     2
 )
-ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone'))
+ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))

diff --git a/sql/lens/drupal/histograms.sql b/sql/lens/drupal/histograms.sql
index f113339..a539e99 100644
--- a/sql/lens/drupal/histograms.sql
+++ b/sql/lens/drupal/histograms.sql
@@ -1,10 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.technologies.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
-  app = 'Drupal'
+  date = '${YYYY-MM-DD}' AND
+  'Drupal' IN (SELECT technology FROM UNNEST(technologies))
 GROUP BY
   1,
   2

diff --git a/sql/lens/magento/crux_histograms.sql b/sql/lens/magento/crux_histograms.sql
index b8197cd..eac905d 100644
--- a/sql/lens/magento/crux_histograms.sql
+++ b/sql/lens/magento/crux_histograms.sql
@@ -1,15 +1,15 @@
 INNER JOIN
 (
   SELECT
-    url,
-    _TABLE_SUFFIX AS _TABLE_SUFFIX
+    page,
+    client
   FROM
-    `httparchive.technologies.${YYYY_MM_DD}_*`
+    `httparchive.crawl.pages`
   WHERE
-    app = 'Magento' and
-    LENGTH(url) > 0
+    date = '${YYYY-MM-DD}' AND
+    'Magento' IN (SELECT technology FROM UNNEST(technologies))
   GROUP BY
     1,
     2
 )
-ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone'))
+ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))

diff --git a/sql/lens/magento/histograms.sql b/sql/lens/magento/histograms.sql
index 3035144..7ee271f 100644
--- a/sql/lens/magento/histograms.sql
+++ b/sql/lens/magento/histograms.sql
@@ -1,10 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.technologies.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
-  app = 'Magento'
+  date = '${YYYY-MM-DD}' AND
+  'Magento' IN (SELECT technology FROM UNNEST(technologies))
 GROUP BY
   1,
   2

diff --git a/sql/lens/top100k/histograms.sql b/sql/lens/top100k/histograms.sql
index 80e62de..4915598 100644
--- a/sql/lens/top100k/histograms.sql
+++ b/sql/lens/top100k/histograms.sql
@@ -1,9 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.summary_pages.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
+  date = '${YYYY-MM-DD}' AND
+  is_root_page AND
   rank <= 100000
 GROUP BY
   1,

diff --git a/sql/lens/top10k/histograms.sql b/sql/lens/top10k/histograms.sql
index 2157a3f..33bc834 100644
--- a/sql/lens/top10k/histograms.sql
+++ b/sql/lens/top10k/histograms.sql
@@ -1,9 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.summary_pages.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
+  date = '${YYYY-MM-DD}' AND
+  is_root_page AND
   rank <= 10000
 GROUP BY
   1,

diff --git a/sql/lens/top10k/timeseries.sql b/sql/lens/top10k/timeseries.sql
index 6ddf43a..f8d40b1 100644
--- a/sql/lens/top10k/timeseries.sql
+++ b/sql/lens/top10k/timeseries.sql
@@ -1,11 +1,14 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client,
+  date
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
+  date >= '2021-05-01' AND
   rank <= 10000 AND
-  _TABLE_SUFFIX >= '2021_05_01'
+  is_root_page
 GROUP BY
   1,
-  2
+  2,
+  3

diff --git a/sql/lens/top1k/histograms.sql b/sql/lens/top1k/histograms.sql
index 6bb184f..6d35ac2 100644
--- a/sql/lens/top1k/histograms.sql
+++ b/sql/lens/top1k/histograms.sql
@@ -1,9 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.summary_pages.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
+  date = '${YYYY-MM-DD}' AND
+  is_root_page AND
   rank <= 1000
 GROUP BY
   1,

diff --git a/sql/lens/top1k/timeseries.sql b/sql/lens/top1k/timeseries.sql
index 61c16c2..ff749ae 100644
--- a/sql/lens/top1k/timeseries.sql
+++ b/sql/lens/top1k/timeseries.sql
@@ -1,11 +1,14 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client,
+  date
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
+  date >= '2021-05-01' AND
   rank <= 1000 AND
-  _TABLE_SUFFIX >= '2021_05_01'
+  is_root_page
 GROUP BY
   1,
-  2
+  2,
+  3

diff --git a/sql/lens/top1m/histograms.sql b/sql/lens/top1m/histograms.sql
index e36cd2a..ce09f42 100644
--- a/sql/lens/top1m/histograms.sql
+++ b/sql/lens/top1m/histograms.sql
@@ -1,9 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.summary_pages.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
+  date = '${YYYY-MM-DD}' AND
+  is_root_page AND
   rank <= 1000000
 GROUP BY
   1,

diff --git a/sql/lens/top1m/timeseries.sql b/sql/lens/top1m/timeseries.sql
index bb59934..3acd7cf 100644
--- a/sql/lens/top1m/timeseries.sql
+++ b/sql/lens/top1m/timeseries.sql
@@ -1,11 +1,14 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client,
+  date
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
+  date >= '2021-05-01' AND
   rank <= 1000000 AND
-  _TABLE_SUFFIX >= '2021_05_01'
+  is_root_page
 GROUP BY
   1,
-  2
+  2,
+  3

diff --git a/sql/lens/wordpress/crux_histograms.sql b/sql/lens/wordpress/crux_histograms.sql
index 5784cde..6b416cc 100644
--- a/sql/lens/wordpress/crux_histograms.sql
+++ b/sql/lens/wordpress/crux_histograms.sql
@@ -1,15 +1,15 @@
 INNER JOIN
 (
   SELECT
-    url,
-    _TABLE_SUFFIX AS _TABLE_SUFFIX
+    page,
+    client
   FROM
-    `httparchive.technologies.${YYYY_MM_DD}_*`
+    `httparchive.crawl.pages`
   WHERE
-    app = 'WordPress' and
-    LENGTH(url) > 0
+    date = '${YYYY-MM-DD}' AND
+    'WordPress' IN (SELECT technology FROM UNNEST(technologies))
   GROUP BY
     1,
     2
 )
-ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone'))
+ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone'))

diff --git a/sql/lens/wordpress/histograms.sql b/sql/lens/wordpress/histograms.sql
index c225c98..1b92037 100644
--- a/sql/lens/wordpress/histograms.sql
+++ b/sql/lens/wordpress/histograms.sql
@@ -1,10 +1,11 @@
 SELECT
-  url,
-  _TABLE_SUFFIX AS _TABLE_SUFFIX
+  page,
+  client
 FROM
-  `httparchive.technologies.${YYYY_MM_DD}_*`
+  `httparchive.crawl.pages`
 WHERE
-  app = 'WordPress'
+  date = '${YYYY-MM-DD}' AND
+  'WordPress' IN (SELECT technology FROM UNNEST(technologies))
 GROUP BY
   1,
   2

diff --git a/sql/timeseries/a11yButtonName.sql b/sql/timeseries/a11yButtonName.sql
index 55eddc5..eb0b653 100644
--- a/sql/timeseries/a11yButtonName.sql
+++ b/sql/timeseries/a11yButtonName.sql
@@ -1,13 +1,18 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.button-name.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
+  FORMAT_DATE('%Y_%m_%d', date) AS date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 FROM
-  `httparchive.lighthouse.*`
+  `httparchive.crawl.pages`
 WHERE
-  report IS NOT NULL
+  /* Should really use the following to only include eligible sites. */
+  /* LAX_STRING(lighthouse.audits['button-name'].score) IS NOT NULL AND */
+  lighthouse IS NOT NULL AND
+  TO_JSON_STRING(lighthouse) != '{}' AND
+  is_root_page AND
+  date >= '2017-06-01'
 GROUP BY
   date,
   timestamp,
diff --git a/sql/timeseries/a11yColorContrast.sql b/sql/timeseries/a11yColorContrast.sql
index e865687..132d9ff 100644
--- a/sql/timeseries/a11yColorContrast.sql
+++ b/sql/timeseries/a11yColorContrast.sql
@@ -1,13 +1,18 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.color-contrast.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
+  FORMAT_DATE('%Y_%m_%d', date) AS date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(SUM(IF(LAX_STRING(lighthouse.audits['color-contrast'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 FROM
-  `httparchive.lighthouse.*`
+  `httparchive.crawl.pages`
 WHERE
-  report IS NOT NULL
+  /* Should really use the following to only include eligible sites. */
+  /* LAX_STRING(lighthouse.audits['color-contrast'].score) IS NOT NULL AND */
+  lighthouse IS NOT NULL AND
+  TO_JSON_STRING(lighthouse) != '{}' AND
+  date >= '2017-06-01' AND
+  is_root_page
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/a11yImageAlt.sql b/sql/timeseries/a11yImageAlt.sql
index 28ec635..fdf5bff 100644
--- a/sql/timeseries/a11yImageAlt.sql
+++ b/sql/timeseries/a11yImageAlt.sql
@@ -1,13 +1,18 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.image-alt.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
+  FORMAT_DATE('%Y_%m_%d', date) AS date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 FROM
-  `httparchive.lighthouse.*`
+  `httparchive.crawl.pages`
 WHERE
-  report IS NOT NULL
+  /* Should really use the following to only include eligible sites. */
+  /* LAX_STRING(lighthouse.audits['image-alt'].score) IS NOT NULL AND */
+  lighthouse IS NOT NULL AND
+  TO_JSON_STRING(lighthouse) != '{}' AND
+  date >= '2017-06-01' AND
+  is_root_page
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/a11yLabel.sql b/sql/timeseries/a11yLabel.sql
index 867f03f..19557c4 100644
--- a/sql/timeseries/a11yLabel.sql
+++ b/sql/timeseries/a11yLabel.sql
@@ -1,13 +1,18 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.label.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
+  FORMAT_DATE('%Y_%m_%d', date) AS date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 FROM
-  `httparchive.lighthouse.*`
+  `httparchive.crawl.pages`
 WHERE
-  report IS NOT NULL
+  /* Should really use the following to only include eligible sites. */
+  /* LAX_STRING(lighthouse.audits.label.score) IS NOT NULL AND */
+  lighthouse IS NOT NULL AND
+  TO_JSON_STRING(lighthouse) != '{}' AND
+  date >= '2017-06-01' AND
+  is_root_page
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/a11yLinkName.sql b/sql/timeseries/a11yLinkName.sql
index 0416853..7063087 100644
--- a/sql/timeseries/a11yLinkName.sql
+++ b/sql/timeseries/a11yLinkName.sql
@@ -1,13 +1,18 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.link-name.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
+  FORMAT_DATE('%Y_%m_%d', date) AS date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
 FROM
-  `httparchive.lighthouse.*`
+  `httparchive.crawl.pages`
 WHERE
-  report IS NOT NULL
+  /* Should really use the following to only include eligible sites. */
+  /* LAX_STRING(lighthouse.audits['link-name'].score) IS NOT NULL AND */
+  lighthouse IS NOT NULL AND
+  TO_JSON_STRING(lighthouse) != '{}' AND
+  date >= '2017-06-01' AND
+  is_root_page
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/a11yScores.sql b/sql/timeseries/a11yScores.sql
index 03a8f21..6090cb1 100644
--- a/sql/timeseries/a11yScores.sql
+++ b/sql/timeseries/a11yScores.sql
@@ -1,11 +1,10 @@
 #standardSQL
 # Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF
-CREATE TEMPORARY FUNCTION getA11yScore(reportCategories STRING)
+CREATE TEMPORARY FUNCTION getA11yScore(reportCategories JSON)
 RETURNS FLOAT64 DETERMINISTIC
 LANGUAGE js AS """
-  $=JSON.parse(reportCategories);
-  if($) {
-    return $.find(i => i.name === 'Accessibility').score;
+  if(reportCategories) {
+    return reportCategories.find(i => i.name === 'Accessibility').score;
   }
 """;

@@ -20,13 +19,16 @@ SELECT
   ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90
 FROM (
   SELECT
-    SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-    IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-    IFNULL(CAST(JSON_EXTRACT(report, '$.categories.accessibility.score') AS FLOAT64) * 100, getA11yScore(JSON_EXTRACT(report, '$.reportCategories'))) AS score
+    FORMAT_DATE('%Y_%m_%d', date) AS date,
+    client,
+    IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score
   FROM
-    `httparchive.lighthouse.*`
+    `httparchive.crawl.pages`
   WHERE
-    report IS NOT NULL
+    lighthouse IS NOT NULL AND
+    TO_JSON_STRING(lighthouse) != '{}' AND
+    date >= '2017-06-01' AND
+    is_root_page
 )
 GROUP BY
   date,
diff --git a/sql/timeseries/bootupJs.sql b/sql/timeseries/bootupJs.sql
index 5efd34c..ea98557 100644
--- a/sql/timeseries/bootupJs.sql
+++ b/sql/timeseries/bootupJs.sql
@@ -1,8 +1,8 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
+  date,
+  timestamp,
+  client,
   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10,
   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25,
   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50,
@@ -10,13 +10,20 @@ SELECT
   ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90
 FROM (
   SELECT
-    _TABLE_SUFFIX AS _TABLE_SUFFIX,
-    CAST(IFNULL(
-      JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'),
-      JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')
-    ) AS FLOAT64) / 1000 AS value
+    FORMAT_DATE('%Y_%m_%d', date) AS date,
+    UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+    client,
+    IFNULL(
+      FLOAT64(lighthouse.audits['bootup-time'].numericValue),
+      FLOAT64(lighthouse.audits['bootup-time'].rawValue)
+    ) / 1000 AS value
   FROM
-    `httparchive.lighthouse.*`
+    `httparchive.crawl.pages`
+  WHERE
+    lighthouse IS NOT NULL AND
+    TO_JSON_STRING(lighthouse) != '{}' AND
+    date >= '2017-06-01' AND
+    is_root_page
 )
 GROUP BY
   date,

diff --git a/sql/timeseries/bytesCss.sql b/sql/timeseries/bytesCss.sql
index 9bbdfd8..dc7007b 100644
--- a/sql/timeseries/bytesCss.sql
+++ b/sql/timeseries/bytesCss.sql
@@ -1,17 +1,19 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(101)] / 1024, 2) AS p10,
-  ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(251)] / 1024, 2) AS p25,
-  ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(501)] / 1024, 2) AS p50,
-  ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(751)] / 1024, 2) AS p75,
-  ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(901)] / 1024, 2) AS p90
+  date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(251)] / 1024, 2) AS p25,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(501)] / 1024, 2) AS p50,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(751)] / 1024, 2) AS p75,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(901)] / 1024, 2) AS p90
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
-  bytesCSS > 0
+  date >= '2010-11-15' AND
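+  /* 2010-11-15 is the first HTTP Archive crawl, so this covers the full
+     history of the dataset. */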
+  is_root_page AND
+  FLOAT64(summary.bytesCss) > 0
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/bytesFont.sql b/sql/timeseries/bytesFont.sql
index fa2c89a..df40db1 100644
--- a/sql/timeseries/bytesFont.sql
+++ b/sql/timeseries/bytesFont.sql
@@ -1,17 +1,19 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(101)] / 1024, 2) AS p10,
-  ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(251)] / 1024, 2) AS p25,
-  ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(501)] / 1024, 2) AS p50,
-  ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(751)] / 1024, 2) AS p75,
-  ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(901)] / 1024, 2) AS p90
+  date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(101)] / 1024, 2) AS p10,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(251)] / 1024, 2) AS p25,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(501)] / 1024, 2) AS p50,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(751)] / 1024, 2) AS p75,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(901)] / 1024, 2) AS p90
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
-  bytesFont > 0
+  date >= '2010-11-15' AND
+  is_root_page AND
+  FLOAT64(summary.bytesFont) > 0
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/bytesHtml.sql b/sql/timeseries/bytesHtml.sql
index 005a5d3..6f93049 100644
--- a/sql/timeseries/bytesHtml.sql
+++ b/sql/timeseries/bytesHtml.sql
@@ -1,17 +1,19 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(101)] / 1024, 2) AS p10,
-  ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(251)] / 1024, 2) AS p25,
-  ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(501)] / 1024, 2) AS p50,
-  ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(751)] / 1024, 2) AS p75,
-  ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(901)] / 1024, 2) AS p90
+  date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(251)] / 1024, 2) AS p25,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(501)] / 1024, 2) AS p50,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(751)] / 1024, 2) AS p75,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(901)] / 1024, 2) AS p90
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
-  bytesHtml > 0
+  date >= '2010-11-15' AND
+  is_root_page AND
+  FLOAT64(summary.bytesHtml) > 0
 GROUP BY
   date,
   timestamp,

diff --git a/sql/timeseries/bytesImg.sql b/sql/timeseries/bytesImg.sql
index ed83cc0..f4e09e4 100644
--- a/sql/timeseries/bytesImg.sql
+++ b/sql/timeseries/bytesImg.sql
@@ -1,17 +1,19 @@
 #standardSQL
 SELECT
-  SUBSTR(_TABLE_SUFFIX, 0, 10) AS date,
-  UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp,
-  IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client,
-  ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(101)] / 1024, 2) AS p10,
-  ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(251)] / 1024, 2) AS p25,
-  ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(501)] / 1024, 2) AS p50,
-  ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(751)] / 1024, 2) AS p75,
-  ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(901)] / 1024, 2) AS p90
+  date,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+  client,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(251)] / 1024, 2) AS p25,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(501)] / 1024, 2) AS p50,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(751)] / 1024, 2) AS p75,
+  ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(901)] / 1024, 2) AS p90
 FROM
-  `httparchive.summary_pages.*`
+  `httparchive.crawl.pages`
 WHERE
-  bytesImg > 0
+  date >= '2010-11-15' AND
+  is_root_page AND
+  FLOAT64(summary.bytesImg) > 0
 GROUP BY
   date,
   timestamp,
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(901)] / 1024, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesJS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesJS) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesOther.sql b/sql/timeseries/bytesOther.sql index d92c01f..ac9fdb6 100644 --- a/sql/timeseries/bytesOther.sql +++ b/sql/timeseries/bytesOther.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(901)] / 1024, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesOther > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesOther) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesTotal.sql b/sql/timeseries/bytesTotal.sql index ceec253..c6e9da8 100644 --- a/sql/timeseries/bytesTotal.sql +++ b/sql/timeseries/bytesTotal.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(901)] / 1024, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + 
client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesTotal > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesTotal) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesVideo.sql b/sql/timeseries/bytesVideo.sql index 0e78ffb..fddc697 100644 --- a/sql/timeseries/bytesVideo.sql +++ b/sql/timeseries/bytesVideo.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(901)] / 1024, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesVideo > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesVideo) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/canonical.sql b/sql/timeseries/canonical.sql index 2ca07ad..3f2ae72 100644 --- a/sql/timeseries/canonical.sql +++ b/sql/timeseries/canonical.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.canonical.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.canonical.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.canonical.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/dcl.sql b/sql/timeseries/dcl.sql index 74cb5c9..b949605 100644 --- a/sql/timeseries/dcl.sql +++ b/sql/timeseries/dcl.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 
0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(751)] / 1000, 2) AS p75, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(901)] / 1000, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - onContentLoaded > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.onContentLoaded) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/fcp.sql b/sql/timeseries/fcp.sql index 38a2321..ee9a719 100644 --- a/sql/timeseries/fcp.sql +++ b/sql/timeseries/fcp.sql @@ -1,17 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` WHERE - _TABLE_SUFFIX >= '2016_12_15' + date >= '2016-12-15' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/fontDisplay.sql b/sql/timeseries/fontDisplay.sql index 28dee3b..4f1bb67 100644 --- a/sql/timeseries/fontDisplay.sql +++ b/sql/timeseries/fontDisplay.sql @@ -1,14 +1,17 @@ 
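-- A note on the score test used in these Lighthouse-based timeseries: Lighthouse reported audit
-- scores as booleans before v3.0.0 (released July 2018) and as 0-1 floats afterwards, so
-- normalising the JSON score via LAX_STRING and matching both 'true' and '1' counts a passing
-- audit in either era. A minimal standalone sketch of the pattern (the date literal and client
-- are illustrative only, not part of the report query):
--
--   SELECT ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent
--   FROM `httparchive.crawl.pages`
--   WHERE date = '2024-06-01' AND client = 'mobile' AND is_root_page AND lighthouse IS NOT NULL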
#standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.font-display.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.font-display.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page AND + LAX_STRING(lighthouse.audits['font-display'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/gzipSavings.sql b/sql/timeseries/gzipSavings.sql index f5b44fb..0bd63ac 100644 --- a/sql/timeseries/gzipSavings.sql +++ b/sql/timeseries/gzipSavings.sql @@ -1,15 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/h2.sql b/sql/timeseries/h2.sql index 26d4da1..9b73dd6 100644 --- a/sql/timeseries/h2.sql +++ b/sql/timeseries/h2.sql @@ -1,11 +1,14 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(protocol = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - (SELECT page AS url, JSON_EXTRACT_SCALAR(payload, '$._protocol') AS protocol, _TABLE_SUFFIX AS _TABLE_SUFFIX FROM `httparchive.requests.*`) 
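-- The old h2 query had to parse the negotiated protocol out of each request's JSON payload
-- ($._protocol); the new datamodel exposes it directly as summary.respHttpVersion, as used in the
-- percent expression above. A rough sketch for sanity-checking the values that column takes
-- (the date is an example only):
--
--   SELECT LAX_STRING(summary.respHttpVersion) AS protocol, COUNT(0) AS requests
--   FROM `httparchive.crawl.requests`
--   WHERE date = '2024-06-01' AND is_root_page
--   GROUP BY protocol
--   ORDER BY requests DESC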
+ `httparchive.crawl.requests` +WHERE + is_root_page AND + date >= '2016-07-15' GROUP BY date, timestamp, diff --git a/sql/timeseries/h3.sql b/sql/timeseries/h3.sql index 4eec8a4..af5ea4d 100644 --- a/sql/timeseries/h3.sql +++ b/sql/timeseries/h3.sql @@ -11,24 +11,26 @@ # when HTTP/3 is approved so we include that as it is HTTP/3 in all but name. # SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, ROUND( SUM( IF( - respHttpVersion IN ('HTTP/3', 'h3', 'h3-29') OR - reqHttpVersion IN ('HTTP/3', 'h3', 'h3-29') OR - REGEXP_EXTRACT(REGEXP_EXTRACT(respOtherHeaders, r'alt-svc = (.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR - REGEXP_EXTRACT(REGEXP_EXTRACT(respOtherHeaders, r'alt-svc = (.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', + LAX_STRING(summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', 1, 0 ) ) * 100 / COUNT(0), 2 ) AS percent FROM - `httparchive.summary_requests.*` + `httparchive.crawl.requests` +LEFT OUTER JOIN + UNNEST (response_headers) AS resp ON (resp.name = 'alt-svc') WHERE - SUBSTR(_TABLE_SUFFIX, 0, 10) >= '2020_01_01' + date >= '2020-01-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/hreflang.sql b/sql/timeseries/hreflang.sql index f034f87..ae4b1ed 100644 --- a/sql/timeseries/hreflang.sql +++ b/sql/timeseries/hreflang.sql @@ -1,14 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.hreflang.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.hreflang.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.hreflang.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page AND + LAX_STRING(lighthouse.audits.hreflang.score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/imgLazy.sql b/sql/timeseries/imgLazy.sql index d3a0d2a..5498477 100644 --- a/sql/timeseries/imgLazy.sql +++ b/sql/timeseries/imgLazy.sql @@ -1,13 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(COUNT(DISTINCT IF(LOWER(attr) = '"lazy"', url, NULL)) * 100 / COUNT(DISTINCT url), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(COUNT(DISTINCT IF(LOWER(LAX_STRING(attr)) = 'lazy', page, NULL)) * 100 / COUNT(DISTINCT page), 2) AS percent FROM - `httparchive.pages.*` + 
`httparchive.crawl.pages` LEFT JOIN - UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, "$['_img-loading-attr']"), '$')) AS attr + UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.other['img-loading-attr'])) AS attr +WHERE + is_root_page AND + date > '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/imgSavings.sql b/sql/timeseries/imgSavings.sql index 3c3839c..99c3788 100644 --- a/sql/timeseries/imgSavings.sql +++ b/sql/timeseries/imgSavings.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/legible.sql b/sql/timeseries/legible.sql index b3821a2..dd29a47 100644 --- a/sql/timeseries/legible.sql +++ b/sql/timeseries/legible.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.font-size.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-size'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.font-size.score') IS NOT NULL + lighthouse IS NOT NULL AND + date >= '2017-12-15' AND + is_root_page AND + LAX_STRING(lighthouse.audits['font-size'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/linkText.sql b/sql/timeseries/linkText.sql index 429112e..d7f2114 100644 --- a/sql/timeseries/linkText.sql +++ b/sql/timeseries/linkText.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 
'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.link-text.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-text'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.link-text.score') IS NOT NULL + lighthouse IS NOT NULL AND + date >= '2017-11-15' AND + is_root_page AND + LAX_STRING(lighthouse.audits['link-text'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/numUrls.sql b/sql/timeseries/numUrls.sql index e5c5b0b..272265b 100644 --- a/sql/timeseries/numUrls.sql +++ b/sql/timeseries/numUrls.sql @@ -1,11 +1,14 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, COUNT(0) AS urls FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/offscreenImages.sql b/sql/timeseries/offscreenImages.sql index 72b351d..7594029 100644 --- a/sql/timeseries/offscreenImages.sql +++ b/sql/timeseries/offscreenImages.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + 
ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2017-06-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/ol.sql b/sql/timeseries/ol.sql index cba8d6a..2c4c3ce 100644 --- a/sql/timeseries/ol.sql +++ b/sql/timeseries/ol.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(751)] / 1000, 2) AS p75, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(901)] / 1000, 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - onLoad > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.onLoad) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/optimizedImages.sql b/sql/timeseries/optimizedImages.sql index 4b65792..aee6ae8 100644 --- a/sql/timeseries/optimizedImages.sql +++ b/sql/timeseries/optimizedImages.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, - 
ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` +WHERE + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/pctHttps.sql b/sql/timeseries/pctHttps.sql index bc6c9a2..828ea3b 100644 --- a/sql/timeseries/pctHttps.sql +++ b/sql/timeseries/pctHttps.sql @@ -1,11 +1,11 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, ROUND(SUM(IF(STARTS_WITH(request, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - (SELECT url AS request, page AS url, _TABLE_SUFFIX AS _TABLE_SUFFIX FROM `httparchive.requests.*`) + (SELECT url AS request, page AS url, client, date FROM `httparchive.crawl.requests` WHERE is_root_page AND date >= '2016-01-01') GROUP BY date, timestamp, diff --git a/sql/timeseries/pctVuln.sql b/sql/timeseries/pctVuln.sql deleted file mode 100644 index 9c1c230..0000000 --- a/sql/timeseries/pctVuln.sql +++ /dev/null @@ -1,18 +0,0 @@ -#standardSQL -SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 
'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.no-vulnerable-libraries.score') IN ('false', '0'), 1, 0)) * 100 / COUNT(0), 2) AS percent -FROM - `httparchive.lighthouse.*` -WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.no-vulnerable-libraries.score') IS NOT NULL -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client diff --git a/sql/timeseries/pwaScores.sql b/sql/timeseries/pwaScores.sql deleted file mode 100644 index 0247410..0000000 --- a/sql/timeseries/pwaScores.sql +++ /dev/null @@ -1,41 +0,0 @@ -#standardSQL -# Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF -CREATE TEMPORARY FUNCTION getPWAScore(reportCategories STRING) -RETURNS FLOAT64 DETERMINISTIC -LANGUAGE js AS """ - $=JSON.parse(reportCategories); - if ($) { - return $.find(i => i.name === 'Progressive Web App').score; - } -"""; - -SELECT - date, - UNIX_DATE(CAST(REPLACE(date, '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - client, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(500)], 2) AS p50, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(750)], 2) AS p75, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 -FROM ( - SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - IFNULL(CAST(JSON_EXTRACT(report, '$.categories.pwa.score') AS FLOAT64) * 100, getPWAScore(JSON_EXTRACT(report, '$.reportCategories'))) AS score - FROM - `httparchive.lighthouse.*` - WHERE - report IS NOT NULL AND - ( - JSON_EXTRACT(report, '$.audits.service-worker.score') = 'true' OR - JSON_EXTRACT(report, '$.audits.service-worker.score') = '1' - ) -) -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client; diff --git a/sql/timeseries/reqCss.sql b/sql/timeseries/reqCss.sql index 4e3d7ad..37604fe 100644 --- a/sql/timeseries/reqCss.sql +++ b/sql/timeseries/reqCss.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqCSS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqCss) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqFont.sql b/sql/timeseries/reqFont.sql index 7c5cf56..1e76455 100644 --- a/sql/timeseries/reqFont.sql +++ b/sql/timeseries/reqFont.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - 
UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqFont > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqFont) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqHtml.sql b/sql/timeseries/reqHtml.sql index d7faea8..b173863 100644 --- a/sql/timeseries/reqHtml.sql +++ b/sql/timeseries/reqHtml.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqHtml > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqHtml) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqImg.sql b/sql/timeseries/reqImg.sql index dac48ff..3a27087 100644 --- a/sql/timeseries/reqImg.sql +++ b/sql/timeseries/reqImg.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 
1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqImg > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqImg) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqJs.sql b/sql/timeseries/reqJs.sql index 3a60a6a..1738800 100644 --- a/sql/timeseries/reqJs.sql +++ b/sql/timeseries/reqJs.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqJS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqJS) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqOther.sql b/sql/timeseries/reqOther.sql index 891811d..fa10869 100644 --- a/sql/timeseries/reqOther.sql +++ b/sql/timeseries/reqOther.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqOther > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqOther) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqTotal.sql b/sql/timeseries/reqTotal.sql index 0f337f8..f132ca0 100644 --- a/sql/timeseries/reqTotal.sql +++ b/sql/timeseries/reqTotal.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - 
UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqTotal > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqTotal) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqVideo.sql b/sql/timeseries/reqVideo.sql index eb46b24..a1144e5 100644 --- a/sql/timeseries/reqVideo.sql +++ b/sql/timeseries/reqVideo.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(901)], 2) AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqVideo > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqVideo) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/speedIndex.sql b/sql/timeseries/speedIndex.sql index 199d651..a6902df 100644 --- a/sql/timeseries/speedIndex.sql +++ b/sql/timeseries/speedIndex.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(751)] / 1000, 2) AS p75, - 
ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(901)] / 1000, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/tcp.sql b/sql/timeseries/tcp.sql index 65f165c..db01bdf 100644 --- a/sql/timeseries/tcp.sql +++ b/sql/timeseries/tcp.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - APPROX_QUANTILES(_connections, 1001)[OFFSET(101)] AS p10, - APPROX_QUANTILES(_connections, 1001)[OFFSET(251)] AS p25, - APPROX_QUANTILES(_connections, 1001)[OFFSET(501)] AS p50, - APPROX_QUANTILES(_connections, 1001)[OFFSET(751)] AS p75, - APPROX_QUANTILES(_connections, 1001)[OFFSET(901)] AS p90 + date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(101)] AS p10, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(251)] AS p25, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(501)] AS p50, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(751)] AS p75, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(901)] AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - _connections > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary._connections) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/ttci.sql b/sql/timeseries/ttci.sql index 67fa289..4052400 100644 --- a/sql/timeseries/ttci.sql +++ b/sql/timeseries/ttci.sql @@ -20,6 +20,8 @@ FROM ( ) AS FLOAT64) / 1000 AS value FROM `httparchive.lighthouse.*` + WHERE + _TABLE_SUFFIX < '2022_03_01' ) GROUP BY date, From 54405817f91d4d6b8172cd6438d56984d028d689 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 12:25:04 +0000 Subject: [PATCH 02/25] Fix histograms --- sql/generate_reports.sh | 10 ++++++++-- sql/histograms/bootupJs.sql | 4 ++++ sql/histograms/cruxShopifyThemes.sql | 12 ++++++------ sql/histograms/offscreenImages.sql | 5 ++++- sql/histograms/reqHtml.sql | 2 +- sql/histograms/reqImg.sql | 2 +- sql/histograms/ttci.sql | 8 +++++++- 7 files changed, 31 insertions(+), 12 deletions(-) diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index 2e2fdec..c308e30 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -37,7 +37,7 @@ while getopts ":ftvh:l:r:" opt; do YYYY_MM_DD=${OPTARG} dateParts=(`echo ${OPTARG} | tr "_" "\\n"`) YYYYMM=${dateParts[0]}${dateParts[1]} - DATE=${dateParts[0]}-${dateParts[1]-${dateParts[2]} + DATE=${dateParts[0]}-${dateParts[1]}-${dateParts[2]} ;; t) GENERATE_TIMESERIES=1 @@ -68,6 +68,7 @@ fi # Tables representing desktop/mobile and HAR/CSV data sources must exist. 
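# Each readiness probe below is a cheap existence check against the new crawl dataset: one
# LIMIT 1 query per client (desktop/mobile) and page type (root/non-root), capturing the literal
# "true" on success. For example (the date here is illustrative only):
#
#   bq query --nouse_legacy_sql --format csv --headless -q \
#     "SELECT true FROM httparchive.crawl.pages WHERE date = '2025-01-01' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1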
DATED_TABLES_READY=0 if [ -n "$YYYY_MM_DD" ]; then + echo "Checking if tables are ready for ${DATE}..." DESKTOP_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1) DESKTOP_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) MOBILE_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) @@ -76,6 +77,7 @@ if [ -n "$YYYY_MM_DD" ]; then DESKTOP_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) MOBILE_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) MOBILE_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1) + echo "Finished checking if tables are ready" if [[ "$DESKTOP_ROOT_PAGES_EXIST" == true && "$DESKTOP_NON_ROOT_PAGES_EXIST" == true && "$MOBILE_ROOT_PAGES_EXIST" == true && "$MOBILE_NON_ROOT_PAGES_EXIST" == true && "$DESKTOP_ROOT_REQUESTS_EXIST" == true && "$DESKTOP_NON_ROOT_REQUESTS_EXIST" == true && "$MOBILE_ROOT_REQUESTS_EXIST" == true && "$MOBILE_NON_ROOT_REQUESTS_EXIST" == true ]]; then DATED_TABLES_READY=1 fi @@ -86,9 +88,13 @@ if [ $GENERATE_HISTOGRAM -ne 0 -a $DATED_TABLES_READY -ne 1 ]; then # List table data for debugging echo $(date) echo "Desktop root pages ready: ${DESKTOP_ROOT_PAGES_EXIST}" - echo "Desktop non-oot pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}" + echo "Desktop non-root pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}" echo "Mobile root pages ready: ${MOBILE_ROOT_PAGES_EXIST}" echo "Mobile non-root pages ready: ${MOBILE_NON_ROOT_PAGES_EXIST}" + echo "Desktop root requests ready: ${DESKTOP_ROOT_REQUESTS_EXIST}" + echo "Desktop non-root requests ready: ${DESKTOP_NON_ROOT_REQUESTS_EXIST}" + echo "Mobile root requests ready: ${MOBILE_ROOT_REQUESTS_EXIST}" + echo "Mobile non-root requests ready: ${MOBILE_NON_ROOT_REQUESTS_EXIST}" exit 1 fi diff --git a/sql/histograms/bootupJs.sql b/sql/histograms/bootupJs.sql index 07cbd71..e44601b 100644 --- a/sql/histograms/bootupJs.sql +++ b/sql/histograms/bootupJs.sql @@ -3,6 +3,10 @@ SELECT *, SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf FROM ( + SELECT + *, + volume / SUM(volume) OVER (PARTITION BY client) AS pdf + FROM ( SELECT client, COUNT(0) AS volume, diff --git a/sql/histograms/cruxShopifyThemes.sql b/sql/histograms/cruxShopifyThemes.sql index 850a05b..e932b33 100644 --- a/sql/histograms/cruxShopifyThemes.sql +++ b/sql/histograms/cruxShopifyThemes.sql @@ -17,14 +17,14 @@ WITH archive_pages AS ( SELECT client, page AS url, - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') AS theme_name, - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.theme_store_id') AS theme_store_id + JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name, +
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id FROM - `httparchive.all.pages` + `httparchive.crawl.pages` WHERE - date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND + date = '${YYYY-MM-DD}' AND is_root_page AND - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') IS NOT NULL --first grab all shops for market share + JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share ) SELECT @@ -176,7 +176,7 @@ JOIN ( -- Include null theme store ids so that we can get full market share within CrUX ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A') WHERE - date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND + date = '${YYYY-MM-DD}' AND theme_names.rank = 1 GROUP BY client, diff --git a/sql/histograms/offscreenImages.sql b/sql/histograms/offscreenImages.sql index f77e2a0..bd1a33f 100644 --- a/sql/histograms/offscreenImages.sql +++ b/sql/histograms/offscreenImages.sql @@ -10,7 +10,10 @@ FROM ( SELECT client, COUNT(0) AS volume, - CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(lighthouse, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(lighthouse, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) + * 1024) / 10240) * 10 AS INT64) AS bin FROM `httparchive.crawl.pages` WHERE diff --git a/sql/histograms/reqHtml.sql b/sql/histograms/reqHtml.sql index f18d596..82dd5a2 100644 --- a/sql/histograms/reqHtml.sql +++ b/sql/histograms/reqHtml.sql @@ -8,7 +8,7 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, FLOAT64(summary.reqHtml) AS bin FROM diff --git a/sql/histograms/reqImg.sql b/sql/histograms/reqImg.sql index a5fc334..5409c83 100644 --- a/sql/histograms/reqImg.sql +++ b/sql/histograms/reqImg.sql @@ -8,7 +8,7 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, FLOAT64(summary.reqImg) AS bin FROM diff --git a/sql/histograms/ttci.sql b/sql/histograms/ttci.sql index c69d0f2..7ba6de1 100644 --- a/sql/histograms/ttci.sql +++ b/sql/histograms/ttci.sql @@ -10,7 +10,13 @@ FROM ( SELECT client, COUNT(0) AS volume, - CAST(FLOOR(CAST(IFNULL(JSON_EXTRACT(lighthouse, '$.audits.interactive.numericValue'), IFNULL(JSON_EXTRACT(lighthouse, '$.audits.consistently-interactive.rawValue'), JSON_EXTRACT(lighthouse, '$.audits.interactive.rawValue'))) AS FLOAT64) / 1000) AS INT64) AS bin + CAST(FLOOR(CAST(IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), + IFNULL( + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue), + FLOAT64(lighthouse.audits.interactive.rawValue) + ) + ) AS FLOAT64) / 1000) AS INT64) AS bin FROM `httparchive.crawl.pages` WHERE From 51f981bd582a09cedb0f7689bac0662e6e7f5250 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 12:50:08 +0000 Subject: [PATCH 03/25] Handle lenses --- sql/generate_reports.sh | 10 +++++----- sql/histograms/cruxShopifyThemes.sql | 10 +++++++++- sql/lens/drupal/timeseries.sql | 16 +++++++++++----- sql/lens/magento/timeseries.sql | 16 +++++++++++----- sql/lens/top100k/timeseries.sql | 17 +++++++++++------ sql/lens/top10k/timeseries.sql | 12 +++++++----- sql/lens/top1k/timeseries.sql | 12 
+++++++----- sql/lens/top1m/timeseries.sql | 12 +++++++----- sql/lens/wordpress/timeseries.sql | 16 +++++++++++----- 9 files changed, 79 insertions(+), 42 deletions(-) diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index c308e30..9038f3b 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -151,7 +151,7 @@ else # Replace the date template in the query. if [[ $LENS != "" ]]; then echo -e "Generating ${metric} report for $LENS" - lens_join="JOIN ($(cat sql/lens/$LENS/histograms.sql | tr '\n' ' ')) USING (url, _TABLE_SUFFIX)" + lens_join="JOIN ($(cat sql/lens/$LENS/histograms.sql | tr '\n' ' ')) USING (page, client)" if [[ $metric == crux* ]]; then if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then echo "Using alternative crux lens join" @@ -269,12 +269,12 @@ else date_join="${date_join} AND yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)" fi elif [[ $metric != crux* ]]; then # CrUX is quick and join is more complicated so just do a full run of that - date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) > \"$max_date\"" + date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" # Skip 2022_05_12 tables - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\"" + date_join="${date_join} AND date != \"2022-05-12\"" if [[ -n "$YYYY_MM_DD" ]]; then # If a date is given, then only run up until then (in case next month is mid-run, as we do not want to get just desktop data) - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\"" + date_join="${date_join} AND date <= \"$DATE\"" fi fi @@ -311,7 +311,7 @@ else # Skip 2022_05_12 tables date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" elif [[ $metric != crux* ]]; then # CrUX is quick and join is more complicated so just do a full run of that - date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\"" + date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables date_join="${date_join} AND date != \"2022-05-12\"" fi diff --git a/sql/histograms/cruxShopifyThemes.sql b/sql/histograms/cruxShopifyThemes.sql index e932b33..e70f391 100644 --- a/sql/histograms/cruxShopifyThemes.sql +++ b/sql/histograms/cruxShopifyThemes.sql @@ -12,8 +12,16 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F good + needs_improvement + poor > 0 ); +-- Test CrUX data exists +WITH crux_test AS ( + SELECT + 1 + FROM + `chrome-ux-report.all.${YYYYMM}` +), + -- All Shopify shops in HTTPArchive -WITH archive_pages AS ( +archive_pages AS ( SELECT client, page AS url, diff --git a/sql/lens/drupal/timeseries.sql b/sql/lens/drupal/timeseries.sql index eb2b2df..7999feb 100644 --- a/sql/lens/drupal/timeseries.sql +++ b/sql/lens/drupal/timeseries.sql @@ -1,10 +1,16 @@ SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client, + date, + is_root_page FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE - app = 'Drupal' + date >= '2010-11-15' AND + is_root_page AND + 'Drupal' IN UNNEST(technologies.technology) GROUP BY 1, - 2 + 2, + 3, + 4 diff --git a/sql/lens/magento/timeseries.sql b/sql/lens/magento/timeseries.sql index 43c135a..cba87db 100644 --- a/sql/lens/magento/timeseries.sql +++ b/sql/lens/magento/timeseries.sql @@ -1,10 +1,16 @@ SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client, + date, + is_root_page FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE - app = 'Magento' + date >= '2010-11-15' AND + is_root_page AND + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, - 2 + 2, + 3, + 4 diff --git
a/sql/lens/top100k/timeseries.sql b/sql/lens/top100k/timeseries.sql index d7db042..a87de37 100644 --- a/sql/lens/top100k/timeseries.sql +++ b/sql/lens/top100k/timeseries.sql @@ -1,11 +1,16 @@ SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client, + date, + is_root_page FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - rank <= 100000 AND - _TABLE_SUFFIX >= '2021_05_01' + date >= '2021-05-01' AND + is_root_page AND + rank <= 1000000 GROUP BY 1, - 2 + 2, + 3, + 4 diff --git a/sql/lens/top10k/timeseries.sql b/sql/lens/top10k/timeseries.sql index f8d40b1..b6e8cf0 100644 --- a/sql/lens/top10k/timeseries.sql +++ b/sql/lens/top10k/timeseries.sql @@ -1,14 +1,16 @@ SELECT - url, + page, client, - date + date, + is_root_page FROM `httparchive.crawl.pages` WHERE date >= '2021-05-01' AND - rank <= 10000 AND - is_root_page + is_root_page AND + rank <= 10000 GROUP BY 1, 2, - 3 + 3, + 4 diff --git a/sql/lens/top1k/timeseries.sql b/sql/lens/top1k/timeseries.sql index ff749ae..29240aa 100644 --- a/sql/lens/top1k/timeseries.sql +++ b/sql/lens/top1k/timeseries.sql @@ -1,14 +1,16 @@ SELECT - url, + page, client, - date + date, + is_root_page FROM `httparchive.crawl.pages` WHERE date >= '2021-05-01' AND - rank <= 1000 AND - is_root_page + is_root_page AND + rank <= 1000 GROUP BY 1, 2, - 3 + 3, + 4 diff --git a/sql/lens/top1m/timeseries.sql b/sql/lens/top1m/timeseries.sql index 3acd7cf..a87de37 100644 --- a/sql/lens/top1m/timeseries.sql +++ b/sql/lens/top1m/timeseries.sql @@ -1,14 +1,16 @@ SELECT - url, + page, client, - date + date, + is_root_page FROM `httparchive.crawl.pages` WHERE date >= '2021-05-01' AND - rank <= 1000000 AND - is_root_page + is_root_page AND + rank <= 1000000 GROUP BY 1, 2, - 3 + 3, + 4 diff --git a/sql/lens/wordpress/timeseries.sql b/sql/lens/wordpress/timeseries.sql index 9c618f0..415281e 100644 --- a/sql/lens/wordpress/timeseries.sql +++ b/sql/lens/wordpress/timeseries.sql @@ -1,10 +1,16 @@ SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client, + date, + is_root_page FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE - app = 'WordPress' + date >= '2010-11-15' AND + is_root_page AND + 'WordPress' in UNNEST(technologies.technology) GROUP BY 1, - 2 + 2, + 3, + 4 From 89fe7a60f66adb6004202aef17c1eeefccb0383e Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 14:22:02 +0000 Subject: [PATCH 04/25] Rethink lenses for timeseries --- sql/generate_reports.sh | 25 ++++++++++++++++------ sql/lens/drupal/blink_timeseries.sql | 28 ++++++++++++++++--------- sql/lens/drupal/crux_timeseries.sql | 15 +++++++++++++ sql/lens/drupal/timeseries.sql | 17 +-------------- sql/lens/magento/blink_timeseries.sql | 28 ++++++++++++++++--------- sql/lens/magento/crux_timeseries.sql | 15 +++++++++++++ sql/lens/magento/timeseries.sql | 17 +-------------- sql/lens/top100k/crux_timeseries.sql | 15 +++++++++++++ sql/lens/top100k/timeseries.sql | 17 +-------------- sql/lens/top10k/crux_timeseries.sql | 15 +++++++++++++ sql/lens/top10k/timeseries.sql | 17 +-------------- sql/lens/top1k/crux_timeseries.sql | 15 +++++++++++++ sql/lens/top1k/timeseries.sql | 17 +-------------- sql/lens/top1m/crux_timeseries.sql | 15 +++++++++++++ sql/lens/top1m/timeseries.sql | 17 +-------------- sql/lens/wordpress/blink_timeseries.sql | 28 ++++++++++++++++--------- sql/lens/wordpress/crux_timeseries.sql | 15 +++++++++++++ sql/lens/wordpress/timeseries.sql | 17 +-------------- sql/timeseries/h2.sql | 7 +++++-- sql/timeseries/h3.sql | 7 +++++-- 
sql/timeseries/pctHttps.sql | 10 +++++++-- 21 files changed, 203 insertions(+), 154 deletions(-) create mode 100644 sql/lens/drupal/crux_timeseries.sql create mode 100644 sql/lens/magento/crux_timeseries.sql create mode 100644 sql/lens/top100k/crux_timeseries.sql create mode 100644 sql/lens/top10k/crux_timeseries.sql create mode 100644 sql/lens/top1k/crux_timeseries.sql create mode 100644 sql/lens/top1m/crux_timeseries.sql create mode 100644 sql/lens/wordpress/crux_timeseries.sql diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index 9038f3b..a8c7685 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -344,24 +344,37 @@ else fi else - lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) USING (url, _TABLE_SUFFIX)" - if [[ $metric == crux* ]]; then + if [[ $metric != crux* ]]; then + lens_clause="$(cat sql/lens/$LENS/timeseries.sql)" + lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND" + lens_join="" + else echo "CrUX query so using alternative lens join" - lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) ON (origin || '\/' = url AND REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\\\\\\\d{4})(\\\\\\\\d{2})', '\\\\\\\\1_\\\\\\\\2_01') || '_' || IF(device = 'phone', 'mobile', device) = _TABLE_SUFFIX)" + lens_clause="" + lens_clause_and="" + lens_join="JOIN ($(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')) USING (origin, date, device)" fi if [[ -n "${date_join}" ]]; then if [[ $(grep -i "WHERE" $query) ]]; then # If WHERE clause already exists then add to it, before GROUP BY - sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND/" $query \ + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else # If WHERE clause does not exists then add it, before GROUP BY - sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query \ + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \ | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") fi else - sql=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query) + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it, before GROUP BY + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") + else + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") + fi fi fi diff --git a/sql/lens/drupal/blink_timeseries.sql b/sql/lens/drupal/blink_timeseries.sql index 03b5d5d..090f47b 100644 --- a/sql/lens/drupal/blink_timeseries.sql +++ b/sql/lens/drupal/blink_timeseries.sql @@ -12,17 +12,21 @@ FROM JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2016-11-15' AND app = 'Drupal' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) -ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) +ON (url = page AND yyyymmdd = date AND client = page_client) JOIN ( SELECT yyyymmdd, @@ -32,17 +36,21 @@ JOIN ( JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2017-01-01' AND app = 'Drupal' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) - ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', 
yyyymmdd) || '_' || client) + ON (url = page AND yyyymmdd = date AND client = page_client) WHERE 1 = 1 {{ BLINK_DATE_JOIN }} diff --git a/sql/lens/drupal/crux_timeseries.sql b/sql/lens/drupal/crux_timeseries.sql new file mode 100644 index 0000000..f182b63 --- /dev/null +++ b/sql/lens/drupal/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Drupal' in UNNEST(technologies.technology) +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/drupal/timeseries.sql b/sql/lens/drupal/timeseries.sql index 7999feb..73c2fe5 100644 --- a/sql/lens/drupal/timeseries.sql +++ b/sql/lens/drupal/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - 'Drupal' in UNNEST(technologies.technology) -GROUP BY - 1, - 2, - 3, - 4 +'Drupal' in UNNEST(technologies.technology) diff --git a/sql/lens/magento/blink_timeseries.sql b/sql/lens/magento/blink_timeseries.sql index aeb8742..27aa662 100644 --- a/sql/lens/magento/blink_timeseries.sql +++ b/sql/lens/magento/blink_timeseries.sql @@ -12,17 +12,21 @@ FROM JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2016-11-15' AND app = 'Magento' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) -ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) +ON (url = page AND yyyymmdd = date AND client = page_client) JOIN ( SELECT yyyymmdd, client, @@ -32,17 +36,21 @@ JOIN ( JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2017-01-01' AND app = 'Magento' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) - ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) + ON (url = page AND yyyymmdd = date AND client = page_client) WHERE 1 = 1 {{ BLINK_DATE_JOIN }} diff --git a/sql/lens/magento/crux_timeseries.sql b/sql/lens/magento/crux_timeseries.sql new file mode 100644 index 0000000..342c617 --- /dev/null +++ b/sql/lens/magento/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Magento' in UNNEST(technologies.technology) +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/magento/timeseries.sql b/sql/lens/magento/timeseries.sql index cba87db..4823997 100644 --- a/sql/lens/magento/timeseries.sql +++ b/sql/lens/magento/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - 'Magento' in UNNEST(technologies.technology) -GROUP BY - 1, - 2, - 3, - 4 +'Magento' in UNNEST(technologies.technology) diff --git a/sql/lens/top100k/crux_timeseries.sql b/sql/lens/top100k/crux_timeseries.sql new file mode 100644 index 0000000..f105392 --- /dev/null +++ b/sql/lens/top100k/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + 
date >= '2010-11-15' AND + is_root_page AND + rank <= 100000 +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/top100k/timeseries.sql b/sql/lens/top100k/timeseries.sql index a87de37..da9eeaa 100644 --- a/sql/lens/top100k/timeseries.sql +++ b/sql/lens/top100k/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2021-05-01' AND - is_root_page AND - rank <= 1000000 -GROUP BY - 1, - 2, - 3, - 4 +rank <= 100000 diff --git a/sql/lens/top10k/crux_timeseries.sql b/sql/lens/top10k/crux_timeseries.sql new file mode 100644 index 0000000..8dc3d12 --- /dev/null +++ b/sql/lens/top10k/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + rank <= 10000 +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/top10k/timeseries.sql b/sql/lens/top10k/timeseries.sql index b6e8cf0..57dbc02 100644 --- a/sql/lens/top10k/timeseries.sql +++ b/sql/lens/top10k/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2021-05-01' AND - is_root_page AND - rank <= 10000 -GROUP BY - 1, - 2, - 3, - 4 +rank <= 10000 diff --git a/sql/lens/top1k/crux_timeseries.sql b/sql/lens/top1k/crux_timeseries.sql new file mode 100644 index 0000000..a833fe6 --- /dev/null +++ b/sql/lens/top1k/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + rank <= 1000 +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/top1k/timeseries.sql b/sql/lens/top1k/timeseries.sql index 29240aa..75ca1c8 100644 --- a/sql/lens/top1k/timeseries.sql +++ b/sql/lens/top1k/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2021-05-01' AND - is_root_page AND - rank <= 1000 -GROUP BY - 1, - 2, - 3, - 4 +rank <= 1000 diff --git a/sql/lens/top1m/crux_timeseries.sql b/sql/lens/top1m/crux_timeseries.sql new file mode 100644 index 0000000..e480ce7 --- /dev/null +++ b/sql/lens/top1m/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + rank <= 1000000 +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/top1m/timeseries.sql b/sql/lens/top1m/timeseries.sql index a87de37..57dbc02 100644 --- a/sql/lens/top1m/timeseries.sql +++ b/sql/lens/top1m/timeseries.sql @@ -1,16 +1 @@ -SELECT - page, - client, - date, - is_root_page -FROM - `httparchive.crawl.pages` -WHERE - date >= '2021-05-01' AND - is_root_page AND - rank <= 1000000 -GROUP BY - 1, - 2, - 3, - 4 +rank <= 1000000 diff --git a/sql/lens/wordpress/blink_timeseries.sql b/sql/lens/wordpress/blink_timeseries.sql index caf9e78..80fa331 100644 --- a/sql/lens/wordpress/blink_timeseries.sql +++ b/sql/lens/wordpress/blink_timeseries.sql @@ -12,17 +12,21 @@ FROM JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2016-11-15' AND app = 'WordPress' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) -ON (url = tech_url AND 
_TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) +ON (url = page AND yyyymmdd = date AND client = page_client) JOIN ( SELECT yyyymmdd, client, @@ -32,17 +36,21 @@ JOIN ( JOIN ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, + page, + client AS page_client, + date, url AS tech_url FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE + date >= '2017-01-01' AND app = 'WordPress' GROUP BY - _TABLE_SUFFIX, - tech_url + client, + date, + page ) - ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) + ON (url = page AND yyyymmdd = date AND client = page_client) WHERE 1 = 1 {{ BLINK_DATE_JOIN }} diff --git a/sql/lens/wordpress/crux_timeseries.sql b/sql/lens/wordpress/crux_timeseries.sql new file mode 100644 index 0000000..4a9683e --- /dev/null +++ b/sql/lens/wordpress/crux_timeseries.sql @@ -0,0 +1,14 @@ +SELECT + SUBSTR(page, 0, LENGTH(page) -1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date +FROM + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page AND + 'WordPress' in UNNEST(technologies.technology) +GROUP BY + 1, + 2, + 3 diff --git a/sql/lens/wordpress/timeseries.sql b/sql/lens/wordpress/timeseries.sql index 9c618f0..415281e 100644 --- a/sql/lens/wordpress/timeseries.sql +++ b/sql/lens/wordpress/timeseries.sql @@ -1,10 +1,16 @@ SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client, + date, + is_root_page FROM - `httparchive.technologies.*` + `httparchive.crawl.pages` WHERE - app = 'WordPress' + date >= '2010-11-15' AND + is_root_page AND + 'WordPress' in UNNEST(technologies.technology) GROUP BY 1, - 2 + 2, + 3, + 4 diff --git a/sql/timeseries/h2.sql b/sql/timeseries/h2.sql index 9b73dd6..626be80 100644 --- a/sql/timeseries/h2.sql +++ b/sql/timeseries/h2.sql @@ -3,9 +3,12 @@ SELECT FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - ROUND(SUM(IF(LAX_STRING(summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent + ROUND(SUM(IF(LAX_STRING(r.summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.crawl.requests` + `httparchive.crawl.requests` r +INNER JOIN + `httparchive.crawl.pages` p +USING (date, client, is_root_page, rank) WHERE is_root_page AND date >= '2016-07-15' diff --git a/sql/timeseries/h3.sql b/sql/timeseries/h3.sql index af5ea4d..f020482 100644 --- a/sql/timeseries/h3.sql +++ b/sql/timeseries/h3.sql @@ -17,7 +17,7 @@ SELECT ROUND( SUM( IF( - LAX_STRING(summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR + LAX_STRING(r.summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', 1, 0 @@ -25,9 +25,12 @@ SELECT ) * 100 / COUNT(0), 2 ) AS percent FROM - `httparchive.crawl.requests` + `httparchive.crawl.requests` r LEFT OUTER JOIN UNNEST (response_headers) AS resp ON (resp.name = 'alt-svc') +INNER JOIN + `httparchive.crawl.pages` p +USING (date, client, is_root_page, rank) WHERE date >= '2020-01-01' AND is_root_page diff --git a/sql/timeseries/pctHttps.sql b/sql/timeseries/pctHttps.sql index 828ea3b..623f180 100644 --- a/sql/timeseries/pctHttps.sql +++ b/sql/timeseries/pctHttps.sql @@ -3,9 +3,15 @@ SELECT FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - ROUND(SUM(IF(STARTS_WITH(request, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent + 
ROUND(SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - (SELECT url AS request, page AS url, client, date FROM `httparchive.crawl.requests` WHERE is_root_page AND date >= '2016-01-01') + `httparchive.crawl.requests` +INNER JOIN + `httparchive.crawl.pages` p +USING (date, client, is_root_page, rank) +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, From cd0d1c7e207a4e4a57e4e980c8832f51b334b0ce Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 16:26:35 +0000 Subject: [PATCH 05/25] Fix two reports --- sql/timeseries/a11yImageAlt.sql | 4 ++-- sql/timeseries/a11yLabel.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/timeseries/a11yImageAlt.sql b/sql/timeseries/a11yImageAlt.sql index fdf5bff..94c3347 100644 --- a/sql/timeseries/a11yImageAlt.sql +++ b/sql/timeseries/a11yImageAlt.sql @@ -3,12 +3,12 @@ SELECT FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - ROUND(SUM(IF(LAX_STRING(lighthouse.audits.['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM `httparchive.crawl.pages` WHERE /* Should really use the following to only include eligible sites. */ - /* LAX_STRING(lighthouse.audits.['image-alt'].score) IS NOT NULL AND */ + /* LAX_STRING(lighthouse.audits['image-alt'].score) IS NOT NULL AND */ lighthouse IS NOT NULL AND TO_JSON_STRING(lighthouse) != '{}' AND date >= '2017-06-01' AND diff --git a/sql/timeseries/a11yLabel.sql b/sql/timeseries/a11yLabel.sql index 19557c4..f6b29c0 100644 --- a/sql/timeseries/a11yLabel.sql +++ b/sql/timeseries/a11yLabel.sql @@ -3,12 +3,12 @@ SELECT FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - ROUND(SUM(IF(LAX_STRING(lighthouse.audits,label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM `httparchive.crawl.pages` WHERE /* Should really use the following to only include eligible sites. 
*/ - /* LAX_STRING(lighthouse.audits,label.score) IS NOT NULL AND */ + /* LAX_STRING(lighthouse.audits.label.score) IS NOT NULL AND */ lighthouse IS NOT NULL AND TO_JSON_STRING(lighthouse) != '{}' AND date >= '2017-06-01' AND From ecf3e40ddbac8e1c7d4ff3fc473054af2f613aa7 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:14:51 +0000 Subject: [PATCH 06/25] Revert date format in script --- sql/delete_date_from_reports.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/delete_date_from_reports.sh b/sql/delete_date_from_reports.sh index e36f4f4..596dd9f 100755 --- a/sql/delete_date_from_reports.sh +++ b/sql/delete_date_from_reports.sh @@ -4,9 +4,9 @@ # # Usage: # -# $ sql/delete_date_from_reports.sh -d YYYY-MM-DD -# $ sql/delete_date_from_reports.sh -d YYYY-MM-DD -l top1k -# $ sql/delete_date_from_reports.sh -d YYYY-MM-DD -l top1k -r "*crux*" +# $ sql/delete_date_from_reports.sh -d YYYY_MM_DD +# $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k +# $ sql/delete_date_from_reports.sh -d YYYY_MM_DD -l top1k -r "*crux*" # # Flags: # @@ -26,7 +26,7 @@ NO_CHANGES=0 while getopts ":nvd:l:r:" opt; do case "${opt}" in d) - YYYY-MM-DD=${OPTARG} + YYYY_MM_DD=${OPTARG} ;; v) VERBOSE=1 @@ -43,12 +43,12 @@ while getopts ":nvd:l:r:" opt; do esac done -if [[ "${YYYY-MM-DD}" == "" ]]; then - echo "Usage $0 -d 2021-12-01" +if [[ "${YYYY_MM_DD}" == "" ]]; then + echo "Usage $0 -d 2021_12_01" exit 1 fi -echo "${YYYY-MM-DD}" +echo "${YYYY_MM_DD}" # Run all timeseries queries. for query in sql/timeseries/$REPORTS.sql; do @@ -96,7 +96,7 @@ for query in sql/timeseries/$REPORTS.sql; do echo "${current_contents}\n" fi - new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${YYYY-MM-DD}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g') + new_contents=$(echo "$current_contents" | jq -c --indent 1 --arg date "${YYYY_MM_DD}" '.[] | select(.date!=$date)' | tr -d '\n' | sed 's/^/[ /' | sed 's/}$/ } ]\n/' | sed 's/}{/ }, {/g') if [ ${VERBOSE} -eq 1 ]; then echo "New JSON:" From 550778808256fda5f31397b4c1bb1ad408f3a512 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:15:30 +0000 Subject: [PATCH 07/25] One more revert --- sql/generate_reports.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index a8c7685..e515118 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -4,7 +4,7 @@ # # Usage: # -# $ sql/generateReports.sh -t -h YYYY-MM-DD +# $ sql/generateReports.sh -t -h YYYY_MM_DD # # Flags: # From 269fedee8db571aa99a4ae8862a669d9bbf46476 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:17:16 +0000 Subject: [PATCH 08/25] One more reversion --- sql/addDate.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/addDate.js b/sql/addDate.js index d2b4b63..540dd45 100755 --- a/sql/addDate.js +++ b/sql/addDate.js @@ -13,8 +13,8 @@ const fs = require('fs'); const date = process.argv[2]; if (!date) { - console.error(`You must pass a YYYY-MM-DD-formatted date as input. For example: - sql/addDate.js 2017-09-01`); + console.error(`You must pass a YYYY_MM_DD-formatted date as input. 
For example: + sql/addDate.js 2017_09_01`); process.exit(1); } From 15d8bcf5385bea39454970e0fc6589f3c36b6060 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:22:20 +0000 Subject: [PATCH 09/25] Missed reports --- sql/timeseries/tcp.sql | 2 +- sql/timeseries/ttci.sql | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sql/timeseries/tcp.sql b/sql/timeseries/tcp.sql index db01bdf..bf22f1f 100644 --- a/sql/timeseries/tcp.sql +++ b/sql/timeseries/tcp.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(101)] AS p10, diff --git a/sql/timeseries/ttci.sql b/sql/timeseries/ttci.sql index 4052400..8574eb7 100644 --- a/sql/timeseries/ttci.sql +++ b/sql/timeseries/ttci.sql @@ -1,8 +1,8 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, @@ -10,18 +10,20 @@ SELECT ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - CAST(IFNULL( - JSON_EXTRACT(report, '$.audits.interactive.numericValue'), + client, + date, + IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), IFNULL( - JSON_EXTRACT(report, '$.audits.interactive.rawValue'), - JSON_EXTRACT(report, '$.audits.consistently-interactive.rawValue') + FLOAT64(lighthouse.audits.interactive.rawValue), + FLOAT64(lighthouse.audits['consistently-interactive].rawValue) ) - ) AS FLOAT64) / 1000 AS value + ) / 1000 AS value FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - _TABLE_SUFFIX < '2022_03_01' + is_root_page AND + date >= '2016-01-01' ) GROUP BY date, From 1b1a5ef0fac1ca34f0fd2f17743ffef2193e4dfe Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:24:23 +0000 Subject: [PATCH 10/25] Typo --- sql/timeseries/ttci.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/timeseries/ttci.sql b/sql/timeseries/ttci.sql index 8574eb7..1ed7e73 100644 --- a/sql/timeseries/ttci.sql +++ b/sql/timeseries/ttci.sql @@ -16,7 +16,7 @@ FROM ( FLOAT64(lighthouse.audits.interactive.numericValue), IFNULL( FLOAT64(lighthouse.audits.interactive.rawValue), - FLOAT64(lighthouse.audits['consistently-interactive].rawValue) + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue) ) ) / 1000 AS value FROM From 2cce5e2593dbf5a5396caca52a8f672e17a90989 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:27:05 +0000 Subject: [PATCH 11/25] Update url to page --- sql/lens/top100k/histograms.sql | 2 +- sql/lens/top10k/histograms.sql | 2 +- sql/lens/top1k/histograms.sql | 2 +- sql/lens/top1m/histograms.sql | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/lens/top100k/histograms.sql b/sql/lens/top100k/histograms.sql index 4915598..f64ee4f 100644 --- a/sql/lens/top100k/histograms.sql +++ b/sql/lens/top100k/histograms.sql @@ -1,5 +1,5 @@ SELECT - url, + page, client FROM `httparchive.crawl.pages` diff --git 
a/sql/lens/top10k/histograms.sql b/sql/lens/top10k/histograms.sql index 33bc834..4183ed2 100644 --- a/sql/lens/top10k/histograms.sql +++ b/sql/lens/top10k/histograms.sql @@ -1,5 +1,5 @@ SELECT - url, + page, client FROM `httparchive.crawl.pages` diff --git a/sql/lens/top1k/histograms.sql b/sql/lens/top1k/histograms.sql index 6d35ac2..b5fd802 100644 --- a/sql/lens/top1k/histograms.sql +++ b/sql/lens/top1k/histograms.sql @@ -1,5 +1,5 @@ SELECT - url, + page, client FROM `httparchive.crawl.pages` diff --git a/sql/lens/top1m/histograms.sql b/sql/lens/top1m/histograms.sql index ce09f42..71e9ebf 100644 --- a/sql/lens/top1m/histograms.sql +++ b/sql/lens/top1m/histograms.sql @@ -1,5 +1,5 @@ SELECT - url, + page, client FROM `httparchive.crawl.pages` From eae67bcdd135b0c4d264ae42b8cea67d3cb6553d Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 22:46:40 +0000 Subject: [PATCH 12/25] Linting --- sql/.sqlfluff | 4 ++- sql/.sqlfluffignore | 1 + sql/histograms/cruxShopifyThemes.sql | 2 +- sql/histograms/offscreenImages.sql | 4 +-- sql/histograms/optimizedImages.sql | 38 +++++++++++++------------- sql/lens/drupal/crux_histograms.sql | 4 +-- sql/lens/drupal/crux_timeseries.sql | 4 +-- sql/lens/drupal/histograms.sql | 2 +- sql/lens/drupal/timeseries.sql | 2 +- sql/lens/magento/crux_histograms.sql | 4 +-- sql/lens/magento/crux_timeseries.sql | 4 +-- sql/lens/magento/histograms.sql | 2 +- sql/lens/magento/timeseries.sql | 2 +- sql/lens/top100k/crux_timeseries.sql | 2 +- sql/lens/top10k/crux_timeseries.sql | 2 +- sql/lens/top1k/crux_timeseries.sql | 2 +- sql/lens/top1m/crux_timeseries.sql | 2 +- sql/lens/wordpress/crux_histograms.sql | 4 +-- sql/lens/wordpress/crux_timeseries.sql | 4 +-- sql/lens/wordpress/histograms.sql | 2 +- sql/lens/wordpress/timeseries.sql | 2 +- sql/timeseries/a11yButtonName.sql | 2 +- sql/timeseries/h2.sql | 2 +- sql/timeseries/h3.sql | 5 ++-- 24 files changed, 53 insertions(+), 49 deletions(-) diff --git a/sql/.sqlfluff b/sql/.sqlfluff index 3fcc852..d06c400 100644 --- a/sql/.sqlfluff +++ b/sql/.sqlfluff @@ -11,7 +11,7 @@ templater = jinja ## Comma separated list of rules to check, or None for all rules = None ## Comma separated list of rules to exclude, or None -exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07 +exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07 # AL04 - Asks for unique table aliases meaning it complains if selecting from two 2021_07_01 tables as implicit alias is table name (not fully qualified) so same. # AL07 - Avoid aliases in from and join - why? # AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all. @@ -19,8 +19,10 @@ exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02 # CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case # CP03 - Function names will be mixed case so don't enforce case # CV02 - Use COALESCE instead of IFNULL or NVL. We think ISNULL is clearer. +# CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558 # LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future? # LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future? +# LT14 - Keywords on newline. 
We have some simple, single line joins # RF01 - BigQuery uses STRUCTS which can look like incorrect table references # RF02 - Asks for qualified columns for ambiguous ones, but we not qualify our columns, and they are not really ambiguous (or BigQuery would complain) # RF03 - Insists on references in column names even if not ambiguous. Bit OTT. diff --git a/sql/.sqlfluffignore b/sql/.sqlfluffignore index 7ef1f06..82fb062 100644 --- a/sql/.sqlfluffignore +++ b/sql/.sqlfluffignore @@ -1 +1,2 @@ /lens/*/crux_histograms.sql +/lens/*/timeseries.sql diff --git a/sql/histograms/cruxShopifyThemes.sql b/sql/histograms/cruxShopifyThemes.sql index e70f391..7b71403 100644 --- a/sql/histograms/cruxShopifyThemes.sql +++ b/sql/histograms/cruxShopifyThemes.sql @@ -13,7 +13,7 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F ); -- Test CrUX data exists -WITH crux_test AS ( +WITH crux_test AS ( -- noqa: ST03 SELECT 1 FROM diff --git a/sql/histograms/offscreenImages.sql b/sql/histograms/offscreenImages.sql index bd1a33f..18dda95 100644 --- a/sql/histograms/offscreenImages.sql +++ b/sql/histograms/offscreenImages.sql @@ -12,8 +12,8 @@ FROM ( COUNT(0) AS volume, CAST(FLOOR(IFNULL( INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), - INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) - * 1024) / 10240) * 10 AS INT64) AS bin + INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin FROM `httparchive.crawl.pages` WHERE diff --git a/sql/histograms/optimizedImages.sql b/sql/histograms/optimizedImages.sql index 3b79333..bb5ffca 100644 --- a/sql/histograms/optimizedImages.sql +++ b/sql/histograms/optimizedImages.sql @@ -6,25 +6,25 @@ FROM ( SELECT *, volume / SUM(volume) OVER (PARTITION BY client) AS pdf - FROM ( - SELECT - client, - COUNT(0) AS volume, - CAST(FLOOR(IFNULL( - INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), - INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) - * 1024) / 10240) * 10 AS INT64) AS bin - FROM - `httparchive.crawl.pages` - WHERE - date >= '2022-03-01' AND - date = '${YYYY-MM-DD}' AND - is_root_page - GROUP BY - bin, - client - HAVING - bin IS NOT NULL + FROM ( + SELECT + client, + COUNT(0) AS volume, + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin + FROM + `httparchive.crawl.pages` + WHERE + date >= '2022-03-01' AND + date = '${YYYY-MM-DD}' AND + is_root_page + GROUP BY + bin, + client + HAVING + bin IS NOT NULL ) ) ORDER BY diff --git a/sql/lens/drupal/crux_histograms.sql b/sql/lens/drupal/crux_histograms.sql index 682012e..8d237c5 100644 --- a/sql/lens/drupal/crux_histograms.sql +++ b/sql/lens/drupal/crux_histograms.sql @@ -7,9 +7,9 @@ INNER JOIN `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'Drupal' in UNNEST(technologies.technology) + 'Drupal' IN UNNEST(technologies.technology) GROUP BY 1, 2 ) -ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/drupal/crux_timeseries.sql b/sql/lens/drupal/crux_timeseries.sql index f182b63..6786c5a 100644 --- a/sql/lens/drupal/crux_timeseries.sql +++ 
b/sql/lens/drupal/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM @@ -7,7 +7,7 @@ FROM WHERE date >= '2010-11-15' AND is_root_page AND - 'Drupal' in UNNEST(technologies.technology) + 'Drupal' IN UNNEST(technologies.technology) GROUP BY 1, 2, diff --git a/sql/lens/drupal/histograms.sql b/sql/lens/drupal/histograms.sql index a539e99..bf7519f 100644 --- a/sql/lens/drupal/histograms.sql +++ b/sql/lens/drupal/histograms.sql @@ -5,7 +5,7 @@ FROM `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'Drupal' in UNNEST(technologies.technology) + 'Drupal' IN UNNEST(technologies.technology) GROUP BY 1, 2 diff --git a/sql/lens/drupal/timeseries.sql b/sql/lens/drupal/timeseries.sql index 73c2fe5..86b5d99 100644 --- a/sql/lens/drupal/timeseries.sql +++ b/sql/lens/drupal/timeseries.sql @@ -1 +1 @@ -'Drupal' in UNNEST(technologies.technology) +'Drupal' IN UNNEST(technologies.technology) diff --git a/sql/lens/magento/crux_histograms.sql b/sql/lens/magento/crux_histograms.sql index eac905d..315debe 100644 --- a/sql/lens/magento/crux_histograms.sql +++ b/sql/lens/magento/crux_histograms.sql @@ -7,9 +7,9 @@ INNER JOIN `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'Magento' in UNNEST(technologies.technology) + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, 2 ) -ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/magento/crux_timeseries.sql b/sql/lens/magento/crux_timeseries.sql index 342c617..d8eb981 100644 --- a/sql/lens/magento/crux_timeseries.sql +++ b/sql/lens/magento/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM @@ -7,7 +7,7 @@ FROM WHERE date >= '2010-11-15' AND is_root_page AND - 'Magento' in UNNEST(technologies.technology) + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, 2, diff --git a/sql/lens/magento/histograms.sql b/sql/lens/magento/histograms.sql index 7ee271f..ad7ee64 100644 --- a/sql/lens/magento/histograms.sql +++ b/sql/lens/magento/histograms.sql @@ -5,7 +5,7 @@ FROM `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'Magento' in UNNEST(technologies.technology) + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, 2 diff --git a/sql/lens/magento/timeseries.sql b/sql/lens/magento/timeseries.sql index 4823997..55746e8 100644 --- a/sql/lens/magento/timeseries.sql +++ b/sql/lens/magento/timeseries.sql @@ -1 +1 @@ -'Magento' in UNNEST(technologies.technology) +'Magento' IN UNNEST(technologies.technology) diff --git a/sql/lens/top100k/crux_timeseries.sql b/sql/lens/top100k/crux_timeseries.sql index f105392..d2b4c8f 100644 --- a/sql/lens/top100k/crux_timeseries.sql +++ b/sql/lens/top100k/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM diff --git a/sql/lens/top10k/crux_timeseries.sql b/sql/lens/top10k/crux_timeseries.sql index 8dc3d12..0241b42 100644 --- a/sql/lens/top10k/crux_timeseries.sql +++ b/sql/lens/top10k/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + 
SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM diff --git a/sql/lens/top1k/crux_timeseries.sql b/sql/lens/top1k/crux_timeseries.sql index a833fe6..fe19d42 100644 --- a/sql/lens/top1k/crux_timeseries.sql +++ b/sql/lens/top1k/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM diff --git a/sql/lens/top1m/crux_timeseries.sql b/sql/lens/top1m/crux_timeseries.sql index e480ce7..134deb3 100644 --- a/sql/lens/top1m/crux_timeseries.sql +++ b/sql/lens/top1m/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM diff --git a/sql/lens/wordpress/crux_histograms.sql b/sql/lens/wordpress/crux_histograms.sql index 6b416cc..fe83781 100644 --- a/sql/lens/wordpress/crux_histograms.sql +++ b/sql/lens/wordpress/crux_histograms.sql @@ -7,9 +7,9 @@ INNER JOIN `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'WordPress' in UNNEST(technologies.technology) + 'WordPress' IN UNNEST(technologies.technology) GROUP BY 1, 2 ) -ON (SUBSTR(page, 0, LENGTH(page) -1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/wordpress/crux_timeseries.sql b/sql/lens/wordpress/crux_timeseries.sql index 4a9683e..21c9cd4 100644 --- a/sql/lens/wordpress/crux_timeseries.sql +++ b/sql/lens/wordpress/crux_timeseries.sql @@ -1,5 +1,5 @@ SELECT - SUBSTR(page, 0, LENGTH(page) -1) AS origin, + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, IF(client = 'mobile', 'phone', client) AS device, date FROM @@ -7,7 +7,7 @@ FROM WHERE date >= '2010-11-15' AND is_root_page AND - 'WordPress' in UNNEST(technologies.technology) + 'WordPress' IN UNNEST(technologies.technology) GROUP BY 1, 2, diff --git a/sql/lens/wordpress/histograms.sql b/sql/lens/wordpress/histograms.sql index 1b92037..e1a916d 100644 --- a/sql/lens/wordpress/histograms.sql +++ b/sql/lens/wordpress/histograms.sql @@ -5,7 +5,7 @@ FROM `httparchive.crawl.pages` WHERE date = '${YYYY-MM-DD}' AND - 'WordPress' in UNNEST(technologies.technology) + 'WordPress' IN UNNEST(technologies.technology) GROUP BY 1, 2 diff --git a/sql/lens/wordpress/timeseries.sql b/sql/lens/wordpress/timeseries.sql index d87563e..6ac1aa9 100644 --- a/sql/lens/wordpress/timeseries.sql +++ b/sql/lens/wordpress/timeseries.sql @@ -1 +1 @@ -'WordPress' in UNNEST(technologies.technology) +'WordPress' IN UNNEST(technologies.technology) diff --git a/sql/timeseries/a11yButtonName.sql b/sql/timeseries/a11yButtonName.sql index eb0b653..5dedac5 100644 --- a/sql/timeseries/a11yButtonName.sql +++ b/sql/timeseries/a11yButtonName.sql @@ -3,7 +3,7 @@ SELECT FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM `httparchive.crawl.pages` WHERE diff --git a/sql/timeseries/h2.sql b/sql/timeseries/h2.sql index 626be80..3d0eed2 100644 --- a/sql/timeseries/h2.sql +++ b/sql/timeseries/h2.sql @@ -7,7 +7,7 @@ SELECT FROM 
`httparchive.crawl.requests` r INNER JOIN - `httparchive.crawl.pages` p + `httparchive.crawl.pages` USING (date, client, is_root_page, rank) WHERE is_root_page AND diff --git a/sql/timeseries/h3.sql b/sql/timeseries/h3.sql index f020482..4e50661 100644 --- a/sql/timeseries/h3.sql +++ b/sql/timeseries/h3.sql @@ -27,9 +27,10 @@ SELECT FROM `httparchive.crawl.requests` r LEFT OUTER JOIN - UNNEST (response_headers) AS resp ON (resp.name = 'alt-svc') + UNNEST(response_headers) AS resp +ON (resp.name = 'alt-svc') INNER JOIN - `httparchive.crawl.pages` p + `httparchive.crawl.pages` USING (date, client, is_root_page, rank) WHERE date >= '2020-01-01' AND is_root_page From 5a86818fe5d5d2bbce7278e4eaeabd62b6f1c0d1 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 23:05:38 +0000 Subject: [PATCH 13/25] Fix request queries --- sql/histograms/evalJs.sql | 3 +++ sql/timeseries/h2.sql | 2 +- sql/timeseries/h3.sql | 2 +- sql/timeseries/pctHttps.sql | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sql/histograms/evalJs.sql b/sql/histograms/evalJs.sql index 00ac184..4d2b200 100644 --- a/sql/histograms/evalJs.sql +++ b/sql/histograms/evalJs.sql @@ -13,6 +13,9 @@ FROM ( CAST(FLOAT64(payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin FROM `httparchive.crawl.requests` + INNER JOIN + `httparchive.crawl.pages` + USING (date, client, is_root_page, rank, page) WHERE date = '${YYYY-MM-DD}' AND is_root_page diff --git a/sql/timeseries/h2.sql b/sql/timeseries/h2.sql index 3d0eed2..2ce3681 100644 --- a/sql/timeseries/h2.sql +++ b/sql/timeseries/h2.sql @@ -8,7 +8,7 @@ FROM `httparchive.crawl.requests` r INNER JOIN `httparchive.crawl.pages` -USING (date, client, is_root_page, rank) +USING (date, client, is_root_page, rank, page) WHERE is_root_page AND date >= '2016-07-15' diff --git a/sql/timeseries/h3.sql b/sql/timeseries/h3.sql index 4e50661..dd15274 100644 --- a/sql/timeseries/h3.sql +++ b/sql/timeseries/h3.sql @@ -31,7 +31,7 @@ LEFT OUTER JOIN ON (resp.name = 'alt-svc') INNER JOIN `httparchive.crawl.pages` -USING (date, client, is_root_page, rank) +USING (date, client, is_root_page, rank, page) WHERE date >= '2020-01-01' AND is_root_page diff --git a/sql/timeseries/pctHttps.sql b/sql/timeseries/pctHttps.sql index 623f180..5ac3b7b 100644 --- a/sql/timeseries/pctHttps.sql +++ b/sql/timeseries/pctHttps.sql @@ -7,8 +7,8 @@ SELECT FROM `httparchive.crawl.requests` INNER JOIN - `httparchive.crawl.pages` p -USING (date, client, is_root_page, rank) + `httparchive.crawl.pages` +USING (date, client, is_root_page, rank, page) WHERE is_root_page AND date >= '2016-01-01' From cb2585ef207a4e4a57e4e980c8832f51b334b0ce Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 23:37:26 +0000 Subject: [PATCH 14/25] Better blink_features implementation --- sql/generate_reports.sh | 103 +++++------------- sql/lens/drupal/blink_timeseries.sql | 76 ------------- sql/lens/magento/blink_timeseries.sql | 76 ------------- sql/lens/top100k/blink_timeseries.sql | 43 -------- sql/lens/top10k/blink_timeseries.sql | 42 ------- sql/lens/top1k/blink_timeseries.sql | 42 ------- sql/lens/top1m/blink_timeseries.sql | 42 ------- sql/lens/wordpress/blink_timeseries.sql | 76 ------------- sql/timeseries/asyncClipboardRead.sql | 18 +-- sql/timeseries/badgeClear.sql | 18 +-- sql/timeseries/badgeSet.sql | 18 +-- sql/timeseries/contentIndex.sql | 18 +-- sql/timeseries/getInstalledRelatedApps.sql | 18 +-- sql/timeseries/idleDetection.sql | 18 +-- sql/timeseries/notificationTriggers.sql | 18 +--
sql/timeseries/periodicBackgroundSync.sql | 18 +-- .../periodicBackgroundSyncRegister.sql | 18 +-- sql/timeseries/quicTransport.sql | 18 +-- sql/timeseries/screenWakeLock.sql | 18 +-- sql/timeseries/storageEstimate.sql | 18 +-- sql/timeseries/storagePersist.sql | 20 ++-- sql/timeseries/swControlledPages.sql | 19 ++-- sql/timeseries/webSocketStream.sql | 20 ++-- 23 files changed, 198 insertions(+), 577 deletions(-) delete mode 100644 sql/lens/drupal/blink_timeseries.sql delete mode 100644 sql/lens/magento/blink_timeseries.sql delete mode 100644 sql/lens/top100k/blink_timeseries.sql delete mode 100644 sql/lens/top10k/blink_timeseries.sql delete mode 100644 sql/lens/top1k/blink_timeseries.sql delete mode 100644 sql/lens/top1m/blink_timeseries.sql delete mode 100644 sql/lens/wordpress/blink_timeseries.sql diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index e515118..a71943e 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -254,21 +254,7 @@ else # Only run if new dates if [[ -z "${YYYY_MM_DD}" || "${max_date}" < "${YYYY_MM_DD}" ]]; then - if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd > REPLACE(\"$max_date\",\"_\",\"\")" - if [[ -n "$YYYY_MM_DD" ]]; then - # If a date is given, then only run up until then (in case next month is mid-run as do not wanna get just desktop data) - date_join="${date_join} AND yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")" - fi - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - if [[ -n "$YYYY_MM_DD" ]]; then - # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) - date_join="${date_join} AND yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)" - fi - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" # Skip 2022_05_12 tables date_join="${date_join} AND date != \"2022-05-12\"" @@ -287,13 +273,7 @@ else elif [[ -n "$YYYY_MM_DD" ]]; then # Even if doing a force run we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data - if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= \"$DATE\"" - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= \"$DATE\"" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables @@ -304,13 +284,7 @@ else fi elif [[ -n "$YYYY_MM_DD" ]]; then # Even if the file does not exist we only 
wanna run up until date given in case next month is mid-run as do not wanna get just desktop data - if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= \"$DATE\"" - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= \"$DATE\"" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables date_join="${date_join} AND date != \"2022-05-12\"" @@ -324,57 +298,36 @@ else if [[ $LENS != "" ]]; then - if [[ $(grep "httparchive.blink_features.usage" $query) ]]; then - # blink_features.usage need to be replace by blink_features.features for lenses - if [[ -f sql/lens/$LENS/blink_timeseries.sql ]]; then - echo "Using alternative blink_timeseries lens join" - lens_join="$(cat sql/lens/$LENS/blink_timeseries.sql | tr '\n' ' ')" - - # For blink features for lenses we have a BLINK_DATE_JOIN variable to replace - if [[ -z "${date_join}" ]]; then - sql=$(sed -e "s/\`httparchive.blink_features.usage\`/($lens_join)/" $query \ - | sed -e "s/ {{ BLINK_DATE_JOIN }}//g") - else - sql=$( sed -e "s/\`httparchive.blink_features.usage\`/($lens_join)/" $query \ - | sed -e "s/{{ BLINK_DATE_JOIN }}/AND $date_join/g") - fi - else - echo "blink_features.usage queries not supported for this lens so skipping lens" - continue - fi + if [[ $metric != crux* ]]; then + lens_clause="$(cat sql/lens/$LENS/timeseries.sql)" + lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND" + lens_join="" else + echo "CrUX query so using alternative lens join" + lens_clause="" + lens_clause_and="" + lens_join="JOIN ($(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')) USING (origin, date, device)" + fi - if [[ $metric != crux* ]]; then - lens_clause="$(cat sql/lens/$LENS/timeseries.sql)" - lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND" - lens_join="" + if [[ -n "${date_join}" ]]; then + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it, before GROUP BY + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else - echo "CrUX query so using alternative lens join" - lens_clause="" - lens_clause_and="" - lens_join="JOIN ($(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')) USING (origin, date, device)" + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") fi - - if [[ -n "${date_join}" ]]; then - if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY - sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - else - # If WHERE clause does not exists then add it, before GROUP BY - sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - fi + else + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it, before GROUP BY + sql=$(sed -e "s/\(WHERE\)/\1 
$lens_clause_and /" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else - if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY - sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - else - # If WHERE clause does not exists then add it, before GROUP BY - sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - fi + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") fi fi diff --git a/sql/lens/drupal/blink_timeseries.sql b/sql/lens/drupal/blink_timeseries.sql deleted file mode 100644 index 090f47b..0000000 --- a/sql/lens/drupal/blink_timeseries.sql +++ /dev/null @@ -1,76 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2016-11-15' AND - app = 'Drupal' - GROUP BY - client, - date, - page - ) -ON (url = page AND yyyymmdd = date AND client = page_client) -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2017-01-01' AND - app = 'Drupal' - GROUP BY - client, - date, - page - ) - ON (url = page AND yyyymmdd = date AND client = page_client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/magento/blink_timeseries.sql b/sql/lens/magento/blink_timeseries.sql deleted file mode 100644 index 27aa662..0000000 --- a/sql/lens/magento/blink_timeseries.sql +++ /dev/null @@ -1,76 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2016-11-15' AND - app = 'Magento' - GROUP BY - client, - date, - page - ) -ON (url = page AND yyyymmdd = date AND client = page_client) -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2017-01-01' AND - app = 'Magento' - GROUP BY - client, - date, - page - ) - ON (url = page AND yyyymmdd = date AND client = page_client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top100k/blink_timeseries.sql b/sql/lens/top100k/blink_timeseries.sql deleted file mode 100644 index 
e365003..0000000 --- a/sql/lens/top100k/blink_timeseries.sql +++ /dev/null @@ -1,43 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 100000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client - ) -USING (yyyymmdd, client) -WHERE - rank <= 100000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top10k/blink_timeseries.sql b/sql/lens/top10k/blink_timeseries.sql deleted file mode 100644 index 3c85d9d..0000000 --- a/sql/lens/top10k/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 10000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 10000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top1k/blink_timeseries.sql b/sql/lens/top1k/blink_timeseries.sql deleted file mode 100644 index 243acd8..0000000 --- a/sql/lens/top1k/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 1000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 1000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top1m/blink_timeseries.sql b/sql/lens/top1m/blink_timeseries.sql deleted file mode 100644 index 81a262a..0000000 --- a/sql/lens/top1m/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 1000000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 1000000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git 
a/sql/lens/wordpress/blink_timeseries.sql b/sql/lens/wordpress/blink_timeseries.sql deleted file mode 100644 index 80fa331..0000000 --- a/sql/lens/wordpress/blink_timeseries.sql +++ /dev/null @@ -1,76 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2016-11-15' AND - app = 'WordPress' - GROUP BY - client, - date, - page - ) -ON (url = page AND yyyymmdd = date AND client = page_client) -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - JOIN - ( - SELECT - page, - client AS page_client, - date, - url AS tech_url - FROM - `httparchive.crawl.pages` - WHERE - date >= '2017-01-01' AND - app = 'WordPress' - GROUP BY - client, - date, - page - ) - ON (url = page AND yyyymmdd = date AND client = page_client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/timeseries/asyncClipboardRead.sql b/sql/timeseries/asyncClipboardRead.sql index 8a27b28..476f537 100644 --- a/sql/timeseries/asyncClipboardRead.sql +++ b/sql/timeseries/asyncClipboardRead.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2369' OR feature = 'AsyncClipboardAPIRead', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2369' OR feature = 'AsyncClipboardAPIRead', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/badgeClear.sql b/sql/timeseries/badgeClear.sql index 93332ac..556021e 100644 --- a/sql/timeseries/badgeClear.sql +++ b/sql/timeseries/badgeClear.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2727' OR feature = 'BadgeClear', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2727' OR feature = 'BadgeClear', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2727' OR feat.feature = 
'BadgeClear') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/badgeSet.sql b/sql/timeseries/badgeSet.sql index bb3b206..152e951 100644 --- a/sql/timeseries/badgeSet.sql +++ b/sql/timeseries/badgeSet.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2726' OR feature = 'BadgeSet', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2726' OR feature = 'BadgeSet', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2726' OR feat.feature = 'BadgeSet') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/contentIndex.sql b/sql/timeseries/contentIndex.sql index 6762db1..2060d63 100644 --- a/sql/timeseries/contentIndex.sql +++ b/sql/timeseries/contentIndex.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2983' OR feature = 'ContentIndexAdd', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2983' OR feature = 'ContentIndexAdd', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/getInstalledRelatedApps.sql b/sql/timeseries/getInstalledRelatedApps.sql index 5d34788..deaa0be 100644 --- a/sql/timeseries/getInstalledRelatedApps.sql +++ b/sql/timeseries/getInstalledRelatedApps.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1870' OR feature = 'V8Navigator_GetInstalledRelatedApps_Method', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1870' OR feature = 'V8Navigator_GetInstalledRelatedApps_Method', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1870' OR feat.feature = 
'V8Navigator_GetInstalledRelatedApps_Method') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/idleDetection.sql b/sql/timeseries/idleDetection.sql index c1419cb..f4a6ee3 100644 --- a/sql/timeseries/idleDetection.sql +++ b/sql/timeseries/idleDetection.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2834' OR feature = 'IdleDetectionStart', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2834' OR feature = 'IdleDetectionStart', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2834' OR feat.feature = 'IdleDetectionStart') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/notificationTriggers.sql b/sql/timeseries/notificationTriggers.sql index 2b404af..e988b2c 100644 --- a/sql/timeseries/notificationTriggers.sql +++ b/sql/timeseries/notificationTriggers.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3017' OR feature = 'NotificationShowTrigger', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3017' OR feature = 'NotificationShowTrigger', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/periodicBackgroundSync.sql b/sql/timeseries/periodicBackgroundSync.sql index 87fd2e9..c83505c 100644 --- a/sql/timeseries/periodicBackgroundSync.sql +++ b/sql/timeseries/periodicBackgroundSync.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2930' OR feature = 'PeriodicBackgroundSync', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2930' OR feature = 'PeriodicBackgroundSync', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN 
UNNEST(features) AS feat +ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/periodicBackgroundSyncRegister.sql b/sql/timeseries/periodicBackgroundSyncRegister.sql index 701b197..bde4190 100644 --- a/sql/timeseries/periodicBackgroundSyncRegister.sql +++ b/sql/timeseries/periodicBackgroundSyncRegister.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2931' OR feature = 'PeriodicBackgroundSyncRegister', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2931' OR feature = 'PeriodicBackgroundSyncRegister', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2931' OR feat.feature = 'PeriodicBackgroundSyncRegister') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/quicTransport.sql b/sql/timeseries/quicTransport.sql index c207a8f..a30b1b4 100644 --- a/sql/timeseries/quicTransport.sql +++ b/sql/timeseries/quicTransport.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3184' OR feature = 'QuicTransport', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3184' OR feature = 'QuicTransport', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3184' OR feat.feature = 'QuicTransport') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/screenWakeLock.sql b/sql/timeseries/screenWakeLock.sql index bb149a3..b34af06 100644 --- a/sql/timeseries/screenWakeLock.sql +++ b/sql/timeseries/screenWakeLock.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3005' OR feature = 'WakeLockAcquireScreenLock', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3005' OR feature = 'WakeLockAcquireScreenLock', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - 
`httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/storageEstimate.sql b/sql/timeseries/storageEstimate.sql index e9b05e9..6b4d73c 100644 --- a/sql/timeseries/storageEstimate.sql +++ b/sql/timeseries/storageEstimate.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1371' OR feature = 'DurableStorageEstimate', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1371' OR feature = 'DurableStorageEstimate', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/storagePersist.sql b/sql/timeseries/storagePersist.sql index e3cbc03..7882e9f 100644 --- a/sql/timeseries/storagePersist.sql +++ b/sql/timeseries/storagePersist.sql @@ -1,18 +1,24 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1369' OR feature = 'DurableStoragePersist', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1369' OR feature = 'DurableStoragePersist', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '1369' OR feat.feature = 'DurableStoragePersist') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, num_urls DESC + diff --git a/sql/timeseries/swControlledPages.sql b/sql/timeseries/swControlledPages.sql index 2b48b1f..3272645 100644 --- a/sql/timeseries/swControlledPages.sql +++ b/sql/timeseries/swControlledPages.sql @@ -1,17 +1,22 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '990' OR feature = 'ServiceWorkerControlledPage', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '990' OR feature = 'ServiceWorkerControlledPage', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) AS num_urls, +
ROUND(SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/webSocketStream.sql b/sql/timeseries/webSocketStream.sql index a94cfaf..b90180d 100644 --- a/sql/timeseries/webSocketStream.sql +++ b/sql/timeseries/webSocketStream.sql @@ -1,18 +1,24 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3018' OR feature = 'WebSocketStreamConstructor', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3018' OR feature = 'WebSocketStreamConstructor', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor') +WHERE + date = '2024-11-01' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, num_urls DESC + From ea017f88112ce49d0053d23dd250cef36745919e Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 23:38:38 +0000 Subject: [PATCH 15/25] Linting --- sql/timeseries/storagePersist.sql | 1 - sql/timeseries/webSocketStream.sql | 1 - 2 files changed, 2 deletions(-) diff --git a/sql/timeseries/storagePersist.sql b/sql/timeseries/storagePersist.sql index 7882e9f..fe6e194 100644 --- a/sql/timeseries/storagePersist.sql +++ b/sql/timeseries/storagePersist.sql @@ -21,4 +21,3 @@ ORDER BY date DESC, client, num_urls DESC - diff --git a/sql/timeseries/webSocketStream.sql b/sql/timeseries/webSocketStream.sql index b90180d..a756072 100644 --- a/sql/timeseries/webSocketStream.sql +++ b/sql/timeseries/webSocketStream.sql @@ -21,4 +21,3 @@ ORDER BY date DESC, client, num_urls DESC - From fb29cf030b10a011e36b70aecf0923c3f64d148d Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 13 Jan 2025 23:58:46 +0000 Subject: [PATCH 16/25] Fix date --- sql/timeseries/asyncClipboardRead.sql | 2 +- sql/timeseries/badgeClear.sql | 2 +- sql/timeseries/badgeSet.sql | 2 +- sql/timeseries/contentIndex.sql | 2 +- sql/timeseries/getInstalledRelatedApps.sql | 2 +- sql/timeseries/idleDetection.sql | 2 +- sql/timeseries/notificationTriggers.sql | 2 +- sql/timeseries/periodicBackgroundSync.sql | 2 +- sql/timeseries/periodicBackgroundSyncRegister.sql | 2 +- sql/timeseries/quicTransport.sql | 2 +- sql/timeseries/screenWakeLock.sql | 2 +- sql/timeseries/storageEstimate.sql | 2 +- sql/timeseries/storagePersist.sql | 2 +- sql/timeseries/swControlledPages.sql | 2 +- sql/timeseries/webSocketStream.sql | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sql/timeseries/asyncClipboardRead.sql b/sql/timeseries/asyncClipboardRead.sql index 476f537..c712e87 100644 --- 
a/sql/timeseries/asyncClipboardRead.sql +++ b/sql/timeseries/asyncClipboardRead.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/badgeClear.sql b/sql/timeseries/badgeClear.sql index 556021e..a8514f0 100644 --- a/sql/timeseries/badgeClear.sql +++ b/sql/timeseries/badgeClear.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2727' OR feat.feature = 'BadgeClear') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/badgeSet.sql b/sql/timeseries/badgeSet.sql index 152e951..7655f7b 100644 --- a/sql/timeseries/badgeSet.sql +++ b/sql/timeseries/badgeSet.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2726' OR feat.feature = 'BadgeSet') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/contentIndex.sql b/sql/timeseries/contentIndex.sql index 2060d63..d46a73d 100644 --- a/sql/timeseries/contentIndex.sql +++ b/sql/timeseries/contentIndex.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/getInstalledRelatedApps.sql b/sql/timeseries/getInstalledRelatedApps.sql index deaa0be..b17be36 100644 --- a/sql/timeseries/getInstalledRelatedApps.sql +++ b/sql/timeseries/getInstalledRelatedApps.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '1870' OR feat.feature = 'V8Navigator_GetInstalledRelatedApps_Method') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/idleDetection.sql b/sql/timeseries/idleDetection.sql index f4a6ee3..70464a1 100644 --- a/sql/timeseries/idleDetection.sql +++ b/sql/timeseries/idleDetection.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2834' OR feat.feature = 'IdleDetectionStart') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/notificationTriggers.sql b/sql/timeseries/notificationTriggers.sql index e988b2c..3a1cd98 100644 --- a/sql/timeseries/notificationTriggers.sql +++ b/sql/timeseries/notificationTriggers.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/periodicBackgroundSync.sql b/sql/timeseries/periodicBackgroundSync.sql index c83505c..5004969 100644 --- a/sql/timeseries/periodicBackgroundSync.sql +++ b/sql/timeseries/periodicBackgroundSync.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/periodicBackgroundSyncRegister.sql b/sql/timeseries/periodicBackgroundSyncRegister.sql index bde4190..99a25f0 100644 --- a/sql/timeseries/periodicBackgroundSyncRegister.sql +++ b/sql/timeseries/periodicBackgroundSyncRegister.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '2931' OR feat.feature = 
'PeriodicBackgroundSyncRegister') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/quicTransport.sql b/sql/timeseries/quicTransport.sql index a30b1b4..e49ed8f 100644 --- a/sql/timeseries/quicTransport.sql +++ b/sql/timeseries/quicTransport.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '3184' OR feat.feature = 'QuicTransport') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/screenWakeLock.sql b/sql/timeseries/screenWakeLock.sql index b34af06..6946319 100644 --- a/sql/timeseries/screenWakeLock.sql +++ b/sql/timeseries/screenWakeLock.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/storageEstimate.sql b/sql/timeseries/storageEstimate.sql index 6b4d73c..6731a6d 100644 --- a/sql/timeseries/storageEstimate.sql +++ b/sql/timeseries/storageEstimate.sql @@ -10,7 +10,7 @@ FROM LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/storagePersist.sql b/sql/timeseries/storagePersist.sql index fe6e194..168c2f0 100644 --- a/sql/timeseries/storagePersist.sql +++ b/sql/timeseries/storagePersist.sql @@ -11,7 +11,7 @@ LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '1369' OR feat.feature = 'DurableStoragePersist') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/swControlledPages.sql b/sql/timeseries/swControlledPages.sql index 3272645..7355a79 100644 --- a/sql/timeseries/swControlledPages.sql +++ b/sql/timeseries/swControlledPages.sql @@ -11,7 +11,7 @@ LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, diff --git a/sql/timeseries/webSocketStream.sql b/sql/timeseries/webSocketStream.sql index a756072..7a99e89 100644 --- a/sql/timeseries/webSocketStream.sql +++ b/sql/timeseries/webSocketStream.sql @@ -11,7 +11,7 @@ LEFT OUTER JOIN UNNEST(features) AS feat ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor') WHERE - date = '2024-11-01' AND + date >= '2016-11-15' AND is_root_page GROUP BY date, From 358224d3f8ceab13c3042ad955d66c1d87514676 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 Jan 2025 00:08:57 +0000 Subject: [PATCH 17/25] Better lens histograms --- sql/generate_reports.sh | 22 ++++++++++++++++++---- sql/lens/drupal/histograms.sql | 12 +----------- sql/lens/magento/histograms.sql | 12 +----------- sql/lens/top100k/histograms.sql | 13 +------------ sql/lens/top10k/histograms.sql | 13 +------------ sql/lens/top1k/histograms.sql | 13 +------------ sql/lens/top1m/histograms.sql | 13 +------------ sql/lens/wordpress/histograms.sql | 12 +----------- 8 files changed, 25 insertions(+), 85 deletions(-) diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index a71943e..8de7c6e 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -151,8 +151,13 @@ else # Replace the date template in the query.
if [[ $LENS != "" ]]; then echo -e "Generating ${metric} report for $LENS" - lens_join="JOIN ($(cat sql/lens/$LENS/histograms.sql | tr '\n' ' ')) USING (page, client)" + lens_clause="$(cat sql/lens/$LENS/histograms.sql)" + lens_clause_and="$(cat sql/lens/$LENS/histograms.sql) AND" + lens_join="" + if [[ $metric == crux* ]]; then + lens_clause="" + lens_clause_and="" if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then echo "Using alternative crux lens join" lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | sed -e "s/--noqa: disable=PRS//g" | tr '\n' ' ')" @@ -165,9 +170,18 @@ else | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ | sed -e "s/\${YYYYMM}/$YYYYMM/g") else - sql=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query \ - | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ - | sed -e "s/\${YYYYMM}/$YYYYMM/g") + + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it, before GROUP BY + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ + | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ + | sed -e "s/\${YYYYMM}/$YYYYMM/g") + else + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ + | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ + | sed -e "s/\${YYYYMM}/$YYYYMM/g") + fi fi else echo -e "Generating ${metric} report for base (no lens)" diff --git a/sql/lens/drupal/histograms.sql b/sql/lens/drupal/histograms.sql index bf7519f..86b5d99 100644 --- a/sql/lens/drupal/histograms.sql +++ b/sql/lens/drupal/histograms.sql @@ -1,11 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - 'Drupal' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2 +'Drupal' IN UNNEST(technologies.technology) diff --git a/sql/lens/magento/histograms.sql b/sql/lens/magento/histograms.sql index ad7ee64..55746e8 100644 --- a/sql/lens/magento/histograms.sql +++ b/sql/lens/magento/histograms.sql @@ -1,11 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - 'Magento' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2 +'Magento' IN UNNEST(technologies.technology) diff --git a/sql/lens/top100k/histograms.sql b/sql/lens/top100k/histograms.sql index f64ee4f..da9eeaa 100644 --- a/sql/lens/top100k/histograms.sql +++ b/sql/lens/top100k/histograms.sql @@ -1,12 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - is_root_page AND - rank <= 100000 -GROUP BY - 1, - 2 +rank <= 100000 diff --git a/sql/lens/top10k/histograms.sql b/sql/lens/top10k/histograms.sql index 4183ed2..57dbc02 100644 --- a/sql/lens/top10k/histograms.sql +++ b/sql/lens/top10k/histograms.sql @@ -1,12 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - is_root_page AND - rank <= 10000 -GROUP BY - 1, - 2 +rank <= 10000 diff --git a/sql/lens/top1k/histograms.sql b/sql/lens/top1k/histograms.sql index b5fd802..75ca1c8 100644 --- a/sql/lens/top1k/histograms.sql +++ b/sql/lens/top1k/histograms.sql @@ -1,12 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - is_root_page AND - rank <= 1000 -GROUP BY - 1, - 2 +rank <= 1000 diff --git a/sql/lens/top1m/histograms.sql b/sql/lens/top1m/histograms.sql index 71e9ebf..57dbc02 100644 --- a/sql/lens/top1m/histograms.sql +++ b/sql/lens/top1m/histograms.sql @@ -1,12 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - is_root_page AND - rank <= 1000000 
-GROUP BY - 1, - 2 +rank <= 1000000 diff --git a/sql/lens/wordpress/histograms.sql index e1a916d..6ac1aa9 100644 --- a/sql/lens/wordpress/histograms.sql +++ b/sql/lens/wordpress/histograms.sql @@ -1,11 +1 @@ -SELECT - page, - client -FROM - `httparchive.crawl.pages` -WHERE - date = '${YYYY-MM-DD}' AND - 'WordPress' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2 +'WordPress' IN UNNEST(technologies.technology) From b352515e9cff3b8458ff8fc6023dcd110d832a93 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 Jan 2025 00:25:21 +0000 Subject: [PATCH 18/25] More consistent crux lenses --- sql/.sqlfluffignore | 2 ++ sql/generate_reports.sh | 12 +++++----- sql/lens/drupal/crux_histograms.sql | 5 ++-- sql/lens/drupal/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/magento/crux_histograms.sql | 5 ++-- sql/lens/magento/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/top100k/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/top10k/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/top1k/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/top1m/crux_timeseries.sql | 32 ++++++++++++++------------ sql/lens/wordpress/crux_histograms.sql | 5 ++-- sql/lens/wordpress/crux_timeseries.sql | 32 ++++++++++++++------------ 12 files changed, 133 insertions(+), 120 deletions(-) diff --git a/sql/.sqlfluffignore b/sql/.sqlfluffignore index 82fb062..c7c6d67 100644 --- a/sql/.sqlfluffignore +++ b/sql/.sqlfluffignore @@ -1,2 +1,4 @@ /lens/*/crux_histograms.sql +/lens/*/crux_timeseries.sql +/lens/*/histograms.sql /lens/*/timeseries.sql diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index 8de7c6e..2273261 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -160,7 +160,7 @@ else lens_clause_and="" if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then echo "Using alternative crux lens join" - lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | sed -e "s/--noqa: disable=PRS//g" | tr '\n' ' ')" + lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | tr '\n' ' ')" else echo "CrUX queries do not support histograms for this lens so skipping" continue @@ -172,7 +172,7 @@ else if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY + # If WHERE clause already exists then add to it sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ | sed -e "s/\${YYYYMM}/$YYYYMM/g") else @@ -320,12 +320,12 @@ else echo "CrUX query so using alternative lens join" lens_clause="" lens_clause_and="" - lens_join="JOIN ($(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')) USING (origin, date, device)" + lens_join="$(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')" fi if [[ -n "${date_join}" ]]; then if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY + # If WHERE clause already exists then add to it sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else @@ -335,7 +335,7 @@ else if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY + # If WHERE clause already exists then add to it sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else @@ -348,7 +348,7 @@ else if [[ -n "${date_join}" ]]; then if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add
to it, before GROUP BY + # If WHERE clause already exists then add to it sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND /" $query) else # If WHERE clause does not exists then add it, before GROUP BY diff --git a/sql/lens/drupal/crux_histograms.sql b/sql/lens/drupal/crux_histograms.sql index 8d237c5..58edc7e 100644 --- a/sql/lens/drupal/crux_histograms.sql +++ b/sql/lens/drupal/crux_histograms.sql @@ -1,5 +1,4 @@ -INNER JOIN - ( +INNER JOIN ( SELECT page, client @@ -11,5 +10,5 @@ INNER JOIN GROUP BY 1, 2 - ) +) ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/drupal/crux_timeseries.sql b/sql/lens/drupal/crux_timeseries.sql index 6786c5a..3ebac3e 100644 --- a/sql/lens/drupal/crux_timeseries.sql +++ b/sql/lens/drupal/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - 'Drupal' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Drupal' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/magento/crux_histograms.sql b/sql/lens/magento/crux_histograms.sql index 315debe..f4fb070 100644 --- a/sql/lens/magento/crux_histograms.sql +++ b/sql/lens/magento/crux_histograms.sql @@ -1,5 +1,4 @@ -INNER JOIN - ( +INNER JOIN ( SELECT page, client @@ -11,5 +10,5 @@ INNER JOIN GROUP BY 1, 2 - ) +) ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/magento/crux_timeseries.sql b/sql/lens/magento/crux_timeseries.sql index d8eb981..3023d3f 100644 --- a/sql/lens/magento/crux_timeseries.sql +++ b/sql/lens/magento/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - 'Megento' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Megento' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top100k/crux_timeseries.sql b/sql/lens/top100k/crux_timeseries.sql index d2b4c8f..09c91df 100644 --- a/sql/lens/top100k/crux_timeseries.sql +++ b/sql/lens/top100k/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - rank = 100000 -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 100000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top10k/crux_timeseries.sql b/sql/lens/top10k/crux_timeseries.sql 
index 0241b42..d4749d5 100644 --- a/sql/lens/top10k/crux_timeseries.sql +++ b/sql/lens/top10k/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - rank = 10000 -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 10000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top1k/crux_timeseries.sql b/sql/lens/top1k/crux_timeseries.sql index fe19d42..2bd0720 100644 --- a/sql/lens/top1k/crux_timeseries.sql +++ b/sql/lens/top1k/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - rank = 1000 -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 1000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top1m/crux_timeseries.sql b/sql/lens/top1m/crux_timeseries.sql index 134deb3..dbf0c64 100644 --- a/sql/lens/top1m/crux_timeseries.sql +++ b/sql/lens/top1m/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - rank = 1000000 -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 1000000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/wordpress/crux_histograms.sql b/sql/lens/wordpress/crux_histograms.sql index fe83781..ccb5f9a 100644 --- a/sql/lens/wordpress/crux_histograms.sql +++ b/sql/lens/wordpress/crux_histograms.sql @@ -1,5 +1,4 @@ -INNER JOIN - ( +INNER JOIN ( SELECT page, client @@ -11,5 +10,5 @@ INNER JOIN GROUP BY 1, 2 - ) +) ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/wordpress/crux_timeseries.sql b/sql/lens/wordpress/crux_timeseries.sql index 21c9cd4..dd869d8 100644 --- a/sql/lens/wordpress/crux_timeseries.sql +++ b/sql/lens/wordpress/crux_timeseries.sql @@ -1,15 +1,17 @@ -SELECT - SUBSTR(page, 0, LENGTH(page) - 1) AS origin, - IF(client = 'mobile', 'phone', client) AS device, - date -FROM - `httparchive.crawl.pages` -WHERE - date >= '2010-11-15' AND - is_root_page AND - 'WordPress' IN UNNEST(technologies.technology) -GROUP BY - 1, - 2, - 3, - 4 +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'WordPress' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) From 4616b332a3229815ed163122da6bd4a16a3647a1 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 
Jan 2025 00:37:49 +0000 Subject: [PATCH 19/25] Fix evalJs --- sql/histograms/evalJs.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/histograms/evalJs.sql b/sql/histograms/evalJs.sql index 4d2b200..356905a 100644 --- a/sql/histograms/evalJs.sql +++ b/sql/histograms/evalJs.sql @@ -10,9 +10,9 @@ FROM ( SELECT client, COUNT(0) AS volume, - CAST(FLOAT64(payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin + CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin FROM - `httparchive.crawl.requests` + `httparchive.crawl.requests` r INNER JOIN `httparchive.crawl.pages` USING (date, client, is_root_page, rank, page) From 1fef068c484f7ca7f34b23af78402bb246567001 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 Jan 2025 01:14:00 +0000 Subject: [PATCH 20/25] Fix dates --- sql/timeseries/a11yScores.sql | 4 ++-- sql/timeseries/bootupJs.sql | 4 ++-- sql/timeseries/bytesCss.sql | 2 +- sql/timeseries/bytesFont.sql | 2 +- sql/timeseries/bytesHtml.sql | 2 +- sql/timeseries/bytesImg.sql | 2 +- sql/timeseries/bytesJs.sql | 2 +- sql/timeseries/bytesOther.sql | 2 +- sql/timeseries/bytesTotal.sql | 2 +- sql/timeseries/bytesVideo.sql | 2 +- sql/timeseries/dcl.sql | 2 +- sql/timeseries/numUrls.sql | 2 +- sql/timeseries/ol.sql | 2 +- sql/timeseries/reqCss.sql | 2 +- sql/timeseries/reqFont.sql | 2 +- sql/timeseries/reqHtml.sql | 2 +- sql/timeseries/reqImg.sql | 2 +- sql/timeseries/reqJs.sql | 2 +- sql/timeseries/reqOther.sql | 2 +- sql/timeseries/reqTotal.sql | 2 +- sql/timeseries/reqVideo.sql | 2 +- 21 files changed, 23 insertions(+), 23 deletions(-) diff --git a/sql/timeseries/a11yScores.sql b/sql/timeseries/a11yScores.sql index 6090cb1..3c98e92 100644 --- a/sql/timeseries/a11yScores.sql +++ b/sql/timeseries/a11yScores.sql @@ -9,8 +9,8 @@ LANGUAGE js AS """ """; SELECT - date, - UNIX_DATE(CAST(REPLACE(date, '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, diff --git a/sql/timeseries/bootupJs.sql b/sql/timeseries/bootupJs.sql index ea98557..e02f2f1 100644 --- a/sql/timeseries/bootupJs.sql +++ b/sql/timeseries/bootupJs.sql @@ -1,7 +1,7 @@ #standardSQL SELECT - date, - timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, diff --git a/sql/timeseries/bytesCss.sql b/sql/timeseries/bytesCss.sql index dc7007b..57f8bdc 100644 --- a/sql/timeseries/bytesCss.sql +++ b/sql/timeseries/bytesCss.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesFont.sql b/sql/timeseries/bytesFont.sql index df40db1..f3e4ea9 100644 --- a/sql/timeseries/bytesFont.sql +++ b/sql/timeseries/bytesFont.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesHtml.sql b/sql/timeseries/bytesHtml.sql index 6f93049..133941c 100644 --- 
a/sql/timeseries/bytesHtml.sql +++ b/sql/timeseries/bytesHtml.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesImg.sql b/sql/timeseries/bytesImg.sql index f4e09e4..f71309d 100644 --- a/sql/timeseries/bytesImg.sql +++ b/sql/timeseries/bytesImg.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesJs.sql b/sql/timeseries/bytesJs.sql index 20c20f9..17896c7 100644 --- a/sql/timeseries/bytesJs.sql +++ b/sql/timeseries/bytesJs.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesOther.sql b/sql/timeseries/bytesOther.sql index ac9fdb6..378b52b 100644 --- a/sql/timeseries/bytesOther.sql +++ b/sql/timeseries/bytesOther.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesTotal.sql b/sql/timeseries/bytesTotal.sql index c6e9da8..acb65fd 100644 --- a/sql/timeseries/bytesTotal.sql +++ b/sql/timeseries/bytesTotal.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/bytesVideo.sql b/sql/timeseries/bytesVideo.sql index fddc697..e5bbcfd 100644 --- a/sql/timeseries/bytesVideo.sql +++ b/sql/timeseries/bytesVideo.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10, diff --git a/sql/timeseries/dcl.sql b/sql/timeseries/dcl.sql index b949605..1b54d01 100644 --- a/sql/timeseries/dcl.sql +++ b/sql/timeseries/dcl.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)] / 1000, 2) AS p10, diff --git a/sql/timeseries/numUrls.sql b/sql/timeseries/numUrls.sql index 272265b..df992df 100644 --- a/sql/timeseries/numUrls.sql +++ b/sql/timeseries/numUrls.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, COUNT(0) AS urls diff --git a/sql/timeseries/ol.sql b/sql/timeseries/ol.sql index 2c4c3ce..95c67ce 100644 --- a/sql/timeseries/ol.sql +++ b/sql/timeseries/ol.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10, diff --git a/sql/timeseries/reqCss.sql b/sql/timeseries/reqCss.sql index 
37604fe..7c0dc8a 100644 --- a/sql/timeseries/reqCss.sql +++ b/sql/timeseries/reqCss.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqFont.sql b/sql/timeseries/reqFont.sql index 1e76455..a490a83 100644 --- a/sql/timeseries/reqFont.sql +++ b/sql/timeseries/reqFont.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqHtml.sql b/sql/timeseries/reqHtml.sql index b173863..a564e22 100644 --- a/sql/timeseries/reqHtml.sql +++ b/sql/timeseries/reqHtml.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqImg.sql b/sql/timeseries/reqImg.sql index 3a27087..c23bedd 100644 --- a/sql/timeseries/reqImg.sql +++ b/sql/timeseries/reqImg.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqJs.sql b/sql/timeseries/reqJs.sql index 1738800..8abacce 100644 --- a/sql/timeseries/reqJs.sql +++ b/sql/timeseries/reqJs.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqOther.sql b/sql/timeseries/reqOther.sql index fa10869..fcd6073 100644 --- a/sql/timeseries/reqOther.sql +++ b/sql/timeseries/reqOther.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqTotal.sql b/sql/timeseries/reqTotal.sql index f132ca0..372baaf 100644 --- a/sql/timeseries/reqTotal.sql +++ b/sql/timeseries/reqTotal.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10, diff --git a/sql/timeseries/reqVideo.sql b/sql/timeseries/reqVideo.sql index a1144e5..cefd474 100644 --- a/sql/timeseries/reqVideo.sql +++ b/sql/timeseries/reqVideo.sql @@ -1,6 +1,6 @@ #standardSQL SELECT - date, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10, From ac0e6d524e4b72ab420b36f527b70d0bc822f35c Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 Jan 2025 02:03:45 +0000 Subject: [PATCH 21/25] Fix date --- sql/timeseries/a11yScores.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/timeseries/a11yScores.sql b/sql/timeseries/a11yScores.sql index 3c98e92..4296313 100644 --- a/sql/timeseries/a11yScores.sql +++ b/sql/timeseries/a11yScores.sql @@ -19,7 +19,7 @@ SELECT 
ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - format_timestamp('%Y_%m_%d', date) AS date, + date, client, IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score FROM From f24d43e4d1448135a9bb8cbb56d43f0f5b33d344 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Tue, 14 Jan 2025 08:31:59 +0000 Subject: [PATCH 22/25] Update sql/lens/magento/crux_timeseries.sql Co-authored-by: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> --- sql/lens/magento/crux_timeseries.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/lens/magento/crux_timeseries.sql b/sql/lens/magento/crux_timeseries.sql index 3023d3f..2694cab 100644 --- a/sql/lens/magento/crux_timeseries.sql +++ b/sql/lens/magento/crux_timeseries.sql @@ -8,7 +8,7 @@ INNER JOIN ( WHERE date >= '2010-11-15' AND is_root_page AND - 'Megento' IN UNNEST(technologies.technology) + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, 2, From 1f03720f5d47c5579ee54f5403529709202fc3ab Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Wed, 15 Jan 2025 17:14:34 +0000 Subject: [PATCH 23/25] Remove FID --- sql/histograms/cruxFid.sql | 49 ---------------------------------- sql/timeseries/cruxFastFid.sql | 31 --------------------- sql/timeseries/cruxSlowFid.sql | 31 --------------------- 3 files changed, 111 deletions(-) delete mode 100644 sql/histograms/cruxFid.sql delete mode 100644 sql/timeseries/cruxFastFid.sql delete mode 100644 sql/timeseries/cruxSlowFid.sql diff --git a/sql/histograms/cruxFid.sql b/sql/histograms/cruxFid.sql deleted file mode 100644 index cd6386b..0000000 --- a/sql/histograms/cruxFid.sql +++ /dev/null @@ -1,49 +0,0 @@ -#standardSQL -CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) -RETURNS ARRAY> -LANGUAGE js AS """ - // Convert into 25ms bins and spread the density around. 
- const WIDTH = 25; - return (bins || []).reduce((bins, bin) => { - bin.start = +bin.start; - bin.end = Math.min(bin.end, bin.start + 5000); - const binWidth = bin.end - bin.start; - for (let start = bin.start; start < bin.end; start += WIDTH) { - bins.push({ - start, - density: bin.density / (binWidth / WIDTH) - }); - } - return bins; - }, []); -"""; - -SELECT - *, - SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf -FROM ( - SELECT - *, - volume / SUM(volume) OVER (PARTITION BY client) AS pdf - FROM ( - SELECT - IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, - bin.start AS bin, - SUM(bin.density) AS volume - FROM ( - SELECT - form_factor, - spreadBins(first_input.delay.histogram.bin) AS bins - FROM - `chrome-ux-report.all.${YYYYMM}` - ) - CROSS JOIN - UNNEST(bins) AS bin - GROUP BY - bin, - client - ) -) -ORDER BY - bin, - client diff --git a/sql/timeseries/cruxFastFid.sql b/sql/timeseries/cruxFastFid.sql deleted file mode 100644 index 0899ed7..0000000 --- a/sql/timeseries/cruxFastFid.sql +++ /dev/null @@ -1,31 +0,0 @@ -#standardSQL -# Fast FID by device - -CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good / (good + needs_improvement + poor) >= 0.75 -); - -CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good + needs_improvement + poor > 0 -); - -SELECT - REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(device = 'desktop', 'desktop', 'mobile') AS client, - SAFE_DIVIDE( - COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)), - COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL)) - ) * 100 AS percent -FROM - `chrome-ux-report.materialized.device_summary` -WHERE - device IN ('desktop', 'phone') AND - yyyymm >= 201806 -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client diff --git a/sql/timeseries/cruxSlowFid.sql b/sql/timeseries/cruxSlowFid.sql deleted file mode 100644 index 7c0c38f..0000000 --- a/sql/timeseries/cruxSlowFid.sql +++ /dev/null @@ -1,31 +0,0 @@ -#standardSQL -# Slow FID by device - -CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - poor / (good + needs_improvement + poor) >= 0.25 -); - -CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good + needs_improvement + poor > 0 -); - -SELECT - REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(device = 'desktop', 'desktop', 'mobile') AS client, - SAFE_DIVIDE( - COUNT(DISTINCT IF(IS_POOR(fast_fid, avg_fid, slow_fid), origin, NULL)), - COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL)) - ) * 100 AS percent -FROM - `chrome-ux-report.materialized.device_summary` -WHERE - device IN ('desktop', 'phone') AND - yyyymm >= 201806 -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client From 4564e73db70e9d05fc3b5d76cec012a83c785fc8 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Wed, 15 Jan 2025 17:16:30 +0000 Subject: [PATCH 24/25] Fix bootupJs --- sql/timeseries/bootupJs.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/sql/timeseries/bootupJs.sql b/sql/timeseries/bootupJs.sql index e02f2f1..d035216 100644 --- a/sql/timeseries/bootupJs.sql +++ b/sql/timeseries/bootupJs.sql @@ -10,8 +10,7 @@ SELECT ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, - UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + date, client, IFNULL( FLOAT64(lighthouse.audits['bootup-time'].numericValue), From 477d54f15aab9950e61518975717027db8ac38c1 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 16 Jan 2025 10:11:53 +0000 Subject: [PATCH 25/25] Remove unneeded data check --- sql/generate_reports.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index 2273261..6a2e48f 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -271,7 +271,7 @@ else if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" # Skip 2022_05_12 tables - date_join="${date_join} AND date != \"2022-05-12\"" + date_join="${date_join}" if [[ -n "$YYYY_MM_DD" ]]; then # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) date_join="${date_join} AND date <= \"$DATE\"" @@ -291,7 +291,7 @@ else # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables - date_join="${date_join} AND date != \"2022-05-12\"" + date_join="${date_join}" fi echo -e "Force Mode=${FORCE}. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}." @@ -301,7 +301,7 @@ else if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables - date_join="${date_join} AND date != \"2022-05-12\"" + date_join="${date_join}" fi echo -e "Timeseries does not exist. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}"
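
For reference, the rewritten Blink-feature timeseries queries in this series all share one shape: LEFT OUTER JOIN `httparchive.crawl.pages` against its own UNNEST(features) array, keyed on a feature id or name, and divide the matching pages by all root pages in the crawl. A minimal sketch of that pattern, using the NotificationShowTrigger id/name pair from notificationTriggers.sql above (any other pair from these diffs slots in the same way):

#standardSQL
-- Count pages using one Blink feature per crawl, as a share of all root pages.
SELECT
  FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date,
  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
  client,
  -- feat is NULL on pages where the LEFT JOIN found no matching feature row
  SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls,
  ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent
FROM
  `httparchive.crawl.pages`
LEFT OUTER JOIN
  UNNEST(features) AS feat
ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger')
WHERE
  date >= '2016-11-15' AND
  is_root_page
GROUP BY
  date,
  timestamp,
  client
ORDER BY
  date DESC,
  client

The LEFT OUTER JOIN is what makes COUNT(0) a valid denominator: an INNER JOIN would keep only the pages that use the feature and pin percent at 100.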
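
Similarly, the WHERE-clause splicing that generate_reports.sh applies to each metric query can be sketched in isolation. The temp file and the top1k predicate below are illustrative stand-ins, not the script's actual inputs:

#!/bin/bash
# Toy query with no WHERE clause, standing in for a sql/histograms/*.sql file.
cat > /tmp/metric.sql <<'EOF'
SELECT
  client,
  COUNT(0) AS pages
FROM
  `httparchive.crawl.pages`
GROUP BY
  client
EOF

lens_clause="rank <= 1000"       # e.g. the one-line sql/lens/top1k/histograms.sql
lens_clause_and="$lens_clause AND"

if [[ $(grep -i "WHERE" /tmp/metric.sql) ]]; then
  # A WHERE clause already exists: AND the lens predicate onto it
  sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" /tmp/metric.sql)
else
  # No WHERE clause yet: open one immediately before GROUP BY
  sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" /tmp/metric.sql)
fi
echo "$sql"
# The GROUP BY line now reads: WHERE rank <= 1000 GROUP BY

This is why the lens files after PATCH 17 hold a bare predicate rather than a full subquery: the script can drop the same fragment into either branch of the sed substitution.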