diff --git a/sql/.sqlfluff b/sql/.sqlfluff index 3fcc852..d06c400 100644 --- a/sql/.sqlfluff +++ b/sql/.sqlfluff @@ -11,7 +11,7 @@ templater = jinja ## Comma separated list of rules to check, or None for all rules = None ## Comma separated list of rules to exclude, or None -exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07 +exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07 # AL04 - Asks for unique table aliases meaning it complains if selecting from two 2021_07_01 tables as implicit alias is table name (not fully qualified) so same. # AL07 - Avoid aliases in from and join - why? # AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all. @@ -19,8 +19,10 @@ exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02 # CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case # CP03 - Function names will be mixed case so don't enforce case # CV02 - Use COALESCE instead of IFNULL or NVL. We think ISNULL is clearer. +# CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558 # LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future? # LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future? +# LT14 - Keywords on newline. We have some simple, single line joins # RF01 - BigQuery uses STRUCTS which can look like incorrect table references # RF02 - Asks for qualified columns for ambiguous ones, but we not qualify our columns, and they are not really ambiguous (or BigQuery would complain) # RF03 - Insists on references in column names even if not ambiguous. Bit OTT. 
diff --git a/sql/.sqlfluffignore b/sql/.sqlfluffignore index 7ef1f06..c7c6d67 100644 --- a/sql/.sqlfluffignore +++ b/sql/.sqlfluffignore @@ -1 +1,4 @@ /lens/*/crux_histograms.sql +/lens/*/crux_timeseries.sql +/lens/*/histograms.sql +/lens/*/timeseries.sql diff --git a/sql/generate_reports.sh b/sql/generate_reports.sh index 6737b61..6a2e48f 100755 --- a/sql/generate_reports.sh +++ b/sql/generate_reports.sh @@ -37,6 +37,7 @@ while getopts ":ftvh:l:r:" opt; do YYYY_MM_DD=${OPTARG} dateParts=(`echo ${OPTARG} | tr "_" "\\n"`) YYYYMM=${dateParts[0]}${dateParts[1]} + DATE=${dateParts[0]}-${dateParts[1]}-${dateParts[2]} ;; t) GENERATE_TIMESERIES=1 @@ -56,7 +57,7 @@ while getopts ":ftvh:l:r:" opt; do esac done -# Exit early if there's nothing to do. +# Exit early if there is nothing to do. if [ $GENERATE_HISTOGRAM -eq 0 -a $GENERATE_TIMESERIES -eq 0 ]; then echo -e "You must provide one or both -t or -h flags." >&2 echo -e "For example: sql/generateReports.sh -t -h 2017_08_01" >&2 @@ -65,26 +66,42 @@ fi # Check if all tables for the given date are available in BigQuery. # Tables representing desktop/mobile and HAR/CSV data sources must exist. -(bq show "httparchive:pages.${YYYY_MM_DD}_desktop" && \ - bq show "httparchive:pages.${YYYY_MM_DD}_mobile" && \ - bq show "httparchive:summary_pages.${YYYY_MM_DD}_desktop" && \ - bq show "httparchive:summary_pages.${YYYY_MM_DD}_mobile") &> /dev/null -if [ $GENERATE_HISTOGRAM -ne 0 -a $? -ne 0 ]; then - echo -e "The BigQuery tables for $YYYY_MM_DD are not available." >&2 +DATED_TABLES_READY=0 +if [ -n "$YYYY_MM_DD" ]; then + echo "Checking if tables are ready for ${DATE}..." 
+ DESKTOP_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1) + DESKTOP_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) + MOBILE_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) + MOBILE_NON_ROOT_PAGES_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.pages WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1) + DESKTOP_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND is_root_page LIMIT 1" | tail -1) + DESKTOP_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'desktop' AND NOT is_root_page LIMIT 1" | tail -1) + MOBILE_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND is_root_page LIMIT 1" | tail -1) + MOBILE_NON_ROOT_REQUESTS_EXIST=$(bq query --nouse_legacy_sql --format csv --headless -q "SELECT true FROM httparchive.crawl.requests WHERE date = '${DATE}' AND client = 'mobile' AND NOT is_root_page LIMIT 1" | tail -1) + echo "Finished checking if dates are ready" + if [[ "$DESKTOP_ROOT_PAGES_EXIST" == true && "$DESKTOP_NON_ROOT_PAGES_EXIST" == true && "$MOBILE_ROOT_PAGES_EXIST" == true && "$MOBILE_NON_ROOT_PAGES_EXIST" == true && "$DESKTOP_ROOT_REQUESTS_EXIST" == true && 
"$DESKTOP_NON_ROOT_REQUESTS_EXIST" == true && "$MOBILE_ROOT_REQUESTS_EXIST" == true && "$MOBILE_NON_ROOT_REQUESTS_EXIST" == true ]]; then + DATED_TABLES_READY=1 + fi +fi +if [ $GENERATE_HISTOGRAM -ne 0 -a $DATED_TABLES_READY -ne 1 ]; then + echo -e "The BigQuery tables for $DATE are not available." >&2 # List table data for debugging echo $(date) - bq show "httparchive:pages.${YYYY_MM_DD}_desktop" | head -5 - bq show "httparchive:pages.${YYYY_MM_DD}_mobile" | head -5 - bq show "httparchive:summary_pages.${YYYY_MM_DD}_desktop" | head -5 - bq show "httparchive:summary_pages.${YYYY_MM_DD}_mobile" | head -5 + echo "Desktop root pages ready: ${DESKTOP_ROOT_PAGES_EXIST}" + echo "Desktop non-root pages ready: ${DESKTOP_NON_ROOT_PAGES_EXIST}" + echo "Mobile root pages ready: ${MOBILE_ROOT_PAGES_EXIST}" + echo "Mobile non-root pages ready: ${MOBILE_NON_ROOT_PAGES_EXIST}" + echo "Desktop root requests ready: ${DESKTOP_ROOT_REQUESTS_EXIST}" + echo "Desktop non-root requests ready: ${DESKTOP_NON_ROOT_REQUESTS_EXIST}" + echo "Mobile root requests ready: ${MOBILE_ROOT_REQUESTS_EXIST}" + echo "Mobile non-root requests ready: ${MOBILE_NON_ROOT_REQUESTS_EXIST}" exit 1 fi if [ $GENERATE_HISTOGRAM -eq 0 ]; then echo -e "Skipping histograms" else - echo -e "Generating histograms for date $YYYY_MM_DD" + echo -e "Generating histograms for date $DATE" # Run all histogram queries. for query in sql/histograms/$REPORTS.sql; do @@ -134,28 +151,42 @@ else # Replace the date template in the query. 
if [[ $LENS != "" ]]; then echo -e "Generating ${metric} report for $LENS" - lens_join="JOIN ($(cat sql/lens/$LENS/histograms.sql | tr '\n' ' ')) USING (url, _TABLE_SUFFIX)" + lens_clause="$(cat sql/lens/$LENS/histograms.sql)" + lens_clause_and="$(cat sql/lens/$LENS/histograms.sql) AND" + lens_join="" + if [[ $metric == crux* ]]; then + lens_clause="" + lens_clause_and="" if [[ -f sql/lens/$LENS/crux_histograms.sql ]]; then echo "Using alternative crux lens join" - lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | sed -e "s/--noqa: disable=PRS//g" | tr '\n' ' ')" + lens_join="$(cat sql/lens/$LENS/crux_histograms.sql | tr '\n' ' ')" else echo "CrUX queries do not support histograms for this lens so skipping" continue fi sql=$(sed -e "s/\(\`chrome-ux-report[^\`]*\`\)/\1 $lens_join/" $query \ - | sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/g" \ + | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ | sed -e "s/\${YYYYMM}/$YYYYMM/g") else - sql=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query \ - | sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/g" \ - | sed -e "s/\${YYYYMM}/$YYYYMM/g") + + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ + | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ + | sed -e "s/\${YYYYMM}/$YYYYMM/g") + else + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ + | sed -e "s/\${YYYY-MM-DD}/$DATE/g" \ + | sed -e "s/\${YYYYMM}/$YYYYMM/g") + fi fi else echo -e "Generating ${metric} report for base (no lens)" - sql=$(sed -e "s/\${YYYY_MM_DD}/$YYYY_MM_DD/" $query \ - | sed -e "s/\${YYYYMM}/$YYYYMM/") + sql=$(sed -e "s/\${YYYY-MM-DD}/$DATE/g" $query \ + | sed -e "s/\${YYYYMM}/$YYYYMM/g") fi if [ ${VERBOSE} -eq 1 ]; then @@ -237,27 +268,13 @@ else # Only run if new dates if [[ -z "${YYYY_MM_DD}" || "${max_date}" < "${YYYY_MM_DD}" ]]; then - if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; 
then # blink needs a special join, different for lenses - date_join="yyyymmdd > REPLACE(\"$max_date\",\"_\",\"\")" - if [[ -n "$YYYY_MM_DD" ]]; then - # If a date is given, then only run up until then (in case next month is mid-run as don't wanna get just desktop data) - date_join="${date_join} AND yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")" - fi - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - if [[ -n "$YYYY_MM_DD" ]]; then - # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data) - date_join="${date_join} AND yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)" - fi - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that - date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) > \"$max_date\"" + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + date_join="date > CAST(REPLACE(\"$max_date\",\"_\",\"-\") AS DATE)" # Skip 2022_05_12 tables - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\"" + date_join="${date_join}" if [[ -n "$YYYY_MM_DD" ]]; then - # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data) - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\"" + # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) + date_join="${date_join} AND date <= \"$DATE\"" fi fi @@ -269,34 +286,22 @@ else fi elif [[ -n "$YYYY_MM_DD" ]]; then - # Even if doing a force run we only wanna run up until date given in case next month is mid-run as don't wanna get just desktop data - if [[ 
$(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")" - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that - # If a date is given, then only run up until then (in case next month is mid run as don't wanna get just desktop data) - date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\"" + # Even if doing a force run we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + # If a date is given, then only run up until then (in case next month is mid run as do not wanna get just desktop data) + date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\"" + date_join="${date_join}" fi echo -e "Force Mode=${FORCE}. Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}." 
fi elif [[ -n "$YYYY_MM_DD" ]]; then - # Even if the file doesn't exist we only wanna run up until date given in case next month is mid-run as don't wanna get just desktop data - if [[ $(grep "httparchive.blink_features.usage" $query) && $LENS == "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= REPLACE(\"$YYYY_MM_DD\",\"_\",\"\")" - elif [[ $(grep "httparchive.blink_features.usage" $query) && $LENS != "" ]]; then # blink needs a special join, different for lenses - date_join="yyyymmdd <= CAST(REPLACE(\"$YYYY_MM_DD\",\"_\",\"-\") AS DATE)" - # Skip 2022_05_12 tables - date_join="${date_join} AND yyyymmdd != \"2022-05-12\"" - elif [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that - date_join="SUBSTR(_TABLE_SUFFIX, 0, 10) <= \"$YYYY_MM_DD\"" + # Even if the file does not exist we only wanna run up until date given in case next month is mid-run as do not wanna get just desktop data + if [[ $metric != crux* ]]; then # CrUX is quick and join is more compilicated so just do a full run of that + date_join="date <= \"$DATE\"" # Skip 2022_05_12 tables - date_join="${date_join} AND SUBSTR(_TABLE_SUFFIX, 0, 10) != \"2022_05_12\"" + date_join="${date_join}" fi echo -e "Timeseries does not exist. 
Generating $gs_lens_dir$metric timeseries from start until ${YYYY_MM_DD}" @@ -307,54 +312,46 @@ else if [[ $LENS != "" ]]; then - if [[ $(grep "httparchive.blink_features.usage" $query) ]]; then - # blink_features.usage need to be replace by blink_features.features for lenses - if [[ -f sql/lens/$LENS/blink_timeseries.sql ]]; then - echo "Using alternative blink_timeseries lens join" - lens_join="$(cat sql/lens/$LENS/blink_timeseries.sql | tr '\n' ' ')" - - # For blink features for lenses we have a BLINK_DATE_JOIN variable to replace - if [[ -z "${date_join}" ]]; then - sql=$(sed -e "s/\`httparchive.blink_features.usage\`/($lens_join)/" $query \ - | sed -e "s/ {{ BLINK_DATE_JOIN }}//g") - else - sql=$( sed -e "s/\`httparchive.blink_features.usage\`/($lens_join)/" $query \ - | sed -e "s/{{ BLINK_DATE_JOIN }}/AND $date_join/g") - fi - else - echo "blink_features.usage queries not supported for this lens so skipping lens" - continue - fi + if [[ $metric != crux* ]]; then + lens_clause="$(cat sql/lens/$LENS/timeseries.sql)" + lens_clause_and="$(cat sql/lens/$LENS/timeseries.sql) AND" + lens_join="" else + echo "CrUX query so using alternative lens join" + lens_clause="" + lens_clause_and="" + lens_join="$(cat sql/lens/$LENS/crux_timeseries.sql | tr '\n' ' ')" + fi - lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) USING (url, _TABLE_SUFFIX)" - if [[ $metric == crux* ]]; then - echo "CrUX query so using alternative lens join" - lens_join="JOIN ($(cat sql/lens/$LENS/timeseries.sql | tr '\n' ' ')) ON (origin || '\/' = url AND REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\\\\\\\d{4})(\\\\\\\\d{2})', '\\\\\\\\1_\\\\\\\\2_01') || '_' || IF(device = 'phone', 'mobile', device) = _TABLE_SUFFIX)" + if [[ -n "${date_join}" ]]; then + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and $date_join AND/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") + else + # If WHERE 
clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause_and $date_join \1/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") fi - - if [[ -n "${date_join}" ]]; then - if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY - sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND/" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - else - # If WHERE clause doesn't exists then add it, before GROUP BY - sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query \ - | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") - fi + else + if [[ $(grep -i "WHERE" $query) ]]; then + # If WHERE clause already exists then add to it + sql=$(sed -e "s/\(WHERE\)/\1 $lens_clause_and /" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") else - sql=$(sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/" $query) + # If WHERE clause does not exists then add it, before GROUP BY + sql=$(sed -e "s/\(GROUP BY\)/WHERE $lens_clause \1/" $query \ + | sed -e "s/\(\`[^\`]*\`)*\)/\1 $lens_join/") fi fi else if [[ -n "${date_join}" ]]; then if [[ $(grep -i "WHERE" $query) ]]; then - # If WHERE clause already exists then add to it, before GROUP BY + # If WHERE clause already exists then add to it sql=$(sed -e "s/\(WHERE\)/\1 $date_join AND /" $query) else - # If WHERE clause doesn't exists then add it, before GROUP BY + # If WHERE clause does not exists then add it, before GROUP BY sql=$(sed -e "s/\(GROUP BY\)/WHERE $date_join \1/" $query) fi else @@ -380,8 +377,8 @@ else echo "$metric took $ELAPSED_TIME seconds" fi - # If it's a partial run, then combine with the current results. - if [[ $FORCE -eq 0 && -n "${current_contents}" ]]; then + # If it is a partial run, then combine with the current results. 
+ if [[ $FORCE -eq 0 && -n "${current_contents}" && $metric != crux* ]]; then result=$(echo ${result} ${current_contents} | jq '.+= input') fi diff --git a/sql/histograms/bootupJs.sql b/sql/histograms/bootupJs.sql index c289489..e44601b 100644 --- a/sql/histograms/bootupJs.sql +++ b/sql/histograms/bootupJs.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')) AS FLOAT64) / 100) / 10 AS bin + FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin FROM - `httparchive.lighthouse.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesCss.sql b/sql/histograms/bytesCss.sql index c2a3bbb..97c16bc 100644 --- a/sql/histograms/bytesCss.sql +++ b/sql/histograms/bytesCss.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesFont.sql b/sql/histograms/bytesFont.sql index 495b687..2b7548e 100644 --- a/sql/histograms/bytesFont.sql +++ b/sql/histograms/bytesFont.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesFont / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin FROM - 
`httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesHtml.sql b/sql/histograms/bytesHtml.sql index 430d80a..0be97a9 100644 --- a/sql/histograms/bytesHtml.sql +++ b/sql/histograms/bytesHtml.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesHtml / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesImg.sql b/sql/histograms/bytesImg.sql index b232d54..7aa5b7d 100644 --- a/sql/histograms/bytesImg.sql +++ b/sql/histograms/bytesImg.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesImg / 102400) * 100 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesJs.sql b/sql/histograms/bytesJs.sql index 911756b..6d2662f 100644 --- a/sql/histograms/bytesJs.sql +++ b/sql/histograms/bytesJs.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesJS / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesOther.sql 
b/sql/histograms/bytesOther.sql index 9ddfca0..8ceff61 100644 --- a/sql/histograms/bytesOther.sql +++ b/sql/histograms/bytesOther.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesOther / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesTotal.sql b/sql/histograms/bytesTotal.sql index 8d5407d..86ebb02 100644 --- a/sql/histograms/bytesTotal.sql +++ b/sql/histograms/bytesTotal.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesTotal / 102400) * 100 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/bytesVideo.sql b/sql/histograms/bytesVideo.sql index 39b8958..af3b9d3 100644 --- a/sql/histograms/bytesVideo.sql +++ b/sql/histograms/bytesVideo.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(bytesVideo / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/compileJs.sql b/sql/histograms/compileJs.sql index 6aeb46c..eddc382 100644 --- a/sql/histograms/compileJs.sql +++ b/sql/histograms/compileJs.sql @@ -8,11 +8,14 @@ FROM ( volume / 
SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(JSON_EXTRACT(payload, "$['_cpu.v8.compile']") AS INT64) AS bin + INT64(payload['_cpu.v8.compile']) AS bin FROM - `httparchive.pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/cruxFid.sql b/sql/histograms/cruxFid.sql deleted file mode 100644 index cd6386b..0000000 --- a/sql/histograms/cruxFid.sql +++ /dev/null @@ -1,49 +0,0 @@ -#standardSQL -CREATE TEMPORARY FUNCTION spreadBins(bins ARRAY>) -RETURNS ARRAY> -LANGUAGE js AS """ - // Convert into 25ms bins and spread the density around. - const WIDTH = 25; - return (bins || []).reduce((bins, bin) => { - bin.start = +bin.start; - bin.end = Math.min(bin.end, bin.start + 5000); - const binWidth = bin.end - bin.start; - for (let start = bin.start; start < bin.end; start += WIDTH) { - bins.push({ - start, - density: bin.density / (binWidth / WIDTH) - }); - } - return bins; - }, []); -"""; - -SELECT - *, - SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf -FROM ( - SELECT - *, - volume / SUM(volume) OVER (PARTITION BY client) AS pdf - FROM ( - SELECT - IF(form_factor.name = 'desktop', 'desktop', 'mobile') AS client, - bin.start AS bin, - SUM(bin.density) AS volume - FROM ( - SELECT - form_factor, - spreadBins(first_input.delay.histogram.bin) AS bins - FROM - `chrome-ux-report.all.${YYYYMM}` - ) - CROSS JOIN - UNNEST(bins) AS bin - GROUP BY - bin, - client - ) -) -ORDER BY - bin, - client diff --git a/sql/histograms/cruxShopifyThemes.sql b/sql/histograms/cruxShopifyThemes.sql index 850a05b..7b71403 100644 --- a/sql/histograms/cruxShopifyThemes.sql +++ b/sql/histograms/cruxShopifyThemes.sql @@ -12,19 +12,27 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F good + needs_improvement + poor > 0 ); +-- Test CrUX data exists +WITH crux_test AS ( -- noqa: ST03 + SELECT + 
1 + FROM + `chrome-ux-report.all.${YYYYMM}` +), + -- All Shopify shops in HTTPArchive -WITH archive_pages AS ( +archive_pages AS ( SELECT client, page AS url, - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') AS theme_name, - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.theme_store_id') AS theme_store_id + JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name, + JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id FROM - `httparchive.all.pages` + `httparchive.crawl.pages` WHERE - date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND + date = '${YYYY-MM-DD}' AND is_root_page AND - JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') IS NOT NULL --first grab all shops for market share + JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share ) SELECT @@ -176,7 +184,7 @@ JOIN ( -- Include null theme store ids so that we can get full market share within CrUX ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A') WHERE - date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND + date = '${YYYY-MM-DD}' AND theme_names.rank = 1 GROUP BY client, diff --git a/sql/histograms/dcl.sql b/sql/histograms/dcl.sql index 120ccff..4e587ee 100644 --- a/sql/histograms/dcl.sql +++ b/sql/histograms/dcl.sql @@ -8,13 +8,15 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - FLOOR(onContentLoaded / 1000) AS bin + FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - onContentLoaded > 0 + date = '${YYYY-MM-DD}' AND + is_root_page AND + FLOAT64(summary.onContentLoaded) > 0 GROUP BY bin, client diff --git a/sql/histograms/evalJs.sql b/sql/histograms/evalJs.sql index 4d4e0eb..356905a 100644 --- a/sql/histograms/evalJs.sql +++ b/sql/histograms/evalJs.sql @@ -8,11 +8,17 
@@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(CAST(JSON_EXTRACT(payload, "$['_cpu.EvaluateScript']") AS FLOAT64) / 20 AS INT64) * 20 AS bin + CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin FROM - `httparchive.requests.${YYYY_MM_DD}_*` + `httparchive.crawl.requests` r + INNER JOIN + `httparchive.crawl.pages` + USING (date, client, is_root_page, rank, page) + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/fcp.sql b/sql/histograms/fcp.sql index 0c3b380..015426d 100644 --- a/sql/histograms/fcp.sql +++ b/sql/histograms/fcp.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64) / 1000) AS INT64) AS bin + CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin FROM - `httparchive.pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/gzipSavings.sql b/sql/histograms/gzipSavings.sql index c239b10..81b173e 100644 --- a/sql/histograms/gzipSavings.sql +++ b/sql/histograms/gzipSavings.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64) / (1024 * 2)) * 2 AS INT64) AS bin + CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin FROM - `httparchive.pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/htmlElementPopularity.sql b/sql/histograms/htmlElementPopularity.sql index 
773dc40..4e6618c 100644 --- a/sql/histograms/htmlElementPopularity.sql +++ b/sql/histograms/htmlElementPopularity.sql @@ -18,7 +18,7 @@ SELECT COUNT(DISTINCT root_page) / total AS pct, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT page LIMIT 5), ' ') AS sample_urls FROM - `httparchive.all.pages` + `httparchive.crawl.pages` JOIN ( SELECT @@ -26,18 +26,17 @@ JOIN client, COUNT(DISTINCT root_page) AS total FROM - `httparchive.all.pages` + `httparchive.crawl.pages` WHERE - date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') AND - rank = 1000 + date = '${YYYY-MM-DD}' GROUP BY date, client ) USING (date, client), - UNNEST(getElements(JSON_EXTRACT(custom_metrics, '$.element_count'))) AS element + UNNEST(getElements(TO_JSON_STRING(custom_metrics.element_count))) AS element WHERE - date = PARSE_DATE('%Y_%m_%d', '${YYYY_MM_DD}') + date = '${YYYY-MM-DD}' GROUP BY client, total, diff --git a/sql/histograms/imgSavings.sql b/sql/histograms/imgSavings.sql index 70023c3..df93fe4 100644 --- a/sql/histograms/imgSavings.sql +++ b/sql/histograms/imgSavings.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64) / (1024 * 10)) * 10 AS INT64) AS bin + CAST(FLOOR(FLOAT64(payload._image_savings) / (1024 * 10)) * 10 AS INT64) AS bin FROM - `httparchive.pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/offscreenImages.sql b/sql/histograms/offscreenImages.sql index 8f571a6..18dda95 100644 --- a/sql/histograms/offscreenImages.sql +++ b/sql/histograms/offscreenImages.sql @@ -8,11 +8,18 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), 
CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.lighthouse.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date >= '2022-03-01' AND + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/ol.sql b/sql/histograms/ol.sql index 4aa6390..825943a 100644 --- a/sql/histograms/ol.sql +++ b/sql/histograms/ol.sql @@ -8,13 +8,15 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - FLOOR(onLoad / 1000) AS bin + FLOOR(FLOAT64(summary.onLoad) / 1000) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - onLoad > 0 + date = '${YYYY-MM-DD}' AND + is_root_page AND + FLOAT64(summary.onLoad) > 0 GROUP BY bin, client diff --git a/sql/histograms/optimizedImages.sql b/sql/histograms/optimizedImages.sql index c9b5f9b..bb5ffca 100644 --- a/sql/histograms/optimizedImages.sql +++ b/sql/histograms/optimizedImages.sql @@ -8,11 +8,18 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024) / 10240) * 10 AS INT64) AS bin + CAST(FLOOR(IFNULL( + INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), + INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024 + ) / 10240) * 10 AS INT64) AS bin FROM - `httparchive.lighthouse.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + 
WHERE + date >= '2022-03-01' AND + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqCss.sql b/sql/histograms/reqCss.sql index 8bc6cdc..861fe81 100644 --- a/sql/histograms/reqCss.sql +++ b/sql/histograms/reqCss.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqCSS AS bin + FLOAT64(summary.reqCss) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqFont.sql b/sql/histograms/reqFont.sql index d414b22..145e2b6 100644 --- a/sql/histograms/reqFont.sql +++ b/sql/histograms/reqFont.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqFont AS bin + FLOAT64(summary.reqFont) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqHtml.sql b/sql/histograms/reqHtml.sql index d6631e3..82dd5a2 100644 --- a/sql/histograms/reqHtml.sql +++ b/sql/histograms/reqHtml.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqHtml AS bin + FLOAT64(summary.reqHtml) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqImg.sql b/sql/histograms/reqImg.sql index 9d075bc..5409c83 100644 --- a/sql/histograms/reqImg.sql +++ b/sql/histograms/reqImg.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqImg AS bin + 
FLOAT64(summary.reqImg) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqJs.sql b/sql/histograms/reqJs.sql index 503dbaf..8dfc12e 100644 --- a/sql/histograms/reqJs.sql +++ b/sql/histograms/reqJs.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqJS AS bin + FLOAT64(summary.reqJS) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqOther.sql b/sql/histograms/reqOther.sql index ee54ef9..caa9944 100644 --- a/sql/histograms/reqOther.sql +++ b/sql/histograms/reqOther.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqOther AS bin + FLOAT64(summary.reqOther) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqTotal.sql b/sql/histograms/reqTotal.sql index 5c712a1..df0987f 100644 --- a/sql/histograms/reqTotal.sql +++ b/sql/histograms/reqTotal.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - FLOOR(reqTotal / 10) * 10 AS bin + FLOOR(FLOAT64(summary.reqTotal) / 10) * 10 AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/reqVideo.sql b/sql/histograms/reqVideo.sql index 74bf7b6..3f83c74 100644 --- a/sql/histograms/reqVideo.sql +++ b/sql/histograms/reqVideo.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER 
(PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - reqVideo AS bin + FLOAT64(summary.reqVideo) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/speedIndex.sql b/sql/histograms/speedIndex.sql index 8bb993a..6740961 100644 --- a/sql/histograms/speedIndex.sql +++ b/sql/histograms/speedIndex.sql @@ -8,11 +8,14 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64) / (1000)) * 1000 AS INT64) AS bin + CAST(FLOOR(FLOAT64(payload._SpeedIndex) / (1000)) * 1000 AS INT64) AS bin FROM - `httparchive.pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/tcp.sql b/sql/histograms/tcp.sql index aeb80bf..ea80501 100644 --- a/sql/histograms/tcp.sql +++ b/sql/histograms/tcp.sql @@ -8,13 +8,15 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - _connections AS bin + INT64(summary._connections) AS bin FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - _connections > 0 + date = '${YYYY-MM-DD}' AND + is_root_page AND + INT64(summary._connections) > 0 GROUP BY bin, client diff --git a/sql/histograms/ttci.sql b/sql/histograms/ttci.sql index db62d3a..7ba6de1 100644 --- a/sql/histograms/ttci.sql +++ b/sql/histograms/ttci.sql @@ -8,11 +8,21 @@ FROM ( volume / SUM(volume) OVER (PARTITION BY client) AS pdf FROM ( SELECT - _TABLE_SUFFIX AS client, + client, COUNT(0) AS volume, - CAST(FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.interactive.numericValue'), IFNULL(JSON_EXTRACT(report, '$.audits.consistently-interactive.rawValue'), 
JSON_EXTRACT(report, '$.audits.interactive.rawValue'))) AS FLOAT64) / 1000) AS INT64) AS bin + CAST(FLOOR(CAST(IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), + IFNULL( + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue), + FLOAT64(lighthouse.audits.interactive.rawValue) + ) + ) AS FLOAT64) / 1000) AS INT64) AS bin FROM - `httparchive.lighthouse.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` + WHERE + date >= '2022-03-01' AND + date = '${YYYY-MM-DD}' AND + is_root_page GROUP BY bin, client diff --git a/sql/histograms/vulnJs.sql b/sql/histograms/vulnJs.sql deleted file mode 100644 index 87f3f93..0000000 --- a/sql/histograms/vulnJs.sql +++ /dev/null @@ -1,39 +0,0 @@ -#standardSQL -CREATE TEMPORARY FUNCTION countVulnerabilities(report STRING) -RETURNS INT64 LANGUAGE js AS """ - try { - const $ = JSON.parse(report); - const audit = $.audits['no-vulnerable-libraries']; - if (audit.extendedInfo && audit.extendedInfo.vulnerabilities) { - return audit.extendedInfo.vulnerabilities.length; - } - return +audit.displayValue.match(/\\d+/)[0]; - } catch (e) { - return 0; - } -"""; - -SELECT - *, - SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf -FROM ( - SELECT - *, - volume / SUM(volume) OVER (PARTITION BY client) AS pdf - FROM ( - SELECT - _TABLE_SUFFIX AS client, - COUNT(0) AS volume, - countVulnerabilities(report) AS bin - FROM - `httparchive.lighthouse.${YYYY_MM_DD}_*` - WHERE - report IS NOT NULL - GROUP BY - bin, - client - ) -) -ORDER BY - bin, - client diff --git a/sql/lens/drupal/blink_timeseries.sql b/sql/lens/drupal/blink_timeseries.sql deleted file mode 100644 index 03b5d5d..0000000 --- a/sql/lens/drupal/blink_timeseries.sql +++ /dev/null @@ -1,68 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - 
_TABLE_SUFFIX AS _TABLE_SUFFIX, - url AS tech_url - FROM - `httparchive.technologies.*` - WHERE - app = 'Drupal' - GROUP BY - _TABLE_SUFFIX, - tech_url - ) -ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - JOIN - ( - SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - url AS tech_url - FROM - `httparchive.technologies.*` - WHERE - app = 'Drupal' - GROUP BY - _TABLE_SUFFIX, - tech_url - ) - ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/drupal/crux_histograms.sql b/sql/lens/drupal/crux_histograms.sql index 03c404e..58edc7e 100644 --- a/sql/lens/drupal/crux_histograms.sql +++ b/sql/lens/drupal/crux_histograms.sql @@ -1,15 +1,14 @@ -JOIN - ( +INNER JOIN ( SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client FROM - `httparchive.technologies.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - app = 'Drupal' and - LENGTH(url) > 0 + date = '${YYYY-MM-DD}' AND + 'Drupal' IN UNNEST(technologies.technology) GROUP BY 1, 2 - ) -ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone')) +) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/drupal/crux_timeseries.sql b/sql/lens/drupal/crux_timeseries.sql new file mode 100644 index 0000000..3ebac3e --- /dev/null +++ b/sql/lens/drupal/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + 
`httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Drupal' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/drupal/histograms.sql b/sql/lens/drupal/histograms.sql index f113339..86b5d99 100644 --- a/sql/lens/drupal/histograms.sql +++ b/sql/lens/drupal/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.technologies.${YYYY_MM_DD}_*` -WHERE - app = 'Drupal' -GROUP BY - 1, - 2 +'Drupal' IN UNNEST(technologies.technology) diff --git a/sql/lens/drupal/timeseries.sql b/sql/lens/drupal/timeseries.sql index eb2b2df..86b5d99 100644 --- a/sql/lens/drupal/timeseries.sql +++ b/sql/lens/drupal/timeseries.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.technologies.*` -WHERE - app = 'Drupal' -GROUP BY - 1, - 2 +'Drupal' IN UNNEST(technologies.technology) diff --git a/sql/lens/magento/blink_timeseries.sql b/sql/lens/magento/blink_timeseries.sql deleted file mode 100644 index aeb8742..0000000 --- a/sql/lens/magento/blink_timeseries.sql +++ /dev/null @@ -1,68 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - url AS tech_url - FROM - `httparchive.technologies.*` - WHERE - app = 'Magento' - GROUP BY - _TABLE_SUFFIX, - tech_url - ) -ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - JOIN - ( - SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - url AS tech_url - FROM - `httparchive.technologies.*` - WHERE - app = 'Magento' - GROUP BY - _TABLE_SUFFIX, - tech_url - ) - ON (url = tech_url AND 
_TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/magento/crux_histograms.sql b/sql/lens/magento/crux_histograms.sql index b8197cd..f4fb070 100644 --- a/sql/lens/magento/crux_histograms.sql +++ b/sql/lens/magento/crux_histograms.sql @@ -1,15 +1,14 @@ -INNER JOIN - ( +INNER JOIN ( SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client FROM - `httparchive.technologies.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - app = 'Magento' and - LENGTH(url) > 0 + date = '${YYYY-MM-DD}' AND + 'Magento' IN UNNEST(technologies.technology) GROUP BY 1, 2 - ) -ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone')) +) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/magento/crux_timeseries.sql b/sql/lens/magento/crux_timeseries.sql new file mode 100644 index 0000000..2694cab --- /dev/null +++ b/sql/lens/magento/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'Magento' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/magento/histograms.sql b/sql/lens/magento/histograms.sql index 3035144..55746e8 100644 --- a/sql/lens/magento/histograms.sql +++ b/sql/lens/magento/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.technologies.${YYYY_MM_DD}_*` -WHERE - app = 'Magento' -GROUP BY - 1, - 2 +'Magento' IN 
UNNEST(technologies.technology) diff --git a/sql/lens/magento/timeseries.sql b/sql/lens/magento/timeseries.sql index 43c135a..55746e8 100644 --- a/sql/lens/magento/timeseries.sql +++ b/sql/lens/magento/timeseries.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.technologies.*` -WHERE - app = 'Magento' -GROUP BY - 1, - 2 +'Magento' IN UNNEST(technologies.technology) diff --git a/sql/lens/top100k/blink_timeseries.sql b/sql/lens/top100k/blink_timeseries.sql deleted file mode 100644 index e365003..0000000 --- a/sql/lens/top100k/blink_timeseries.sql +++ /dev/null @@ -1,43 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 100000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client - ) -USING (yyyymmdd, client) -WHERE - rank <= 100000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top100k/crux_timeseries.sql b/sql/lens/top100k/crux_timeseries.sql new file mode 100644 index 0000000..09c91df --- /dev/null +++ b/sql/lens/top100k/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 100000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top100k/histograms.sql b/sql/lens/top100k/histograms.sql index 80e62de..da9eeaa 100644 --- a/sql/lens/top100k/histograms.sql +++ 
b/sql/lens/top100k/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` -WHERE - rank <= 100000 -GROUP BY - 1, - 2 +rank <= 100000 diff --git a/sql/lens/top100k/timeseries.sql b/sql/lens/top100k/timeseries.sql index d7db042..da9eeaa 100644 --- a/sql/lens/top100k/timeseries.sql +++ b/sql/lens/top100k/timeseries.sql @@ -1,11 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.*` -WHERE - rank <= 100000 AND - _TABLE_SUFFIX >= '2021_05_01' -GROUP BY - 1, - 2 +rank <= 100000 diff --git a/sql/lens/top10k/blink_timeseries.sql b/sql/lens/top10k/blink_timeseries.sql deleted file mode 100644 index 3c85d9d..0000000 --- a/sql/lens/top10k/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 10000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 10000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top10k/crux_timeseries.sql b/sql/lens/top10k/crux_timeseries.sql new file mode 100644 index 0000000..d4749d5 --- /dev/null +++ b/sql/lens/top10k/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 10000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) 
diff --git a/sql/lens/top10k/histograms.sql b/sql/lens/top10k/histograms.sql index 2157a3f..57dbc02 100644 --- a/sql/lens/top10k/histograms.sql +++ b/sql/lens/top10k/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` -WHERE - rank <= 10000 -GROUP BY - 1, - 2 +rank <= 10000 diff --git a/sql/lens/top10k/timeseries.sql b/sql/lens/top10k/timeseries.sql index 6ddf43a..57dbc02 100644 --- a/sql/lens/top10k/timeseries.sql +++ b/sql/lens/top10k/timeseries.sql @@ -1,11 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.*` -WHERE - rank <= 10000 AND - _TABLE_SUFFIX >= '2021_05_01' -GROUP BY - 1, - 2 +rank <= 10000 diff --git a/sql/lens/top1k/blink_timeseries.sql b/sql/lens/top1k/blink_timeseries.sql deleted file mode 100644 index 243acd8..0000000 --- a/sql/lens/top1k/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 1000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 1000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top1k/crux_timeseries.sql b/sql/lens/top1k/crux_timeseries.sql new file mode 100644 index 0000000..2bd0720 --- /dev/null +++ b/sql/lens/top1k/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + 
`httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 1000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top1k/histograms.sql b/sql/lens/top1k/histograms.sql index 6bb184f..75ca1c8 100644 --- a/sql/lens/top1k/histograms.sql +++ b/sql/lens/top1k/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` -WHERE - rank <= 1000 -GROUP BY - 1, - 2 +rank <= 1000 diff --git a/sql/lens/top1k/timeseries.sql b/sql/lens/top1k/timeseries.sql index 61c16c2..75ca1c8 100644 --- a/sql/lens/top1k/timeseries.sql +++ b/sql/lens/top1k/timeseries.sql @@ -1,11 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.*` -WHERE - rank <= 1000 AND - _TABLE_SUFFIX >= '2021_05_01' -GROUP BY - 1, - 2 +rank <= 1000 diff --git a/sql/lens/top1m/blink_timeseries.sql b/sql/lens/top1m/blink_timeseries.sql deleted file mode 100644 index 81a262a..0000000 --- a/sql/lens/top1m/blink_timeseries.sql +++ /dev/null @@ -1,42 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN ( - SELECT - yyyymmdd, - client, - COUNT(DISTINCT url) AS total - FROM `httparchive.blink_features.features` - WHERE - rank <= 1000000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - rank <= 1000000 AND - yyyymmdd >= '2021-05-01' - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/top1m/crux_timeseries.sql b/sql/lens/top1m/crux_timeseries.sql new file mode 100644 index 0000000..dbf0c64 --- /dev/null +++ b/sql/lens/top1m/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + 
SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + rank = 1000000 + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/top1m/histograms.sql b/sql/lens/top1m/histograms.sql index e36cd2a..57dbc02 100644 --- a/sql/lens/top1m/histograms.sql +++ b/sql/lens/top1m/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.${YYYY_MM_DD}_*` -WHERE - rank <= 1000000 -GROUP BY - 1, - 2 +rank <= 1000000 diff --git a/sql/lens/top1m/timeseries.sql b/sql/lens/top1m/timeseries.sql index bb59934..57dbc02 100644 --- a/sql/lens/top1m/timeseries.sql +++ b/sql/lens/top1m/timeseries.sql @@ -1,11 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.summary_pages.*` -WHERE - rank <= 1000000 AND - _TABLE_SUFFIX >= '2021_05_01' -GROUP BY - 1, - 2 +rank <= 1000000 diff --git a/sql/lens/wordpress/blink_timeseries.sql b/sql/lens/wordpress/blink_timeseries.sql deleted file mode 100644 index caf9e78..0000000 --- a/sql/lens/wordpress/blink_timeseries.sql +++ /dev/null @@ -1,68 +0,0 @@ -SELECT - REGEXP_REPLACE(CAST(yyyymmdd AS STRING), '-', '') AS yyyymmdd, - client, - id, - feature, - type, - COUNT(0) AS num_urls, - MAX(total) AS total_urls, - SAFE_DIVIDE(COUNT(0), max(total)) AS num_urls_pct -FROM - `httparchive.blink_features.features` -JOIN - ( - SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - url AS tech_url - FROM - `httparchive.technologies.*` - WHERE - app = 
'WordPress' - GROUP BY - _TABLE_SUFFIX, - tech_url - ) - ON (url = tech_url AND _TABLE_SUFFIX = FORMAT_DATE('%Y_%m_%d', yyyymmdd) || '_' || client) - WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} - GROUP BY - yyyymmdd, - client -) -USING (yyyymmdd, client) -WHERE - 1 = 1 - {{ BLINK_DATE_JOIN }} -GROUP BY - yyyymmdd, - client, - id, - feature, - type -ORDER BY - yyyymmdd, - client, - id, - feature, - type diff --git a/sql/lens/wordpress/crux_histograms.sql b/sql/lens/wordpress/crux_histograms.sql index 5784cde..ccb5f9a 100644 --- a/sql/lens/wordpress/crux_histograms.sql +++ b/sql/lens/wordpress/crux_histograms.sql @@ -1,15 +1,14 @@ -INNER JOIN - ( +INNER JOIN ( SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX + page, + client FROM - `httparchive.technologies.${YYYY_MM_DD}_*` + `httparchive.crawl.pages` WHERE - app = 'WordPress' and - LENGTH(url) > 0 + date = '${YYYY-MM-DD}' AND + 'WordPress' IN UNNEST(technologies.technology) GROUP BY 1, 2 - ) -ON (SUBSTR(url, 0, LENGTH(url) -1) = origin AND form_factor.name = IF(_TABLE_SUFFIX = 'desktop', 'desktop', 'phone')) +) +ON (SUBSTR(page, 0, LENGTH(page) - 1) = origin AND form_factor.name = IF(client = 'desktop', 'desktop', 'phone')) diff --git a/sql/lens/wordpress/crux_timeseries.sql b/sql/lens/wordpress/crux_timeseries.sql new file mode 100644 index 0000000..dd869d8 --- /dev/null +++ b/sql/lens/wordpress/crux_timeseries.sql @@ -0,0 +1,17 @@ +INNER JOIN ( + SELECT + SUBSTR(page, 0, LENGTH(page) - 1) AS origin, + IF(client = 'mobile', 'phone', client) AS device, + date + FROM + `httparchive.crawl.pages` + WHERE + date >= '2010-11-15' AND + is_root_page AND + 'WordPress' IN UNNEST(technologies.technology) + GROUP BY + 1, + 2, + 3 +) +USING (origin, device, date) diff --git a/sql/lens/wordpress/histograms.sql b/sql/lens/wordpress/histograms.sql index c225c98..6ac1aa9 100644 --- a/sql/lens/wordpress/histograms.sql +++ b/sql/lens/wordpress/histograms.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - 
`httparchive.technologies.${YYYY_MM_DD}_*` -WHERE - app = 'WordPress' -GROUP BY - 1, - 2 +'WordPress' IN UNNEST(technologies.technology) diff --git a/sql/lens/wordpress/timeseries.sql b/sql/lens/wordpress/timeseries.sql index 9c618f0..6ac1aa9 100644 --- a/sql/lens/wordpress/timeseries.sql +++ b/sql/lens/wordpress/timeseries.sql @@ -1,10 +1 @@ -SELECT - url, - _TABLE_SUFFIX AS _TABLE_SUFFIX -FROM - `httparchive.technologies.*` -WHERE - app = 'WordPress' -GROUP BY - 1, - 2 +'WordPress' IN UNNEST(technologies.technology) diff --git a/sql/timeseries/a11yButtonName.sql b/sql/timeseries/a11yButtonName.sql index 55eddc5..5dedac5 100644 --- a/sql/timeseries/a11yButtonName.sql +++ b/sql/timeseries/a11yButtonName.sql @@ -1,13 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.button-name.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['button-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + /* Should really use the following to only include eligible sites. 
*/ + /* LAX_STRING(lighthouse.audits['button-name'].score) IS NOT NULL AND */ + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + is_root_page AND + date >= '2017-06-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/a11yColorContrast.sql b/sql/timeseries/a11yColorContrast.sql index e865687..132d9ff 100644 --- a/sql/timeseries/a11yColorContrast.sql +++ b/sql/timeseries/a11yColorContrast.sql @@ -1,13 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.color-contrast.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['color-contrast'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + /* Should really use the following to only include eligible sites. 
*/ + /* LAX_STRING(lighthouse.audits['color-contrast'].score) IS NOT NULL AND */ + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/a11yImageAlt.sql b/sql/timeseries/a11yImageAlt.sql index 28ec635..94c3347 100644 --- a/sql/timeseries/a11yImageAlt.sql +++ b/sql/timeseries/a11yImageAlt.sql @@ -1,13 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.image-alt.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['image-alt'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + /* Should really use the following to only include eligible sites. 
*/ + /* LAX_STRING(lighthouse.audits['image-alt'].score) IS NOT NULL AND */ + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/a11yLabel.sql b/sql/timeseries/a11yLabel.sql index 867f03f..f6b29c0 100644 --- a/sql/timeseries/a11yLabel.sql +++ b/sql/timeseries/a11yLabel.sql @@ -1,13 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.label.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.label.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + /* Should really use the following to only include eligible sites. 
*/ + /* LAX_STRING(lighthouse.audits.label.score) IS NOT NULL AND */ + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/a11yLinkName.sql b/sql/timeseries/a11yLinkName.sql index 0416853..7063087 100644 --- a/sql/timeseries/a11yLinkName.sql +++ b/sql/timeseries/a11yLinkName.sql @@ -1,13 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.link-name.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-name'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + /* Should really use the following to only include eligible sites. 
*/ + /* LAX_STRING(lighthouse.audits['link-name'].score) IS NOT NULL AND */ + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/a11yScores.sql b/sql/timeseries/a11yScores.sql index 03a8f21..4296313 100644 --- a/sql/timeseries/a11yScores.sql +++ b/sql/timeseries/a11yScores.sql @@ -1,17 +1,16 @@ #standardSQL # Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF -CREATE TEMPORARY FUNCTION getA11yScore(reportCategories STRING) +CREATE TEMPORARY FUNCTION getA11yScore(reportCategories JSON) RETURNS FLOAT64 DETERMINISTIC LANGUAGE js AS """ - $=JSON.parse(reportCategories); - if($) { - return $.find(i => i.name === 'Accessibility').score; + if(reportCategories) { + return reportCategories.find(i => i.name === 'Accessibility').score; } """; SELECT - date, - UNIX_DATE(CAST(REPLACE(date, '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, @@ -20,13 +19,16 @@ SELECT ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - IFNULL(CAST(JSON_EXTRACT(report, '$.categories.accessibility.score') AS FLOAT64) * 100, getA11yScore(JSON_EXTRACT(report, '$.reportCategories'))) AS score + date, + client, + IFNULL(LAX_FLOAT64(lighthouse.categories.accessibility.score) * 100, getA11yScore(lighthouse.reportCategories)) AS score FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page ) GROUP BY date, diff --git 
a/sql/timeseries/asyncClipboardRead.sql b/sql/timeseries/asyncClipboardRead.sql index 8a27b28..c712e87 100644 --- a/sql/timeseries/asyncClipboardRead.sql +++ b/sql/timeseries/asyncClipboardRead.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2369' OR feature = 'AsyncClipboardAPIRead', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2369' OR feature = 'AsyncClipboardAPIRead', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2369' OR feat.feature = 'AsyncClipboardAPIRead') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/badgeClear.sql b/sql/timeseries/badgeClear.sql index 93332ac..a8514f0 100644 --- a/sql/timeseries/badgeClear.sql +++ b/sql/timeseries/badgeClear.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2727' OR feature = 'BadgeClear', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2727' OR feature = 'BadgeClear', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 
100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2727' OR feat.feature = 'BadgeClear') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/badgeSet.sql b/sql/timeseries/badgeSet.sql index bb3b206..7655f7b 100644 --- a/sql/timeseries/badgeSet.sql +++ b/sql/timeseries/badgeSet.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2726' OR feature = 'BadgeSet', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2726' OR feature = 'BadgeSet', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2726' OR feat.feature = 'BadgeSet') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/bootupJs.sql b/sql/timeseries/bootupJs.sql index 5efd34c..d035216 100644 --- a/sql/timeseries/bootupJs.sql +++ b/sql/timeseries/bootupJs.sql @@ -1,8 +1,8 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, 
ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, @@ -10,13 +10,19 @@ SELECT ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - CAST(IFNULL( - JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), - JSON_EXTRACT(report, '$.audits.bootup-time.rawValue') - ) AS FLOAT64) / 1000 AS value + date, + client, + IFNULL( + FLOAT64(lighthouse.audits['bootup-time'].numericValue), + FLOAT64(lighthouse.audits['bootup-time'].rawValue) + ) / 1000 AS value FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` + WHERE + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page ) GROUP BY date, diff --git a/sql/timeseries/bytesCss.sql b/sql/timeseries/bytesCss.sql index 9bbdfd8..57f8bdc 100644 --- a/sql/timeseries/bytesCss.sql +++ b/sql/timeseries/bytesCss.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesCSS, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(501)] / 
1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesCss), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesCSS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesCss) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesFont.sql b/sql/timeseries/bytesFont.sql index fa2c89a..f3e4ea9 100644 --- a/sql/timeseries/bytesFont.sql +++ b/sql/timeseries/bytesFont.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesFont, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesFont), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesFont > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesFont) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesHtml.sql 
b/sql/timeseries/bytesHtml.sql index 005a5d3..133941c 100644 --- a/sql/timeseries/bytesHtml.sql +++ b/sql/timeseries/bytesHtml.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesHtml, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesHtml), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesHtml > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesHtml) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesImg.sql b/sql/timeseries/bytesImg.sql index ed83cc0..f71309d 100644 --- a/sql/timeseries/bytesImg.sql +++ b/sql/timeseries/bytesImg.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesImg, 
1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesImg, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesImg), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesImg > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesImg) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesJs.sql b/sql/timeseries/bytesJs.sql index 092a2d4..17896c7 100644 --- a/sql/timeseries/bytesJs.sql +++ b/sql/timeseries/bytesJs.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + 
ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesJS), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesJS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesJS) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesOther.sql b/sql/timeseries/bytesOther.sql index d92c01f..378b52b 100644 --- a/sql/timeseries/bytesOther.sql +++ b/sql/timeseries/bytesOther.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesOther, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesOther), 1001)[OFFSET(901)] / 
1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesOther > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesOther) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesTotal.sql b/sql/timeseries/bytesTotal.sql index ceec253..acb65fd 100644 --- a/sql/timeseries/bytesTotal.sql +++ b/sql/timeseries/bytesTotal.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesTotal), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesTotal > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesTotal) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/bytesVideo.sql b/sql/timeseries/bytesVideo.sql index 0e78ffb..e5bbcfd 100644 --- a/sql/timeseries/bytesVideo.sql +++ b/sql/timeseries/bytesVideo.sql @@ -1,17 +1,19 @@ #standardSQL 
SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(bytesVideo, 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.bytesVideo), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - bytesVideo > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.bytesVideo) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/canonical.sql b/sql/timeseries/canonical.sql index 2ca07ad..3f2ae72 100644 --- a/sql/timeseries/canonical.sql +++ b/sql/timeseries/canonical.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.canonical.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 
60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.canonical.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.canonical.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/contentIndex.sql b/sql/timeseries/contentIndex.sql index 6762db1..d46a73d 100644 --- a/sql/timeseries/contentIndex.sql +++ b/sql/timeseries/contentIndex.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2983' OR feature = 'ContentIndexAdd', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2983' OR feature = 'ContentIndexAdd', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2983' OR feat.feature = 'ContentIndexAdd') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/cruxFastFid.sql b/sql/timeseries/cruxFastFid.sql deleted file mode 100644 index 0899ed7..0000000 --- a/sql/timeseries/cruxFastFid.sql +++ /dev/null @@ -1,31 +0,0 @@ -#standardSQL -# Fast FID by device - -CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good / (good + needs_improvement + poor) >= 0.75 -); - 
-CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good + needs_improvement + poor > 0 -); - -SELECT - REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(device = 'desktop', 'desktop', 'mobile') AS client, - SAFE_DIVIDE( - COUNT(DISTINCT IF(IS_GOOD(fast_fid, avg_fid, slow_fid), origin, NULL)), - COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL)) - ) * 100 AS percent -FROM - `chrome-ux-report.materialized.device_summary` -WHERE - device IN ('desktop', 'phone') AND - yyyymm >= 201806 -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client diff --git a/sql/timeseries/cruxSlowFid.sql b/sql/timeseries/cruxSlowFid.sql deleted file mode 100644 index 7c0c38f..0000000 --- a/sql/timeseries/cruxSlowFid.sql +++ /dev/null @@ -1,31 +0,0 @@ -#standardSQL -# Slow FID by device - -CREATE TEMP FUNCTION IS_POOR(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - poor / (good + needs_improvement + poor) >= 0.25 -); - -CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( - good + needs_improvement + poor > 0 -); - -SELECT - REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1_\\2_01') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(CAST(yyyymm AS STRING), '(\\d{4})(\\d{2})', '\\1-\\2-01') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(device = 'desktop', 'desktop', 'mobile') AS client, - SAFE_DIVIDE( - COUNT(DISTINCT IF(IS_POOR(fast_fid, avg_fid, slow_fid), origin, NULL)), - COUNT(DISTINCT IF(IS_NON_ZERO(fast_fid, avg_fid, slow_fid), origin, NULL)) - ) * 100 AS percent -FROM - `chrome-ux-report.materialized.device_summary` -WHERE - device IN ('desktop', 'phone') AND - yyyymm >= 201806 -GROUP BY - date, - timestamp, - client -ORDER BY - 
date DESC, - client diff --git a/sql/timeseries/dcl.sql b/sql/timeseries/dcl.sql index 74cb5c9..1b54d01 100644 --- a/sql/timeseries/dcl.sql +++ b/sql/timeseries/dcl.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(751)] / 1000, 2) AS p75, - ROUND(APPROX_QUANTILES(onContentLoaded, 1001)[OFFSET(901)] / 1000, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onContentLoaded), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - onContentLoaded > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.onContentLoaded) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/fcp.sql b/sql/timeseries/fcp.sql index 38a2321..ee9a719 100644 --- a/sql/timeseries/fcp.sql +++ b/sql/timeseries/fcp.sql @@ -1,17 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 
'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` WHERE - _TABLE_SUFFIX >= '2016_12_15' + date >= '2016-12-15' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/fontDisplay.sql b/sql/timeseries/fontDisplay.sql index 28dee3b..4f1bb67 100644 --- a/sql/timeseries/fontDisplay.sql +++ b/sql/timeseries/fontDisplay.sql @@ -1,14 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', 
'-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.font-display.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-display'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.font-display.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page AND + LAX_STRING(lighthouse.audits['font-display'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/getInstalledRelatedApps.sql b/sql/timeseries/getInstalledRelatedApps.sql index 5d34788..b17be36 100644 --- a/sql/timeseries/getInstalledRelatedApps.sql +++ b/sql/timeseries/getInstalledRelatedApps.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1870' OR feature = 'V8Navigator_GetInstalledRelatedApps_Method', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1870' OR feature = 'V8Navigator_GetInstalledRelatedApps_Method', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1870' OR feat.feature = 'V8Navigator_GetInstalledRelatedApps_Method') 
+WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/gzipSavings.sql b/sql/timeseries/gzipSavings.sql index f5b44fb..0bd63ac 100644 --- a/sql/timeseries/gzipSavings.sql +++ b/sql/timeseries/gzipSavings.sql @@ -1,15 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._gzip_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/h2.sql b/sql/timeseries/h2.sql index 26d4da1..2ce3681 100644 --- a/sql/timeseries/h2.sql +++ 
b/sql/timeseries/h2.sql @@ -1,11 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(protocol = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(r.summary.respHttpVersion) = 'HTTP/2', 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - (SELECT page AS url, JSON_EXTRACT_SCALAR(payload, '$._protocol') AS protocol, _TABLE_SUFFIX AS _TABLE_SUFFIX FROM `httparchive.requests.*`) + `httparchive.crawl.requests` r +INNER JOIN + `httparchive.crawl.pages` +USING (date, client, is_root_page, rank, page) +WHERE + is_root_page AND + date >= '2016-07-15' GROUP BY date, timestamp, diff --git a/sql/timeseries/h3.sql b/sql/timeseries/h3.sql index 4eec8a4..dd15274 100644 --- a/sql/timeseries/h3.sql +++ b/sql/timeseries/h3.sql @@ -11,24 +11,30 @@ # when HTTP/3 is approved so we include that as it is HTTP/3 in all but name. 
# SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, ROUND( SUM( IF( - respHttpVersion IN ('HTTP/3', 'h3', 'h3-29') OR - reqHttpVersion IN ('HTTP/3', 'h3', 'h3-29') OR - REGEXP_EXTRACT(REGEXP_EXTRACT(respOtherHeaders, r'alt-svc = (.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR - REGEXP_EXTRACT(REGEXP_EXTRACT(respOtherHeaders, r'alt-svc = (.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', + LAX_STRING(r.summary.respHttpVersion) IN ('HTTP/3', 'h3', 'h3-29') OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3=%' OR + REGEXP_EXTRACT(REGEXP_EXTRACT(resp.value, r'(.*)'), r'(.*?)(?:, [^ ]* = .*)?$') LIKE '%h3-29=%', 1, 0 ) ) * 100 / COUNT(0), 2 ) AS percent FROM - `httparchive.summary_requests.*` + `httparchive.crawl.requests` r +LEFT OUTER JOIN + UNNEST(response_headers) AS resp +ON (resp.name = 'alt-svc') +INNER JOIN + `httparchive.crawl.pages` +USING (date, client, is_root_page, rank, page) WHERE - SUBSTR(_TABLE_SUFFIX, 0, 10) >= '2020_01_01' + date >= '2020-01-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/hreflang.sql b/sql/timeseries/hreflang.sql index f034f87..ae4b1ed 100644 --- a/sql/timeseries/hreflang.sql +++ b/sql/timeseries/hreflang.sql @@ -1,14 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.hreflang.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 
AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits.hreflang.score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.hreflang.score') IS NOT NULL + lighthouse IS NOT NULL AND + TO_JSON_STRING(lighthouse) != '{}' AND + date >= '2017-06-01' AND + is_root_page AND + LAX_STRING(lighthouse.audits.hreflang.score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/idleDetection.sql b/sql/timeseries/idleDetection.sql index c1419cb..70464a1 100644 --- a/sql/timeseries/idleDetection.sql +++ b/sql/timeseries/idleDetection.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2834' OR feature = 'IdleDetectionStart', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2834' OR feature = 'IdleDetectionStart', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2834' OR feat.feature = 'IdleDetectionStart') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/imgLazy.sql b/sql/timeseries/imgLazy.sql index d3a0d2a..5498477 100644 --- a/sql/timeseries/imgLazy.sql +++ b/sql/timeseries/imgLazy.sql @@ -1,13 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 
* 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(COUNT(DISTINCT IF(LOWER(attr) = '"lazy"', url, NULL)) * 100 / COUNT(DISTINCT url), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(COUNT(DISTINCT IF(LOWER(LAX_STRING(attr)) = 'lazy', page, NULL)) * 100 / COUNT(DISTINCT page), 2) AS percent FROM - `httparchive.pages.*` + `httparchive.crawl.pages` LEFT JOIN - UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, "$['_img-loading-attr']"), '$')) AS attr + UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.other['img-loading-attr'])) AS attr +WHERE + is_root_page AND + date > '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/imgSavings.sql b/sql/timeseries/imgSavings.sql index 3c3839c..99c3788 100644 --- a/sql/timeseries/imgSavings.sql +++ b/sql/timeseries/imgSavings.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._image_savings') AS FLOAT64), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(101)] / 1024, 2) AS 
p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._image_savings), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/legible.sql b/sql/timeseries/legible.sql index b3821a2..dd29a47 100644 --- a/sql/timeseries/legible.sql +++ b/sql/timeseries/legible.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.font-size.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['font-size'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.font-size.score') IS NOT NULL + lighthouse IS NOT NULL AND + date >= '2017-12-15' AND + is_root_page AND + LAX_STRING(lighthouse.audits['font-size'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/linkText.sql b/sql/timeseries/linkText.sql index 429112e..d7f2114 100644 --- a/sql/timeseries/linkText.sql +++ b/sql/timeseries/linkText.sql @@ -1,14 +1,16 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - 
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.link-text.score') IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(LAX_STRING(lighthouse.audits['link-text'].score) IN ('true', '1'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.link-text.score') IS NOT NULL + lighthouse IS NOT NULL AND + date >= '2017-11-15' AND + is_root_page AND + LAX_STRING(lighthouse.audits['link-text'].score) IS NOT NULL GROUP BY date, timestamp, diff --git a/sql/timeseries/notificationTriggers.sql b/sql/timeseries/notificationTriggers.sql index 2b404af..3a1cd98 100644 --- a/sql/timeseries/notificationTriggers.sql +++ b/sql/timeseries/notificationTriggers.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3017' OR feature = 'NotificationShowTrigger', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3017' OR feature = 'NotificationShowTrigger', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3017' OR feat.feature = 'NotificationShowTrigger') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/numUrls.sql 
b/sql/timeseries/numUrls.sql index e5c5b0b..df992df 100644 --- a/sql/timeseries/numUrls.sql +++ b/sql/timeseries/numUrls.sql @@ -1,11 +1,14 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, COUNT(0) AS urls FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` +WHERE + date >= '2010-11-15' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/offscreenImages.sql b/sql/timeseries/offscreenImages.sql index 72b351d..7594029 100644 --- a/sql/timeseries/offscreenImages.sql +++ b/sql/timeseries/offscreenImages.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, 
'$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.offscreen-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['offscreen-images'].details.overallSavingsBytes), INT64(lighthouse.audits['offscreen-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2017-06-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/ol.sql b/sql/timeseries/ol.sql index cba8d6a..95c67ce 100644 --- a/sql/timeseries/ol.sql +++ b/sql/timeseries/ol.sql @@ 
-1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(751)] / 1000, 2) AS p75, - ROUND(APPROX_QUANTILES(onLoad, 1001)[OFFSET(901)] / 1000, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(251)] / 1000, 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.onLoad), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - onLoad > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.onLoad) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/optimizedImages.sql b/sql/timeseries/optimizedImages.sql index 4b65792..aee6ae8 100644 --- a/sql/timeseries/optimizedImages.sql +++ b/sql/timeseries/optimizedImages.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, 
'$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, - ROUND(APPROX_QUANTILES(IFNULL(CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.details.overallSavingsBytes') AS INT64), CAST(JSON_EXTRACT(report, '$.audits.uses-optimized-images.extendedInfo.value.wastedKb') AS INT64) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(101)] / 1024, 2) AS p10, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(251)] / 1024, 2) AS p25, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 
1024), 1001)[OFFSET(501)] / 1024, 2) AS p50, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(751)] / 1024, 2) AS p75, + ROUND(APPROX_QUANTILES(IFNULL(INT64(lighthouse.audits['uses-optimized-images'].details.overallSavingsBytes), INT64(lighthouse.audits['uses-optimized-images'].extendedInfo.value.wastedKb) * 1024), 1001)[OFFSET(901)] / 1024, 2) AS p90 FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` +WHERE + date >= '2017-06-01' AND + is_root_page GROUP BY date, timestamp, diff --git a/sql/timeseries/pctHttps.sql b/sql/timeseries/pctHttps.sql index bc6c9a2..5ac3b7b 100644 --- a/sql/timeseries/pctHttps.sql +++ b/sql/timeseries/pctHttps.sql @@ -1,11 +1,17 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(STARTS_WITH(request, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(SUM(IF(STARTS_WITH(url, 'https'), 1, 0)) * 100 / COUNT(0), 2) AS percent FROM - (SELECT url AS request, page AS url, _TABLE_SUFFIX AS _TABLE_SUFFIX FROM `httparchive.requests.*`) + `httparchive.crawl.requests` +INNER JOIN + `httparchive.crawl.pages` +USING (date, client, is_root_page, rank, page) +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/pctVuln.sql b/sql/timeseries/pctVuln.sql deleted file mode 100644 index 9c1c230..0000000 --- a/sql/timeseries/pctVuln.sql +++ /dev/null @@ -1,18 +0,0 @@ -#standardSQL -SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 
24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(SUM(IF(JSON_EXTRACT(report, '$.audits.no-vulnerable-libraries.score') IN ('false', '0'), 1, 0)) * 100 / COUNT(0), 2) AS percent -FROM - `httparchive.lighthouse.*` -WHERE - report IS NOT NULL AND - JSON_EXTRACT(report, '$.audits.no-vulnerable-libraries.score') IS NOT NULL -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client diff --git a/sql/timeseries/periodicBackgroundSync.sql b/sql/timeseries/periodicBackgroundSync.sql index 87fd2e9..5004969 100644 --- a/sql/timeseries/periodicBackgroundSync.sql +++ b/sql/timeseries/periodicBackgroundSync.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2930' OR feature = 'PeriodicBackgroundSync', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2930' OR feature = 'PeriodicBackgroundSync', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2930' OR feat.feature = 'PeriodicBackgroundSync') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/periodicBackgroundSyncRegister.sql b/sql/timeseries/periodicBackgroundSyncRegister.sql index 701b197..99a25f0 100644 --- a/sql/timeseries/periodicBackgroundSyncRegister.sql +++ b/sql/timeseries/periodicBackgroundSyncRegister.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, 
r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '2931' OR feature = 'PeriodicBackgroundSyncRegister', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '2931' OR feature = 'PeriodicBackgroundSyncRegister', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '2931' OR feat.feature = 'PeriodicBackgroundSyncRegister') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/pwaScores.sql b/sql/timeseries/pwaScores.sql deleted file mode 100644 index 0247410..0000000 --- a/sql/timeseries/pwaScores.sql +++ /dev/null @@ -1,41 +0,0 @@ -#standardSQL -# Lighthouse changed format of scores in v3.0.0 released in July 2018 so handle old with a UDF -CREATE TEMPORARY FUNCTION getPWAScore(reportCategories STRING) -RETURNS FLOAT64 DETERMINISTIC -LANGUAGE js AS """ - $=JSON.parse(reportCategories); - if ($) { - return $.find(i => i.name === 'Progressive Web App').score; - } -"""; - -SELECT - date, - UNIX_DATE(CAST(REPLACE(date, '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - client, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(100)], 2) AS p10, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(250)], 2) AS p25, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(500)], 2) AS p50, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(750)], 2) AS p75, - ROUND(APPROX_QUANTILES(score, 1000)[OFFSET(900)], 2) AS p90 -FROM ( - SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - IF(ENDS_WITH(_TABLE_SUFFIX, 
'desktop'), 'desktop', 'mobile') AS client, - IFNULL(CAST(JSON_EXTRACT(report, '$.categories.pwa.score') AS FLOAT64) * 100, getPWAScore(JSON_EXTRACT(report, '$.reportCategories'))) AS score - FROM - `httparchive.lighthouse.*` - WHERE - report IS NOT NULL AND - ( - JSON_EXTRACT(report, '$.audits.service-worker.score') = 'true' OR - JSON_EXTRACT(report, '$.audits.service-worker.score') = '1' - ) -) -GROUP BY - date, - timestamp, - client -ORDER BY - date DESC, - client; diff --git a/sql/timeseries/quicTransport.sql b/sql/timeseries/quicTransport.sql index c207a8f..e49ed8f 100644 --- a/sql/timeseries/quicTransport.sql +++ b/sql/timeseries/quicTransport.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3184' OR feature = 'QuicTransport', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3184' OR feature = 'QuicTransport', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3184' OR feat.feature = 'QuicTransport') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/reqCss.sql b/sql/timeseries/reqCss.sql index 4e3d7ad..7c0dc8a 100644 --- a/sql/timeseries/reqCss.sql +++ b/sql/timeseries/reqCss.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - 
IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqCSS, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqCss), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqCSS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqCss) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqFont.sql b/sql/timeseries/reqFont.sql index 7c5cf56..a490a83 100644 --- a/sql/timeseries/reqFont.sql +++ b/sql/timeseries/reqFont.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqFont, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + 
ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqFont), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqFont > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqFont) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqHtml.sql b/sql/timeseries/reqHtml.sql index d7faea8..a564e22 100644 --- a/sql/timeseries/reqHtml.sql +++ b/sql/timeseries/reqHtml.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqHtml, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqHtml), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqHtml > 0 + date >= '2010-11-15' AND + is_root_page AND + 
FLOAT64(summary.reqHtml) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqImg.sql b/sql/timeseries/reqImg.sql index dac48ff..c23bedd 100644 --- a/sql/timeseries/reqImg.sql +++ b/sql/timeseries/reqImg.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqImg, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqImg), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqImg > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqImg) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqJs.sql b/sql/timeseries/reqJs.sql index 3a60a6a..8abacce 100644 --- a/sql/timeseries/reqJs.sql +++ b/sql/timeseries/reqJs.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(101)], 2) AS p10, - 
ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqJS, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqJS), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqJS > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqJS) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqOther.sql b/sql/timeseries/reqOther.sql index 891811d..fcd6073 100644 --- a/sql/timeseries/reqOther.sql +++ b/sql/timeseries/reqOther.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqOther, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(251)], 2) AS p25, + 
ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqOther), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqOther > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqOther) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqTotal.sql b/sql/timeseries/reqTotal.sql index 0f337f8..372baaf 100644 --- a/sql/timeseries/reqTotal.sql +++ b/sql/timeseries/reqTotal.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqTotal, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqTotal), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqTotal > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqTotal) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/reqVideo.sql b/sql/timeseries/reqVideo.sql index 
eb46b24..cefd474 100644 --- a/sql/timeseries/reqVideo.sql +++ b/sql/timeseries/reqVideo.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(101)], 2) AS p10, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(251)], 2) AS p25, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(501)], 2) AS p50, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(751)], 2) AS p75, - ROUND(APPROX_QUANTILES(reqVideo, 1001)[OFFSET(901)], 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(101)], 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(251)], 2) AS p25, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(501)], 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(751)], 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(summary.reqVideo), 1001)[OFFSET(901)], 2) AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - reqVideo > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary.reqVideo) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/screenWakeLock.sql b/sql/timeseries/screenWakeLock.sql index bb149a3..6946319 100644 --- a/sql/timeseries/screenWakeLock.sql +++ b/sql/timeseries/screenWakeLock.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3005' OR feature = 
'WakeLockAcquireScreenLock', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3005' OR feature = 'WakeLockAcquireScreenLock', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '3005' OR feat.feature = 'WakeLockAcquireScreenLock') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/speedIndex.sql b/sql/timeseries/speedIndex.sql index 199d651..a6902df 100644 --- a/sql/timeseries/speedIndex.sql +++ b/sql/timeseries/speedIndex.sql @@ -1,15 +1,18 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(101)] / 1000, 2) AS p10, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(251)] / 1000, 2) AS p25, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(501)] / 1000, 2) AS p50, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(751)] / 1000, 2) AS p75, - ROUND(APPROX_QUANTILES(CAST(JSON_EXTRACT(payload, '$._SpeedIndex') AS FLOAT64), 1001)[OFFSET(901)] / 1000, 2) AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(101)] / 1000, 2) AS p10, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(251)] / 1000, 2) AS p25, + 
ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(501)] / 1000, 2) AS p50, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(751)] / 1000, 2) AS p75, + ROUND(APPROX_QUANTILES(FLOAT64(payload._SpeedIndex), 1001)[OFFSET(901)] / 1000, 2) AS p90 FROM - `httparchive.pages.*` + `httparchive.crawl.pages` +WHERE + is_root_page AND + date >= '2016-01-01' GROUP BY date, timestamp, diff --git a/sql/timeseries/storageEstimate.sql b/sql/timeseries/storageEstimate.sql index e9b05e9..6731a6d 100644 --- a/sql/timeseries/storageEstimate.sql +++ b/sql/timeseries/storageEstimate.sql @@ -1,17 +1,21 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1371' OR feature = 'DurableStorageEstimate', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1371' OR feature = 'DurableStorageEstimate', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN UNNEST(features) AS feat +ON (feat.id = '1371' OR feat.feature = 'DurableStorageEstimate') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/storagePersist.sql b/sql/timeseries/storagePersist.sql index e3cbc03..168c2f0 100644 --- a/sql/timeseries/storagePersist.sql +++ b/sql/timeseries/storagePersist.sql @@ -1,17 +1,22 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 
1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '1369' OR feature = 'DurableStoragePersist', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '1369' OR feature = 'DurableStoragePersist', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id IS NOT NULL, 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id IS NOT NULL, 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '1369' OR feat.feature = 'DurableStoragePersist') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/swControlledPages.sql b/sql/timeseries/swControlledPages.sql index 2b48b1f..7355a79 100644 --- a/sql/timeseries/swControlledPages.sql +++ b/sql/timeseries/swControlledPages.sql @@ -1,17 +1,22 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '990' OR feature = 'ServiceWorkerControlledPage', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '990' OR feature = 'ServiceWorkerControlledPage', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage', 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS feat +ON (feat.id = '990' OR feat.feature = 'ServiceWorkerControlledPage') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP
BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client, diff --git a/sql/timeseries/tcp.sql b/sql/timeseries/tcp.sql index 65f165c..bf22f1f 100644 --- a/sql/timeseries/tcp.sql +++ b/sql/timeseries/tcp.sql @@ -1,17 +1,19 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, - APPROX_QUANTILES(_connections, 1001)[OFFSET(101)] AS p10, - APPROX_QUANTILES(_connections, 1001)[OFFSET(251)] AS p25, - APPROX_QUANTILES(_connections, 1001)[OFFSET(501)] AS p50, - APPROX_QUANTILES(_connections, 1001)[OFFSET(751)] AS p75, - APPROX_QUANTILES(_connections, 1001)[OFFSET(901)] AS p90 + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + client, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(101)] AS p10, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(251)] AS p25, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(501)] AS p50, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(751)] AS p75, + APPROX_QUANTILES(FLOAT64(summary._connections), 1001)[OFFSET(901)] AS p90 FROM - `httparchive.summary_pages.*` + `httparchive.crawl.pages` WHERE - _connections > 0 + date >= '2010-11-15' AND + is_root_page AND + FLOAT64(summary._connections) > 0 GROUP BY date, timestamp, diff --git a/sql/timeseries/ttci.sql b/sql/timeseries/ttci.sql index 67fa289..1ed7e73 100644 --- a/sql/timeseries/ttci.sql +++ b/sql/timeseries/ttci.sql @@ -1,8 +1,8 @@ #standardSQL SELECT - SUBSTR(_TABLE_SUFFIX, 0, 10) AS date, - UNIX_DATE(CAST(REPLACE(SUBSTR(_TABLE_SUFFIX, 0, 10), '_', '-') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, - IF(ENDS_WITH(_TABLE_SUFFIX, 'desktop'), 'desktop', 'mobile') AS client, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, + 
client, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(100)], 2) AS p10, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(250)], 2) AS p25, ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(500)], 2) AS p50, @@ -10,16 +10,20 @@ SELECT ROUND(APPROX_QUANTILES(value, 1000)[OFFSET(900)], 2) AS p90 FROM ( SELECT - _TABLE_SUFFIX AS _TABLE_SUFFIX, - CAST(IFNULL( - JSON_EXTRACT(report, '$.audits.interactive.numericValue'), + client, + date, + IFNULL( + FLOAT64(lighthouse.audits.interactive.numericValue), IFNULL( - JSON_EXTRACT(report, '$.audits.interactive.rawValue'), - JSON_EXTRACT(report, '$.audits.consistently-interactive.rawValue') + FLOAT64(lighthouse.audits.interactive.rawValue), + FLOAT64(lighthouse.audits['consistently-interactive'].rawValue) ) - ) AS FLOAT64) / 1000 AS value + ) / 1000 AS value FROM - `httparchive.lighthouse.*` + `httparchive.crawl.pages` + WHERE + is_root_page AND + date >= '2016-01-01' ) GROUP BY date, diff --git a/sql/timeseries/webSocketStream.sql b/sql/timeseries/webSocketStream.sql index a94cfaf..7a99e89 100644 --- a/sql/timeseries/webSocketStream.sql +++ b/sql/timeseries/webSocketStream.sql @@ -1,17 +1,22 @@ #standardSQL SELECT - REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1_\\2_\\3') AS date, - UNIX_DATE(CAST(REGEXP_REPLACE(yyyymmdd, r'(\d{4})(\d{2})(\d{2})', '\\1-\\2-\\3') AS DATE)) * 1000 * 60 * 60 * 24 AS timestamp, + FORMAT_TIMESTAMP('%Y_%m_%d', date) AS date, + UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp, client, - SUM(IF(id = '3018' OR feature = 'WebSocketStreamConstructor', num_urls, 0)) AS num_urls, - ROUND(SUM(IF(id = '3018' OR feature = 'WebSocketStreamConstructor', num_urls, 0)) / total_urls * 100, 5) AS percent + SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) AS num_urls, + ROUND(SUM(IF(feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor', 1, 0)) / COUNT(0) * 100, 5) AS percent FROM - `httparchive.blink_features.usage` + `httparchive.crawl.pages` +LEFT OUTER JOIN + UNNEST(features) AS 
feat +ON (feat.id = '3018' OR feat.feature = 'WebSocketStreamConstructor') +WHERE + date >= '2016-11-15' AND + is_root_page GROUP BY date, timestamp, - client, - total_urls + client ORDER BY date DESC, client,