diff --git a/sql/2021/structured-data/classic_microformats_types.sql b/sql/2021/structured-data/classic_microformats_types.sql index 5506aafaa22..d61ea9968cb 100644 --- a/sql/2021/structured-data/classic_microformats_types.sql +++ b/sql/2021/structured-data/classic_microformats_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getClassicMicroformatsTypes(rendered) AS classic_microformats_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, classic_microformats_type.name AS classic_microformats_type, - SUM(classic_microformats_type.count) AS count, - SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total, - SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct, - client + SUM(classic_microformats_type.count) AS freq_microformat, + SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat, + SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(classic_microformats_types) AS classic_microformats_type +JOIN + page_totals +USING (client) GROUP BY + client, classic_microformats_type, - client + total_pages ORDER BY - count DESC + freq_microformat DESC, + client diff --git a/sql/2021/structured-data/dublin_core_types.sql b/sql/2021/structured-data/dublin_core_types.sql index e2b6433c6e2..9b8e24b73aa 100644 --- a/sql/2021/structured-data/dublin_core_types.sql +++ b/sql/2021/structured-data/dublin_core_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getDublinCoreTypes(rendered) AS dublin_core_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, dublin_core_type, COUNT(dublin_core_type) AS count, - SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS total, - COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct, - client + SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core, + COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(dublin_core_types) AS dublin_core_type +JOIN + page_totals +USING (client) GROUP BY + client, dublin_core_type, - client + total_pages ORDER BY - count DESC + pct_dublin_core DESC, + client diff --git a/sql/2021/structured-data/facebook_types.sql b/sql/2021/structured-data/facebook_types.sql index e425890a69c..7b49be95517 100644 --- a/sql/2021/structured-data/facebook_types.sql +++ b/sql/2021/structured-data/facebook_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getFacebookTypes(rendered) AS facebook_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, facebook_type, - COUNT(facebook_type) AS count, - SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total, - COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(facebook_type) AS freq_facebook, + SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook, + COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, - UNNEST(facebook_types) AS facebook_type + UNNEST(facebook_type) AS facebook_type +JOIN + page_totals +USING (client) GROUP BY + client, facebook_type, - client + total_pages ORDER BY - count DESC + freq_facebook DESC, + client diff --git a/sql/2021/structured-data/jsonld_contexts.sql b/sql/2021/structured-data/jsonld_contexts.sql index 947be5b5fa0..6ee68e5c88e 100644 --- a/sql/2021/structured-data/jsonld_contexts.sql +++ b/sql/2021/structured-data/jsonld_contexts.sql @@ -33,29 +33,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDContexts(rendered) AS jsonld_contexts, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, NET.REG_DOMAIN(jsonld_context) AS jsonld_context, - COUNT(0) AS count, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct, - client + COUNT(0) AS freq_jsonld_context, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, - UNNEST(jsonld_contexts) AS jsonld_context + UNNEST(jsonld_context) AS jsonld_context +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_context, - client + total_pages ORDER BY - count DESC + pct_jsonld_context DESC, + client diff --git a/sql/2021/structured-data/jsonld_max_depth.sql b/sql/2021/structured-data/jsonld_depth_percentiles.sql similarity index 72% rename from sql/2021/structured-data/jsonld_max_depth.sql rename to sql/2021/structured-data/jsonld_depth_percentiles.sql index 2456b044d41..23b3a92ade0 100644 --- a/sql/2021/structured-data/jsonld_max_depth.sql +++ b/sql/2021/structured-data/jsonld_depth_percentiles.sql @@ -41,26 +41,27 @@ LANGUAGE js AS """ } """; -WITH -rendered_data AS ( +WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` ) SELECT - MAX(jsonld_entity_relationship.depth) AS max_depth, - client + client, + percentile, + APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls FROM rendered_data, - UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship + UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile GROUP BY - client + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2021/structured-data/jsonld_entities_relationships.sql b/sql/2021/structured-data/jsonld_entities_relationships.sql index d9109f367da..ba2dfcafae9 100644 --- a/sql/2021/structured-data/jsonld_entities_relationships.sql +++ b/sql/2021/structured-data/jsonld_entities_relationships.sql @@ -44,34 +44,49 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_entity_relationship._from, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, jsonld_entity_relationship.depth, - COUNT(0) AS count, - client + COUNT(0) AS freq_relationship, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_relationship, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_entity_relationship._from, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, jsonld_entity_relationship.depth, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_relationship_values.sql b/sql/2021/structured-data/jsonld_relationship_values.sql index 9cd0850ac7c..d5a3cffe32c 100644 --- a/sql/2021/structured-data/jsonld_relationship_values.sql +++ b/sql/2021/structured-data/jsonld_relationship_values.sql @@ -44,30 +44,45 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, - COUNT(0) AS count, - client + COUNT(0) AS freq_relationship, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_relationship, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_relationships.sql b/sql/2021/structured-data/jsonld_relationships.sql index ce6e088feb4..83c71174dcd 100644 --- a/sql/2021/structured-data/jsonld_relationships.sql +++ b/sql/2021/structured-data/jsonld_relationships.sql @@ -33,30 +33,43 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDRelationships(rendered) AS jsonld_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_relationship, - COUNT(jsonld_relationship) AS count, - SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS total, - COUNT(jsonld_relationship) / SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS pct, - client + COUNT(jsonld_relationship) AS freq_relationship, + SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS total_relationship, + COUNT(jsonld_relationship) / SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_relationships) AS jsonld_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_relationship, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_same_ases.sql b/sql/2021/structured-data/jsonld_same_ases.sql index 155f7415745..9677fb27e96 100644 --- a/sql/2021/structured-data/jsonld_same_ases.sql +++ b/sql/2021/structured-data/jsonld_same_ases.sql @@ -33,30 +33,43 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDSameAses(rendered) AS jsonld_sameases, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDSameAses(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_sameases + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, NET.REG_DOMAIN(jsonld_sameas) AS jsonld_sameas, - COUNT(0) AS count, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct, - client + COUNT(0) AS freq_jsonld_sameas, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_sameas, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_sameas, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_sameases) AS jsonld_sameas +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_sameas, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_jsonld_sameas DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_types.sql b/sql/2021/structured-data/jsonld_types.sql index 27cf50dc9f9..988292c62c2 100644 --- a/sql/2021/structured-data/jsonld_types.sql +++ b/sql/2021/structured-data/jsonld_types.sql @@ -33,29 +33,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDTypes(rendered) AS jsonld_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_type, - COUNT(jsonld_type) AS count, - SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS total, - COUNT(jsonld_type) / SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(jsonld_type) AS freq_jsonld_type, + SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS total_jsonld_type, + COUNT(jsonld_type) / SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS pct_jsonld_type, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_types) AS jsonld_type +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_type, - client + total_pages ORDER BY - count DESC + pct_jsonld_type DESC, + client diff --git a/sql/2021/structured-data/microdata_item_types.sql b/sql/2021/structured-data/microdata_item_types.sql index 61d8cb77345..2f6fea91485 100644 --- a/sql/2021/structured-data/microdata_item_types.sql +++ b/sql/2021/structured-data/microdata_item_types.sql @@ -14,37 +14,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getMicrodataItemTypes(rendered) AS microdata_item_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getMicrodataItemTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS microdata_item_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, microdata_item_type, - COUNT(microdata_item_type) AS count, - SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS total, - COUNT(microdata_item_type) / SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(microdata_item_type) AS freq_microdata, + SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS total_microdata, + COUNT(microdata_item_type) / SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS pct_microdata, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(microdata_item_type), SPLIT(microdata_item_type, NET.REG_DOMAIN(microdata_item_type))[SAFE_OFFSET(1)]) AS microdata_item_type, - client + CONCAT(NET.REG_DOMAIN(microdata_item_type), SPLIT(microdata_item_type, NET.REG_DOMAIN(microdata_item_type))[SAFE_OFFSET(1)]) AS microdata_item_type FROM rendered_data, UNNEST(microdata_item_types) AS microdata_item_type ) +JOIN + page_totals +USING (client) GROUP BY + client, microdata_item_type, - client + total_pages ORDER BY - count DESC + freq_microdata DESC, + client diff --git a/sql/2021/structured-data/microformats2_types.sql b/sql/2021/structured-data/microformats2_types.sql index 93a21756f70..6174a2a2385 100644 --- a/sql/2021/structured-data/microformats2_types.sql +++ b/sql/2021/structured-data/microformats2_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getMicroformats2Types(rendered) AS microformats2_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getMicroformats2Types(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS microformats2_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, microformats2_type.name AS microformats2_type, - SUM(microformats2_type.count) AS count, - SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS total, - SUM(microformats2_type.count) / SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS pct, - client + SUM(microformats2_type.count) AS freq_microformats2, + SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS total_microformats2_type, + SUM(microformats2_type.count) / SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS pct_microformats2_type, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(microformats2_types) AS microformats2_type +JOIN + page_totals +USING (client) GROUP BY + client, microformats2_type, - client + total_pages ORDER BY - count DESC + pct_microformats2_type DESC, + client diff --git a/sql/2021/structured-data/open_graph_types.sql b/sql/2021/structured-data/open_graph_types.sql index 5c901b559a5..334cd7b0868 100644 --- a/sql/2021/structured-data/open_graph_types.sql +++ b/sql/2021/structured-data/open_graph_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getOpenGraphTypes(rendered) AS open_graph_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getOpenGraphTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS open_graph_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, open_graph_type, - COUNT(open_graph_type) AS count, - SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS total, - COUNT(open_graph_type) / SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(open_graph_type) AS freq_open_graph_type, + SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS total_open_graph_types, + COUNT(open_graph_type) / SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS pct_open_graph_types, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(open_graph_types) AS open_graph_type +JOIN + page_totals +USING (client) GROUP BY + client, open_graph_type, - client + total_pages ORDER BY - count DESC + pct_open_graph_types DESC, + client diff --git a/sql/2021/structured-data/present_types.sql b/sql/2021/structured-data/present_types.sql index 4d81abcd82f..7a0ccef484e 100644 --- a/sql/2021/structured-data/present_types.sql +++ b/sql/2021/structured-data/present_types.sql @@ -1,73 +1,35 @@ #standardSQL # A count of pages which include each type of structured data SELECT - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS rdfa, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS json_ld, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microdata, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microformats2, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microformats_classic, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS dublin_core, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS twitter, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS facebook, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS opengraph, - SUM(CASE - WHEN JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL THEN 1 - ELSE - 0 - END - ) AS total, - client + client, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL)) AS rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL)) AS json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL)) AS microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL)) AS microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL)) AS microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL)) AS dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL)) AS twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL)) AS facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL)) AS opengraph, + COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) AS total_structured_data_ran, + COUNT(0) AS total_pages, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL)) / COUNT(0) AS pct_rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL)) / COUNT(0) AS pct_json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL)) / COUNT(0) AS pct_microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL)) / COUNT(0) AS pct_microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL)) / COUNT(0) AS pct_microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL)) / COUNT(0) AS pct_dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL)) / COUNT(0) AS pct_twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL)) / COUNT(0) AS pct_facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL)) / COUNT(0) AS pct_opengraph, + COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) / COUNT(0) AS pct_total_structured_data_ran FROM ( SELECT - JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')) AS structured_data, - _TABLE_SUFFIX AS client + _TABLE_SUFFIX AS client, + JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')) AS structured_data FROM `httparchive.pages.2021_07_01_*`) GROUP BY client +ORDER BY + client diff --git a/sql/2021/structured-data/rdfa_prefixes.sql b/sql/2021/structured-data/rdfa_prefixes.sql index cda1bd82c27..2725b414408 100644 --- a/sql/2021/structured-data/rdfa_prefixes.sql +++ b/sql/2021/structured-data/rdfa_prefixes.sql @@ -2,7 +2,7 @@ # Count RDFa Prefixes CREATE TEMP FUNCTION getRDFaPrefixes(rendered STRING) RETURNS ARRAY -LANGUAGE js AS """ +LANGUAGE js AS r""" try { rendered = JSON.parse(rendered); const prefixRegExp = new RegExp(/(?[^:]*):\s+(?[^\s]*)\s*/gm) @@ -18,37 +18,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaPrefixes(rendered) AS rdfa_prefixes, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaPrefixes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_prefixes + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_prefix, - COUNT(rdfa_prefix) AS count, - SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_prefix) / SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_prefix) AS freq_rdfa, + SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS total_rdfa, + COUNT(rdfa_prefix) / SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS pct_rdfa, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(rdfa_prefix), SPLIT(rdfa_prefix, NET.REG_DOMAIN(rdfa_prefix))[SAFE_OFFSET(1)]) AS rdfa_prefix, - client + CONCAT(NET.REG_DOMAIN(rdfa_prefix), SPLIT(rdfa_prefix, NET.REG_DOMAIN(rdfa_prefix))[SAFE_OFFSET(1)]) AS rdfa_prefix FROM rendered_data, UNNEST(rdfa_prefixes) AS rdfa_prefix ) +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_prefix, - client + total_pages ORDER BY - count DESC + pct_rdfa DESC, + client diff --git a/sql/2021/structured-data/rdfa_type_ofs.sql b/sql/2021/structured-data/rdfa_type_ofs.sql index 619d4fc7f69..cc90e4e542c 100644 --- a/sql/2021/structured-data/rdfa_type_ofs.sql +++ b/sql/2021/structured-data/rdfa_type_ofs.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaTypeOfs(rendered) AS rdfa_type_ofs, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaTypeOfs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_type_ofs + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_type_of, - COUNT(rdfa_type_of) AS count, - SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_type_of) / SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_type_of) AS freq_rdfa_type_of, + SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS total_rdfa_type_of, + COUNT(rdfa_type_of) / SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS pct_rdfa_type_of, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(rdfa_type_ofs) AS rdfa_type_of +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_type_of, - client + total_pages ORDER BY - count DESC + pct_rdfa_type_of DESC, + client diff --git a/sql/2021/structured-data/rdfa_vocabs.sql b/sql/2021/structured-data/rdfa_vocabs.sql index 772090e5bde..a2c2490404c 100644 --- a/sql/2021/structured-data/rdfa_vocabs.sql +++ b/sql/2021/structured-data/rdfa_vocabs.sql @@ -14,37 +14,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaVocabs(rendered) AS rdfa_vocabs, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaVocabs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_vocabs + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_vocab, - COUNT(rdfa_vocab) AS count, - SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_vocab) / SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_vocab) AS freq_rdfa_vocab, + SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS total_rdfa_vocab, + COUNT(rdfa_vocab) / SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS pct_rdfa_vocab, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(rdfa_vocab), SPLIT(rdfa_vocab, NET.REG_DOMAIN(rdfa_vocab))[SAFE_OFFSET(1)]) AS rdfa_vocab, - client + CONCAT(NET.REG_DOMAIN(rdfa_vocab), SPLIT(rdfa_vocab, NET.REG_DOMAIN(rdfa_vocab))[SAFE_OFFSET(1)]) AS rdfa_vocab FROM rendered_data, UNNEST(rdfa_vocabs) AS rdfa_vocab ) +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_vocab, - client + total_pages ORDER BY - count DESC + pct_pages DESC, + client diff --git a/sql/2021/structured-data/summary_count.sql b/sql/2021/structured-data/summary_count.sql index 166864c2508..5847d67a169 100644 --- a/sql/2021/structured-data/summary_count.sql +++ b/sql/2021/structured-data/summary_count.sql @@ -1,51 +1,33 @@ #standardSQL # A summary count of the pages run against SELECT - SUM(CASE - WHEN success IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS success, - SUM(CASE - WHEN errors IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS errors, - SUM(CASE - WHEN success IS NOT NULL AND errors IS NULL THEN 1 - ELSE - 0 - END - ) AS success_no_errors, - SUM(CASE - WHEN errors IS NOT NULL AND success IS NULL THEN 1 - ELSE - 0 - END - ) AS errors_no_success, - SUM(CASE - WHEN success IS NOT NULL AND errors IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS success_errors, - SUM(CASE - WHEN success IS NULL AND errors IS NULL THEN 1 - ELSE - 0 - END - ) AS no_success_no_errors + client, + COUNTIF(success IS NOT NULL) AS success, + COUNTIF(errors IS NOT NULL) AS errors, + COUNTIF(success IS NOT NULL AND errors IS NULL) AS success_no_errors, + COUNTIF(errors IS NOT NULL AND success IS NULL) AS errors_no_success, + COUNTIF(success IS NOT NULL AND errors IS NOT NULL) AS success_errors, + COUNTIF(success IS NULL AND errors IS NULL) AS no_success_no_errors, + COUNTIF(success IS NOT NULL) / COUNT(0) AS pct_success, + COUNTIF(errors IS NOT NULL) / COUNT(0) AS pct_errors, + COUNTIF(success IS NOT NULL AND errors IS NULL) / COUNT(0) AS pct_success_no_errors, + COUNTIF(errors IS NOT NULL AND success IS NULL) / COUNT(0) AS pct_errors_no_success, + COUNTIF(success IS NOT NULL AND errors IS NOT NULL) / COUNT(0) AS pct_success_errors, + COUNTIF(success IS NULL AND errors IS NULL) / COUNT(0) AS pct_no_success_no_errors FROM ( SELECT - JSON_EXTRACT(structured_data, - '$.structured_data') AS success, - JSON_EXTRACT(structured_data, - '$.log') AS errors + client, + JSON_EXTRACT(structured_data, '$.structured_data') AS success, + JSON_EXTRACT(structured_data, '$.log') AS errors FROM ( SELECT - JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')) AS structured_data + _TABLE_SUFFIX AS client, + JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')) AS structured_data FROM - `httparchive.pages.2021_07_01_*`)) + `httparchive.pages.2021_07_01_*` + ) +) +GROUP BY + client +ORDER BY + client diff --git a/sql/2021/structured-data/twitter_types.sql b/sql/2021/structured-data/twitter_types.sql index cee530c258f..128462c4231 100644 --- a/sql/2021/structured-data/twitter_types.sql +++ b/sql/2021/structured-data/twitter_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getTwitterTypes(rendered) AS twitter_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getTwitterTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS twitter_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, twitter_type, - COUNT(twitter_type) AS count, - SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS total, - COUNT(twitter_type) / SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(twitter_type) AS freq_twitter, + SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS total_twitter, + COUNT(twitter_type) / SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS pct_twitter, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(twitter_types) AS twitter_type +JOIN + page_totals +USING (client) GROUP BY + client, twitter_type, - client + total_pages ORDER BY - count DESC + pct_twitter DESC, + client