From 6081e01b09343944d66699f09d3af8886f8b8c01 Mon Sep 17 00:00:00 2001 From: Barry Date: Sat, 6 Nov 2021 13:32:17 +0000 Subject: [PATCH 1/4] Add pages to first set of queries --- .../classic_microformats_types.sql | 45 +++++++++++------ .../structured-data/dublin_core_types.sql | 43 ++++++++++------ sql/2021/structured-data/facebook_types.sql | 47 ++++++++++------- sql/2021/structured-data/jsonld_contexts.sql | 47 ++++++++++------- .../jsonld_entities_relationships.sql | 45 +++++++++++------ .../jsonld_relationship_values.sql | 45 +++++++++++------ .../structured-data/jsonld_relationships.sql | 47 ++++++++++------- sql/2021/structured-data/jsonld_same_ases.sql | 47 ++++++++++------- sql/2021/structured-data/jsonld_types.sql | 45 +++++++++++------ .../structured-data/microdata_item_types.sql | 50 ++++++++++++------- 10 files changed, 298 insertions(+), 163 deletions(-) diff --git a/sql/2021/structured-data/classic_microformats_types.sql b/sql/2021/structured-data/classic_microformats_types.sql index 5506aafaa22..8af1bdfc8e3 100644 --- a/sql/2021/structured-data/classic_microformats_types.sql +++ b/sql/2021/structured-data/classic_microformats_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getClassicMicroformatsTypes(rendered) AS classic_microformats_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS classic_microformats_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, classic_microformats_type.name AS classic_microformats_type, - SUM(classic_microformats_type.count) AS count, - SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total, - SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct, - client + SUM(classic_microformats_type.count) AS freq_microformat, + SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat, + SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(classic_microformats_types) AS classic_microformats_type +JOIN + page_totals +USING (client) GROUP BY + client, classic_microformats_type, - client + total_pages ORDER BY - count DESC + freq_microformat DESC, + client diff --git a/sql/2021/structured-data/dublin_core_types.sql b/sql/2021/structured-data/dublin_core_types.sql index e2b6433c6e2..3cdda147387 100644 --- a/sql/2021/structured-data/dublin_core_types.sql +++ b/sql/2021/structured-data/dublin_core_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getDublinCoreTypes(rendered) AS dublin_core_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS dublin_core_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, dublin_core_type, COUNT(dublin_core_type) AS count, - SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS total, - COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct, - client + SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core, + COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(dublin_core_types) AS dublin_core_type +JOIN + page_totals +USING (client) GROUP BY + client, dublin_core_type, - client + total_pages ORDER BY - count DESC + pct_dublin_core DESC, + client diff --git a/sql/2021/structured-data/facebook_types.sql b/sql/2021/structured-data/facebook_types.sql index e425890a69c..a7ab2449247 100644 --- a/sql/2021/structured-data/facebook_types.sql +++ b/sql/2021/structured-data/facebook_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getFacebookTypes(rendered) AS facebook_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS facebook_type + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, facebook_type, - COUNT(facebook_type) AS count, - SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total, - COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(facebook_type) AS freq_facebook, + SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook, + COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, - UNNEST(facebook_types) AS facebook_type + UNNEST(facebook_type) AS facebook_type +JOIN + page_totals +USING (client) GROUP BY + client, facebook_type, - client + total_pages ORDER BY - count DESC + freq_facebook DESC, + client diff --git a/sql/2021/structured-data/jsonld_contexts.sql b/sql/2021/structured-data/jsonld_contexts.sql index 947be5b5fa0..2af76dcc2db 100644 --- a/sql/2021/structured-data/jsonld_contexts.sql +++ b/sql/2021/structured-data/jsonld_contexts.sql @@ -33,29 +33,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDContexts(rendered) AS jsonld_contexts, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_context + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, NET.REG_DOMAIN(jsonld_context) AS jsonld_context, - COUNT(0) AS count, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct, - client + COUNT(0) AS freq_jsonld_context, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, - UNNEST(jsonld_contexts) AS jsonld_context + UNNEST(jsonld_context) AS jsonld_context +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_context, - client + total_pages ORDER BY - count DESC + pct_jsonld_context DESC, + client diff --git a/sql/2021/structured-data/jsonld_entities_relationships.sql b/sql/2021/structured-data/jsonld_entities_relationships.sql index d9109f367da..3e7950cf92d 100644 --- a/sql/2021/structured-data/jsonld_entities_relationships.sql +++ b/sql/2021/structured-data/jsonld_entities_relationships.sql @@ -44,34 +44,49 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_entity_relationship._from, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, jsonld_entity_relationship.depth, - COUNT(0) AS count, - client + COUNT(0) AS freq_relationship, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_relationship, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_entity_relationship._from, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, jsonld_entity_relationship.depth, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_relationship_values.sql b/sql/2021/structured-data/jsonld_relationship_values.sql index 9cd0850ac7c..9a33e1f8289 100644 --- a/sql/2021/structured-data/jsonld_relationship_values.sql +++ b/sql/2021/structured-data/jsonld_relationship_values.sql @@ -44,30 +44,45 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, - COUNT(0) AS count, - client + COUNT(0) AS freq_relationship, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_relationship, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_entity_relationship.relationship, jsonld_entity_relationship._to, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_relationships.sql b/sql/2021/structured-data/jsonld_relationships.sql index ce6e088feb4..0e707cabb63 100644 --- a/sql/2021/structured-data/jsonld_relationships.sql +++ b/sql/2021/structured-data/jsonld_relationships.sql @@ -33,30 +33,43 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDRelationships(rendered) AS jsonld_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_relationships + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_relationship, - COUNT(jsonld_relationship) AS count, - SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS total, - COUNT(jsonld_relationship) / SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS pct, - client + COUNT(jsonld_relationship) AS freq_relationship, + SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS total_relationship, + COUNT(jsonld_relationship) / SUM(COUNT(jsonld_relationship)) OVER (PARTITION BY client) AS pct_relationship, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_relationships) AS jsonld_relationship +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_relationship, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_relationship DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_same_ases.sql b/sql/2021/structured-data/jsonld_same_ases.sql index 155f7415745..e665e2437f3 100644 --- a/sql/2021/structured-data/jsonld_same_ases.sql +++ b/sql/2021/structured-data/jsonld_same_ases.sql @@ -33,30 +33,43 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDSameAses(rendered) AS jsonld_sameases, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDSameAses(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_sameases + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, NET.REG_DOMAIN(jsonld_sameas) AS jsonld_sameas, - COUNT(0) AS count, - SUM(COUNT(0)) OVER (PARTITION BY client) AS total, - COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct, - client + COUNT(0) AS freq_jsonld_sameas, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_sameas, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_sameas, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_sameases) AS jsonld_sameas +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_sameas, - client + total_pages ORDER BY - count DESC -LIMIT 10000 + pct_jsonld_sameas DESC, + client +LIMIT 1000 diff --git a/sql/2021/structured-data/jsonld_types.sql b/sql/2021/structured-data/jsonld_types.sql index 27cf50dc9f9..6a9341a5a85 100644 --- a/sql/2021/structured-data/jsonld_types.sql +++ b/sql/2021/structured-data/jsonld_types.sql @@ -33,29 +33,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getJSONLDTypes(rendered) AS jsonld_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, jsonld_type, - COUNT(jsonld_type) AS count, - SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS total, - COUNT(jsonld_type) / SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(jsonld_type) AS freq_jsonld_type, + SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS total_jsonld_type, + COUNT(jsonld_type) / SUM(COUNT(jsonld_type)) OVER (PARTITION BY client) AS pct_jsonld_type, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(jsonld_types) AS jsonld_type +JOIN + page_totals +USING (client) GROUP BY + client, jsonld_type, - client + total_pages ORDER BY - count DESC + pct_jsonld_type DESC, + client diff --git a/sql/2021/structured-data/microdata_item_types.sql b/sql/2021/structured-data/microdata_item_types.sql index 61d8cb77345..dcb185b0c27 100644 --- a/sql/2021/structured-data/microdata_item_types.sql +++ b/sql/2021/structured-data/microdata_item_types.sql @@ -14,37 +14,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getMicrodataItemTypes(rendered) AS microdata_item_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getMicrodataItemTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS microdata_item_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, microdata_item_type, - COUNT(microdata_item_type) AS count, - SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS total, - COUNT(microdata_item_type) / SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(microdata_item_type) AS freq_microdata, + SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS total_microdata, + COUNT(microdata_item_type) / SUM(COUNT(microdata_item_type)) OVER (PARTITION BY client) AS pct_microdata, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(microdata_item_type), SPLIT(microdata_item_type, NET.REG_DOMAIN(microdata_item_type))[SAFE_OFFSET(1)]) AS microdata_item_type, - client + CONCAT(NET.REG_DOMAIN(microdata_item_type), SPLIT(microdata_item_type, NET.REG_DOMAIN(microdata_item_type))[SAFE_OFFSET(1)]) AS microdata_item_type FROM rendered_data, UNNEST(microdata_item_types) AS microdata_item_type ) +JOIN + page_totals +USING (client) GROUP BY + client, microdata_item_type, - client + total_pages ORDER BY - count DESC + freq_microdata DESC, + client From 36a23786a53f5d24a17f550c263e349fd19cb521 Mon Sep 17 00:00:00 2001 From: Barry Date: Sat, 6 Nov 2021 16:53:05 +0000 Subject: [PATCH 2/4] Add page counts to the remainders --- ...depth.sql => jsonld_depth_percentiles.sql} | 33 +++---- .../structured-data/microformats2_types.sql | 45 ++++++---- sql/2021/structured-data/open_graph_types.sql | 45 ++++++---- sql/2021/structured-data/present_types.sql | 90 ++++++------------- sql/2021/structured-data/rdfa_prefixes.sql | 50 +++++++---- sql/2021/structured-data/rdfa_type_ofs.sql | 45 ++++++---- sql/2021/structured-data/rdfa_vocabs.sql | 48 ++++++---- sql/2021/structured-data/summary_count.sql | 68 ++++++-------- sql/2021/structured-data/twitter_types.sql | 45 ++++++---- 9 files changed, 247 insertions(+), 222 deletions(-) rename sql/2021/structured-data/{jsonld_max_depth.sql => jsonld_depth_percentiles.sql} (72%) diff --git a/sql/2021/structured-data/jsonld_max_depth.sql b/sql/2021/structured-data/jsonld_depth_percentiles.sql similarity index 72% rename from sql/2021/structured-data/jsonld_max_depth.sql rename to sql/2021/structured-data/jsonld_depth_percentiles.sql index 2456b044d41..972db82b121 100644 --- a/sql/2021/structured-data/jsonld_max_depth.sql +++ b/sql/2021/structured-data/jsonld_depth_percentiles.sql @@ -41,26 +41,27 @@ LANGUAGE js AS """ } """; -WITH -rendered_data AS ( +WITH rendered_data AS ( SELECT - getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + FROM + `httparchive.pages.2021_07_01_*` ) SELECT - MAX(jsonld_entity_relationship.depth) AS max_depth, - client + client, + percentile, + APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth, + ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls FROM rendered_data, - UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship + UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile GROUP BY - client + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2021/structured-data/microformats2_types.sql b/sql/2021/structured-data/microformats2_types.sql index 93a21756f70..e40a2a97636 100644 --- a/sql/2021/structured-data/microformats2_types.sql +++ b/sql/2021/structured-data/microformats2_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getMicroformats2Types(rendered) AS microformats2_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getMicroformats2Types(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS microformats2_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, microformats2_type.name AS microformats2_type, - SUM(microformats2_type.count) AS count, - SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS total, - SUM(microformats2_type.count) / SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS pct, - client + SUM(microformats2_type.count) AS freq_microformats2, + SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS total_microformats2_type, + SUM(microformats2_type.count) / SUM(SUM(microformats2_type.count)) OVER (PARTITION BY client) AS pct_microformats2_type, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(microformats2_types) AS microformats2_type +JOIN + page_totals +USING (client) GROUP BY + client, microformats2_type, - client + total_pages ORDER BY - count DESC + pct_microformats2_type DESC, + client diff --git a/sql/2021/structured-data/open_graph_types.sql b/sql/2021/structured-data/open_graph_types.sql index 5c901b559a5..ee326515658 100644 --- a/sql/2021/structured-data/open_graph_types.sql +++ b/sql/2021/structured-data/open_graph_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getOpenGraphTypes(rendered) AS open_graph_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getOpenGraphTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS open_graph_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, open_graph_type, - COUNT(open_graph_type) AS count, - SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS total, - COUNT(open_graph_type) / SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(open_graph_type) AS freq_open_graph_type, + SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS total_open_graph_types, + COUNT(open_graph_type) / SUM(COUNT(open_graph_type)) OVER (PARTITION BY client) AS pct_open_graph_types, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(open_graph_types) AS open_graph_type +JOIN + page_totals +USING (client) GROUP BY + client, open_graph_type, - client + total_pages ORDER BY - count DESC + pct_open_graph_types DESC, + client diff --git a/sql/2021/structured-data/present_types.sql b/sql/2021/structured-data/present_types.sql index 4d81abcd82f..bcd0d66b1ec 100644 --- a/sql/2021/structured-data/present_types.sql +++ b/sql/2021/structured-data/present_types.sql @@ -1,73 +1,35 @@ #standardSQL # A count of pages which include each type of structured data SELECT - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS rdfa, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS json_ld, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microdata, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microformats2, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS microformats_classic, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS dublin_core, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS twitter, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS facebook, - SUM(CASE - WHEN CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE THEN 1 - ELSE - 0 - END - ) AS opengraph, - SUM(CASE - WHEN JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL THEN 1 - ELSE - 0 - END - ) AS total, - client + client, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE) AS rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE) AS json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE) AS microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE) AS microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE) AS microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE) AS dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE) AS twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE) AS facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE) AS opengraph, + COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) AS total_structured_data_ran, + COUNT(0) AS total_pages, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE) / COUNT(0) AS pct_rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE) / COUNT(0) AS pct_json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE) / COUNT(0) AS pct_microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE) / COUNT(0) AS pct_microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE) / COUNT(0) AS pct_microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE) / COUNT(0) AS pct_dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE) / COUNT(0) AS pct_twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE) / COUNT(0) AS pct_facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE) / COUNT(0) AS pct_opengraph, + COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) / COUNT(0) AS pct_total_structured_data_ran, FROM ( SELECT - JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')) AS structured_data, - _TABLE_SUFFIX AS client + _TABLE_SUFFIX AS client, + JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')) AS structured_data FROM `httparchive.pages.2021_07_01_*`) GROUP BY client +ORDER BY + client diff --git a/sql/2021/structured-data/rdfa_prefixes.sql b/sql/2021/structured-data/rdfa_prefixes.sql index cda1bd82c27..2ed685a22fe 100644 --- a/sql/2021/structured-data/rdfa_prefixes.sql +++ b/sql/2021/structured-data/rdfa_prefixes.sql @@ -5,7 +5,7 @@ RETURNS ARRAY LANGUAGE js AS """ try { rendered = JSON.parse(rendered); - const prefixRegExp = new RegExp(/(?[^:]*):\s+(?[^\s]*)\s*/gm) + const prefixRegExp = new RegExp(/(?[^:]*):\\s+(?[^\\s]*)\\s*/gm) return rendered.rdfa_prefixes.map(prefix => { const matches = [...prefix.toLowerCase().trim().matchAll(prefixRegExp)]; return matches.map(match => match.groups.uri); @@ -18,37 +18,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaPrefixes(rendered) AS rdfa_prefixes, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaPrefixes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_prefixes + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_prefix, - COUNT(rdfa_prefix) AS count, - SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_prefix) / SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_prefix) AS freq_rdfa, + SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS total_rdfa, + COUNT(rdfa_prefix) / SUM(COUNT(rdfa_prefix)) OVER (PARTITION BY client) AS pct_rdfa, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. CONCAT(NET.REG_DOMAIN(rdfa_prefix), SPLIT(rdfa_prefix, NET.REG_DOMAIN(rdfa_prefix))[SAFE_OFFSET(1)]) AS rdfa_prefix, - client FROM rendered_data, UNNEST(rdfa_prefixes) AS rdfa_prefix ) +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_prefix, - client + total_pages ORDER BY - count DESC + pct_rdfa DESC, + client diff --git a/sql/2021/structured-data/rdfa_type_ofs.sql b/sql/2021/structured-data/rdfa_type_ofs.sql index 619d4fc7f69..ec8698dceba 100644 --- a/sql/2021/structured-data/rdfa_type_ofs.sql +++ b/sql/2021/structured-data/rdfa_type_ofs.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaTypeOfs(rendered) AS rdfa_type_ofs, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaTypeOfs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_type_ofs + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_type_of, - COUNT(rdfa_type_of) AS count, - SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_type_of) / SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_type_of) AS freq_rdfa_type_of, + SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS total_rdfa_type_of, + COUNT(rdfa_type_of) / SUM(COUNT(rdfa_type_of)) OVER (PARTITION BY client) AS pct_rdfa_type_of, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(rdfa_type_ofs) AS rdfa_type_of +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_type_of, - client + total_pages ORDER BY - count DESC + pct_rdfa_type_of DESC, + client diff --git a/sql/2021/structured-data/rdfa_vocabs.sql b/sql/2021/structured-data/rdfa_vocabs.sql index 772090e5bde..eb6a84f3bb0 100644 --- a/sql/2021/structured-data/rdfa_vocabs.sql +++ b/sql/2021/structured-data/rdfa_vocabs.sql @@ -14,37 +14,51 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getRDFaVocabs(rendered) AS rdfa_vocabs, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getRDFaVocabs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_vocabs + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, rdfa_vocab, - COUNT(rdfa_vocab) AS count, - SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS total, - COUNT(rdfa_vocab) / SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS pct, - client + COUNT(rdfa_vocab) AS freq_rdfa_vocab, + SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS total_rdfa_vocab, + COUNT(rdfa_vocab) / SUM(COUNT(rdfa_vocab)) OVER (PARTITION BY client) AS pct_rdfa_vocab, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM ( SELECT + client, + url, -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. CONCAT(NET.REG_DOMAIN(rdfa_vocab), SPLIT(rdfa_vocab, NET.REG_DOMAIN(rdfa_vocab))[SAFE_OFFSET(1)]) AS rdfa_vocab, - client FROM rendered_data, UNNEST(rdfa_vocabs) AS rdfa_vocab ) +JOIN + page_totals +USING (client) GROUP BY + client, rdfa_vocab, - client + total_pages ORDER BY - count DESC + pct_pages DESC, + client diff --git a/sql/2021/structured-data/summary_count.sql b/sql/2021/structured-data/summary_count.sql index 166864c2508..894c798cb8d 100644 --- a/sql/2021/structured-data/summary_count.sql +++ b/sql/2021/structured-data/summary_count.sql @@ -1,51 +1,33 @@ #standardSQL # A summary count of the pages run against SELECT - SUM(CASE - WHEN success IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS success, - SUM(CASE - WHEN errors IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS errors, - SUM(CASE - WHEN success IS NOT NULL AND errors IS NULL THEN 1 - ELSE - 0 - END - ) AS success_no_errors, - SUM(CASE - WHEN errors IS NOT NULL AND success IS NULL THEN 1 - ELSE - 0 - END - ) AS errors_no_success, - SUM(CASE - WHEN success IS NOT NULL AND errors IS NOT NULL THEN 1 - ELSE - 0 - END - ) AS success_errors, - SUM(CASE - WHEN success IS NULL AND errors IS NULL THEN 1 - ELSE - 0 - END - ) AS no_success_no_errors + client, + COUNTIF(success IS NOT NULL) AS success, + COUNTIF(errors IS NOT NULL) AS errors, + COUNTIF(success IS NOT NULL AND errors is NULL) AS success_no_errors, + COUNTIF(errors IS NOT NULL AND success is NULL) AS errors_no_success, + COUNTIF(success IS NOT NULL AND errors is NOT NULL) AS success_errors, + COUNTIF(success IS NULL AND errors IS NULL) AS no_success_no_errors, + COUNTIF(success IS NOT NULL) / COUNT(0) AS pct_success, + COUNTIF(errors IS NOT NULL) / COUNT(0) AS pct_errors, + COUNTIF(success IS NOT NULL AND errors is NULL) / COUNT(0) AS pct_success_no_errors, + COUNTIF(errors IS NOT NULL AND success is NULL) / COUNT(0) AS pct_errors_no_success, + COUNTIF(success IS NOT NULL AND errors is NOT NULL) / COUNT(0) AS pct_success_errors, + COUNTIF(success IS NULL AND errors IS NULL) / COUNT(0) AS pct_no_success_no_errors, FROM ( SELECT - JSON_EXTRACT(structured_data, - '$.structured_data') AS success, - JSON_EXTRACT(structured_data, - '$.log') AS errors + client, + JSON_EXTRACT(structured_data, '$.structured_data') AS success, + JSON_EXTRACT(structured_data, '$.log') AS errors FROM ( SELECT - JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')) AS structured_data + _TABLE_SUFFIX AS client, + JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')) AS structured_data FROM - `httparchive.pages.2021_07_01_*`)) + `httparchive.pages.2021_07_01_*` + ) +) +GROUP BY + client +ORDER BY + client diff --git a/sql/2021/structured-data/twitter_types.sql b/sql/2021/structured-data/twitter_types.sql index cee530c258f..f68c211ef2d 100644 --- a/sql/2021/structured-data/twitter_types.sql +++ b/sql/2021/structured-data/twitter_types.sql @@ -14,29 +14,42 @@ LANGUAGE js AS """ WITH rendered_data AS ( SELECT - getTwitterTypes(rendered) AS twitter_types, - client - FROM ( - SELECT - JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, - '$._structured-data')), - '$.structured_data.rendered') AS rendered, - _TABLE_SUFFIX AS client - FROM - `httparchive.pages.2021_07_01_*`) + _TABLE_SUFFIX AS client, + url, + getTwitterTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS twitter_types + FROM + `httparchive.pages.2021_07_01_*` +), + +page_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total_pages + FROM + `httparchive.pages.2021_07_01_*` + GROUP BY + _TABLE_SUFFIX ) SELECT + client, twitter_type, - COUNT(twitter_type) AS count, - SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS total, - COUNT(twitter_type) / SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS pct, - client + COUNT(twitter_type) AS freq_twitter, + SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS total_twitter, + COUNT(twitter_type) / SUM(COUNT(twitter_type)) OVER (PARTITION BY client) AS pct_twitter, + COUNT(DISTINCT url) AS freq_pages, + total_pages, + COUNT(DISTINCT url) / total_pages AS pct_pages FROM rendered_data, UNNEST(twitter_types) AS twitter_type +JOIN + page_totals +USING (client) GROUP BY + client, twitter_type, - client + total_pages ORDER BY - count DESC + pct_twitter DESC, + client From b6d3b7ac6736fe98c7fc117c8139de3978a40912 Mon Sep 17 00:00:00 2001 From: Barry Date: Sat, 6 Nov 2021 16:56:06 +0000 Subject: [PATCH 3/4] Linting fixes --- .../structured-data/classic_microformats_types.sql | 2 +- sql/2021/structured-data/dublin_core_types.sql | 2 +- sql/2021/structured-data/facebook_types.sql | 2 +- sql/2021/structured-data/jsonld_contexts.sql | 2 +- .../structured-data/jsonld_depth_percentiles.sql | 2 +- .../jsonld_entities_relationships.sql | 2 +- .../structured-data/jsonld_relationship_values.sql | 2 +- sql/2021/structured-data/jsonld_relationships.sql | 2 +- sql/2021/structured-data/jsonld_same_ases.sql | 2 +- sql/2021/structured-data/jsonld_types.sql | 2 +- sql/2021/structured-data/microdata_item_types.sql | 2 +- sql/2021/structured-data/microformats2_types.sql | 2 +- sql/2021/structured-data/open_graph_types.sql | 2 +- sql/2021/structured-data/present_types.sql | 2 +- sql/2021/structured-data/rdfa_prefixes.sql | 4 ++-- sql/2021/structured-data/rdfa_type_ofs.sql | 2 +- sql/2021/structured-data/rdfa_vocabs.sql | 4 ++-- sql/2021/structured-data/summary_count.sql | 14 +++++++------- sql/2021/structured-data/twitter_types.sql | 2 +- 19 files changed, 27 insertions(+), 27 deletions(-) diff --git a/sql/2021/structured-data/classic_microformats_types.sql b/sql/2021/structured-data/classic_microformats_types.sql index 8af1bdfc8e3..d61ea9968cb 100644 --- a/sql/2021/structured-data/classic_microformats_types.sql +++ b/sql/2021/structured-data/classic_microformats_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS classic_microformats_types + getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/dublin_core_types.sql b/sql/2021/structured-data/dublin_core_types.sql index 3cdda147387..9b8e24b73aa 100644 --- a/sql/2021/structured-data/dublin_core_types.sql +++ b/sql/2021/structured-data/dublin_core_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS dublin_core_types + getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/facebook_types.sql b/sql/2021/structured-data/facebook_types.sql index a7ab2449247..7b49be95517 100644 --- a/sql/2021/structured-data/facebook_types.sql +++ b/sql/2021/structured-data/facebook_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS facebook_type + getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_contexts.sql b/sql/2021/structured-data/jsonld_contexts.sql index 2af76dcc2db..6ee68e5c88e 100644 --- a/sql/2021/structured-data/jsonld_contexts.sql +++ b/sql/2021/structured-data/jsonld_contexts.sql @@ -35,7 +35,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_context + getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_depth_percentiles.sql b/sql/2021/structured-data/jsonld_depth_percentiles.sql index 972db82b121..23b3a92ade0 100644 --- a/sql/2021/structured-data/jsonld_depth_percentiles.sql +++ b/sql/2021/structured-data/jsonld_depth_percentiles.sql @@ -45,7 +45,7 @@ WITH rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships FROM `httparchive.pages.2021_07_01_*` ) diff --git a/sql/2021/structured-data/jsonld_entities_relationships.sql b/sql/2021/structured-data/jsonld_entities_relationships.sql index 3e7950cf92d..ba2dfcafae9 100644 --- a/sql/2021/structured-data/jsonld_entities_relationships.sql +++ b/sql/2021/structured-data/jsonld_entities_relationships.sql @@ -46,7 +46,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_relationship_values.sql b/sql/2021/structured-data/jsonld_relationship_values.sql index 9a33e1f8289..d5a3cffe32c 100644 --- a/sql/2021/structured-data/jsonld_relationship_values.sql +++ b/sql/2021/structured-data/jsonld_relationship_values.sql @@ -46,7 +46,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_entities_relationships + getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_relationships.sql b/sql/2021/structured-data/jsonld_relationships.sql index 0e707cabb63..83c71174dcd 100644 --- a/sql/2021/structured-data/jsonld_relationships.sql +++ b/sql/2021/structured-data/jsonld_relationships.sql @@ -35,7 +35,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_relationships + getJSONLDRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_relationships FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_same_ases.sql b/sql/2021/structured-data/jsonld_same_ases.sql index e665e2437f3..9677fb27e96 100644 --- a/sql/2021/structured-data/jsonld_same_ases.sql +++ b/sql/2021/structured-data/jsonld_same_ases.sql @@ -35,7 +35,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDSameAses(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_sameases + getJSONLDSameAses(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_sameases FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/jsonld_types.sql b/sql/2021/structured-data/jsonld_types.sql index 6a9341a5a85..988292c62c2 100644 --- a/sql/2021/structured-data/jsonld_types.sql +++ b/sql/2021/structured-data/jsonld_types.sql @@ -35,7 +35,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getJSONLDTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS jsonld_types + getJSONLDTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/microdata_item_types.sql b/sql/2021/structured-data/microdata_item_types.sql index dcb185b0c27..2f6fea91485 100644 --- a/sql/2021/structured-data/microdata_item_types.sql +++ b/sql/2021/structured-data/microdata_item_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getMicrodataItemTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS microdata_item_types + getMicrodataItemTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS microdata_item_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/microformats2_types.sql b/sql/2021/structured-data/microformats2_types.sql index e40a2a97636..6174a2a2385 100644 --- a/sql/2021/structured-data/microformats2_types.sql +++ b/sql/2021/structured-data/microformats2_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getMicroformats2Types(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS microformats2_types + getMicroformats2Types(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS microformats2_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/open_graph_types.sql b/sql/2021/structured-data/open_graph_types.sql index ee326515658..334cd7b0868 100644 --- a/sql/2021/structured-data/open_graph_types.sql +++ b/sql/2021/structured-data/open_graph_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getOpenGraphTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS open_graph_types + getOpenGraphTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS open_graph_types FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/present_types.sql b/sql/2021/structured-data/present_types.sql index bcd0d66b1ec..0c5b8867cda 100644 --- a/sql/2021/structured-data/present_types.sql +++ b/sql/2021/structured-data/present_types.sql @@ -22,7 +22,7 @@ SELECT COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE) / COUNT(0) AS pct_twitter, COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE) / COUNT(0) AS pct_facebook, COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE) / COUNT(0) AS pct_opengraph, - COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) / COUNT(0) AS pct_total_structured_data_ran, + COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) / COUNT(0) AS pct_total_structured_data_ran FROM ( SELECT _TABLE_SUFFIX AS client, diff --git a/sql/2021/structured-data/rdfa_prefixes.sql b/sql/2021/structured-data/rdfa_prefixes.sql index 2ed685a22fe..d7925caa06d 100644 --- a/sql/2021/structured-data/rdfa_prefixes.sql +++ b/sql/2021/structured-data/rdfa_prefixes.sql @@ -20,7 +20,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getRDFaPrefixes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_prefixes + getRDFaPrefixes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_prefixes FROM `httparchive.pages.2021_07_01_*` ), @@ -51,7 +51,7 @@ FROM ( -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(rdfa_prefix), SPLIT(rdfa_prefix, NET.REG_DOMAIN(rdfa_prefix))[SAFE_OFFSET(1)]) AS rdfa_prefix, + CONCAT(NET.REG_DOMAIN(rdfa_prefix), SPLIT(rdfa_prefix, NET.REG_DOMAIN(rdfa_prefix))[SAFE_OFFSET(1)]) AS rdfa_prefix FROM rendered_data, UNNEST(rdfa_prefixes) AS rdfa_prefix diff --git a/sql/2021/structured-data/rdfa_type_ofs.sql b/sql/2021/structured-data/rdfa_type_ofs.sql index ec8698dceba..cc90e4e542c 100644 --- a/sql/2021/structured-data/rdfa_type_ofs.sql +++ b/sql/2021/structured-data/rdfa_type_ofs.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getRDFaTypeOfs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_type_ofs + getRDFaTypeOfs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_type_ofs FROM `httparchive.pages.2021_07_01_*` ), diff --git a/sql/2021/structured-data/rdfa_vocabs.sql b/sql/2021/structured-data/rdfa_vocabs.sql index eb6a84f3bb0..a2c2490404c 100644 --- a/sql/2021/structured-data/rdfa_vocabs.sql +++ b/sql/2021/structured-data/rdfa_vocabs.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getRDFaVocabs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS rdfa_vocabs + getRDFaVocabs(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS rdfa_vocabs FROM `httparchive.pages.2021_07_01_*` ), @@ -47,7 +47,7 @@ FROM ( -- Removes the protocol and any subdomains from the URL. -- e.g. "https://my.example.com/pathname" becomes "example.com/pathname" -- This is done to normalize the URL a bit before counting. - CONCAT(NET.REG_DOMAIN(rdfa_vocab), SPLIT(rdfa_vocab, NET.REG_DOMAIN(rdfa_vocab))[SAFE_OFFSET(1)]) AS rdfa_vocab, + CONCAT(NET.REG_DOMAIN(rdfa_vocab), SPLIT(rdfa_vocab, NET.REG_DOMAIN(rdfa_vocab))[SAFE_OFFSET(1)]) AS rdfa_vocab FROM rendered_data, UNNEST(rdfa_vocabs) AS rdfa_vocab diff --git a/sql/2021/structured-data/summary_count.sql b/sql/2021/structured-data/summary_count.sql index 894c798cb8d..5847d67a169 100644 --- a/sql/2021/structured-data/summary_count.sql +++ b/sql/2021/structured-data/summary_count.sql @@ -4,16 +4,16 @@ SELECT client, COUNTIF(success IS NOT NULL) AS success, COUNTIF(errors IS NOT NULL) AS errors, - COUNTIF(success IS NOT NULL AND errors is NULL) AS success_no_errors, - COUNTIF(errors IS NOT NULL AND success is NULL) AS errors_no_success, - COUNTIF(success IS NOT NULL AND errors is NOT NULL) AS success_errors, + COUNTIF(success IS NOT NULL AND errors IS NULL) AS success_no_errors, + COUNTIF(errors IS NOT NULL AND success IS NULL) AS errors_no_success, + COUNTIF(success IS NOT NULL AND errors IS NOT NULL) AS success_errors, COUNTIF(success IS NULL AND errors IS NULL) AS no_success_no_errors, COUNTIF(success IS NOT NULL) / COUNT(0) AS pct_success, COUNTIF(errors IS NOT NULL) / COUNT(0) AS pct_errors, - COUNTIF(success IS NOT NULL AND errors is NULL) / COUNT(0) AS pct_success_no_errors, - COUNTIF(errors IS NOT NULL AND success is NULL) / COUNT(0) AS pct_errors_no_success, - COUNTIF(success IS NOT NULL AND errors is NOT NULL) / COUNT(0) AS pct_success_errors, - COUNTIF(success IS NULL AND errors IS NULL) / COUNT(0) AS pct_no_success_no_errors, + COUNTIF(success IS NOT NULL AND errors IS NULL) / COUNT(0) AS pct_success_no_errors, + COUNTIF(errors IS NOT NULL AND success IS NULL) / COUNT(0) AS pct_errors_no_success, + COUNTIF(success IS NOT NULL AND errors IS NOT NULL) / COUNT(0) AS pct_success_errors, + COUNTIF(success IS NULL AND errors IS NULL) / COUNT(0) AS pct_no_success_no_errors FROM ( SELECT client, diff --git a/sql/2021/structured-data/twitter_types.sql b/sql/2021/structured-data/twitter_types.sql index f68c211ef2d..128462c4231 100644 --- a/sql/2021/structured-data/twitter_types.sql +++ b/sql/2021/structured-data/twitter_types.sql @@ -16,7 +16,7 @@ rendered_data AS ( SELECT _TABLE_SUFFIX AS client, url, - getTwitterTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,'$._structured-data')),'$.structured_data.rendered')) AS twitter_types + getTwitterTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS twitter_types FROM `httparchive.pages.2021_07_01_*` ), From 91d90adf236e8eaa1bc85ffab0f630024cce132a Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 7 Nov 2021 16:59:02 +0000 Subject: [PATCH 4/4] Review feedback --- sql/2021/structured-data/present_types.sql | 36 +++++++++++----------- sql/2021/structured-data/rdfa_prefixes.sql | 4 +-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/sql/2021/structured-data/present_types.sql b/sql/2021/structured-data/present_types.sql index 0c5b8867cda..7a0ccef484e 100644 --- a/sql/2021/structured-data/present_types.sql +++ b/sql/2021/structured-data/present_types.sql @@ -2,26 +2,26 @@ # A count of pages which include each type of structured data SELECT client, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE) AS rdfa, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE) AS json_ld, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE) AS microdata, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE) AS microformats2, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE) AS microformats_classic, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE) AS dublin_core, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE) AS twitter, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE) AS facebook, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE) AS opengraph, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL)) AS rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL)) AS json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL)) AS microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL)) AS microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL)) AS microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL)) AS dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL)) AS twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL)) AS facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL)) AS opengraph, COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) AS total_structured_data_ran, COUNT(0) AS total_pages, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL) = TRUE) / COUNT(0) AS pct_rdfa, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL) = TRUE) / COUNT(0) AS pct_json_ld, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL) = TRUE) / COUNT(0) AS pct_microdata, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL) = TRUE) / COUNT(0) AS pct_microformats2, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL) = TRUE) / COUNT(0) AS pct_microformats_classic, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL) = TRUE) / COUNT(0) AS pct_dublin_core, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL) = TRUE) / COUNT(0) AS pct_twitter, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL) = TRUE) / COUNT(0) AS pct_facebook, - COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL) = TRUE) / COUNT(0) AS pct_opengraph, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.rdfa') AS BOOL)) / COUNT(0) AS pct_rdfa, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.json_ld') AS BOOL)) / COUNT(0) AS pct_json_ld, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microdata') AS BOOL)) / COUNT(0) AS pct_microdata, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats2') AS BOOL)) / COUNT(0) AS pct_microformats2, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.microformats_classic') AS BOOL)) / COUNT(0) AS pct_microformats_classic, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.dublin_core') AS BOOL)) / COUNT(0) AS pct_dublin_core, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.twitter') AS BOOL)) / COUNT(0) AS pct_twitter, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.facebook') AS BOOL)) / COUNT(0) AS pct_facebook, + COUNTIF(CAST(JSON_EXTRACT(structured_data, '$.structured_data.rendered.present.opengraph') AS BOOL)) / COUNT(0) AS pct_opengraph, COUNTIF(JSON_EXTRACT(structured_data, '$.structured_data') IS NOT NULL AND JSON_EXTRACT(structured_data, '$.log') IS NULL) / COUNT(0) AS pct_total_structured_data_ran FROM ( SELECT diff --git a/sql/2021/structured-data/rdfa_prefixes.sql b/sql/2021/structured-data/rdfa_prefixes.sql index d7925caa06d..2725b414408 100644 --- a/sql/2021/structured-data/rdfa_prefixes.sql +++ b/sql/2021/structured-data/rdfa_prefixes.sql @@ -2,10 +2,10 @@ # Count RDFa Prefixes CREATE TEMP FUNCTION getRDFaPrefixes(rendered STRING) RETURNS ARRAY -LANGUAGE js AS """ +LANGUAGE js AS r""" try { rendered = JSON.parse(rendered); - const prefixRegExp = new RegExp(/(?[^:]*):\\s+(?[^\\s]*)\\s*/gm) + const prefixRegExp = new RegExp(/(?[^:]*):\s+(?[^\s]*)\s*/gm) return rendered.rdfa_prefixes.map(prefix => { const matches = [...prefix.toLowerCase().trim().matchAll(prefixRegExp)]; return matches.map(match => match.groups.uri);