Structured Data 2024 queries (#3757)

* Queries for Structured Data 2024 * Adjusted Queries * Linting fixes --------- Co-authored-by: Barry Pollard <[email protected]>
HTTPArchive · Oct 12, 2024 · 7c438ed · 7c438ed
1 parent a64ea4e
commit 7c438ed
Show file tree

Hide file tree

Showing 19 changed files with 1,288 additions and 0 deletions.
diff --git a/sql/2024/structured-data/classic_microformats_types.sql b/sql/2024/structured-data/classic_microformats_types.sql
@@ -0,0 +1,61 @@
+# standardSQL
+# classic_microformats_types.sql
+# Count Classic Microformats types
+
+CREATE TEMP FUNCTION getClassicMicroformatsTypes(rendered STRING)
+RETURNS ARRAY<STRUCT<name STRING, count NUMERIC>>
+LANGUAGE js AS """
+  try {
+    rendered = JSON.parse(rendered);
+    return rendered.microformats_classic_types.map(microformats_classic_type => ({name: microformats_classic_type.name, count: microformats_classic_type.count}));
+  } catch (e) {
+    return [];
+  }
+""";
+
+WITH
+rendered_data AS (
+  SELECT
+    client,
+    root_page AS url,
+    getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+),
+
+page_totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  classic_microformats_type.name AS classic_microformats_type,
+  SUM(classic_microformats_type.count) AS freq_microformat,
+  SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat,
+  SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat,
+  COUNT(DISTINCT url) AS freq_pages,
+  total_pages,
+  COUNT(DISTINCT url) / total_pages AS pct_pages
+FROM
+  rendered_data,
+  UNNEST(classic_microformats_types) AS classic_microformats_type
+JOIN
+  page_totals
+USING (client)
+GROUP BY
+  client,
+  classic_microformats_type,
+  total_pages
+ORDER BY
+  freq_microformat DESC,
+  client
diff --git a/sql/2024/structured-data/dublin_core_types.sql b/sql/2024/structured-data/dublin_core_types.sql
@@ -0,0 +1,60 @@
+# standardSQL
+# dublin_core_types.sql
+# Count Dublin Core types
+CREATE TEMP FUNCTION getDublinCoreTypes(rendered STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  try {
+    rendered = JSON.parse(rendered);
+    return rendered.dublin_core.map(dublin_core => dublin_core.name.toLowerCase());
+  } catch (e) {
+    return [];
+  }
+""";
+
+WITH
+rendered_data AS (
+  SELECT
+    client,
+    root_page AS url,
+    getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+),
+
+page_totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  dublin_core_type,
+  COUNT(dublin_core_type) AS count,
+  SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core,
+  COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core,
+  COUNT(DISTINCT url) AS freq_pages,
+  total_pages,
+  COUNT(DISTINCT url) / total_pages AS pct_pages
+FROM
+  rendered_data,
+  UNNEST(dublin_core_types) AS dublin_core_type
+JOIN
+  page_totals
+USING (client)
+GROUP BY
+  client,
+  dublin_core_type,
+  total_pages
+ORDER BY
+  pct_dublin_core DESC,
+  client
diff --git a/sql/2024/structured-data/facebook_types.sql b/sql/2024/structured-data/facebook_types.sql
@@ -0,0 +1,60 @@
+# standardSQL
+# facebook_types.sql
+# Count Facebook types
+CREATE TEMP FUNCTION getFacebookTypes(rendered STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  try {
+    rendered = JSON.parse(rendered);
+    return rendered.facebook.map(facebook => facebook.property.toLowerCase());
+  } catch (e) {
+    return [];
+  }
+""";
+
+WITH
+rendered_data AS (
+  SELECT
+    client,
+    root_page AS url,
+    getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+),
+
+page_totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  facebook_type,
+  COUNT(facebook_type) AS freq_facebook,
+  SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook,
+  COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook,
+  COUNT(DISTINCT url) AS freq_pages,
+  total_pages,
+  COUNT(DISTINCT url) / total_pages AS pct_pages
+FROM
+  rendered_data,
+  UNNEST(facebook_type) AS facebook_type
+JOIN
+  page_totals
+USING (client)
+GROUP BY
+  client,
+  facebook_type,
+  total_pages
+ORDER BY
+  freq_facebook DESC,
+  client
diff --git a/sql/2024/structured-data/jsonld_contexts.sql b/sql/2024/structured-data/jsonld_contexts.sql
@@ -0,0 +1,79 @@
+# standardSQL
+# jsonld_contexts.sql
+# Count JSON-LD contexts
+CREATE TEMP FUNCTION getJSONLDContexts(rendered STRING)
+RETURNS ARRAY<STRING>
+LANGUAGE js AS """
+  try {
+    const arrayify = (value) => Array.isArray(value) ? value : [value];
+
+    const getDeep = (key, o) => {
+      if (Array.isArray(o)) return o.map(child => getDeep(key, child)).flat();
+
+      if (o instanceof Object) {
+        return Object.entries(o).map(([k, value]) => {
+          if (k === key) return [...arrayify(value), ...getDeep(value)];
+          return getDeep(value);
+        }).flat();
+      }
+
+      return [];
+    }
+
+    rendered = JSON.parse(rendered);
+    const jsonld_scripts = rendered.jsonld_scripts;
+    return jsonld_scripts.map(jsonld_script => {
+      jsonld_script = JSON.parse(jsonld_script);
+      return getDeep('@context', jsonld_script);
+    }).flat().filter(context => typeof context === 'string');
+  } catch (e) {
+    return [];
+  }
+""";
+
+WITH
+rendered_data AS (
+  SELECT
+    client,
+    root_page AS url,
+    getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+),
+
+page_totals AS (
+  SELECT
+    client,
+    COUNT(DISTINCT root_page) AS total_pages
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+  GROUP BY
+    client
+)
+
+SELECT
+  client,
+  NET.REG_DOMAIN(jsonld_context) AS jsonld_context,
+  COUNT(0) AS freq_jsonld_context,
+  SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context,
+  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context,
+  COUNT(DISTINCT url) AS freq_pages,
+  total_pages,
+  COUNT(DISTINCT url) / total_pages AS pct_pages
+FROM
+  rendered_data,
+  UNNEST(jsonld_context) AS jsonld_context
+JOIN
+  page_totals
+USING (client)
+GROUP BY
+  client,
+  jsonld_context,
+  total_pages
+ORDER BY
+  pct_jsonld_context DESC,
+  client
diff --git a/sql/2024/structured-data/jsonld_depth_percentiles.sql b/sql/2024/structured-data/jsonld_depth_percentiles.sql
@@ -0,0 +1,70 @@
+# standardSQL
+# jsonld_depth_percentiles.sql
+# Find the most nested entity in a JSON-LD document
+CREATE TEMP FUNCTION getJSONLDEntitiesRelationships(rendered STRING)
+RETURNS ARRAY<STRUCT<_from STRING, relationship STRING, _to STRING, depth NUMERIC>>
+LANGUAGE js AS """
+  try {
+    const types = new Map();
+
+    const loadTypes = (o) => {
+      if (Array.isArray(o)) {
+        o.forEach(loadTypes);
+      } else if (o instanceof Object) {
+        if (o['@id'] && o['@type']) {
+          types.set(o['@id'], o['@type']);
+        }
+
+        Object.values(o).forEach(loadTypes);
+      }
+    }
+
+    const arrayify = (value) => Array.isArray(value) ? value : [value];
+
+    const getEntitiesAndRelationships = (o, _from, relationship, depth = 0) => {
+      if (Array.isArray(o)) return o.map(value => getEntitiesAndRelationships(value, _from, relationship, depth)).flat();
+
+      if (o instanceof Object) {
+        const type = types.get(o['@id']) || o['@type'];
+        return [{_from, relationship, _to: type, depth}, ...Object.entries(o).map(([k, value]) => getEntitiesAndRelationships(value, type, k, depth + 1))].flat();
+      }
+
+      return [];
+    }
+
+    rendered = JSON.parse(rendered);
+    const jsonld_scripts = rendered.jsonld_scripts.map(JSON.parse);
+    loadTypes(jsonld_scripts);
+
+    return jsonld_scripts.map(jsonld_script => getEntitiesAndRelationships(jsonld_script, undefined, undefined, 0)).flat();
+  } catch (e) {
+    return [];
+  }
+""";
+
+WITH rendered_data AS (
+  SELECT
+    client,
+    root_page AS url,
+    getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships
+  FROM
+    `httparchive.all.pages`
+  WHERE
+    date = '2024-06-01'
+)
+
+SELECT
+  client,
+  percentile,
+  APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth,
+  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls
+FROM
+  rendered_data,
+  UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship,
+  UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
+GROUP BY
+  client,
+  percentile
+ORDER BY
+  client,
+  percentile