-
-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Structured Data 2024 queries (#3757)
* Queries for Structured Data 2024 * Adjusted Queries * Linting fixes --------- Co-authored-by: Barry Pollard <[email protected]>
- Loading branch information
1 parent
a64ea4e
commit 7c438ed
Showing
19 changed files
with
1,288 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# standardSQL | ||
# classic_microformats_types.sql | ||
# Count Classic Microformats types | ||
|
||
CREATE TEMP FUNCTION getClassicMicroformatsTypes(rendered STRING) | ||
RETURNS ARRAY<STRUCT<name STRING, count NUMERIC>> | ||
LANGUAGE js AS """ | ||
try { | ||
rendered = JSON.parse(rendered); | ||
return rendered.microformats_classic_types.map(microformats_classic_type => ({name: microformats_classic_type.name, count: microformats_classic_type.count})); | ||
} catch (e) { | ||
return []; | ||
} | ||
"""; | ||
|
||
WITH | ||
rendered_data AS ( | ||
SELECT | ||
client, | ||
root_page AS url, | ||
getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
), | ||
|
||
page_totals AS ( | ||
SELECT | ||
client, | ||
COUNT(DISTINCT root_page) AS total_pages | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
GROUP BY | ||
client | ||
) | ||
|
||
SELECT | ||
client, | ||
classic_microformats_type.name AS classic_microformats_type, | ||
SUM(classic_microformats_type.count) AS freq_microformat, | ||
SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat, | ||
SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat, | ||
COUNT(DISTINCT url) AS freq_pages, | ||
total_pages, | ||
COUNT(DISTINCT url) / total_pages AS pct_pages | ||
FROM | ||
rendered_data, | ||
UNNEST(classic_microformats_types) AS classic_microformats_type | ||
JOIN | ||
page_totals | ||
USING (client) | ||
GROUP BY | ||
client, | ||
classic_microformats_type, | ||
total_pages | ||
ORDER BY | ||
freq_microformat DESC, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# standardSQL | ||
# dublin_core_types.sql | ||
# Count Dublin Core types | ||
CREATE TEMP FUNCTION getDublinCoreTypes(rendered STRING) | ||
RETURNS ARRAY<STRING> | ||
LANGUAGE js AS """ | ||
try { | ||
rendered = JSON.parse(rendered); | ||
return rendered.dublin_core.map(dublin_core => dublin_core.name.toLowerCase()); | ||
} catch (e) { | ||
return []; | ||
} | ||
"""; | ||
|
||
WITH | ||
rendered_data AS ( | ||
SELECT | ||
client, | ||
root_page AS url, | ||
getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
), | ||
|
||
page_totals AS ( | ||
SELECT | ||
client, | ||
COUNT(DISTINCT root_page) AS total_pages | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
GROUP BY | ||
client | ||
) | ||
|
||
SELECT | ||
client, | ||
dublin_core_type, | ||
COUNT(dublin_core_type) AS count, | ||
SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core, | ||
COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core, | ||
COUNT(DISTINCT url) AS freq_pages, | ||
total_pages, | ||
COUNT(DISTINCT url) / total_pages AS pct_pages | ||
FROM | ||
rendered_data, | ||
UNNEST(dublin_core_types) AS dublin_core_type | ||
JOIN | ||
page_totals | ||
USING (client) | ||
GROUP BY | ||
client, | ||
dublin_core_type, | ||
total_pages | ||
ORDER BY | ||
pct_dublin_core DESC, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# standardSQL | ||
# facebook_types.sql | ||
# Count Facebook types | ||
CREATE TEMP FUNCTION getFacebookTypes(rendered STRING) | ||
RETURNS ARRAY<STRING> | ||
LANGUAGE js AS """ | ||
try { | ||
rendered = JSON.parse(rendered); | ||
return rendered.facebook.map(facebook => facebook.property.toLowerCase()); | ||
} catch (e) { | ||
return []; | ||
} | ||
"""; | ||
|
||
WITH | ||
rendered_data AS ( | ||
SELECT | ||
client, | ||
root_page AS url, | ||
getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
), | ||
|
||
page_totals AS ( | ||
SELECT | ||
client, | ||
COUNT(DISTINCT root_page) AS total_pages | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
GROUP BY | ||
client | ||
) | ||
|
||
SELECT | ||
client, | ||
facebook_type, | ||
COUNT(facebook_type) AS freq_facebook, | ||
SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook, | ||
COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook, | ||
COUNT(DISTINCT url) AS freq_pages, | ||
total_pages, | ||
COUNT(DISTINCT url) / total_pages AS pct_pages | ||
FROM | ||
rendered_data, | ||
UNNEST(facebook_type) AS facebook_type | ||
JOIN | ||
page_totals | ||
USING (client) | ||
GROUP BY | ||
client, | ||
facebook_type, | ||
total_pages | ||
ORDER BY | ||
freq_facebook DESC, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# standardSQL | ||
# jsonld_contexts.sql | ||
# Count JSON-LD contexts | ||
CREATE TEMP FUNCTION getJSONLDContexts(rendered STRING) | ||
RETURNS ARRAY<STRING> | ||
LANGUAGE js AS """ | ||
try { | ||
const arrayify = (value) => Array.isArray(value) ? value : [value]; | ||
const getDeep = (key, o) => { | ||
if (Array.isArray(o)) return o.map(child => getDeep(key, child)).flat(); | ||
if (o instanceof Object) { | ||
return Object.entries(o).map(([k, value]) => { | ||
if (k === key) return [...arrayify(value), ...getDeep(value)]; | ||
return getDeep(value); | ||
}).flat(); | ||
} | ||
return []; | ||
} | ||
rendered = JSON.parse(rendered); | ||
const jsonld_scripts = rendered.jsonld_scripts; | ||
return jsonld_scripts.map(jsonld_script => { | ||
jsonld_script = JSON.parse(jsonld_script); | ||
return getDeep('@context', jsonld_script); | ||
}).flat().filter(context => typeof context === 'string'); | ||
} catch (e) { | ||
return []; | ||
} | ||
"""; | ||
|
||
WITH | ||
rendered_data AS ( | ||
SELECT | ||
client, | ||
root_page AS url, | ||
getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
), | ||
|
||
page_totals AS ( | ||
SELECT | ||
client, | ||
COUNT(DISTINCT root_page) AS total_pages | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
GROUP BY | ||
client | ||
) | ||
|
||
SELECT | ||
client, | ||
NET.REG_DOMAIN(jsonld_context) AS jsonld_context, | ||
COUNT(0) AS freq_jsonld_context, | ||
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context, | ||
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context, | ||
COUNT(DISTINCT url) AS freq_pages, | ||
total_pages, | ||
COUNT(DISTINCT url) / total_pages AS pct_pages | ||
FROM | ||
rendered_data, | ||
UNNEST(jsonld_context) AS jsonld_context | ||
JOIN | ||
page_totals | ||
USING (client) | ||
GROUP BY | ||
client, | ||
jsonld_context, | ||
total_pages | ||
ORDER BY | ||
pct_jsonld_context DESC, | ||
client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# standardSQL | ||
# jsonld_depth_percentiles.sql | ||
# Find the most nested entity in a JSON-LD document | ||
CREATE TEMP FUNCTION getJSONLDEntitiesRelationships(rendered STRING) | ||
RETURNS ARRAY<STRUCT<_from STRING, relationship STRING, _to STRING, depth NUMERIC>> | ||
LANGUAGE js AS """ | ||
try { | ||
const types = new Map(); | ||
const loadTypes = (o) => { | ||
if (Array.isArray(o)) { | ||
o.forEach(loadTypes); | ||
} else if (o instanceof Object) { | ||
if (o['@id'] && o['@type']) { | ||
types.set(o['@id'], o['@type']); | ||
} | ||
Object.values(o).forEach(loadTypes); | ||
} | ||
} | ||
const arrayify = (value) => Array.isArray(value) ? value : [value]; | ||
const getEntitiesAndRelationships = (o, _from, relationship, depth = 0) => { | ||
if (Array.isArray(o)) return o.map(value => getEntitiesAndRelationships(value, _from, relationship, depth)).flat(); | ||
if (o instanceof Object) { | ||
const type = types.get(o['@id']) || o['@type']; | ||
return [{_from, relationship, _to: type, depth}, ...Object.entries(o).map(([k, value]) => getEntitiesAndRelationships(value, type, k, depth + 1))].flat(); | ||
} | ||
return []; | ||
} | ||
rendered = JSON.parse(rendered); | ||
const jsonld_scripts = rendered.jsonld_scripts.map(JSON.parse); | ||
loadTypes(jsonld_scripts); | ||
return jsonld_scripts.map(jsonld_script => getEntitiesAndRelationships(jsonld_script, undefined, undefined, 0)).flat(); | ||
} catch (e) { | ||
return []; | ||
} | ||
"""; | ||
|
||
WITH rendered_data AS ( | ||
SELECT | ||
client, | ||
root_page AS url, | ||
getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships | ||
FROM | ||
`httparchive.all.pages` | ||
WHERE | ||
date = '2024-06-01' | ||
) | ||
|
||
SELECT | ||
client, | ||
percentile, | ||
APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth, | ||
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls | ||
FROM | ||
rendered_data, | ||
UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship, | ||
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile | ||
GROUP BY | ||
client, | ||
percentile | ||
ORDER BY | ||
client, | ||
percentile |
Oops, something went wrong.