Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Structured Data 2021 SQL update - add page counts #2442

Merged
merged 4 commits into from
Nov 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 29 additions & 16 deletions sql/2021/structured-data/classic_microformats_types.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,42 @@ LANGUAGE js AS """
WITH
rendered_data AS (
SELECT
getClassicMicroformatsTypes(rendered) AS classic_microformats_types,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types
FROM
`httparchive.pages.2021_07_01_*`
),

page_totals AS (
SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_pages
FROM
`httparchive.pages.2021_07_01_*`
GROUP BY
_TABLE_SUFFIX
)

SELECT
client,
classic_microformats_type.name AS classic_microformats_type,
SUM(classic_microformats_type.count) AS count,
SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total,
SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct,
client
SUM(classic_microformats_type.count) AS freq_microformat,
SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat,
SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(classic_microformats_types) AS classic_microformats_type
JOIN
page_totals
USING (client)
GROUP BY
client,
classic_microformats_type,
client
total_pages
ORDER BY
count DESC
freq_microformat DESC,
client
43 changes: 28 additions & 15 deletions sql/2021/structured-data/dublin_core_types.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,42 @@ LANGUAGE js AS """
WITH
rendered_data AS (
SELECT
getDublinCoreTypes(rendered) AS dublin_core_types,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types
FROM
`httparchive.pages.2021_07_01_*`
),

page_totals AS (
SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_pages
FROM
`httparchive.pages.2021_07_01_*`
GROUP BY
_TABLE_SUFFIX
)

SELECT
client,
dublin_core_type,
COUNT(dublin_core_type) AS count,
SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS total,
COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct,
client
SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core,
COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(dublin_core_types) AS dublin_core_type
JOIN
page_totals
USING (client)
GROUP BY
client,
dublin_core_type,
client
total_pages
ORDER BY
count DESC
pct_dublin_core DESC,
client
47 changes: 30 additions & 17 deletions sql/2021/structured-data/facebook_types.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,42 @@ LANGUAGE js AS """
WITH
rendered_data AS (
SELECT
getFacebookTypes(rendered) AS facebook_types,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type
FROM
`httparchive.pages.2021_07_01_*`
),

page_totals AS (
SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_pages
FROM
`httparchive.pages.2021_07_01_*`
GROUP BY
_TABLE_SUFFIX
)

SELECT
client,
facebook_type,
COUNT(facebook_type) AS count,
SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total,
COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct,
client
COUNT(facebook_type) AS freq_facebook,
SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook,
COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(facebook_types) AS facebook_type
UNNEST(facebook_type) AS facebook_type
JOIN
page_totals
USING (client)
GROUP BY
client,
facebook_type,
client
total_pages
ORDER BY
count DESC
freq_facebook DESC,
client
47 changes: 30 additions & 17 deletions sql/2021/structured-data/jsonld_contexts.sql
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,42 @@ LANGUAGE js AS """
WITH
rendered_data AS (
SELECT
getJSONLDContexts(rendered) AS jsonld_contexts,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context
FROM
`httparchive.pages.2021_07_01_*`
),

page_totals AS (
SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_pages
FROM
`httparchive.pages.2021_07_01_*`
GROUP BY
_TABLE_SUFFIX
)

SELECT
client,
NET.REG_DOMAIN(jsonld_context) AS jsonld_context,
COUNT(0) AS count,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct,
client
COUNT(0) AS freq_jsonld_context,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(jsonld_contexts) AS jsonld_context
UNNEST(jsonld_context) AS jsonld_context
JOIN
page_totals
USING (client)
GROUP BY
client,
jsonld_context,
client
total_pages
ORDER BY
count DESC
pct_jsonld_context DESC,
client
Original file line number Diff line number Diff line change
Expand Up @@ -41,26 +41,27 @@ LANGUAGE js AS """
}
""";

WITH
rendered_data AS (
WITH rendered_data AS (
SELECT
getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships
FROM
`httparchive.pages.2021_07_01_*`
)

SELECT
MAX(jsonld_entity_relationship.depth) AS max_depth,
client
client,
percentile,
APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls
FROM
rendered_data,
UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship
UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship,
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
client
client,
percentile
ORDER BY
client,
percentile
45 changes: 30 additions & 15 deletions sql/2021/structured-data/jsonld_entities_relationships.sql
Original file line number Diff line number Diff line change
Expand Up @@ -44,34 +44,49 @@ LANGUAGE js AS """
WITH
rendered_data AS (
SELECT
getJSONLDEntitiesRelationships(rendered) AS jsonld_entities_relationships,
client
FROM (
SELECT
JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload,
'$._structured-data')),
'$.structured_data.rendered') AS rendered,
_TABLE_SUFFIX AS client
FROM
`httparchive.pages.2021_07_01_*`)
_TABLE_SUFFIX AS client,
url,
getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships
FROM
`httparchive.pages.2021_07_01_*`
),

page_totals AS (
SELECT
_TABLE_SUFFIX AS client,
COUNT(0) AS total_pages
FROM
`httparchive.pages.2021_07_01_*`
GROUP BY
_TABLE_SUFFIX
)

SELECT
client,
jsonld_entity_relationship._from,
jsonld_entity_relationship.relationship,
jsonld_entity_relationship._to,
jsonld_entity_relationship.depth,
COUNT(0) AS count,
client
COUNT(0) AS freq_relationship,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_relationship,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_relationship,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship
JOIN
page_totals
USING (client)
GROUP BY
client,
jsonld_entity_relationship._from,
jsonld_entity_relationship.relationship,
jsonld_entity_relationship._to,
jsonld_entity_relationship.depth,
client
total_pages
ORDER BY
count DESC
LIMIT 10000
pct_relationship DESC,
client
LIMIT 1000
Loading