-
Notifications
You must be signed in to change notification settings - Fork 102
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Guillaume NICOLAS
committed
Aug 9, 2024
1 parent
e97267d
commit 0bda124
Showing
2 changed files
with
70 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,61 @@ | ||
-- Build the temp table `entities`: one (name, domain) row per element of the | ||
-- JSON array passed in the @entities_string query parameter. | ||
-- NOTE(review): this text is a rendered diff, so the statement appears twice | ||
-- below (first in its old layout, then reformatted). | ||
CREATE TEMP | ||
TABLE entities (name STRING, domain STRING) AS | ||
SELECT JSON_VALUE(entity, '$.name') AS name, JSON_VALUE(entity, '$.domain') as domain | ||
FROM UNNEST ( | ||
JSON_QUERY_ARRAY (@entities_string, '$') | ||
) AS entity | ||
|
||
; | ||
CREATE TEMP TABLE | ||
entities (name STRING, domain STRING) AS | ||
SELECT | ||
-- Each array element contributes its '$.name' and '$.domain' scalar values. | ||
JSON_VALUE(entity, '$.name') AS name, | ||
JSON_VALUE(entity, '$.domain') as domain | ||
FROM | ||
UNNEST (JSON_QUERY_ARRAY(@entities_string, '$')) AS entity; | ||
|
||
-- Map each observed domain, together with its occurrence count, to the entity | ||
-- whose domain pattern matches it. | ||
-- NOTE(review): this text is a rendered diff, so old-layout and new-layout | ||
-- lines of the same statement are interleaved below. | ||
CREATE TEMP | ||
TABLE entity_domain_count ( | ||
domain STRING, | ||
totalOccurrences INT, | ||
name STRING | ||
) AS | ||
CREATE TEMP TABLE | ||
entity_domain_count (domain STRING, totalOccurrences INT, name STRING) AS | ||
SELECT | ||
domain_occurrences.domain as domain, | ||
domain_occurrences.totalOccurrences as totalOccurrences, | ||
entities.name as name, | ||
FROM ( | ||
FROM | ||
( | ||
-- Number of pages on which each observed domain is requested: the inner | ||
-- query collapses requests to one row per (page, domain) pair, and the | ||
-- outer query counts those rows per domain. | ||
SELECT domain, COUNT(*) AS totalOccurrences | ||
FROM ( | ||
SELECT page, NET.HOST (url) AS domain | ||
SELECT | ||
domain, | ||
COUNT(0) AS totalOccurrences | ||
FROM | ||
( | ||
SELECT | ||
page, | ||
NET.HOST(url) AS domain | ||
FROM | ||
`httparchive.requests.2022_01_01_mobile` | ||
GROUP BY | ||
page, domain | ||
page, | ||
domain | ||
) | ||
GROUP BY | ||
domain | ||
) as domain_occurrences | ||
JOIN | ||
-- Mapping between a domain and an entity: every '*' in entities.domain is | ||
-- rewritten to the LIKE wildcard '%' before matching the observed domain. | ||
entities ON domain_occurrences.domain LIKE | ||
REPLACE (entities.domain, '*', '%'); | ||
entities ON domain_occurrences.domain LIKE REPLACE(entities.domain, '*', '%'); | ||
|
||
-- Get entities whose matched domains add up to at least 50 occurrences | ||
-- (the threshold is on SUM(totalOccurrences), not on the number of domains). | ||
-- NOTE(review): this text is a rendered diff, so old-layout and new-layout | ||
-- lines of the same query are interleaved below. | ||
WITH | ||
entity_count AS ( | ||
SELECT name, SUM(totalOccurrences) as totalOccurrences | ||
FROM entity_domain_count | ||
SELECT | ||
name, | ||
SUM(totalOccurrences) as totalOccurrences | ||
FROM | ||
entity_domain_count | ||
GROUP BY | ||
name | ||
-- NOTE(review): totalOccurrences here presumably resolves to the SUM alias | ||
-- above rather than the source column of the same name; confirm. | ||
HAVING | ||
totalOccurrences >= 50 | ||
) | ||
-- Get observed domains owned by entities that pass the 50-occurrence | ||
-- threshold, listed with each domain's own count, highest count first. | ||
SELECT entity_domain_count.domain, entity_domain_count.totalOccurrences | ||
SELECT | ||
entity_domain_count.domain, | ||
entity_domain_count.totalOccurrences | ||
FROM | ||
entity_domain_count | ||
JOIN entity_count ON entity_count.name = entity_domain_count.name | ||
ORDER BY totalOccurrences DESC | ||
ORDER BY | ||
totalOccurrences DESC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters