-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_data.R
69 lines (48 loc) · 1.56 KB
/
extract_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
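# Python snippet kept for reference; it presumably produced the zuckerberg.csv
# export that this script reads (TODO confirm):
# from ebawsconnect.athena import get_query_results_from_athena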
# query = """
# SELECT *
# FROM dfp_prod_jppol_dsa_shrd.escenic_article
# WHERE article_url NOT LIKE '/incoming%'
# AND LENGTH(article_body) >= 100
# AND (article_title LIKE '%Mark Zuckerberg%'
# OR article_lead LIKE '%Mark Zuckerberg%'
# OR article_body LIKE '%Mark Zuckerberg%')
# """
# out = get_query_results_from_athena(query, results_file = "zuckerberg.csv", to_df = True, force_query = True, profile_name = "jppol-dfp")
library(tidyverse)
# Load the exported articles and narrow them down in a single pipeline.
df <- readr::read_csv("projects/zuckerberg.csv") %>%
  # keep only the three publications of interest
  filter(
    cms_publication %in% c("ekstrabladet", "politiken", "jyllandsposten")
  ) %>%
  # discard stories that are missing a lead, a title or a body
  filter(
    !is.na(article_lead),
    !is.na(article_title),
    !is.na(article_body)
  )
# Exploratory: number of stories per publication year (printed, not stored).
# The year is taken as the first four characters of first_published
# (assumes the value starts with a four-digit year -- TODO confirm).
# The count column `antal` is kept; it used to be dropped right after it was
# computed, which left only a bare list of years.
df %>%
  mutate(year_lk = substr(first_published, 1, 4)) %>%
  count(year_lk, name = "antal")
# Articles whose content_id occurs exactly once need no de-duplication.
# ungroup() so the content_id grouping does not carry over into later steps
# (bind_rows() returns the type of its first input, grouping included).
singles <- df %>%
  group_by(content_id) %>%
  filter(n() == 1) %>%
  ungroup()
# Articles whose content_id occurs more than once: keep one row per id.
# Rows are sorted by ascending last_modified before the first row of each
# content_id group is taken, so the earliest-modified version is retained.
# NOTE(review): if the most recent revision is wanted instead, the sort
# should be descending -- confirm the intent.
dubs <- df %>%
  group_by(content_id) %>%
  filter(n() > 1) %>%
  arrange(last_modified) %>%
  slice(1) %>%
  ungroup()
# Combine the two subsets into one row per content_id.
# bind_rows() returns the type of its first input, so drop any grouping
# inherited from `singles` before going on.
out <- bind_rows(singles, dubs) %>%
  ungroup()

# Stories per publication and publication year, as a plain (ungrouped) table.
# The year is taken as the first four characters of first_published
# (assumes the value starts with a four-digit year -- TODO confirm).
year_pub <- out %>%
  mutate(year_lk = substr(first_published, 1, 4)) %>%
  count(year_lk, cms_publication, name = "antal") %>%
  arrange(cms_publication, year_lk)

# Write the de-duplicated articles; the path is relative to the working
# directory.
write_csv(out, "news_zuckerberg.csv")