---
title: "Descriptive results"
author: "Maria"
format: html
toc: true
toc-depth: 3
code-fold: false
execute:
  echo: false
  warning: false
  message: false
---
## Setup

Based on the existing search files as provided.

## Preliminary results

### Summary of the main results of the bibliometric analysis (Scopus)
```{r}
library(bibliometrix)
library(tidyverse)

# Read all Scopus BibTeX exports from the "new" folder
files_bib <- list.files(
  path = "new",
  pattern = "\\.bib$",
  full.names = TRUE
)
bib_data <- lapply(files_bib, function(file) {
  convert2df(file, dbsource = "scopus", format = "bibtex")
})

# Merge the exports and drop duplicate records (matched on URL)
M <- dplyr::bind_rows(bib_data)
M <- M %>% distinct(URL, .keep_all = TRUE)
M_og <- M

# The same search exported as a single CSV
M_scopus_csv <- convert2df("databases/scopus.csv", dbsource = "scopus", format = "csv")

# Main bibliometric measures
results <- biblioAnalysis(M, sep = ";")
summary(results, k = 10, pause = FALSE, width = 130)
```
Based on the provided `scopus.csv`:
```{r}
# Main measures based on the CSV export
results_csv <- biblioAnalysis(M_scopus_csv, sep = ";")
summary(results_csv, k = 10, pause = FALSE, width = 130)
```
```{r}
plot(x = results_csv, k = 10, pause = FALSE)
```

### Most Cited References
```{r}
# 20 most cited references in the merged collection
CR <- citations(M, field = "article", sep = ";")
cbind(CR$Cited[1:20])
```

## The Intellectual Structure of the field - Co-citation Analysis

### Article (References) co-citation analysis
The plot shows the 50 most co-cited references and uses the [Fruchterman-Reingold algorithm](https://en.wikipedia.org/wiki/Force-directed_graph_drawing) for the network layout.

```{r}
NetMatrix <- biblioNetwork(M_scopus_csv, analysis = "co-citation", network = "references", sep = ";")
net <- networkPlot(NetMatrix, n = 50, Title = "Co-Citation Network", type = "fruchterman",
                   size.cex = TRUE, size = 20, remove.multiple = FALSE, labelsize = 1,
                   edgesize = 10, edges.min = 5)
```

Descriptive analysis of the article co-citation network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 10)
```

### Co-word Analysis through Keyword co-occurrences

```{r}
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "keywords", sep = ";")
net <- networkPlot(NetMatrix, normalize = "association", weighted = TRUE, n = 30,
                   Title = "Keyword Co-occurrences", type = "fruchterman",
                   size = TRUE, edgesize = 5, labelsize = 0.7)
```
### Journal (Source) co-citation analysis

```{r}
M_tags <- metaTagExtraction(M_scopus_csv, Field = "CR_SO", sep = ";")
NetMatrix <- biblioNetwork(M_tags, analysis = "co-citation", network = "sources", sep = ";")
net <- networkPlot(NetMatrix, n = 20, Title = "Co-Citation Network", type = "auto",
                   size.cex = TRUE, size = 15, remove.multiple = FALSE, labelsize = 1,
                   edgesize = 10, edges.min = 5)
```
Descriptive analysis of the journal co-citation network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 10)
```
## Historiograph - Direct citation linkages

Based on `scopus.csv`:

```{r}
histResults <- histNetwork(M_scopus_csv, sep = ";")
```

```{r}
options(width = 130)
net <- histPlot(histResults, n = 20, size = 5, labelsize = 4)
```
## The conceptual structure - Co-Word Analysis

### Co-word Analysis through Keyword co-occurrences

The network layout is generated using the Fruchterman-Reingold algorithm.

```{r}
NetMatrix <- biblioNetwork(M, analysis = "co-occurrences", network = "keywords", sep = ";")
net <- networkPlot(NetMatrix, normalize = "association", n = 20, Title = "Keyword Co-occurrences",
                   type = "fruchterman", size.cex = TRUE, size = 20, remove.multiple = FALSE,
                   edgesize = 10, labelsize = 5, label.cex = TRUE, label.n = 30, edges.min = 2)
```
Descriptive analysis of the keyword co-occurrence network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 10)
```
### Co-word Analysis through Correspondence Analysis

```{r}
suppressWarnings(
  CS <- conceptualStructure(M, method = "MCA", field = "ID", minDegree = 15,
                            clust = 5, stemming = FALSE, labelsize = 15, documents = 20)
)
```
## Thematic Map

Co-word analysis is used to identify clusters of keywords, following [this approach](https://www.mdpi.com/2071-1050/14/6/3643). Clusters are treated as themes, and their characteristics (density and centrality) are used to classify the themes and map them on a two-dimensional diagram.

The result is a thematic map, an intuitive tool for presenting themes for further analysis based on the quadrant in which each theme is placed:

1. upper-right quadrant: motor themes;
2. lower-right quadrant: basic themes;
3. lower-left quadrant: emerging or disappearing themes;
4. upper-left quadrant: very specialized/niche themes.

A sketch of this quadrant classification follows the map below.
```{r}
Map <- thematicMap(M, field = "ID", n = 250, minfreq = 4,
                   stemming = FALSE, size = 0.7, n.labels = 5, repel = TRUE)
plot(Map$map)
```
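As referenced above, the quadrant classification can also be reproduced programmatically as a cross-check of the map. This is a minimal sketch assuming `Map$clusters` exposes `centrality` and `density` columns (as in recent bibliometrix versions); splitting both axes at their medians is a choice made here, not necessarily the cut used by `thematicMap()` itself.

```{r}
# Minimal sketch (assumes Map$clusters has centrality/density columns):
# assign each theme to a quadrant by comparing its coordinates to the
# median of each axis.
Map$clusters %>%
  mutate(
    quadrant = case_when(
      centrality >= median(centrality) & density >= median(density) ~ "Motor themes",
      centrality >= median(centrality) & density <  median(density) ~ "Basic themes",
      centrality <  median(centrality) & density <  median(density) ~ "Emerging or disappearing themes",
      TRUE                                                           ~ "Niche themes"
    )
  ) %>%
  select(any_of(c("label", "name")), centrality, density, quadrant) %>%
  arrange(quadrant)
```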
Cluster description:

```{r}
# Top 5 keywords per cluster, by number of occurrences
Clusters <- Map$words[order(Map$words$Cluster, -Map$words$Occurrences), ]
CL <- Clusters %>%
  group_by(.data$Cluster_Label) %>%
  top_n(5, .data$Occurrences)
CL
```
## The social structure - Collaboration Analysis

Collaboration network analysis evaluates how authors, institutions, or countries relate to one another in a specific field of research.
Such analyses include generating a co-author network to identify distinct groupings, e.g. regular study groups, hidden groups of scholars, and pivotal authors.
Further analysis generates a collaboration network that links the relevant institutions in the field and uncovers their relations.
### Author collaboration network

```{r}
NetMatrix <- biblioNetwork(M, analysis = "collaboration", network = "authors", sep = ";")
net <- networkPlot(NetMatrix, n = 20, Title = "Author collaboration", type = "auto",
                   size = 10, size.cex = TRUE, edgesize = 3, labelsize = 1)
```
Descriptive analysis of the author collaboration network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 15)
```
### Institutional collaboration network

```{r}
NetMatrix <- biblioNetwork(M, analysis = "collaboration", network = "universities", sep = ";")
net <- networkPlot(NetMatrix, n = 20, Title = "Institutional collaboration", type = "auto",
                   size = 4, size.cex = FALSE, edgesize = 3, labelsize = 1)
```
Descriptive analysis of the institutional collaboration network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 15)
```
### Country collaboration network

Countries collaborating on this set of bibliographic data:

```{r}
# Extract author countries from the CSV-based collection (assigned to a
# new object so the merged collection M is not overwritten)
M_countries <- metaTagExtraction(M_scopus_csv, Field = "AU_CO", sep = ";")
NetMatrix <- biblioNetwork(M_countries, analysis = "collaboration", network = "countries", sep = ";")
net <- networkPlot(NetMatrix, n = dim(NetMatrix)[1], Title = "Country collaboration", type = "circle",
                   size = 10, size.cex = TRUE, edgesize = 1, labelsize = 0.6, cluster = "none")
```
Descriptive analysis of the country collaboration network characteristics:

```{r}
netstat <- networkStat(NetMatrix)
summary(netstat, k = 15)
```
## Abstracts analysis

### Topic modelling

Keep only papers with an abstract:

```{r}
library(tidytext)

abstracts <- M_og %>%
  filter(AB != "[NO ABSTRACT AVAILABLE]") %>%
  select(DI, AB)

# Tokenize the abstracts, remove stop words, and keep only frequent terms
tidy_data <- abstracts %>%
  unnest_tokens(output = word, input = AB, to_lower = TRUE) %>%
  anti_join(get_stopwords()) %>%
  add_count(word) %>%
  filter(n > 100) %>%
  select(-n)

tidy_data %>%
  count(word, sort = TRUE) %>%
  head(20)

# Cast to a sparse document-term matrix for stm
tidy_data_sparse <- tidy_data %>%
  count(DI, word) %>%
  cast_sparse(DI, word, n)
```
Choosing the number of topics: fit several models with different numbers of topics.

```{r}
library(stm)
library(furrr)
plan(multisession)
theme_set(theme_minimal())

# Fit one structural topic model per candidate K, in parallel
many_models <- tibble(K = c(20, 40, 50, 60, 70, 80, 100)) %>%
  mutate(topic_model = future_map(K, ~ stm(tidy_data_sparse, K = .,
                                           verbose = FALSE)))
```
Evaluate the models trained on the sparse matrix:

```{r}
heldout <- make.heldout(tidy_data_sparse)

# Diagnostics per model: exclusivity, semantic coherence, held-out
# likelihood, residuals, and the (adjusted) variational lower bound
k_result <- many_models %>%
  mutate(exclusivity = map(topic_model, exclusivity),
         semantic_coherence = map(topic_model, semanticCoherence, tidy_data_sparse),
         eval_heldout = map(topic_model, eval.heldout, heldout$missing),
         residual = map(topic_model, checkResiduals, tidy_data_sparse),
         bound = map_dbl(topic_model, function(x) max(x$convergence$bound)),
         lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
         lbound = bound + lfact,
         iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))
k_result
```
```{r}
k_result %>%
  transmute(K,
            `Lower bound` = lbound,
            Residuals = map_dbl(residual, "dispersion"),
            `Semantic coherence` = map_dbl(semantic_coherence, mean),
            `Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
  pivot_longer(-K, names_to = "Metric", values_to = "Value") %>%
  ggplot(aes(K, Value, color = Metric)) +
  geom_line(linewidth = 1.5, alpha = 0.7, show.legend = FALSE) +
  facet_wrap(~Metric, scales = "free_y") +
  labs(x = "K (number of topics)",
       y = NULL,
       title = "Model diagnostics by number of topics",
       subtitle = "These diagnostics indicate that a good number of topics would be around 80-100")
```
We look for the highest held-out likelihood and the lowest residuals.

Semantic coherence is maximized when the most probable words in a given topic frequently co-occur together, and it is a metric that correlates well with human judgment of topic quality. High semantic coherence is relatively easy to achieve, though, if you only have a few topics dominated by very common words, so you want to look at both the semantic coherence and the exclusivity of words to topics. It is a tradeoff. Read more about semantic coherence in the original paper (Mimno et al., 2011, "Optimizing Semantic Coherence in Topic Models").
```{r}
k_result %>%
  select(K, exclusivity, semantic_coherence) %>%
  filter(K %in% c(20, 80, 100)) %>%
  unnest(cols = c(exclusivity, semantic_coherence)) %>%
  mutate(K = as.factor(K)) %>%
  ggplot(aes(semantic_coherence, exclusivity, color = K)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(x = "Semantic coherence",
       y = "Exclusivity",
       title = "Comparing exclusivity and semantic coherence",
       subtitle = "Models with fewer topics have higher semantic coherence but lower exclusivity")
```
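Before committing to a single K, the list columns can also be collapsed to one summary row per model. This is a hand-rolled heuristic, not part of the stm package: rank the candidate models by the average rank of their mean semantic coherence and mean exclusivity (higher is better for both).

```{r}
# Hand-rolled heuristic: one row per K, ranked by the average rank of
# the two competing metrics.
k_result %>%
  transmute(K,
            mean_coherence = map_dbl(semantic_coherence, mean),
            mean_exclusivity = map_dbl(exclusivity, mean)) %>%
  mutate(score = (rank(mean_coherence) + rank(mean_exclusivity)) / 2) %>%
  arrange(desc(score))
```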
Select the model with K = 100 topics:

```{r}
topic_model <- k_result %>%
  filter(K == 100) %>%
  pull(topic_model) %>%
  .[[1]]
topic_model
```
The `beta` matrix gives the per-topic word probabilities:

```{r}
td_beta <- tidy(topic_model)
td_beta
```
The `gamma` matrix gives the per-document topic proportions:

```{r}
td_gamma <- tidy(topic_model, matrix = "gamma",
                 document_names = rownames(tidy_data_sparse))
td_gamma
```
```{r}
library(ggthemes)

# Top 7 terms per topic
top_terms <- td_beta %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(7, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map(terms, paste, collapse = ", ")) %>%
  unnest(cols = terms)

# Topic prevalence: mean gamma across documents
gamma_terms <- td_gamma %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms, by = "topic") %>%
  mutate(topic = paste0("Topic ", topic),
         topic = reorder(topic, gamma))

gamma_terms %>%
  top_n(20, gamma) %>%
  ggplot(aes(topic, gamma, label = terms, fill = topic)) +
  geom_col(show.legend = FALSE) +
  geom_text(hjust = 0, nudge_y = 0.0005, size = 3) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 0.09),
                     labels = scales::percent_format()) +
  theme_tufte(ticks = FALSE) +
  theme(plot.title = element_text(size = 16),
        plot.subtitle = element_text(size = 13)) +
  labs(x = NULL, y = expression(gamma),
       title = "Top 20 topics by prevalence",
       subtitle = "With the top words that contribute to each topic")
```
```{r}
gamma_terms %>%
  select(topic, gamma, terms) %>%
  knitr::kable(digits = 3,
               col.names = c("Topic", "Expected topic proportion", "Top 7 terms"))
```
### Clustering
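This subsection is still to be written. One natural starting point (a sketch under assumptions, not an analysis run above) is to cluster the documents on their topic proportions from the fitted model, e.g. k-means on the document-topic matrix `theta` stored in the stm object; the number of clusters below is an arbitrary placeholder.

```{r}
#| eval: false
# Sketch only: cluster documents by their stm topic proportions (theta).
doc_topics <- topic_model$theta                     # documents x topics
rownames(doc_topics) <- rownames(tidy_data_sparse)

set.seed(123)
km <- kmeans(doc_topics, centers = 8, nstart = 25)  # 8 clusters: placeholder
table(km$cluster)                                   # cluster sizes
```
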
### Summarization