-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpapers.qmd
200 lines (164 loc) · 5.65 KB
/
papers.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
---
# Document metadata
title: "Systematic literature review"
bibliography: references.bib
title-block-banner: true
subtitle: "A papers' analysis"
author:
  # NOTE(review): the e-mail values below look like scraper-redacted
  # placeholders; restore the real addresses (as quoted strings) before rendering.
  - name: Olivier Caron
    email: [email protected]
    affiliations:
      name: "Paris Dauphine - PSL"
      city: Paris
      state: France
  - name: Christophe Benavent
    email: [email protected]
    affiliations:
      name: "Paris Dauphine - PSL"
      city: Paris
      state: France
date: "last-modified"
toc: true
number-sections: true
number-depth: 5
# Output formats
format:
  html:
    theme:
      light: yeti
      dark: darkly
    code-fold: true
    code-summary: "Display code"
    code-tools: true # lets the reader show/hide all code blocks at once
    code-copy: true # adds a copy-to-clipboard button to code blocks
    grid:
      body-width: 1800px
      margin-width: 100px
    toc: true
    toc-location: left
  pdf: default
# Chunk execution defaults: show the code, hide warnings and messages
execute:
  echo: true
  warning: false
  message: false
editor: visual
fig-align: "center"
highlight-style: breeze
css: styles.css
reference-location: margin
---
## Libraries and loading data
```{r libraries and load data}
library(cowplot)
library(tidyverse)
library(ggstatsplot)
# read.csv() sanitises the CSV headers (for instance ":" becomes "."), which
# makes the raw column names awkward to reuse in code. The names of
# `nlp_papers` are therefore normalised below; `nlp_full` keeps the names
# produced by read.csv().
nlp_papers <- read.csv("nlp_papers_utf8.csv", fileEncoding = "UTF-8")
nlp_full <- read.csv("nlp_full_data_utf8.csv", fileEncoding = "UTF-8")

# Normalisation, in order:
#   1. every dot becomes an underscore,
#   2. leading/trailing underscores are stripped,
#   3. runs of underscores collapse to a single one,
#   4. everything is lower-cased.
new_colnames <- gsub("\\.", "_", colnames(nlp_papers))
new_colnames <- trimws(new_colnames, whitespace = "_")
new_colnames <- gsub("_+", "_", new_colnames)
new_colnames <- tolower(new_colnames)
colnames(nlp_papers) <- new_colnames
```
## A look at the number of marketing publications with NLP
The number of papers using NLP methods in marketing journals has increased substantially in recent years:\
```{r graph number of publications}
#| fig-width: 50
#| fig-height: 20
#| column: screen-inset-shaded
# Publication year = first four characters of the cover date.
# The column is kept as text so that its type does not change for other code.
nlp_papers <- nlp_papers %>%
  mutate(year = substr(prism_coverdate, 1, 4))

# Keep journal publications only (drop anything whose subtype mentions
# "conference") and restrict to years before 2023. The year is converted to an
# integer for the comparison instead of relying on an implicit
# character-vs-number coercion.
nlp_papers_journal_only <- nlp_papers %>%
  filter(
    !grepl("conference", subtypedescription, ignore.case = TRUE),
    as.integer(year) < 2023
  )

# Left panel: share of articles per journal. prop.table() turns the counts
# into proportions, hence the "Proportion" axis label.
t0 <- as.data.frame(
  prop.table(table(nlp_papers_journal_only$prism_publicationname))
)
g01 <- ggplot(t0, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_col() +
  coord_flip() +
  labs(title = "Number of articles per review", y = "Proportion", x = NULL)

# Right panel: number of publications per year, with a smoothed trend.
# `group = 1` is required because `Var1` is a factor after as.data.frame().
t1 <- as.data.frame(table(nlp_papers_journal_only$year))
g02 <- ggplot(t1, aes(x = Var1, y = Freq, group = 1)) +
  geom_smooth(color = "Grey70", linewidth = 2) +
  # `linewidth` replaces the `size` aesthetic, which is deprecated for lines
  geom_line(linewidth = 1.1) +
  labs(title = "Number of publications per year", y = "", x = NULL)

# Both panels side by side; the journal panel is twice as wide as the trend.
plotgrid <- plot_grid(
  g01,
  g02,
  label_size = 10,
  ncol = 2,
  rel_widths = c(2, 1)
)

# Save the combined figure. The plot is passed explicitly rather than relying
# on last_plot(), and the output directory is created when it is missing.
dir.create("images", showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = "images/evolution_publications_nlp_marketing.png",
  plot = plotgrid,
  width = 80,
  height = 40,
  units = "cm"
)

plotgrid
```
## A look at the number of citations
Citation patterns in the emerging field of marketing research using NLP methods appear to resemble those of research in general. Citations are distributed very unequally: many papers receive few or no citations at all.
The following graphs are produced with the ggstatsplot R package [@ggstatsplot].
```{r}
#| layout-nrow: 4
#| echo: fenced
# Seed the random number generator so that repeated renders of this chunk
# start from the same state.
set.seed(42)

# Histogram of the citation counts, one bin per citation.
# `test.value = 12` is the reference value passed to the one-sample test.
ggstatsplot::gghistostats(
  data = nlp_papers,
  x = citedby_count,
  binwidth = 1,
  test.value = 12,
  xlab = "Number of citations",
  title = "Distribution of citations"
)
# Log-transformed citation count. One is added before taking the log so that
# papers with zero citations map to 0 instead of -Inf.
nlp_papers$log_citedby <- log(nlp_papers$citedby_count + 1)

# Scatter plot (with marginal distributions) of log citations against the
# number of authors. Papers with more than 200 citations are labelled with
# the value of `dc_creator`.
ggscatterstats(
  data = nlp_papers,
  x = author_count,
  y = log_citedby,
  xlab = "Number of authors",
  ylab = "Ln(Number of citations+1)",
  title = "Distribution of citations by number of authors",
  label.var = dc_creator,
  label.expression = citedby_count > 200,
  point.label.args = list(alpha = 0.7, size = 4, color = "grey50"),
  xfill = "#CC79A7", # fill for marginals on the x-axis
  yfill = "#009E73" # fill for marginals on the y-axis
)
# Compare log citation counts across document subtypes with a parametric
# between-groups test (type = "p") and 95% confidence intervals.
# NOTE(review): `plottype` does not look like a documented ggbetweenstats()
# argument (older releases used `plot.type`); it is presumably absorbed by
# `...` and has no effect. Confirm against the installed ggstatsplot version.
ggbetweenstats(
  data = nlp_papers,
  x = subtypedescription,
  y = log_citedby,
  type = "p",
  conf.level = 0.95,
  plottype = "box",
  xlab = "",
  ylab = "Number of citations (ln scale)"
)
# One row per (paper, affiliation country).
# Rows without an affiliation id are dropped first. Within each paper
# (`entry_number`) the distinct countries are listed, so a country shared by
# several co-authors of the same paper is counted once. The remaining fields
# are presumably constant within a paper and are recycled along the countries;
# verify against the data, since reframe() fails on incompatible lengths.
nlp_article_per_country <- nlp_full %>%
  filter(!is.na(afid)) %>%
  group_by(entry_number) %>%
  reframe(
    unique_countries = unique(affiliation_country),
    nbcitations      = unique(citedby_count),
    publication_id   = unique(eid),
    title            = unique(dc.title),
    journal          = unique(prism.publicationName),
    first_author     = unique(dc.creator),
    date_publication = unique(prism.coverDate),
    nb_authors       = unique(author_count),
    keyword          = unique(authkeywords)
  )

# Total citations per country (each paper contributes its citation count once
# to every country it is affiliated with), keeping the 25 countries with the
# highest totals.
nlp_article25 <- nlp_article_per_country %>%
  group_by(unique_countries) %>%
  summarise(sumcitations = sum(nbcitations)) %>%
  arrange(desc(sumcitations)) %>%
  head(25)
# Dot plot of total citations for the 25 most-cited countries.
# `type = "robust"` selects the robust one-sample test, with
# `test.value = 25` as its reference value.
ggstatsplot::ggdotplotstats(
  data = nlp_article25,
  x = sumcitations,
  y = unique_countries,
  type = "robust",
  test.value = 25,
  xlab = "Number of citations",
  title = "Distribution of academic citations among the top 25 countries"
)
# Optional export of the figure, currently disabled:
#ggsave("images/distribution_citations_authors.svg", width=15, height = 12)
```
## The landscape of research
```{r}
```