# Jane_Austen.R (from jborycz/dhum_5100_code)
# LIBRARIES
# install.packages("pacman")
library(pacman)
pacman::p_load(tm,          # text mining
               textmineR,   # text mining
               janeaustenr, # text of Jane Austen's novels
               tidyverse,   # data cleaning & plotting packages (includes ggplot2)
               tidytext,    # sentiment and term frequency analysis
               forcats,     # tools for working with categorical variables
               ggpubr)      # publication-ready plots (ggexport)
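# The ggexport() calls below save figures into a "plots/" directory. As a small
# convenience (assuming the script is run from the project root), create that
# directory if it does not already exist:
if (!dir.exists("plots")) dir.create("plots")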
# Let's start with the published novels of Jane Austen and first examine term
# frequency, using dplyr verbs such as group_by() and left_join().
###########################
# Word frequencies #
###########################
## Compute word frequencies in each Jane Austen novel
book_words <- austen_books() %>%
unnest_tokens(word, text) %>% # unnest_tokens() flattens a table into one token per row
count(book, word, sort = TRUE) # count() counts the words in each book and sorts them
total_words <- book_words %>%
  group_by(book) %>% # group_by() groups the rows of a data frame by the specified column
  summarize(total = sum(n)) # summarize() or mutate() can then compute results per group
book_words <- left_join(book_words, total_words) # left_join() adds the per-book totals, keeping every row of book_words
book_words
#> # A tibble: 40,379 × 4
#> book word n total
#> <fct> <chr> <int> <int>
#> 1 Mansfield Park the 6206 160460
#> 2 Mansfield Park to 5475 160460
#> 3 Mansfield Park and 5438 160460
#> 4 Emma to 5239 160996
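# As a minimal sketch of what unnest_tokens() does, here is a tiny hypothetical
# two-line table (not from the novels) flattened to one token per row:
toy <- tibble(line = 1:2, text = c("It is a truth", "universally acknowledged"))
toy %>% unnest_tokens(word, text)
#> Expect 6 rows: it, is, a, truth, universally, acknowledged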
# Now let’s look at the distribution of n/total for each novel, the number of times a word
# appears in a novel divided by the total number of terms (words) in that novel.
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
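# As a quick worked example of the n/total quantity on the x-axis (a sketch
# using the book_words table computed above), the share taken by the single
# most frequent word in Emma:
book_words %>%
  filter(book == "Emma") %>%
  slice_max(n, n = 1) %>%
  mutate(proportion = n / total)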
###########################
# Zipf's law #
###########################
# Zipf’s law states that the frequency that a word appears is inversely proportional to its rank.
# Since the table was already ordered by n, we could use row_number() to find the rank.
# Let's make a log-log plot of Zipf's Law
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total) %>%
ungroup()
freq_by_rank # print the ranked table (View(freq_by_rank) would open it interactively)
#> # A tibble: 40,379 × 6
#> book word n total rank `term frequency`
#> <fct> <chr> <int> <int> <int> <dbl>
#> 1 Mansfield Park the 6206 160460 1 0.0387
#> 2 Mansfield Park to 5475 160460 2 0.0341
#> 3 Mansfield Park and 5438 160460 3 0.0339
#> 4 Emma to 5239 160996 1 0.0325
#> # … with 40,369 more rows
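# If Zipf's law held exactly, rank * frequency would be roughly constant. A
# quick sketch checking this at a few ranks in one novel:
freq_by_rank %>%
  filter(book == "Mansfield Park", rank %in% c(1, 10, 100, 1000)) %>%
  mutate(rank_x_freq = rank * `term frequency`)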
# Let’s see what the exponent of the power law is for the middle section of the rank range.
freq_by_rank %>%
  ggplot(aes(x = rank, y = `term frequency`, color = book)) +
  geom_line(linewidth = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
#>
#> Call:
#> lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
#>
#> Coefficients:
#> (Intercept) log10(rank)
#> -0.6226 -1.1125
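# The pooled slope of about -1.1 is close to the -1 that classic Zipf's law
# predicts. As a sketch, the same power law can be fit separately for each
# novel over the same rank window:
rank_subset %>%
  group_by(book) %>%
  summarize(slope = coef(lm(log10(`term frequency`) ~ log10(rank)))[2])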
# Let’s plot a power law (line on the log-log plot) with the data:
## Create theme for plots
plot_theme <- theme(legend.position = "bottom",
                    legend.title = element_text(face = "bold", size = 24, hjust = 0.5),
                    legend.text = element_text(size = 20, hjust = 0.5),
                    plot.title = element_text(face = "bold", size = 24, hjust = 0.5),
                    strip.text = element_text(size = 16, hjust = 0.5),
                    axis.text.x = element_text(color = "black", size = 16, angle = 0),
                    axis.title.x = element_text(face = "bold", color = "black", size = 20, angle = 0),
                    axis.text.y = element_text(color = "black", size = 16, angle = 0),
                    axis.title.y = element_text(face = "bold", color = "black", size = 20, angle = 90))
## Plot Zipf's law
freq_by_rank_plot <- freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1, # fitted power law from the lm() above
              color = "gray50", linetype = 3, linewidth = 1.5) +
  geom_line(linewidth = 1.1, alpha = 0.8, show.legend = TRUE) +
  labs(color = "Book") +
  scale_x_log10() +
  scale_y_log10() +
  theme_classic() + plot_theme
ggexport(plotlist = list(freq_by_rank_plot),
         filename = "plots/freq_by_rank_plot.png", width = 1000, height = 1000)
# The statistic tf-idf is intended to measure how important a word is to a
# document in a collection (or corpus) of documents. It finds important words
# by decreasing the weight of commonly used words and increasing the weight of
# words that are rarely used across the collection (e.g. Jane Austen's novels
# as a whole).
# The bind_tf_idf() function in the tidytext package takes a tidy text dataset as input
# with one row per token (term), per document. One column (word here) contains the terms/tokens,
# one column contains the documents (book in this case), and the last necessary column contains
# the counts, how many times each document contains each term (n in this example).
book_tf_idf <- book_words %>%
bind_tf_idf(word, book, n)
book_tf_idf
#> # A tibble: 40,379 × 7
#> book word n total tf idf tf_idf
#> <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 Mansfield Park the 6206 160460 0.0387 0 0
#> 2 Mansfield Park to 5475 160460 0.0341 0 0
#> 3 Mansfield Park and 5438 160460 0.0339 0 0
#> # … with 40,369 more rows
# Now, let's look at terms with high tf-idf in Jane Austen's works. Note that
# some idf values repeat across different terms: with only 6 documents in this
# corpus, idf can take just a handful of distinct values.
book_tf_idf %>%
select(-total) %>%
arrange(desc(tf_idf))
#> # A tibble: 40,379 × 6
#> book word n tf idf tf_idf
#> <fct> <chr> <int> <dbl> <dbl> <dbl>
#> 1 Sense & Sensibility elinor 623 0.00519 1.79 0.00931
#> 2 Sense & Sensibility marianne 492 0.00410 1.79 0.00735
#> # … with 40,369 more rows
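# A quick manual check of the idf value shown above (assuming, as bind_tf_idf()
# does, idf = natural log of total documents over documents containing the
# term): "elinor" appears in only 1 of the 6 novels, so
log(6 / 1) # ~1.79, matching the idf column for "elinor"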
# Now, let's visualize these high tf-idf words
# Plot most important words
book_tf_idf_plot <- book_tf_idf %>%
group_by(book) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = "Term Frequency") + plot_theme
ggexport(plotlist = list(book_tf_idf_plot),
         filename = "plots/book_tf_idf_plot.png", width = 800, height = 1000)
###########################
# Sentiment analysis #
###########################
# Load the books into a data frame, numbering the lines and detecting chapter breaks
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
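# A quick sanity check (a sketch) that the chapter regex worked: the maximum of
# the cumulative chapter counter per book should match each novel's chapter count.
tidy_books %>%
  group_by(book) %>%
  summarize(chapters = max(chapter))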
# Choose a sentiment lexicon (the NRC lexicon is fetched via the textdata
# package and may prompt a one-time download), filter it to "joy" terms,
# then select a book and join the data frames
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
# Split each book into 80-line sections and compute a net sentiment score per section
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>% # use the Bing sentiment lexicon
  count(book, index = linenumber %/% 80, sentiment) %>% # count sentiment words within each 80-line section
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% # separate columns for negative and positive counts
  mutate(sentiment = positive - negative) # net sentiment = positive minus negative
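# The index = linenumber %/% 80 trick uses integer division to bin consecutive
# lines into 80-line sections; a tiny illustration with made-up line numbers:
c(0, 79, 80, 159, 160) %/% 80
#> [1] 0 0 1 1 2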
# Plot sentiments for each book
austen_sentiments_plot <- ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x") + plot_theme
ggexport(plotlist = list(austen_sentiments_plot),
         filename = "plots/austen_sentiments_plot.png", width = 800, height = 1000)