-
Notifications
You must be signed in to change notification settings - Fork 3
/
nlp.R
152 lines (132 loc) · 3.86 KB
/
nlp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
library(igraph)
library(tidyverse)
library(openNLP)
library(NLP)
# Load the 2016 conference graph. The original issued two unconditional reads
# on different relative paths (whichever was missing would error, and the
# second silently overwrote the first); probe for the path that exists instead.
g2016_path <- if (file.exists('Data/g2016.graphml')) {
  'Data/g2016.graphml'
} else {
  'traversing_graphs_in_R/Data/g2016.graphml'
}
g2016 <- read_graph(g2016_path, 'graphml')
# openNLP annotators used below: split text into sentences and words, then
# tag each word with a part of speech (Apache OpenNLP maxent models).
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
# Print the vertex sequence — quick interactive sanity check of the graph.
V(g2016)
# Attach a cleaned 'abstract' attribute to every ABSTRACT vertex: the page
# body between the "Abstract:" and "Keywords:" markers, trimmed, with
# hyphenated line breaks ("foo- bar") re-joined. Then persist the graph.
test <- set_vertex_attr(
  g2016,
  'abstract',
  V(g2016)[nodeType == 'ABSTRACT'],
  V(g2016)[nodeType == 'ABSTRACT']$pageBody %>%
    str_extract('(?<=Abstract:).+(?=Keywords:)') %>%
    str_trim() %>%
    str_replace_all('(?<=\\w)- (?=\\w)', '')
)
write_graph(test, 'Data/g2016_abstract.graphml', 'graphml')
# Extract the abstract text of every ABSTRACT node and annotate it.
# Result: one element per abstract, holding that abstract's word annotations
# (with POS features) grouped into their containing sentence spans.
abstracts <- V(g2016)[nodeType == 'ABSTRACT'] %>%
  .$pageBody %>%
  str_extract('(?<=Abstract:).+(?=Keywords:)') %>%  # text between the markers
  str_trim() %>%
  str_replace_all('(?<=\\w)- (?=\\w)', '') %>%      # re-join hyphenated breaks
  map(function(x){
    s <- as.String(x)
    # NLP::annotate is qualified explicitly: ggplot2 (loaded via tidyverse)
    # also exports annotate(), so relying on package load order is fragile.
    # The redundant `a <-` assignment of the original is dropped — the
    # pipeline's value is returned from the function either way.
    NLP::annotate(s, list(sent_token_annotator, word_token_annotator, pos_tag_annotator)) %>%
      {annotations_in_spans(subset(., type == 'word'),
                            subset(., type == 'sentence'))}
  })
# Pick one abstract to experiment with, and rebuild its cleaned text as an
# NLP::String so that annotation spans can index into it directly (s[span]).
index <- 3
s <- as.String(
  str_replace_all(
    str_trim(
      str_extract(
        V(g2016)[nodeType == 'ABSTRACT']$pageBody[[index]],
        '(?<=Abstract:).+(?=Keywords:)'
      )
    ),
    '(?<=\\w)- (?=\\w)',
    ''
  )
)
test <- abstracts[[index]]
# test2 <- map(test, function(x){
# lapply(x, function(y){
# original <- s[y] %>%
# str_to_upper()
# st <- s[y] %>%
# str_to_lower() %>%
# SnowballC::wordStem()
# pos <- y$features[[1]]$POS
# tibble(
# original = original,
# stem = st,
# pos = pos
# )
# }) %>%
# bind_rows() %>%
# rbind(tibble(original = 'BEGIN_SENT', stem = NA, pos = NA),
# .,
# tibble(original = 'END_SENT', stem = NA, pos = NA))%>%
# mutate(order = 1:nrow(.)) #%>%
# # filter(str_detect(pos, 'NN.?|JJ.?')) %>%
# # mutate(to = lapply(order, function(y){
# # filter(., order %in% c(y +1, y + 2))
# # })) %>%
# # unnest() %>%
# # mutate(distance = order1 - order) %>%
# # select(original, original1, distance)
# }) #%>%
# # bind_rows() %>%
# # group_by(original, original1) %>%
# # summarize(
# # distance = min(distance),
# # connections = n()
# # ) -> test2
#
# Build a word-adjacency edge list for the chosen abstract: each sentence
# becomes a chain BEGIN_SENT -> "POS:WORD" -> ... -> END_SENT, one row per
# consecutive pair, tagged type = 'next', with the position in the sentence.
test2 <- map(test, function(x){
  # vapply (not sapply) guarantees a character vector even for an empty
  # sentence, where sapply would silently return list().
  vapply(x, function(y){
    original <- s[y] %>%        # the surface form, via the annotation's span
      str_to_upper()
    pos <- y$features[[1]]$POS  # POS tag attached by the tag annotator
    str_c(pos, ':', original)
  }, character(1)) %>%
  c('BEGIN_SENT', ., 'END_SENT') %>%
  {
    tibble(from = .[1:(length(.)-1)],
           to = .[2:length(.)])
  } %>%
  mutate(order = seq_len(nrow(.)),  # seq_len is safe if the tibble were empty
         type = 'next')
}) %>%
  enframe(name = 'sentence') %>%
  unnest() %>%
  select(from, to, sentence, order, type)
# Directed word-adjacency graph for the chosen abstract.
# Named TRUE instead of the original bare positional `T` (T is reassignable).
g_test <- graph_from_data_frame(test2, directed = TRUE)
# TextRank step 1: keep only nouns (NN*) and adjectives (JJ*), run PageRank,
# and retain the top third of the words by score.
g_textRank <- g_test %>%
  {. - V(.)[!str_detect(name, '^NN|^JJ')]}
top_third <- page_rank(g_textRank)$vector %>%
  sort(decreasing = TRUE) %>%       # spelled out instead of bare positional T
  .[seq_len(length(.) %/% 3)]       # empty (not x[c(1, 0)]) when < 3 vertices
# Plot the word graph restricted to the top-third PageRank words.
g_test %>%
  {delete_vertices(., V(.)[!name %in% names(top_third)])} %>%
  plot(
    vertex.size = 0,
    vertex.label.cex = .7,
    edge.arrow.size = .5
  )
# TextRank step 2: additionally connect words two steps apart within the same
# sentence with 'close' edges (skipping the BEGIN/END sentinels), then filter
# to NN*/JJ* as before and recompute the top third by PageRank.
g_textRank2 <- V(g_test) %>%
  ego(g_test, 2, ., 'out') %>%
  map(function(x){
    # An out-neighborhood of exactly 3 vertices means word -> next -> next;
    # link the two endpoints unless a sentence-boundary sentinel is involved.
    if(length(x) == 3 && names(x)[3] != 'END_SENT' && names(x)[1] != 'BEGIN_SENT'){
      names(x)[c(1, 3)]
    }
  }) %>%
  unlist() %>%                      # called with parens (bare `unlist` worked
                                    # only because %>% forgives missing parens)
  {g_test + edges(., type = 'close')} %>%
  {. - V(.)[!str_detect(name, '^NN|^JJ')]}
top_third <- page_rank(g_textRank2)$vector %>%
  sort(decreasing = TRUE) %>%       # spelled out instead of bare positional T
  .[seq_len(length(.) %/% 3)]       # empty (not x[c(1, 0)]) when < 3 vertices
# Plot the step-2 result: top-third words only, and only the original
# consecutive-word ('next') edges — the helper 'close' edges are hidden.
g_test %>%
  {delete_vertices(., V(.)[!name %in% names(top_third)])} %>%
  {delete_edges(., E(.)[type != 'next'])} %>%
  plot(
    vertex.label.cex = .7,
    vertex.size = 0,
    edge.arrow.size = .5
  )
# NOTE(review): the original put `%>%` at the start of the next line, which is
# a syntax error in R (the pipe must end the previous line), and it would have
# piped $pageBody (a character attribute) into ego(), which takes vertices.
# Presumed intent: print the page body, then inspect the vertex's 1-step
# out-neighborhood — TODO confirm against the author's intent.
V(g2016)[nodeType == 'ABSTRACT'][index]$pageBody
V(g2016)[nodeType == 'ABSTRACT'][index] %>%
  ego(g2016, 1, ., 'out')