-
Notifications
You must be signed in to change notification settings - Fork 1
/
pmid.tagcloud.R
97 lines (75 loc) · 4.26 KB
/
pmid.tagcloud.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#########################################################
#### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
#########################################################
# GNU-GPL license
# Author: Mareviv (https://talesofr.wordpress.com)
# Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.
# First, automagically install needed libraries.
# 'slam' is handled on its own: installing it gives an error in OSX
# Yosemite/El Capitan, and requesting a pre-built binary is an attempt to fix
# that. install.packages() rejects type = "binary" on platforms whose native
# package type is "source" (e.g. Linux), so there the session default is used.
cran.repo <- "https://cran.rstudio.com/"
list.of.packages <- c("slam")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  slam.type <- if (.Platform$pkgType == "source") getOption("pkgType") else "binary"
  install.packages(new.packages, repos = cran.repo, type = slam.type)
}
# Remaining dependencies install fine with the default package type.
list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = cran.repo)
}
# Get and store the working directory (restored at the end of the script)
wdir <- getwd()
# 1. Import PMIDs
message("retrieving PMIDs info...")
library(RCurl)
# NOTE(review): ssl.verifypeer = FALSE turns off TLS certificate verification
# for this download. It is kept so the script behaves as before on machines
# without a usable CA bundle -- confirm whether it can be dropped.
urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
# Parse the downloaded CSV text. read.csv(text = ) opens and closes its own
# text connection, so no connection is left dangling.
pmids <- read.csv(text = urli)
# The rest of the script reads pmids$pmId; fail here, before announcing
# success, if the download did not produce that column.
if (is.null(pmids$pmId)) {
  stop("the downloaded PMID table has no 'pmId' column", call. = FALSE)
}
message("PMID info succesfully retrieved")
# 2. Loop several queries to PubMed and return in a data.frame
# All PMIDs from the imported table (taken directly, so an empty table gives
# an empty vector rather than the NA produced by indexing with 1:0).
index <- pmids$pmId
# The PubMed (free) API may give problems with large queries, so we'll prefer
# a shorter vector for this test: at most the first 50 PMIDs. head() does not
# pad with NA when fewer than 50 are available.
index50 <- head(index, 50)
library(RefManageR)
library(plyr)
message("connecting to the free PubMed API...")
# One ReadPubMed() query per PMID; ldply() row-binds the per-PMID data frames.
auth.pm <- ldply(index50, function(x) {
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
  # Collapse 'person' objects into a single comma-separated string so that
  # every field fits into one data.frame cell.
  tmp <- lapply(tmp, function(z) {
    if (inherits(z, "person")) paste0(z, collapse = ",") else z
  })
  data.frame(tmp, stringsAsFactors = FALSE)
})
message("abstract data successfully downloaded!")
# 3. Create a directory to write the abstracts.txt file into: (this folder can only contain this .txt file!)
corpus.dir <- file.path(wdir, "corpus1")
message(paste0("creating new directory: ", corpus.dir))
# Only create the directory when it is missing, so re-running the script does
# not warn about an existing folder.
if (!dir.exists(corpus.dir)) {
  dir.create(corpus.dir)
}
# 4. Extract abstracts to a .txt
# Drop missing abstracts: converting NA to text would write the literal word
# "NA" into the corpus, and it would then show up in the tagcloud.
text <- as.character(auth.pm$abstract)
text <- text[!is.na(text)]
# Write through the full path instead of changing the working directory, so a
# failure later in the script does not leave the session inside 'corpus1'.
abstracts.file <- file.path(corpus.dir, "abstracts.txt")
message(paste0("writing file: ", abstracts.file))
writeLines(text, abstracts.file)
# 5. Create tagcloud
library(tm)
library(wordcloud)
library(SnowballC)
message("constructing the tagcloud...")
# Import the text file(s) found in the corpus directory.
abstract <- Corpus(DirSource(corpus.dir))
# Transformations, applied one after another in the listed order.
cleaning.steps <- list(
  function(corp) tm_map(corp, stripWhitespace),
  function(corp) tm_map(corp, content_transformer(tolower)),
  function(corp) tm_map(corp, removeWords, stopwords("english")),
  # function(corp) tm_map(corp, stemDocument), # optional in this case
  function(corp) tm_map(corp, removeNumbers), # optional in this case
  function(corp) tm_map(corp, removePunctuation)
)
for (apply.step in cleaning.steps) {
  abstract <- apply.step(abstract)
}
# Tuning: extra words removed on top of the English stopwords, one pass per
# word.
tuning.words <- c("methods", "results", "conclusions", "conclusion", "whether", "due")
for (tuning.word in tuning.words) {
  abstract <- tm_map(abstract, removeWords, tuning.word)
}
# 6. Print image in a new folder: wordcloud
plot.dir <- file.path(wdir, "wordcloud")
message(paste0("creating new directory: ", plot.dir))
# Only create the directory when it is missing, so re-running the script does
# not warn about an existing folder.
if (!dir.exists(plot.dir)) {
  dir.create(plot.dir)
}
# The image is written through its full path, so no change of working
# directory is needed for the plot.
plot.file <- file.path(plot.dir, "wordcloud.png")
message(paste0("printing file: ", plot.file))
png(filename = plot.file, width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
png.device <- dev.cur()
# Close the png device even if wordcloud() fails; otherwise the device stays
# open and the output file is left unfinished.
invisible(tryCatch(
  wordcloud(abstract, scale = c(5, 0.5), max.words = 150, random.order = FALSE,
            rot.per = 0.35, use.r.layout = FALSE, colors = brewer.pal(8, "Dark2")),
  finally = dev.off(png.device)
))
# 7. Reset the working directory. This may not work when sourcing the script from Github...
setwd(wdir)