pmid.tagcloud.R

#########################################################
#### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
#########################################################

# GNU-GPL license
# Author: Mareviv (https://talesofr.wordpress.com)

# Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.

# First, automagically install needed libraries:
list.of.packages <- c("slam") # installing 'slam' gives error in OSX Yosemite/El Capitan. This is an attempt to fix it, specifying 'type="binary"'.
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos='https://cran.rstudio.com/', type="binary")

list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos='https://cran.rstudio.com/')

# Get and store the working directory
wdir <- getwd()

            
# 1. Import PMIDs
message("retrieving PMIDs info...")
library(RCurl)
urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
pmids <- read.csv(textConnection(urli))
message("PMID info succesfully retrieved")
            
            
# 2. Loop several queries to PubMed and return in a data.frame
index <- pmids$pmId[1:length(pmids$pmId)]  
# The PubMed (free) API may give problems with large queries, so we'll prefer a shorter vector for this test: 
index50 <- pmids$pmId[1:50]

library(RefManageR)
library(plyr)
message("connecting to the free PubMed API...")
auth.pm <- ldply(index50, function(x){
            tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
            tmp <- lapply(tmp, function(z) if(is(z, "person")) paste0(z, collapse = ",") else z)
            data.frame(tmp, stringsAsFactors = FALSE)
           })
message("abstract data successfully downloaded!")

                        
# 3. Create a directory to write the abstracts.txt file into: (this folder can only contain this .txt file!)       
corpus.dir <- paste(wdir, "corpus1", sep="/")
message(paste("creating new directory: ", corpus.dir, sep=""))
dir.create(corpus.dir)
setwd(corpus.dir)

                        
# 4. Extract abstracts to a .txt
text <- paste(auth.pm$abstract)
message(paste("writing file: ", corpus.dir, "/abstracts.txt", sep=""))
writeLines(text, "abstracts.txt")

                        
# 5. Create tagcloud
library(tm)
library(wordcloud)
library(SnowballC)

message("constructing the tagcloud...")
abstract <- Corpus (DirSource(corpus.dir)) # import text file in this directory
abstract <- tm_map(abstract, stripWhitespace) # transformations
abstract <- tm_map(abstract, content_transformer(tolower))
abstract <- tm_map(abstract, removeWords, stopwords("english"))
# abstract <- tm_map(abstract, stemDocument) # optional in this case
abstract <- tm_map(abstract, removeNumbers) # optional in this case
abstract <- tm_map(abstract, removePunctuation)
# tuning
abstract <- tm_map(abstract, removeWords, "methods")
abstract <- tm_map(abstract, removeWords, "results")
abstract <- tm_map(abstract, removeWords, "conclusions")
abstract <- tm_map(abstract, removeWords, "conclusion")
abstract <- tm_map(abstract, removeWords, "whether")
abstract <- tm_map(abstract, removeWords, "due")


# 6. Print image in a new folder: wordcloud
plot.dir <- paste(wdir, "wordcloud", sep="/")
message(paste("creating new directory: ", plot.dir, sep=""))
dir.create(plot.dir)
setwd(plot.dir)
message(paste("printing file: ", plot.dir, "/wordcloud.png", sep=""))
png(file = "wordcloud.png", width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
wordcloud(abstract, scale=c(5,0.5), max.words=150, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
dev.off()


# 7. Reset the working directory. This may not work when sourcing the script from Github...
setwd(wdir)