-
Notifications
You must be signed in to change notification settings - Fork 0
/
.Rhistory
65 lines (65 loc) · 2.4 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Exploratory pass over the en_US text files (blogs, news, twitter):
# file size, line counts, longest lines and a few keyword counts.

# Install tm only when it is missing, so re-running this script does not
# reinstall a package that is already available.
if (!nzchar(system.file(package = "tm"))) {
  install.packages("tm")
}

# Size of the blogs file in MiB.
file.info("data/en_US/en_US.blogs.txt")$size / 1024^2

# Read each file exactly once. encoding = "UTF-8" matches the blogs_text
# read further down, so all text in the session is marked the same way.
blogs <- readLines("data/en_US/en_US.blogs.txt", encoding = "UTF-8")
news <- readLines("data/en_US/en_US.news.txt", encoding = "UTF-8")
twitter <- readLines("data/en_US/en_US.twitter.txt", encoding = "UTF-8")

# Number of tweets and the longest line (in characters) of each source.
length(twitter)
max(nchar(blogs))
max(nchar(news))
max(nchar(twitter))

# Ratio of tweets containing "love" to tweets containing "hate"
# (case-sensitive substring match).
love_count <- sum(grepl("love", twitter))
hate_count <- sum(grepl("hate", twitter))
love_count / hate_count

# Tweets mentioning "biostats".
biostats <- grep("biostats", twitter)
twitter[biostats]

# Count tweets containing this exact sentence; fixed = TRUE because the
# pattern is a literal string, not a regular expression.
sum(grepl(
  "A computer once beat me at chess, but it was no match for me at kickboxing",
  twitter,
  fixed = TRUE
))

# Number of lines used for the corpus; create_corpus() reads this global.
sample.size <- 10000
blogs_text <- readLines(
  "data/en_US/en_US.blogs.txt",
  encoding = "UTF-8",
  n = sample.size
)
#' Build a cleaned tm corpus from a random sample of text lines.
#'
#' @param text_data Character vector, one document per element.
#' @param sample_size Maximum number of elements to sample. Defaults to the
#'   global `sample.size`, which this function previously read directly.
#' @return A `VCorpus` of the sampled documents with numbers and punctuation
#'   removed (intra-word dashes kept), lower-cased, and runs of whitespace
#'   collapsed. Returned visibly.
create_corpus <- function(text_data, sample_size = sample.size) {
  # sample() without replacement errors when asked for more elements than
  # exist, so never request more than text_data holds.
  n_docs <- min(sample_size, length(text_data))
  corpus_temp <- VCorpus(VectorSource(sample(text_data, n_docs)))
  corpus_temp <- tm_map(corpus_temp, removeNumbers)
  corpus_temp <- tm_map(
    corpus_temp,
    removePunctuation,
    preserve_intra_word_dashes = TRUE
  )
  corpus_temp <- tm_map(corpus_temp, content_transformer(tolower))
  # Collapse whitespace last so gaps left by removed numbers and
  # punctuation are cleaned up too.
  corpus_temp <- tm_map(corpus_temp, stripWhitespace)
  corpus_temp
}
# Attach the text-mining and plotting packages before building the corpus:
# create_corpus() calls VCorpus(), tm_map() and the tm transformations, so
# it fails with "could not find function" unless tm is attached first.
library(stringi)
library(ggplot2)
library(wordcloud)
library(NLP)
library(tm)

# Cleaned corpus built from a random sample of the blog lines.
blogs_corpus <- create_corpus(blogs_text)
#' Build a cleaned tm corpus from a random sample of text lines.
#'
#' @param text_data Character vector, one document per element.
#' @param sample_size Maximum number of elements to sample (default 10000,
#'   the value previously hard-coded inside the function).
#' @return A `VCorpus` of the sampled documents with numbers and punctuation
#'   removed (intra-word dashes kept), lower-cased, and runs of whitespace
#'   collapsed. Returned visibly.
create_corpus <- function(text_data, sample_size = 10000) {
  # sample() without replacement errors when asked for more elements than
  # exist, so never request more than text_data holds.
  n_docs <- min(sample_size, length(text_data))
  corpus_temp <- VCorpus(VectorSource(sample(text_data, n_docs)))
  corpus_temp <- tm_map(corpus_temp, removeNumbers)
  corpus_temp <- tm_map(
    corpus_temp,
    removePunctuation,
    preserve_intra_word_dashes = TRUE
  )
  corpus_temp <- tm_map(corpus_temp, content_transformer(tolower))
  # Collapse whitespace last so gaps left by removed numbers and
  # punctuation are cleaned up too.
  corpus_temp <- tm_map(corpus_temp, stripWhitespace)
  corpus_temp
}
# Rebuild the blog corpus with the create_corpus() definition now in scope.
blogs_corpus <- create_corpus(blogs_text)

# NOTE(review): absolute, machine-specific path; this only resolves on the
# original author's machine. Confirm the location before running elsewhere.
source("D:/Projects/Learning/DataScience-CapstoneProject/ngram_tokenizer.R")

# Attach each helper package once (the earlier attach/detach/attach cycle
# for base64enc had no net effect).
library("base64enc", lib.loc = "~/R/win-library/3.3")
library("bitops", lib.loc = "~/R/win-library/3.3")

# Install only the packages that are missing. Unconditional
# install.packages() would try to reinstall packages that are already
# attached in this session (e.g. tm, stringi).
local({
  wanted <- c(
    "RWeka", "dplyr", "doParallel", "tm", "wordcloud", "ggplot2",
    "DBI", "jsonlite", "packrat", "R6", "Rcpp", "stringi"
  )
  is_installed <- vapply(
    wanted,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1)
  )
  if (any(!is_installed)) {
    install.packages(wanted[!is_installed])
  }
})

library(dplyr)
library(doParallel)
library(stringi)
library(tm)
library(slam)
library(ggplot2)
library(wordcloud)

# Print the R version details (the bare `ver` typed before this was an
# undefined object and raised an error).
version