various updates

bquast · Dec 3, 2014 · de1ca23 · de1ca23
1 parent 3abba02
commit de1ca23
Show file tree

Hide file tree

Showing 7 changed files with 18 additions and 11 deletions.
diff --git a/construct_frequency_table.R b/construct_frequency_table.R
@@ -8,7 +8,7 @@ library(RWeka)
 library(dplyr)
 
 # load the sample data
-load("sample.RData")
+load("sample_data.RData")
 
 # n-gram tokenizer
 n <- 2L
@@ -17,7 +17,7 @@ n <- 3L
 trigram_token <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
 
 # check length function
-lengthIs <- function(n) function(x) length(x)==n
+length_is <- function(n) function(x) length(x)==n
 
 # construct a single corpus from the sample data
 sample_blogs %>%
@@ -51,6 +51,8 @@ tdm_unigram %>%
   as.matrix %>%
   rowSums -> freq_unigram
 
+# collect all distinct unigrams in a vector
+# so that every factor can be built with the same levels
 unigram_levels <- unique(tdm_unigram$dimnames$Terms)
 
 # trigram Term-Document Matrix
@@ -77,7 +79,7 @@ freq_trigram %>%
 
 # keep only the entries that have exactly three elements
 freq_trigram <- do.call(rbind, 
-                        Filter( lengthIs(3),
+                        Filter( length_is(3),
                                 freq_trigram )
                         )
 
@@ -87,4 +89,4 @@ df_trigram <- data.frame(X1 = factor(freq_trigram[,1], levels = unigram_levels),
                          Y  = factor(freq_trigram[,3], levels = unigram_levels) )
 
 # save data frame
-save( df_trigram, file = "df_trigram.RData")
+save( df_trigram, unigram_levels, file = "df_trigram.RData")
diff --git a/df_trigram.RData b/df_trigram.RData
diff --git a/import_data.R b/import_data.R
@@ -3,6 +3,9 @@
 # [email protected]
 # ------------------
 
+# load the stringi library for text manipulation
+library(stringi)
+
 # inspect the data
 list.files("final")
 list.files("final/en_US")
@@ -19,6 +22,8 @@ rm(con)
 
 # drop non UTF-8 characters
 twitter <- iconv(twitter, from = "latin1", to = "UTF-8", sub="")
+# normalise double quotes first: once single backticks have been replaced
+# below, a doubled backtick (``) could no longer be matched here
+twitter <- stri_replace_all_regex(twitter, "\u201c|\u201d|\u201f|``", '"')
+# normalise the right single quotation mark and any remaining backtick to '
+twitter <- stri_replace_all_regex(twitter, "\u2019|`", "'")
 
 # save the data to .RData files
 save(blogs, file="blogs.RData")

diff --git a/predict_word.R b/predict_word.R
@@ -6,7 +6,7 @@
 library(e1071)
 
 # load the model
-load("model.RData")
+load("tri_naiveBayes.RData")
 
 # create a test string
 test_string <- "accused of"
@@ -15,10 +15,10 @@ test_string <- "accused of"
 test_split <- strsplit(test_string, split = " " )
 
 # encode as a factor using the same levels
-test_factor <- factor(unlist(test_split), levels=news_levels)
+test_factor <- factor(unlist(test_split), levels=unigram_levels)
 
 # transform to data frame
 test_df <- data.frame(X1 = test_factor[1], X2 = test_factor[2])
 
 # estimate using the model
-predict(tri.naiveBayes, test_df)
+predict(tri_naiveBayes, test_df)
diff --git a/sample_data.R b/sample_data.R
@@ -8,9 +8,9 @@ load("news.RData")
 load("twitter.RData")
 
 # sample data (10,000 of each)
-sample_blogs   <- sample(blogs, 100)
-sample_news    <- sample(news, 100)
-sample_twitter <- sample(twitter, 100)
+sample_blogs   <- sample(blogs, 10000)
+sample_news    <- sample(news, 10000)
+sample_twitter <- sample(twitter, 10000)
 
 # save samples
 save(sample_blogs, sample_news, sample_twitter, file= "sample_data.RData")
diff --git a/train_model.R b/train_model.R
@@ -13,4 +13,4 @@ tri_naiveBayes <- naiveBayes( Y ~ X1 + X2 ,
                               df_trigram )
 
 # save the model
-save(tri_naiveBayes, file = "tri_naiveBayes.RData")
+save(tri_naiveBayes, unigram_levels, file = "tri_naiveBayes.RData")
diff --git a/tri_naiveBayes.RData b/tri_naiveBayes.RData