Skip to content

Commit

Permalink
various updates
Browse files Browse the repository at this point in the history
  • Loading branch information
bquast committed Dec 3, 2014
1 parent 3abba02 commit de1ca23
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 11 deletions.
10 changes: 6 additions & 4 deletions construct_frequency_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ library(RWeka)
library(dplyr)

# load the sample data
load("sample.RData")
load("sample_data.RData")

# ngram tokeniser
n <- 2L
Expand All @@ -17,7 +17,7 @@ n <- 3L
trigram_token <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))

# check length function
lengthIs <- function(n) function(x) length(x)==n
length_is <- function(n) function(x) length(x)==n

# construct single corpus from sample data
sample_blogs %>%
Expand Down Expand Up @@ -51,6 +51,8 @@ tdm_unigram %>%
as.matrix %>%
rowSums -> freq_unigram

# write all unigrams to a list
# in order to create uniform levels of factors
unigram_levels <- unique(tdm_unigram$dimnames$Terms)

# trigram Term-Document Matrix
Expand All @@ -77,7 +79,7 @@ freq_trigram %>%

# filter out those of less than three columns
freq_trigram <- do.call(rbind,
Filter( lengthIs(3),
Filter( length_is(3),
freq_trigram )
)

Expand All @@ -87,4 +89,4 @@ df_trigram <- data.frame(X1 = factor(freq_trigram[,1], levels = unigram_levels),
Y = factor(freq_trigram[,3], levels = unigram_levels) )

# save data frame
save( df_trigram, file = "df_trigram.RData")
save( df_trigram, unigram_levels, file = "df_trigram.RData")
Binary file modified df_trigram.RData
Binary file not shown.
5 changes: 5 additions & 0 deletions import_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
# [email protected]
# ------------------

# load the stringi library for text manipulation
library(stringi)

# inspect the data
list.files("final")
list.files("final/en_US")
Expand All @@ -19,6 +22,8 @@ rm(con)

# drop characters that cannot be converted to UTF-8 (sub = "" removes them)
twitter <- iconv(twitter, from = "latin1", to = "UTF-8", sub = "")

# normalise typographic quotes to plain ASCII quotes.
# The double-quote pass runs first: the single-quote pass rewrites every
# backtick, so a doubled backtick (``) must be converted to " before that.
# U+201F is written with its backslash escape; a bare "u201f" would only
# match the literal five-character text.
twitter <- stri_replace_all_regex(twitter, "\u201c|\u201d|\u201f|``", '"')
twitter <- stri_replace_all_regex(twitter, "\u2019|`", "'")

# save the data to .RData files
save(blogs, file="blogs.RData")
Expand Down
6 changes: 3 additions & 3 deletions predict_word.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
library(e1071)

# load the model
load("model.RData")
load("tri_naiveBayes.RData")

# create a test string
test_string <- "accused of"
Expand All @@ -15,10 +15,10 @@ test_string <- "accused of"
test_split <- strsplit(test_string, split = " " )

# encode as a factor using the same levels
test_factor <- factor(unlist(test_split), levels=news_levels)
test_factor <- factor(unlist(test_split), levels=unigram_levels)

# transform to data frame
test_df <- data.frame(X1 = test_factor[1], X2 = test_factor[2])

# estimate using the model
predict(tri.naiveBayes, test_df)
predict(tri_naiveBayes, test_df)
6 changes: 3 additions & 3 deletions sample_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ load("news.RData")
load("twitter.RData")

# sample data (10,000 of each)
sample_blogs <- sample(blogs, 100)
sample_news <- sample(news, 100)
sample_twitter <- sample(twitter, 100)
sample_blogs <- sample(blogs, 10000)
sample_news <- sample(news, 10000)
sample_twitter <- sample(twitter, 10000)

# save samples
save(sample_blogs, sample_news, sample_twitter, file= "sample_data.RData")
2 changes: 1 addition & 1 deletion train_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ tri_naiveBayes <- naiveBayes( Y ~ X1 + X2 ,
df_trigram )

# save the model
save(tri_naiveBayes, file = "tri_naiveBayes.RData")
save(tri_naiveBayes, unigram_levels, file = "tri_naiveBayes.RData")
Binary file modified tri_naiveBayes.RData
Binary file not shown.

0 comments on commit de1ca23

Please sign in to comment.