docs - fixes #2 #9

dselivanov · Mar 15, 2017 · c809d19 · c809d19
1 parent efeb22f
commit c809d19
Show file tree

Hide file tree

Showing 10 changed files with 89 additions and 24 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,23 +1,23 @@
 Package: LSHR
 Type: Package
-Title: Locality Sensitive Hashing In R
+Title: Locality Sensitive Hashing in R
 Version: 0.2.0
 Date: 2017-03-08
 Authors@R: person("Dmitriy", "Selivanov", role = c("aut", "cre"),
     email = "[email protected]")
 Maintainer: Dmitriy Selivanov <[email protected]>
 Description: Fast and memory efficient duplicate-detection and near-neighbor
-    search in high-dimensional data. 
-    Only Cosine and Jaccard distances supported at the moment
+    search in high-dimensional data. Only Cosine and Jaccard distances supported at the moment.
 License: MIT + file LICENSE
 URL: https://github.com/dselivanov/LSHR
 BugReports: https://github.com/dselivanov/LSHR/issues
 VignetteBuilder: knitr
 SystemRequirements: C++11
 Depends:
-    Matrix
+    Matrix,
+    methods
 Imports:
-    data.table(>= 1.9.4),
+    data.table(>= 1.9.10),
     magrittr (>= 1.5),
     Rcpp (>= 0.10.3),
     matrixStats,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,12 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
-export(get_minhash_matrix)
 export(get_s_curve)
 export(get_similar_pairs)
 export(get_similar_pairs_cosine)
 import(Matrix)
 import(data.table)
 import(magrittr)
+import(methods)
 importFrom(Rcpp,evalCpp)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,geom_line)

diff --git a/R/LSHR.R b/R/LSHR.R
@@ -5,6 +5,7 @@
 #' @useDynLib LSHR
 #' @import data.table
 #' @import magrittr
+#' @import methods
 #' @import Matrix
 #' @importFrom Rcpp evalCpp
 #' @importFrom ggplot2 ggplot geom_line aes scale_color_discrete
@@ -13,7 +14,7 @@ NULL
 #' @export
 #' @name get_similar_pairs
 #' @title Calculating candidate pairs using locality sensitive hashing.
-#'
+#' @description For a given matrix, generates indices of similar rows.
 #' @param X input matrix - sparse or dense
 #' @param bands_number number of bands for LSH algorithm - tradeoff between precision and
 #' number of false positive candidates. See \link{get_s_curve} for details.
@@ -36,10 +37,12 @@ NULL
 #' library(LSHR)
 #' library(Matrix)
 #' data("movie_review")
-#' it <- itoken(movie_review$review, preprocess_function = tolower, tokenizer = word_tokenizer)
+#' it <- itoken(movie_review$review, preprocess_function = tolower,
+#' tokenizer = word_tokenizer)
 #' dtm <- create_dtm(it, hash_vectorizer())
 #' dtm = as(dtm, "RsparseMatrix")
-#' pairs = get_similar_pairs(dtm, bands_number = 4, rows_per_band = 32, distance = 'cosine', verbose = TRUE)
+#' pairs = get_similar_pairs(dtm, bands_number = 4, rows_per_band = 32,
+#' distance = 'cosine', verbose = TRUE)
 #' pairs[order(-N)]
 #' }
 
@@ -51,3 +54,6 @@ get_similar_pairs = function(X, bands_number, rows_per_band, distance = c("cosin
          cosine =  get_similar_pairs_cosine (X, bands_number, rows_per_band, seed, verbose, mc.cores = mc.cores, ...)
   )
 }
+
+
+globalVariables(c("n_bands", "probability_become_candidate", "similarity", "n_rows_per_band", "band_id", "hash_val", "id1", "id2", "."))
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -9,7 +9,6 @@ hashfun_2 <- function(vec) {
     .Call('LSHR_hashfun_2', PACKAGE = 'LSHR', vec)
 }
 
-#' @export
 get_minhash_matrix <- function(unique_shingles_length, hashfun_number, seed) {
     .Call('LSHR_get_minhash_matrix', PACKAGE = 'LSHR', unique_shingles_length, hashfun_number, seed)
 }

diff --git a/R/cosine.R b/R/cosine.R
@@ -1,3 +1,14 @@
+#' @rdname get_similar_pairs
+#' @param mc.cores number of cores to use for bands processing - random projection and candidate selection
+#' (this is an embarrassingly parallel task - it can be done independently for each band).
+#' The most expensive operation is random projection. It is itself parallelized with OpenMP, so when \code{mc.cores > 1}
+#' random projection becomes single threaded. \bold{Usually we recommend using \code{mc.cores = 1}} and relying on internal
+#' OpenMP parallelism. Candidate selection, which is not trivially parallelizable, is not usually a bottleneck.
+#' @param n_band_join calculate in how many bands signatures became the same. Since each bucket is independent, the obvious way is
+#' to calculate this statistic at the end (by default), so we will do it only once. On the other hand, we can calculate it
+#' every \code{n_band_join} bands so we can save some memory (if this becomes an issue).
+#' \bold{In most cases we recommend using the default value for this parameter}.
+#' @param ... other parameters to \code{mclapply} (used if \code{mc.cores > 1} )
 #' @export
 get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L, verbose = FALSE,
                                      mc.cores = 1, n_band_join = bands_number, ...) {
@@ -17,15 +28,13 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
   pad_bits_matrix = NULL
   if(rows_per_band < PACK_BITS)
     pad_bits_matrix = matrix(0L, ncol = PACK_BITS - rows_per_band, nrow = N)
-  # hashfun_number = bands_number * rows_per_band
   # allocate memory for result
   n_chunks = ceiling(bands_number / n_band_join)
   result = NULL
   # suppressWarnings for case when "data length is not a multiple of split variable"
   suppressWarnings({batch_indices = split(1:bands_number, rep(seq_len(n_chunks), each = n_band_join))})
   for(bi in batch_indices) {
 
-    # sketches = parallel::mclapply(seq_len(bands_number), function(i) {
     sketches = parallel::mclapply(bi, function(i) {
       start = Sys.time()
 
@@ -39,7 +48,7 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
         } else
         {
           # FIXME - will be faster to generate projections "on the fly" instead of generating `sample` and then multiplying
-          hm = matrix(runif(ncol(X) * rows_per_band, -2**16, 2**16), nrow = rows_per_band)
+          hm = matrix(stats::runif(ncol(X) * rows_per_band, -2**16, 2**16), nrow = rows_per_band)
           x = tcrossprod(X, hm)
           if(class(x) != 'matrix') x = as.matrix(x)
           x = sign_bit(x)
@@ -53,7 +62,7 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
       setDT(dt)
       dt[, id2 := id1]
       # join with itself to generate all candidates pairs
-      dt = dt[dt, on = .(hash_val = hash_val, id1 > id2), nomatch = 0, allow.cartesian = T]
+      dt = dt[dt, on = list(hash_val = hash_val, id1 > id2), nomatch = 0, allow.cartesian = T]
       pair_self_join_time = difftime(Sys.time(), start, units = 'secs')
       # calculate how many times each pair became a candidate - local reduce
       start = Sys.time()

diff --git a/R/jaccard.R b/R/jaccard.R
@@ -1,4 +1,3 @@
-
 get_similar_pairs_jaccard = function(X, bands_number, rows_per_band, seed = 1L, verbose = TRUE) {
 
   hash_matrix = get_minhash_matrix(unique_shingles_length = ncol(X),
@@ -39,7 +38,7 @@ minhashing <- function(dtm, hash_matrix, ...) {
   if (!inherits(dtm, 'dgCMatrix'))
     dtm <- as(dtm, 'dgCMatrix')
 
-  dtm <- LSHR:::to_lil( t(dtm) )
+  dtm <- to_lil( t(dtm) )
 
   minhash_signatures <-
     parallel::mcmapply(

diff --git a/R/s_curve.R b/R/s_curve.R
@@ -1,10 +1,18 @@
-# df <- get_s_curve(2400, n_bands_min = 40, n_rows_per_band_min = 20)
+#' @name get_s_curve
+#' @title Calculates/plots "S-curve"
+#' @description Calculates "S-curve". This is used to pick the number of hash functions and bands which
+#' provide a tradeoff between precision and recall for approximate near neighbor search.
+#' @param number_hashfun guess about number of hash functions to use
+#' @param n_bands_min don't plot s-curves for number of bands less than \code{n_bands_min}
+#' @param n_rows_per_band_min don't plot s-curves for number of rows less than \code{n_rows_per_band_min}
+#' @param plot logical, whether to plot s-curves.
 #' @export
 get_s_curve <- function(number_hashfun,
                         n_bands_min = 1,
                         n_rows_per_band_min = 1,
-                        s = seq(0.5, 1, 0.01),
-                        plot = TRUE) {
+                        plot = interactive()) {
+
+  s = seq(0.5, 1, 0.005)
 
   bands_number <- divisors(number_hashfun)
   rows_per_band <- number_hashfun / bands_number

diff --git a/man/get_s_curve.Rd b/man/get_s_curve.Rd
diff --git a/man/get_similar_pairs.Rd b/man/get_similar_pairs.Rd
diff --git a/src/hash.cpp b/src/hash.cpp
@@ -34,7 +34,6 @@ Rcpp::IntegerVector hashfun_2(IntegerVector vec) {
   return res;
 }
 
-//' @export
 // [[Rcpp::export]]
 IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashfun_number, uint32_t seed) {
   IntegerMatrix res_matrix( hashfun_number, unique_shingles_length);
@@ -47,7 +46,7 @@ IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashf
     // http://stackoverflow.com/questions/24676237/generating-random-hash-functions-for-lsh-minhash-algorithm
     // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
     for (uint32_t j = 0; j < hashfun_number; j++) {
-      res_matrix(j, i) = h1 + (j + 1) * h2 + j * j;
+      res_matrix(j, i) = atom_hashfun_1(h1 + h2 + j);
     }
   }
   return res_matrix;