Skip to content

Commit

Permalink
docs - fixes #2 #9
Browse files Browse the repository at this point in the history
  • Loading branch information
dselivanov committed Mar 15, 2017
1 parent efeb22f commit c809d19
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 24 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
Package: LSHR
Type: Package
Title: Locality Sensitive Hashing In R
Title: Locality Sensitive Hashing in R
Version: 0.2.0
Date: 2017-03-08
Authors@R: person("Dmitriy", "Selivanov", role = c("aut", "cre"),
email = "[email protected]")
Maintainer: Dmitriy Selivanov <[email protected]>
Description: Fast and memory efficient duplicate-detection and near-neighbor
search in high-dimensional data.
Only Cosine and Jaccard distances supported at the moment
search in high-dimensional data. Only Cosine and Jaccard distances supported at the moment.
License: MIT + file LICENSE
URL: https://github.com/dselivanov/LSHR
BugReports: https://github.com/dselivanov/LSHR/issues
VignetteBuilder: knitr
SystemRequirements: C++11
Depends:
Matrix
Matrix,
methods
Imports:
data.table(>= 1.9.4),
data.table(>= 1.9.10),
magrittr (>= 1.5),
Rcpp (>= 0.10.3),
matrixStats,
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Generated by roxygen2: do not edit by hand

export(get_minhash_matrix)
export(get_s_curve)
export(get_similar_pairs)
export(get_similar_pairs_cosine)
import(Matrix)
import(data.table)
import(magrittr)
import(methods)
importFrom(Rcpp,evalCpp)
importFrom(ggplot2,aes)
importFrom(ggplot2,geom_line)
Expand Down
12 changes: 9 additions & 3 deletions R/LSHR.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#' @useDynLib LSHR
#' @import data.table
#' @import magrittr
#' @import methods
#' @import Matrix
#' @importFrom Rcpp evalCpp
#' @importFrom ggplot2 ggplot geom_line aes scale_color_discrete
Expand All @@ -13,7 +14,7 @@ NULL
#' @export
#' @name get_similar_pairs
#' @title Calculating candidate pairs using locality sensitive hashing.
#'
#' @description For a given matrix, generates the indices of similar rows.
#' @param X input matrix - sparse or dense
#' @param bands_number number of bands for LSH algorithm - tradeoff between precision and
#' number of false positive candidates. See \link{get_s_curve} for details.
Expand All @@ -36,10 +37,12 @@ NULL
#' library(LSHR)
#' library(Matrix)
#' data("movie_review")
#' it <- itoken(movie_review$review, preprocess_function = tolower, tokenizer = word_tokenizer)
#' it <- itoken(movie_review$review, preprocess_function = tolower,
#' tokenizer = word_tokenizer)
#' dtm <- create_dtm(it, hash_vectorizer())
#' dtm = as(dtm, "RsparseMatrix")
#' pairs = get_similar_pairs(dtm, bands_number = 4, rows_per_band = 32, distance = 'cosine', verbose = TRUE)
#' pairs = get_similar_pairs(dtm, bands_number = 4, rows_per_band = 32,
#' distance = 'cosine', verbose = TRUE)
#' pairs[order(-N)]
#' }

Expand All @@ -51,3 +54,6 @@ get_similar_pairs = function(X, bands_number, rows_per_band, distance = c("cosin
cosine = get_similar_pairs_cosine (X, bands_number, rows_per_band, seed, verbose, mc.cores = mc.cores, ...)
)
}


# Declare names that are referenced without a visible binding (for example
# `id1`, `id2` and `hash_val` are used as data.table columns in the self-join
# in R/cosine.R) so that R CMD check does not report them as undefined globals.
globalVariables(c("n_bands", "probability_become_candidate", "similarity", "n_rows_per_band", "band_id", "hash_val", "id1", "id2", "."))
1 change: 0 additions & 1 deletion R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ hashfun_2 <- function(vec) {
.Call('LSHR_hashfun_2', PACKAGE = 'LSHR', vec)
}

#' @export
get_minhash_matrix <- function(unique_shingles_length, hashfun_number, seed) {
# Thin R-side wrapper: forwards the three arguments unchanged, in this order,
# to the native routine 'LSHR_get_minhash_matrix' looked up in package 'LSHR'.
.Call('LSHR_get_minhash_matrix', PACKAGE = 'LSHR', unique_shingles_length, hashfun_number, seed)
}
Expand Down
17 changes: 13 additions & 4 deletions R/cosine.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
#' @rdname get_similar_pairs
#' @param mc.cores number of cores to use for bands processing - random projection and candidate selection
#' (this is an embarrassingly parallel task - it can be done independently for each band).
#' The most expensive operation is random projection. It is itself parallelized with OpenMP, so when \code{mc.cores > 1}
#' random projection becomes single threaded. \bold{Usually we recommend using \code{mc.cores = 1}} and relying on internal
#' OpenMP parallelism. Candidate selection, which is not trivially parallelizable, is usually not a bottleneck.
#' @param n_band_join controls how often we calculate in how many bands the signatures became the same. Since each bucket is independent, the obvious way is
#' to calculate this statistic at the end (by default), so we do it only once. On the other hand, we can calculate it
#' every \code{n_band_join} bands to save some memory (if this becomes an issue).
#' \bold{In most cases we recommend using the default value for this parameter}.
#' @param ... other parameters to \code{mclapply} (used if \code{mc.cores > 1} )
#' @export
get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L, verbose = FALSE,
mc.cores = 1, n_band_join = bands_number, ...) {
Expand All @@ -17,15 +28,13 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
pad_bits_matrix = NULL
if(rows_per_band < PACK_BITS)
pad_bits_matrix = matrix(0L, ncol = PACK_BITS - rows_per_band, nrow = N)
# hashfun_number = bands_number * rows_per_band
# allocate memory for result
n_chunks = ceiling(bands_number / n_band_join)
result = NULL
# suppressWarnings for case when "data length is not a multiple of split variable"
suppressWarnings({batch_indices = split(1:bands_number, rep(seq_len(n_chunks), each = n_band_join))})
for(bi in batch_indices) {

# sketches = parallel::mclapply(seq_len(bands_number), function(i) {
sketches = parallel::mclapply(bi, function(i) {
start = Sys.time()

Expand All @@ -39,7 +48,7 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
} else
{
# FIXME - will be faster to generate projections "on the fly" instead of generating `sample` and then multiplying
hm = matrix(runif(ncol(X) * rows_per_band, -2**16, 2**16), nrow = rows_per_band)
hm = matrix(stats::runif(ncol(X) * rows_per_band, -2**16, 2**16), nrow = rows_per_band)
x = tcrossprod(X, hm)
if(class(x) != 'matrix') x = as.matrix(x)
x = sign_bit(x)
Expand All @@ -53,7 +62,7 @@ get_similar_pairs_cosine <- function(X, bands_number, rows_per_band, seed = 1L,
setDT(dt)
dt[, id2 := id1]
# join with itself to generate all candidates pairs
dt = dt[dt, on = .(hash_val = hash_val, id1 > id2), nomatch = 0, allow.cartesian = T]
dt = dt[dt, on = list(hash_val = hash_val, id1 > id2), nomatch = 0, allow.cartesian = T]
pair_self_join_time = difftime(Sys.time(), start, units = 'secs')
# caclulate how many times each pair became candidate - local reduce
start = Sys.time()
Expand Down
3 changes: 1 addition & 2 deletions R/jaccard.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

get_similar_pairs_jaccard = function(X, bands_number, rows_per_band, seed = 1L, verbose = TRUE) {

hash_matrix = get_minhash_matrix(unique_shingles_length = ncol(X),
Expand Down Expand Up @@ -39,7 +38,7 @@ minhashing <- function(dtm, hash_matrix, ...) {
if (!inherits(dtm, 'dgCMatrix'))
dtm <- as(dtm, 'dgCMatrix')

dtm <- LSHR:::to_lil( t(dtm) )
dtm <- to_lil( t(dtm) )

minhash_signatures <-
parallel::mcmapply(
Expand Down
14 changes: 11 additions & 3 deletions R/s_curve.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# df <- get_s_curve(2400, n_bands_min = 40, n_rows_per_band_min = 20)
#' @name get_s_curve
#' @title Calculates/plots "S-curve"
#' @description Calculates the "S-curve". This is used to pick the number of hash functions and bands that
#' provide a tradeoff between precision and recall for approximate near neighbor search.
#' @param number_hashfun guess about number of hash functions to use
#' @param n_bands_min don't plot s-curves for number of bands less than \code{n_bands_min}
#' @param n_rows_per_band_min don't plot s-curves for number of rows less than \code{n_rows_per_band_min}
#' @param plot logical, whether to plot s-curves.
#' @export
get_s_curve <- function(number_hashfun,
n_bands_min = 1,
n_rows_per_band_min = 1,
s = seq(0.5, 1, 0.01),
plot = TRUE) {
plot = interactive()) {

s = seq(0.5, 1, 0.005)

bands_number <- divisors(number_hashfun)
rows_per_band <- number_hashfun / bands_number
Expand Down
23 changes: 23 additions & 0 deletions man/get_s_curve.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 25 additions & 3 deletions man/get_similar_pairs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions src/hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ Rcpp::IntegerVector hashfun_2(IntegerVector vec) {
return res;
}

//' @export
// [[Rcpp::export]]
IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashfun_number, uint32_t seed) {
IntegerMatrix res_matrix( hashfun_number, unique_shingles_length);
Expand All @@ -47,7 +46,7 @@ IntegerVector get_minhash_matrix(uint32_t unique_shingles_length, uint32_t hashf
// http://stackoverflow.com/questions/24676237/generating-random-hash-functions-for-lsh-minhash-algorithm
// http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
for (uint32_t j = 0; j < hashfun_number; j++) {
res_matrix(j, i) = h1 + (j + 1) * h2 + j * j;
res_matrix(j, i) = atom_hashfun_1(h1 + h2 + j);
}
}
return res_matrix;
Expand Down

0 comments on commit c809d19

Please sign in to comment.