-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathjaccard.R
67 lines (54 loc) · 2.36 KB
/
jaccard.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
get_similar_pairs_jaccard = function(X, bands_number, rows_per_band, seed = 1L, verbose = TRUE) {
hash_matrix = get_minhash_matrix(unique_shingles_length = ncol(X),
hashfun_number = rows_per_band * bands_number,
seed = seed)
signature_matrix = minhashing(X, hash_matrix); rm(hash_matrix)
sm_nrow = nrow(signature_matrix)
sm_ncol = ncol(signature_matrix)
start = Sys.time()
hashed_signatures = hash_signatures(m = signature_matrix, bands_number, rows_per_band = sm_nrow / bands_number)
if (verbose) message( sprintf( "hashing in %.3f sec", difftime(Sys.time(), start, units = 'secs') ) )
dim(hashed_signatures) = NULL
start = Sys.time()
dt = data.table(hash_val = hashed_signatures,
band_id = rep(seq_len(bands_number), each = sm_ncol),
id1 = rep(seq_len(sm_ncol),times = bands_number ))
dt[, id2 := id1]
start = Sys.time()
# join with itself to receive candidates pairs
dt = dt[dt, on = .(hash_val = hash_val, band_id = band_id, id1 < id2), nomatch = 0, allow.cartesian = T]
if (verbose) print( sprintf( "generating pairs %.3f sec", difftime(Sys.time(), start, units = 'secs') ) )
start = Sys.time()
# caclulate how many times each pair became candidate
dt = dt[, .N, keyby = .(id1, id2)]
if (verbose) print( sprintf( "finding unique pairs %.3f sec", difftime(Sys.time(), start, units = 'secs') ) )
dt
}
# vectorized version of minhash algorithm with many hash functions
minhashing <- function(dtm, hash_matrix, ...) {
if (!inherits(dtm, 'dgCMatrix'))
dtm <- as(dtm, 'dgCMatrix')
dtm <- to_lil( t(dtm) )
minhash_signatures <-
parallel::mcmapply(
function(nnz, hm) {
matrixStats::rowMins(hm[, nnz, drop = FALSE])
},
dtm,
MoreArgs = list(hm = hash_matrix), ...)
minhash_signatures
}
# @name to_lil
# @title Converts 'dgCMatrix' to 'lil' format
# @description Converts 'dgCMatrix' (or coercible to 'dgCMatrix') to 'lil'
# format. This function is specially useful when you want to do LSH for
# "jaccard" similarity (which current minhashing algorithm work with
# List-of-Lists input)
# @param dtm Document-Term matrix
to_lil <- function(dtm) {
Map(f = function(i1,i2, ind) ind[i1:i2],
dtm@p[-length(dtm@p)] + 1L,
dtm@p[-1L],
MoreArgs = list(ind = dtm@i + 1L),
USE.NAMES = F)
}