Skip to content

Commit

Permalink
lsh test,fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
sgreben committed Oct 9, 2024
1 parent 8633573 commit 8055df1
Show file tree
Hide file tree
Showing 12 changed files with 287 additions and 175 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.out
*.txt
*.test
testdata
4 changes: 4 additions & 0 deletions internal/heap/heap.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ func MakeMax[T int | uint64](distances []int, value []T) Max[T] {
}
}

// Len returns the heap's current length, as tracked in me.len.
func (me *Max[T]) Len() int {
return me.len
}

func (me *Max[T]) swap(i, j int) {
me.distances[i], me.distances[j] = me.distances[j], me.distances[i]
me.values[i], me.values[j] = me.values[j], me.values[i]
Expand Down
4 changes: 4 additions & 0 deletions internal/heap/heap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ func TestNeighborHeapPushPop(t *testing.T) {

heap.PushPop(25, 4)

if heap.Len() != 3 {
t.Error("Expected length not to change")
}

// Check if heap is reordered correctly
expectedDistances := []int{25, 20, 10,
30,
Expand Down
82 changes: 46 additions & 36 deletions lsh/hashes.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,24 @@ func (me HashFunc) Hash(data []uint64, out []uint64) {
}
}

// HashCompose is a [Hash] that applies its member hashes in sequence: the
// output of each hash is the input of the next. An empty HashCompose is the
// identity function.
type HashCompose []Hash

// Hash1 applies the member hashes, in order, to a single uint64 value.
func (me HashCompose) Hash1(x uint64) uint64 {
	for _, h := range me {
		x = h.Hash1(x)
	}
	return x
}

// Hash applies the member hashes, in order, to a slice of uint64 values and
// leaves the final result in out.
//
// The first member reads data and writes out; every later member is called
// with out as both input and output. With no members, data is copied to out
// unchanged so that Hash agrees with Hash1.
func (me HashCompose) Hash(data []uint64, out []uint64) {
	if len(me) == 0 {
		copy(out, data)
		return
	}
	for _, h := range me {
		h.Hash(data, out)
		data = out
	}
}

// NoHash is the identity function. Used as a dummy [Hash] for testing.
type NoHash struct{}

Expand All @@ -35,6 +53,17 @@ func (me NoHash) Hash(data []uint64, out []uint64) {
copy(out, data)
}

// ConstantHash is a constant 0 function. Used as a dummy [Hash] for testing.
type ConstantHash struct{}

// Hash1 returns 0 for every input.
func (me ConstantHash) Hash1(_ uint64) uint64 { return 0 }

// Hash sets every element of the output slice to 0. The input slice is ignored.
func (me ConstantHash) Hash(_ []uint64, out []uint64) {
	clear(out)
}

// MinHashes is a concatenation of [MinHash]es
type MinHashes []MinHash

Expand Down Expand Up @@ -117,47 +146,12 @@ func (me MinHash) Hash(data []uint64, out []uint64) {
for j, m := range me {
if (d & m) != 0 {
out[i] = uint64(j)
break
}
}
}
}

// boxBlur3LUT maps a 3-bit window to its majority bit: the entry is 1 when at
// least two of the three bits are set, and 0 otherwise.
var boxBlur3LUT = [8]uint64{
	0, // 0b000
	0, // 0b001
	0, // 0b010
	1, // 0b011
	0, // 0b100
	1, // 0b101
	1, // 0b110
	1, // 0b111
}

// boxBlur3 returns x with every bit replaced by the majority of that bit and
// its two direct neighbors. Neighbors beyond either end of the word count as 0.
func boxBlur3(x uint64) uint64 {
	// Output bit 0 has no lower neighbor: x&0b11 is its window with the
	// missing neighbor taken as 0.
	b := boxBlur3LUT[x&0b11]
	// On entry to iteration i the low three bits of x are input bits
	// i-1, i and i+1, which decide output bit i. All of bits 1..63 must be
	// produced; for bit 63 the shifted-in zero stands in for the missing
	// upper neighbor.
	for i := 1; i < 64; i++ {
		b |= boxBlur3LUT[x&0b111] << i
		x >>= 1
	}
	return b
}

// BoxBlur3 is a hash that smooths a bit vector with a 3-bit wide box blur:
// every value is passed through boxBlur3.
type BoxBlur3 struct{}

// Hash1 returns the blurred form of a single uint64 value.
func (me BoxBlur3) Hash1(x uint64) uint64 {
	blurred := boxBlur3(x)
	return blurred
}

// Hash writes the blurred form of each element of data to the same position in out.
func (me BoxBlur3) Hash(data []uint64, out []uint64) {
	for idx := range data {
		out[idx] = boxBlur3(data[idx])
	}
}

// Blur hashes values based on thresholding the number of bits in common with the given bitmasks.
// For bitmasks of consecutive set bits, this is in effect a "blur" of the bit vector.
type Blur struct {
Expand Down Expand Up @@ -254,3 +248,19 @@ func RandomBitSampleR(numBitsSet int, rand *rand.Rand) BitSample {
}
return BitSample(out)
}

// BoxBlur generates a Blur that averages groups of neighboring bits for each bit in the output.
//
// The result has 64 masks. Starting at index radius and advancing by step
// while the index stays below 64-radius, each visited mask is a run of radius
// consecutive set bits; the first run starts at bit 0 and each following run
// is shifted left by step. All other masks are left zero. The threshold is
// radius/2 + 1.
//
// A negative radius is treated as 0 and a step below 1 is treated as 1, so
// the call always terminates instead of panicking or looping forever.
func BoxBlur(radius int, step int) Blur {
	if radius < 0 {
		radius = 0
	}
	if step < 1 {
		step = 1
	}
	mask := uint64(1<<radius) - 1
	masks := make([]uint64, 64)
	for i := radius; i < 64-radius; i += step {
		masks[i] = mask
		mask <<= step
	}
	return Blur{
		Masks:     masks,
		Threshold: radius/2 + 1,
	}
}
144 changes: 74 additions & 70 deletions lsh/hashes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,28 @@ import (

"github.com/keilerkonzept/bitknn/internal/testrandom"
"github.com/keilerkonzept/bitknn/lsh"
"pgregory.net/rapid"
)

func TestNoHash(t *testing.T) {
var h lsh.NoHash
query := uint64(0x12345)
data := []uint64{0x12345, 0x54321}
out := make([]uint64, len(data))
if h.Hash1(query) != query {
t.Fatal()
}
h.Hash(data, out)
if !reflect.DeepEqual(data, out) {
t.Fatal()
}
// TestHashCompose checks that a HashCompose of two hashes produces the same
// results as applying the two hashes one after the other, for both Hash1 and Hash.
func TestHashCompose(t *testing.T) {
	h1 := lsh.RandomBlurR(3, 20, testrandom.Source)
	h2 := lsh.RandomMinHash()
	h := lsh.HashCompose{h1, h2}
	rapid.Check(t, func(t *rapid.T) {
		q := rapid.Uint64().Draw(t, "q")
		qs := rapid.SliceOf(rapid.Uint64()).Draw(t, "qs")
		if got, want := h.Hash1(q), h2.Hash1(h1.Hash1(q)); got != want {
			t.Fatalf("HashCompose.Hash1(%#x) = %#x; want %#x", q, got, want)
		}
		want := make([]uint64, len(qs))
		got := make([]uint64, len(qs))
		h.Hash(qs, got)
		h1.Hash(qs, want)
		h2.Hash(want, want) // second hash runs in place on the first hash's output
		if !reflect.DeepEqual(got, want) {
			t.Fatalf("HashCompose.Hash(%v) = %v; want %v", qs, got, want)
		}
	})
}

func TestMinHash(t *testing.T) {
Expand Down Expand Up @@ -140,6 +148,36 @@ func TestBlur(t *testing.T) {
}
})

t.Run("BoxBlur", func(t *testing.T) {
trials := 1000
yCloser := 0
zCloser := 0
for range trials {
n := testrandom.Source.IntN(32)
dist := func(x, y uint64) int {
return bits.OnesCount64(x ^ y)
}
flipNBits := uint64(lsh.RandomBitSampleR(n, testrandom.Source))
flip2NBits := uint64(lsh.RandomBitSampleR(2*n, testrandom.Source))
x := testrandom.Query()
y := x ^ flipNBits
z := x ^ flip2NBits
h := lsh.BoxBlur(3, 3)
dy := dist(h.Hash1(x), h.Hash1(y))
dz := dist(h.Hash1(x), h.Hash1(z))
if dy < dz {
yCloser++
}
if dy > dz {
zCloser++
}
}

if zCloser > yCloser {
t.Errorf("Expected Hash1(x) to be closer to Hash1(y) more often than Hash1(x) to be closer to Hash1(z), got %d and %d", yCloser, zCloser)
}
})

t.Run("Blur_Hamming_LS_Property", func(t *testing.T) {
x := uint64(0b1110)
y := uint64(0b1100)
Expand Down Expand Up @@ -171,7 +209,7 @@ func TestBlur(t *testing.T) {

xyEqual := 0
xzEqual := 0
trials := 1000
trials := 10_000

for range trials {
h := lsh.RandomBlurR(3, 10, testrandom.Source)
Expand Down Expand Up @@ -350,7 +388,7 @@ func TestMinHashes(t *testing.T) {

xyEqual := 0
xzEqual := 0
trials := 1000
trials := 10_000

for range trials {
h := lsh.RandomMinHashesR(3, testrandom.Source)
Expand Down Expand Up @@ -390,67 +428,33 @@ func TestHashFunc(t *testing.T) {
}
}

func TestBoxBlur3(t *testing.T) {
t.Run("BoxBlur3_Hash1", func(t *testing.T) {
var h lsh.BoxBlur3

testCases := []struct {
input uint64
want uint64
}{
{0xF0F0F0F0, 0xF0F0F0F0},
{0x0F0F0F0F, 0x0F0F0F0F},
{
0b11110010111100101111001011110010,
0b11110001111100011111000111110000,
},
func TestDummyHashes(t *testing.T) {
t.Run("NoHash", func(t *testing.T) {
var h lsh.NoHash
query := uint64(0x12345)
data := []uint64{0x12345, 0x54321}
out := make([]uint64, len(data))
if h.Hash1(query) != query {
t.Fatal()
}

for _, tc := range testCases {
got := h.Hash1(tc.input)
if got != tc.want {
t.Errorf("BoxBlur3.Hash1(%x) = %x; want %x", tc.input, got, tc.want)
}
h.Hash(data, out)
if !reflect.DeepEqual(data, out) {
t.Fatal()
}
})

t.Run("BoxBlur3_Hash", func(t *testing.T) {
var h lsh.BoxBlur3

input := []uint64{0xF0F0F0F0, 0x0F0F0F0F, 0x72F2F2F2}
output := make([]uint64, len(input))
want := []uint64{0xF0F0F0F0, 0x0F0F0F0F, 0x71F1F1F0}

h.Hash(input, output)

for i, v := range output {
if v != want[i] {
t.Errorf("BoxBlur3.Hash() for input %x = %x; want %x", input[i], v, want[i])
}
t.Run("ConstantHash", func(t *testing.T) {
var h lsh.ConstantHash
q := uint64(0x12345)
data := []uint64{0x12345, 0x54321}
out := make([]uint64, len(data))
if h.Hash1(q) != 0 {
t.Fatal()
}
})

t.Run("BoxBlur3_Hamming_LS_Property", func(t *testing.T) {
xyEqual := 0
xzEqual := 0
trials := 1000
var h lsh.BoxBlur3
for range trials {
flip3Bits := uint64(lsh.RandomBitSampleR(3, testrandom.Source))
flip10Bits := uint64(lsh.RandomBitSampleR(10, testrandom.Source))
x := testrandom.Query()
y := x ^ flip3Bits
z := x ^ flip10Bits
if h.Hash1(x) == h.Hash1(y) {
xyEqual++
h.Hash(data, out)
for i := range out {
if out[i] != 0 {
t.Fatal()
}
if h.Hash1(x) == h.Hash1(z) {
xzEqual++
}
}

if xyEqual <= xzEqual {
t.Errorf("Expected Hash1(x) to equal Hash1(y) more often than Hash1(x) to equal Hash1(z), got %d and %d", xyEqual, xzEqual)
}
})
}
28 changes: 18 additions & 10 deletions lsh/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,19 @@ type Model struct {
*bitknn.Model
Hash Hash // LSH function mapping points to bucket IDs.

BucketIDs []uint64 // Bucket IDs.
Buckets map[uint64]slice.IndexRange // Bucket contents for each hash (offset+length in Data).
HeapBucketIDs []uint64
BucketIDs []uint64 // Bucket IDs.
Buckets map[uint64]slice.IndexRange // Bucket contents for each hash (offset+length in Data).

HeapBucketDistances []int
HeapBucketIDs []uint64
}

// PreallocateHeap allocates memory for the nearest neighbor heaps.
// Each of the four heap slices (bucket distances, bucket IDs, distances and
// indices) is passed through slice.OrAlloc with a length of k+1.
func (me *Model) PreallocateHeap(k int) {
	me.HeapBucketDistances = slice.OrAlloc(me.HeapBucketDistances, k+1)
	me.HeapBucketIDs = slice.OrAlloc(me.HeapBucketIDs, k+1)
	me.HeapDistances = slice.OrAlloc(me.HeapDistances, k+1)
	me.HeapIndices = slice.OrAlloc(me.HeapIndices, k+1)
}

// Fit creates and fits an LSH k-NN model using the provided data, labels, and hash function.
Expand Down Expand Up @@ -65,22 +68,27 @@ func Fit(data []uint64, labels []int, hash Hash, opts ...bitknn.Option) *Model {

// Predict1 predicts the label for a single input using the LSH model.
// The model's own heap slices are passed through slice.OrAlloc with a length
// of k+1 and then handed to Predict1Into, whose result is returned.
func (me *Model) Predict1(k int, x uint64, votes []float64) int {
	me.HeapBucketDistances = slice.OrAlloc(me.HeapBucketDistances, k+1)
	me.HeapBucketIDs = slice.OrAlloc(me.HeapBucketIDs, k+1)
	me.HeapDistances = slice.OrAlloc(me.HeapDistances, k+1)
	me.HeapIndices = slice.OrAlloc(me.HeapIndices, k+1)
	return me.Predict1Into(k, x, votes, me.HeapBucketDistances, me.HeapBucketIDs, me.HeapDistances, me.HeapIndices)
}

// Predict1Alloc predicts the label of a single input point. Each call allocates four new slices of length [k]+1 for the neighbor heaps.
func (me *Model) Predict1Alloc(k int, x uint64, votes []float64) int {
	bucketDistances := make([]int, k+1)
	bucketIDs := make([]uint64, k+1)
	distances := make([]int, k+1)
	indices := make([]int, k+1)

	return me.Predict1Into(k, x, votes, bucketDistances, bucketIDs, distances, indices)
}

// Predict1Into predicts the label for a single input using the given slices (of length [k]+1 each) for the neighbor heaps.
func (me *Model) Predict1Into(k int, x uint64, votes []float64, distances []int, bucketIDs []uint64, indices []int) int {
func (me *Model) Predict1Into(k int, x uint64, votes []float64, bucketDistances []int, bucketIDs []uint64, distances []int, indices []int) int {
xp := me.Hash.Hash1(x)
k, n := Nearest(me.Data, me.BucketIDs, me.Buckets, k, xp, x, distances, bucketIDs, indices)
k, n := Nearest(me.Data, me.BucketIDs, me.Buckets, k, xp, x, bucketDistances, bucketIDs, distances, indices)

clear(votes)
switch me.DistanceWeighting {
Expand Down
Loading

0 comments on commit 8055df1

Please sign in to comment.