A2CountMin.py
import random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
random.seed(123)
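# Count-Min sketch over the KOS bag-of-words stream (docword.kos.txt):
# build a d x w counter table, estimate every word's frequency as the minimum
# of its d counters, plot the sorted estimates, and compare the words the
# sketch reports above 1000 against a precomputed brute-force count (sortFreq.txt).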
# Read the three header lines of the bag-of-words file:
# number of documents, vocabulary size, and number of (docID, wordID, count) rows
summary = pd.read_csv('docword.kos.txt', sep=' ', nrows=3, header=None)
[doc, word, length] = summary[0]  # document count, word count, stream length
# import data
df = pd.read_csv('docword.kos.txt', sep=' ', skiprows=[0, 1, 2], header=None)
columns_name = ["docID", "wordID", "Frequency"]
df.columns = columns_name  # name the columns for readability
df["Frequency"] = 1  # every row counts as one occurrence; the original word counts are ignored
# Build the stream as a list of [docID, wordID, 1] triples
stream = np.vstack((df.docID, df.wordID, df.Frequency)).T.tolist()
# Target additive error εm = 100, i.e. ε = 100 / m; width w = 2 / ε, so a single
# row overestimates by more than εm with probability at most 1/2
w = int(2 / (100 / length))
# Depth d = ceil(ln(1 / δ)) with δ = 0.1, aiming for Pr[estimate ≥ true + εm] ≤ δ
d = int(math.ceil(math.log(1 / 0.1)))
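# With these choices the sketch has w = 2m/100 columns (m = stream length)
# and d = ceil(ln 10) = 3 rows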
# Per-row hash functions h_j(x) = ((a_j * x + b_j) mod p) mod w
p = 1000003  # a prime larger than any wordID
an = np.array(random.sample(range(1, p), d))  # a_j must be non-zero
bn = np.array(random.sample(range(0, p), d))
# Initialise the d x w counter table and a cache of each word's hash indices
counter = np.zeros((d, w), dtype=int)
totalHash = {}
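# Update phase: hash each stream element's wordID into one cell per row
# and increment those d cells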
for i in range(length):
    m = stream[i][1]  # the wordID of this stream element
    hashValue = ((((m * an) + bn) % p) % w)  # d column indices, one per hash function
    totalHash[m] = hashValue
    for j in range(len(hashValue)):
        counter[j, hashValue[j]] += 1
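# Query phase: a word's Count-Min estimate is the minimum of its d counters;
# it exceeds the true count by more than εm with probability at most (1/2)^d,
# since each row overestimates by that much with probability at most 1/2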
totalWordID = range(1, word + 1)
frequency = np.zeros([word, 2])  # pre-allocate the result array instead of appending row by row
frequency[:, 0] = totalWordID
for i in totalWordID:
    temp = totalHash[i]
    frequency[i - 1, 1] = min(counter[j, temp[j]] for j in range(d))
# Sort rows by estimated frequency in descending order (argsort on the negated column)
sortFreq = frequency[(-frequency[:, 1]).argsort()]
# plot
plt.plot(range(word), sortFreq[:, 1])
plt.xlabel('Word rank (descending frequency)')
plt.ylabel('Estimated frequency')
plt.title('Count-Min frequency estimates sorted in descending order')
plt.show()
# load brute force result
brute = pd.read_csv('sortFreq.txt', sep=' ', header=None)
columns_name = ["wordID", "Frequency"]
brute.columns = columns_name
# Indices of words whose true frequency exceeds 1000 (the heavy hitters)
indexList = np.where(brute.Frequency.values > 1000)
# Keep only the heavy hitters' wordIDs and true frequencies as plain lists
larger1000WordIDList = brute.wordID.values[indexList].tolist()
larger1000FrequencyList = brute.Frequency.values[indexList].tolist()
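# Count-Min never underestimates, so every true heavy hitter (count > 1000)
# should also be reported above 1000 by the sketch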
ans = []
for i in range(len(larger1000WordIDList)):
    index = larger1000WordIDList[i]
    # keep the wordID if the Count-Min estimate is also above 1000
    if frequency[index - 1, 1] > 1000:
        ans.append(frequency[index - 1, 0])
print(ans)