# XAFCM.py
from math import log2, pow
from operator import itemgetter

from pympler import asizeof

from ContextLine import ContextLine


class XAFCM(object):
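    """Finite-context model over fixed-size symbol blocks.

    The model counts how often each d-symbol block follows each k-symbol
    context and applies Lidstone (additive) smoothing to turn those counts
    into probabilities, from which it estimates the number of bits needed
    to encode a string.
    """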
def __init__(self, alphab_size, k, d, word_size=None, alpha="auto", p=0.9):
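        # alphab_size: size of the source alphabet.
        # k: context order, i.e. how many symbols condition each prediction.
        # d: how many symbols are predicted together as one block.
        # word_size: length of the independent words a string is split into
        #            (None means the whole string is one word).
        # alpha: Lidstone smoothing parameter; "auto"/None derives it from p.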
self.word_size = word_size
self.number_of_bits = 0.0
self.d = d
self.k = k
        if alpha == "auto" or alpha is None:
            # Shrink alpha until a block counted once in a once-seen context
            # would get a Lidstone probability of at least p**d.
            self.alpha = 1.1
            prob = 0
            while prob < pow(p, self.d):
                self.alpha /= 1.1
                prob = (1 + self.alpha) / (1 + self.alpha * pow(alphab_size, self.d))
            print("auto alpha = %e" % self.alpha)
# if alpha is provided
else:
self.alpha = alpha
self.alphabet_size = alphab_size
self.model_learned = dict()
self.list_of_bits_per_symbol = []
        # Precomputed values for contexts that were never observed
        # (all counts zero): the estimate reduces to 1 / alphabet_size**d.
        self._default_lidstone = self.alpha / (self.alpha * pow(self.alphabet_size, self.d))
        self._default_lidstone_part1 = self.alpha
        self._default_lidstone_part2 = self.alpha * pow(self.alphabet_size, self.d)

    def _reset_model(self):
        self.model_learned = dict()

    def _reset_number_of_bits(self):
        self.number_of_bits = 0.0

    def learn_model_from_text_files(self, list_of_path_text_files):
self._reset_model()
data = ""
for path_text_file in list_of_path_text_files:
with open(path_text_file, "r") as my_file:
data += my_file.read()
self.learn_models_from_string(data.upper())

    def learn_model_from_text_file(self, path_text_file):
        self._reset_model()
        with open(path_text_file, "r") as my_file:
            data = my_file.read()
        self.learn_models_from_string(data.upper())

    def learn_models_from_string(self, np_string):
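        # Count, for every position in every word, which d-symbol block
        # follows which k-symbol context; indices wrap modulo the word
        # length, so each word is treated as circular.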
tmp_word_size = self.word_size
if self.word_size is None:
tmp_word_size = len(np_string)
        assert len(np_string) % tmp_word_size == 0
        self._reset_model()
        # aux_list_k: offsets (back from position i) of the d symbols forming
        # the current block; aux_list_l: offsets of the k context symbols
        # that precede that block.
        aux_list_k = list(reversed(range(1, self.d + 1)))
        aux_list_l = list(reversed(range(self.d + 1, self.d + self.k + 1)))
for curr_word_start_index in range(0, len(np_string), tmp_word_size):
word = np_string[curr_word_start_index:curr_word_start_index+tmp_word_size]
# print("word = %s" % word)
for i in range(0, len(word)):
curr_string_for_l = ""
for curr_l in aux_list_l:
curr_string_for_l += word[(i-curr_l) % len(word)]
curr_string_for_k = ""
for curr_k in aux_list_k:
curr_string_for_k += word[(i-curr_k) % len(word)]
if not ContextLine.check_key_in_dict(curr_string_for_l, self.model_learned):
default_context_line = ContextLine(context_word=curr_string_for_l)
self.model_learned[curr_string_for_l] = default_context_line
self.model_learned[curr_string_for_l].increment_symbol(curr_string_for_k)

    def print_models_learned(self):
        print("Model learned:")
        for item in sorted(self.model_learned.items(), key=itemgetter(1)):
            print(item[1])

    def get_memory_size_used_mbytes(self):
mem_used_bytes = asizeof.asizeof(self.model_learned)
mem_used_mbytes = mem_used_bytes / (1024 * 1024)
return mem_used_mbytes

    def print_memory_size_used_mbytes(self):
        print("RAM used: %.2fMB" % self.get_memory_size_used_mbytes())

    def print_details_of_models_learned(self):
        number_of_different_contexts_found = len(self.model_learned)
        print("Found %d different contexts for k = %d." % (number_of_different_contexts_found, self.k))

    def lidstone_probability_part1(self, current_context_word, symbol):
        # Numerator of the Lidstone estimate: count(symbol | context) + alpha.
        try:
            model_line = self.model_learned[current_context_word]
        # in case this context word never appeared in the reference model
        except KeyError:
            return self._default_lidstone_part1
        try:
            tmp = model_line.cols[symbol]
        # in case this symbol never appeared for this specific context
        except KeyError:
            tmp = 0
        return tmp + self.alpha

    def lidstone_probability_part2(self, current_context_word, symbol):
        # Denominator of the Lidstone estimate:
        # total(context) + alphabet_size**d * alpha.
        try:
            model_line = self.model_learned[current_context_word]
        # in case this context word never appeared in the reference model
        except KeyError:
            return self._default_lidstone_part2
        return model_line.cols['total'] + pow(self.alphabet_size, self.d) * self.alpha

    def lidstone_estimate_probability_for_symbol(self, current_context_word, symbol):
        # Full Lidstone estimate:
        # (count(symbol | context) + alpha) / (total(context) + alphabet_size**d * alpha).
        try:
            model_line = self.model_learned[current_context_word]
        # in case this context word never appeared in the reference model
        except KeyError:
            return self._default_lidstone
        try:
            tmp = model_line.cols[symbol]
        # in case this symbol never appeared for this specific context
        except KeyError:
            tmp = 0
        return (tmp + self.alpha) / \
               (model_line.cols['total'] + pow(self.alphabet_size, self.d) * self.alpha)

    def compress_text_file(self, path_text_file, based_on_model=True):
        self._reset_number_of_bits()
        with open(path_text_file, "r") as my_file:
            data = my_file.read()
        if based_on_model:
            return self.compress_string_based_on_models(data.upper())
        # compress against the counts accumulated so far instead
        else:
            return self.compress_string_based_on_counts_so_far(data.upper())

    def compress_string_based_on_models(self, string_to_compress):
        # compress based on self.model_learned
        self._reset_number_of_bits()
        self.list_of_bits_per_symbol = []
        tmp_word_size = self.word_size
        if self.word_size is None:
            tmp_word_size = len(string_to_compress)
        assert len(string_to_compress) % tmp_word_size == 0
        assert self.model_learned != dict()
        # Offsets (back from position i) of the d symbols of the block being
        # encoded and of the k context symbols that precede it.
        aux_list_next_seq = list(reversed(range(0, self.d)))
        aux_list_current_context_k = list(reversed(range(self.d, self.d + self.k)))
        for curr_word_start_index in range(0, len(string_to_compress), tmp_word_size):
            word_to_process = string_to_compress[curr_word_start_index:curr_word_start_index+tmp_word_size]
            # encode the word one d-symbol block at a time
for i in range(0, len(word_to_process), self.d):
next_sequence = ""
current_context_k = ""
for i_tmp in aux_list_next_seq:
next_sequence += word_to_process[(i - i_tmp) % len(word_to_process)]
for i_tmp in aux_list_current_context_k:
current_context_k += word_to_process[(i - i_tmp) % len(word_to_process)]
                prob = self.lidstone_estimate_probability_for_symbol(current_context_k, next_sequence)
                # ideal code length of this block under the learned model
                tmp_bits_needed = -log2(prob)
                self.list_of_bits_per_symbol.append(tmp_bits_needed)
                self.number_of_bits += tmp_bits_needed
return self.list_of_bits_per_symbol, self.number_of_bits
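

# A minimal usage sketch (hypothetical driver, not part of the original
# module): learn a model from one string, then estimate how many bits a
# second string would need under that model. Assumes the ContextLine
# module imported above is available on the import path.
if __name__ == "__main__":
    model = XAFCM(alphab_size=4, k=3, d=1)
    model.learn_models_from_string("ACGTACGTACGTACGT")
    bits_per_block, total_bits = model.compress_string_based_on_models("ACGTACGA")
    print("total bits: %.2f" % total_bits)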