-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmarkov_language_model.clj
474 lines (421 loc) · 14.5 KB
/
markov_language_model.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
(ns markov-language-model
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.trie.math :as math]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.trie :as tr]
[cljol.dig9 :as d]
[clojure.zip :as zip]
[com.owoga.tightly-packed-trie.bit-manip :as bm]))
(def corpus (slurp (io/resource "cask_of_amontillado.txt")))
(defn prep-punctuation-for-tokenization
"Puts spaces around punctuation so that they aren't
tokenized with the words they are attached to.
Might add extraneous whitespace, but presumedly that will be ignored/removed
during tokenization."
[text]
(string/replace text #"([\.,!?])" " $1 "))
;; For better generation of text, you'll probably want to pad the starts
;; of sentences with n-1 "start-of-sentence" tokens.
(defn add-bol-and-eol-tokens [text]
(-> text
(string/replace #"(\.)" "</s> . <s>")
(#(str "<s> " %))))
(defn remove-quotes
[text]
(string/replace text #"\"" ""))
(defn remove-formatting-characters
"Input has underscores, presumably because the text
might be rendered by something that can italicize or bold text.
We'll just ignore them for now."
[text]
(string/replace text #"[_*]" ""))
(defn tokenize [text]
(-> text
remove-formatting-characters
prep-punctuation-for-tokenization
remove-quotes
add-bol-and-eol-tokens
string/lower-case
(string/split #"[\n ]+")))
(defn interleave-all
"Like interleave, but instead of ending the interleave when the shortest collection
has been consumed, continues to interleave the remaining collections."
{:added "1.0"
:static true}
([] ())
([c1] (lazy-seq c1))
([c1 c2]
(lazy-seq
(let [s1 (seq c1) s2 (seq c2)]
(if (and s1 s2)
(cons (first s1) (cons (first s2)
(interleave-all (rest s1) (rest s2))))
(lazy-seq (or s1 s2))))))
([c1 c2 & colls]
(lazy-seq
(let [ss (->> (map seq (conj colls c2 c1))
(remove nil?))]
(when ss
(concat (map first ss) (apply interleave-all (map rest ss))))))))
(comment
(let [tokens [1 2 3 4 5]
p1 (partition 1 1 tokens)
p2 (partition 2 1 tokens)
p3 (partition 3 1 tokens)]
[p1
p2
p3
(interleave-all p1 p2 p3)])
;; => [((1) (2) (3) (4) (5))
;; ((1 2) (2 3) (3 4) (4 5))
;; ((1 2 3) (2 3 4) (3 4 5))
;; ((1) (1 2) (1 2 3) (2) (2 3) (2 3 4) (3) (3 4) (3 4 5) (4) (4 5) (5))]
)
(defn ngramify-tokens [n m tokens]
(let [partition-colls (map #(partition % 1 tokens) (range n m))
ngrams (apply interleave-all partition-colls)]
ngrams))
(comment
(->> (tokenize corpus)
(take 5)
(ngramify-tokens 1 4))
;; => (("the")
;; ("the" "thousand")
;; ("the" "thousand" "injuries")
;; ("thousand")
;; ("thousand" "injuries")
;; ("thousand" "injuries" "of")
;; ("injuries")
;; ("injuries" "of")
;; ("injuries" "of" "fortunato")
;; ("of")
;; ("of" "fortunato")
;; ("fortunato"))
)
(defn add-terminal-value-to-ngram
"The Trie expects entries to be of the form '(k1 k2 k3 value).
The ngrams generated above are just '(k1 k2 k3).
This adds a value that is simply the ngram itself:
'(k1 k2 k3 '(k1 k2 k3))."
[ngram]
(concat ngram (list ngram)))
(comment
(let [ngrams (->> corpus
tokenize
(take 200)
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram))]))
(defn trie->database
"It's convenient to work with a trie that has keys and values as
human-readable strings, as pulled straight from a corpus in the case
of a markov trie. But to tightly pack the trie into a byte array,
we need every value to be an integer that we can variable-length-encode.
This creates a database for conveniently converting the human-readable
entries to ids and back from ids to human-readable entries.
Ids will start at 1 so that 0 can be reserved for the root node."
[trie]
(let [sorted-keys (->> (seq trie)
(sort-by (fn [[k v]]
(:count v)))
(reverse))]
(loop [sorted-keys sorted-keys
database {}
i 1]
(if (empty? sorted-keys)
database
(recur
(rest sorted-keys)
(-> database
(assoc (first (first sorted-keys))
(assoc (second (first sorted-keys)) :id i))
(assoc i (first (first sorted-keys))))
(inc i))))))
(comment
(take 10 (trie->database trie))
(let [ngrams (->> corpus
tokenize
(take 200)
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram))
trie (apply make-trie ngrams)]
(trie->database trie))
;; {("at") {:count 1, :id 39},
;; 453 ("revenge" "." "You"),
;; ("The") {:count 1, :id 37},
;; ("resolved" ",") {:count 1, :id 256},
;; 487 ("very" "definitiveness" "with"),
;; ("be" "respected") {:count 1, :id 170},
;; ("a" "point") {:count 1, :id 158},
;; 357 ("and" "he" "did"),
;; 275 ("the" "very"),
;; ("doubt" "my" "good") {:count 1, :id 381},
;; ,,,}
)
(defn transform-trie->ids
"Once we have a database to convert from string-keys to integer-ids and back,
we can traverse the trie replacing the string-keys with their integer-ids."
[trie database]
(->> trie
(map
(fn [[k v]]
[(vec (map #(get (get database [%]) :id) k))
{:id (get-in database [k :id])
:count (get-in database [k :count])}]))
(into (tr/make-trie))))
(def trie
(let [ngrams (->> corpus
tokenize
(ngramify-tokens 1 4)
(map add-terminal-value-to-ngram)
(map (fn [entry]
(list (butlast entry)
(last entry)))))]
(->> ngrams
(reduce
(fn [acc [k v]]
(update
acc
k
(fnil
(fn [existing]
(update existing :count inc))
{:value v
:count 0})))
(tr/make-trie)))))
(comment
(take 10 (drop 1000 trie))
;; => ([["be" "awaiting"] {:value ("be" "awaiting"), :count 1}]
;; [["be" "cautious" "as"] {:value ("be" "cautious" "as"), :count 1}]
;; [["be" "gone"] {:value ("be" "gone"), :count 2}]
;; [["be" "ill" ","] {:value ("be" "ill" ","), :count 1}])
)
(def trie-database
(trie->database trie))
(comment
(take 4 trie-database)
;; => ([0 ["."]]
;; [["to" "your" "long"] {:value ("to" "your" "long"), :count 1, :id 1119}]
;; [["an" "instant" "he"] {:value ("an" "instant" "he"), :count 1, :id 4800}]
;; [["fifth" "," "the"] {:value ("fifth" "," "the"), :count 1, :id 3919}])
)
(def tpt-ready-trie (transform-trie->ids trie trie-database))
(comment
(take 4 tpt-ready-trie)
;; => ([[0 1 27] {:id 5082, :count 1}]
;; [[0 1 104] {:id 5072, :count 1}]
;; [[0 1 112] {:id 5075, :count 1}]
;; [[0 1 146] {:id 5077, :count 1}])
)
(defn value-encode-fn [v]
(if (and (number? v) (zero? v))
(encoding/encode 0)
(byte-array
(concat (encoding/encode (:id v))
(encoding/encode (:count v))))))
(defn value-decode-fn [byte-buffer]
(let [id (encoding/decode byte-buffer)]
(if (zero? id)
{:id id}
{:id id
:count (encoding/decode byte-buffer)})))
(def tightly-packed-trie
(tpt/tightly-packed-trie tpt-ready-trie value-encode-fn value-decode-fn))
;;;; DEMO
;;;; ** Out of date since new TrieAgain code
(comment
;;;; Size comparisons
;;
;; Original trie, keys and values are lists and strings.
;; ~1,900 kb
(d/sum [trie])
;; 61112 objects
;; 103249 references between them
;; 1901656 bytes total in all objects
;; no cycles
;; 8421 leaf objects (no references to other objects)
;; Original trie, keys and values numbers
;; ~900 kb
(d/sum [tpt-ready-trie])
;; 30008 objects
;; 62543 references between them
;; 907992 bytes total in all objects
;; no cycles
;; 5438 leaf objects (no references to other objects)
;; Tightly-packed-trie, keys and values numbers (backed by var-len encoded ints)
;; ~36 kb
(d/sum [tightly-packed-trie])
;; 6 objects
;; 5 references between them
;; 36736 bytes total in all objects
;; no cycles
;; 4 leaf objects (no references to other objects)
;;;; Size comparison summary
;;
;; Condensed original: 900 kb
;; Tightly packed: 36 kb
;; Compression: ~96% !!!
;;;; Getting value from each type of trie
;;
(get trie ["<s>" "i" "was"])
;; => {:value ("<s>" "i" "was"), :count 1}
(get tpt-ready-trie [0 8 21])
;; => {:id 5116, :count 1}
(get tightly-packed-trie [0 8 21])
;; => {:id 5116, :count 1}
;; And then to get back to a string version, use the database.
(->> [0 8 21]
(get tightly-packed-trie)
:id
(get trie-database)
(get trie-database))
;; => {:value ("<s>" "i" "was"), :count 1, :id 5116}
;;;; Our "database" (just a hash-map) serves a dual purpose.
;;
;; It maps n-grams to their frequency counts and to an integer ID.
;; It also maps that integer ID back to the n-gram.
(take 10 trie-database)
;; => ([("Then" "I") {:count 1, :id 1475}]
;; [("to" "your" "long") {:count 1, :id 5371}]
;; [("an" "instant" "he") {:count 1, :id 3842}]
;; [("fifth" "," "the") {:count 1, :id 4209}]
;; [("from" "the" "depth") {:count 1, :id 4270}]
;; [2721 ("the" "more")]
;; [("during" "which" ",") {:count 1, :id 4144}]
;; [("nodded") {:count 1, :id 674}]
;; [("the" "feeble") {:count 1, :id 2693}]
;; [("intermingling" "," "into") {:count 1, :id 4488}])
;;;; Database
;; Each n-gram has its own integer ID. The integer IDs should be handed
;; out to n-grams in order of frequency. Therefore, you're 1-grams will probably
;; have lower IDs than the higher-order n-grams.
;;
;; Here we see "," is the 2nd most-common n-gram.
(get-in trie-database ['(",") :id]) ;; => 7
(get-in trie-database ['("i") :id]) ;; => 8
;; The ID of a 2-gram is not related in any way to
;; the two 1-grams that make it up. Every n-gram is unique
;; and gets its own unique ID.
;;
;; BUT... Every node is referenced by a 1-gram key.
;; So the 2-gram '("," "I") is referenced from
;; the :root key's children by the 1-gram key '(",")
;; and then by that 1-gram key's children by the 1-gram '("I").
;; The VALUE of that node though is the 2-gram '("," "I").
;;
;; To re-iterate: The keys are all 1-grams at every nesting level.
;; The values are the higher-order n-grams the lower in the nesting
;; that you go.
(get-in trie-database ['("," "i") :id]) ;; => 23
;;;; Trie vs Tightly Packed Trie
;;;;
;; The interface is *almost* the same between the two.
;; Tightly packed tries can't be updated or written to.
;; They can only be read.
;; And to get from integer IDs to human-readable strings,
;; you need to go through the database.
;;
;; Other than that though, let's see it in action!
;;
;;;; Here is the Trie
(get trie '("i"))
;; => {:value ("i"), :count 107}
(->> (tr/lookup trie '("i"))
(take 5))
;; => ([["," "i"] {:value ("i" "," "i"), :count 1}]
;; [[","] {:value ("i" ","), :count 1}]
;; [["again" "paused"] {:value ("i" "again" "paused"), :count 1}]
;; [["again"] {:value ("i" "again"), :count 1}]
;; [["am" "on"] {:value ("i" "am" "on"), :count 1}])
(->> (tr/lookup trie '("i"))
(tr/children)
(map #(get % []))
(take 5))
;; => ({:value ("i" ","), :count 1}
;; {:value ("i" "again"), :count 1}
;; {:value ("i" "am"), :count 1}
;; {:value ("i" "answered"), :count 1}
;; {:value ("i" "began"), :count 2})
;;;; And here is the tightly-packed trie
(->> (tr/lookup tightly-packed-trie '(8))
(tr/children)
(map #(get % []))
(take 5))
;; => ({:id 3392, :count 1}
;; {:id 3270, :count 1}
;; {:id 129, :count 5}
;; {:id 70, :count 9}
;; {:id 69, :count 9})
(->> (tr/lookup tightly-packed-trie '(8))
(tr/children)
(map #(get % []))
(take 5)
(map #(get trie-database (:id %))))
;; => (["i" ","] ["i" "to"] ["i" "was"] ["i" "had"] ["i" "said"])
;;;; Ta-da!
)
(defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)]
{id v}))
(defn id-get-in-tpt [tpt db ids]
(let [ks (apply concat (map #(get db %) ids))
v (get tpt ids)
id (get-in db [ks :id])]
{ks (assoc v :value (get db id))}))
(comment
(key-get-in-tpt
tightly-packed-trie
trie-database
'("i" "will"))
;; => {(8 49) {:id 3257, :count 1}}
(id-get-in-tpt
tightly-packed-trie
trie-database
'(8 49))
;; => {("i" "will") {:id 3257, :count 1, :value ["i" "will"]}}
)
;;;; Markov-generating text from trie
(comment
(def example-story
(loop [generated-text [(:id (get trie-database ["<s>"]))]
i 0]
(if (> i 100)
generated-text
(recur
(conj
generated-text
(tpt/.key
(math/weighted-selection
#(:count (get % []))
(loop [i 3
children
(tr/children
(tr/lookup
tightly-packed-trie
(vec (take-last i generated-text))))]
(if (empty? children)
(recur (dec i)
(tr/children
(tr/lookup
tightly-packed-trie
(vec (take-last i generated-text)))))
children)))))
(inc i)))))
(->> example-story
(map #(get trie-database %))
(apply concat)
(remove #{"<s>" "</s>"})
(string/join " ")
(#(string/replace % #" ([\.,\?])" "$1"))
((fn [txt]
(string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]]
(str b (.toUpperCase c)))))))
;; => "I broke and reached him a flagon of de grave. We came at length. He
;; again took my arm, and holding the flambeaux over the wall; i replied, were
;; a great and numerous family. Whither? to your long life. Putting on a
;; tight-fitting parti-striped dress, and descending again, and had given them
;; explicit orders not to be found, and this time i made bold to seize
;; fortunato by an arm above the elbow. In its destined position."
)