-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkov.py
59 lines (52 loc) · 1.72 KB
/
markov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from collections import defaultdict
import random
def tokenizer(raw_data):
    """Yield word and punctuation tokens from *raw_data*.

    Tokens are delimited by space, newline, '.', '?' and '!'.  The
    sentence-ending characters '.', '?', '!' are themselves yielded as
    standalone tokens; spaces and newlines are discarded.

    Fixes over the previous version:
    - a trailing word is flushed even when the input does not end with
      a delimiter (it used to be silently dropped);
    - consecutive punctuation ("a!?") yields each mark as its own token
      (a mark arriving with no pending word used to be glued onto the
      start of the next word).
    """
    ends = set(' .?!\n')      # characters that terminate the current token
    no_yield = set(' \n')     # delimiters that are discarded, not emitted
    current_token = []
    for char in raw_data:
        if char in ends:
            if current_token:
                yield ''.join(current_token)
                current_token = []
            if char not in no_yield:
                # '.', '?', '!' are tokens in their own right.
                yield char
        else:
            current_token.append(char)
    # Flush a final word that was not followed by any delimiter.
    if current_token:
        yield ''.join(current_token)
def walker(graph):
current = tuple(random.choice(list(graph.keys())))
for token in current:
yield token
while True:
try:
new_token = random.choice(graph[current])
current = tuple(list(current[1:]) + [new_token])
yield new_token
except IndexError:
raise StopIteration
class Chain:
    """An n-gram Markov chain trained on raw text.

    ``self.chain`` maps each observed ngram (a tuple of tokens) to the
    list of tokens seen to follow it; duplicates are kept so that
    ``random.choice`` samples proportionally to observed frequency.
    """

    def __init__(self, raw_data, ngrams=1):
        """Tokenize *raw_data* and train an order-*ngrams* chain on it."""
        tokens = tokenizer(raw_data)
        self.train(tokens, ngrams)

    def train(self, tokens, ngrams):
        """Build the transition table from an iterator of *tokens*.

        Raises StopIteration if *tokens* yields fewer than *ngrams* items.
        """
        chain = defaultdict(list)
        # Seed the sliding window with the first `ngrams` tokens.
        prev = [next(tokens) for _ in range(ngrams)]
        for word in tokens:
            chain[tuple(prev)].append(word)
            # Slide the window forward by one token.
            prev = prev[1:]
            prev.append(word)
        self.chain = chain

    def walk(self, n):
        """Return up to *n* tokens from a random walk over the chain.

        Fewer than *n* tokens are returned if the walk hits a dead end.
        """
        return [token for token, _ in zip(walker(self.chain), range(n))]
def load_multiple(files):
    """Read the named datasets and return their contents joined by newlines.

    Each name in *files* is resolved to ``datasets/<name>.txt``.  A name
    listed more than once is read (and included) more than once.
    """
    datasets = []
    for file_name in files:
        # Explicit encoding: the default text encoding is platform-dependent.
        with open('datasets/' + file_name + '.txt', 'r', encoding='utf-8') as f:
            datasets.append(f.read())
    return '\n'.join(datasets)
if __name__ == "__main__":
    # Demo: train a unigram chain on the bundled datasets (the 'agency'
    # set is listed twice, weighting it more heavily) and print a sample.
    corpus = load_multiple(['aliens', 'agency', 'agency'])
    model = Chain(corpus, 1)
    sample = model.walk(100)
    print(' '.join(sample))