-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlcs.py
123 lines (99 loc) · 4.86 KB
/
lcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
from statistics import mean, median
from typing import Dict, List, Tuple
from load_data import get_cleaned_text
import numpy as np
def lcs_simple(source_text: str, target_text: str) -> int:
'''
Calculate lcs matrix and return normalized lcs value
'''
source_token: List[str] = source_text.split()
target_token: List[str] = target_text.split()
num_cols = len(source_token)
num_rows = len(target_token)
matrix = [[0]*(num_rows + 1) for i in range(num_cols + 1)] # make matrix filled with 0 (one extra top, left)
for col in range(num_cols + 1):
if col == 0: # skip first empty col
continue
for row in range(num_rows + 1):
if row == 0: # skip first empty row
continue
# ***
# the actual comparison part
if source_token[col-1] == target_token[row-1]:
matrix[col][row] = matrix[col-1][row-1]+1 # top left diag. value + 1
else:
matrix[col][row] = max(matrix[col-1][row], matrix[col][row-1]) # max value: top, left
return round(matrix[-1][-1]/len(target_token), 2) # very last value in matrix is abs. lcs value
def lcs_with_text(source_text: str, target_text: str) -> Tuple[float, List[str], int, float, float]:
'''
Calculate lcs matrix (same as above) and additionally store equal substrings in list
'''
source_token: List[str] = source_text.split()
target_token: List[str] = target_text.split()
num_cols = len(source_token)
num_rows = len(target_token)
matrix = [[0]*(num_rows + 1) for i in range(num_cols + 1)] # make matrix filled with 0 (one extra top, left)
for col in range(num_cols + 1):
if col == 0: # skip first empty col
continue
for row in range(num_rows + 1):
if row == 0: # skip first empty row
continue
# ***
# the actual comparison part
if source_token[col-1] == target_token[row-1]:
matrix[col][row] = matrix[col-1][row-1]+1 # top left diag. value + 1
last_max_indices = [col, row]
else:
matrix[col][row] = max(matrix[col-1][row], matrix[col][row-1]) # max value: top, left
lcs_value: float = round(matrix[-1][-1]/len(target_token), 2) # normalize abs. lccs value (last in matrix)
col: int = num_cols
row: int = num_rows
lcs_token: List[List[str]] = [[]]
while col > 0 and row > 0:
if source_token[col-1] == target_token[row-1]: # token at position are equal: store and move cursor (top, left)
lcs_token[0].insert(0, source_token[col-1]) # prepend token to current list
col -= 1
row -= 1
else:
# new sublist of consecutive token (can also be len == 1)
if len(lcs_token[0]) > 0:
lcs_token.insert(0, []) # prepend new list
# find max. value (top or teft) and move to this position
if matrix[col-1][row] > matrix[col][row-1]: # value left > value top: move left
col -= 1
else:
row -= 1 # move top
# some additional mini statistics on stored substrings
max_len: int = len(max(lcs_token, key=len))
mean_len: float = round(mean([len(x) for x in lcs_token]), 2)
median_len: float = round(median([len(x) for x in lcs_token]), 2)
# create list of str (from sublists of token)
lcs_str_list: List[str] = []
for sublist in lcs_token:
lcs_str_list.append(" ".join(sublist))
return lcs_value, lcs_str_list, max_len, mean_len, median_len
if __name__ == '__main__':
# ***
# <source> is the base text (i.e. wikipedia article) which is then compared to different <targets>
texts: Dict[str, str] = {}
for key, file_name in [
("source", "wikipedia_expl_source.txt"),
("target_strong", "wikipedia_expl_target_strong.txt"),
("target_moderate", "wikipedia_expl_target_moderate.txt"),
("target_weak", "wikipedia_expl_target_weak.txt")
]:
texts[key] = get_cleaned_text(file_name)
# mini test with hardcoded texts
# lcs_value, lcs_str = lcs_with_text(
# "Henry VIII was King of England from 1509 until his death in 1547".lower(),
# "Henry VIII reigned England between 1509 and 1547 when he died from a natural death".lower()
# )
# check short, loaded test texts with loaded source text
for target in ["target_strong", "target_moderate", "target_weak"]:
lcs_value, lcs_str_list, max_len, mean_len, median_len = lcs_with_text(texts["source"], texts[target])
print(f"--------- {target} ---------")
print(f"lcs: {lcs_value} | max: {max_len} | mean: {mean_len} | median: {median_len}")
print(json.dumps(lcs_str_list, indent=2))
print()