"""
Ngapi is a semantic chunker based on fastText word embeddings for the Myanmar language.
This approach can also be applied to other languages.
About Ngapi: Ngapi is a traditional fermented fish or shrimp paste widely used in Myanmar cuisine,
known for its strong and distinctive smell.
Written by Ye Kyaw Thu, LU Lab., Myanmar
Last updated: 25 Dec 2024
Reference:
https://fasttext.cc/
The 5 Levels Of Text Splitting For Retrieval,
https://www.youtube.com/watch?v=8OJC21T2SL4&t=2112s
How to Run:
Run ngapi.py with the --help option for usage details.
For more information, please refer to the two semantic chunking experiments
available here (https://github.com/ye-kyaw-thu/NgaPi/tree/main/test).
## Test-1
time python ./ngapi.py --model ./model/myfasttext_v1.bin --input ./ma_thaung.txt \
--graph_filename ./exp2/tst2.distances-eg1 --chunk_filename ./exp2/ma_thaung.chunk1.txt \
--apply_syllable_break --chunks_per_line 3 \
--syllable_group_separator "_" --chunk_separator "|"
## Test-2
time python ./ngapi.py --model ./model/myfasttext_v1.bin --input ./ma_thaung.txt \
--graph_filename ./exp2/tst2.distances-eg2 --chunk_filename ./exp2/ma_thaung.chunk2.txt \
--percentile_threshold 50 --apply_syllable_break \
--syllable_group_separator "_" --chunk_separator "|"
## Test-3
time python ./ngapi.py --model ./model/myfasttext_v1.bin --input ./ma_thaung.txt \
--graph_filename ./exp2/tst2.distances-eg3 --chunk_filename ./exp2/ma_thaung.chunk3.txt \
--percentile_threshold 60 --y_upper_bound 0.6 --apply_syllable_break --chunks_per_line 2 \
--syllable_group_separator "_" --chunk_separator "|"
"""
import argparse
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re

def create_break_pattern():
    """Creates and returns the regular expression pattern for Myanmar syllable breaking."""
    my_consonant = r"က-အ"
    other_char = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s"
    subscript_symbol = r'္'
    a_that = r'်'
    # Regular expression pattern for Myanmar syllable breaking
    return re.compile(
        r"((?<!" + subscript_symbol + r")[" + my_consonant + r"]"
        r"(?!["
        + a_that + subscript_symbol + r"])"
        + r"|[" + other_char + r"])"
    )

def break_syllables(line, break_pattern, separator=' '):
    """Applies syllable breaking rules to a line."""
    line = re.sub(r'\s+', ' ', line.strip())
    segmented_line = break_pattern.sub(separator + r"\1", line)
    # Remove the leading delimiter if it exists
    if segmented_line.startswith(separator):
        segmented_line = segmented_line[len(separator):]
    # Replace delimiter+space+delimiter with a single space
    double_delimiter = separator + " " + separator
    segmented_line = segmented_line.replace(double_delimiter, " ")
    return segmented_line
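
# Illustrative example (not executed by this module): with the default separator ' ',
# break_syllables("မြန်မာစာ", create_break_pattern()) is expected to return
# "မြန် မာ စာ", i.e. a space is inserted before each syllable-initial consonant.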

def combine_sentences(sentences, buffer_size=1):
    """Builds a context window of neighbouring syllable groups around each entry."""
    for i in range(len(sentences)):
        combined_sentence = ''
        # Preceding groups within the buffer
        for j in range(i - buffer_size, i):
            if j >= 0:
                combined_sentence += sentences[j]['sentence'] + ' '
        # Include the current group itself between its neighbouring groups
        combined_sentence += sentences[i]['sentence']
        # Following groups within the buffer
        for j in range(i + 1, i + 1 + buffer_size):
            if j < len(sentences):
                combined_sentence += ' ' + sentences[j]['sentence']
        sentences[i]['combined_sentence'] = combined_sentence
    return sentences
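
# Illustrative example: with buffer_size=1 and input
#   [{'sentence': 'A', 'index': 0}, {'sentence': 'B', 'index': 1}, {'sentence': 'C', 'index': 2}]
# the middle entry gets combined_sentence == 'A B C': each syllable group is embedded
# together with one neighbouring group on each side to smooth the distance signal.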

def process_file(file_path, group_length=2, break_pattern=None):
    """Reads the input file and returns it as a list of fixed-length syllable/word groups."""
    result = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Apply syllable breaking
            if break_pattern:
                line = break_syllables(line, break_pattern)
            words = line.split()  # Split the line into words
            # Create word groups of length group_length
            word_groups = [' '.join(words[i:i + group_length]) for i in range(0, len(words), group_length)]
            result.extend(word_groups)  # Add to the result list
    return result
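
# Illustrative example: if a line splits into the tokens ['s1', 's2', 's3', 's4', 's5']
# and group_length is 2, the resulting groups are ['s1 s2', 's3 s4', 's5']; the final
# group may be shorter than group_length.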

def embed_documents(sentences, model):
    """Embeds each syllable group as the mean of its per-word fastText vectors."""
    embeddings = [np.mean([model.get_word_vector(word) for word in sentence.split()], axis=0)
                  if sentence.strip() else np.zeros(model.get_dimension())
                  for sentence in sentences]
    return embeddings
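
# Note: for models trained with subword information (the fastText default),
# get_word_vector() returns a vector for any token, including out-of-vocabulary ones,
# so no vocabulary check is needed; whitespace-only groups fall back to a zero vector
# instead of calling np.mean on an empty list.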

def calculate_cosine_distances(sentences):
    """Computes the cosine distance between each group's embedding and the next one's."""
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]['combined_sentence_embedding']
        embedding_next = sentences[i + 1]['combined_sentence_embedding']
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]
        distance = 1 - similarity
        distances.append(distance)
        sentences[i]['distance_to_next'] = distance
    return distances, sentences
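
# Illustrative example: identical neighbouring embeddings give similarity 1.0 and
# distance 0.0, while orthogonal embeddings (e.g. [1, 0] vs [0, 1]) give similarity 0.0
# and distance 1.0; larger distances therefore mark likely topic boundaries.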

def main():
    parser = argparse.ArgumentParser(description="Process text and visualize embeddings.")
    parser.add_argument('--model', required=True, help="Path to the FastText model file.")
    parser.add_argument('--input', required=True, help="Path to the input text file.")
    parser.add_argument('--buffer', type=int, default=3000, help="Maximum number of characters to print per chunk line. Default: 3000")
    parser.add_argument('--percentile_threshold', type=int, default=70, help="Breakpoint percentile threshold. Default: 70")
    parser.add_argument('--y_upper_bound', type=float, default=1.0, help="Upper bound for the Y-axis in the plot. Default: 1.0")
    parser.add_argument('--graph_filename', required=True, help="Filename for saving the graph (without extension).")
    parser.add_argument('--chunk_filename', help="Filename to save the segmented chunks.")
    parser.add_argument('--syllable_group_length', type=int, default=6, help="Number of syllables (or words) per group. Default: 6")
    parser.add_argument('--apply_syllable_break', action='store_true', help="Apply Myanmar syllable breaking to the input file.")
    parser.add_argument('--syllable_group_separator', type=str, default="_", help="Separator between syllable groups within a chunk. Default: '_'")
    parser.add_argument('--chunk_separator', type=str, default="|", help="Separator between chunks on the same output line. Default: '|'")
    parser.add_argument('--chunks_per_line', type=int, default=None,
                        help="Number of chunks per output line. If not specified, each chunk is written on its own line.")
    args = parser.parse_args()
    # Load FastText model
    model = fasttext.load_model(args.model)
    # Create syllable break pattern if needed
    break_pattern = create_break_pattern() if args.apply_syllable_break else None
    # Process input file with optional syllable breaking
    word_groups = process_file(args.input, args.syllable_group_length, break_pattern)
    # Prepare sentences
    sentences2 = [{'sentence': x, 'index': i} for i, x in enumerate(word_groups)]
    sentences = combine_sentences(sentences2)
    # Embed documents
    embeddings = embed_documents([x['combined_sentence'] for x in sentences], model)
    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = embeddings[i]
    # Calculate cosine distances
    distances, sentences = calculate_cosine_distances(sentences)
    # Plot distances
    plt.plot(distances)
    plt.ylim(0, args.y_upper_bound)
    plt.xlim(0, len(distances))
    breakpoint_distance_threshold = np.percentile(distances, args.percentile_threshold)
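
    # Illustrative note: the breakpoint threshold is the given percentile of all
    # neighbouring distances. For example, with --percentile_threshold 70 and distances
    # [0.10, 0.40, 0.15, 0.55, 0.20], np.percentile(..., 70) is 0.36 (default linear
    # interpolation), so the two peaks 0.40 and 0.55 become chunk breakpoints.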
    plt.axhline(y=breakpoint_distance_threshold, color='r', linestyle='-')
    # Add text annotation for the number of chunks
    num_distances_above_threshold = len([d for d in distances if d > breakpoint_distance_threshold])
    plt.text(x=(len(distances) * 0.01),
             y=args.y_upper_bound / 50,
             s=f"{num_distances_above_threshold} Chunks",
             color='black', fontsize=10)
    # Add breakpoints and regions
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i, breakpoint_index in enumerate(indices_above_thresh):
        start_index = 0 if i == 0 else indices_above_thresh[i - 1]
        end_index = breakpoint_index if i < len(indices_above_thresh) - 1 else len(distances)
        plt.axvspan(start_index, end_index, facecolor=colors[i % len(colors)], alpha=0.25)
    if indices_above_thresh:
        last_breakpoint = indices_above_thresh[-1]
        if last_breakpoint < len(distances):
            plt.axvspan(last_breakpoint, len(distances), facecolor=colors[len(indices_above_thresh) % len(colors)], alpha=0.25)
    plt.title("Chunks Based On FastText Embedding Breakpoints")
    plt.xlabel("Index of Tokens (e.g. a group of syllables)")
    plt.ylabel("Cosine distance between sequential tokens")
    plt.savefig(f"{args.graph_filename}.png")
    plt.savefig(f"{args.graph_filename}.pdf")
    # Segment into chunks
    start_index = 0
    chunks = []
    for index in indices_above_thresh:
        end_index = index - 1
        group = sentences[start_index:end_index + 1]
        combined_text = args.syllable_group_separator.join([d['sentence'].replace(" ", "") for d in group])  # Remove spaces between syllables
        chunks.append(combined_text)
        start_index = index
    if start_index < len(sentences):
        combined_text = args.syllable_group_separator.join([d['sentence'].replace(" ", "") for d in sentences[start_index:]])  # Remove spaces between syllables
        chunks.append(combined_text)
    # Remove empty chunks
    chunks = [chunk for chunk in chunks if chunk]
    print("No. of chunks: ", len(chunks))  #; exit()
    #print("chunks[0]", chunks[0]); print("chunks[1]", chunks[1]); exit()
    # Apply word wrapping based on chunks_per_line
    if args.chunks_per_line:
        wrapped_chunks = []
        for i in range(0, len(chunks), args.chunks_per_line):
            wrapped_line = args.chunk_separator.join(chunks[i:i + args.chunks_per_line])
            wrapped_chunks.append(wrapped_line)
    else:
        wrapped_chunks = chunks  # No wrapping if chunks_per_line is not specified
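
    # Illustrative example: with chunks ['c1', 'c2', 'c3'], --chunks_per_line 2 and the
    # default --chunk_separator '|', the wrapped output lines are 'c1|c2' and 'c3'.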
    # Print wrapped chunks
    for i, line in enumerate(wrapped_chunks):
        print(f"Chunk line #{i + 1}")
        print(line[:args.buffer].strip())
    #print(wrapped_chunks[-1][-args.buffer:].strip())
    # Save wrapped chunks to file if specified
    if args.chunk_filename:
        with open(args.chunk_filename, 'w', encoding='utf-8') as f:
            for line in wrapped_chunks:
                if line.strip():  # Only write non-empty lines
                    f.write(line.strip() + "\n")  # Always use a newline to separate lines in the file
        print(f"Chunks saved to {args.chunk_filename}")

if __name__ == "__main__":
    main()