-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path09.irregular-verbs.py
99 lines (82 loc) · 3.99 KB
/
09.irregular-verbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import re
import csv
from collections import defaultdict, OrderedDict
def process_irregular_verbs_to_csv(input_path, csv_output_path):
"""
Process the input file to merge irregular verb forms (past and past participle)
into unified entries and write them into a CSV file using commas as delimiters.
Args:
input_path (str): Path to the input file.
csv_output_path (str): Path to the CSV output file.
"""
# Dictionary to store relationships and positions
key_groups = defaultdict(set) # Use sets to avoid duplicate entries
verb_positions = OrderedDict() # To track base verb positions for output order
# Regular expressions for irregular verb patterns
past_pattern = re.compile(
r"^(.*?)\s+<div style=\"margin-left:1em\"><i class=\"p\"><font color=\"green\">past</font></i> <i class=\"p\"><font color=\"green\">від</font></i> <<([^<>&]*?)>></div>$"
)
past_participle_pattern = re.compile(
r"^(.*?)\s+<div style=\"margin-left:1em\"><i class=\"p\"><font color=\"green\">p\.p\.</font></i> <i class=\"p\"><font color=\"green\">від</font></i> <<([^<>&]*?)>></div>$"
)
# Read and parse the input file
with open(input_path, "r", encoding="utf-8") as file:
lines = file.readlines()
for idx, line in enumerate(lines):
line = line.strip()
# Match past forms
past_match = past_pattern.match(line)
if past_match:
past_form = past_match.group(1).strip()
base_forms = past_match.group(2).strip().split("|") # Split by |
for base in base_forms:
key_groups[base.strip()].add(past_form)
key_groups[past_form].add(base.strip())
continue
# Match past participle forms
pp_match = past_participle_pattern.match(line)
if pp_match:
pp_form = pp_match.group(1).strip()
base_forms = pp_match.group(2).strip().split("|") # Split by |
for base in base_forms:
key_groups[base.strip()].add(pp_form)
key_groups[pp_form].add(base.strip())
continue
# Regular lines
parts = line.split("\t", 1)
keys = parts[0].strip().split("|") # Split by |
# Track the position of the first base form in the input
for key in keys:
verb_positions[key.strip()] = idx
# Resolve all key groups into canonical forms
resolved_entries = []
seen_keys = set()
for key in key_groups:
if key not in seen_keys:
# Resolve all related keys
stack = [key]
group = set()
while stack:
current = stack.pop()
if current not in seen_keys:
seen_keys.add(current)
group.add(current)
stack.extend(k for k in key_groups[current] if k not in seen_keys)
# Extract base, past, and past participle forms
base_forms = sorted(k for k in group if k in verb_positions)
related_forms = sorted(k for k in group if k not in verb_positions)
if base_forms:
# Add each base form with its associated related forms to the output
for base in base_forms:
# Replace vertical bars with commas in related forms
cleaned_related = [form for related in related_forms for form in related.split("|")]
resolved_entries.append([base] + cleaned_related)
# Write the resolved data to the CSV file
with open(csv_output_path, "w", encoding="utf-8", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerows(resolved_entries)
print(f"Processed irregular verbs saved to {csv_output_path}")
if __name__ == "__main__":
input_file = "temp/05.filter-irregular-nouns.txt" # Input file path
csv_output_file = "temp/verbs-irregular.csv" # CSV output file path
process_irregular_verbs_to_csv(input_file, csv_output_file)