-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_types.py
144 lines (125 loc) · 3.58 KB
/
data_types.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from dataclasses import dataclass
from typing import Literal
# Part-of-speech tag carried by a token. The empty string is the default
# used for tokens whose tag has not been set.
Pos = Literal[
    "stopword",
    "noun",
    "verb",
    "",
]
# One-letter grammatical case/mood code carried by a token; "" is "unknown".
Case = Literal[
    "n",
    "g",
    "a",
    "j",
    "",
]
"""
In arabic: marfoo3, majroor, mansoob, majzoom, unknown
For nouns: nominative, genetive, accusative, no application, unknown
For verbs: indicative, no application, subjunctive, jussive, unknown
"""
# User-facing metadata for every transliteration option.
#
# Each key is an option id; each value is a 4-tuple of
#   (title, description, example with the option off, example with it on).
# The titles and descriptions are German UI strings and are emitted as-is.
profile_descriptions: dict[str, tuple[str, str, str, str]] = {
    # --- options for running text ---
    # Whether the text is read in pausal form.
    "pausa": (
        "Pausa",
        "Ob der Text in Pausa gelesen wird",
        "al-kalbu",
        "al-kalb",
    ),
    # Whether a word-final ta marbuta is rendered.
    "ta_marbutah": (
        "Ta marbuta",
        "Ob die Ta marbuta am Ende eines Wortes wiedergegeben wird",
        "al-madina",
        "al-madinah",
    ),
    # Whether the diphthongs are rendered as ai/au.
    "diphthongs": (
        "Diphthonge",
        "Ob Diphthonge ai/au wiedergegeben werden",
        "nawm",
        "naum",
    ),
    # Whether semivowels carrying a shaddah are rendered as doubled consonants.
    "double_vowels": (
        "Geminierte Halbvokale",
        "Ob Halbvokale mit Shaddah als doppelte Konsonanten wiedergegeben werden",
        "nīya",
        "niyya",
    ),
    # Whether word-final -iyy/-uww is always rendered as -ī/-ū.
    "nisba": (
        "-ī und -ū",
        "Ob am Ende eines Wortes immer -ī/-ū statt -iyy/-uww wiedergegeben wird",
        "nabiyy o. nabīy, al-ʿarabī",
        "nabī, al-ʿarabī",
    ),
    # Whether a word-initial hamza is rendered.
    "begin_hamza": (
        "Anlautendes Hamza",
        "Ob ein anlautendes Hamza wiedergegeben wird",
        "amr",
        "ʾamr",
    ),
    # Whether the pronouns -hu/-hi are rendered according to pronunciation.
    "hu_hi": (
        "-hu und -hi",
        "Ob die Pronomen -hu und -hi ihrer Aussprache entsprechend wiedergegeben werden",
        "baituhu, abūhu",
        "baituhū, abūhu",
    ),
    # --- options for names ---
    # Whether the input is rendered as a book title.
    "is_book": (
        "Buchtitel",
        "Ob ein Buchtitel wiedergegeben wird",
        "Kitāb al-Aġānī li-l-Imām Abī l-Faraǧ",
        "Kitāb al-Aġānī li-l-imām abī l-faraǧ",
    ),
    # Whether ibn/bin and bint are abbreviated to "b." and "bt.".
    "short_ibn": (
        "b. und bt.",
        "Ob ibn/bin und bint als b. und bt. abgekürzt werden",
        "Muḥammad ibn ʿAbdallāh",
        "Muḥammad b. ʿAbdallāh",
    ),
}
@dataclass
class Profile:
    """Switches controlling how running text is transliterated.

    The field names match the option ids used as keys in
    ``profile_descriptions``; the values below are the defaults used when
    an option is not given explicitly.
    """

    # Read the text in pausal form (e.g. "al-kalb" instead of "al-kalbu").
    pausa: bool = False
    # Render a word-final ta marbuta (e.g. "al-madinah").
    ta_marbutah: bool = False
    # Render diphthongs as ai/au (e.g. "naum" instead of "nawm").
    diphthongs: bool = False
    # Render semivowels with shaddah as doubled consonants (e.g. "niyya").
    double_vowels: bool = True
    # Always render word-final -iyy/-uww as -ī/-ū (e.g. "nabī").
    nisba: bool = True
    # Render a word-initial hamza (e.g. "ʾamr").
    begin_hamza: bool = False
    # Render the pronouns -hu/-hi according to pronunciation (e.g. "baituhū").
    hu_hi: bool = True

    # TODO: imalah, ishmam
    # TODO: two dots (German: "Doppelpunkte") for emphatic consonants
    # TODO: alif maqsura with a dot below
@dataclass
class Token:
    """A single token of Arabic input plus its analysis and transliteration.

    Only ``arab`` is required. ``__post_init__`` always overwrites
    ``original`` (with ``arab``) and ``latin_after`` (derived from
    ``after``), so values passed for those two fields are discarded.
    The remaining fields are plain storage that this class never fills in
    itself.
    """

    # Text of the token; a copy is kept in `original` at construction time.
    arab: str
    # Text following the token; converted into `latin_after` on construction.
    after: str = ""
    # Snapshot of `arab` taken in __post_init__.
    original: str = ""
    lemma: str = ""
    pos: Pos = ""
    gram_case: Case = ""
    is_definite: bool = False
    prefix: str = ""
    suffix: str = ""
    is_pausa: bool = False
    is_end_of_sentence: bool = False
    is_idafah: bool = False
    # When true, `result` capitalises the transliteration.
    is_name: bool = False
    is_nisba: bool = False
    # The three pieces concatenated by `result`:
    # latin_prefix + latin + latin_after.
    latin: str = ""
    latin_after: str = ""
    latin_prefix: str = ""

    def __post_init__(self):
        """Derive ``latin_after`` from ``after`` and snapshot ``arab``."""
        # Imported here rather than at module level -- presumably to avoid a
        # circular import with the `data` module (TODO confirm).
        import data

        self.latin_after = data.sub_after(self.after)
        self.original = self.arab

    @property
    def is_genetive(self) -> bool:
        """Whether this token is a noun in the genitive (``"g"``) case.

        The historical spelling of the property name is kept so existing
        callers continue to work.
        """
        return self.pos == "noun" and self.gram_case == "g"

    @property
    def result(self) -> str:
        """Return the final transliteration of this token.

        The result is ``latin_prefix + latin + latin_after``. For names
        (``is_name``) the ``latin`` part is capitalised via
        ``str.capitalize``. Because a leading ʿ or ʾ has no uppercase
        form, the capital goes on the vowel that follows it instead. This
        covers both short (a, i, u) and long (ā, ī, ū) vowels, so
        "ʿīsā" becomes "ʿĪsā" just as "ʿumar" becomes "ʿUmar".
        """
        latin = self.latin
        if not self.is_name:
            return self.latin_prefix + latin + self.latin_after

        # ʿ/ʾ followed by a vowel: leave the sign alone and capitalise
        # from the vowel onwards.
        has_leading_sign = len(latin) >= 2 and latin[0] in "ʿʾ"
        if has_leading_sign and latin[1] in "aiuāīū":
            latin = latin[0] + latin[1:].capitalize()
        else:
            latin = latin.capitalize()
        return self.latin_prefix + latin + self.latin_after
# A sentence is represented as a plain list of Token objects.
Sentence = list[Token]
@dataclass
class NameProfile:
    """Switches controlling how names are transliterated.

    The field names match option ids used as keys in
    ``profile_descriptions``; the values below are the defaults used when
    an option is not given explicitly.
    """

    # --- name-specific options ---
    # Render the input as a book title.
    is_book: bool = False
    # Abbreviate ibn/bin and bint to "b." and "bt.".
    short_ibn: bool = True

    # --- options that also exist for running text ---
    # Render a word-final ta marbuta (e.g. "al-madinah").
    ta_marbutah: bool = False
    # Render diphthongs as ai/au (e.g. "naum" instead of "nawm").
    diphthongs: bool = False
    # Render semivowels with shaddah as doubled consonants (e.g. "niyya").
    double_vowels: bool = True
    # Render a word-initial hamza (e.g. "ʾamr").
    begin_hamza: bool = False
    # Render the pronouns -hu/-hi according to pronunciation (e.g. "baituhū").
    hu_hi: bool = True
    # Always render word-final -iyy/-uww as -ī/-ū (e.g. "nabī").
    nisba: bool = True