forked from vrandezo/lexicographic_coverage
-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate-statistics.py
99 lines (93 loc) · 2.28 KB
/
generate-statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import meta
# Per-language coverage statistics.
#
# For every language in ``meta.languages`` this script reads two files from
# ``meta.output_dir``:
#   * ``formlist-<language>.txt`` - one form per line,
#   * ``wordlist-<language>.txt`` - one "<word> <count>" entry per line
#     (the count is whatever follows the last space),
# and writes ``stats-<language>.txt`` with wiki markup summarising how many
# distinct words and how many tokens are covered by the form list or by the
# language's filter.
#
# NOTE(review): files are opened with the platform default encoding, exactly
# as before; if the other pipeline steps write UTF-8 explicitly, pass the same
# encoding here - confirm against the scripts that produce these files.

# Wiki-markup template for the statistics block. Doubled braces are
# str.format escapes and are emitted as literal "{{" / "}}".
_STATS_TEMPLATE = (
    "<table><tr><td>\n"
    "* Forms in Wikidata: {forms:,}\n"
    "* Forms in Wikipedia: {words:,}\n"
    "* Tokens: {tokens:,}\n"
    "* Covered forms: {covered_words:,} ({covered_words_share:.1%})\n"
    "* Missing forms: {missing_words:,} ({missing_words_share:.1%})\n"
    "* Covered tokens: {covered_tokens:,} ({covered_tokens_share:.1%})\n"
    "* Missing tokens: {missing_tokens:,} ({missing_tokens_share:.1%})\n"
    "* [[Wikidata:Lexicographical coverage/{language}/Missing"
    "|Most frequent missing forms]]\n"
    "</td><td>\n"
    "{{{{Graph:Chart"
    "|width=100"
    "|type=pie"
    "|legend=Forms"
    "|x=Covered,Missing"
    "|y1={covered_words},{missing_words}}}}}\n"
    "</td><td>"
    "{{{{Graph:Chart"
    "|width=100"
    "|type=pie"
    "|legend=Tokens"
    "|x=Covered,Missing"
    "|y1={covered_tokens},{missing_tokens}}}}}\n"
    # The row is closed with </tr>; the original emitted a stray second </td>.
    "</td></tr></table>\n"
    "\n"
)


def _share(part, whole):
    """Return ``part / whole`` as a float, or 0.0 when ``whole`` is zero.

    Guards the percentage computation against an empty word list, which
    would otherwise raise ZeroDivisionError.
    """
    if not whole:
        return 0.0
    return 1.0 * part / whole


for language in meta.languages:
    # Only used for membership tests below; presumably a set of words that
    # count as covered regardless of the form list - verify in meta.
    filtered = meta.load_filter(language)

    # --- Load the known forms -------------------------------------------
    formlist_name = "formlist-" + language + ".txt"
    try:
        fh = open(meta.output_dir + "/" + formlist_name)
    except (IOError, OSError):
        # IOError is listed for interpreters where it is not an OSError alias.
        print("Couldn't read {}".format(formlist_name))
        continue
    forms = set()
    with fh:  # ensure the handle is closed even if reading fails
        for line in fh:
            # Forms are normalised to lower case with all dots removed.
            forms.add(line.strip().lower().replace(".", ""))

    # --- Count covered / uncovered words and tokens ---------------------
    wordlist_name = "wordlist-" + language + ".txt"
    try:
        fh = open(meta.output_dir + "/" + wordlist_name)
    except (IOError, OSError):
        print("Couldn't read {}".format(wordlist_name))
        continue
    tokencount = 0
    wordcount = 0
    coveredtokens = 0
    uncoveredtokens = 0
    coveredwords = 0
    uncoveredwords = 0
    with fh:
        for line in fh:
            entry = line.strip()
            if not entry:
                # A blank (e.g. trailing) line carries no word/count pair.
                continue
            # Split on the LAST space so words containing spaces survive.
            word, _, num = entry.rpartition(" ")
            count = int(num)
            tokencount += count
            wordcount += 1
            if word in forms or word in filtered:
                coveredwords += 1
                coveredtokens += count
            else:
                uncoveredwords += 1
                uncoveredtokens += count

    # --- Write the statistics page fragment -----------------------------
    stats_name = "stats-" + language + ".txt"
    try:
        output = open(meta.output_dir + "/" + stats_name, "w")
    except (IOError, OSError):
        print("Couldn't open {}".format(stats_name))
        continue
    with output:  # closes the file on success and on error alike
        if meta.data[language]["source"] == "unileipzig":
            output.write(
                "These statistics use corpus data from the "
                "[{} Leipzig Corpora Collection].\n"
                .format(meta.data[language]["infopage"])
            )
        output.write(
            _STATS_TEMPLATE.format(
                forms=len(forms),
                words=wordcount,
                tokens=tokencount,
                covered_words=coveredwords,
                covered_words_share=_share(coveredwords, wordcount),
                missing_words=uncoveredwords,
                missing_words_share=_share(uncoveredwords, wordcount),
                covered_tokens=coveredtokens,
                covered_tokens_share=_share(coveredtokens, tokencount),
                missing_tokens=uncoveredtokens,
                missing_tokens_share=_share(uncoveredtokens, tokencount),
                language=language,
            )
        )