-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlexibank_rrdm.py
83 lines (69 loc) · 2.87 KB
/
lexibank_rrdm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import attr
import csv
import json
from clldutils.misc import slug
from clldutils.path import Path
from pylexibank import Concept
from pylexibank.dataset import Dataset as BaseDataset
from pylexibank.util import progressbar
def build_concept_id(concept):
    """
    Build a concept id in the format `number_gloss`.

    The number is the part of `concept.id` that precedes the first
    underscore. The gloss is simplified (via `slug`) to lowercase, with no
    extended/accented letters, and no spaces. We use the Concepticon gloss
    by default, using the original name only if the concept is not mapped.

    :param concept: object exposing the attributes `id`, `name` and
        `concepticon_gloss`.
    :return: the identifier string `<number>_<simplified gloss>`.
    """
    if concept.concepticon_gloss:
        gloss = concept.concepticon_gloss
    else:
        # Strip any additional information following a "::" separator.
        # The result must stay a plain string: `slug` operates on text, so
        # wrapping it in a tuple would break id generation for every
        # concept that has no Concepticon mapping.
        gloss = concept.name.split("::")[0]

    return "%s_%s" % (concept.id.split("_")[0], slug(gloss))
class Dataset(BaseDataset):
    """
    Class defining our dataset.
    """

    # Sets the working `dir`ectory and the dataset `id`
    dir = Path(__file__).parent
    id = "rrdm"

    def cmd_makecldf(self, args):
        """
        Method for reading raw data, auxiliary data, and writing CLDF files.

        Sources, languages and concepts are registered on `args.writer`
        first; languages and concepts are looked up by their "Name". Then
        every row of `raw/dataset.csv` (located next to this module) is
        added as a form, with hand-coded segments when the row provides
        them.

        :param args: command arguments; only `args.writer` is used here.
        """
        # Add bibliographic sources
        args.writer.add_sources()

        # Add languages
        languages = args.writer.add_languages(lookup_factory="Name")

        # Add concepts; the function is passed directly, no wrapper needed.
        concepts = args.writer.add_concepts(
            id_factory=build_concept_id, lookup_factory="Name"
        )

        # Resolve the raw file against the dataset directory rather than the
        # current working directory, so the command does not depend on where
        # it is launched from.
        raw_path = self.dir / "raw" / "dataset.csv"

        # Explicit UTF-8 keeps non-ASCII transcriptions intact regardless of
        # the platform default; `newline=""` is what the csv module expects.
        with open(raw_path, encoding="utf-8", newline="") as raw_data:
            # Iterate over each entry and add it; note that, as we lack the
            # difference between values and forms in these mock datasets, we
            # use the raw value for both fields.
            for entry in csv.DictReader(raw_data):
                form = dict(
                    Language_ID=languages[entry["Language"]],
                    Parameter_ID=concepts[entry["Concept"]],
                    Value=entry["Value"],
                    Form=entry["Value"],
                    Source=["Dellert2017"],
                )

                # If the segments are given in the raw source, hand-coded,
                # make a list out of them and call the appropriate function;
                # otherwise, call the default one that accepts no segments.
                if entry["Segments"]:
                    args.writer.add_form_with_segments(
                        Segments=entry["Segments"].split(), **form
                    )
                else:
                    args.writer.add_form(**form)