-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_dataset.py
106 lines (89 loc) · 4.78 KB
/
test_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import unittest
import csv
import numpy as np
import panphon
from panphon.segment import Segment
from data_processing.bccwj_dataset import BCCWJDataset
from data_processing.word import Word
from data_processing.transcriber import Transcriber
# Expected len() of the dataset under test (as of last generation).
OUTPUT_CSV_LENGTH = 36396
# Sequence length before padding.
MAX_SEQ_LEN_NO_PAD = 20
# First dimension of every array the dataset returns.
# Presumably one slot more than the unpadded maximum -- TODO confirm.
MAX_SEQ_LEN_WITH_PAD = MAX_SEQ_LEN_NO_PAD + 1
# Second dimension of every array the dataset returns.
NUM_PHONETIC_FEATURES = 22
class TestTranscriber(unittest.TestCase):
    """Tests for Transcriber's katakana-to-IPA and feature-binarisation helpers.

    Partition -- phenomena the katakana cases want to cover:
        glide substitution
        geminate consonant realization
        long vowels
        syllable final nasal place assimilation
        affricate joining
        yotsugana realization word-initially and intervocalically
    """

    # (katakana input, expected IPA, phenomenon the case exercises)
    _KATAKANA_CASES = (
        ('オトナ', 'otona', 'baseline'),
        ('インターネット', 'intaːnetto', 'geminate consonant'),
        ('アズキイリ', 'azɯkiːɾi', 'yotsugana intervocalically, long vowels'),
        ('タニン', 'taɲiɴ', 'nasal place assimilation'),
        ('キョウシツ', 'kʲoːɕit͡sɯ', 'glide substitution, affricate joining'),
        ('ジコウ', 'd͡ʑikoː', 'word initial yotsugana'),
        # TODO this depends on how broad the transcription is
        ('コンバンハ', 'kombaɰ̃ɰa', 'breadth of transcription'),
    )

    def setUp(self):
        """Create a fresh Transcriber for every test."""
        self.t = Transcriber()

    def test_katakana_to_ipa(self):
        """Every katakana input must transcribe to its expected IPA string."""
        for katakana, expected_ipa, phenomenon in self._KATAKANA_CASES:
            # subTest lets the loop continue past a failure, so one broken
            # rule no longer hides the results of the remaining cases.
            with self.subTest(katakana=katakana, covers=phenomenon):
                self.assertEqual(self.t.katakana_to_ipa(katakana), expected_ipa)

    def test_fv_to_binary_fv(self):
        """A value of 1 must map to 1; values of -1 and 0 must both map to 0."""
        # for a putative 4 phonetic feature vector set
        fvs = [
            [1, -1, 0, 1],
            [-1, -1, 1, 1],
            [0, 0, 0, -1],
            [0, 1, 1, 1],
            [1, 0, -1, 0],
        ]
        binary_target = [
            [1, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 0, 0, 0],
            [0, 1, 1, 1],
            [1, 0, 0, 0],
        ]
        binary_fvs = self.t.fv_to_binary_fv(fvs)
        self.assertEqual(binary_fvs, binary_target)
class TestWord(unittest.TestCase):
    """Tests for Word's string form and for transcribing its IPA into features."""

    def setUp(self):
        """Build the fixture word (IPA 'sɯika') and the helpers the tests use."""
        ipa = 'sɯika'
        # Not referenced by the tests in this class; kept so the fixture is unchanged.
        self._ft = panphon.FeatureTable()
        self._t = Transcriber()
        self.w = Word(7976, '西瓜', 'スイカ', '漢', ipa)

    def test_to_str(self):
        """str(Word) must render exactly as the expected string below."""
        expected = '7976: 西瓜, スイカ, sɯika'
        # The message names the EXPECTED text; assertEqual already prints the
        # actual value, so echoing str(self.w) here would be misleading.
        self.assertEqual(str(self.w), expected, 'Should be ' + expected)

    def test_segments(self):
        """The segment at index 4 of the transcription must match the reference segment."""
        feature_names = [
            'syl', 'son', 'cons', 'cont', 'delrel', 'lat', 'nas', 'strid',
            'voi', 'sg', 'cg', 'ant', 'cor', 'distr', 'lab', 'hi',
            'lo', 'back', 'round', 'velaric', 'tense', 'long', 'hitone', 'hireg',
        ]
        # Adjacent literals concatenate into the same single feature string.
        feature_spec = (
            '[+syl, +son, -cons, +cont, -delrel, -lat, -nas, 0strid, '
            '+voi, -sg, -cg, 0ant, -cor, 0distr, -lab, -hi, '
            '+lo, -back, -round, -velaric, +tense, -long, 0hitone, 0hireg]'
        )
        a_segment = Segment(feature_names, ftstr=feature_spec)
        segments = self._t.ipa_to_panphon_segments(self.w.ipa)
        # Compare via distance instead of ==: per the original author's note,
        # Segment equality is by reference -- TODO confirm against panphon.
        # Index 4 is presumably the final 'a' of 'sɯika'.
        distance = a_segment - segments[4]
        self.assertEqual(distance, 0, 'Segments not identical')

    def test_feature_vectors(self):
        """The feature vector at index 4 must equal the reference values."""
        feature_vectors = self._t.ipa_to_feature_vectors(self.w.ipa)
        a_features = [
            1, 1, -1, 1, -1, -1, -1, 0, 1, -1, -1,
            0, -1, 0, -1, -1, 1, -1, -1, -1, 1, -1,
        ]
        self.assertEqual(feature_vectors[4], a_features, 'Feature vectors not identical')
class TestDataset(unittest.TestCase):
    """Checks BCCWJDataset's length and the (array, length) entries it returns."""

    def setUp(self):
        """Instantiate the dataset afresh for each test."""
        self.dataset = BCCWJDataset()

    def test_size(self):
        """len(dataset) must equal the recorded OUTPUT_CSV_LENGTH."""
        expected_size = OUTPUT_CSV_LENGTH
        self.assertEqual(len(self.dataset), expected_size)

    def test_output_dims(self):
        """The first iterated entry is an (ndarray, float) pair with the padded shape."""
        array, length = next(iter(self.dataset))
        self.assertIsInstance(array, np.ndarray)
        self.assertIsInstance(length, float)
        padded_shape = (MAX_SEQ_LEN_WITH_PAD, NUM_PHONETIC_FEATURES)
        self.assertEqual(array.shape, padded_shape)

    def _check_entry(self, index, word_length):
        """Assert the shape, length, boundary row and final PAD row of dataset[index]."""
        vec_array, reported_length = self.dataset[index]
        self.assertEqual(vec_array.shape, (MAX_SEQ_LEN_WITH_PAD, NUM_PHONETIC_FEATURES))
        self.assertEqual(reported_length, word_length)
        # the word boundary token: the row at index word_length is all ones
        boundary_row = vec_array[word_length, :]
        self.assertTrue(all(value == 1 for value in boundary_row))
        # the last token should be a PAD token: all zeros
        final_row = vec_array[-1, :]
        self.assertTrue(all(value == 0 for value in final_row))

    def test_accesses(self):
        """Index access returns the expected layout for a first and a middle entry."""
        self._check_entry(0, 2)
        self._check_entry(17511, 7)
# Run every TestCase in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()