Skip to content

Commit

Permalink
Remove extraneous spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
polm committed Jul 31, 2023
1 parent 49779d1 commit 5c4a302
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions cutlet/cutlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ def is_ascii(s):
def is_ascii(s):
"""Check if a given string is ASCII."""
# this version is for old Pythons
for c in s:
if c > '\x7f':
return False
return True
for c in s:
if c > '\x7f':
return False
return True

def has_foreign_lemma(word):
"""Check if a word (node) has a foreign lemma.
Expand All @@ -39,7 +39,7 @@ def has_foreign_lemma(word):
with non-foreign-lemma information.
"""

if '-' in word.surface:
if '-' in word.surface:
# TODO check if this is actually possible in vanilla unidic
return False

Expand Down Expand Up @@ -207,7 +207,7 @@ def romaji(self, text, capitalize=True, title=False):
out = out[:-1] + roma[0]
if word.feature.pos2 == '固有名詞':
roma = roma.title()
if (title and
if (title and
word.feature.pos1 not in ('助詞', '助動詞', '接尾辞') and
not (pw and pw.feature.pos1 == '接頭辞')):
roma = roma.title()
Expand All @@ -233,7 +233,7 @@ def romaji(self, text, capitalize=True, title=False):
# 思えば -> omoeba
if nw and nw.feature.pos2 in ('接続助詞'): continue
# 333 -> 333 ; this should probably be handled in mecab
if (word.surface.isdigit() and
if (word.surface.isdigit() and
nw and nw.surface.isdigit()):
continue
# そうでした -> sou deshita
Expand Down Expand Up @@ -267,7 +267,7 @@ def romaji_word(self, word):
# deal with unks first
if word.is_unk:
# at this point it is presumably an unk
# Check character type using the values defined in char.def.
# Check character type using the values defined in char.def.
# This is constant across unidic versions so far but not guaranteed.
if word.char_type == 6 or word.char_type == 7: # hiragana/katakana
kana = jaconv.kata2hira(word.surface)
Expand All @@ -285,16 +285,16 @@ def romaji_word(self, word):
if word.feature.pos1 == '補助記号':
# If it's punctuation we don't recognize, just discard it
return self.table.get(word.surface, '')
elif (self.use_wa and
elif (self.use_wa and
word.feature.pos1 == '助詞' and word.feature.pron == 'ワ'):
return 'wa'
elif (not self.use_he and
elif (not self.use_he and
word.feature.pos1 == '助詞' and word.feature.pron == 'エ'):
return 'e'
elif (not self.use_wo and
elif (not self.use_wo and
word.feature.pos1 == '助詞' and word.feature.pron == 'オ'):
return 'o'
elif (self.use_foreign_spelling and
elif (self.use_foreign_spelling and
has_foreign_lemma(word)):
# this is a foreign word with known spelling
return word.feature.lemma.split('-')[-1]
Expand Down Expand Up @@ -324,7 +324,7 @@ def get_single_mapping(self, pk, kk, nk):
# handle odoriji
# NOTE: This is very rarely useful at present because odoriji are not
# left in readings for dictionary words, and we can't follow kana
# across word boundaries.
# across word boundaries.
if kk in ODORI:
if kk in 'ゝヽ':
if pk: return pk
Expand All @@ -337,7 +337,7 @@ def get_single_mapping(self, pk, kk, nk):
# remaining are 々 for kanji and 〃 for symbols, but we can't
# infer their span reliably (or handle rendaku)
return ''


# handle digraphs
if pk and (pk + kk) in self.table:
Expand All @@ -354,13 +354,13 @@ def get_single_mapping(self, pk, kk, nk):
if kk == 'ー': # 長音符
if pk and pk in self.table: return self.table[pk][-1]
else: return '-'

if kk == 'っ':
if nk:
if self.use_tch and nk == 'ち': return 't'
elif nk in 'あいうえおっ': return '-'
else: return self.table[nk][0] # first character
else:
else:
# seems like it should never happen, but 乗っ|た is two tokens
# so leave this as is and pick it up at the word level
return 'っ'
Expand Down

0 comments on commit 5c4a302

Please sign in to comment.