
Commit

add tokenizer
tatsumoto-ren committed Apr 18, 2022
1 parent f3f6513 commit f61e3ab
Showing 3 changed files with 90 additions and 3 deletions.
2 changes: 1 addition & 1 deletion mecab_controller
12 changes: 10 additions & 2 deletions reading.py
@@ -24,6 +24,7 @@

 from .helpers import *
 from .mecab_controller import MecabController
+from .tokens import tokenize, ParseableToken


 # Focus lost hook
@@ -108,8 +109,15 @@ def get_skip_numbers() -> List[str]:
     return list(NUMBERS) if config.get('skip_numbers') is True else []


-def reading(expr: str) -> str:
-    return mecab.reading(expr)
+def reading(src_text: str) -> str:
+    substrings = []
+    for token in tokenize(src_text):
+        if isinstance(token, ParseableToken):
+            substrings.append(mecab.reading(token))
+        else:
+            substrings.append(token)
+
+    return ''.join(substrings).strip()


 mecab = MecabController(skip_words=get_skip_words() + get_skip_numbers())
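With this change, reading() no longer feeds the whole note text to MeCab. tokenize() splits the text first; only ParseableToken spans are sent to MeCab, while HTML tags, [sound:...] references, punctuation, and other non-Japanese runs pass through unchanged. A minimal sketch of that control flow, using a hypothetical stub in place of MecabController (the real class invokes MeCab and returns furigana notation); it assumes tokens.py is importable from the working directory:

    from tokens import tokenize, ParseableToken

    class FakeMecab:
        # Hypothetical stand-in: the real MecabController.reading()
        # returns the expression annotated with furigana.
        def reading(self, expr: str) -> str:
            return f"｟{expr}｠"

    mecab = FakeMecab()

    def reading(src_text: str) -> str:
        substrings = []
        for token in tokenize(src_text):
            # Only spans classified as parseable reach MeCab.
            if isinstance(token, ParseableToken):
                substrings.append(mecab.reading(token))
            else:
                substrings.append(token)
        return ''.join(substrings).strip()

    print(reading('<b>私達</b>は昨日着いた。'))
    # -> <b>｟私達｠</b>｟は昨日着いた｠。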
79 changes: 79 additions & 0 deletions tokens.py
@@ -0,0 +1,79 @@
# Copyright: Ren Tatsumoto <tatsu at autistici.org>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html

import re
from typing import Optional, Iterable, Sequence

RE_FLAGS = re.MULTILINE | re.IGNORECASE
HTML_AND_MEDIA_REGEX = re.compile(
    r'<[^<>]+>|\[sound:[^\[\]]+]',
    flags=RE_FLAGS
)
NON_JP_REGEX = re.compile(
    # Reference: https://stackoverflow.com/questions/15033196/
    # Added `[` and `]` at the end to keep furigana notation.
    # Furigana notation is going to be parsed separately.
    # Added arabic numbers, both ASCII (0-9) and fullwidth (０-９).
    r'[^\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff66-\uff9f\u4e00-\u9fff\u3400-\u4dbf0-9０-９\]\[]+',
    flags=RE_FLAGS
)
JP_SEP_REGEX = re.compile(
    # Reference: https://wikiless.org/wiki/List_of_Japanese_typographic_symbols
    r'[仝  ・、※【】「」〒◎×〃゜『』《》〜~〽,.。〄〇〈〉〓〔〕〖〗〘〙〚〛〝〞〟〠〡〢〣〥〦〧〨〭〮〯〫〬〶〷〸〹〺〻〼〾〿!?…ヽヾゞ〱〲〳〵〴()[]{}⦅⦆゠=‥•◦﹅﹆*♪♫♬♩ⓍⓁⓎ]',
    flags=RE_FLAGS
)
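# Examples of what each pattern matches:
#   HTML_AND_MEDIA_REGEX: '<div>', '<img src="はな.jpg">', '[sound:はな.mp3]'
#   NON_JP_REGEX:         runs of non-Japanese text such as 'Lorem ipsum ' or 'Тест'
#                         (digits and square brackets are deliberately kept)
#   JP_SEP_REGEX:         punctuation such as '。', '、', '「」', '・'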


class Token(str):
    pass


class ParseableToken(Token):
    pass


def clean_furigana(expr: str) -> str:
    return re.sub(r' ?([^ \[\]]+)\[[^ \[\]]+]', r'\g<1>', expr, flags=RE_FLAGS)
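    # E.g. clean_furigana('富竹[とみたけ]さん 今[いま]') returns '富竹さん今':
    # bracketed readings and the space before the base word are removed.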


def mark_non_jp_token(m: re.Match) -> str:
    return "<no-jp>" + m.group() + "</no-jp>"


def tokenize(expr: str, regexes: Optional[Sequence[re.Pattern]] = None) -> Iterable[Token]:
    """
    Splits expr into tokens.
    Each token is either parseable with mecab or not.
    Furigana is removed from parseable tokens, if present.
    """
    if not regexes:
        expr = clean_furigana(expr)
        regexes = HTML_AND_MEDIA_REGEX, NON_JP_REGEX, JP_SEP_REGEX

    expr = re.sub(regexes[0], mark_non_jp_token, expr)

    for part in re.split(r'(<no-jp>.*?</no-jp>)', expr, flags=RE_FLAGS):
        if part:
            if m := re.fullmatch(r'<no-jp>(.*?)</no-jp>', part, flags=RE_FLAGS):
                yield Token(m.group(1))
            elif len(regexes) > 1:
                yield from tokenize(part, regexes[1:])
            else:
                yield ParseableToken(part.replace(' ', ''))
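# For example, tokenize('<b>着く</b>。') yields
# Token('<b>'), ParseableToken('着く'), Token('</b>'), Token('。').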


def main():
    print(clean_furigana("富竹[とみたけ]さん 今[いま] 扉[とびら]の 南京錠[なんきんじょう]いじってませんでした?"))

    expr = (
        "<div>Lorem ipsum dolor sit amet, [sound:はな.mp3]<img src=\"はな.jpg\"> "
        "consectetur adipiscing<br> elit <b>私達</b>は昨日ロンドンに着いた。おはよう。 Тест.</div>"
        "1月8日八日.彼女は12月のある寒い夜に亡くなった。"
        "情報処理[じょうほうしょり]の 技術[ぎじゅつ]は 日々[にちにち,ひび] 進化[しんか]している。"
    )
    for token in tokenize(expr):
        print(f"{token.__class__.__name__}({token})")


if __name__ == '__main__':
    main()
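Running tokens.py directly exercises both helpers: the first print shows the furigana-stripped sentence (富竹さん今扉の南京錠いじってませんでした?), and the token loop then prints one line per token, beginning with lines like:

    Token(<div>)
    Token(Lorem ipsum dolor sit amet, )
    Token([sound:はな.mp3])
    Token(<img src="はな.jpg">)
    Token( consectetur adipiscing)
    ...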
