Skip to content

Commit

Permalink
account for words in punctuation checker
Browse files Browse the repository at this point in the history
  • Loading branch information
korakoe committed Sep 23, 2024
1 parent 1bd2f20 commit 2b9fa1e
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 3 deletions.
12 changes: 11 additions & 1 deletion VoPho/langtokenizers/multicoded.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,17 @@ def is_punctuation(self, char):
"""
Check if a character is a punctuation mark.
"""
return not char.isalnum() and not char.isspace() and not self.is_writing_system(char, self.detect_writing_system(char))
has_writing_system = False
if len(char) > 1:
for character in char:
is_writing = self.is_writing_system(character, self.detect_writing_system(character))
if is_writing:
has_writing_system = True
break

return not char.isalnum() and not char.isspace() and not has_writing_system
else:
return not char.isalnum() and not char.isspace() and not self.is_writing_system(char, self.detect_writing_system(char))

def split_text_by_writing_system(self, text):
"""
Expand Down
2 changes: 1 addition & 1 deletion examples/phonemize_texts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from VoPho.engine import Phonemizer
from time import time

input_text = "hello, 你好は中国語でこんにちはと言う意味をしています。 Привет!"
input_text = "hello, 你好は中国語でこんにちはと言う意味をしています。 ます。 (testing, this is a test) [me too], Привет"

engine = Phonemizer()
start = time()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "VoPho"
version = "0.0.1"
version = "0.0.2"
description = "An easy to use Multilingual phonemization meta-library"
readme = "README.md"
authors = [
Expand Down

0 comments on commit 2b9fa1e

Please sign in to comment.