Skip to content

Commit

Permalink
add dynamic language support based on available language dirs (#8)
Browse files Browse the repository at this point in the history
* implement dynamic language detection

* update SafeText for automatic language handling

* remove init files that contain classes no longer needed

* delete languages base module

* move baseprofanity as profanity checker

* update readme

* remove languages/init

* minor fix

* simplified code
  • Loading branch information
SeeknnDestroy authored Dec 11, 2023
1 parent 4e39bd2 commit ed8124e
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 156 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
<div align="center">
<p>
<a align="center" href="" target="_blank">
<img
width="1280"
src="https://github.com/safevideo/safetext/assets/44926076/9af66dde-3a93-4c5b-b802-cb31dffcb2e5"
>
</a>
</p>
</div>

# safetext

Rule-based profanity checking tool for English and Turkish.
Expand Down
120 changes: 99 additions & 21 deletions safetext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
from safetext.utils import detect_language_from_srt, detect_language_from_text
import os

from .languages.de import GermanProfanityChecker
from .languages.en import EnglishProfanityChecker
from .languages.es import SpanishProfanityChecker
from .languages.pt import PortugueseProfanityChecker
from .languages.tr import TurkishProfanityChecker
from safetext.utils import detect_language_from_srt, detect_language_from_text

__version__ = "0.0.4"

Expand All @@ -17,20 +13,17 @@ def __init__(self, language="en"):
if language is not None:
self.set_language(language)

def set_language(self, language):
def set_language(self, language: str):
"""Sets the language of the profanity checker."""
words_file_path = self._get_words_filepath(language)
if not os.path.exists(words_file_path):
raise ValueError(f"No profanity word list found for language '{language}'.")

self.language = language
if language == "en":
self.checker = EnglishProfanityChecker()
elif language == "tr":
self.checker = TurkishProfanityChecker()
elif language == "es":
self.checker = SpanishProfanityChecker()
elif language == "de":
self.checker = GermanProfanityChecker()
elif language == "pt":
self.checker = PortugueseProfanityChecker()
else:
raise ValueError("Language not supported")
self.checker = ProfanityChecker(language)

def _get_words_filepath(self, language: str) -> str:
return os.path.join(os.path.dirname(__file__), f"languages/{language}/words.txt")

def set_language_from_text(self, text):
"""
Expand Down Expand Up @@ -76,7 +69,7 @@ def check_profanity(self, text):
- end: The end index of the profanity word in the text.
"""
if self.checker is None:
raise ValueError("Language not set")
self._auto_set_language(text)
return self.checker.check(text)

def censor_profanity(self, text):
Expand All @@ -90,5 +83,90 @@ def censor_profanity(self, text):
str: The censored text. The profanity words are replaced with asterisks.
"""
if self.checker is None:
raise ValueError("Language not set")
self._auto_set_language(text)
return self.checker.censor(text)

def _auto_set_language(self, text: str):
    """Detect the language of ``text`` and make it the active language.

    The code returned by ``detect_language_from_text`` is handed straight to
    ``set_language``; nothing is returned.

    Args:
        text: The text whose language should be detected.
    """
    self.set_language(detect_language_from_text(text))


class ProfanityChecker:
    """Word-list based profanity checker for a single language.

    The word list is read lazily from ``languages/<language>/words.txt`` next
    to this module (one entry per line) and cached on the instance.
    """

    def __init__(self, language):
        """Remember which language's word list to use.

        Args:
            language (str): Name of the sub-directory of ``languages`` that
                holds the ``words.txt`` file.
        """
        self.language = language

    @property
    def words_filepath(self):
        """Get the filepath for the profanity words file."""
        import pathlib

        return f"{pathlib.Path(__file__).parent.resolve()}/languages/{self.language}/words.txt"

    @property
    def profanity_words(self):
        """Get the profanity words for the language.

        The file is read on first access only; later accesses return the list
        cached in ``_profanity_words``.
        """
        if not hasattr(self, "_profanity_words"):
            self._profanity_words = self._read_words(self.words_filepath)

        return self._profanity_words

    def _check(self, text):
        """Find every whitespace-delimited token of ``text`` that is a listed word.

        Tokens are lower-cased before the lookup, while word-list entries are
        compared exactly as stored in the file.

        Args:
            text (str): The (already preprocessed) text to scan.

        Returns:
            list: One dict per hit, in order of appearance, with the keys
            ``word`` (token as written), ``index`` (1-based token position),
            ``start`` and ``end`` (character offsets into ``text`` such that
            ``text[start:end] == word``).
        """
        import re

        # Build the lookup once per call so each token test is O(1).
        known_words = set(self.profanity_words)

        profanity_infos = []
        # Taking offsets from the regex match keeps start/end correct even with
        # leading whitespace, repeated spaces, tabs or newlines; rebuilding
        # them from split() would assume exactly one space between tokens.
        for position, match in enumerate(re.finditer(r"\S+", text), start=1):
            word = match.group()
            if word.lower() in known_words:
                profanity_infos.append(
                    {
                        "word": word,
                        "index": position,
                        "start": match.start(),
                        "end": match.end(),
                    }
                )

        return profanity_infos

    def _read_words(self, filepath):
        """Read the profanity words from the given file.

        Args:
            filepath (str): Path of a UTF-8 text file with one word per line.

        Returns:
            list: The lines of the file without their line terminators.
        """
        with open(filepath, encoding="utf8") as f:
            return f.read().splitlines()

    def _preprocess(self, text):
        """Preprocess the text before checking for profanity.

        The base implementation returns ``text`` unchanged.
        """
        return text

    def check(self, text):
        """Check the text for profanity.

        Args:
            text (str): The text to check for profanity.

        Returns:
            list: A list of profanity infos. Each profanity info is a dict with the following keys:
                - word: The profanity word.
                - index: The 1-based position of the word among the tokens of the text.
                - start: The start index of the profanity word in the text.
                - end: The end index of the profanity word in the text.
        """
        return self._check(self._preprocess(text))

    def censor(self, text):
        """Return ``text`` with every detected profanity replaced by ``***``.

        Only the exact spans reported by :meth:`check` are replaced, so equal
        substrings inside other words are left untouched.

        Args:
            text (str): The text to censor.

        Returns:
            str: The censored text.
        """
        pieces = []
        cursor = 0
        # Hits arrive in ascending, non-overlapping order. Copying the text
        # between hits keeps every offset valid, which in-place replacement
        # would not once the string length changes.
        for profanity in self.check(text):
            pieces.append(text[cursor:profanity["start"]])
            pieces.append("***")
            cursor = profanity["end"]
        pieces.append(text[cursor:])

        return "".join(pieces)
Empty file removed safetext/languages/__init__.py
Empty file.
79 changes: 0 additions & 79 deletions safetext/languages/base.py

This file was deleted.

8 changes: 0 additions & 8 deletions safetext/languages/de/__init__.py

This file was deleted.

8 changes: 0 additions & 8 deletions safetext/languages/en/__init__.py

This file was deleted.

8 changes: 0 additions & 8 deletions safetext/languages/es/__init__.py

This file was deleted.

8 changes: 0 additions & 8 deletions safetext/languages/pt/__init__.py

This file was deleted.

8 changes: 0 additions & 8 deletions safetext/languages/tr/__init__.py

This file was deleted.

63 changes: 47 additions & 16 deletions safetext/utils.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,62 @@
import os
from typing import List

import pysrt
from lingua import Language, LanguageDetectorBuilder

LANGUAGE_TO_CODE = {
Language.ENGLISH: "en",
Language.TURKISH: "tr",
Language.GERMAN: "de",
Language.FRENCH: "fr",
Language.SPANISH: "es",
}
LANGUAGES = [Language.ENGLISH, Language.TURKISH, Language.GERMAN, Language.FRENCH, Language.SPANISH]
DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build()

def available_languages() -> List[Language]:
    """
    List the languages that have a sub-directory under 'languages'.

    The 'languages' folder next to this module is listed, and only entries
    that are directories are kept. A ``Language`` member is reported when the
    lower-cased name of its ISO 639-1 code equals one of those directory names.

    Returns:
        List[Language]: The matching Language enum values, in enum order.
    """
    languages_root = os.path.join(os.path.dirname(__file__), "languages")

    # Loose files inside the folder are ignored; only directories count.
    lang_dir_names = [
        entry for entry in os.listdir(languages_root) if os.path.isdir(os.path.join(languages_root, entry))
    ]

    return [lang for lang in Language if lang.iso_code_639_1.name.lower() in lang_dir_names]


def initialize_detector() -> LanguageDetectorBuilder:
    """
    Build a language detector limited to the languages from available_languages().

    Returns:
        The object produced by calling ``build()`` on a builder created with
        ``LanguageDetectorBuilder.from_languages``.
    """
    # NOTE(review): the annotation names the builder, but the value returned is
    # the result of build() - confirm the intended return type.
    supported = available_languages()
    builder = LanguageDetectorBuilder.from_languages(*supported)
    return builder.build()


def detect_language_from_text(text: str) -> str:
"""
Detects the language of the given text.
Detects the language of the given text using the dynamically initialized language detector.
Args:
text (str): The text to detect the language of.
Returns:
str: The language code of the detected language.
(e.g. "en", "tr")
str: The ISO 639-1 language code of the detected language.
"""
result = DETECTOR.detect_language_of(text)
return LANGUAGE_TO_CODE[result]
DETECTOR = initialize_detector()
detected_language = DETECTOR.detect_language_of(text)
return detected_language.iso_code_639_1.name.lower() # IsoCode639_1


def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
Expand All @@ -38,8 +71,6 @@ def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
str: The language code of the detected language.
(e.g. "en", "tr")
"""
import pysrt

subs = pysrt.open(srt_file, encoding="utf-8")
text = " ".join([sub.text_without_tags.replace("\n", " ") for sub in subs[:use_first_n_subs]])

Expand Down

0 comments on commit ed8124e

Please sign in to comment.