diff --git a/requirements.txt b/requirements.txt index 537208a..abda458 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ lingua-language-detector==1.3.1 pysrt +requests +python-dotenv diff --git a/safetext/__init__.py b/safetext/__init__.py index fcfd63d..22ca2e5 100644 --- a/safetext/__init__.py +++ b/safetext/__init__.py @@ -1,19 +1,39 @@ +import logging import os import re -from typing import Dict, List +from typing import Dict, List, Optional + +import requests from safetext.utils import detect_language_from_srt, detect_language_from_text -__version__ = "0.0.5" +__version__ = "0.0.6" class SafeText: + """A class to provide text analysis for profanity detection using the built-in ProfanityChecker and + optionally validating the results against the ModerateContentAPI. + """ + + def __init__(self, language: str = "en", validate_profanity: bool = False): + """ + Initializes the SafeText with a specified language and validation option. - def __init__(self, language="en"): + Args: + language (str): The language code for the profanity list. (ISO 639-1) + validate_profanity (bool): Flag to enable validation of profanity detection results + against ModerateContentAPI when using ProfanityChecker. + """ self.language = language self.checker = None if language is not None: self.set_language(language) + self.moderate_content_api_key = os.getenv('MODERATE_CONTENT_API_KEY') + self.validate_profanity = validate_profanity + + if validate_profanity and not self.moderate_content_api_key: + raise ValueError( + "MODERATE_CONTENT_API_KEY key must set as an environment variable for validation.") def set_language(self, language: str): """Sets the language of the profanity checker.""" @@ -56,9 +76,11 @@ def set_language_from_srt(self, srt_file, use_first_n_subs=10): language = detect_language_from_srt(srt_file, use_first_n_subs) self.set_language(language) - def check_profanity(self, text): + def check_profanity(self, text: str): """ - Checks the given text for profanity. + Checks the given text for profanity using the selected method. If validation is enabled, it logs the + probable missing bad words and false positives between the results of ProfanityChecker and + ModerateContentAPI. Args: text (str): The text to check for profanity. @@ -70,9 +92,12 @@ def check_profanity(self, text): - start: The start index of the profanity word in the text. - end: The end index of the profanity word in the text. """ - if self.checker is None: - self._auto_set_language(text) - return self.checker.check(text) + checker_results = self.checker.check(text) + if self.validate_profanity: + checker_bad_words = [profanity["word"] for profanity in checker_results] + self._validate_profanity(text, checker_bad_words) + + return checker_results def censor_profanity(self, text): """ @@ -88,9 +113,48 @@ def censor_profanity(self, text): self._auto_set_language(text) return self.checker.censor(text) - def _auto_set_language(self, text: str): - detected_language = detect_language_from_text(text) - self.set_language(detected_language) + def get_bad_words(self, text: str = None, profanity_results: Optional[List[Dict]] = None) -> List[str]: + """ + Retrieves a list of bad words found in the given text or from provided profanity results. + + Args: + text (str, optional): The text to scan for profanities. + profanity_results (Optional[List[Dict]], optional): Pre-calculated profanity results. + + Returns: + List[str]: A list of bad words detected in the text. + """ + return self.checker.get_bad_words(text, profanity_results) + + def _auto_set_language(self, text): + """ + Detects the language of the given text and sets the language of the profanity checker. + + Args: + text (str): The text to detect the language of. + """ + language = detect_language_from_text(text) + self.set_language(language) + + def _validate_profanity(self, text: str, checker_bad_words: list): + """ + Validates the profanity detection results of ProfanityChecker against the ModerateContentAPI. + + Args: + text (str): The text that was checked. + checker_bad_words (list): The list of bad words detected by ProfanityChecker. + """ + api_bad_words = ModerateContentAPI(self.moderate_content_api_key).get_bad_words(text) + + missing_words = set(api_bad_words) - set(checker_bad_words) + false_positives = set(checker_bad_words) - set(api_bad_words) + + if missing_words: + logging.info(f"Possible missing bad words: {missing_words}") + if false_positives: + logging.info(f"Possible false detected words: {false_positives}") + if not missing_words and not false_positives: + logging.info("All good for validation!") class ProfanityChecker: @@ -189,17 +253,29 @@ def _find_profanity_phrase(self, profanity: str, lower_text: str, profanity_info }) start = lower_text.find(profanity, end) - def check(self, text: str) -> List[Dict]: + def get_bad_words(self, text: str = None, profanity_results: Optional[List[Dict]] = None) -> List[str]: """ - Checks the given text for profanity. + Retrieves a list of bad words found in the given text or from provided profanity results. Args: - text (str): The text to check for profanity. + text (str, optional): The text to scan for profanities. + profanity_results (Optional[List[Dict]], optional): Pre-calculated profanity results from + ProfanityChecker.check() method. Returns: - List[Dict]: A list of dictionaries, each containing information about a found profanity. + List[str]: A list of bad words detected in the text. """ - return self._find_profanities(text) + if text is None and profanity_results is None: + raise ValueError("Either text or profanity_results must be provided.") + + if profanity_results is None: + profanity_results = self.check(text) + + bad_words = [] + for profanity in profanity_results: + bad_words.append(profanity["word"]) + + return bad_words def censor(self, text: str) -> str: """ @@ -217,3 +293,108 @@ def censor(self, text: str) -> str: end_index = profanity["end"] text = text[:start_index] + '*' * (end_index - start_index) + text[end_index:] return text + + def check(self, text: str) -> List[Dict]: + """ + Checks the given text for profanity. + + Args: + text (str): The text to check for profanity. + + Returns: + List[Dict]: A list of dictionaries, each containing information about a found profanity. + """ + return self._find_profanities(text) + + +class ModerateContentAPI: + """ + A class to interact with the Moderate Content API for profanity detection. + + This class facilitates the detection of bad words in text using the + Moderate Content API. It allows for fetching a list of bad words detected + in a given text. + + Attributes: + api_key (str): The API key for accessing the Moderate Content API. + """ + + def __init__(self, api_key: Optional[str] = None): + """ + Initializes the ModerateContentAPI with an optional API key. + + Args: + api_key (str, optional): The API key for the Moderate Content API. + If not provided, it will look for an API key + in the MODERATE_CONTENT_API_KEY environment variable. + """ + self.api_key = api_key or os.getenv('MODERATE_CONTENT_API_KEY') + if not self.api_key: + raise ValueError("API key must be provided or set as an environment variable.") + + def _request_api(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> Dict: + """ + Makes a request to the Moderate Content API and returns the response. + + Args: + text (str): The text to analyze for bad words. + exclude (str, optional): A comma-delimited list of words to exclude from checking. + replace (str, optional): A string of characters to replace bad words with. + + Returns: + Dict: A dictionary containing the API response. + """ + api_url = "https://api.moderatecontent.com/text/" + params = {'key': self.api_key, 'msg': text, 'exclude': exclude, 'replace': replace} + try: + response = requests.get(api_url, params=params) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + # Log the exception details here + raise ConnectionError("Failed to connect to the Moderate Content API.") from e + + def get_bad_words(self, + text: str, + exclude: Optional[str] = None, + replace: Optional[str] = None) -> List[str]: + """ + Analyzes the given text and returns a list of bad words found. + + Args: + text (str): The text to analyze for bad words. + exclude (str, optional): A comma-delimited list of words to exclude from checking. + replace (str, optional): A string of characters to replace bad words with. + + Returns: + List[str]: A list of bad words detected in the text. + """ + response = self._request_api(text, exclude, replace) + return response.get('bad_words', []) + + def censor(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> str: + """ + Analyzes the given text and returns a censored version of it. + + Args: + text (str): The text to analyze for bad words. + exclude (str, optional): A comma-delimited list of words to exclude from checking. + replace (str, optional): A string of characters to replace bad words with. + + Returns: + str: The censored text with bad words replaced by asterisks. + """ + response = self._request_api(text, exclude, replace) + return response.get('clean', '') + + def check(self, text: str): + """ + Checks the given text for profanity. + + Args: + text (str): The text to check for profanity. + + Returns: + List[str]: A list of bad words detected in the text. + """ + return self.get_bad_words(text) diff --git a/safetext/languages/en/words.txt b/safetext/languages/en/words.txt index 6f8fdff..68cd3cc 100644 --- a/safetext/languages/en/words.txt +++ b/safetext/languages/en/words.txt @@ -46,7 +46,6 @@ ball licking ball sack ball sucking ballbag -balls ballsack bangbros bangbus @@ -118,7 +117,6 @@ bukkake bulldyke bullet vibe bullshit -bum bung hole bunghole bunny fucker @@ -502,6 +500,7 @@ mo-fo mof0 mofo mong +moron mothafuck mothafucka mothafuckas @@ -668,7 +667,6 @@ rimming rosy palm rosy palm and her 5 sisters rusty trombone -s hit s&m s.o.b. s_h_i_t