diff --git a/.editorconfig b/.editorconfig index 74321681..6894a512 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,28 +1,24 @@ # EditorConfig is awesome: https://EditorConfig.org -# top-most EditorConfig file root = true -# Unix-style newlines with a newline ending every file [*] indent_style = space trim_trailing_whitespace = true end_of_line = lf charset = utf-8 -# 4 space indentation [*.py] +indent_size = 4 max_line_length = 119 insert_final_newline = true [*.json] indent_size = 2 -# Tab indentation (no size specified) [Makefile] indent_style = tab -# Matches the exact files either package.json or .travis.yml [config.yml] indent_size = 2 diff --git a/service/protect.py b/service/protect.py new file mode 100644 index 00000000..2f0617b0 --- /dev/null +++ b/service/protect.py @@ -0,0 +1,63 @@ +from datetime import datetime, timedelta + +from pywikibot import Site, Page + +from tools.bots import BotException +from tools.bots.pi import CanonicalBot +from tools.petscan import PetScan + + +class Protect(CanonicalBot): + def __init__(self, wiki, debug): + CanonicalBot.__init__(self, wiki, debug, log_to_wiki=False) + self.timeout: timedelta = timedelta(hours=2) + + def __enter__(self): + super().__enter__() + if not self.data: + self.logger.warning("Try to get the deprecated data back.") + try: + self.data.get_deprecated() + except BotException: + self.logger.warning("There isn't deprecated data to reload.") + return self + + @staticmethod + def _prepare_searcher() -> PetScan: + searcher = PetScan() + searcher.add_positive_category("Fertig") + searcher.add_negative_category("Korrigiert") + searcher.add_negative_category("Unkorrigiert") + searcher.add_negative_category("Unvollständig") + searcher.set_sort_criteria("date") + searcher.set_sortorder_decending() + searcher.set_search_depth(1) + searcher.set_timeout(120) + searcher.last_change_after(datetime(year=2022, month=9, day=3)) + return searcher + + def task(self) -> bool: + searcher = self._prepare_searcher() + self.logger.info(str(searcher)) + lemma_list = searcher.get_combined_lemma_list(self.data) + print(len(lemma_list)) + for idx, lemma_str in enumerate(lemma_list): + self.data[lemma_str] = datetime.now().strftime("%Y%m%d%H%M%S") + lemma = Page(self.wiki, lemma_str) + self.logger.debug(f"check lemma {lemma.title()} for protection") + if not lemma.protection(): + self.logger.debug(f"protect lemma {lemma.title()}") + lemma.protect(reason="Schutz fertiger Seiten", + protections={'move': 'autoconfirmed', 'edit': 'autoconfirmed'}) + if self._watchdog(): + self.logger.info(f"checked {idx} lemmas") + break + return True + + +# PYWIKIBOT_DIR=/home/esommer/.pywikibot_protect + +if __name__ == "__main__": + WS_WIKI = Site(code="de", fam="wikisource", user="THEprotectIT") + with Protect(wiki=WS_WIKI, debug=False) as bot: + bot.run() diff --git a/service/starter.sh b/service/starter.sh index 22b22597..c40a4bfe 100755 --- a/service/starter.sh +++ b/service/starter.sh @@ -12,3 +12,5 @@ sudo /usr/local/bin/python3.10 -m pip install -r requirements.txt export PYTHONPATH=${PYTHONPATH}:${BASE_DIR} source /etc/environment /usr/local/bin/python3.10 service/runner.py +export PYWIKIBOT_DIR=/home/pi/.pywikibot_protect/ +/usr/local/bin/python3.10 service/protect.py diff --git a/service/test_protect.py b/service/test_protect.py new file mode 100644 index 00000000..6d2727a2 --- /dev/null +++ b/service/test_protect.py @@ -0,0 +1,13 @@ +from unittest import TestCase, mock + + +class TestProtect(TestCase): + def setUp(self): + self.petscan_patcher = mock.patch("service.protect.PetScan") + self.petscan_mock = self.petscan_patcher.start() + self.run_mock = mock.Mock() + self.petscan_mock.return_value = mock.Mock(run=self.run_mock) + self.addCleanup(mock.patch.stopall) + + def tearDown(self): + mock.patch.stopall() diff --git a/service/ws_re/scanner/base.py b/service/ws_re/scanner/base.py index 2da9ccf6..f94db101 100644 --- a/service/ws_re/scanner/base.py +++ b/service/ws_re/scanner/base.py @@ -1,7 +1,6 @@ import traceback from contextlib import suppress from datetime import timedelta, datetime -from operator import itemgetter from typing import List, Optional, Dict, Callable import pywikibot @@ -17,10 +16,9 @@ from service.ws_re.scanner.tasks.wikidata.task import DATATask from service.ws_re.template import ReDatenException from service.ws_re.template.re_page import RePage -from tools._typing import PetscanLemma from tools.bots import BotException from tools.bots.pi import CanonicalBot -from tools.petscan import PetScan, PetScanException +from tools.petscan import PetScan class ReScanner(CanonicalBot): @@ -43,41 +41,6 @@ def __enter__(self): self.logger.warning("There isn't deprecated data to reload.") return self - def compile_lemma_list(self) -> List[str]: - self.logger.info("Compile the lemma list") - self.logger.info("Searching for lemmas") - raw_lemma_list = self._petscan_search() - self.statistic["len_raw_lemma_list"] = len(raw_lemma_list) - self.logger.info("Filter new_lemma_list") - # all items which wasn't process before - new_lemma_list = [] - for lemma in raw_lemma_list: - try: - self.data[lemma] - except KeyError: - new_lemma_list.append(lemma) - self.statistic["len_new_lemma_list"] = len(new_lemma_list) - self.logger.info("Sort old_lemma_list") - # before processed lemmas ordered by last process time - old_lemma_list = [x[0] for x in sorted(self.data.items(), key=itemgetter(1))] - # first iterate new items then the old ones (oldest first) - self.logger.info("Add the two lists") - self.statistic["len_old_lemma_list"] = len(old_lemma_list) - self.logger.info(f"raw: {self.statistic['len_raw_lemma_list']}, " - f"new: {self.statistic['len_new_lemma_list']}, " - f"old: {self.statistic['len_old_lemma_list']}") - return new_lemma_list + old_lemma_list - - def _petscan_search(self) -> List[str]: - searcher = self._prepare_searcher() - self.logger.info(f"[{searcher} {searcher}]") - raw_lemma_list: List[PetscanLemma] = [] - try: - raw_lemma_list = searcher.run() - except PetScanException: - self.logger.error("Search timed out.") - return [item["nstext"] + ":" + item["title"] for item in raw_lemma_list] - def _prepare_searcher(self) -> PetScan: searcher = PetScan() searcher.add_yes_template("REDaten") @@ -95,6 +58,11 @@ def _prepare_searcher(self) -> PetScan: searcher.set_timeout(120) return searcher + @property + def lemma_list(self) -> list[str]: + searcher = self._prepare_searcher() + return searcher.get_combined_lemma_list(self.data) + def _activate_tasks(self) -> List[ReScannerTask]: active_tasks = [] for task in self.tasks: @@ -135,10 +103,9 @@ def get_oldest_datetime(self) -> datetime: def task(self) -> bool: active_tasks = self._activate_tasks() error_task = ERROTask(wiki=self.wiki, debug=self.debug, logger=self.logger) - lemma_list = self.compile_lemma_list() self.logger.info("Start processing the lemmas.") processed_lemmas = 0 - for idx, lemma in enumerate(lemma_list): + for idx, lemma in enumerate(self.lemma_list): self.logger.debug(f"Process [https://de.wikisource.org/wiki/{lemma} {lemma}]") list_of_done_tasks = [] try: diff --git a/service/ws_re/scanner/test.py b/service/ws_re/scanner/test.py index 1d09630d..c508aee5 100644 --- a/service/ws_re/scanner/test.py +++ b/service/ws_re/scanner/test.py @@ -11,14 +11,13 @@ from service.ws_re.scanner.tasks.base_task import ReScannerTask from service.ws_re.template import ReDatenException from tools.bots.test_pi import setup_data_path, teardown_data_path, _DATA_PATH_TEST +from tools.test import SearchStringChecker class TestReScanner(TestCase): def setUp(self): - self.petscan_patcher = mock.patch("service.ws_re.scanner.base.PetScan") + self.petscan_patcher = mock.patch("service.ws_re.scanner.base.PetScan.get_combined_lemma_list") self.petscan_mock = self.petscan_patcher.start() - self.run_mock = mock.Mock() - self.petscan_mock.return_value = mock.Mock(run=self.run_mock) setup_data_path(self) self.addCleanup(mock.patch.stopall) @@ -26,22 +25,10 @@ def tearDown(self): teardown_data_path() mock.patch.stopall() - class SearchStringChecker: - def __init__(self, search_string: str): - self.search_string = search_string - - def is_part_of_searchstring(self, part: str): - pre_length = len(self.search_string) - self.search_string = "".join(self.search_string.split(part)) - return pre_length != len(self.search_string) - - def is_empty(self): - return len(self.search_string) == 0 - def test_search_prepare_debug(self): mock.patch.stopall() with ReScanner(log_to_screen=False, log_to_wiki=False) as bot: - checker = self.SearchStringChecker(str(bot._prepare_searcher())) + checker = SearchStringChecker(str(bot._prepare_searcher())) self.assertTrue(checker.is_part_of_searchstring( r"https://petscan.wmflabs.org/?language=de&project=wikisource")) self.assertTrue(checker.is_part_of_searchstring("&templates_yes=REDaten")) @@ -51,7 +38,7 @@ def test_search_prepare_debug(self): def test_search_prepare(self): mock.patch.stopall() with ReScanner(log_to_screen=False, log_to_wiki=False, debug=False) as bot: - checker = self.SearchStringChecker(str(bot._prepare_searcher())) + checker = SearchStringChecker(str(bot._prepare_searcher())) self.assertTrue(checker.is_part_of_searchstring( "https://petscan.wmflabs.org/?language=de&project=wikisource")) self.assertTrue(checker.is_part_of_searchstring( @@ -63,30 +50,6 @@ def test_search_prepare(self): self.assertTrue(checker.is_part_of_searchstring("&sortorder=descending")) self.assertTrue(checker.is_empty()) - result_of_searcher = [{"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', - "title": "RE:Lemma1", "touched": "20010101232359"}, - {"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', - "title": "RE:Lemma2", "touched": "20000101232359"}, - {"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', - "title": "RE:Lemma3", "touched": "19990101232359"} - ] - - def test_compile_lemmas_no_old_lemmas(self): - self.run_mock.return_value = self.result_of_searcher - with ReScanner(log_to_screen=False, log_to_wiki=False) as bot: - self.assertEqual([":RE:Lemma1", ":RE:Lemma2", ":RE:Lemma3"], bot.compile_lemma_list()) - - def test_compile_lemmas_old_lemmas(self): - self.run_mock.return_value = self.result_of_searcher - with ReScanner(log_to_screen=False, log_to_wiki=False) as bot: - with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101232359"}): - self.assertEqual([":RE:Lemma2", ":RE:Lemma3", ":RE:Lemma1"], - bot.compile_lemma_list()) - with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101232359", - ":RE:Lemma3": "20020101232359"}): - self.assertEqual([":RE:Lemma2", ":RE:Lemma1", ":RE:Lemma3"], - bot.compile_lemma_list()) - def test_get_oldest_processed(self): with ReScanner(log_to_screen=False, log_to_wiki=False) as bot: with mock.patch.dict(bot.data, {":RE:Lemma1": "20010101000000", @@ -113,8 +76,8 @@ def test_activate_tasks(self): def _mock_surroundings(self): # pylint: disable=attribute-defined-outside-init - lemma_patcher = mock.patch("service.ws_re.scanner.base.ReScanner.compile_lemma_list", - mock.Mock()) + lemma_patcher = mock.patch("service.ws_re.scanner.base.ReScanner.lemma_list", + mock.PropertyMock()) page_patcher = mock.patch("service.ws_re.scanner.base.pywikibot.Page") page_patcher_error = mock.patch("service.ws_re.scanner.tasks.base_task.pywikibot.Page") re_page_patcher = mock.patch("service.ws_re.scanner.base.RePage") diff --git a/tools/bots/pi.py b/tools/bots/pi.py index 9429346a..a3b28798 100644 --- a/tools/bots/pi.py +++ b/tools/bots/pi.py @@ -8,7 +8,7 @@ from typing import Dict, Any, Iterator, List from typing import TypedDict # pylint: disable=no-name-in-module -from pywikibot import Page, Site, Category +from pywikibot import Page, Site, Category, BaseSite from pywikibot.pagegenerators import CategorizedPageGenerator from tools.bots import BotException @@ -196,7 +196,7 @@ def __init__(self, wiki: Site = None, debug: bool = True, " def task(self):\n" " do_stuff()") self.timestamp: PersistedTimestamp = PersistedTimestamp(bot_name=self.bot_name) - self.wiki: Page = wiki + self.wiki: BaseSite = wiki self.debug: bool = debug self.timeout: timedelta = timedelta(days=1) self.logger: WikiLogger = WikiLogger(self.bot_name, diff --git a/tools/petscan.py b/tools/petscan.py index 2d0004a6..7e28355c 100644 --- a/tools/petscan.py +++ b/tools/petscan.py @@ -1,7 +1,8 @@ # pylint: disable=ungrouped-imports import json from datetime import datetime -from typing import List, Union +from operator import itemgetter +from typing import List, Union, Mapping from urllib.parse import quote import requests @@ -242,3 +243,23 @@ def run(self) -> List[PetscanLemma]: response_byte = response.content response_dict = json.loads(response_byte.decode("utf8")) return response_dict["*"][0]["a"]["*"] # type: ignore + + def get_combined_lemma_list(self, old_lemmas: Mapping) -> list[str]: + """ + Executes the search. Filters out all preprocessed lemmas from a provided dictionary. + Interlaces this two lists to a combined list sorted by: + * every new lemma + * old lemmas sorted by dictionary value (probably a timestamp) + """ + raw_lemma_list = [item["nstext"] + ":" + item["title"] for item in self.run()] + # all items which wasn't process before + new_lemma_list = [] + for lemma in raw_lemma_list: + try: + old_lemmas[lemma] + except KeyError: + new_lemma_list.append(lemma) + # before processed lemmas ordered by last process time + old_lemma_list = [x[0] for x in sorted(old_lemmas.items(), key=itemgetter(1))] + # first iterate new items then the old ones (oldest first) + return new_lemma_list + old_lemma_list diff --git a/tools/test.py b/tools/test.py index 258ddc56..a9c0401d 100644 --- a/tools/test.py +++ b/tools/test.py @@ -8,3 +8,16 @@ def real_wiki_test(func): wrapper = skipUnless(REAL_WIKI_TEST, "only execute in test against real wiki")(func) return wrapper + + +class SearchStringChecker: + def __init__(self, search_string: str): + self.search_string = search_string + + def is_part_of_searchstring(self, part: str): + pre_length = len(self.search_string) + self.search_string = "".join(self.search_string.split(part)) + return pre_length != len(self.search_string) + + def is_empty(self): + return len(self.search_string) == 0 diff --git a/tools/test_petscan.py b/tools/test_petscan.py index e537677d..f2353e74 100644 --- a/tools/test_petscan.py +++ b/tools/test_petscan.py @@ -1,16 +1,20 @@ # pylint: disable=protected-access from datetime import datetime -from unittest import TestCase +from unittest import TestCase, mock import requests_mock +from testfixtures import compare from tools.petscan import PetScan, PetScanException -class TestCatScan(TestCase): +class TestPetScan(TestCase): def setUp(self): self.petscan = PetScan() + def tearDown(self): + mock.patch.stopall() + def test_add_options(self): self.petscan.add_options({"max_age": "45"}) self.petscan.add_options({"smaller": "300"}) @@ -173,24 +177,24 @@ def test_construct_string(self): "https://petscan.wmflabs.org/?language=en&project=wikipedia&max_age=10") def test_do_positive(self): - with requests_mock.mock() as mock: - mock.get("https://petscan.wmflabs.org/" - "?language=de&project=wikisource&format=json&doit=1", - text='{"n": "result","a": {"querytime_sec": 1.572163,' - '"query": "https://petscan.wmflabs.org/?language=de' - '&project=wikisource&categories=Autoren&get_q=1' - '&show_redirects=no&ns[0]=1&max_age=48' - '&format=json&doit=1"},' - '"*": [{"n": "combination",' - '"a": {"type": "subset",' - '"*": [{"id": 3279,' - '"len": 10197,' - '"n": "page",' - '"namespace": 0,' - '"nstext": "",' - '"q": "Q60644",' - '"title": "Friedrich_Rückert",' - '"touched": "20161024211701"}]}}]}') + with requests_mock.mock() as request_mock: + request_mock.get("https://petscan.wmflabs.org/" + "?language=de&project=wikisource&format=json&doit=1", + text='{"n": "result","a": {"querytime_sec": 1.572163,' + '"query": "https://petscan.wmflabs.org/?language=de' + '&project=wikisource&categories=Autoren&get_q=1' + '&show_redirects=no&ns[0]=1&max_age=48' + '&format=json&doit=1"},' + '"*": [{"n": "combination",' + '"a": {"type": "subset",' + '"*": [{"id": 3279,' + '"len": 10197,' + '"n": "page",' + '"namespace": 0,' + '"nstext": "",' + '"q": "Q60644",' + '"title": "Friedrich_Rückert",' + '"touched": "20161024211701"}]}}]}') self.assertEqual(self.petscan.run(), [{"id": 3279, "len": 10197, "n": "page", @@ -201,9 +205,34 @@ def test_do_positive(self): "touched": "20161024211701"}]) def test_do_negative(self): - with requests_mock.mock() as mock: - mock.get("https://petscan.wmflabs.org/" - "?language=de&project=wikisource&format=json&doit=1", - status_code=404) + with requests_mock.mock() as request_mock: + request_mock.get("https://petscan.wmflabs.org/" + "?language=de&project=wikisource&format=json&doit=1", + status_code=404) with self.assertRaises(PetScanException): self.petscan.run() + + result_of_searcher = [{"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', + "title": "RE:Lemma1", "touched": "20010101232359"}, + {"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', + "title": "RE:Lemma2", "touched": "20000101232359"}, + {"id": 42, "len": 42, "n": "page", "namespace": 0, "nstext": '', + "title": "RE:Lemma3", "touched": "19990101232359"} + ] + + def mock_searcher(self): + self.petscan_patcher = mock.patch("tools.petscan.PetScan.run") # pylint: disable=attribute-defined-outside-init) + self.petscan_mock = self.petscan_patcher.start() # pylint: disable=attribute-defined-outside-init) + self.addCleanup(mock.patch.stopall) + def test_get_combined_lemmas_no_old_lemmas(self): + self.mock_searcher() + self.petscan_mock.return_value = self.result_of_searcher + compare([":RE:Lemma1", ":RE:Lemma2", ":RE:Lemma3"], self.petscan.get_combined_lemma_list({})) + + def test_get_combined_lemmas_old_lemmas(self): + self.mock_searcher() + self.petscan_mock.return_value = self.result_of_searcher + compare([":RE:Lemma2", ":RE:Lemma3", ":RE:Lemma1"], + self.petscan.get_combined_lemma_list({":RE:Lemma1": "20010101232359"})) + compare([":RE:Lemma2", ":RE:Lemma1", ":RE:Lemma3"], + self.petscan.get_combined_lemma_list({":RE:Lemma1": "20010101232359", ":RE:Lemma3": "20020101232359"}))