diff --git a/src/lootscraper/scraper/scraper_base.py b/src/lootscraper/scraper/scraper_base.py index 0e212883..19593d9e 100644 --- a/src/lootscraper/scraper/scraper_base.py +++ b/src/lootscraper/scraper/scraper_base.py @@ -13,7 +13,11 @@ from lootscraper.browser import get_new_page from lootscraper.common import Category, OfferDuration, OfferType, Source from lootscraper.config import Config -from lootscraper.utils import clean_game_title, clean_loot_title, clean_title +from lootscraper.utils import ( + clean_combined_title, + clean_game_title, + clean_loot_title, +) if TYPE_CHECKING: from collections.abc import Awaitable, Callable @@ -184,24 +188,40 @@ def clean_offers(self, offers: list[Offer]) -> list[Offer]: if offer.rawtext is None: continue - try: - raw_title = offer.rawtext["gametitle"] - title_new = ( - clean_game_title(raw_title) - + " - " - + clean_loot_title(offer.rawtext["title"]) - ) - except KeyError: - raw_title = offer.rawtext["title"] - title_new = clean_title(raw_title, offer.type) + # TODO: Since it's refreshed from the rawtext, there is no need to + # set the title and probable_game_name in the scrapers anymore. + + new_title = None + new_probable_name = None - if title_new != offer.title: - offer.title = title_new + if offer.type == OfferType.GAME: + new_title = clean_game_title(offer.rawtext["title"]) + new_probable_name = new_title + elif offer.type == OfferType.LOOT: + try: + new_probable_name = clean_game_title(offer.rawtext["gametitle"]) + new_title = ( + new_probable_name + + " - " + + clean_loot_title(offer.rawtext["title"]) + ) + except KeyError: + new_probable_name, new_title = clean_combined_title( + offer.rawtext["title"], + ) + + if new_title != offer.title: + logging.debug( + f"Cleaning up title. Old: {offer.title}, new: {new_title}.", + ) + offer.title = new_title - if offer.probable_game_name is not None: - offer.probable_game_name = clean_game_title( - offer.probable_game_name, + if new_probable_name != offer.probable_game_name: + logging.debug( + "Cleaning up probable game name. " + f"Old: {offer.probable_game_name}, new: {new_probable_name}.", ) + offer.probable_game_name = new_probable_name if offer.url is not None: offer.url = offer.url.replace("\n", "").strip() diff --git a/src/lootscraper/tools.py b/src/lootscraper/tools.py index 2b5b32a7..52ad9838 100644 --- a/src/lootscraper/tools.py +++ b/src/lootscraper/tools.py @@ -7,14 +7,14 @@ from sqlalchemy.orm import Session from lootscraper.browser import get_browser_context -from lootscraper.common import Category, OfferDuration +from lootscraper.common import Category, OfferDuration, OfferType from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo from lootscraper.processing import add_game_info from lootscraper.scraper.scraper_base import Scraper from lootscraper.utils import ( + clean_combined_title, clean_game_title, clean_loot_title, - clean_title, ) logger = logging.getLogger(__name__) @@ -31,21 +31,29 @@ async def refresh_all_games(session: Session, context: BrowserContext) -> None: Drop all games from the database and re-add them, scraping all information again. """ + all_offers = session.query(Offer).all() + # Remove all games from the database. Skip this step if you don't want to + # re-scrape all games. logger.info("Dropping all existing information from database") + for offer in all_offers: + offer.game_id = None session.query(Game).delete() session.query(SteamInfo).delete() session.query(IgdbInfo).delete() + # Commit all changes at once to keep the database consistent. session.commit() - all_offers = session.query(Offer).all() - log("Gathering new information") offer: Offer for offer in all_offers: + # Use this to skip offers that have already been processed (e.g. when + # the script crashed). + # if offer.id < 2678: + # continue log(f"Adding game info for offer {offer.id}.") await add_game_info(offer, session, context) - - session.commit() + # Save after every offer to avoid losing progress. + session.commit() def delete_invalid_offers(session: Session) -> None: @@ -77,34 +85,36 @@ def fix_offer_titles(session: Session) -> None: if offer.rawtext is None: continue - try: - raw_title = offer.rawtext["gametitle"] - title_new = ( - clean_game_title(raw_title) - + " - " - + clean_loot_title(offer.rawtext["title"]) - ) - except KeyError: - raw_title = offer.rawtext["title"] - title_new = clean_title(raw_title, offer.type) + new_title = None + new_probable_name = None + + if offer.type == OfferType.GAME: + new_title = clean_game_title(offer.rawtext["title"]) + new_probable_name = new_title + elif offer.type == OfferType.LOOT: + try: + new_probable_name = clean_game_title(offer.rawtext["gametitle"]) + new_title = ( + new_probable_name + " - " + clean_loot_title(offer.rawtext["title"]) + ) + except KeyError: + new_probable_name, new_title = clean_combined_title( + offer.rawtext["title"], + ) - if title_new != offer.title: + if new_title != offer.title: log( f"Cleaning up title for offer {offer.id}. " - f"Old: {offer.title}, new: {title_new}.", + f"Old: {offer.title}, new: {new_title}.", ) - offer.title = title_new + offer.title = new_title - if offer.probable_game_name is not None: - new_name = clean_game_title( - offer.probable_game_name, + if new_probable_name != offer.probable_game_name: + log( + f"Cleaning up probable game name for offer {offer.id}. " + f"Old: {offer.probable_game_name}, new: {new_probable_name}.", ) - if new_name != offer.probable_game_name: - log( - f"Cleaning up probable game name for offer {offer.id}. " - f"Old: {offer.probable_game_name}, new: {new_name}.", - ) - offer.probable_game_name = new_name + offer.probable_game_name = new_probable_name session.commit() diff --git a/src/lootscraper/utils.py b/src/lootscraper/utils.py index c42feb89..3db48a08 100644 --- a/src/lootscraper/utils.py +++ b/src/lootscraper/utils.py @@ -3,8 +3,6 @@ from datetime import datetime, timedelta, timezone from typing import Any -from lootscraper.common import OfferType - RESULT_MATCH_THRESHOLD = 0.85 @@ -66,20 +64,6 @@ def clean_nones(value: dict[str, Any]) -> dict[str, Any]: return value -def clean_title(title: str, type_: OfferType) -> str: - """Cleans the title of an offer. This is different for games and loot. - For games, we remove some common parts of the title that are not needed. - """ - if type_ == OfferType.GAME: - return clean_game_title(title) - - if type_ == OfferType.LOOT: - # The second element is the full offer title - return clean_combined_title(title)[1] - - raise ValueError(f"Unknown type {type_}") - - def clean_game_title(title: str) -> str: return ( title.replace("\n", "")