Skip to content

Commit

Permalink
fix: better cleanup
Browse files Browse the repository at this point in the history
Signed-off-by: Eiko Wagenknecht <[email protected]>
  • Loading branch information
eikowagenknecht committed Oct 25, 2023
1 parent 07b649c commit f2f9fe9
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 60 deletions.
52 changes: 36 additions & 16 deletions src/lootscraper/scraper/scraper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from lootscraper.browser import get_new_page
from lootscraper.common import Category, OfferDuration, OfferType, Source
from lootscraper.config import Config
from lootscraper.utils import clean_game_title, clean_loot_title, clean_title
from lootscraper.utils import (
clean_combined_title,
clean_game_title,
clean_loot_title,
)

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
Expand Down Expand Up @@ -184,24 +188,40 @@ def clean_offers(self, offers: list[Offer]) -> list[Offer]:
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)
# TODO: The title and probable_game_name are refreshed from the rawtext
# here, so the scrapers no longer need to set them.

new_title = None
new_probable_name = None

if title_new != offer.title:
offer.title = title_new
if offer.type == OfferType.GAME:
new_title = clean_game_title(offer.rawtext["title"])
new_probable_name = new_title
elif offer.type == OfferType.LOOT:
try:
new_probable_name = clean_game_title(offer.rawtext["gametitle"])
new_title = (
new_probable_name
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
new_probable_name, new_title = clean_combined_title(
offer.rawtext["title"],
)

if new_title != offer.title:
logging.debug(
f"Cleaning up title. Old: {offer.title}, new: {new_title}.",
)
offer.title = new_title

if offer.probable_game_name is not None:
offer.probable_game_name = clean_game_title(
offer.probable_game_name,
if new_probable_name != offer.probable_game_name:
logging.debug(
"Cleaning up probable game name. "
f"Old: {offer.probable_game_name}, new: {new_probable_name}.",
)
offer.probable_game_name = new_probable_name

if offer.url is not None:
offer.url = offer.url.replace("\n", "").strip()
Expand Down
66 changes: 38 additions & 28 deletions src/lootscraper/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from sqlalchemy.orm import Session

from lootscraper.browser import get_browser_context
from lootscraper.common import Category, OfferDuration
from lootscraper.common import Category, OfferDuration, OfferType
from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo
from lootscraper.processing import add_game_info
from lootscraper.scraper.scraper_base import Scraper
from lootscraper.utils import (
clean_combined_title,
clean_game_title,
clean_loot_title,
clean_title,
)

logger = logging.getLogger(__name__)
Expand All @@ -31,21 +31,29 @@ async def refresh_all_games(session: Session, context: BrowserContext) -> None:
Drop all games from the database and re-add them, scraping all
information again.
"""
all_offers = session.query(Offer).all()
# Remove all games from the database. Skip this step if you don't want to
# re-scrape all games.
logger.info("Dropping all existing information from database")
for offer in all_offers:
offer.game_id = None
session.query(Game).delete()
session.query(SteamInfo).delete()
session.query(IgdbInfo).delete()
# Commit all changes at once to keep the database consistent.
session.commit()

all_offers = session.query(Offer).all()

log("Gathering new information")
offer: Offer
for offer in all_offers:
# Use this to skip offers that have already been processed (e.g. when
# the script crashed).
# if offer.id < 2678:
# continue
log(f"Adding game info for offer {offer.id}.")
await add_game_info(offer, session, context)

session.commit()
# Save after every offer to avoid losing progress.
session.commit()


def delete_invalid_offers(session: Session) -> None:
Expand Down Expand Up @@ -77,34 +85,36 @@ def fix_offer_titles(session: Session) -> None:
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)
new_title = None
new_probable_name = None

if offer.type == OfferType.GAME:
new_title = clean_game_title(offer.rawtext["title"])
new_probable_name = new_title
elif offer.type == OfferType.LOOT:
try:
new_probable_name = clean_game_title(offer.rawtext["gametitle"])
new_title = (
new_probable_name + " - " + clean_loot_title(offer.rawtext["title"])
)
except KeyError:
new_probable_name, new_title = clean_combined_title(
offer.rawtext["title"],
)

if title_new != offer.title:
if new_title != offer.title:
log(
f"Cleaning up title for offer {offer.id}. "
f"Old: {offer.title}, new: {title_new}.",
f"Old: {offer.title}, new: {new_title}.",
)
offer.title = title_new
offer.title = new_title

if offer.probable_game_name is not None:
new_name = clean_game_title(
offer.probable_game_name,
if new_probable_name != offer.probable_game_name:
log(
f"Cleaning up probable game name for offer {offer.id}. "
f"Old: {offer.probable_game_name}, new: {new_probable_name}.",
)
if new_name != offer.probable_game_name:
log(
f"Cleaning up probable game name for offer {offer.id}. "
f"Old: {offer.probable_game_name}, new: {new_name}.",
)
offer.probable_game_name = new_name
offer.probable_game_name = new_probable_name

session.commit()

Expand Down
16 changes: 0 additions & 16 deletions src/lootscraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from datetime import datetime, timedelta, timezone
from typing import Any

from lootscraper.common import OfferType

RESULT_MATCH_THRESHOLD = 0.85


Expand Down Expand Up @@ -66,20 +64,6 @@ def clean_nones(value: dict[str, Any]) -> dict[str, Any]:
return value


def clean_title(title: str, type_: OfferType) -> str:
    """Return the cleaned title of an offer, depending on the offer type.

    Game offers are passed through ``clean_game_title``. Loot offers are
    passed through ``clean_combined_title`` and the second element of its
    result is returned.

    Raises:
        ValueError: If ``type_`` is neither ``OfferType.GAME`` nor
            ``OfferType.LOOT``.
    """
    if type_ == OfferType.GAME:
        cleaned = clean_game_title(title)
    elif type_ == OfferType.LOOT:
        # The second element is the full offer title
        combined = clean_combined_title(title)
        cleaned = combined[1]
    else:
        raise ValueError(f"Unknown type {type_}")

    return cleaned


def clean_game_title(title: str) -> str:
return (
title.replace("\n", "")
Expand Down

0 comments on commit f2f9fe9

Please sign in to comment.