Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: better cleanup #276

Merged
merged 1 commit into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 36 additions & 16 deletions src/lootscraper/scraper/scraper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from lootscraper.browser import get_new_page
from lootscraper.common import Category, OfferDuration, OfferType, Source
from lootscraper.config import Config
from lootscraper.utils import clean_game_title, clean_loot_title, clean_title
from lootscraper.utils import (
clean_combined_title,
clean_game_title,
clean_loot_title,
)

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
Expand Down Expand Up @@ -184,24 +188,40 @@ def clean_offers(self, offers: list[Offer]) -> list[Offer]:
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)
# TODO: Since the title and probable_game_name are recomputed from the
# rawtext here, the scrapers no longer need to set them themselves.

new_title = None
new_probable_name = None

if title_new != offer.title:
offer.title = title_new
if offer.type == OfferType.GAME:
new_title = clean_game_title(offer.rawtext["title"])
new_probable_name = new_title
elif offer.type == OfferType.LOOT:
try:
new_probable_name = clean_game_title(offer.rawtext["gametitle"])
new_title = (
new_probable_name
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
new_probable_name, new_title = clean_combined_title(
offer.rawtext["title"],
)

if new_title != offer.title:
logging.debug(
f"Cleaning up title. Old: {offer.title}, new: {new_title}.",
)
offer.title = new_title

if offer.probable_game_name is not None:
offer.probable_game_name = clean_game_title(
offer.probable_game_name,
if new_probable_name != offer.probable_game_name:
logging.debug(
"Cleaning up probable game name. "
f"Old: {offer.probable_game_name}, new: {new_probable_name}.",
)
offer.probable_game_name = new_probable_name

if offer.url is not None:
offer.url = offer.url.replace("\n", "").strip()
Expand Down
66 changes: 38 additions & 28 deletions src/lootscraper/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from sqlalchemy.orm import Session

from lootscraper.browser import get_browser_context
from lootscraper.common import Category, OfferDuration
from lootscraper.common import Category, OfferDuration, OfferType
from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo
from lootscraper.processing import add_game_info
from lootscraper.scraper.scraper_base import Scraper
from lootscraper.utils import (
clean_combined_title,
clean_game_title,
clean_loot_title,
clean_title,
)

logger = logging.getLogger(__name__)
Expand All @@ -31,21 +31,29 @@ async def refresh_all_games(session: Session, context: BrowserContext) -> None:
Drop all games from the database and re-add them, scraping all
information again.
"""
all_offers = session.query(Offer).all()
# Remove all games from the database. Skip this step if you don't want to
# re-scrape all games.
logger.info("Dropping all existing information from database")
for offer in all_offers:
offer.game_id = None
session.query(Game).delete()
session.query(SteamInfo).delete()
session.query(IgdbInfo).delete()
# Commit all changes at once to keep the database consistent.
session.commit()

all_offers = session.query(Offer).all()

log("Gathering new information")
offer: Offer
for offer in all_offers:
# Use this to skip offers that have already been processed (e.g. when
# the script crashed).
# if offer.id < 2678:
# continue
log(f"Adding game info for offer {offer.id}.")
await add_game_info(offer, session, context)

session.commit()
# Save after every offer to avoid losing progress.
session.commit()


def delete_invalid_offers(session: Session) -> None:
Expand Down Expand Up @@ -77,34 +85,36 @@ def fix_offer_titles(session: Session) -> None:
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)
new_title = None
new_probable_name = None

if offer.type == OfferType.GAME:
new_title = clean_game_title(offer.rawtext["title"])
new_probable_name = new_title
elif offer.type == OfferType.LOOT:
try:
new_probable_name = clean_game_title(offer.rawtext["gametitle"])
new_title = (
new_probable_name + " - " + clean_loot_title(offer.rawtext["title"])
)
except KeyError:
new_probable_name, new_title = clean_combined_title(
offer.rawtext["title"],
)

if title_new != offer.title:
if new_title != offer.title:
log(
f"Cleaning up title for offer {offer.id}. "
f"Old: {offer.title}, new: {title_new}.",
f"Old: {offer.title}, new: {new_title}.",
)
offer.title = title_new
offer.title = new_title

if offer.probable_game_name is not None:
new_name = clean_game_title(
offer.probable_game_name,
if new_probable_name != offer.probable_game_name:
log(
f"Cleaning up probable game name for offer {offer.id}. "
f"Old: {offer.probable_game_name}, new: {new_probable_name}.",
)
if new_name != offer.probable_game_name:
log(
f"Cleaning up probable game name for offer {offer.id}. "
f"Old: {offer.probable_game_name}, new: {new_name}.",
)
offer.probable_game_name = new_name
offer.probable_game_name = new_probable_name

session.commit()

Expand Down
16 changes: 0 additions & 16 deletions src/lootscraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from datetime import datetime, timedelta, timezone
from typing import Any

from lootscraper.common import OfferType

RESULT_MATCH_THRESHOLD = 0.85


Expand Down Expand Up @@ -66,20 +64,6 @@ def clean_nones(value: dict[str, Any]) -> dict[str, Any]:
return value


def clean_title(title: str, type_: OfferType) -> str:
    """Return the cleaned-up title of an offer, depending on its type.

    Game offers are passed through ``clean_game_title``. Loot offers are
    passed through ``clean_combined_title``, of which only the full offer
    title is kept.

    Raises:
        ValueError: If ``type_`` is neither ``OfferType.GAME`` nor
            ``OfferType.LOOT``.
    """
    if type_ == OfferType.GAME:
        cleaned = clean_game_title(title)
    elif type_ == OfferType.LOOT:
        # The element at index 1 of the combined result is the full offer
        # title.
        combined = clean_combined_title(title)
        cleaned = combined[1]
    else:
        raise ValueError(f"Unknown type {type_}")
    return cleaned


def clean_game_title(title: str) -> str:
return (
title.replace("\n", "")
Expand Down
Loading