From 65bb5a5191a1997574e01d0aee39851bc4cba72e Mon Sep 17 00:00:00 2001 From: Pieter Robberechts Date: Tue, 21 Jan 2025 17:20:17 +0100 Subject: [PATCH] fix(SoFIFA): update parsing of player profile page (#794) Fixes #793 Fixes #791 --- soccerdata/sofifa.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/soccerdata/sofifa.py b/soccerdata/sofifa.py index c5cd234..9325cee 100644 --- a/soccerdata/sofifa.py +++ b/soccerdata/sofifa.py @@ -10,7 +10,11 @@ import pandas as pd from lxml import html -from ._common import BaseRequestsReader, add_standardized_team_name, standardize_colnames +from ._common import ( + BaseRequestsReader, + add_standardized_team_name, + standardize_colnames, +) from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger SO_FIFA_DATADIR = DATA_DIR / "SoFIFA" @@ -108,7 +112,7 @@ def read_leagues(self) -> pd.DataFrame: leagues.append( { "league_id": child["id"], - "league": f'[{child["nationName"]}] {child["value"]}', + "league": f"[{child['nationName']}] {child['value']}", } ) return ( @@ -462,15 +466,18 @@ def read_player_ratings( # extract scores one-by-one tree = html.parse(reader, parser=html.HTMLParser(encoding="utf8")) + node_player_name = tree.xpath("//div[contains(@class, 'profile')]/h1")[0] + # Extract what is before <br>
+ before_br = node_player_name.xpath("string(./text()[1])").strip() + # Extract what is after <br>
+ after_br = node_player_name.xpath("string(./br/following-sibling::text()[1])").strip() scores = { - "player": tree.xpath("//div[contains(@class, 'profile')]/h1")[0].text.strip(), + "player": before_br if before_br else after_br, **version.to_dict(), } for s in score_labels: nodes = tree.xpath( - "(//li[not(self::script)] | //div | //p)" - f"[.//text()[contains(.,'{s}')]]" - "/em" + f"(//li[not(self::script)] | //div | //p)[.//text()[contains(.,'{s}')]]//em" ) # for multiple matches, only accept first match if len(nodes) >= 1: