fix(fbref): Avoid duplicate Big 5 leagues
When the FBref reader was initialized with leagues=["Big 5 European Leagues
Combined", ...] and ... contained other leagues that are part of the Big 5
(e.g., "ENG-Premier League"), those leagues would be scraped twice.

Fixes #576
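
For illustration, a minimal sketch of the scenario this fixes; the seasons argument and the printed result are assumptions based on the patched leagues property, not output copied from a run:

import soccerdata as sd

# Selecting the combined Big 5 competition together with one of its member
# leagues previously caused the Premier League to be scraped twice.
fbref = sd.FBref(
    leagues=["Big 5 European Leagues Combined", "ENG-Premier League"],
    seasons=["2023-24"],  # illustrative season
)

# With this fix, the individual Big 5 league is dropped from the selection.
print(fbref.leagues)  # expected: ['Big 5 European Leagues Combined']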
probberechts committed May 27, 2024
1 parent 0e56fff commit 0e5076d
Showing 1 changed file with 56 additions and 19 deletions.
75 changes: 56 additions & 19 deletions soccerdata/fbref.py
@@ -104,6 +104,10 @@ def __init__(
@property
def leagues(self) -> List[str]:
"""Return a list of selected leagues."""
if "Big 5 European Leagues Combined" in self._leagues_dict:
for _, standardized_name in BIG_FIVE_DICT.items():
if standardized_name in self._leagues_dict:
del self._leagues_dict[standardized_name]
return list(self._leagues_dict.keys())

@classmethod
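
The deduplication happens in the leagues property: when the combined competition is selected, every standardized Big 5 league name is removed from the internal selection before the keys are returned. Below is a standalone sketch of the same idea; the BIG_FIVE_DICT subset and the dictionary values are illustrative, the real mapping lives in soccerdata and covers all five leagues:

from typing import Dict, List

# Illustrative subset; the actual BIG_FIVE_DICT maps FBref's component
# league names to the standardized names used by the reader.
BIG_FIVE_DICT: Dict[str, str] = {
    "Premier League": "ENG-Premier League",
    "La Liga": "ESP-La Liga",
    "Serie A": "ITA-Serie A",
}

def selected_leagues(leagues_dict: Dict[str, str]) -> List[str]:
    """Drop individual Big 5 leagues when the combined competition is selected."""
    if "Big 5 European Leagues Combined" in leagues_dict:
        for standardized_name in BIG_FIVE_DICT.values():
            leagues_dict.pop(standardized_name, None)
    return list(leagues_dict.keys())

# The dict values stand in for FBref's own league names and are illustrative.
selection = {
    "Big 5 European Leagues Combined": "Big-5-European-Leagues-Combined",
    "ENG-Premier League": "Premier-League",
}
print(selected_leagues(selection))  # ['Big 5 European Leagues Combined']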
@@ -323,7 +327,7 @@ def read_team_season_stats( # noqa: C901

# return data frame
df = (
_concat(teams, key=['league', 'season'])
_concat(teams, key=["league", "season"])
.rename(columns={"Squad": "team", "# Pl": "players_used"})
.replace({"team": TEAMNAME_REPLACEMENTS})
# .pipe(standardize_colnames)
@@ -426,7 +430,7 @@ def read_team_match_stats( # noqa: C901
for (lkey, skey, team), team_url in iterator.url.items():
# read html page
filepath = self.data_dir / filemask.format(team, skey, stat_type)
if len(team_url.split('/')) == 6: # already have season in the url
if len(team_url.split("/")) == 6: # already have season in the url
url = (
FBREF_API
+ team_url.rsplit("/", 1)[0]
@@ -465,7 +469,7 @@ def read_team_match_stats( # noqa: C901
df_table["season"] = skey
df_table["team"] = team
df_table["Time"] = [
x.get('csk', None) for x in html_table.xpath(".//td[@data-stat='start_time']")
x.get("csk", None) for x in html_table.xpath(".//td[@data-stat='start_time']")
]
df_table["Match Report"] = [
(
@@ -482,7 +486,7 @@ def read_team_match_stats( # noqa: C901

# return data frame
df = (
_concat(stats, key=['league', 'season', 'team'])
_concat(stats, key=["league", "season", "team"])
.replace(
{
"Opponent": TEAMNAME_REPLACEMENTS,
@@ -664,7 +668,9 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
filepath_fixtures = self.data_dir / f"schedule_{lkey}_{skey}.html"
current_season = not self._is_complete(lkey, skey)
reader = self.get(
url_fixtures, filepath_fixtures, no_cache=current_season and not force_cache
url_fixtures,
filepath_fixtures,
no_cache=current_season and not force_cache,
)
tree = html.parse(reader)
html_table = tree.xpath("//table[contains(@id, 'sched')]")[0]
@@ -799,7 +805,10 @@ def read_player_match_stats(
url = urlmask.format(game["game_id"])
# get league and season
logger.info(
"[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
"[%s/%s] Retrieving game with id=%s",
i + 1,
len(iterator),
game["game_id"],
)
filepath = self.data_dir / filemask.format(game["game_id"])
reader = self.get(url, filepath)
@@ -832,7 +841,7 @@ def read_player_match_stats(
else:
logger.warning("No stats found for away team for game with id=%s", game["game_id"])

df = _concat(stats, key=['game'])
df = _concat(stats, key=["game"])
df = df[~df.Player.str.contains(r"^\d+\sPlayers$")]
df = (
df.rename(columns={"#": "jersey_number"})
@@ -844,7 +853,9 @@ def read_player_match_stats(
return df

def read_lineup(
self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
self,
match_id: Optional[Union[str, List[str]]] = None,
force_cache: bool = False,
) -> pd.DataFrame:
"""Retrieve lineups for the selected leagues and seasons.
@@ -887,7 +898,10 @@ def read_lineup(
url = urlmask.format(game["game_id"])
# get league and season
logger.info(
"[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
"[%s/%s] Retrieving game with id=%s",
i + 1,
len(iterator),
game["game_id"],
)
filepath = self.data_dir / filemask.format(game["game_id"])
reader = self.get(url, filepath)
@@ -914,7 +928,12 @@ def read_lineup(
)
df_stats_table = _parse_table(html_stats_table)
df_stats_table = df_stats_table.droplevel(0, axis=1)[["Player", "#", "Pos", "Min"]]
df_stats_table.columns = ["player", "jersey_number", "position", "minutes_played"]
df_stats_table.columns = [
"player",
"jersey_number",
"position",
"minutes_played",
]
df_stats_table["jersey_number"] = df_stats_table["jersey_number"].astype("Int64")
df_table["jersey_number"] = df_table["jersey_number"].astype("Int64")
df_table = pd.merge(
@@ -926,7 +945,9 @@ def read_lineup(
return df

def read_events(
self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
self,
match_id: Optional[Union[str, List[str]]] = None,
force_cache: bool = False,
) -> pd.DataFrame:
"""Retrieve match events for the selected seasons or selected matches.
@@ -973,7 +994,10 @@ def read_events(
url = urlmask.format(game["game_id"])
# get league and season
logger.info(
"[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
"[%s/%s] Retrieving game with id=%s",
i + 1,
len(iterator),
game["game_id"],
)
filepath = self.data_dir / filemask.format(game["game_id"])
reader = self.get(url, filepath)
@@ -1021,7 +1045,9 @@ def read_events(
return df

def read_shot_events(
self, match_id: Optional[Union[str, List[str]]] = None, force_cache: bool = False
self,
match_id: Optional[Union[str, List[str]]] = None,
force_cache: bool = False,
) -> pd.DataFrame:
"""Retrieve shooting data for the selected seasons or selected matches.
@@ -1068,7 +1094,10 @@ def read_shot_events(
url = urlmask.format(game["game_id"])
# get league and season
logger.info(
"[%s/%s] Retrieving game with id=%s", i + 1, len(iterator), game["game_id"]
"[%s/%s] Retrieving game with id=%s",
i + 1,
len(iterator),
game["game_id"],
)
filepath = self.data_dir / filemask.format(game["game_id"])
reader = self.get(url, filepath)
@@ -1087,12 +1116,20 @@ def read_shot_events(
return pd.DataFrame()

df = (
_concat(shots, key=['game'])
_concat(shots, key=["game"])
.rename(columns={"Squad": "team"})
.replace({"team": TEAMNAME_REPLACEMENTS})
.pipe(
standardize_colnames,
cols=["Outcome", "Minute", "Distance", "Player", "Body Part", "Notes", "Event"],
cols=[
"Outcome",
"Minute",
"Distance",
"Player",
"Body Part",
"Notes",
"Event",
],
)
.set_index(["league", "season", "game"])
.sort_index()
@@ -1178,7 +1215,7 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
if len(all_columns) and all_columns[0].shape[1] == 2:
for i, columns in enumerate(all_columns):
if not columns[1].equals(all_columns[0][1]):
res = all_columns[0].merge(columns, indicator=True, how='outer')
res = all_columns[0].merge(columns, indicator=True, how="outer")
warnings.warn(
(
"Different columns found for {first} and {cur}.\n\n"
@@ -1191,15 +1228,15 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
extra_cols=", ".join(
map(
str,
res.loc[res['_merge'] == "left_only", [0, 1]]
res.loc[res["_merge"] == "left_only", [0, 1]]
.to_records(index=False)
.tolist(),
)
),
missing_cols=", ".join(
map(
str,
res.loc[res['_merge'] == "right_only", [0, 1]]
res.loc[res["_merge"] == "right_only", [0, 1]]
.to_records(index=False)
.tolist(),
)
