From 80c316ba20fbbc0115329c97fafdefd15214fa61 Mon Sep 17 00:00:00 2001 From: Pieter Robberechts Date: Mon, 27 May 2024 17:11:18 +0200 Subject: [PATCH] [WhoScored] Update read_schedule method (#582) WhoScored has updated the HTML structure of their fixture pages. Fixes #581 --- docs/datasources/WhoScored.ipynb | 935 +++++++++++++++++++++++++------ soccerdata/_common.py | 2 + soccerdata/whoscored.py | 353 +++++++----- 3 files changed, 969 insertions(+), 321 deletions(-) diff --git a/docs/datasources/WhoScored.ipynb b/docs/datasources/WhoScored.ipynb index e19d0e25..2a44e585 100644 --- a/docs/datasources/WhoScored.ipynb +++ b/docs/datasources/WhoScored.ipynb @@ -7,7 +7,22 @@ "metadata": { "nbsphinx": "hidden" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_413602/2059154722.py:1: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], "source": [ "import pandas as pd\n", "pd.set_option('display.max_columns', None)" @@ -65,8 +80,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/soccerdata/_common.py:462: UserWarning: Season id \"2021\" is ambiguous: interpreting as \"20-21\"\n", - " warnings.warn(msg)\n" + "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/soccerdata/_common.py:493: UserWarning: Season id \"2021\" is ambiguous: interpreting as \"20-21\"\n", + " warnings.warn(msg, stacklevel=1)\n" ] }, { @@ -112,7 +127,7 @@ " Path to the Chrome executable.\n", " headless : bool, default: True\n", " If True, will run Chrome in headless mode. Setting this to False might\n", - " help to avoid getting blocked.\n", + " help to avoid getting blocked. Only supported for Selenium <4.13.\n", " \n" ] } @@ -161,11 +176,48 @@ " \n", " \n", " \n", + " stage_id\n", " game_id\n", + " status\n", + " start_time\n", + " home_team_id\n", " home_team\n", + " home_yellow_cards\n", + " home_red_cards\n", + " away_team_id\n", " away_team\n", + " away_yellow_cards\n", + " away_red_cards\n", + " has_incidents_summary\n", + " has_preview\n", + " score_changed_at\n", + " elapsed\n", + " last_scorer\n", + " is_top_match\n", + " home_team_country_code\n", + " away_team_country_code\n", + " comment_count\n", + " is_lineup_confirmed\n", + " is_stream_available\n", + " match_is_opta\n", + " home_team_country_name\n", + " away_team_country_name\n", " date\n", - " url\n", + " home_score\n", + " away_score\n", + " incidents\n", + " bets\n", + " aggregate_winner_field\n", + " winner_field\n", + " period\n", + " extra_result_field\n", + " home_extratime_score\n", + " away_extratime_score\n", + " home_penalty_score\n", + " away_penalty_score\n", + " started_at_utc\n", + " first_half_ended_at_utc\n", + " second_half_started_at_utc\n", " stage\n", " \n", " \n", @@ -178,6 +230,43 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -185,101 +274,582 @@ " ENG-Premier League\n", " 2021\n", " 2020-09-12 Crystal Palace-Southampton\n", + " 18685\n", " 1485186\n", + " 6\n", + " 2020-09-12T15:00:00\n", + " 162\n", " Crystal Palace\n", + " 2\n", + " 0\n", + " 18\n", " Southampton\n", - " 2020-09-12 15:00:00\n", - " https://www.whoscored.com/Matches/1485186/Live...\n", - " NaN\n", + " 1\n", + " 0\n", + " True\n", + " True\n", + " 2020-09-12 15:14:31Z\n", + " FT\n", + " 0.0\n", + " False\n", + " gb-eng\n", + " gb-eng\n", + " 6\n", + " True\n", + " False\n", + " False\n", + " England\n", + " England\n", + " 2020-09-12 14:00:00+00:00\n", + " 1\n", + " 0\n", + " [{'minute': '13', 'type': 1, 'subType': 1, 'pl...\n", + " None\n", + " None\n", + " 0.0\n", + " 7\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " 2020-09-12T14:01:42Z\n", + " None\n", + " 2020-09-12T15:04:01Z\n", + " None\n", " \n", " \n", " 2020-09-12 Fulham-Arsenal\n", + " 18685\n", " 1485187\n", + " 6\n", + " 2020-09-12T12:30:00\n", + " 170\n", " Fulham\n", + " 2\n", + " 0\n", + " 13\n", " Arsenal\n", - " 2020-09-12 12:30:00\n", - " https://www.whoscored.com/Matches/1485187/Live...\n", - " NaN\n", + " 2\n", + " 0\n", + " True\n", + " True\n", + " 2020-09-12 13:48:13Z\n", + " FT\n", + " 1.0\n", + " True\n", + " gb-eng\n", + " gb-eng\n", + " 15\n", + " True\n", + " False\n", + " False\n", + " England\n", + " England\n", + " 2020-09-12 11:30:00+00:00\n", + " 0\n", + " 3\n", + " [{'minute': '8', 'type': 1, 'subType': 1, 'pla...\n", + " None\n", + " None\n", + " 1.0\n", + " 7\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " 2020-09-12T11:32:23Z\n", + " None\n", + " 2020-09-12T12:35:50Z\n", + " None\n", " \n", " \n", " 2020-09-12 Liverpool-Leeds United\n", + " 18685\n", " 1485188\n", + " 6\n", + " 2020-09-12T17:30:00\n", + " 26\n", " Liverpool\n", + " 1\n", + " 0\n", + " 19\n", " Leeds United\n", - " 2020-09-12 17:30:00\n", - " https://www.whoscored.com/Matches/1485188/Live...\n", - " NaN\n", + " 0\n", + " 0\n", + " True\n", + " True\n", + " 2020-09-12 19:15:39Z\n", + " FT\n", + " 0.0\n", + " True\n", + " gb-eng\n", + " gb-eng\n", + " 61\n", + " True\n", + " False\n", + " False\n", + " England\n", + " England\n", + " 2020-09-12 16:30:00+00:00\n", + " 4\n", + " 3\n", + " [{'minute': '4', 'type': 1, 'subType': 2, 'pla...\n", + " None\n", + " None\n", + " 0.0\n", + " 7\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " 2020-09-12T16:30:21Z\n", + " None\n", + " 2020-09-12T17:32:57Z\n", + " None\n", " \n", " \n", - " 2020-09-12 West Ham United-Newcastle United\n", + " 2020-09-12 West Ham United-Newcastle\n", + " 18685\n", " 1485191\n", + " 6\n", + " 2020-09-12T20:00:00\n", + " 29\n", " West Ham United\n", - " Newcastle United\n", - " 2020-09-12 20:00:00\n", - " https://www.whoscored.com/Matches/1485191/Live...\n", - " NaN\n", + " 2\n", + " 0\n", + " 23\n", + " Newcastle\n", + " 2\n", + " 0\n", + " True\n", + " True\n", + " 2020-09-12 21:45:39Z\n", + " FT\n", + " 1.0\n", + " False\n", + " gb-eng\n", + " gb-eng\n", + " 10\n", + " True\n", + " False\n", + " False\n", + " England\n", + " England\n", + " 2020-09-12 19:00:00+00:00\n", + " 0\n", + " 2\n", + " [{'minute': '56', 'type': 1, 'subType': 1, 'pl...\n", + " None\n", + " None\n", + " 1.0\n", + " 7\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " 2020-09-12T19:00:32Z\n", + " None\n", + " 2020-09-12T20:03:20Z\n", + " None\n", " \n", " \n", " 2020-09-13 Tottenham-Everton\n", + " 18685\n", " 1485189\n", + " 6\n", + " 2020-09-13T16:30:00\n", + " 30\n", " Tottenham\n", + " 1\n", + " 0\n", + " 31\n", " Everton\n", - " 2020-09-13 16:30:00\n", - " https://www.whoscored.com/Matches/1485189/Live...\n", - " NaN\n", + " 0\n", + " 0\n", + " True\n", + " True\n", + " 2020-09-13 17:41:16Z\n", + " FT\n", + " 1.0\n", + " True\n", + " gb-eng\n", + " gb-eng\n", + " 32\n", + " True\n", + " False\n", + " False\n", + " England\n", + " England\n", + " 2020-09-13 15:30:00+00:00\n", + " 0\n", + " 1\n", + " [{'minute': '55', 'type': 1, 'subType': 1, 'pl...\n", + " None\n", + " None\n", + " 1.0\n", + " 7\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " 2020-09-13T15:30:20Z\n", + " None\n", + " 2020-09-13T16:31:33Z\n", + " None\n", " \n", " \n", "\n", "" ], "text/plain": [ - " game_id \\\n", + " stage_id \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 18685 \n", + " 2020-09-12 Fulham-Arsenal 18685 \n", + " 2020-09-12 Liverpool-Leeds United 18685 \n", + " 2020-09-12 West Ham United-Newcastle 18685 \n", + " 2020-09-13 Tottenham-Everton 18685 \n", + "\n", + " game_id \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 1485186 \n", + " 2020-09-12 Fulham-Arsenal 1485187 \n", + " 2020-09-12 Liverpool-Leeds United 1485188 \n", + " 2020-09-12 West Ham United-Newcastle 1485191 \n", + " 2020-09-13 Tottenham-Everton 1485189 \n", + "\n", + " status \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 6 \n", + " 2020-09-12 Fulham-Arsenal 6 \n", + " 2020-09-12 Liverpool-Leeds United 6 \n", + " 2020-09-12 West Ham United-Newcastle 6 \n", + " 2020-09-13 Tottenham-Everton 6 \n", + "\n", + " start_time \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12T15:00:00 \n", + " 2020-09-12 Fulham-Arsenal 2020-09-12T12:30:00 \n", + " 2020-09-12 Liverpool-Leeds United 2020-09-12T17:30:00 \n", + " 2020-09-12 West Ham United-Newcastle 2020-09-12T20:00:00 \n", + " 2020-09-13 Tottenham-Everton 2020-09-13T16:30:00 \n", + "\n", + " home_team_id \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 162 \n", + " 2020-09-12 Fulham-Arsenal 170 \n", + " 2020-09-12 Liverpool-Leeds United 26 \n", + " 2020-09-12 West Ham United-Newcastle 29 \n", + " 2020-09-13 Tottenham-Everton 30 \n", + "\n", + " home_team \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton Crystal Palace \n", + " 2020-09-12 Fulham-Arsenal Fulham \n", + " 2020-09-12 Liverpool-Leeds United Liverpool \n", + " 2020-09-12 West Ham United-Newcastle West Ham United \n", + " 2020-09-13 Tottenham-Everton Tottenham \n", + "\n", + " home_yellow_cards \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2 \n", + " 2020-09-12 Fulham-Arsenal 2 \n", + " 2020-09-12 Liverpool-Leeds United 1 \n", + " 2020-09-12 West Ham United-Newcastle 2 \n", + " 2020-09-13 Tottenham-Everton 1 \n", + "\n", + " home_red_cards \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 0 \n", + " 2020-09-12 Fulham-Arsenal 0 \n", + " 2020-09-12 Liverpool-Leeds United 0 \n", + " 2020-09-12 West Ham United-Newcastle 0 \n", + " 2020-09-13 Tottenham-Everton 0 \n", + "\n", + " away_team_id \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 18 \n", + " 2020-09-12 Fulham-Arsenal 13 \n", + " 2020-09-12 Liverpool-Leeds United 19 \n", + " 2020-09-12 West Ham United-Newcastle 23 \n", + " 2020-09-13 Tottenham-Everton 31 \n", + "\n", + " away_team \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton Southampton \n", + " 2020-09-12 Fulham-Arsenal Arsenal \n", + " 2020-09-12 Liverpool-Leeds United Leeds United \n", + " 2020-09-12 West Ham United-Newcastle Newcastle \n", + " 2020-09-13 Tottenham-Everton Everton \n", + "\n", + " away_yellow_cards \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 1 \n", + " 2020-09-12 Fulham-Arsenal 2 \n", + " 2020-09-12 Liverpool-Leeds United 0 \n", + " 2020-09-12 West Ham United-Newcastle 2 \n", + " 2020-09-13 Tottenham-Everton 0 \n", + "\n", + " away_red_cards \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 0 \n", + " 2020-09-12 Fulham-Arsenal 0 \n", + " 2020-09-12 Liverpool-Leeds United 0 \n", + " 2020-09-12 West Ham United-Newcastle 0 \n", + " 2020-09-13 Tottenham-Everton 0 \n", + "\n", + " has_incidents_summary \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton True \n", + " 2020-09-12 Fulham-Arsenal True \n", + " 2020-09-12 Liverpool-Leeds United True \n", + " 2020-09-12 West Ham United-Newcastle True \n", + " 2020-09-13 Tottenham-Everton True \n", + "\n", + " has_preview \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton True \n", + " 2020-09-12 Fulham-Arsenal True \n", + " 2020-09-12 Liverpool-Leeds United True \n", + " 2020-09-12 West Ham United-Newcastle True \n", + " 2020-09-13 Tottenham-Everton True \n", + "\n", + " score_changed_at \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12 15:14:31Z \n", + " 2020-09-12 Fulham-Arsenal 2020-09-12 13:48:13Z \n", + " 2020-09-12 Liverpool-Leeds United 2020-09-12 19:15:39Z \n", + " 2020-09-12 West Ham United-Newcastle 2020-09-12 21:45:39Z \n", + " 2020-09-13 Tottenham-Everton 2020-09-13 17:41:16Z \n", + "\n", + " elapsed \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton FT \n", + " 2020-09-12 Fulham-Arsenal FT \n", + " 2020-09-12 Liverpool-Leeds United FT \n", + " 2020-09-12 West Ham United-Newcastle FT \n", + " 2020-09-13 Tottenham-Everton FT \n", + "\n", + " last_scorer \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 0.0 \n", + " 2020-09-12 Fulham-Arsenal 1.0 \n", + " 2020-09-12 Liverpool-Leeds United 0.0 \n", + " 2020-09-12 West Ham United-Newcastle 1.0 \n", + " 2020-09-13 Tottenham-Everton 1.0 \n", + "\n", + " is_top_match \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton False \n", + " 2020-09-12 Fulham-Arsenal True \n", + " 2020-09-12 Liverpool-Leeds United True \n", + " 2020-09-12 West Ham United-Newcastle False \n", + " 2020-09-13 Tottenham-Everton True \n", + "\n", + " home_team_country_code \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton gb-eng \n", + " 2020-09-12 Fulham-Arsenal gb-eng \n", + " 2020-09-12 Liverpool-Leeds United gb-eng \n", + " 2020-09-12 West Ham United-Newcastle gb-eng \n", + " 2020-09-13 Tottenham-Everton gb-eng \n", + "\n", + " away_team_country_code \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton gb-eng \n", + " 2020-09-12 Fulham-Arsenal gb-eng \n", + " 2020-09-12 Liverpool-Leeds United gb-eng \n", + " 2020-09-12 West Ham United-Newcastle gb-eng \n", + " 2020-09-13 Tottenham-Everton gb-eng \n", + "\n", + " comment_count \\\n", "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 1485186 \n", - " 2020-09-12 Fulham-Arsenal 1485187 \n", - " 2020-09-12 Liverpool-Leeds United 1485188 \n", - " 2020-09-12 West Ham United-Newcastle United 1485191 \n", - " 2020-09-13 Tottenham-Everton 1485189 \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 6 \n", + " 2020-09-12 Fulham-Arsenal 15 \n", + " 2020-09-12 Liverpool-Leeds United 61 \n", + " 2020-09-12 West Ham United-Newcastle 10 \n", + " 2020-09-13 Tottenham-Everton 32 \n", "\n", - " home_team \\\n", + " is_lineup_confirmed \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton True \n", + " 2020-09-12 Fulham-Arsenal True \n", + " 2020-09-12 Liverpool-Leeds United True \n", + " 2020-09-12 West Ham United-Newcastle True \n", + " 2020-09-13 Tottenham-Everton True \n", + "\n", + " is_stream_available \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton False \n", + " 2020-09-12 Fulham-Arsenal False \n", + " 2020-09-12 Liverpool-Leeds United False \n", + " 2020-09-12 West Ham United-Newcastle False \n", + " 2020-09-13 Tottenham-Everton False \n", + "\n", + " match_is_opta \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton False \n", + " 2020-09-12 Fulham-Arsenal False \n", + " 2020-09-12 Liverpool-Leeds United False \n", + " 2020-09-12 West Ham United-Newcastle False \n", + " 2020-09-13 Tottenham-Everton False \n", + "\n", + " home_team_country_name \\\n", "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton Crystal Palace \n", - " 2020-09-12 Fulham-Arsenal Fulham \n", - " 2020-09-12 Liverpool-Leeds United Liverpool \n", - " 2020-09-12 West Ham United-Newcastle United West Ham United \n", - " 2020-09-13 Tottenham-Everton Tottenham \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton England \n", + " 2020-09-12 Fulham-Arsenal England \n", + " 2020-09-12 Liverpool-Leeds United England \n", + " 2020-09-12 West Ham United-Newcastle England \n", + " 2020-09-13 Tottenham-Everton England \n", "\n", - " away_team \\\n", - "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton Southampton \n", - " 2020-09-12 Fulham-Arsenal Arsenal \n", - " 2020-09-12 Liverpool-Leeds United Leeds United \n", - " 2020-09-12 West Ham United-Newcastle United Newcastle United \n", - " 2020-09-13 Tottenham-Everton Everton \n", + " away_team_country_name \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton England \n", + " 2020-09-12 Fulham-Arsenal England \n", + " 2020-09-12 Liverpool-Leeds United England \n", + " 2020-09-12 West Ham United-Newcastle England \n", + " 2020-09-13 Tottenham-Everton England \n", "\n", " date \\\n", "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12 15:00:00 \n", - " 2020-09-12 Fulham-Arsenal 2020-09-12 12:30:00 \n", - " 2020-09-12 Liverpool-Leeds United 2020-09-12 17:30:00 \n", - " 2020-09-12 West Ham United-Newcastle United 2020-09-12 20:00:00 \n", - " 2020-09-13 Tottenham-Everton 2020-09-13 16:30:00 \n", - "\n", - " url \\\n", - "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton https://www.whoscored.com/Matches/1485186/Live... \n", - " 2020-09-12 Fulham-Arsenal https://www.whoscored.com/Matches/1485187/Live... \n", - " 2020-09-12 Liverpool-Leeds United https://www.whoscored.com/Matches/1485188/Live... \n", - " 2020-09-12 West Ham United-Newcastle United https://www.whoscored.com/Matches/1485191/Live... \n", - " 2020-09-13 Tottenham-Everton https://www.whoscored.com/Matches/1485189/Live... \n", - "\n", - " stage \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12 14:00:00+00:00 \n", + " 2020-09-12 Fulham-Arsenal 2020-09-12 11:30:00+00:00 \n", + " 2020-09-12 Liverpool-Leeds United 2020-09-12 16:30:00+00:00 \n", + " 2020-09-12 West Ham United-Newcastle 2020-09-12 19:00:00+00:00 \n", + " 2020-09-13 Tottenham-Everton 2020-09-13 15:30:00+00:00 \n", + "\n", + " home_score \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 1 \n", + " 2020-09-12 Fulham-Arsenal 0 \n", + " 2020-09-12 Liverpool-Leeds United 4 \n", + " 2020-09-12 West Ham United-Newcastle 0 \n", + " 2020-09-13 Tottenham-Everton 0 \n", + "\n", + " away_score \\\n", "league season game \n", - "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton NaN \n", - " 2020-09-12 Fulham-Arsenal NaN \n", - " 2020-09-12 Liverpool-Leeds United NaN \n", - " 2020-09-12 West Ham United-Newcastle United NaN \n", - " 2020-09-13 Tottenham-Everton NaN " + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 0 \n", + " 2020-09-12 Fulham-Arsenal 3 \n", + " 2020-09-12 Liverpool-Leeds United 3 \n", + " 2020-09-12 West Ham United-Newcastle 2 \n", + " 2020-09-13 Tottenham-Everton 1 \n", + "\n", + " incidents \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton [{'minute': '13', 'type': 1, 'subType': 1, 'pl... \n", + " 2020-09-12 Fulham-Arsenal [{'minute': '8', 'type': 1, 'subType': 1, 'pla... \n", + " 2020-09-12 Liverpool-Leeds United [{'minute': '4', 'type': 1, 'subType': 2, 'pla... \n", + " 2020-09-12 West Ham United-Newcastle [{'minute': '56', 'type': 1, 'subType': 1, 'pl... \n", + " 2020-09-13 Tottenham-Everton [{'minute': '55', 'type': 1, 'subType': 1, 'pl... \n", + "\n", + " bets \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " aggregate_winner_field \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " winner_field \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 0.0 \n", + " 2020-09-12 Fulham-Arsenal 1.0 \n", + " 2020-09-12 Liverpool-Leeds United 0.0 \n", + " 2020-09-12 West Ham United-Newcastle 1.0 \n", + " 2020-09-13 Tottenham-Everton 1.0 \n", + "\n", + " period \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 7 \n", + " 2020-09-12 Fulham-Arsenal 7 \n", + " 2020-09-12 Liverpool-Leeds United 7 \n", + " 2020-09-12 West Ham United-Newcastle 7 \n", + " 2020-09-13 Tottenham-Everton 7 \n", + "\n", + " extra_result_field \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " home_extratime_score \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " away_extratime_score \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " home_penalty_score \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " away_penalty_score \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " started_at_utc \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12T14:01:42Z \n", + " 2020-09-12 Fulham-Arsenal 2020-09-12T11:32:23Z \n", + " 2020-09-12 Liverpool-Leeds United 2020-09-12T16:30:21Z \n", + " 2020-09-12 West Ham United-Newcastle 2020-09-12T19:00:32Z \n", + " 2020-09-13 Tottenham-Everton 2020-09-13T15:30:20Z \n", + "\n", + " first_half_ended_at_utc \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None \n", + "\n", + " second_half_started_at_utc \\\n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton 2020-09-12T15:04:01Z \n", + " 2020-09-12 Fulham-Arsenal 2020-09-12T12:35:50Z \n", + " 2020-09-12 Liverpool-Leeds United 2020-09-12T17:32:57Z \n", + " 2020-09-12 West Ham United-Newcastle 2020-09-12T20:03:20Z \n", + " 2020-09-13 Tottenham-Everton 2020-09-13T16:31:33Z \n", + "\n", + " stage \n", + "league season game \n", + "ENG-Premier League 2021 2020-09-12 Crystal Palace-Southampton None \n", + " 2020-09-12 Fulham-Arsenal None \n", + " 2020-09-12 Liverpool-Leeds United None \n", + " 2020-09-12 West Ham United-Newcastle None \n", + " 2020-09-13 Tottenham-Everton None " ] }, "execution_count": 5, @@ -475,31 +1045,32 @@ " \n", " \n", " \n", + " game_id\n", " period\n", " minute\n", + " second\n", " expanded_minute\n", " type\n", " outcome_type\n", + " team_id\n", " team\n", + " player_id\n", " player\n", - " qualifiers\n", " x\n", " y\n", " end_x\n", " end_y\n", " goal_mouth_y\n", " goal_mouth_z\n", + " blocked_x\n", + " blocked_y\n", + " qualifiers\n", " is_touch\n", " is_shot\n", " is_goal\n", + " card_type\n", " related_event_id\n", " related_player_id\n", - " blocked_x\n", - " blocked_y\n", - " card_type\n", - " game_id\n", - " team_id\n", - " player_id\n", " \n", " \n", " league\n", @@ -531,6 +1102,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -539,149 +1111,162 @@ " 2021\n", " 2021-01-12 Burnley-Manchester United\n", " 2253458317\n", + " 1485184\n", " PreMatch\n", " 0\n", + " 0.0\n", " 0\n", " FormationSet\n", " Successful\n", + " 184\n", " Burnley\n", " NaN\n", - " [{'type': {'displayName': 'TeamPlayerFormation...\n", + " NaN\n", " 0.0\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " False\n", " NaN\n", " NaN\n", + " [{'type': {'displayName': 'TeamPlayerFormation...\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1485184\n", - " 184\n", - " NaN\n", " \n", " \n", " 2253458375\n", + " 1485184\n", " PreMatch\n", " 0\n", + " 0.0\n", " 0\n", " FormationSet\n", " Successful\n", + " 32\n", " Man Utd\n", " NaN\n", - " [{'type': {'displayName': 'CaptainPlayerId', '...\n", + " NaN\n", " 0.0\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " False\n", - " NaN\n", " NaN\n", " NaN\n", + " [{'type': {'displayName': 'CaptainPlayerId', '...\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1485184\n", - " 32\n", " NaN\n", " \n", " \n", " 2253487469\n", + " 1485184\n", " FirstHalf\n", " 0\n", + " 0.0\n", " 0\n", " Start\n", " Successful\n", + " 184\n", " Burnley\n", " NaN\n", - " []\n", + " NaN\n", " 0.0\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " False\n", " NaN\n", " NaN\n", + " []\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1485184\n", - " 184\n", - " NaN\n", " \n", " \n", " 2253487473\n", + " 1485184\n", " FirstHalf\n", " 0\n", + " 0.0\n", " 0\n", " Start\n", " Successful\n", + " 32\n", " Man Utd\n", " NaN\n", - " []\n", + " NaN\n", " 0.0\n", " 0.0\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " False\n", - " NaN\n", " NaN\n", " NaN\n", + " []\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1485184\n", - " 32\n", " NaN\n", " \n", " \n", " 2253487625\n", + " 1485184\n", " FirstHalf\n", " 0\n", + " 0.0\n", " 0\n", " Pass\n", " Successful\n", + " 184\n", " Burnley\n", + " 79050.0\n", " Ashley Westwood\n", - " [{'type': {'displayName': 'Angle', 'value': 21...\n", " 50.3\n", " 50.3\n", " 30.5\n", " 50.3\n", " NaN\n", " NaN\n", - " True\n", " NaN\n", " NaN\n", + " [{'type': {'displayName': 'Angle', 'value': 21...\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " 1485184\n", - " 184\n", - " 79050.0\n", " \n", " \n", "\n", "" ], "text/plain": [ + " game_id \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 1485184 \n", + " 2253458375 1485184 \n", + " 2253487469 1485184 \n", + " 2253487473 1485184 \n", + " 2253487625 1485184 \n", + "\n", " period \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 PreMatch \n", @@ -698,6 +1283,14 @@ " 2253487473 0 \n", " 2253487625 0 \n", "\n", + " second \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 0.0 \n", + " 2253458375 0.0 \n", + " 2253487469 0.0 \n", + " 2253487473 0.0 \n", + " 2253487625 0.0 \n", + "\n", " expanded_minute \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 0 \n", @@ -722,6 +1315,14 @@ " 2253487473 Successful \n", " 2253487625 Successful \n", "\n", + " team_id \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 184 \n", + " 2253458375 32 \n", + " 2253487469 184 \n", + " 2253487473 32 \n", + " 2253487625 184 \n", + "\n", " team \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 Burnley \n", @@ -730,6 +1331,14 @@ " 2253487473 Man Utd \n", " 2253487625 Burnley \n", "\n", + " player_id \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", + " 2253458375 NaN \n", + " 2253487469 NaN \n", + " 2253487473 NaN \n", + " 2253487625 79050.0 \n", + "\n", " player \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", @@ -738,14 +1347,6 @@ " 2253487473 NaN \n", " 2253487625 Ashley Westwood \n", "\n", - " qualifiers \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 [{'type': {'displayName': 'TeamPlayerFormation... \n", - " 2253458375 [{'type': {'displayName': 'CaptainPlayerId', '... \n", - " 2253487469 [] \n", - " 2253487473 [] \n", - " 2253487625 [{'type': {'displayName': 'Angle', 'value': 21... \n", - "\n", " x \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 0.0 \n", @@ -794,6 +1395,30 @@ " 2253487473 NaN \n", " 2253487625 NaN \n", "\n", + " blocked_x \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", + " 2253458375 NaN \n", + " 2253487469 NaN \n", + " 2253487473 NaN \n", + " 2253487625 NaN \n", + "\n", + " blocked_y \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", + " 2253458375 NaN \n", + " 2253487469 NaN \n", + " 2253487473 NaN \n", + " 2253487625 NaN \n", + "\n", + " qualifiers \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 [{'type': {'displayName': 'TeamPlayerFormation... \n", + " 2253458375 [{'type': {'displayName': 'CaptainPlayerId', '... \n", + " 2253487469 [] \n", + " 2253487473 [] \n", + " 2253487625 [{'type': {'displayName': 'Angle', 'value': 21... \n", + "\n", " is_touch \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 False \n", @@ -818,38 +1443,6 @@ " 2253487473 NaN \n", " 2253487625 NaN \n", "\n", - " related_event_id \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", - " 2253458375 NaN \n", - " 2253487469 NaN \n", - " 2253487473 NaN \n", - " 2253487625 NaN \n", - "\n", - " related_player_id \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", - " 2253458375 NaN \n", - " 2253487469 NaN \n", - " 2253487473 NaN \n", - " 2253487625 NaN \n", - "\n", - " blocked_x \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", - " 2253458375 NaN \n", - " 2253487469 NaN \n", - " 2253487473 NaN \n", - " 2253487625 NaN \n", - "\n", - " blocked_y \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", - " 2253458375 NaN \n", - " 2253487469 NaN \n", - " 2253487473 NaN \n", - " 2253487625 NaN \n", - "\n", " card_type \\\n", "league season game id \n", "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", @@ -858,29 +1451,21 @@ " 2253487473 NaN \n", " 2253487625 NaN \n", "\n", - " game_id \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 1485184 \n", - " 2253458375 1485184 \n", - " 2253487469 1485184 \n", - " 2253487473 1485184 \n", - " 2253487625 1485184 \n", - "\n", - " team_id \\\n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 184 \n", - " 2253458375 32 \n", - " 2253487469 184 \n", - " 2253487473 32 \n", - " 2253487625 184 \n", + " related_event_id \\\n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", + " 2253458375 NaN \n", + " 2253487469 NaN \n", + " 2253487473 NaN \n", + " 2253487625 NaN \n", "\n", - " player_id \n", - "league season game id \n", - "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", - " 2253458375 NaN \n", - " 2253487469 NaN \n", - " 2253487473 NaN \n", - " 2253487625 79050.0 " + " related_player_id \n", + "league season game id \n", + "ENG-Premier League 2021 2021-01-12 Burnley-Manchester United 2253458317 NaN \n", + " 2253458375 NaN \n", + " 2253487469 NaN \n", + " 2253487473 NaN \n", + " 2253487625 NaN " ] }, "execution_count": 7, @@ -911,7 +1496,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "dfd8f019", "metadata": {}, "outputs": [ @@ -956,10 +1541,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "2078b018", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/.venv/lib/python3.11/site-packages/socceraction/spadl/opta.py:219: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " ).bfill()\n" + ] + }, { "data": { "text/html": [ @@ -1090,7 +1683,7 @@ " 38.828\n", " 11\n", " 0\n", - " 0\n", + " 4\n", " 4\n", " Robbie Brady\n", " Burnley\n", @@ -1112,7 +1705,7 @@ "1 31.080 38.220 36.312 15.844 0 1 0 \n", "2 38.220 43.365 15.844 12.512 21 1 0 \n", "3 43.365 90.300 12.512 49.708 0 1 0 \n", - "4 90.300 105.000 49.708 38.828 11 0 0 \n", + "4 90.300 105.000 49.708 38.828 11 0 4 \n", "\n", " action_id player team \n", "0 0 Ashley Westwood Burnley \n", @@ -1122,7 +1715,7 @@ "4 4 Robbie Brady Burnley " ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1134,10 +1727,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "id": "10f8a086", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/cw/dtaijupiter/NoCsBack/dtai/pieterr/Projects/soccerdata/.venv/lib/python3.11/site-packages/socceraction/spadl/opta.py:219: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " ).bfill()\n" + ] + }, { "data": { "text/html": [ @@ -1294,7 +1895,7 @@ "4 Matthew Lowton Burnley " ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1306,7 +1907,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "1cff9142", "metadata": {}, "outputs": [ @@ -1701,8 +2302,8 @@ " qualifiers\n", " related_player_id\n", " touch\n", - " shot\n", " goal\n", + " shot\n", " type_name\n", " \n", " \n", @@ -1848,7 +2449,7 @@ "3 23.3 {178: True, 213: '5.0', 212: '21.7', 141: '23.... \n", "4 73.1 {1: True, 213: '0.7', 56: 'Center', 178: True,... \n", "\n", - " related_player_id touch shot goal type_name \n", + " related_player_id touch goal shot type_name \n", "0 NaN False False False start \n", "1 NaN False False False start \n", "2 NaN True False False pass \n", @@ -1896,7 +2497,7 @@ ], "metadata": { "kernelspec": { - "display_name": "soccerdata", + "display_name": "/home/pieterr/Jupiter/Projects/soccerdata", "language": "python", "name": "soccerdata" }, @@ -1910,7 +2511,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.1" }, "toc": { "base_numbering": 1, diff --git a/soccerdata/_common.py b/soccerdata/_common.py index bfab09f6..09cb5a9b 100644 --- a/soccerdata/_common.py +++ b/soccerdata/_common.py @@ -452,6 +452,8 @@ def _download_and_save( # noqa: C901 response = self._driver.execute_script( "return document.body.innerHTML;" ).encode("utf-8") + if response == b"": + raise Exception("Empty response.") else: if not isinstance(var, str): raise NotImplementedError("Only implemented for single variables.") diff --git a/soccerdata/whoscored.py b/soccerdata/whoscored.py index 19d16009..2c108ba1 100644 --- a/soccerdata/whoscored.py +++ b/soccerdata/whoscored.py @@ -6,7 +6,7 @@ import time from datetime import datetime from pathlib import Path -from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Union import numpy as np import pandas as pd @@ -14,12 +14,8 @@ from selenium.common.exceptions import ( ElementClickInterceptedException, NoSuchElementException, - TimeoutException, ) from selenium.webdriver.common.by import By -from selenium.webdriver.remote.webelement import WebElement -from selenium.webdriver.support import expected_conditions as ec -from selenium.webdriver.support.ui import WebDriverWait from ._common import ( BaseSeleniumReader, @@ -127,6 +123,44 @@ def _parse_datetime(ts: str) -> datetime: return datetime.strptime(ts, "%A, %b %d %Y %H:%M") +def _parse_url(url: str) -> Dict: + """Parse a URL from WhoScored. + + Parameters + ---------- + url : str + URL to parse. + + Raises + ------ + ValueError + If the URL could not be parsed. + + Returns + ------- + dict + """ + patt = ( + r"^(?:https:\/\/www.whoscored.com)?\/" + + r"(?:Regions\/(\d+)\/)?" + + r"(?:Tournaments\/(\d+)\/)?" + + r"(?:Seasons\/(\d+)\/)?" + + r"(?:Stages\/(\d+)\/)?" + + r"(?:Matches\/(\d+)\/)?" + ) + matches = re.search(patt, url) + if matches: + return { + "region_id": matches.group(1), + "league_id": matches.group(2), + "season_id": matches.group(3), + "stage_id": matches.group(4), + "match_id": matches.group(5), + } + else: + raise ValueError(f"Could not parse URL: {url}") + + class WhoScored(BaseSeleniumReader): """Provides pd.DataFrames from data available at http://whoscored.com. @@ -224,7 +258,6 @@ def read_leagues(self) -> pd.DataFrame: "region": region["name"], "league_id": league["id"], "league": league["name"], - "url": league["url"], } ) @@ -249,7 +282,11 @@ def read_seasons(self) -> pd.DataFrame: seasons = [] for lkey, league in df_leagues.iterrows(): - url = WHOSCORED_URL + league.url + url = ( + WHOSCORED_URL + + f"/Regions/{league['region_id']}" + + f"/Tournaments/{league['league_id']}" + ) filemask = "seasons/{}.html" filepath = self.data_dir / filemask.format(lkey) reader = self.get(url, filepath, var=None) @@ -258,12 +295,15 @@ def read_seasons(self) -> pd.DataFrame: tree = html.parse(reader) for node in tree.xpath("//select[contains(@id,'seasons')]/option"): # extract team IDs from links + season_url = node.get("value") + season_id = _parse_url(season_url)["season_id"] seasons.append( { - "url": node.get("value"), "league": lkey, - "league_id": league.league_id, "season": season_code(node.text), + "region_id": league.region_id, + "league_id": league.league_id, + "season_id": season_id, } ) @@ -275,88 +315,76 @@ def read_seasons(self) -> pd.DataFrame: ) return df - def _parse_season_stages(self) -> List[Dict]: - match_selector = ( - "//div[contains(@id,'tournament-fixture')]//div[contains(@class,'divtable-row')]" - ) - WebDriverWait(self._driver, 30, poll_frequency=1).until( - ec.presence_of_element_located((By.XPATH, match_selector)) - ) - node_stages_selector = "//select[contains(@id,'stages')]/option" - node_stages = self._driver.find_elements(By.XPATH, node_stages_selector) - stages = [] - for stage in node_stages: - if not re.search(r"Grp. ([A-Z])$", stage.text): - # there is always a page with all group stage games combined - stages.append({"url": stage.get_attribute("value"), "name": stage.text}) - return stages - - def _parse_schedule_page(self) -> Tuple[List[Dict], Optional[WebElement]]: - match_selector = ( - "//div[contains(@id,'tournament-fixture')]//div[contains(@class,'divtable-row')]" - ) - date_selector = "./div[contains(@class,'divtable-header')]" - time_selector = "./div[contains(@class,'time')]" - home_team_selector = "./div[contains(@class,'team home')]//a" - away_team_selector = "./div[contains(@class,'team away')]//a" - result_selector = "./div[contains(@class,'result')]//a" + def read_season_stages(self, force_cache: bool = False) -> pd.DataFrame: + """Retrieve the season stages for the selected leagues. - try: - WebDriverWait(self._driver, 30, poll_frequency=1).until( - ec.presence_of_element_located((By.XPATH, match_selector)) + Parameters + ---------- + force_cache : bool + By default no cached data is used for the current season. + If True, will force the use of cached data anyway. + + Returns + ------- + pd.DataFrame + """ + df_seasons = self.read_seasons() + filemask = "seasons/{}_{}.html" + + season_stages = [] + for (lkey, skey), season in df_seasons.iterrows(): + current_season = not self._is_complete(lkey, skey) + + # get season page + url = ( + WHOSCORED_URL + + f"/Regions/{season['region_id']}" + + f"/Tournaments/{season['league_id']}" + + f"/Seasons/{season['season_id']}" ) - date_str = None - schedule_page = [] - for node in self._driver.find_elements(By.XPATH, match_selector): - if node.get_attribute("data-id"): - match_id = int(node.get_attribute("data-id")) - time_str = node.find_element(By.XPATH, time_selector).get_attribute( - "textContent" - ) - match_url = node.find_element(By.XPATH, result_selector).get_attribute("href") - schedule_page.append( - { - "date": _parse_datetime(f"{date_str} {time_str}"), - "home_team": node.find_element(By.XPATH, home_team_selector).text, - "away_team": node.find_element(By.XPATH, away_team_selector).text, - "game_id": match_id, - "url": match_url, - } - ) - else: - date_str = node.find_element(By.XPATH, date_selector).text - logger.info("Scraping game schedule for %s", date_str) - except TimeoutException: - schedule_page = [] + filepath = self.data_dir / filemask.format(lkey, skey) + reader = self.get(url, filepath, var=None, no_cache=current_season and not force_cache) + tree = html.parse(reader) - try: - next_page_selector = ( - "//div[contains(@id,'date-controller')]" - "/a[contains(@class,'previous') and not(contains(@class, 'is-disabled'))]" + # get default season stage + fixtures_url = tree.xpath("//a[text()='Fixtures']/@href")[0] + stage_id = _parse_url(fixtures_url)["stage_id"] + season_stages.append( + { + "league": lkey, + "season": skey, + "region_id": season.region_id, + "league_id": season.league_id, + "season_id": season.season_id, + "stage_id": stage_id, + "stage": None, + } ) - next_page = self._driver.find_element(By.XPATH, next_page_selector) - except NoSuchElementException: - next_page = None - return schedule_page, next_page - - def _parse_schedule(self, stage: Optional[str] = None) -> List[Dict]: - schedule = [] - # Parse first page - page_schedule, next_page = self._parse_schedule_page() - schedule.extend(page_schedule) - # Go to next page - while next_page is not None: - try: - next_page.click() - time.sleep(5) - logger.debug("Next page") - except ElementClickInterceptedException: - self._handle_banner() - # Parse next page - page_schedule, next_page = self._parse_schedule_page() - schedule.extend(page_schedule) - schedule = [dict(item, stage=stage) for item in schedule] - return schedule + + # extract additional stages + for node in tree.xpath("//select[contains(@id,'stages')]/option"): + stage_url = node.get("value") + stage_id = _parse_url(stage_url)["stage_id"] + season_stages.append( + { + "league": lkey, + "season": skey, + "region_id": season.region_id, + "league_id": season.league_id, + "season_id": season.season_id, + "stage_id": stage_id, + "stage": node.text, + } + ) + + df = ( + pd.DataFrame(season_stages) + .drop_duplicates(subset=["league", "season", "stage_id"], keep="last") + .set_index(["league", "season"]) + .sort_index() + .loc[itertools.product(self.leagues, self.seasons)] + ) + return df def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: # noqa: C901 """Retrieve the game schedule for the selected leagues and seasons. @@ -371,74 +399,82 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: # noqa: C90 ------- pd.DataFrame """ - df_seasons = self.read_seasons() - filemask = "matches/{}_{}.csv" + df_season_stages = self.read_season_stages(force_cache=force_cache) + filemask_schedule = "matches/{}_{}_{}_{}.json" all_schedules = [] - for (lkey, skey), season in df_seasons.iterrows(): - filepath = self.data_dir / filemask.format(lkey, skey) - url = WHOSCORED_URL + season.url - - schedule = [] - is_current_season = not self._is_complete(lkey, skey) - no_cache = (not filepath.exists()) or self.no_cache - if (is_current_season and not force_cache) or no_cache: - # Scrape the season's schedule - self._driver.get(url) - - # Check if season consists of multiple stages - stages = self._parse_season_stages() - - # Handle a multi-stage season - if len(stages) > 0: - for stage in stages: - url = WHOSCORED_URL + stage["url"].replace("Show", "Fixtures") - self._driver.get(url) - try: - WebDriverWait(self._driver, 30, poll_frequency=1).until( - ec.presence_of_element_located( - (By.XPATH, "//div[@id='tournament-fixture']") - ) - ) - except TimeoutException: - # Tournaments sometimes do not have a fixtures page, - # the summary page has to be used instead - url = WHOSCORED_URL + stage["url"] - self._driver.get(url) - logger.info("Scraping game schedule with stage=%s from %s", stage, url) - schedule.extend(self._parse_schedule(stage=stage["name"])) - - # Handle a single-stage season - else: - fixtures_nav_selector = "//a[text()='Fixtures']" - fixtures_nav = self._driver.find_element(By.XPATH, fixtures_nav_selector) - self._driver.get(fixtures_nav.get_attribute("href")) - try: - WebDriverWait(self._driver, 30, poll_frequency=1).until( - ec.presence_of_element_located( - (By.XPATH, "//div[@id='tournament-fixture']") - ) - ) - except TimeoutException: - # Tournaments sometimes do not have a fixtures page, - # the summary page has to be used instead - summary_nav_selector = "//a[text()='Fixtures']" - summary_nav = self._driver.find_element(By.XPATH, summary_nav_selector) - self._driver.get(summary_nav.get_attribute("href")) - logger.info("Scraping game schedule from %s", url) - schedule.extend(self._parse_schedule()) - - # Cache the data - df_schedule = pd.DataFrame(schedule).assign(league=lkey, season=skey) - if not self.no_store: - df_schedule.to_csv(filepath, index=False) - + for (lkey, skey), stage in df_season_stages.iterrows(): + current_season = not self._is_complete(lkey, skey) + stage_id = stage["stage_id"] + stage_name = stage["stage"] + + # get the calendar of the season stage + season_stage_url = ( + WHOSCORED_URL + + f"/Regions/{stage['region_id']}" + + f"/Tournaments/{stage['league_id']}" + + f"/Seasons/{stage['season_id']}" + + f"/Stages/{stage['stage_id']}" + ) + if stage_name is not None: + calendar_filepath = self.data_dir / "matches/{}_{}_{}.html".format( + lkey, skey, stage_id + ) + logger.info( + "Retrieving calendar for %s %s (%s)", + lkey, + skey, + stage_name, + ) else: - # Load cached data - logger.info("Retrieving game schedule of %s - %s from the cache", lkey, skey) - df_schedule = pd.read_csv(filepath) + calendar_filepath = self.data_dir / f"matches/{lkey}_{skey}.html" + logger.info( + "Retrieving calendar for %s %s", + lkey, + skey, + ) + calendar = self.get( + season_stage_url, + calendar_filepath, + var="wsCalendar", + no_cache=current_season and not force_cache, + ) + mask = json.load(calendar)["mask"] + + # get the fixtures for each month + it = [(year, month) for year in mask.keys() for month in mask[year].keys()] + for i, (year, month) in enumerate(it): + filepath = self.data_dir / filemask_schedule.format(lkey, skey, stage_id, month) + url = WHOSCORED_URL + f"/tournaments/{stage_id}/data/?d={year}{(int(month)+1):02d}" + + if stage_name is not None: + logger.info( + "[%s/%s] Retrieving fixtures for %s %s (%s)", + i + 1, + len(it), + lkey, + skey, + stage_name, + ) + else: + logger.info( + "[%s/%s] Retrieving fixtures for %s %s", + i + 1, + len(it), + lkey, + skey, + ) - all_schedules.append(df_schedule) + reader = self.get( + url, filepath, var=None, no_cache=current_season and not force_cache + ) + data = json.load(reader) + for tournament in data["tournaments"]: + df_schedule = pd.DataFrame(tournament["matches"]) + df_schedule["league"] = lkey + df_schedule["season"] = skey + df_schedule["stage"] = stage_name + all_schedules.append(df_schedule) if len(all_schedules) == 0: return pd.DataFrame(index=["league", "season", "game"]) @@ -446,15 +482,24 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame: # noqa: C90 # Construct the output dataframe df = ( pd.concat(all_schedules) - .drop_duplicates() + .drop_duplicates(subset=["id"]) .replace( { - "home_team": TEAMNAME_REPLACEMENTS, - "away_team": TEAMNAME_REPLACEMENTS, + "homeTeamName": TEAMNAME_REPLACEMENTS, + "awayTeamName": TEAMNAME_REPLACEMENTS, + } + ) + .rename( + columns={ + "homeTeamName": "home_team", + "awayTeamName": "away_team", + "id": "game_id", + "startTimeUtc": "date", } ) .assign(date=lambda x: pd.to_datetime(x["date"])) .assign(game=lambda df: df.apply(make_game_id, axis=1)) + .pipe(standardize_colnames) .set_index(["league", "season", "game"]) .sort_index() )