diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index d4f05f702..8f70272d2 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -1,6 +1,7 @@ import io import os -from typing import Literal, Optional, Union +import re +from typing import Any, Literal, Optional, Union from urllib.parse import urlparse import pandas as pd @@ -13,7 +14,12 @@ from viadot.exceptions import CredentialError from viadot.signals import SKIP from viadot.sources.base import Source -from viadot.utils import add_viadot_metadata_columns, cleanup_df, validate +from viadot.utils import ( + add_viadot_metadata_columns, + cleanup_df, + validate, + validate_and_reorder_dfs_columns, +) class SharepointCredentials(BaseModel): @@ -34,44 +40,6 @@ def is_configured(cls, credentials): return credentials -def get_last_segment_from_url( - url: str, -) -> tuple[str, Literal["file"]] | tuple[str, Literal["directory"]]: - """ - Get the last part of the URL and determine if it represents a file or directory. - - This function parses the provided URL, extracts the last segment, and identifies - whether it corresponds to a file (based on the presence of a file extension) - or a directory. - - Args: - url (str): The URL to a SharePoint file or directory. - - Raises: - ValueError: If an invalid URL is provided. - - Returns: - tuple: A tuple where the first element is the last part of the URL (file extension - or folder name) and the second element is a string indicating the type: - - If a file URL is provided, returns (file extension, 'file'). - - If a folder URL is provided, returns (last folder name, 'directory'). - """ - path_parts = urlparse(url).path.split("/") - # Filter out empty parts - non_empty_parts = [part for part in path_parts if part] - - # Check if the last part has a file extension - if non_empty_parts: - last_part = non_empty_parts[-1] - _, extension = os.path.splitext(last_part) - if extension: - return extension, "file" - else: - return last_part, "directory" - else: - raise ValueError("Incorrect URL provided : '{url}'") - - class Sharepoint(Source): """ Download Excel files from Sharepoint. @@ -95,6 +63,16 @@ def __init__( super().__init__(*args, credentials=validated_creds, **kwargs) def get_connection(self) -> sharepy.session.SharePointSession: + """Establishes a connection to SharePoint using credentials provided during + object initialization. + + Returns: + sharepy.session.SharePointSession: A session object representing + the authenticated connection. + + Raises: + CredentialError: If authentication to SharePoint fails due to incorrect credentials. + """ try: connection = sharepy.connect( site=self.credentials.get("site"), @@ -109,8 +87,7 @@ def get_connection(self) -> sharepy.session.SharePointSession: return connection def download_file(self, url: str, to_path: list | str) -> None: - """ - Download a file from Sharepoint. + """Download a file from Sharepoint to specific location. Args: url (str): The URL of the file to be downloaded. @@ -129,70 +106,218 @@ def download_file(self, url: str, to_path: list | str) -> None: ) conn.close() - def _download_excel(self, url: str, **kwargs) -> pd.ExcelFile: - endpoint_value, endpoint_type = get_last_segment_from_url(url) + def scan_sharepoint_folder(self, url: str) -> list[str]: + """Scan Sharepoint folder to get all file URLs of all files within it. + + Args: + url (str): The URL of the folder to scan. + + Raises: + ValueError: If the provided URL does not contain the expected '/sites/' segment. + + Returns: + list[str]: List of URLs pointing to each file within the specified + SharePoint folder. + """ + conn = self.get_connection() + + parsed_url = urlparse(url) + path_parts = parsed_url.path.split("/") + if "sites" in path_parts: + site_index = ( + path_parts.index("sites") + 2 + ) # +2 to include 'sites' and the next segment + site_url = f"{parsed_url.scheme}://{parsed_url.netloc}{'/'.join(path_parts[:site_index])}" + library = "/".join(path_parts[site_index:]) + else: + message = "URL does not contain '/sites/' segment." + raise ValueError(message) + + # -> site_url = company.sharepoint.com/sites/site_name/ + # -> library = /shared_documents/folder/sub_folder/final_folder + endpoint = ( + f"{site_url}/_api/web/GetFolderByServerRelativeUrl('{library}')/Files" + ) + response = conn.get(endpoint) + files = response.json().get("d", {}).get("results", []) + + return [f'{site_url}/{library}{file["Name"]}' for file in files] + + def _get_file_extension(self, url: str) -> str: + """ + Extracts the file extension from a given URL. + + Parameters: + url (str): The URL from which to extract the file extension. + + Returns: + str: The file extension, including the leading dot (e.g., '.xlsx'). + """ + # Parse the URL to get the path + parsed_url = urlparse(url) + + # Get the file extension + _, ext = os.path.splitext(parsed_url.path) + + return ext + + def _download_file_stream(self, url: str, **kwargs) -> pd.ExcelFile: + """Downloads the content of a file from SharePoint and returns it as an in-memory + byte stream. + + Args: + url (str): The URL of the file to download. + + Returns: + io.BytesIO: An in-memory byte stream containing the file content. + """ if "nrows" in kwargs: raise ValueError("Parameter 'nrows' is not supported.") + conn = self.get_connection() - if endpoint_type == "file": - if endpoint_value != ".xlsx": - raise ValueError( - "Only Excel files with 'XLSX' extension can be loaded into a DataFrame." - ) - self.logger.info(f"Downloading data from {url}...") - response = conn.get(url) - bytes_stream = io.BytesIO(response.content) - return pd.ExcelFile(bytes_stream) + self.logger.info(f"Downloading data from {url}...") + response = conn.get(url) + bytes_stream = io.BytesIO(response.content) + + return pd.ExcelFile(bytes_stream) - def _convert_all_to_string_type(self, df: pd.DataFrame) -> pd.DataFrame: - """Convert all column data types in the DataFrame to strings. + def _is_file(self, url: str) -> bool: + """Determines whether a provided URL points to a file based on its structure. - This method converts all the values in the DataFrame to strings, - handling NaN values by replacing them with None. + This function uses a regular expression to check if the URL ends with a + common file extension. It does not make any network requests and purely + relies on the URL structure for its determination. + + Parameters: + url (str): The URL to be checked. + + Returns: + bool: True if the URL is identified as a file based on its extension, + False otherwise. + + Example: + >>> _is_file("https://example.com/file.xlsx") + True + >>> _is_file("https://example.com/folder/") + False + >>> _is_file("https://example.com/folder") + False + """ + # Regular expression for matching file extensions + file_extension_pattern = re.compile(r"\.[a-zA-Z0-9]+$") + + return bool(file_extension_pattern.search(url)) + + def _handle_multiple_files( + self, + url: str, + file_sheet_mapping: dict, + na_values: Optional[list[str]] = None, + **kwargs, + ): + """Handles download and parsing of multiple Excel files from a SharePoint folder. Args: - df (pd.DataFrame): DataFrame to convert. + url (str): The base URL of the SharePoint folder containing the files. + file_sheet_mapping (dict): A dictionary mapping file names to sheet names + or indexes. The keys are file names, and the values are sheet names/indices. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. Returns: - pd.DataFrame: DataFrame with all data types converted to string. - Columns that contain only None values are also - converted to string type. + pd.DataFrame: A concatenated DataFrame containing the data from all + specified files and sheets. + + Raises: + ValueError: If the file extension is not supported. """ - df_converted = df.astype(str).where(pd.notnull(df), None) - return self._empty_column_to_string(df=df_converted) + dfs = [ + self._load_and_parse( + file_url=url + file, sheet_name=sheet, na_values=na_values, **kwargs + ) + for file, sheet in file_sheet_mapping.items() + ] + return pd.concat(validate_and_reorder_dfs_columns(dfs)) - def _empty_column_to_string(self, df: pd.DataFrame) -> pd.DataFrame: - """Convert the type of columns containing only None values to string. + def _load_and_parse( + self, + file_url: str, + sheet_name: Optional[Union[str, list[str]]] = None, + na_values: Optional[list[str]] = None, + **kwargs, + ): + """Loads and parses an Excel file from a URL. - This method iterates through the DataFrame columns and converts the - type of any column that contains only None values to string. + Args: + file_url (str): The URL of the file to download and parse. + sheet_name (Optional[Union[str, list[str]]]): The name(s) or index(es) of + the sheet(s) to parse. If None, all sheets are parsed. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. + **kwargs: Additional keyword arguments to pass to the pandas read function. + + Returns: + pd.DataFrame: The parsed data as a pandas DataFrame. + + Raises: + ValueError: If the file extension is not supported. + """ + file_extension = self._get_file_extension(file_url) + file_stream = self._download_file_stream(file_url) + + if file_extension == ".xlsx": + return self._parse_excel(file_stream, sheet_name, na_values, **kwargs) + else: + raise ValueError("Only Excel (.xlsx) files can be loaded into a DataFrame.") + + def _parse_excel( + self, + excel_file, + sheet_name: Optional[Union[str, list[str]]] = None, + na_values: Optional[list[str]] = None, + **kwargs, + ): + """Parses an Excel file into a DataFrame. Casts all columns to string. Args: - df (pd.DataFrame): DataFrame to convert. + excel_file: An ExcelFile object containing the data to parse. + sheet_name (Optional[Union[str, list[str]]]): The name(s) or index(es) of + the sheet(s) to parse. If None, all sheets are parsed. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. + **kwargs: Additional keyword arguments to pass to the pandas read function. Returns: - pd.DataFrame: Updated DataFrame with columns containing only - None values converted to string type. All columns - in the returned DataFrame will be of type object/string. + pd.DataFrame: The parsed data as a pandas DataFrame. """ - for col in df.columns: - if df[col].isnull().all(): - df[col] = df[col].astype("string") - return df + return pd.concat( + [ + excel_file.parse( + sheet, + keep_default_na=False, + na_values=na_values or self.DEFAULT_NA_VALUES, + dtype=str, # Ensure all columns are read as strings + **kwargs, + ) + for sheet in ([sheet_name] if sheet_name else excel_file.sheet_names) + ] + ) @add_viadot_metadata_columns def to_df( self, url: str, - sheet_name: Optional[Union[str, list, int]] = None, - if_empty: str = "warn", - tests: dict = {}, - na_values: list[str] | None = None, + sheet_name: Optional[Union[str, list[str]]] = None, + if_empty: Literal["warn", "skip", "fail"] = "warn", + tests: dict[str, Any] = {}, + file_sheet_mapping: Optional[dict[str, Union[str, int, list[str]]]] = None, + na_values: Optional[list[str]] = None, **kwargs, ) -> pd.DataFrame: """ - Load an Excel file into a pandas DataFrame. + Load an Excel file or files from a SharePoint URL into a pandas DataFrame. + + This method handles downloading the file(s), parsing the content, and converting + it into a pandas DataFrame. It supports both single file URLs and folder URLs + with multiple files. Args: url (str): The URL of the file to be downloaded. @@ -200,46 +325,57 @@ def to_df( Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. Specify None to get all worksheets. Defaults to None. - if_empty (str, optional): What to do if the file is empty. Defaults to "warn". - tests (Dict[str], optional): A dictionary with optional list of tests + if_empty (Literal["warn", "skip", "fail"], optional): Action to take if + the DataFrame is empty. + - "warn": Logs a warning. + - "skip": Skips the operation. + - "fail": Raises an error. + Defaults to "warn". + tests (Dict[str, Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate` function from utils. Defaults to None. - na_values (list[str] | None): Additional strings to recognize as NA/NaN. + file_sheet_mapping (Optional[dict[str, Union[str, int, list[str]]]], optional): + Mapping of file names to sheet names or indices. The keys are file names + and the values are sheet names/indices. Used when multiple files are + involved. Defaults to None. + na_values (list[str], optional): Additional strings to recognize as NA/NaN. If list passed, the specific NA values for each column will be recognized. Defaults to None. - If None then the "DEFAULT_NA_VALUES" is assigned list(" ", "#N/A", "#N/A N/A", - "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", - "", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"). - If list passed, the specific NA values for each column will be recognized. - Defaults to None. - kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). Note that - `nrows` is not supported. + kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). + Note that `nrows` is not supported. Returns: pd.DataFrame: The resulting data as a pandas DataFrame. + + Raises: + ValueError: If the file extension is not supported or if `if_empty` is set to "fail" and the DataFrame is empty. + SKIP: If `if_empty` is set to "skip" and the DataFrame is empty. """ - excel_file = self._download_excel(url=url, **kwargs) - - if sheet_name: - df = excel_file.parse( - sheet_name=sheet_name, - keep_default_na=False, - na_values=na_values or self.DEFAULT_NA_VALUES, - **kwargs, + + if self._is_file(url): + df = self._load_and_parse( + file_url=url, sheet_name=sheet_name, na_values=na_values, **kwargs ) - df["sheet_name"] = sheet_name else: - sheets: list[pd.DataFrame] = [] - for sheet_name in excel_file.sheet_names: - sheet = excel_file.parse( - sheet_name=sheet_name, - keep_default_na=False, - na_values=na_values or self.DEFAULT_NA_VALUES, + if file_sheet_mapping: + df = self._handle_multiple_files( + url=url, + file_sheet_mapping=file_sheet_mapping, + na_values=na_values, **kwargs, ) - sheet["sheet_name"] = sheet_name - sheets.append(sheet) - df = pd.concat(sheets) + else: + list_of_urls = self.scan_sharepoint_folder(url) + dfs = [ + self._load_and_parse( + file_url=file_url, + sheet_name=sheet_name, + na_values=na_values, + **kwargs, + ) + for file_url in list_of_urls + ] + df = pd.concat(validate_and_reorder_dfs_columns(dfs)) if df.empty: try: @@ -247,11 +383,11 @@ def to_df( except SKIP: return pd.DataFrame() else: - self.logger.info(f"Successfully downloaded {len(df)} of data.") + self.logger.info(f"Successfully downloaded {len(df)} rows of data.") df_clean = cleanup_df(df) if tests: validate(df=df_clean, tests=tests) - return self._convert_all_to_string_type(df=df_clean) + return df_clean diff --git a/src/viadot/utils.py b/src/viadot/utils.py index 5ef1323cf..1efa1d353 100644 --- a/src/viadot/utils.py +++ b/src/viadot/utils.py @@ -680,3 +680,36 @@ def validate( raise ValidationError( f"Validation failed for {failed_tests} test(s): {failed_tests_msg}" ) + + +def validate_and_reorder_dfs_columns( + dataframes_list: list[pd.DataFrame], +) -> list[pd.DataFrame]: + """Validate if dataframes from the list have the same column structure. + + Reorder columns to match the first DataFrame if necessary. + + Args: + dataframes_list (list[pd.DataFrame]): List containing DataFrames. + + Raises: + IndexError: If the list of DataFrames is empty. + ValueError: If DataFrames have different column structures. + """ + if not dataframes_list: + message = "The list of dataframes is empty." + raise IndexError(message) + + first_df_columns = dataframes_list[0].columns + + # Check that all DataFrames have the same columns + for i, df in enumerate(dataframes_list): + if set(df.columns) != set(first_df_columns): + message = f"""DataFrame at index {i} does not have the same structure as + the first DataFrame.""" + raise ValueError(message) + if not df.columns.equals(first_df_columns): + # Reordering columns for DataFrame at index 'i' to match the first DataFrame. + dataframes_list[i] = df.loc[:, first_df_columns] + + return dataframes_list diff --git a/tests/viadot_tests/unit/test_file.xlsx b/tests/viadot_tests/unit/test_file.xlsx index 197881d2b..4a03e058a 100644 Binary files a/tests/viadot_tests/unit/test_file.xlsx and b/tests/viadot_tests/unit/test_file.xlsx differ diff --git a/tests/viadot_tests/unit/test_sharepoint.py b/tests/viadot_tests/unit/test_sharepoint.py index 6de4406d9..5a7255cd0 100644 --- a/tests/viadot_tests/unit/test_sharepoint.py +++ b/tests/viadot_tests/unit/test_sharepoint.py @@ -1,7 +1,13 @@ from pathlib import Path +from unittest.mock import MagicMock, patch import pandas as pd +import pytest +import sharepy +from sharepy.errors import AuthError +from viadot.exceptions import CredentialError from viadot.sources import Sharepoint +from viadot.sources.sharepoint import SharepointCredentials DUMMY_CREDS = {"site": "test", "username": "test2", "password": "test"} SAMPLE_DF = pd.DataFrame( @@ -16,40 +22,178 @@ class SharepointMock(Sharepoint): - def _download_excel(self, url=None): + def get_connection(self): + return sharepy.session.SharePointSession + + def _download_file_stream(self, url=None, **kwargs): + if "nrows" in kwargs: + raise ValueError("Parameter 'nrows' is not supported.") + return pd.ExcelFile(Path("tests/unit/test_file.xlsx")) -def test_sharepoint_default_na(): - s = SharepointMock(credentials=DUMMY_CREDS) - df = s.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES) +@pytest.fixture +def sharepoint_mock(): + return SharepointMock(credentials=DUMMY_CREDS) + + +def test_valid_credentials(): + credentials = { + "site": "tenant.sharepoint.com", + "username": "user@example.com", + "password": "password", + } + shrp_creds = SharepointCredentials(**credentials) + assert shrp_creds.site == credentials["site"] + assert shrp_creds.username == credentials["username"] + assert shrp_creds.password == credentials["password"] + + +def test_invalid_authentication(): + credentials = { + "site": "tenant.sharepoint.com", + "username": "user@example.com", + "password": "password", + } + + s = Sharepoint(credentials=credentials) + + # Patch the sharepy.connect method to simulate an authentication failure + with patch("sharepy.connect") as mock_connect: + mock_connect.side_effect = AuthError("Authentication failed") + + with pytest.raises( + CredentialError, + match="Could not authenticate to tenant.sharepoint.com with provided credentials.", + ): + s.get_connection() + + +def test_missing_username(): + credentials = {"site": "example.sharepoint.com", "password": "password"} + with pytest.raises( + CredentialError, + match="'site', 'username', and 'password' credentials are required.", + ): + SharepointCredentials(**credentials) + + +def test_sharepoint_default_na(sharepoint_mock): + df = sharepoint_mock.to_df( + url="test/file.xlsx", na_values=Sharepoint.DEFAULT_NA_VALUES + ) assert not df.empty assert "NA" not in list(df["col_a"]) -def test_sharepoint_custom_na(): - s = SharepointMock(credentials=DUMMY_CREDS) - df = s.to_df( - url="test", na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"] +def test_sharepoint_custom_na(sharepoint_mock): + df = sharepoint_mock.to_df( + url="test/file.xlsx", + na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"], ) assert not df.empty assert "NA" in list(df["col_a"]) -def test_sharepoint_convert_all_to_string_type(): - s = SharepointMock(credentials=DUMMY_CREDS) - converted_df = s._convert_all_to_string_type(df=SAMPLE_DF) +def test__get_file_extension(sharepoint_mock): + url_excel = "https://tenant.sharepoint.com/sites/site/file.xlsx" + url_dir = "https://tenant.sharepoint.com/sites/site/" + url_txt = "https://tenant.sharepoint.com/sites/site/file.txt" + + excel_ext = sharepoint_mock._get_file_extension(url=url_excel) + txt_ext = sharepoint_mock._get_file_extension(url=url_txt) + dir = sharepoint_mock._get_file_extension(url=url_dir) + + assert excel_ext == ".xlsx" + assert txt_ext == ".txt" + assert dir == "" + + +def test__is_file(sharepoint_mock): + is_file = sharepoint_mock._is_file(url="https://example.com/file.xlsx") + assert is_file is True + + is_file = sharepoint_mock._is_file(url="https://example.com/dir") + assert is_file is False + + +def test__parse_excel_single_sheet(sharepoint_mock): + excel_file = sharepoint_mock._download_file_stream() + result_df = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") + expected = pd.DataFrame( + { + "col_a": ["val1", "", "val2", "NA", "N/A", "#N/A"], + "col_b": ["val1", "val2", "val3", "val4", "val5", "val6"], + } + ) + + assert result_df["col_b"].equals(expected["col_b"]) + + +def test__parse_excel_string_dtypes(sharepoint_mock): + excel_file = sharepoint_mock._download_file_stream() + result_df = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") + + for column in result_df.columns: + assert result_df[column].dtype == object + + +def test__load_and_parse_not_valid_extension(sharepoint_mock): + with pytest.raises(ValueError): + sharepoint_mock._load_and_parse(file_url="https://example.com/file.txt") + + +def test_scan_sharepoint_folder_valid_url(sharepoint_mock): + url = "https://company.sharepoint.com/sites/site_name/final_folder/" + + # Mock the response from SharePoint + mock_response = MagicMock() + mock_response.json.return_value = { + "d": { + "results": [ + {"Name": "file1.txt"}, + {"Name": "file2.txt"}, + ] + } + } + + # Inject the mock response + sharepoint_mock.get_connection().get = MagicMock(return_value=mock_response) + + expected_files = [ + "https://company.sharepoint.com/sites/site_name/final_folder/file1.txt", + "https://company.sharepoint.com/sites/site_name/final_folder/file2.txt", + ] + + result = sharepoint_mock.scan_sharepoint_folder(url) + assert result == expected_files + + +def test_scan_sharepoint_folder_invalid_url(sharepoint_mock): + url = "https://company.sharepoint.com/folder/sub_folder/final_folder" + + with pytest.raises(ValueError, match="URL does not contain '/sites/' segment."): + sharepoint_mock.scan_sharepoint_folder(url) + + +def test_scan_sharepoint_folder_empty_response(sharepoint_mock): + url = ( + "https://company.sharepoint.com/sites/site_name/folder/sub_folder/final_folder" + ) + + mock_response = MagicMock() + mock_response.json.return_value = {"d": {"results": []}} + + sharepoint_mock.get_connection().get = MagicMock(return_value=mock_response) - assert not converted_df.empty - assert pd.isnull(converted_df["nan_col"]).all() + result = sharepoint_mock.scan_sharepoint_folder(url) + assert result == [] -def test_sharepoint_convert_empty_columns_to_string(): - s = SharepointMock(credentials=DUMMY_CREDS) - converted_df = s._empty_column_to_string(df=SAMPLE_DF) +def test_download_file_stream_unsupported_param(sharepoint_mock): + url = "https://company.sharepoint.com/sites/site_name/folder/test_file.xlsx" - assert not converted_df.empty - assert converted_df["float_col"].dtype == float - assert converted_df["nan_col"].dtype == "string" + with pytest.raises(ValueError, match="Parameter 'nrows' is not supported."): + sharepoint_mock._download_file_stream(url, nrows=10) diff --git a/tests/viadot_tests/unit/test_utils.py b/tests/viadot_tests/unit/test_utils.py index e3c84438d..d00b3283c 100644 --- a/tests/viadot_tests/unit/test_utils.py +++ b/tests/viadot_tests/unit/test_utils.py @@ -2,7 +2,7 @@ import logging import pandas as pd - +import pytest from viadot.exceptions import ValidationError from viadot.utils import ( _cast_df_cols, @@ -11,6 +11,7 @@ get_fqn, handle_api_request, validate, + validate_and_reorder_dfs_columns, ) @@ -251,3 +252,43 @@ def test_validate_column_sum_fail(caplog): with caplog.at_level(logging.INFO): validate(df, tests) assert "Sum of 10 for col1 is out of the expected range - <5:6>" in caplog.text + + +def test_validate_and_reorder_wrong_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"a": [5, 6], "c": [7, 8]}) + + with pytest.raises(ValueError): + validate_and_reorder_dfs_columns([df1, df2]) + + +def test_validate_and_reorder_empty_list(): + with pytest.raises(IndexError): + validate_and_reorder_dfs_columns([]) + + +def test_validate_and_reorder_identical_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"a": [5, 6], "b": [7, 8]}) + + result = validate_and_reorder_dfs_columns([df1, df2]) + + assert len(result) == 2 + assert list(result[0].columns) == list(df1.columns) + assert result[0].equals(df1) + assert list(result[1].columns) == list(df2.columns) + assert result[1].equals(df2) + + +def test_validate_and_reorder_different_order_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"b": [7, 8], "a": [5, 6]}) + + expected_df2 = pd.DataFrame({"a": [5, 6], "b": [7, 8]}) + result = validate_and_reorder_dfs_columns([df1, df2]) + + assert len(result) == 2 + assert list(result[0].columns) == list(df1.columns) + assert result[0].equals(df1) + assert list(result[1].columns) == list(expected_df2.columns) + assert result[1].equals(expected_df2)