From 3f2665e46fdc48e245d9f50419af5e267bc907ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zawadzki?= Date: Wed, 3 Jul 2024 12:03:19 +0200 Subject: [PATCH 01/15] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Relax=20sql-metadata?= =?UTF-8?q?=20version=20requirement=20(#940)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ⬆️ Relax sql-metadata version requirement * 📌 Update lockfiles --- pyproject.toml | 4 ++-- requirements-dev.lock | 2 +- requirements.lock | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a140fe7b..cb9b2a1a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "viadot2" -version = "2.0a25" +version = "2.0a26" description = "A simple data ingestion library to guide data flows from some places to other places." authors = [ { name = "acivitillo", email = "acivitillo@dyvenia.com" }, @@ -24,7 +24,7 @@ dependencies = [ "visions==0.7.5", "sharepy>=2.0.0, <2.1.0", "simple_salesforce==1.11.5", - "sql-metadata==2.3.0", + "sql-metadata>=2.3.0", "duckdb==0.5.1", "sendgrid==6.9.7", "pandas-gbq==0.19.1", diff --git a/requirements-dev.lock b/requirements-dev.lock index b738535d7..1014d8200 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -668,7 +668,7 @@ sniffio==1.3.1 # via httpx soupsieve==2.5 # via beautifulsoup4 -sql-metadata==2.3.0 +sql-metadata==2.12.0 # via viadot2 sqlalchemy==2.0.28 # via viadot2 diff --git a/requirements.lock b/requirements.lock index 98e93906a..42738a429 100644 --- a/requirements.lock +++ b/requirements.lock @@ -493,7 +493,7 @@ sniffio==1.3.1 # via anyio soupsieve==2.5 # via beautifulsoup4 -sql-metadata==2.3.0 +sql-metadata==2.12.0 # via viadot2 sqlalchemy==2.0.28 # via viadot2 From 4d8c2cf55396c6ea7547e072fa83942753bf2402 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 00:51:13 +0200 Subject: [PATCH 02/15] =?UTF-8?q?=E2=9C=A8=20Added=20`validate=5Fand=5Freo?= =?UTF-8?q?rder=5Fdfs=5Fcolumns`=20to=20utils?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/utils.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/viadot/utils.py b/src/viadot/utils.py index 5ef1323cf..c73e6e457 100644 --- a/src/viadot/utils.py +++ b/src/viadot/utils.py @@ -680,3 +680,34 @@ def validate( raise ValidationError( f"Validation failed for {failed_tests} test(s): {failed_tests_msg}" ) + +def validate_and_reorder_dfs_columns(dataframes_list: list[pd.DataFrame] + ) -> list[pd.DataFrame]: + """Validate if dataframes from the list have the same column structure. + + Reorder columns to match the first DataFrame if necessary. + + Args: + dataframes_list (list[pd.DataFrame]): List containing DataFrames. + + Raises: + IndexError: If the list of DataFrames is empty. + ValueError: If DataFrames have different column structures. + """ + if not dataframes_list: + message = "The list of dataframes is empty." + raise IndexError(message) + + first_df_columns = dataframes_list[0].columns + + # Check that all DataFrames have the same columns + for i, df in enumerate(dataframes_list): + if set(df.columns) != set(first_df_columns): + message = f"""DataFrame at index {i} does not have the same structure as + the first DataFrame.""" + raise ValueError(message) + if not df.columns.equals(first_df_columns): + # Reordering columns for DataFrame at index 'i' to match the first DataFrame. + dataframes_list[i] = df.loc[:, first_df_columns] + + return dataframes_list \ No newline at end of file From fd937371ddb127027df6df56a06c0de375904fd4 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 00:53:49 +0200 Subject: [PATCH 03/15] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Added=20new=20versio?= =?UTF-8?q?n=20of=20Sharepoint=20source=20class=20with=20additional=20func?= =?UTF-8?q?tions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 262 +++++++++++++++++++++---------- 1 file changed, 177 insertions(+), 85 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index d4f05f702..8e30a4e31 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -1,6 +1,7 @@ import io import os -from typing import Literal, Optional, Union +import re +from typing import Any, Literal, Optional, Union from urllib.parse import urlparse import pandas as pd @@ -13,7 +14,12 @@ from viadot.exceptions import CredentialError from viadot.signals import SKIP from viadot.sources.base import Source -from viadot.utils import add_viadot_metadata_columns, cleanup_df, validate +from viadot.utils import ( + add_viadot_metadata_columns, + cleanup_df, + validate, + validate_and_reorder_dfs_columns, +) class SharepointCredentials(BaseModel): @@ -34,44 +40,6 @@ def is_configured(cls, credentials): return credentials -def get_last_segment_from_url( - url: str, -) -> tuple[str, Literal["file"]] | tuple[str, Literal["directory"]]: - """ - Get the last part of the URL and determine if it represents a file or directory. - - This function parses the provided URL, extracts the last segment, and identifies - whether it corresponds to a file (based on the presence of a file extension) - or a directory. - - Args: - url (str): The URL to a SharePoint file or directory. - - Raises: - ValueError: If an invalid URL is provided. - - Returns: - tuple: A tuple where the first element is the last part of the URL (file extension - or folder name) and the second element is a string indicating the type: - - If a file URL is provided, returns (file extension, 'file'). - - If a folder URL is provided, returns (last folder name, 'directory'). - """ - path_parts = urlparse(url).path.split("/") - # Filter out empty parts - non_empty_parts = [part for part in path_parts if part] - - # Check if the last part has a file extension - if non_empty_parts: - last_part = non_empty_parts[-1] - _, extension = os.path.splitext(last_part) - if extension: - return extension, "file" - else: - return last_part, "directory" - else: - raise ValueError("Incorrect URL provided : '{url}'") - - class Sharepoint(Source): """ Download Excel files from Sharepoint. @@ -129,21 +97,99 @@ def download_file(self, url: str, to_path: list | str) -> None: ) conn.close() - def _download_excel(self, url: str, **kwargs) -> pd.ExcelFile: - endpoint_value, endpoint_type = get_last_segment_from_url(url) + def scan_sharepoint_folder(self, url: str) -> list[str]: + """Scan Sharepoint folder to get all file URLs. + + Args: + url (str): The URL of the folder to scan. + + Raises: + ValueError: Raises when URL have the wrong structure - without 'sites' segment. + + Returns: + list[str]: List of URLS. + """ + conn = self.get_connection() + + parsed_url = urlparse(url) + path_parts = parsed_url.path.split("/") + if "sites" in path_parts: + site_index = ( + path_parts.index("sites") + 2 + ) # +2 to include 'sites' and the next segment + site_url = f"{parsed_url.scheme}://{parsed_url.netloc}{'/'.join(path_parts[:site_index])}" + library = "/".join(path_parts[site_index:]) + else: + message = "URL does not contain '/sites/' segment." + raise ValueError(message) + + # -> site_url = company.sharepoint.com/sites/site_name/ + # -> library = /shared_documents/folder/sub_folder/final_folder + endpoint = ( + f"{site_url}/_api/web/GetFolderByServerRelativeUrl('{library}')/Files" + ) + response = conn.get(endpoint) + files = response.json().get("d", {}).get("results", []) + + return [f'{site_url}/{library}{file["Name"]}' for file in files] + + def _get_file_extension(self, url: str) -> str: + """ + Extracts the file extension from a URL. + + Parameters: + url (str): The URL to extract the file extension from. + + Returns: + str: The file extension, including the leading dot (e.g., '.xlsx'). + """ + # Parse the URL to get the path + parsed_url = urlparse(url) + + # Get the file extension + _, ext = os.path.splitext(parsed_url.path) + + return ext + + def _download_file_stream(self, url: str, **kwargs) -> pd.ExcelFile: if "nrows" in kwargs: raise ValueError("Parameter 'nrows' is not supported.") + conn = self.get_connection() - if endpoint_type == "file": - if endpoint_value != ".xlsx": - raise ValueError( - "Only Excel files with 'XLSX' extension can be loaded into a DataFrame." - ) - self.logger.info(f"Downloading data from {url}...") - response = conn.get(url) - bytes_stream = io.BytesIO(response.content) - return pd.ExcelFile(bytes_stream) + self.logger.info(f"Downloading data from {url}...") + response = conn.get(url) + bytes_stream = io.BytesIO(response.content) + + return pd.ExcelFile(bytes_stream) + + def _is_file(self, url: str) -> bool: + """ + Determines whether a provided URL points to a file based on its structure. + + This function uses a regular expression to check if the URL ends with a + common file extension. It does not make any network requests and purely + relies on the URL structure for its determination. + + Parameters: + url (str): The URL to be checked. + + Returns: + bool: True if the URL is identified as a file based on its extension, + False otherwise. + + Example: + >>> _is_file("https://example.com/file.xlsx") + True + >>> _is_file("https://example.com/folder/") + False + >>> _is_file("https://example.com/folder") + False + """ + # Regular expression for matching file extensions + file_extension_pattern = re.compile(r"\.[a-zA-Z0-9]+$") + + return bool(file_extension_pattern.search(url)) def _convert_all_to_string_type(self, df: pd.DataFrame) -> pd.DataFrame: """Convert all column data types in the DataFrame to strings. @@ -181,14 +227,63 @@ def _empty_column_to_string(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].astype("string") return df + def _handle_multiple_files( + self, + url: str, + file_sheet_mapping: dict, + na_values: Optional[list[str]] = None, + ): + dfs = [ + self._load_and_parse( + file_url=url + file, sheet_name=sheet, na_values=na_values + ) + for file, sheet in file_sheet_mapping.items() + ] + return pd.concat(validate_and_reorder_dfs_columns(dfs)) + + def _load_and_parse( + self, + file_url: str, + sheet_name: Optional[Union[str, list[str]]] = None, + na_values: Optional[list[str]] = None, + **kwargs, + ): + file_extension = self._get_file_extension(file_url) + file_stream = self._download_file_stream(file_url) + + if file_extension == ".xlsx": + return self._parse_excel(file_stream, sheet_name, na_values, **kwargs) + else: + raise ValueError("Only Excel (.xlsx) files can be loaded into a DataFrame.") + + def _parse_excel( + self, + excel_file, + sheet_name: Optional[Union[str, list[str]]] = None, + na_values: Optional[list[str]] = None, + **kwargs, + ): + return pd.concat( + [ + excel_file.parse( + sheet, + keep_default_na=False, + na_values=na_values or self.DEFAULT_NA_VALUES, + **kwargs, + ) + for sheet in ([sheet_name] if sheet_name else excel_file.sheet_names) + ] + ) + @add_viadot_metadata_columns def to_df( self, url: str, - sheet_name: Optional[Union[str, list, int]] = None, - if_empty: str = "warn", - tests: dict = {}, - na_values: list[str] | None = None, + sheet_name: Optional[Union[str, list[str]]] = None, + if_empty: Literal["warn", "skip", "fail"] = "warn", + tests: dict[str, Any] = {}, + file_sheet_mapping: Optional[dict[str, Union[str, int, list[str]]]] = None, + na_values: Optional[list[str]] = None, **kwargs, ) -> pd.DataFrame: """ @@ -200,46 +295,43 @@ def to_df( Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. Specify None to get all worksheets. Defaults to None. - if_empty (str, optional): What to do if the file is empty. Defaults to "warn". - tests (Dict[str], optional): A dictionary with optional list of tests + if_empty (Literal["warn", "skip", "fail"], optional): What to do if the file + is empty. Defaults to "warn". + tests (Dict[str, Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate` function from utils. Defaults to None. - na_values (list[str] | None): Additional strings to recognize as NA/NaN. + na_values (list[str], optional): Additional strings to recognize as NA/NaN. If list passed, the specific NA values for each column will be recognized. Defaults to None. - If None then the "DEFAULT_NA_VALUES" is assigned list(" ", "#N/A", "#N/A N/A", - "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", - "", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"). - If list passed, the specific NA values for each column will be recognized. - Defaults to None. - kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). Note that - `nrows` is not supported. + kwargs (dict[str, Any], optional): Keyword arguments to pass to pd.ExcelFile.parse(). + Note that `nrows` is not supported. Returns: pd.DataFrame: The resulting data as a pandas DataFrame. """ - excel_file = self._download_excel(url=url, **kwargs) - - if sheet_name: - df = excel_file.parse( - sheet_name=sheet_name, - keep_default_na=False, - na_values=na_values or self.DEFAULT_NA_VALUES, - **kwargs, - ) - df["sheet_name"] = sheet_name + + if self._is_file(url): + df = self._load_and_parse(file_url=url, sheet_name=sheet_name, **kwargs) else: - sheets: list[pd.DataFrame] = [] - for sheet_name in excel_file.sheet_names: - sheet = excel_file.parse( - sheet_name=sheet_name, - keep_default_na=False, - na_values=na_values or self.DEFAULT_NA_VALUES, + if file_sheet_mapping: + df = self._handle_multiple_files( + url=url, + file_sheet_mapping=file_sheet_mapping, + na_values=na_values, **kwargs, ) - sheet["sheet_name"] = sheet_name - sheets.append(sheet) - df = pd.concat(sheets) + else: + list_of_urls = self.scan_sharepoint_folder(url) + dfs = [ + self._load_and_parse( + file_url=file_url, + sheet_name=sheet_name, + na_values=na_values, + **kwargs, + ) + for file_url in list_of_urls + ] + df = pd.concat(validate_and_reorder_dfs_columns(dfs)) if df.empty: try: @@ -247,7 +339,7 @@ def to_df( except SKIP: return pd.DataFrame() else: - self.logger.info(f"Successfully downloaded {len(df)} of data.") + self.logger.info(f"Successfully downloaded {len(df)} rows of data.") df_clean = cleanup_df(df) From 16e44a9241ba0ee0a673a25d5d4f8582193b3504 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 01:05:06 +0200 Subject: [PATCH 04/15] =?UTF-8?q?=E2=9C=85=20added=20tests=20for=20`valida?= =?UTF-8?q?te=5Fand=5Freorder=5Fdfs=5Fcolumns`=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_utils.py | 43 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e3c84438d..d00b3283c 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -2,7 +2,7 @@ import logging import pandas as pd - +import pytest from viadot.exceptions import ValidationError from viadot.utils import ( _cast_df_cols, @@ -11,6 +11,7 @@ get_fqn, handle_api_request, validate, + validate_and_reorder_dfs_columns, ) @@ -251,3 +252,43 @@ def test_validate_column_sum_fail(caplog): with caplog.at_level(logging.INFO): validate(df, tests) assert "Sum of 10 for col1 is out of the expected range - <5:6>" in caplog.text + + +def test_validate_and_reorder_wrong_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"a": [5, 6], "c": [7, 8]}) + + with pytest.raises(ValueError): + validate_and_reorder_dfs_columns([df1, df2]) + + +def test_validate_and_reorder_empty_list(): + with pytest.raises(IndexError): + validate_and_reorder_dfs_columns([]) + + +def test_validate_and_reorder_identical_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"a": [5, 6], "b": [7, 8]}) + + result = validate_and_reorder_dfs_columns([df1, df2]) + + assert len(result) == 2 + assert list(result[0].columns) == list(df1.columns) + assert result[0].equals(df1) + assert list(result[1].columns) == list(df2.columns) + assert result[1].equals(df2) + + +def test_validate_and_reorder_different_order_columns(): + df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df2 = pd.DataFrame({"b": [7, 8], "a": [5, 6]}) + + expected_df2 = pd.DataFrame({"a": [5, 6], "b": [7, 8]}) + result = validate_and_reorder_dfs_columns([df1, df2]) + + assert len(result) == 2 + assert list(result[0].columns) == list(df1.columns) + assert result[0].equals(df1) + assert list(result[1].columns) == list(expected_df2.columns) + assert result[1].equals(expected_df2) From 8aed3c36ebdd7eee22590a8c40949a8235fbccdd Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 01:10:12 +0200 Subject: [PATCH 05/15] =?UTF-8?q?=E2=9C=85=20Created=20`sharepoint=5Fmock`?= =?UTF-8?q?=20function=20and=20changed=20function=20name=20to=20`=5Fdownlo?= =?UTF-8?q?ad=5Ffile=5Fstream`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_sharepoint.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index 6de4406d9..4960ad78c 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -1,6 +1,7 @@ from pathlib import Path import pandas as pd +import pytest from viadot.sources import Sharepoint DUMMY_CREDS = {"site": "test", "username": "test2", "password": "test"} @@ -16,21 +17,24 @@ class SharepointMock(Sharepoint): - def _download_excel(self, url=None): + def _download_file_stream(self, url=None): return pd.ExcelFile(Path("tests/unit/test_file.xlsx")) -def test_sharepoint_default_na(): - s = SharepointMock(credentials=DUMMY_CREDS) - df = s.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES) +@pytest.fixture +def sharepoint_mock(): + return SharepointMock(credentials=DUMMY_CREDS) + + +def test_sharepoint_default_na(sharepoint_mock): + df = sharepoint_mock.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES) assert not df.empty assert "NA" not in list(df["col_a"]) -def test_sharepoint_custom_na(): - s = SharepointMock(credentials=DUMMY_CREDS) - df = s.to_df( +def test_sharepoint_custom_na(sharepoint_mock): + df = sharepoint_mock.to_df( url="test", na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"] ) @@ -38,17 +42,15 @@ def test_sharepoint_custom_na(): assert "NA" in list(df["col_a"]) -def test_sharepoint_convert_all_to_string_type(): - s = SharepointMock(credentials=DUMMY_CREDS) - converted_df = s._convert_all_to_string_type(df=SAMPLE_DF) +def test_sharepoint_convert_all_to_string_type(sharepoint_mock): + converted_df = sharepoint_mock._convert_all_to_string_type(df=SAMPLE_DF) assert not converted_df.empty assert pd.isnull(converted_df["nan_col"]).all() -def test_sharepoint_convert_empty_columns_to_string(): - s = SharepointMock(credentials=DUMMY_CREDS) - converted_df = s._empty_column_to_string(df=SAMPLE_DF) +def test_sharepoint_convert_empty_columns_to_string(sharepoint_mock): + converted_df = sharepoint_mock._empty_column_to_string(df=SAMPLE_DF) assert not converted_df.empty assert converted_df["float_col"].dtype == float From 51b32cfa6e4ca57083bb1d1f0fda9f5fee7d2c4a Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 01:26:06 +0200 Subject: [PATCH 06/15] =?UTF-8?q?=F0=9F=93=9D=20Updated=20docstring=20for?= =?UTF-8?q?=20Sharepoint=20source=20class=20and=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 100 +++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 12 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 8e30a4e31..60a894d1a 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -63,6 +63,16 @@ def __init__( super().__init__(*args, credentials=validated_creds, **kwargs) def get_connection(self) -> sharepy.session.SharePointSession: + """Establishes a connection to SharePoint using credentials provided during + object initialization. + + Returns: + sharepy.session.SharePointSession: A session object representing + the authenticated connection. + + Raises: + CredentialError: If authentication to SharePoint fails due to incorrect credentials. + """ try: connection = sharepy.connect( site=self.credentials.get("site"), @@ -77,8 +87,7 @@ def get_connection(self) -> sharepy.session.SharePointSession: return connection def download_file(self, url: str, to_path: list | str) -> None: - """ - Download a file from Sharepoint. + """Download a file from Sharepoint to specific location. Args: url (str): The URL of the file to be downloaded. @@ -98,16 +107,17 @@ def download_file(self, url: str, to_path: list | str) -> None: conn.close() def scan_sharepoint_folder(self, url: str) -> list[str]: - """Scan Sharepoint folder to get all file URLs. + """Scan Sharepoint folder to get all file URLs of all files within it. Args: url (str): The URL of the folder to scan. Raises: - ValueError: Raises when URL have the wrong structure - without 'sites' segment. + ValueError: If the provided URL does not contain the expected '/sites/' segment. Returns: - list[str]: List of URLS. + list[str]: List of URLs pointing to each file within the specified + SharePoint folder. """ conn = self.get_connection() @@ -135,10 +145,10 @@ def scan_sharepoint_folder(self, url: str) -> list[str]: def _get_file_extension(self, url: str) -> str: """ - Extracts the file extension from a URL. + Extracts the file extension from a given URL. Parameters: - url (str): The URL to extract the file extension from. + url (str): The URL from which to extract the file extension. Returns: str: The file extension, including the leading dot (e.g., '.xlsx'). @@ -152,6 +162,15 @@ def _get_file_extension(self, url: str) -> str: return ext def _download_file_stream(self, url: str, **kwargs) -> pd.ExcelFile: + """Downloads the content of a file from SharePoint and returns it as an in-memory + byte stream. + + Args: + url (str): The URL of the file to download. + + Returns: + io.BytesIO: An in-memory byte stream containing the file content. + """ if "nrows" in kwargs: raise ValueError("Parameter 'nrows' is not supported.") @@ -164,8 +183,7 @@ def _download_file_stream(self, url: str, **kwargs) -> pd.ExcelFile: return pd.ExcelFile(bytes_stream) def _is_file(self, url: str) -> bool: - """ - Determines whether a provided URL points to a file based on its structure. + """Determines whether a provided URL points to a file based on its structure. This function uses a regular expression to check if the URL ends with a common file extension. It does not make any network requests and purely @@ -233,6 +251,21 @@ def _handle_multiple_files( file_sheet_mapping: dict, na_values: Optional[list[str]] = None, ): + """Handles download and parsing of multiple Excel files from a SharePoint folder. + + Args: + url (str): The base URL of the SharePoint folder containing the files. + file_sheet_mapping (dict): A dictionary mapping file names to sheet names + or indexes. The keys are file names, and the values are sheet names/indices. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. + + Returns: + pd.DataFrame: A concatenated DataFrame containing the data from all + specified files and sheets. + + Raises: + ValueError: If the file extension is not supported. + """ dfs = [ self._load_and_parse( file_url=url + file, sheet_name=sheet, na_values=na_values @@ -248,6 +281,21 @@ def _load_and_parse( na_values: Optional[list[str]] = None, **kwargs, ): + """Loads and parses an Excel file from a URL. + + Args: + file_url (str): The URL of the file to download and parse. + sheet_name (Optional[Union[str, list[str]]]): The name(s) or index(es) of + the sheet(s) to parse. If None, all sheets are parsed. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. + **kwargs: Additional keyword arguments to pass to the pandas read function. + + Returns: + pd.DataFrame: The parsed data as a pandas DataFrame. + + Raises: + ValueError: If the file extension is not supported. + """ file_extension = self._get_file_extension(file_url) file_stream = self._download_file_stream(file_url) @@ -263,6 +311,18 @@ def _parse_excel( na_values: Optional[list[str]] = None, **kwargs, ): + """Parses an Excel file into a DataFrame. + + Args: + excel_file: An ExcelFile object containing the data to parse. + sheet_name (Optional[Union[str, list[str]]]): The name(s) or index(es) of + the sheet(s) to parse. If None, all sheets are parsed. + na_values (Optional[list[str]]): Additional strings to recognize as NA/NaN. + **kwargs: Additional keyword arguments to pass to the pandas read function. + + Returns: + pd.DataFrame: The parsed data as a pandas DataFrame. + """ return pd.concat( [ excel_file.parse( @@ -287,7 +347,11 @@ def to_df( **kwargs, ) -> pd.DataFrame: """ - Load an Excel file into a pandas DataFrame. + Load an Excel file or files from a SharePoint URL into a pandas DataFrame. + + This method handles downloading the file(s), parsing the content, and converting + it into a pandas DataFrame. It supports both single file URLs and folder URLs + with multiple files. Args: url (str): The URL of the file to be downloaded. @@ -295,11 +359,19 @@ def to_df( Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. Specify None to get all worksheets. Defaults to None. - if_empty (Literal["warn", "skip", "fail"], optional): What to do if the file - is empty. Defaults to "warn". + if_empty (Literal["warn", "skip", "fail"], optional): Action to take if + the DataFrame is empty. + - "warn": Logs a warning. + - "skip": Skips the operation. + - "fail": Raises an error. + Defaults to "warn". tests (Dict[str, Any], optional): A dictionary with optional list of tests to verify the output dataframe. If defined, triggers the `validate` function from utils. Defaults to None. + file_sheet_mapping (Optional[dict[str, Union[str, int, list[str]]]], optional): + Mapping of file names to sheet names or indices. The keys are file names + and the values are sheet names/indices. Used when multiple files are + involved. Defaults to None. na_values (list[str], optional): Additional strings to recognize as NA/NaN. If list passed, the specific NA values for each column will be recognized. Defaults to None. @@ -308,6 +380,10 @@ def to_df( Returns: pd.DataFrame: The resulting data as a pandas DataFrame. + + Raises: + ValueError: If the file extension is not supported or if `if_empty` is set to "fail" and the DataFrame is empty. + SKIP: If `if_empty` is set to "skip" and the DataFrame is empty. """ if self._is_file(url): From 1ebe9d9a7fbed9b476b4edb317f97e775fbaad11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zawadzki?= Date: Wed, 3 Jul 2024 12:03:19 +0200 Subject: [PATCH 07/15] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Relax=20sql-metadata?= =?UTF-8?q?=20version=20requirement=20(#940)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ⬆️ Relax sql-metadata version requirement * 📌 Update lockfiles --- pyproject.toml | 4 ++-- requirements-dev.lock | 2 +- requirements.lock | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a140fe7b..cb9b2a1a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "viadot2" -version = "2.0a25" +version = "2.0a26" description = "A simple data ingestion library to guide data flows from some places to other places." authors = [ { name = "acivitillo", email = "acivitillo@dyvenia.com" }, @@ -24,7 +24,7 @@ dependencies = [ "visions==0.7.5", "sharepy>=2.0.0, <2.1.0", "simple_salesforce==1.11.5", - "sql-metadata==2.3.0", + "sql-metadata>=2.3.0", "duckdb==0.5.1", "sendgrid==6.9.7", "pandas-gbq==0.19.1", diff --git a/requirements-dev.lock b/requirements-dev.lock index b738535d7..1014d8200 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -668,7 +668,7 @@ sniffio==1.3.1 # via httpx soupsieve==2.5 # via beautifulsoup4 -sql-metadata==2.3.0 +sql-metadata==2.12.0 # via viadot2 sqlalchemy==2.0.28 # via viadot2 diff --git a/requirements.lock b/requirements.lock index 98e93906a..42738a429 100644 --- a/requirements.lock +++ b/requirements.lock @@ -493,7 +493,7 @@ sniffio==1.3.1 # via anyio soupsieve==2.5 # via beautifulsoup4 -sql-metadata==2.3.0 +sql-metadata==2.12.0 # via viadot2 sqlalchemy==2.0.28 # via viadot2 From 42549eb93fdd05448e25941ac4495a1fd487bd57 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 01:29:29 +0200 Subject: [PATCH 08/15] =?UTF-8?q?=F0=9F=9A=A7=20Modified=20`validate=5Fand?= =?UTF-8?q?=5Freorder=5Fdfs=5Fcolumns`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/viadot/utils.py b/src/viadot/utils.py index c73e6e457..1efa1d353 100644 --- a/src/viadot/utils.py +++ b/src/viadot/utils.py @@ -681,8 +681,10 @@ def validate( f"Validation failed for {failed_tests} test(s): {failed_tests_msg}" ) -def validate_and_reorder_dfs_columns(dataframes_list: list[pd.DataFrame] - ) -> list[pd.DataFrame]: + +def validate_and_reorder_dfs_columns( + dataframes_list: list[pd.DataFrame], +) -> list[pd.DataFrame]: """Validate if dataframes from the list have the same column structure. Reorder columns to match the first DataFrame if necessary. @@ -707,7 +709,7 @@ def validate_and_reorder_dfs_columns(dataframes_list: list[pd.DataFrame] the first DataFrame.""" raise ValueError(message) if not df.columns.equals(first_df_columns): - # Reordering columns for DataFrame at index 'i' to match the first DataFrame. + # Reordering columns for DataFrame at index 'i' to match the first DataFrame. dataframes_list[i] = df.loc[:, first_df_columns] - return dataframes_list \ No newline at end of file + return dataframes_list From c7327bd13d0f2d446ef51631033520b123703057 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Fri, 5 Jul 2024 01:47:08 +0200 Subject: [PATCH 09/15] =?UTF-8?q?=F0=9F=90=9B=20Added=20`na=5Fvalues`=20to?= =?UTF-8?q?=20`=5Fload=5Fand=5Fparse`=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 4 +++- tests/unit/test_sharepoint.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 60a894d1a..9c3b0b41f 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -387,7 +387,9 @@ def to_df( """ if self._is_file(url): - df = self._load_and_parse(file_url=url, sheet_name=sheet_name, **kwargs) + df = self._load_and_parse( + file_url=url, sheet_name=sheet_name, na_values=na_values, **kwargs + ) else: if file_sheet_mapping: df = self._handle_multiple_files( diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index 4960ad78c..e632935f4 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -2,6 +2,7 @@ import pandas as pd import pytest +import sharepy from viadot.sources import Sharepoint DUMMY_CREDS = {"site": "test", "username": "test2", "password": "test"} @@ -17,6 +18,9 @@ class SharepointMock(Sharepoint): + def get_connection(self): + return sharepy.session.SharePointSession + def _download_file_stream(self, url=None): return pd.ExcelFile(Path("tests/unit/test_file.xlsx")) @@ -27,7 +31,9 @@ def sharepoint_mock(): def test_sharepoint_default_na(sharepoint_mock): - df = sharepoint_mock.to_df(url="test", na_values=Sharepoint.DEFAULT_NA_VALUES) + df = sharepoint_mock.to_df( + url="test/file.xlsx", na_values=Sharepoint.DEFAULT_NA_VALUES + ) assert not df.empty assert "NA" not in list(df["col_a"]) @@ -35,7 +41,8 @@ def test_sharepoint_default_na(sharepoint_mock): def test_sharepoint_custom_na(sharepoint_mock): df = sharepoint_mock.to_df( - url="test", na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"] + url="test/file.xlsx", + na_values=[v for v in Sharepoint.DEFAULT_NA_VALUES if v != "NA"], ) assert not df.empty From 8f137d1e7ebfa10d348cc226ff347bd2e6f3fbbb Mon Sep 17 00:00:00 2001 From: rziemianek Date: Mon, 8 Jul 2024 15:03:36 +0200 Subject: [PATCH 10/15] =?UTF-8?q?=F0=9F=90=9B=20Added=20tests=20for=20Shar?= =?UTF-8?q?epoint=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 4 +- tests/unit/test_file.xlsx | Bin 4812 -> 11067 bytes tests/unit/test_sharepoint.py | 114 +++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 9c3b0b41f..9163319ca 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -148,10 +148,10 @@ def _get_file_extension(self, url: str) -> str: Extracts the file extension from a given URL. Parameters: - url (str): The URL from which to extract the file extension. + url (str): The URL from which to extract the file extension. Returns: - str: The file extension, including the leading dot (e.g., '.xlsx'). + str: The file extension, including the leading dot (e.g., '.xlsx'). """ # Parse the URL to get the path parsed_url = urlparse(url) diff --git a/tests/unit/test_file.xlsx b/tests/unit/test_file.xlsx index 197881d2b04cd201a0c54c19855133383f6debae..4a03e058ad12b7605c54ac7b45fcf029122abea1 100644 GIT binary patch literal 11067 zcmeHN1z42bwx+v#2myhS)PX@7N$C`&k!Cy!cbzRQlKOP4nm|~nTZHR4PbSSY$q2=Wp7tq#aHbioC?dl=ZiM?6$HmI}|q{bf;S_CO~#2F5T9l!`ZYhdCkb6knlh?_eA66Q%LAp@^E8=JeDBeFq#adza#tVz5JpRI`&5w( zoLy~1OhBC5IhP;uExV6#Ad*CR%tPx-VsMOnU^1Qe9`%E=pwcOPtA{r@7$JZwhPh&m zX+%o&OaPtdoP2Mh?NMFFpl;g)(Pwxn#b(w@G1s<$BH+_HsIi38cdv3&L@KY&whErZ zxrrj?cCL?%EEYviC-b{XZkOvS$)cj68kqSY!^6R$BErEb{j;Iu`2SG;t3nC>|4{zf zP&!%Ln%kKg8{6C6yZZB!;1jxvT9V|b(~5;t@I^ymf)|k6@16zcY;@=3R8DHg^>Iex8TR_y7N;(GzPY<`kj~2I1p!HM}dEcnHNq3I+^|%`b7Z0}#s?T;E zIYtt7x?)SZ-wb9Q4Sv_1Ecn(r;&`xe+-$UCCYYQwl~h`j_-Un7A*Q5daOMondWfTS z_3Wf&pg;gbwRsmP;!5B9rccEFT8m9-i za}Tmp?SHI>&DNWqwqS9#eLUb)O)Iofb@;-ja`@b4XlM?U9FAKsq-IUsdicF4Y|L)* z`=ZB*i^~yYJ2oPJ6H?#B4$yVDV2Hl3P6s9VZW(0@xSdyTm6<@o>DVmeK8xSF8wpLT zruEQBrp*6P9vhN-bZ^b4d(ZZu^I~q|)Yieyy`uU#HO^_(;q#(eK@cx*g1}p+gO+;I zoj2W+L#XAk+nc$ApU13UKDby-KO0T{jxg2j;#fQKA?6}(N)aCiw`XY+G88*TtrmNs zk!XF-@$>APgj?YkRaGvmcdfrm2x;k9eLr_6TkqY$!Ho&YJ*kIOhd4Q%jjxA0j}4u@ zccMz_DJy}d?X52Ay>RMj#>%IBux7(OV%d*doztze@qO9EX?Lf6*YwAc<8VN*Q!h)? z5V6pt+Nav!Lx+}?o#0({_L04lVf*yVxssS4(9DbC+1AwrDyEdWZX$E8YmljbfhYK))E5@cR*Zg zdW@-i-fL2@wSas0bf%NKwzer&h|A6Io^J2u#q9Ee;O0jz*JqRpB)lJ=-7YQW^18K2rtB%f)@@$jc92dHvCKBxV$eC9KbC(!o^a)U-!rp$Iu`Ajnod8k=}+<;L6 zaDX%Z%4E)=16@gw8#1Z_4&0Cb&~7iEd1(S9boCke1Hqi}DeYMDCZ-%>P=I^dic^An zF7JX{H`A1b!jY9-NBJ9-x~aq;*h|ryJX!9YybGaE8{rcpeLO=vp`LF%(>!xMOW>-7 z^~lgz5xvZ43bBz|y&CxN;!qJzg!O39SdqMfXbSO>B)l7B@Z!)B_(k*>(O8ka-qREk zBei-rXyV0TBDjg@fzVh{yxM6B0Z0-)4bSl6uo0?7^mx!%QN31Z3aOA(5!^)el+ai)yrgLh znUEy>8glUBh!Ltq_0-W=F}=)a3)zrb{Tiz9;z$urMD_I0Sh2iWpX@Ztan z{9;Csz;&$1B-&!IxTAkt58epi_TMoc@1UN;HL<)Eo(??)AbFrQCW~%;#yjOj^1$=@ zXK?=UPy9Fhr_?%DP!eqsnBLLsr0<6;J1~9&%t7 zs6)KKKVFl#zemy`=9y5e{K~igFEEz z-PrB)VK<-U#iq;e4n&rtF{det8`$DJi7{Z13pb)6+Ks;NusA zK>qI;8+$;*02KaaJqX!4mQRv?AsEZi|Nop3ZJ^%YzkSi*!PMKU0(fU{-2~RF=$$#* zymYTPKRT_OHTih%tmg3Js_Uny9A+TLhJ&-n`J=9%?x%oqt-sQ>HMY3kW!=4eAJ!|2 z(FDRnnI?1&0J%ZG`2})rNcdW1+|rZE#I?9ZzuiCfCMuj` zq2Nl2l_}sa;{2GDQpzIOfTbx5elNCNla02t3h5i$+w&rwDaBwM1s&{>-8sKL_{~an zAUjqx{qytD+U6(-iMUQsLGT{orSg%Vy(V7*TOl({{rJ1e$F&$-^_V}EkJl4cZ$NMo z$$Fcf!w#SeX*{L-4+mYrPo9+PRJWq5f+Wh&SH^n1KHYX!PMZKM?eOY`vV94l%g;w# z^ezOJ2!8&^Hcq1&WfD75KbcI*GDbka#gM1r13n?dJfXda*$>@(=vP!q^bS8>M|uPZ zhxS+>!>?I$JX^OcRE2SnZVJBN7OPVqKi_M!iv3*#=f0HGp&(Y-1c5K6KVL8fuPD!l zl*fAyA8Oj9^rv~HA=vM$r0(kOa8<;HqRI3ChI z%ttkc+q$0>5c}g8ti_S+ z5vKM;js_|6uCcvrnrGs}zD2BrrT&`q*?vSd6Hm6pa^i3t`X=8tC-79dsa}D|JH^?7-bJNz=UM>m=r;{;5@0ZTAf)4YPR&Ya zDVQU$Gk!oW&!oq67R!psFh3M~+BK7taB@A4c>5Dz()R90J6_sZ{r-dGX`4))gwc@^ zwirT7Bh$wD)o9B7RSM&6-)!?vT^Z|&%Hxvg3*^guTMgg(z>EL~H;VE{`SypLyUM!X z<6IdgzS(FP+JuZ?LxddlFpxGVd~(u)2=lIX&uJI;sqtCj9mhM=66W;t0*De48iEN= zn=%9~4;n1ec)P%2ZpWyvC=SEX@OE?2KK15-sGmO4f%A?sGehhA^D4Y zlsPQ>^uerS=ogm4A2~GA7!D7U0#^iwK&a|bUC7ENSb_E}zC)PV9 z7fuHuE59~)QsfGM83aH0J>t?oOS+#L+3%3daVL#Vp=07{)jsP^Q6Xl&0w(5!}|B65C1%u3m7kHDR?6dnuoK+8(JIyVZ`lxn^mx zNS;?Pnq0GdA@vqhz}g=prIPeu$Xg1!{DSOEqDPWm5p|xWSLItr@YloFTB#Y&c7(Vq z40KF#ecawU)15rp|3$i>_Tkm=ZB)a6^O=kaP0sEIxc9Vqm09T1gtoG`EOHz8hgEGm zbA-|(G7@%@c=~p%^9E+x0LzG5=rVCxi4G*2>@LB_jcEwM@VP%kyKmWXBTEjVB@t49 zXLRnA88#);yC=bQWPhytc8tQAXjU(0@mU~#evKi;SE`rzYjW%g|LzQYNVB=q&D7&Gp5a_JF zdv?b;(JVI8@g+y>PqrBlQ97<-8o}5m+pddYT@ibn2K~u4p);l+IvCq3{yf{R((U)y zrYa(y;7BK0{sMBWHI!vB|_YtU^DVMvx6eCpadX5D9$engDvf)|Q_B@nut-#1wkK zYA{uhe9VUL1=l<6xV&!!?)i3l-RsNLV`UrfmQ4pOZxedd;Em4F5zEpaze+{wnW|Q- zSb(vO#6Y>o{fcd=Mt#wLpKXLck9m7jV@uYMch|%=;X1wzhGomZ@5~ z_{|e)%_pm6tKg#TjK*OEF0=DGGB-_-+H#QLQr5N%7GBgPsmc^9~9k!L5t4_CLwhZ zi904)5NLiR?-DBdikjF`BGOO;96b}=IGSAFDiof|NmDJrW8&K$>`NG&5p_4J6b}!S z$P^afi^nv*NdF@y>Wf&kzvmCJcdu~^XKHoM@dfI}9^u>q>4?c*(2Q!Qq6jbGl@HaE zt06YIEipXEnx&X^btm`q5q`+7J!VI8KABsDl1+|NX@k?&Np-YrDvJt)RdLEJz|jU@q)2) zh-hoi`_uTuQAxl#bf*?!c{XRG-P4^}$`{eEdXXuULGUTQfEDhiII6wClIYq2W;S*X z#6f+?M0Wie-TSmxk7RIv1Vt5y$`bI(H`j8k201>9(x!i=es8F;g>sF3y7DxsV?JV9 zl+AFB*g#q3GXhbiQUJPw?OHlziM5L}^V-Td6Dq4#^#Zr!Yf%ji$j*KQ*RCrGriy2Q zwC=qo^*oj)DtvY)-8)TXJsAePsRsH&SqOk9%BaZG#pZ`kGtdIzR0v`ma!wTo1>G{T zqDHXC0O-~$N%>!>@LL_%fg_xP2wZ-bCrB0&ivaB(AX} zz>%7hyFN=2{CiVG*vr$y!vobtg`tf z(UfMVU2FX;p(dq+N^K#0(xKCD&pp=JO$?OI)|yc{_a)QyGk_YY)wU$4h8H^MZpgj7 zscBThvX^(DIM!Z#gnD_b8K-&aGs5nm<>UO(*-rG6Xm+Ohw#G&(_O_RIB`-Ja4Igwh1OoFojQ|IqC8UU|)0x=n0xTduL&M>)8 zbM&+IJ;EW$qrncuZdK*YA0FjjJgJHiRI{)H*9r`t!K!CewB_l<>IT^QqW-X&hP51* zw`A??T`Y{Rmj3avh}9Sf&t2p~c`Rfm|rkoa9?#Q@Lm%+}=t`!== zTDAv953n>R%O9b5V|`mH#ZIZ#?+Q^R|E|_29TJa{I0G+M4ym(?8hT*GkzhHr7T)#T z*vz0}gM)>)7-JUA}+p*x0`yDeO!>T3NwAfS?To`W-LRs z(l+wKMnZU03R_oR_=k%gk&+*_6Tdkt<+UN1KL50ln;5g;dirqKmQH0RNkv=Iblngk zta$o%8-cki@B?0_()2yff~oDqt-~KuS*0%%9AycWjGS3T__Pd!Gxw~Pl{!<23A)t} z!k^{uTOO_-77laNDODncz2AGmIUc^4UTQ6sbM$D5D2YOPi9vyj zZKE;T1jl)3Wd=v9SOS=U^g1FnyEkL*-U_C$Bd#$BAH`pQY3V-}NbViX-!5pHsN9+V^q3m=Rl&mCb>Ueki zG41;-aTDR!Mk%V$uWQn)3sSX|?+Y~Ve^&|AXkrMd*GwwgWMaye zjl91Y!ezC`+(NetPs0{Rw>Bn2fro0)Rrh#Dad~&#ZUd=dTLzNXSGmy*anK{ zr`SNnx)}r|!Vs0O5|Nq^<@GGOBP6pPw9XVUW|%FdLiCU|^+Vo4p35dwa?(ObzH_qi zh}^Oj%D9M?C#o~XF;{SswdU3%Af0O*Lc%fWvov=4YzHep9W<9)wZrPGp1d8L?S|Ks zixIku@2J--@6{uP@#S?YG$)cFkn`10QyAF!WlXyx-<(wzVbA^x)Qk4k)NEpbR6epo zPGwrgE>X1T4@ez{;Q|TLL`@ZOCQ`9RTVKm(uu%ik-sbAEY3fR&9{XvlZ^ao{^fr#O zqVITp*d~Tc5=L30xKCLUQdtX<<3lZ}19d&aS^SP0%I7S)6Y}_W44XJcxA+EyfuU!5 z487up4{hCi{<`>xZTk zvI8x!Z@nuMqZQ6F%Wc}&coc>i3q9I-nVA>sVt~7=Ps8%A%~sEpc%_jXerUf&5DrsW zlkHA1qh1S~!$+ojJneB-qCyo71O`5y4XAL@#R%!cYgE(+f=owz8%)oQE|&^Bf-H^g^^NrH_2&$@zJenNW(+t9W;sehpkmICATYbK zbK?hE28ON{Q^rqbRu%>(7AB^ic$_4&rKN?1oW-2wAVhQ-S7O_u%*B=f6ijsf#Y_@o z;GNZzdD74Piy2vq=_D*5{y^t+q$As+=O8H%M$zHXNu^8??@@s7;R&aCKSLYl&G~*&AEg>ppgQZfvJ>xs`T5(Rv1>ih#(b-maDb15@;)+4;2a z-m<=3I4yIkkhZ8kt6Oz1i20c~DyE<~KMwfUPy~#jsd(FO-G*(%3P}Injth7rIvC^u*Xduzu#|<{*Bwxhp ze33i0pF6YIK>%#H&Yx*Bp^~h6#yl>j^_$sX_(_T+L`hwasGxZ>ipFn?R(BNxwX__J zObWT9P&m8EnLlQ&8F~eNQJA@}3N-bDNLQrx@iNC2V3c0(FMTPfzXrJC%?O6Kk z{lGj2!QwU~cJWqf^K>Zre$LpM#sjefZ%Awf*Kln=_LnS<&soIP>E!P$?;UD$zR>h4 ztDhK9T#7+`F>*?kc_pj9O6?!hfMuV~RnrNNKm_;e$+YVaT3tSw_S^Bdo>Nnj{YOz` zVW0auC|8ovKaLUuYngus<*F6FiE>>;{_TopV5Kk&<)&zQ)mGj_xh}(AieJBm2>wl! z-<6+l0$$hIe(Tmlf}4Q9wA`C0*M+Lzy7duO#{YJ4*WCK0(A@;QE=c^=tt#T1fLCt) zZp^QD^uKkB4hH%+-MYfM+S+frb-i=B+%kR*M!*%|OkJY3d|zeYLrU&Ha2D)Q#!>qp~deer8_(Oymd8`Z{5 zoa^V#<>~Tku)w~eYsc!%iPtys-%cc PeArJItP+R=UcUPePzA8k delta 2614 zcmZ9N2{e>#8^^~C*~eg%C5>r@VI(F?vP>nr?E6|-GK41SC1NsDc7?|h;w{UNG9|Rw zr?Q1eWY1(9sYVjA<(qux{l2`EC`m=Hx`tuBEE~EIH)T#uZcsaAUN&1%KUY+%gS7Dx_gnu0#GB@rdYnl zp0b){R^3fk<|iFbr|Rr=n*JJZXZH=xX^RNE=9OyOU``)jVYc&ynWFLwO^In_lr;mn zo?4~HvNMTW7HG2PGcErtu6Nz+MM_RW4so#vmWB)rnkMZoa}h~jK|0X0=apqnq*DM` z2QEgQFQFZ15=6QE4Ku+OnMd09y11+TMbtE`fSba@6S%a^s2njZ>~J35kQcMZs(akw2y1lO=MgHh6^u1 z<`COYT5(T$c>Phvpk_-{>Y{LaTK<&`=lzoI))pScC6Z@7eqA_$eM@cHJG%HnYAmzH z^{O*CkkAfGy){#AK zS!V+^+&jOn_8gM4_v6UnuX#^yTr)7p5AZFd~9hp`I5I?oJ^e z^)n%Tc}`CKIT~L z=h#AYYq`^!_hj*#NHuOSc(;O2o z+KiAC9yIU6KB<+=bbO9r2h1}PR3}*r0)YtZn0Ll|M_PtN4;qz$ZIXXMCi0tW6UyWs zOHWUfW-X_b_bIkKGUR9hYgDf3I3TkRkusGBcyJjmf;XQRx*$3Um&@)&UkBw*{&lXRgflxwb ztrKP9V<-w#j^aD@MzJw5!klXx`f{PSlM6)s^scP0$4`rJw1{XOT?jWOIo z2e}CIMMJI3^{h!Er3!Mhrr)sD?@hw{V(=b_FfpW}pq8t&<6CZBFokKyI9^c$Kiwos zA}9{4HE*B?FDPDQ6b_7{32(eO%Ku`BwjDYbue`BqsIN z?h<_bVCCIa7pPRO-^??TzEtGwB*1((l(|ZIt@IOWb-+{ zMh3y11GECo*!U24PoIun=|C?>iV!7xA5PZShuUrh88WS520L$@3|+yES=XyZ*G%09 zic(3<=azI$x^hp&`^! zoSQO`+A6>dskMo1X0r0{7U}wnR(CWcLXp0Q^K*dulB4FE+`sewSnF|JKRf3%&VS|I ziEV~|9-s#yMoKdadZ7Y~vFy>-|3LB5SBx_rw98<>!~2>BMLjD!)~dxYi;FI!_IB(V&I`YmbD0Gw-NIV z@B5Jy0NbKfs20a3AY|r9X$E>PzeF9}jE@HV;;^>QZxAo1x?lha8_ck`Id%9h!M;Ewl;b%=cz&7OuXD@FW5#22Sm94%ERT}fPZ zaXaaQRuxj+Y=fnGCXB?EG}D-|XTJ1G;BF=J&6P%@Go{?3{_+*597*Pme=8T?4Y$}h zPUE$ijZ3w{@_(293}*4u48~(_m?xh4P2K!;LAeo|x*X(R)PW(SBB?y>IJ=j?4A`jS zh6Wx3Hm32n&E>sEsNAsL!9x|YNb}*fnK-l4_&25PYM-t>Wj7$l-?Ht=`&%ytN!TZ0F3}~Wex7#bfESgkp+hM_`vH|{u z{4S;*bVTM_&T`Wvk8!$?(-(8Ee&x$!1LKt@F6c6u=su6SC#~A5pQ}q2^D3KniDokt zaSE*k#~zY3qMYMzHZRUvz|-mH9?`vImyJE~wK)fuh3_%EEi(wgH&$%Y{ChYukOjU) z(DvgqJ}$3${JG5150fSr%I)aBKQ5=cLUFlxPr-A!iLp_Q#Wc=Wgk*AO0~&~*@M}GN zzS6zp=u}`G9;d1$t{^(h5>S`xUAhMyI6}yK5vj+5dGG6 z+0We(i7&5*wjIe(CGqb~o4XoSN4S3^6PmR3wJS2_k=9%X%eT9M-ORap0^_Qz)Y*2E z*e-GoFdX!M#9!s4fERM|GVF2Y^yknM|EKu3S2GUajEER;A0xb-k4B6XI1vK8g-Zb| znBCi-@Cs7ko1B2F0&@GPKj#U2$5B7If!hlEw?%(8AHf}>B_7~|g4DL?&sq`MA>!f( zB(T!kqCY((xkD5$0L&uy0=MJ^wyznDl>*la0{K{p?R|fe5#6zm3I$+_(ojASA1H@y K%)PMRfByrBtARNH diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index e632935f4..bbcc374db 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -62,3 +62,117 @@ def test_sharepoint_convert_empty_columns_to_string(sharepoint_mock): assert not converted_df.empty assert converted_df["float_col"].dtype == float assert converted_df["nan_col"].dtype == "string" + + +def test__get_file_extension(sharepoint_mock): + url_excel = "https://tenant.sharepoint.com/sites/site/file.xlsx" + url_dir = "https://tenant.sharepoint.com/sites/site/" + url_txt = "https://tenant.sharepoint.com/sites/site/file.txt" + + excel_ext = sharepoint_mock._get_file_extension(url=url_excel) + txt_ext = sharepoint_mock._get_file_extension(url=url_txt) + dir = sharepoint_mock._get_file_extension(url=url_dir) + + assert excel_ext == ".xlsx" + assert txt_ext == ".txt" + assert dir == "" + + +def test__is_file(sharepoint_mock): + is_file = sharepoint_mock._is_file(url="https://example.com/file.xlsx") + assert is_file is True + + is_file = sharepoint_mock._is_file(url="https://example.com/dir") + assert is_file is False + + +def test__empty_column_to_string_mixed_values(sharepoint_mock): + df = pd.DataFrame( + {"col1": [None, None, None], "col2": [1, None, 3], "col3": ["a", "b", "c"]} + ) + result = sharepoint_mock._empty_column_to_string(df) + + expected = pd.DataFrame( + { + "col1": [None, None, None], + "col2": [1, None, 3], + "col3": ["a", "b", "c"], + } + ) + expected["col1"] = expected["col1"].astype("string") + + pd.testing.assert_frame_equal(result, expected) + + +def test_convert_all_to_string_type_mixed_types(sharepoint_mock): + df = pd.DataFrame( + { + "int": [1, 2, 3], + "float": [1.1, 2.2, 3.3], + "bool": [True, False, True], + "string": ["a", "b", "c"], + } + ) + result = sharepoint_mock._convert_all_to_string_type(df) + expected = pd.DataFrame( + { + "int": ["1", "2", "3"], + "float": ["1.1", "2.2", "3.3"], + "bool": ["True", "False", "True"], + "string": ["a", "b", "c"], + } + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_convert_all_to_string_type_only_nan(sharepoint_mock): + df = pd.DataFrame( + { + "int": [None, None, None], + "float": [None, None, None], + "bool": [None, None, None], + "string": [None, None, None], + } + ) + result = sharepoint_mock._convert_all_to_string_type(df) + expected = pd.DataFrame( + { + "int": [None, None, None], + "float": [None, None, None], + "bool": [None, None, None], + "string": [None, None, None], + } + ).astype("string") + pd.testing.assert_frame_equal(result, expected) + + +def test_convert_all_to_string_type_empty_dataframe(sharepoint_mock): + df = pd.DataFrame() + result = sharepoint_mock._convert_all_to_string_type(df) + + expected = pd.DataFrame() + + pd.testing.assert_frame_equal(result, expected) + + +def test_convert_all_to_string_type_already_strings(sharepoint_mock): + df = pd.DataFrame({"string1": ["1", "2", "3"], "string2": ["a", "b", "c"]}) + result = sharepoint_mock._convert_all_to_string_type(df) + + expected = pd.DataFrame({"string1": ["1", "2", "3"], "string2": ["a", "b", "c"]}) + + pd.testing.assert_frame_equal(result, expected) + + +def test__parse_excel_single_sheet(sharepoint_mock): + excel_file = sharepoint_mock._download_file_stream() + result = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") + expected = pd.DataFrame( + { + "col_a": ["val1", "", "val2", "NA", "N/A", "#N/A"], + "col_b": ["val1", "val2", "val3", "val4", "val5", "val6"], + } + ) + + assert result["col_b"].equals(expected["col_b"]) From 4a58d24cb28c8adc0a44740572cfe99906e7e275 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Thu, 11 Jul 2024 12:47:43 +0200 Subject: [PATCH 11/15] =?UTF-8?q?=F0=9F=90=9B=20Added=20**kwargs=20to=20ha?= =?UTF-8?q?ndle=5Fmultiple=5Ffiles=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 9163319ca..5ce09c6ef 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -250,6 +250,7 @@ def _handle_multiple_files( url: str, file_sheet_mapping: dict, na_values: Optional[list[str]] = None, + **kwargs, ): """Handles download and parsing of multiple Excel files from a SharePoint folder. @@ -268,7 +269,7 @@ def _handle_multiple_files( """ dfs = [ self._load_and_parse( - file_url=url + file, sheet_name=sheet, na_values=na_values + file_url=url + file, sheet_name=sheet, na_values=na_values, **kwargs ) for file, sheet in file_sheet_mapping.items() ] From 89d98aaeb16a88f72c39646fcf28232e3aceb1fb Mon Sep 17 00:00:00 2001 From: rziemianek Date: Thu, 11 Jul 2024 14:52:44 +0200 Subject: [PATCH 12/15] =?UTF-8?q?=F0=9F=9A=A7=20Added=20`dtypes=3Dstr`=20i?= =?UTF-8?q?nstead=20of=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 41 +++----------------------------- 1 file changed, 3 insertions(+), 38 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 5ce09c6ef..15a295a34 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -209,42 +209,6 @@ def _is_file(self, url: str) -> bool: return bool(file_extension_pattern.search(url)) - def _convert_all_to_string_type(self, df: pd.DataFrame) -> pd.DataFrame: - """Convert all column data types in the DataFrame to strings. - - This method converts all the values in the DataFrame to strings, - handling NaN values by replacing them with None. - - Args: - df (pd.DataFrame): DataFrame to convert. - - Returns: - pd.DataFrame: DataFrame with all data types converted to string. - Columns that contain only None values are also - converted to string type. - """ - df_converted = df.astype(str).where(pd.notnull(df), None) - return self._empty_column_to_string(df=df_converted) - - def _empty_column_to_string(self, df: pd.DataFrame) -> pd.DataFrame: - """Convert the type of columns containing only None values to string. - - This method iterates through the DataFrame columns and converts the - type of any column that contains only None values to string. - - Args: - df (pd.DataFrame): DataFrame to convert. - - Returns: - pd.DataFrame: Updated DataFrame with columns containing only - None values converted to string type. All columns - in the returned DataFrame will be of type object/string. - """ - for col in df.columns: - if df[col].isnull().all(): - df[col] = df[col].astype("string") - return df - def _handle_multiple_files( self, url: str, @@ -312,7 +276,7 @@ def _parse_excel( na_values: Optional[list[str]] = None, **kwargs, ): - """Parses an Excel file into a DataFrame. + """Parses an Excel file into a DataFrame. Cast all columns to string. Args: excel_file: An ExcelFile object containing the data to parse. @@ -330,6 +294,7 @@ def _parse_excel( sheet, keep_default_na=False, na_values=na_values or self.DEFAULT_NA_VALUES, + dtype=str, # Ensure all columns are read as strings **kwargs, ) for sheet in ([sheet_name] if sheet_name else excel_file.sheet_names) @@ -425,4 +390,4 @@ def to_df( if tests: validate(df=df_clean, tests=tests) - return self._convert_all_to_string_type(df=df_clean) + return df_clean From 9dcf387b9ccc097e34fe584c201b500d03365c8d Mon Sep 17 00:00:00 2001 From: rziemianek Date: Thu, 11 Jul 2024 16:31:19 +0200 Subject: [PATCH 13/15] =?UTF-8?q?=E2=9C=85=20=20Removed=20tests=20for=20no?= =?UTF-8?q?t=20existing=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/viadot/sources/sharepoint.py | 2 +- tests/unit/test_sharepoint.py | 106 +++---------------------------- 2 files changed, 11 insertions(+), 97 deletions(-) diff --git a/src/viadot/sources/sharepoint.py b/src/viadot/sources/sharepoint.py index 15a295a34..8f70272d2 100644 --- a/src/viadot/sources/sharepoint.py +++ b/src/viadot/sources/sharepoint.py @@ -276,7 +276,7 @@ def _parse_excel( na_values: Optional[list[str]] = None, **kwargs, ): - """Parses an Excel file into a DataFrame. Cast all columns to string. + """Parses an Excel file into a DataFrame. Casts all columns to string. Args: excel_file: An ExcelFile object containing the data to parse. diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index bbcc374db..14e53c526 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -49,21 +49,6 @@ def test_sharepoint_custom_na(sharepoint_mock): assert "NA" in list(df["col_a"]) -def test_sharepoint_convert_all_to_string_type(sharepoint_mock): - converted_df = sharepoint_mock._convert_all_to_string_type(df=SAMPLE_DF) - - assert not converted_df.empty - assert pd.isnull(converted_df["nan_col"]).all() - - -def test_sharepoint_convert_empty_columns_to_string(sharepoint_mock): - converted_df = sharepoint_mock._empty_column_to_string(df=SAMPLE_DF) - - assert not converted_df.empty - assert converted_df["float_col"].dtype == float - assert converted_df["nan_col"].dtype == "string" - - def test__get_file_extension(sharepoint_mock): url_excel = "https://tenant.sharepoint.com/sites/site/file.xlsx" url_dir = "https://tenant.sharepoint.com/sites/site/" @@ -86,88 +71,9 @@ def test__is_file(sharepoint_mock): assert is_file is False -def test__empty_column_to_string_mixed_values(sharepoint_mock): - df = pd.DataFrame( - {"col1": [None, None, None], "col2": [1, None, 3], "col3": ["a", "b", "c"]} - ) - result = sharepoint_mock._empty_column_to_string(df) - - expected = pd.DataFrame( - { - "col1": [None, None, None], - "col2": [1, None, 3], - "col3": ["a", "b", "c"], - } - ) - expected["col1"] = expected["col1"].astype("string") - - pd.testing.assert_frame_equal(result, expected) - - -def test_convert_all_to_string_type_mixed_types(sharepoint_mock): - df = pd.DataFrame( - { - "int": [1, 2, 3], - "float": [1.1, 2.2, 3.3], - "bool": [True, False, True], - "string": ["a", "b", "c"], - } - ) - result = sharepoint_mock._convert_all_to_string_type(df) - expected = pd.DataFrame( - { - "int": ["1", "2", "3"], - "float": ["1.1", "2.2", "3.3"], - "bool": ["True", "False", "True"], - "string": ["a", "b", "c"], - } - ) - - pd.testing.assert_frame_equal(result, expected) - - -def test_convert_all_to_string_type_only_nan(sharepoint_mock): - df = pd.DataFrame( - { - "int": [None, None, None], - "float": [None, None, None], - "bool": [None, None, None], - "string": [None, None, None], - } - ) - result = sharepoint_mock._convert_all_to_string_type(df) - expected = pd.DataFrame( - { - "int": [None, None, None], - "float": [None, None, None], - "bool": [None, None, None], - "string": [None, None, None], - } - ).astype("string") - pd.testing.assert_frame_equal(result, expected) - - -def test_convert_all_to_string_type_empty_dataframe(sharepoint_mock): - df = pd.DataFrame() - result = sharepoint_mock._convert_all_to_string_type(df) - - expected = pd.DataFrame() - - pd.testing.assert_frame_equal(result, expected) - - -def test_convert_all_to_string_type_already_strings(sharepoint_mock): - df = pd.DataFrame({"string1": ["1", "2", "3"], "string2": ["a", "b", "c"]}) - result = sharepoint_mock._convert_all_to_string_type(df) - - expected = pd.DataFrame({"string1": ["1", "2", "3"], "string2": ["a", "b", "c"]}) - - pd.testing.assert_frame_equal(result, expected) - - def test__parse_excel_single_sheet(sharepoint_mock): excel_file = sharepoint_mock._download_file_stream() - result = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") + result_df = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") expected = pd.DataFrame( { "col_a": ["val1", "", "val2", "NA", "N/A", "#N/A"], @@ -175,4 +81,12 @@ def test__parse_excel_single_sheet(sharepoint_mock): } ) - assert result["col_b"].equals(expected["col_b"]) + assert result_df["col_b"].equals(expected["col_b"]) + + +def test__parse_excel_string_dtypes(sharepoint_mock): + excel_file = sharepoint_mock._download_file_stream() + result_df = sharepoint_mock._parse_excel(excel_file, sheet_name="Sheet1") + + for column in result_df.columns: + assert result_df[column].dtype == object From 9602097b42da624775a0c424947e687b57b3ade9 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Thu, 11 Jul 2024 16:50:02 +0200 Subject: [PATCH 14/15] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_sharepoint.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index 14e53c526..5e7d1e9b4 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -3,7 +3,9 @@ import pandas as pd import pytest import sharepy +from viadot.exceptions import CredentialError from viadot.sources import Sharepoint +from viadot.sources.sharepoint import SharepointCredentials DUMMY_CREDS = {"site": "test", "username": "test2", "password": "test"} SAMPLE_DF = pd.DataFrame( @@ -30,6 +32,27 @@ def sharepoint_mock(): return SharepointMock(credentials=DUMMY_CREDS) +def test_valid_credentials(): + credentials = { + "site": "tenant.sharepoint.com", + "username": "user@example.com", + "password": "password", + } + shrp_creds = SharepointCredentials(**credentials) + assert shrp_creds.site == credentials["site"] + assert shrp_creds.username == credentials["username"] + assert shrp_creds.password == credentials["password"] + + +def test_missing_username(): + credentials = {"site": "example.sharepoint.com", "password": "password"} + with pytest.raises( + CredentialError, + match="'site', 'username', and 'password' credentials are required.", + ): + SharepointCredentials(**credentials) + + def test_sharepoint_default_na(sharepoint_mock): df = sharepoint_mock.to_df( url="test/file.xlsx", na_values=Sharepoint.DEFAULT_NA_VALUES @@ -90,3 +113,8 @@ def test__parse_excel_string_dtypes(sharepoint_mock): for column in result_df.columns: assert result_df[column].dtype == object + + +def test__load_and_parse_not_valid_extension(sharepoint_mock): + with pytest.raises(ValueError): + sharepoint_mock._load_and_parse(file_url="https://example.com/file.txt") From 03e16de411bda2f9509c28a231b4be0be9e21f49 Mon Sep 17 00:00:00 2001 From: rziemianek Date: Thu, 1 Aug 2024 20:35:29 +0200 Subject: [PATCH 15/15] =?UTF-8?q?=E2=9C=85=20Added=20missing=20tests=20to?= =?UTF-8?q?=20sharepoint=20class=20methods?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_sharepoint.py | 81 ++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_sharepoint.py b/tests/unit/test_sharepoint.py index 5e7d1e9b4..5a7255cd0 100644 --- a/tests/unit/test_sharepoint.py +++ b/tests/unit/test_sharepoint.py @@ -1,8 +1,10 @@ from pathlib import Path +from unittest.mock import MagicMock, patch import pandas as pd import pytest import sharepy +from sharepy.errors import AuthError from viadot.exceptions import CredentialError from viadot.sources import Sharepoint from viadot.sources.sharepoint import SharepointCredentials @@ -23,7 +25,10 @@ class SharepointMock(Sharepoint): def get_connection(self): return sharepy.session.SharePointSession - def _download_file_stream(self, url=None): + def _download_file_stream(self, url=None, **kwargs): + if "nrows" in kwargs: + raise ValueError("Parameter 'nrows' is not supported.") + return pd.ExcelFile(Path("tests/unit/test_file.xlsx")) @@ -44,6 +49,26 @@ def test_valid_credentials(): assert shrp_creds.password == credentials["password"] +def test_invalid_authentication(): + credentials = { + "site": "tenant.sharepoint.com", + "username": "user@example.com", + "password": "password", + } + + s = Sharepoint(credentials=credentials) + + # Patch the sharepy.connect method to simulate an authentication failure + with patch("sharepy.connect") as mock_connect: + mock_connect.side_effect = AuthError("Authentication failed") + + with pytest.raises( + CredentialError, + match="Could not authenticate to tenant.sharepoint.com with provided credentials.", + ): + s.get_connection() + + def test_missing_username(): credentials = {"site": "example.sharepoint.com", "password": "password"} with pytest.raises( @@ -118,3 +143,57 @@ def test__parse_excel_string_dtypes(sharepoint_mock): def test__load_and_parse_not_valid_extension(sharepoint_mock): with pytest.raises(ValueError): sharepoint_mock._load_and_parse(file_url="https://example.com/file.txt") + + +def test_scan_sharepoint_folder_valid_url(sharepoint_mock): + url = "https://company.sharepoint.com/sites/site_name/final_folder/" + + # Mock the response from SharePoint + mock_response = MagicMock() + mock_response.json.return_value = { + "d": { + "results": [ + {"Name": "file1.txt"}, + {"Name": "file2.txt"}, + ] + } + } + + # Inject the mock response + sharepoint_mock.get_connection().get = MagicMock(return_value=mock_response) + + expected_files = [ + "https://company.sharepoint.com/sites/site_name/final_folder/file1.txt", + "https://company.sharepoint.com/sites/site_name/final_folder/file2.txt", + ] + + result = sharepoint_mock.scan_sharepoint_folder(url) + assert result == expected_files + + +def test_scan_sharepoint_folder_invalid_url(sharepoint_mock): + url = "https://company.sharepoint.com/folder/sub_folder/final_folder" + + with pytest.raises(ValueError, match="URL does not contain '/sites/' segment."): + sharepoint_mock.scan_sharepoint_folder(url) + + +def test_scan_sharepoint_folder_empty_response(sharepoint_mock): + url = ( + "https://company.sharepoint.com/sites/site_name/folder/sub_folder/final_folder" + ) + + mock_response = MagicMock() + mock_response.json.return_value = {"d": {"results": []}} + + sharepoint_mock.get_connection().get = MagicMock(return_value=mock_response) + + result = sharepoint_mock.scan_sharepoint_folder(url) + assert result == [] + + +def test_download_file_stream_unsupported_param(sharepoint_mock): + url = "https://company.sharepoint.com/sites/site_name/folder/test_file.xlsx" + + with pytest.raises(ValueError, match="Parameter 'nrows' is not supported."): + sharepoint_mock._download_file_stream(url, nrows=10)