From a0f0b5d81562ba5121ae68a5f294ea96f6ec1ee9 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Wed, 4 Oct 2023 17:30:58 -0400 Subject: [PATCH 01/14] feat: Add ability to upload Ludwig models to Predibase. --- ludwig/cli.py | 2 +- ludwig/upload.py | 12 +++++ ludwig/utils/upload_utils.py | 92 ++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) diff --git a/ludwig/cli.py b/ludwig/cli.py index c89bc47f7cb..27c16a2ca3c 100644 --- a/ludwig/cli.py +++ b/ludwig/cli.py @@ -56,7 +56,7 @@ def __init__(self): init_config Initialize a user config from a dataset and targets render_config Renders the fully populated config with all defaults set check_install Runs a quick training run on synthetic data to verify installation status - upload Push trained model artifacts to a registry (e.g., HuggingFace Hub) + upload Push trained model artifacts to a registry (e.g., HuggingFace Hub, Predibase) """, ) parser.add_argument("command", help="Subcommand to run") diff --git a/ludwig/upload.py b/ludwig/upload.py index 4cc86e488ad..d7dd40d6926 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -5,6 +5,7 @@ from ludwig.utils.print_utils import get_logging_level_registry from ludwig.utils.upload_utils import HuggingFaceHub +from ludwig.utils.upload_utils import Predibase logger = logging.getLogger(__name__) @@ -12,6 +13,7 @@ def get_upload_registry(): return { "hf_hub": HuggingFaceHub, + "predibase": Predibase, } @@ -23,6 +25,8 @@ def upload_cli( private: bool = False, commit_message: str = "Upload trained [Ludwig](https://ludwig.ai/latest/) model weights", commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, **kwargs, ) -> None: """Create an empty repo on the HuggingFace Hub and upload trained model artifacts to that repo. @@ -49,6 +53,12 @@ def upload_cli( `f"Upload {path_in_repo} with huggingface_hub"` commit_description (`str` *optional*): The description of the generated commit + dataset_file (`str`, *optional*): + The path to the dataset file. Required if `service` is set to + `"predibase"`. + dataset_name (`str`, *optional*): + The name of the dataset. Used by the `service` + `"predibase"`. """ model_service = get_upload_registry().get(service, "hf_hub") hub = model_service() @@ -60,6 +70,8 @@ def upload_cli( private=private, commit_message=commit_message, commit_description=commit_description, + dataset_file=dataset_file, + dataset_name=dataset_name, ) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 4dc193d6410..63bcf5fb39e 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -1,5 +1,8 @@ import logging import os +import tempfile +import zipfile +from pathlib import Path from abc import ABC, abstractmethod from typing import Optional @@ -37,6 +40,8 @@ def upload( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, ) -> bool: """Abstract method to upload trained model artifacts to the target repository. @@ -61,6 +66,8 @@ def _validate_upload_parameters( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, ): """Validate parameters before uploading trained model artifacts. @@ -205,3 +212,88 @@ def upload( return True return False + + +class Predibase(BaseModelUpload): + def __init__(self): + self.pc = None + + def login(self): + """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and returns + a PredibaseClient object that can be used to interact with Predibase.""" + from predibase import PredibaseClient + + try: + pc = PredibaseClient() + + # TODO: Check if subscription has expired + + self.pc = pc + except Exception as e: + raise Exception(f"Failed to login to Predibase: {e}") + return False + + return True + + def upload( + self, + repo_id: str, + model_path: str, + commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, + ) -> bool: + """Create an empty repo in Predibase and upload trained model artifacts to that repo. + + Args: + model_path (`str`): + The path of the saved model. This is the top level directory where + the models weights as well as other associated training artifacts + are saved. + repo_name (`str`): + A repo name. + repo_description (`str` *optional*): + The description of the repo. + """ + # Create empty model repo using repo_name, but it is okay if it already exists. + try: + repo = self.pc.create_repo( + name=repo_id, + description=commit_description, + exist_ok=True, + ) + except Exception as e: + raise Exception(f"Failed to create repo in Predibase: {e}") + return True + + # Upload the dataset to Predibase + try: + dataset = self.pc.upload_dataset( + file_path=dataset_file, + name=dataset_name, + ) + except Exception as e: + raise Exception(f"Failed to upload dataset to Predibase: {e}") + return True + + with tempfile.TemporaryDirectory() as tmpdir: + # Create a zip file of the model weights folder + model_data_zip_path = os.path.join(tmpdir, "model_data.zip") + fp_zip = Path(model_data_zip_path) + path_to_archive = Path(model_path) + with zipfile.ZipFile(fp_zip, "w", zipfile.ZIP_DEFLATED) as zipf: + for fp in path_to_archive.glob("**/*"): + zipf.write(fp, arcname=fp.relative_to(path_to_archive)) + + # Upload the zip file to Predibase + try: + self.pc.upload_model( + repo_id=repo.id, + zip_fp=model_data_zip_path, + dataset=dataset, + ) + except Exception as e: + raise Exception(f"Failed to upload model to Predibase: {e}") + return True + + return False From d20af580b189789aa9918d894a9e4492c951f6b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Oct 2023 21:33:37 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/upload.py | 3 +-- ludwig/utils/upload_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ludwig/upload.py b/ludwig/upload.py index d7dd40d6926..b986b9b30a4 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -4,8 +4,7 @@ from typing import Optional from ludwig.utils.print_utils import get_logging_level_registry -from ludwig.utils.upload_utils import HuggingFaceHub -from ludwig.utils.upload_utils import Predibase +from ludwig.utils.upload_utils import HuggingFaceHub, Predibase logger = logging.getLogger(__name__) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 63bcf5fb39e..2c3c333ceec 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -2,8 +2,8 @@ import os import tempfile import zipfile -from pathlib import Path from abc import ABC, abstractmethod +from pathlib import Path from typing import Optional from huggingface_hub import HfApi, login @@ -219,8 +219,8 @@ def __init__(self): self.pc = None def login(self): - """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and returns - a PredibaseClient object that can be used to interact with Predibase.""" + """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and returns a + PredibaseClient object that can be used to interact with Predibase.""" from predibase import PredibaseClient try: From 1f73c61be914d673ec75c6d9a45e82a6eb4dc3d1 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 5 Oct 2023 13:29:29 -0400 Subject: [PATCH 03/14] Removed unnecessary zip file. --- ludwig/utils/upload_utils.py | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 63bcf5fb39e..ca33a09962a 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -268,32 +268,20 @@ def upload( # Upload the dataset to Predibase try: - dataset = self.pc.upload_dataset( - file_path=dataset_file, - name=dataset_name, - ) + dataset = self.pc.upload_dataset(file_path=dataset_file, name=dataset_name) except Exception as e: raise Exception(f"Failed to upload dataset to Predibase: {e}") return True - with tempfile.TemporaryDirectory() as tmpdir: - # Create a zip file of the model weights folder - model_data_zip_path = os.path.join(tmpdir, "model_data.zip") - fp_zip = Path(model_data_zip_path) - path_to_archive = Path(model_path) - with zipfile.ZipFile(fp_zip, "w", zipfile.ZIP_DEFLATED) as zipf: - for fp in path_to_archive.glob("**/*"): - zipf.write(fp, arcname=fp.relative_to(path_to_archive)) - - # Upload the zip file to Predibase - try: - self.pc.upload_model( - repo_id=repo.id, - zip_fp=model_data_zip_path, - dataset=dataset, - ) - except Exception as e: - raise Exception(f"Failed to upload model to Predibase: {e}") - return True + # Upload the zip file to Predibase + try: + self.pc.upload_model( + repo=repo, + model_path=model_path, + dataset=dataset, + ) + except Exception as e: + raise Exception(f"Failed to upload model to Predibase: {e}") + return True return False From 816731fc34e4be32d7c500690cf26240b1665cf4 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 5 Oct 2023 15:59:25 -0400 Subject: [PATCH 04/14] Able to hit pbase upload. --- ludwig/upload.py | 4 +- ludwig/utils/upload_utils.py | 75 ++++++++++++++++++++++++++++-------- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/ludwig/upload.py b/ludwig/upload.py index d7dd40d6926..65c3360e772 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -34,7 +34,7 @@ def upload_cli( Args: service (`str`): Name of the hosted model service to push the trained artifacts to. - Currently, this only supports `hf_hub`. + Currently, this only supports `hf_hub` and `predibase`. repo_id (`str`): A namespace (user or an organization) and a repo name separated by a `/`. @@ -89,7 +89,7 @@ def cli(sys_argv): "service", help="Name of the model repository service.", default="hf_hub", - choices=["hf_hub"], + choices=["hf_hub", "predibase"], ) parser.add_argument( diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index ca33a09962a..e27df5506a9 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -1,8 +1,5 @@ import logging import os -import tempfile -import zipfile -from pathlib import Path from abc import ABC, abstractmethod from typing import Optional @@ -92,18 +89,10 @@ def _validate_upload_parameters( implementations. Defaults to None. Raises: - AssertionError: If the repo_id does not have both a namespace and a repo name separated by a '/'. FileNotFoundError: If the model_path does not exist. Exception: If the trained model artifacts are not found at the expected location within model_path, or if the artifacts are not in the required format (i.e., 'pytorch_model.bin' or 'adapter_model.bin'). """ - # Validate repo_id has both a namespace and a repo name - assert "/" in repo_id, ( - "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." - " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" - " repo_id should be johndoe/test" - ) - # Make sure the model's save path is actually a valid path if not os.path.exists(model_path): raise FileNotFoundError(f"The path '{model_path}' does not exist.") @@ -149,6 +138,59 @@ def login(self): self.api = hf_api + @staticmethod + def _validate_upload_parameters( + repo_id: str, + model_path: str, + repo_type: Optional[str] = None, + private: Optional[bool] = False, + commit_message: Optional[str] = None, + commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, + ): + """Validate parameters before uploading trained model artifacts. + + This method checks if the input parameters meet the necessary requirements before uploading + trained model artifacts to the target repository. + + Args: + repo_id (str): The ID of the target repository. It must be a namespace (user or an organization) + and a repository name separated by a '/'. For example, if your HF username is 'johndoe' and you + want to create a repository called 'test', the repo_id should be 'johndoe/test'. + model_path (str): The path to the directory containing the trained model artifacts. It should contain + the model's weights, usually saved under 'model/model_weights'. + repo_type (str, optional): The type of the repository. Not used in the base class, but subclasses + may use it for specific repository implementations. Defaults to None. + private (bool, optional): Whether the repository should be private or not. Not used in the base class, + but subclasses may use it for specific repository implementations. Defaults to False. + commit_message (str, optional): A message to attach to the commit when uploading to version control + systems. Not used in the base class, but subclasses may use it for specific repository + implementations. Defaults to None. + commit_description (str, optional): A description of the commit when uploading to version control + systems. Not used in the base class, but subclasses may use it for specific repository + implementations. Defaults to None. + + Raises: + AssertionError: If the repo_id does not have both a namespace and a repo name separated by a '/'. + """ + # Validate repo_id has both a namespace and a repo name + assert "/" in repo_id, ( + "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." + " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" + " repo_id should be johndoe/test" + ) + BaseModelUpload._validate_upload_parameters( + repo_id, + model_path, + repo_type, + private, + commit_message, + commit_description, + dataset_file, + dataset_name, + ) + def upload( self, repo_id: str, @@ -242,6 +284,7 @@ def upload( commit_description: Optional[str] = None, dataset_file: Optional[str] = None, dataset_name: Optional[str] = None, + **kwargs, ) -> bool: """Create an empty repo in Predibase and upload trained model artifacts to that repo. @@ -257,20 +300,20 @@ def upload( """ # Create empty model repo using repo_name, but it is okay if it already exists. try: - repo = self.pc.create_repo( + repo = self.pc.create_model_repo( name=repo_id, description=commit_description, - exist_ok=True, + exists_ok=True, ) except Exception as e: - raise Exception(f"Failed to create repo in Predibase: {e}") + raise RuntimeError("Failed to create repo in Predibase") from e return True # Upload the dataset to Predibase try: dataset = self.pc.upload_dataset(file_path=dataset_file, name=dataset_name) except Exception as e: - raise Exception(f"Failed to upload dataset to Predibase: {e}") + raise RuntimeError("Failed to upload dataset to Predibase") from e return True # Upload the zip file to Predibase @@ -281,7 +324,7 @@ def upload( dataset=dataset, ) except Exception as e: - raise Exception(f"Failed to upload model to Predibase: {e}") + raise RuntimeError("Failed to upload model to Predibase") from e return True return False From 8a57ece1024f304f1eef7391294694027e0c99f0 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 5 Oct 2023 17:01:35 -0400 Subject: [PATCH 05/14] Successfully upload to Predibase! --- ludwig/upload.py | 5 +++ ludwig/utils/upload_utils.py | 78 ++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/ludwig/upload.py b/ludwig/upload.py index 65c3360e772..48763c369d9 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -127,6 +127,11 @@ def cli(sys_argv): choices=["critical", "error", "warning", "info", "debug", "notset"], ) + parser.add_argument("-df", "--dataset_file", help="The location of the dataset file", default=None) + parser.add_argument( + "-dn", "--dataset_name", help="(Optional) The name of the dataset in the Provider", default=None + ) + args = parser.parse_args(sys_argv) args.logging_level = get_logging_level_registry()[args.logging_level] diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index e27df5506a9..798f63bd471 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -277,6 +277,60 @@ def login(self): return True + @staticmethod + def _validate_upload_parameters( + repo_id: str, + model_path: str, + repo_type: Optional[str] = None, + private: Optional[bool] = False, + commit_message: Optional[str] = None, + commit_description: Optional[str] = None, + dataset_file: Optional[str] = None, + dataset_name: Optional[str] = None, + ): + """Validate parameters before uploading trained model artifacts. + + This method checks if the input parameters meet the necessary requirements before uploading + trained model artifacts to the target repository. + + Args: + repo_id (str): The ID of the target repository. It must be a namespace (user or an organization) + and a repository name separated by a '/'. For example, if your HF username is 'johndoe' and you + want to create a repository called 'test', the repo_id should be 'johndoe/test'. + model_path (str): The path to the directory containing the trained model artifacts. It should contain + the model's weights, usually saved under 'model/model_weights'. + repo_type (str, optional): The type of the repository. Not used in the base class, but subclasses + may use it for specific repository implementations. Defaults to None. + private (bool, optional): Whether the repository should be private or not. Not used in the base class, + but subclasses may use it for specific repository implementations. Defaults to False. + commit_message (str, optional): A message to attach to the commit when uploading to version control + systems. Not used in the base class, but subclasses may use it for specific repository + implementations. Defaults to None. + commit_description (str, optional): A description of the commit when uploading to version control + systems. Not used in the base class, but subclasses may use it for specific repository + implementations. Defaults to None. + + Raises: + AssertionError: If the repo_id has non-url safe characters. + """ + # Validate repo_id has both a namespace and a repo name + # TODO: Add this validation? + # assert "/" in repo_id, ( + # "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." + # " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" + # " repo_id should be johndoe/test" + # ) + BaseModelUpload._validate_upload_parameters( + repo_id, + model_path, + repo_type, + private, + commit_message, + commit_description, + dataset_file, + dataset_name, + ) + def upload( self, repo_id: str, @@ -298,6 +352,23 @@ def upload( repo_description (`str` *optional*): The description of the repo. """ + # Validate upload parameters are in the right format + Predibase._validate_upload_parameters( + repo_id, + model_path, + repo_type, + private, + commit_message, + commit_description, + ) + + # Upload the dataset to Predibase + try: + dataset = self.pc.upload_dataset(file_path=dataset_file, name=dataset_name) + except Exception as e: + raise RuntimeError("Failed to upload dataset to Predibase") from e + return True + # Create empty model repo using repo_name, but it is okay if it already exists. try: repo = self.pc.create_model_repo( @@ -309,13 +380,6 @@ def upload( raise RuntimeError("Failed to create repo in Predibase") from e return True - # Upload the dataset to Predibase - try: - dataset = self.pc.upload_dataset(file_path=dataset_file, name=dataset_name) - except Exception as e: - raise RuntimeError("Failed to upload dataset to Predibase") from e - return True - # Upload the zip file to Predibase try: self.pc.upload_model( From efac4ac6c9f15fae7fb41cc2be1db5720012f9c8 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Fri, 6 Oct 2023 10:18:36 -0400 Subject: [PATCH 06/14] Validated uploading to Predibase and separating HF logic. --- ludwig/upload.py | 2 +- ludwig/utils/upload_utils.py | 60 +++++++++++++++--------------------- 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/ludwig/upload.py b/ludwig/upload.py index 62507264eff..0d49862ef90 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -54,7 +54,7 @@ def upload_cli( The description of the generated commit dataset_file (`str`, *optional*): The path to the dataset file. Required if `service` is set to - `"predibase"`. + `"predibase"` for new model repos. dataset_name (`str`, *optional*): The name of the dataset. Used by the `service` `"predibase"`. diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 7fccd36e207..8f86fc653b0 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -1,7 +1,6 @@ import logging import os from abc import ABC, abstractmethod -from pathlib import Path from typing import Optional from huggingface_hub import HfApi, login @@ -64,8 +63,6 @@ def _validate_upload_parameters( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, - dataset_file: Optional[str] = None, - dataset_name: Optional[str] = None, ): """Validate parameters before uploading trained model artifacts. @@ -73,9 +70,7 @@ def _validate_upload_parameters( trained model artifacts to the target repository. Args: - repo_id (str): The ID of the target repository. It must be a namespace (user or an organization) - and a repository name separated by a '/'. For example, if your HF username is 'johndoe' and you - want to create a repository called 'test', the repo_id should be 'johndoe/test'. + repo_id (str): The ID of the target repository. Each provider will verify their specific rules. model_path (str): The path to the directory containing the trained model artifacts. It should contain the model's weights, usually saved under 'model/model_weights'. repo_type (str, optional): The type of the repository. Not used in the base class, but subclasses @@ -107,17 +102,6 @@ def _validate_upload_parameters( "wrong during training where the model's weights were not saved." ) - # Make sure the model's saved artifacts either contain: - # 1. pytorch_model.bin -> regular model training, such as ECD or for LLMs - # 2. adapter_model.bin -> LLM fine-tuning using PEFT - files = set(os.listdir(trained_model_artifacts_path)) - if "pytorch_model.bin" not in files and "adapter_model.bin" not in files: - raise Exception( - f"Can't find model weights at {trained_model_artifacts_path}. Trained model weights should " - "either be saved as `pytorch_model.bin` for regular model training, or have `adapter_model.bin`" - "if using parameter efficient fine-tuning methods like LoRA." - ) - class HuggingFaceHub(BaseModelUpload): def __init__(self): @@ -147,8 +131,6 @@ def _validate_upload_parameters( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, - dataset_file: Optional[str] = None, - dataset_name: Optional[str] = None, ): """Validate parameters before uploading trained model artifacts. @@ -188,10 +170,20 @@ def _validate_upload_parameters( private, commit_message, commit_description, - dataset_file, - dataset_name, ) + trained_model_artifacts_path = os.path.join(model_path, "model", "model_weights") + # Make sure the model's saved artifacts either contain: + # 1. pytorch_model.bin -> regular model training, such as ECD or for LLMs + # 2. adapter_model.bin -> LLM fine-tuning using PEFT + files = set(os.listdir(trained_model_artifacts_path)) + if "pytorch_model.bin" not in files and "adapter_model.bin" not in files: + raise Exception( + f"Can't find model weights at {trained_model_artifacts_path}. Trained model weights should " + "either be saved as `pytorch_model.bin` for regular model training, or have `adapter_model.bin`" + "if using parameter efficient fine-tuning methods like LoRA." + ) + def upload( self, repo_id: str, @@ -200,6 +192,7 @@ def upload( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, + **kwargs, ) -> bool: """Create an empty repo on the HuggingFace Hub and upload trained model artifacts to that repo. @@ -262,7 +255,7 @@ def __init__(self): self.pc = None def login(self): - """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and returns a + """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and return a PredibaseClient object that can be used to interact with Predibase.""" from predibase import PredibaseClient @@ -286,8 +279,6 @@ def _validate_upload_parameters( private: Optional[bool] = False, commit_message: Optional[str] = None, commit_description: Optional[str] = None, - dataset_file: Optional[str] = None, - dataset_name: Optional[str] = None, ): """Validate parameters before uploading trained model artifacts. @@ -295,9 +286,7 @@ def _validate_upload_parameters( trained model artifacts to the target repository. Args: - repo_id (str): The ID of the target repository. It must be a namespace (user or an organization) - and a repository name separated by a '/'. For example, if your HF username is 'johndoe' and you - want to create a repository called 'test', the repo_id should be 'johndoe/test'. + repo_id (str): The ID of the target repository. It must be a less than 256 characters. model_path (str): The path to the directory containing the trained model artifacts. It should contain the model's weights, usually saved under 'model/model_weights'. repo_type (str, optional): The type of the repository. Not used in the base class, but subclasses @@ -314,13 +303,7 @@ def _validate_upload_parameters( Raises: AssertionError: If the repo_id has non-url safe characters. """ - # Validate repo_id has both a namespace and a repo name - # TODO: Add this validation? - # assert "/" in repo_id, ( - # "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." - # " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" - # " repo_id should be johndoe/test" - # ) + assert len(repo_id) <= 255, "`repo_id` must be 255 characters or less." BaseModelUpload._validate_upload_parameters( repo_id, model_path, @@ -328,8 +311,6 @@ def _validate_upload_parameters( private, commit_message, commit_description, - dataset_file, - dataset_name, ) def upload( @@ -352,6 +333,12 @@ def upload( A repo name. repo_description (`str` *optional*): The description of the repo. + dataset_file (`str` *optional*): + The path to the dataset file. Required if `service` is set to + `"predibase"` for new model repos. + dataset_name (`str` *optional*): + The name of the dataset. Used by the `service` + `"predibase"`. Falls back to the filename. """ # Validate upload parameters are in the right format Predibase._validate_upload_parameters( @@ -392,4 +379,5 @@ def upload( raise RuntimeError("Failed to upload model to Predibase") from e return True + logger.info(f"Model uploaded to Predibase with repository name `{repo_id}`") return False From 872d40c7fdfcb0e286d85f1ebaa4864f502a687d Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Fri, 6 Oct 2023 10:33:14 -0400 Subject: [PATCH 07/14] Adding Preibase to requirements --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index f8f92939821..16da6083c23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,3 +56,6 @@ pyxlsb>=1.0.8 # excel pyarrow # parquet lxml # html html5lib # html + +# Allows users to upload +predibase>=2023.10.2 \ No newline at end of file From 7bb9da65de13d10e6483dc048b39276ecd7b7964 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:34:29 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 16da6083c23..aa7ff9624b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -58,4 +58,4 @@ lxml # html html5lib # html # Allows users to upload -predibase>=2023.10.2 \ No newline at end of file +predibase>=2023.10.2 From d997062e55557f56f3e8659b84bc59af615a3c66 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Fri, 6 Oct 2023 14:29:22 -0400 Subject: [PATCH 09/14] Added custom logic to better explain how to set PREDIBASE_API_TOKEN. --- ludwig/utils/upload_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 8f86fc653b0..47292f0dcd2 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -259,6 +259,12 @@ def login(self): PredibaseClient object that can be used to interact with Predibase.""" from predibase import PredibaseClient + token = os.environ.get("PREDIBASE_API_TOKEN") + if token is None: + raise ValueError( + "Unable to find PREDIBASE_API_TOKEN environment variable. Please log into Predibase, generate a token and use `export PREDIBASE_API_TOKEN=` to use Predibase" + ) + try: pc = PredibaseClient() From d0111e00602fc292c622f153326064de8d07fee1 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 9 Oct 2023 14:25:11 -0700 Subject: [PATCH 10/14] Update cli.py --- ludwig/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/cli.py b/ludwig/cli.py index 27c16a2ca3c..345a69d0e7d 100644 --- a/ludwig/cli.py +++ b/ludwig/cli.py @@ -56,7 +56,7 @@ def __init__(self): init_config Initialize a user config from a dataset and targets render_config Renders the fully populated config with all defaults set check_install Runs a quick training run on synthetic data to verify installation status - upload Push trained model artifacts to a registry (e.g., HuggingFace Hub, Predibase) + upload Push trained model artifacts to a registry (e.g., Predibase, HuggingFace Hub) """, ) parser.add_argument("command", help="Subcommand to run") From df066f683dd6fc1473533568378f4ee56e0de05b Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Mon, 9 Oct 2023 21:04:24 -0400 Subject: [PATCH 11/14] Moved predibase to requirements extra --- requirements.txt | 3 --- requirements_extra.txt | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index aa7ff9624b1..f8f92939821 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,6 +56,3 @@ pyxlsb>=1.0.8 # excel pyarrow # parquet lxml # html html5lib # html - -# Allows users to upload -predibase>=2023.10.2 diff --git a/requirements_extra.txt b/requirements_extra.txt index 4be0e04497a..26fe48eb998 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -3,3 +3,6 @@ horovod[pytorch]>=0.24.0,!=0.26.0 # alternative to Dask modin[ray] + +# Allows users to upload +predibase>=2023.10.2 From c54df37dd76fdea8025c6b2da13753913d4c53d5 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 9 Oct 2023 19:21:48 -0700 Subject: [PATCH 12/14] Update upload_utils.py --- ludwig/utils/upload_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 47292f0dcd2..f2185001851 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -262,7 +262,8 @@ def login(self): token = os.environ.get("PREDIBASE_API_TOKEN") if token is None: raise ValueError( - "Unable to find PREDIBASE_API_TOKEN environment variable. Please log into Predibase, generate a token and use `export PREDIBASE_API_TOKEN=` to use Predibase" + "Unable to find PREDIBASE_API_TOKEN environment variable. Please log into Predibase, " + "generate a token and use `export PREDIBASE_API_TOKEN=` to use Predibase" ) try: From 3c9c036c5a89985fda27e87cfbba8ec07a1fbf90 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Tue, 10 Oct 2023 11:57:25 -0400 Subject: [PATCH 13/14] Moved login logic to init function. --- ludwig/upload.py | 1 - ludwig/utils/upload_utils.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ludwig/upload.py b/ludwig/upload.py index 0d49862ef90..fa626f3a1e1 100644 --- a/ludwig/upload.py +++ b/ludwig/upload.py @@ -61,7 +61,6 @@ def upload_cli( """ model_service = get_upload_registry().get(service, "hf_hub") hub = model_service() - hub.login() hub.upload( repo_id=repo_id, model_path=model_path, diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index 47292f0dcd2..d89ae4863a2 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -106,6 +106,7 @@ def _validate_upload_parameters( class HuggingFaceHub(BaseModelUpload): def __init__(self): self.api = None + self.login() def login(self): """Login to huggingface hub using the token stored in ~/.cache/huggingface/token and returns a HfApi client @@ -253,6 +254,7 @@ def upload( class Predibase(BaseModelUpload): def __init__(self): self.pc = None + self.login() def login(self): """Login to Predibase using the token stored in the PREDIBASE_API_TOKEN environment variable and return a From 72935aa39c915275f5735eb3c2ea4240111486d4 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Tue, 10 Oct 2023 16:46:18 -0400 Subject: [PATCH 14/14] Raise Value Error and return expected boolean output. --- ludwig/utils/upload_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ludwig/utils/upload_utils.py b/ludwig/utils/upload_utils.py index c373772bc2e..cc166170282 100644 --- a/ludwig/utils/upload_utils.py +++ b/ludwig/utils/upload_utils.py @@ -156,14 +156,15 @@ def _validate_upload_parameters( implementations. Defaults to None. Raises: - AssertionError: If the repo_id does not have both a namespace and a repo name separated by a '/'. + ValueError: If the repo_id does not have both a namespace and a repo name separated by a '/'. """ # Validate repo_id has both a namespace and a repo name - assert "/" in repo_id, ( - "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." - " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" - " repo_id should be johndoe/test" - ) + if "/" not in repo_id: + raise ValueError( + "`repo_id` must be a namespace (user or an organization) and a repo name separated by a `/`." + " For example, if your HF username is `johndoe` and you want to create a repository called `test`, the" + " repo_id should be johndoe/test" + ) BaseModelUpload._validate_upload_parameters( repo_id, model_path, @@ -310,9 +311,11 @@ def _validate_upload_parameters( implementations. Defaults to None. Raises: - AssertionError: If the repo_id has non-url safe characters. + ValueError: If the repo_id is too long. """ - assert len(repo_id) <= 255, "`repo_id` must be 255 characters or less." + if len(repo_id) > 255: + raise ValueError("`repo_id` must be 255 characters or less.") + BaseModelUpload._validate_upload_parameters( repo_id, model_path, @@ -364,7 +367,6 @@ def upload( dataset = self.pc.upload_dataset(file_path=dataset_file, name=dataset_name) except Exception as e: raise RuntimeError("Failed to upload dataset to Predibase") from e - return True # Create empty model repo using repo_name, but it is okay if it already exists. try: @@ -375,7 +377,6 @@ def upload( ) except Exception as e: raise RuntimeError("Failed to create repo in Predibase") from e - return True # Upload the zip file to Predibase try: @@ -386,7 +387,6 @@ def upload( ) except Exception as e: raise RuntimeError("Failed to upload model to Predibase") from e - return True logger.info(f"Model uploaded to Predibase with repository name `{repo_id}`") - return False + return True