Skip to content

Commit

Permalink
feat: enhance RemoteDataset with comprehensive dataset management methods

Browse files Browse the repository at this point in the history

- Add detailed docstrings for RemoteDataset class and its methods
- Improve logging with more informative messages
- Implement `download_data()` method to retrieve dataset files
- Refactor existing methods with better error handling and logging
- Add context-specific logging for dataset operations
  • Loading branch information
CuriousDolphin committed Feb 11, 2025
1 parent 1b07dec commit f521336
Show file tree
Hide file tree
Showing 2 changed files with 213 additions and 7 deletions.
82 changes: 75 additions & 7 deletions focoos/remote_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,82 @@


class RemoteDataset:
"""
A class to manage remote datasets through the Focoos API.
This class provides functionality to interact with datasets stored remotely,
including uploading, downloading, and managing dataset data.
Args:
ref (str): The reference identifier for the dataset.
api_client (ApiClient): The API client instance for making requests.
Attributes:
ref (str): The dataset reference identifier.
api_client (ApiClient): The API client instance.
metadata (DatasetPreview): The dataset metadata.
"""

def __init__(self, ref: str, api_client: ApiClient):
self.ref = ref
self.api_client = api_client
self.metadata: DatasetPreview = self.get_info()

def get_info(self) -> DatasetPreview:
"""
Retrieves the dataset information from the API.
Returns:
DatasetPreview: The dataset preview information.
"""
res = self.api_client.get(f"datasets/{self.ref}")
return DatasetPreview.from_json(res.json())

def delete(self):
"""
Deletes the entire dataset from the remote storage.
Raises:
Exception: If the deletion fails.
"""
try:
res = self.api_client.delete(f"datasets/{self.ref}")
res.raise_for_status()
logger.info("Deleted dataset")
logger.warning(f"Deleted dataset {self.ref}")
except Exception as e:
logger.error(f"Failed to delete dataset: {e}")
logger.error(f"Failed to delete dataset {self.ref}: {e}")
raise e

def delete_data(self):
"""
Deletes only the data content of the dataset while preserving metadata.
Updates the metadata after successful deletion.
"""
try:
res = self.api_client.delete(f"datasets/{self.ref}/data")

res.raise_for_status()
new_metadata = DatasetPreview.from_json(res.json())
self.metadata = new_metadata
logger.info("Deleted dataset data")
logger.warning(f"Deleted dataset data {self.ref}")
except Exception as e:
logger.error(f"Failed to delete dataset data: {e}")
logger.error(f"Failed to delete dataset data {self.ref}: {e}")

def upload_data(self, path: str) -> Optional[DatasetSpec]:
"""
Uploads dataset data from a local zip file to the remote storage.
Args:
path (str): Local path to the zip file containing dataset data.
Returns:
Optional[DatasetSpec]: The dataset specification after successful upload.
Raises:
FileNotFoundError: If the specified file does not exist.
ValueError: If the file is not a zip file or upload fails.
"""
if not os.path.exists(path):
raise FileNotFoundError(f"File not found: {path}")
if not path.endswith(".zip"):
Expand All @@ -64,15 +110,37 @@ def upload_data(self, path: str) -> Optional[DatasetSpec]:
data=presigned_url["fields"],
stream=True,
)
logger.info("✅ Upload Done.")
logger.info("✅ Upload file done.")
if res.status_code not in [200, 201, 204]:
raise ValueError(f"Failed to upload dataset: {res.status_code} {res.text}")

logger.info("🔗 Validate dataset..")
logger.info("🔗 Validating dataset..")
complete_upload = self.api_client.post(
f"datasets/{self.ref}/complete-upload",
)
complete_upload.raise_for_status()
logger.info("✅ Done.")
self.metadata = self.get_info()
logger.info(f"✅ Dataset validated! => {self.metadata.spec}")
return self.metadata.spec

def download_data(self, path: str):
"""
Downloads the dataset data to a local path.
Args:
path (str): Local path where the dataset should be downloaded.
Returns:
str: The path where the file was downloaded.
Raises:
ValueError: If the download fails.
"""
res = self.api_client.get(f"datasets/{self.ref}/download")
if res.status_code != 200:
raise ValueError(f"Failed to download dataset data: {res.status_code} {res.text}")
logger.info(f"📥 Downloading dataset data to {path}")
url = res.json()["download_uri"]
path = self.api_client.download_file(url, path)
logger.info(f"✅ Dataset data downloaded to {path}")
return path
138 changes: 138 additions & 0 deletions notebooks/datasets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!uv pip install -e ..[cpu]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cloud Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pprint import pprint\n",
"\n",
"from focoos import LOCAL_API_URL, Focoos\n",
"\n",
"focoos = Focoos(api_key=os.getenv(\"FOCOOS_API_KEY\"), host_url=LOCAL_API_URL)\n",
"\n",
"datasets = focoos.list_datasets(include_shared=False)\n",
"pprint(datasets)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pprint import pprint\n",
"\n",
"from focoos import LOCAL_API_URL, Focoos\n",
"\n",
"focoos = Focoos(api_key=os.getenv(\"FOCOOS_API_KEY\"), host_url=LOCAL_API_URL)\n",
"\n",
"datasets = focoos.list_datasets(include_shared=False)\n",
"refs = [ds.ref for ds in datasets]\n",
"for ref in refs:\n",
" ds = focoos.get_remote_dataset(ref)\n",
" ds.delete_data()\n",
" ds.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"\n",
"from focoos import LOCAL_API_URL, DatasetLayout, Focoos, FocoosTask\n",
"\n",
"focoos = Focoos(host_url=LOCAL_API_URL)\n",
"\n",
"ds = focoos.add_remote_dataset(\n",
" name=\"slot-car-3\", description=\"Slot Car Tracker\", layout=DatasetLayout.ROBOFLOW_COCO, task=FocoosTask.DETECTION\n",
")\n",
"ds_spec = ds.upload_data(\"./.data/carrera_go.zip\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from focoos import LOCAL_API_URL, Focoos\n",
"\n",
"focoos = Focoos(api_key=os.getenv(\"FOCOOS_API_KEY\"), host_url=LOCAL_API_URL)\n",
"\n",
"ds = focoos.get_remote_dataset(\"025410c1a752431c\")\n",
"ds.download_data(\"./datasets\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit f521336

Please sign in to comment.