Storage options del vectors #33

Merged · 5 commits · Oct 31, 2023
19 changes: 16 additions & 3 deletions deltatorch/deltadataset.py
@@ -6,7 +6,6 @@
 from typing import Optional, Callable, List, Tuple, Dict, Any

 import numpy as np
-import pyarrow as pa
 import pyarrow.dataset as ds
 import torch.distributed
 from PIL import Image
@@ -40,6 +39,7 @@ def __init__(
         shuffle: bool = False,
         batch_size: int = 32,
         drop_last: bool = False,
+        storage_options: Optional[Dict[str, str]] = None,
     ) -> None:
         super().__init__()
         self.path = path
@@ -55,6 +55,7 @@ def __init__(
         self.drop_last = drop_last
         self.path = path
         self.batch_size = batch_size
+        self.storage_options = storage_options
         self.init_boundaries(path)

     @abstractmethod
@@ -127,7 +128,9 @@ def count(self):
             return self.count_with_partition_filters(_delta_table)
         else:
             _add_actions = _delta_table.get_add_actions().to_pandas()
-            return _add_actions["num_records"].sum()
+            num_records = _add_actions["num_records"].sum()
+            del _delta_table
+            return num_records

     def count_with_partition_filters(self, _delta_table):
         _cnt = 0
@@ -142,7 +145,17 @@ def count_with_partition_filters(self, _delta_table):
         return _cnt

     def create_delta_table(self):
-        return DeltaTable(self.path, version=self.version)
+        delta_table = DeltaTable(
+            self.path, version=self.version, storage_options=self.storage_options
+        )
+        conf = delta_table.metadata().configuration
+        if conf:
+            deletion_vectors = conf.get("delta.enableDeletionVectors", None)
+            if deletion_vectors == "true":
+                raise Exception(
+                    "Tables with enabled Deletion Vectors are not supported."
+                )
+        return delta_table

     def __iter__(self):
         return self.process_data()
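For context: `storage_options` is forwarded verbatim to deltalake's `DeltaTable`, and the new guard rejects any table whose metadata enables deletion vectors. A minimal standalone sketch of both behaviors follows; the S3 URI and credential values are hypothetical placeholders, while `DeltaTable(..., storage_options=...)` and `metadata().configuration` are the same deltalake APIs used in the hunk above.

from deltalake import DeltaTable

# Hypothetical credentials for an S3-hosted table; deltalake passes these
# key/value pairs through to its object-store backend unchanged.
storage_options = {
    "AWS_ACCESS_KEY_ID": "my-key-id",          # placeholder
    "AWS_SECRET_ACCESS_KEY": "my-secret-key",  # placeholder
    "AWS_REGION": "us-east-1",                 # placeholder
}

dt = DeltaTable("s3://my-bucket/my-table", storage_options=storage_options)

# The same check the PR adds: fail fast on deletion-vector tables instead
# of silently reading rows that should have been deleted.
conf = dt.metadata().configuration
if conf and conf.get("delta.enableDeletionVectors") == "true":
    raise Exception("Tables with enabled Deletion Vectors are not supported.")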
9 changes: 5 additions & 4 deletions deltatorch/id_based_deltadataset.py
@@ -1,10 +1,8 @@
 import logging
 import random
-from typing import List, Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any, Dict

 import pyarrow.compute as pc
-from pyarrow.dataset import Expression
-from deltalake import DeltaTable
 from torch.utils.data import get_worker_info

 from deltatorch import DeltaIterableDataset
@@ -28,6 +26,7 @@ def __init__(
         shuffle: bool = False,
         batch_size: int = 32,
         drop_last: bool = False,
+        storage_options: Optional[Dict[str, str]] = None,
     ):
         super().__init__(
             path,
@@ -41,6 +40,7 @@ def __init__(
             shuffle,
             batch_size,
             drop_last,
+            storage_options,
         )
         self.id_field = id_field

@@ -68,7 +68,7 @@ def process_data(self):
             pc.field(self.id_field) < pc.scalar(iter_end)
         )

-        delta_table = DeltaTable(self.path, version=self.version)
+        delta_table = self.create_delta_table()
         scanner = delta_table.to_pyarrow_dataset().scanner(
             columns=self.arrow_fields, filter=_filter
         )
@@ -83,3 +83,4 @@ def process_data(self):
                     item, self.field_specs
                 )
                 yield item
+        del delta_table
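For reference, the per-worker id-range filter that `process_data` hands to the scanner is an ordinary pyarrow.compute expression. A standalone sketch, with "id" standing in for `self.id_field` and placeholder shard boundaries (in deltatorch these come from the rank/worker split set up in `init_boundaries`):

import pyarrow.compute as pc

# Hypothetical shard boundaries for one worker.
iter_start, iter_end = 0, 1_000

# Half-open id range [iter_start, iter_end) as a dataset filter,
# mirroring the expression built in process_data() above.
id_filter = (pc.field("id") >= pc.scalar(iter_start)) & (
    pc.field("id") < pc.scalar(iter_end)
)

# Then, as in the diff:
# scanner = delta_table.to_pyarrow_dataset().scanner(
#     columns=["id", "text"], filter=id_filter
# )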
6 changes: 4 additions & 2 deletions deltatorch/pytorch.py
@@ -1,6 +1,5 @@
-from typing import List, Optional, Tuple, Any
+from typing import List, Optional, Tuple, Any, Dict

-from pyarrow.dataset import Expression
 from torch.utils.data import DataLoader

 from .deltadataset import FieldSpec
@@ -20,6 +19,7 @@ def create_pytorch_dataloader(
     num_workers: int = 2,
     shuffle: bool = False,
     drop_last: bool = False,
+    storage_options: Optional[Dict[str, str]] = None,
     **pytorch_dataloader_kwargs
 ):
     """Create a PyTorch DataLoader.
Expand Down Expand Up @@ -50,6 +50,7 @@ def create_pytorch_dataloader(
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch size, then the last batch
will be smaller. (default: ``False``)
:param storage_options: a dictionary of the options to use for the storage backend
:param pytorch_dataloader_kwargs: arguments for `torch.utils.data.DataLoader`,
exclude these arguments: ``batch_size``, ``num_workers``, ``shuffle``,
``drop_last``.
@@ -69,6 +70,7 @@ def create_pytorch_dataloader(
         shuffle,
         batch_size,
         drop_last,
+        storage_options,
     )

     return DataLoader(
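Taken together, the public API change is a single extra keyword on `create_pytorch_dataloader`. A usage sketch, assuming the function's existing `path`/`id_field`/`fields` parameters; the table URI, column names, and credentials are hypothetical placeholders:

from deltatorch import create_pytorch_dataloader, FieldSpec

# Build a DataLoader over a cloud-hosted Delta table. The storage_options
# dict is threaded through the dataset down to deltalake's DeltaTable.
dataloader = create_pytorch_dataloader(
    "s3://my-bucket/my-table",                        # placeholder URI
    id_field="id",                                    # placeholder id column
    fields=[FieldSpec("text"), FieldSpec("label")],   # placeholder columns
    batch_size=32,
    storage_options={
        "AWS_ACCESS_KEY_ID": "my-key-id",             # placeholder
        "AWS_SECRET_ACCESS_KEY": "my-secret-key",     # placeholder
    },
)

# Iterate batches as with any PyTorch DataLoader.
for batch in dataloader:
    ...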