PoC for reading cuts in background thread in dynamic bucketing #680

Open · wants to merge 6 commits into base: master
20 changes: 17 additions & 3 deletions lhotse/dataset/sampling/dynamic_bucketing.py
@@ -1,7 +1,9 @@
+import concurrent.futures
 import random
 import warnings
 from bisect import bisect_right
 from collections import deque
+from concurrent.futures import ThreadPoolExecutor
 from itertools import islice
 from typing import Any, Deque, Dict, Generator, Iterable, List, Optional, Tuple, Union

@@ -334,6 +336,9 @@ def __init__(
             deque() for _ in range(len(duration_bins) + 1)
         ]

+        self._cut_reading_thread = ThreadPoolExecutor(1)
Contributor:

Any reason to not use a process pool? Due to the global interpreter lock, there can be only one running thread at any given time in Python, I think.
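For background on the GIL point: only one thread executes Python bytecode at a time, but blocking IO releases the GIL, so a thread pool can still overlap IO-heavy work such as reading manifests. A minimal timing sketch (illustrative, not part of this PR):

import time
from concurrent.futures import ThreadPoolExecutor

def io_bound_task():
    # Stands in for disk/network reads; time.sleep releases the GIL.
    time.sleep(1.0)

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=4) as pool:
    for f in [pool.submit(io_bound_task) for _ in range(4)]:
        f.result()
# Prints roughly 1s, not 4s: the four sleeps overlapped despite the GIL.
print(f"elapsed: {time.perf_counter() - start:.2f}s")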

Collaborator (Author):

Yes, with some setups that use IterableDatasetWrapper you are placing the sampler in a dataloader worker process, and AFAIK you can't spawn a nested process pool there because that process is daemonic.

Anyway, a thread should be sufficient here, as I expect the CPU to be mostly idle while the forward and backward passes run on the GPU... The reason it didn't work for you is likely that the thread could not populate the buckets fast enough, so the sampler thought they were depleted (a race condition). This can be solved with a proper synchronization mechanism, but unfortunately I don't have the time to add it right now. I'll return to it sometime.
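For context on the nested-pool point: PyTorch DataLoader workers are daemonic processes, and multiprocessing does not allow a daemonic process to spawn children, so a nested process pool would fail there. As for the race condition, here is a minimal sketch of the synchronization the author describes (the names executor, fill_buckets, prefetch, and wait_for_prefetch are illustrative, not Lhotse APIs): the consumer blocks on the producer's Future before inspecting the buckets, so it can never observe them half-filled.

from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Optional

executor = ThreadPoolExecutor(max_workers=1)
buckets = [deque(), deque()]
pending: Optional[Future] = None

def fill_buckets(n: int) -> None:
    # Producer: runs in the background thread while the GPU is busy.
    for i in range(n):
        buckets[i % len(buckets)].append(i)

def prefetch(n: int) -> None:
    global pending
    pending = executor.submit(fill_buckets, n)

def wait_for_prefetch() -> None:
    # Consumer side: Future.result() blocks until the producer finishes
    # and re-raises any exception it hit, closing the race window in which
    # the buckets look depleted merely because the reader is still running.
    global pending
    if pending is not None:
        pending.result()
        pending = None

prefetch(10)
wait_for_prefetch()  # without this wait, the buckets may appear empty
assert sum(len(b) for b in buckets) == 10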

+        self._cut_reading_future: Optional[concurrent.futures.Future] = None
+
     def __iter__(self) -> Generator[CutSet, None, None]:
         # Init: sample `buffer_size` cuts and assign them to the right buckets.
         self.cuts_iter = iter(self.cuts)
@@ -356,6 +361,7 @@ def is_ready(bucket: Deque[Cut]):
         # On each step we're sampling a new batch.
         try:
             while True:
+                self._wait_for_cut_collection()
                 ready_buckets = [b for b in self.buckets if is_ready(b)]
                 if not ready_buckets:
                     # No bucket has enough data to yield for the last full batch.
@@ -394,13 +400,21 @@ def is_ready(bucket: Deque[Cut]):
         self.cuts_iter = None

     def _collect_cuts_in_buckets(self, n_cuts: int):
-        try:
+        def collect():
             for _ in range(n_cuts):
                 cuts = next(self.cuts_iter)
                 duration = (
                     cuts[0].duration if isinstance(cuts, tuple) else cuts.duration
                 )
                 bucket_idx = bisect_right(self.duration_bins, duration)
                 self.buckets[bucket_idx].append(cuts)
-        except StopIteration:
-            pass
+
+        assert self._cut_reading_future is None
+        self._cut_reading_future = self._cut_reading_thread.submit(collect)
+
+    def _wait_for_cut_collection(self):
+        assert self._cut_reading_future is not None
+        err = self._cut_reading_future.exception()
+        if err is not None and not isinstance(err, StopIteration):
+            raise err
+        self._cut_reading_future = None
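Taken together, the patch implements a double-buffering pattern: kick off the next background read, hand the current batch to the consumer, and block on the pending Future before sampling again, re-raising anything except StopIteration. A self-contained sketch of the same pattern with illustrative names (batches, refill — not Lhotse's API):

from collections import deque
from concurrent.futures import ThreadPoolExecutor
from typing import Deque, Iterator, List

def batches(source: Iterator[int], batch_size: int) -> Iterator[List[int]]:
    buffer: Deque[int] = deque()
    pool = ThreadPoolExecutor(max_workers=1)

    def refill() -> None:
        # Background read; a StopIteration from `next` is stored in the Future.
        for _ in range(batch_size):
            buffer.append(next(source))

    future = pool.submit(refill)  # initial fill
    while True:
        err = future.exception()  # blocks until the background read finishes
        if err is not None and not isinstance(err, StopIteration):
            raise err             # real reader errors still surface
        if len(buffer) < batch_size:
            break                 # source exhausted; drop the partial batch
        batch = [buffer.popleft() for _ in range(batch_size)]
        future = pool.submit(refill)  # overlap the next read with consumption
        yield batch
    pool.shutdown()

print(list(batches(iter(range(10)), 3)))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8]]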