diff --git a/DeepFilterNet/df/enhance.py b/DeepFilterNet/df/enhance.py
index eea1fc0c0..c6faccf0b 100644
--- a/DeepFilterNet/df/enhance.py
+++ b/DeepFilterNet/df/enhance.py
@@ -4,6 +4,7 @@
 import warnings
 from typing import Optional, Tuple, Union
 
+import numpy as np
 import torch
 from loguru import logger
 from torch import Tensor, nn
@@ -15,7 +16,14 @@
 from df.logger import init_logger
 from df.model import ModelParams
 from df.modules import get_device
-from df.utils import as_complex, as_real, download_file, get_cache_dir, get_norm_alpha
+from df.utils import (
+    as_complex,
+    as_real,
+    download_file,
+    get_cache_dir,
+    get_norm_alpha,
+    measure_gpu_mem,
+)
 from libdf import DF, erb, erb_norm, unit_norm
 
 PRETRAINED_MODELS = ("DeepFilterNet", "DeepFilterNet2")
@@ -36,13 +44,23 @@ def main(args):
         os.mkdir(args.output_dir)
     df_sr = ModelParams().sr
     n_samples = len(args.noisy_audio_files)
+    # Sample GPU memory in the background only for verbose runs on a CUDA device.
+    measure_mem = args.log_level in ("DEBUG", "TRACE") and get_device().type == "cuda"
+    if measure_mem:
+        mem_watcher, used_q, done_event = measure_gpu_mem(sleep_ms=10)
     for i, file in enumerate(args.noisy_audio_files):
         progress = (i + 1) / n_samples * 100
         audio, meta = load_audio(file, df_sr)
         t0 = time.time()
-        audio = enhance(
-            model, df_state, audio, pad=args.compensate_delay, atten_lim_db=args.atten_lim
-        )
+        try:
+            audio = enhance(
+                model, df_state, audio, pad=args.compensate_delay, atten_lim_db=args.atten_lim
+            )
+        except RuntimeError as e:
+            if "CUDA" not in str(e):
+                raise e
+            logger.error(f"Error running enhance() on audio file with shape {audio.shape}: {e}")
+            break  # a CUDA failure also aborts the remaining files
         t1 = time.time()
         t_audio = audio.shape[-1] / df_sr
         t = t1 - t0
@@ -54,6 +72,14 @@ def main(args):
         save_audio(
             file, audio, sr=meta.sample_rate, output_dir=args.output_dir, suffix=suffix, log=False
         )
+    if measure_mem:
+        # Stop the sampler and wait for it, so no sample is added while draining the queue.
+        done_event.set()
+        mem_watcher.join()
+        used = [used_q.get() / 1024**2 for _ in range(used_q.qsize())]  # bytes -> MiB
+        logger.debug(f"Collected {len(used)} GPU memory samples")
+        if used:  # np.max() raises ValueError on an empty sequence
+            logger.debug(f"Memory usage: Mean: {np.mean(used)} MB, Max: {np.max(used)} MB")
 
 
 def get_model_basedir(m: Optional[str]) -> str:
diff --git a/DeepFilterNet/df/utils.py b/DeepFilterNet/df/utils.py
index 84fe1724d..4fe87fc3b 100644
--- a/DeepFilterNet/df/utils.py
+++ b/DeepFilterNet/df/utils.py
@@ -1,9 +1,12 @@
 import collections
 import math
 import os
+import queue
 import random
 import subprocess
+import threading
 from socket import gethostname
+from time import sleep
 from typing import Any, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -230,6 +233,35 @@ def download_file(url: str, download_dir: str, extract: bool = False):
     return local_filename
 
 
+def measure_gpu_mem(
+    gpu_idx: int = 0, sleep_ms: float = 100
+) -> Tuple[threading.Thread, queue.SimpleQueue, threading.Event]:
+    """Sample the reserved CUDA memory of one device from a background thread.
+
+    Args:
+        gpu_idx: Device index handed to `torch.cuda.memory_reserved`.
+        sleep_ms: Pause between two consecutive samples, in milliseconds.
+
+    Returns:
+        A tuple `(thread, used, done_event)`: the already started daemon thread, the queue
+        that receives one `torch.cuda.memory_reserved(gpu_idx)` reading per sample, and the
+        event that ends the sampling loop. To stop, call `done_event.set()` and then
+        `thread.join()`.
+    """
+    used = queue.SimpleQueue()
+    done_event = threading.Event()
+
+    def target(used: queue.SimpleQueue, done_event: threading.Event):
+        # The stop flag is only checked between samples, so stopping may lag by `sleep_ms`.
+        while not done_event.is_set():
+            used.put(torch.cuda.memory_reserved(gpu_idx))
+            sleep(sleep_ms / 1000)
+
+    thread = threading.Thread(target=target, args=(used, done_event), daemon=True)
+    thread.start()
+    return thread, used, done_event
+
+
 def get_cache_dir():
     try:
         from appdirs import user_cache_dir