From 552c60175d3be8209ac6aff81263822c8c42dae9 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Wed, 29 Nov 2023 14:13:09 +0100 Subject: [PATCH] fix: remove empty files. --- src/coral_models/prepare_raw_data.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index c6186ce2..4e7f428b 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -344,9 +344,16 @@ def prepare_raw_data( # audio. read_aloud_duration = 0.0 conversation_duration = 0.0 + rows_to_remove = [] for row_i, row in tqdm(recordings.iterrows()): filename = input_path / row["filename"] + # Check if the file is empty, and if it is, remove it from the dataframe + # and continue to the next file + if filename.stat().st_size < 200000: # Any file smaller than this is empty + rows_to_remove.append(row_i) + continue + # Get the new filename # New filename is in the format is for conversations: # "recording_id_speaker_id1_speaker_id2_recorder_speaker_id_conversation.wav" @@ -400,6 +407,9 @@ def prepare_raw_data( except FileNotFoundError: pass + # Remove rows with empty files + recordings = recordings.drop(rows_to_remove).reset_index(drop=True) + # Write a README file readme = make_readme() with open(output_path / "README.md", "w") as f: