-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinterview_audio_process.sh
executable file
·281 lines (254 loc) · 13 KB
/
interview_audio_process.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#!/bin/bash
# Top level pipeline script for audio side processing.
# Usage: bash interview_audio_process.sh <path to settings file>
# The settings file is sourced below; the rest of this script reads the
# variables it defines (for example data_root and study).
if [[ -z "${1}" ]]; then
  echo "Please provide a path to settings file"
  # a missing argument is a failure, so report a non-zero status to the caller
  exit 1
fi
config_path=$1
# Resolve the absolute path of the directory this script lives in, which is the
# top level of the repo. This keeps the pipeline working wherever the repo is
# cloned instead of relying on hard coded paths.
# The expansions are quoted so a repo location containing spaces still resolves.
full_path=$(realpath "$0")
repo_root=$(dirname "$full_path")
# export the path to the repo for scripts called by this script to also use
export repo_root
# running the config file sets up the necessary environment variables
source "$config_path"
# Confirm the study folder exists where it should, with the expected
# GENERAL/PROTECTED and raw/processed folder layout. Every failed check prints
# its numbered message and stops the script with a non-zero status so callers
# can tell the run did not happen.
# The checks below use relative paths, so a failed cd must stop the script
# rather than let them run against whatever the current directory happens to be.
if ! cd "$data_root"/PROTECTED; then
  echo "Invalid data root path ${data_root} or study ID ${study} 1"
  exit 1
fi
if [[ ! -d "$study" ]]; then
  echo "Invalid data root path ${data_root} or study ID ${study} 1"
  exit 1
fi
if [[ ! -d "../GENERAL/$study" ]]; then
  echo "Invalid data root path ${data_root} or study ID ${study} 2"
  exit 1
fi
if [[ ! -d "$study/raw" ]]; then
  echo "Study folder ${study} improperly set up 3"
  exit 1
fi
if [[ ! -d "$study/processed" ]]; then
  echo "Study folder ${study} improperly set up 4"
  exit 1
fi
# a raw folder under GENERAL is never used by this pipeline, so it is not checked
if [[ ! -d "../GENERAL/$study/processed" ]]; then
  echo "Study folder ${study} improperly set up 5"
  exit 1
fi
if [[ ! -e "../PROTECTED/${study}/${study}_metadata.csv" ]]; then
  echo "Study ${study} missing metadata"
  exit 1
fi
# switch to the study's raw folder for the first loop over the patient list;
# the folder was confirmed above, so a failure here (e.g. permissions) is
# reported with the same raw folder code
if ! cd "$study"/raw; then
  echo "Study folder ${study} improperly set up 3"
  exit 1
fi
# Create the log folder hierarchy one level at a time when it is missing:
# the repo-wide logs folder, a per-study folder inside it, and an emails_sent
# subfolder so any sent emails are easy to locate later.
for log_dir in \
  "${repo_root}/logs" \
  "${repo_root}/logs/${study}" \
  "${repo_root}/logs/${study}/emails_sent"; do
  [[ -d "$log_dir" ]] || mkdir "$log_dir"
done
# name the log for this run with a unique timestamp (unix seconds)
log_timestamp=$(date +%s)
log_file="${repo_root}/logs/${study}/audio_process_logging_${log_timestamp}.txt"
# from here on stdout and stderr are both shown on the console and appended
# to the log file via tee
exec > >(tee -ia "$log_file")
exec 2> >(tee -ia "$log_file" >&2)
# Let the user know the script is starting and summarize the settings for this run.
echo ""
echo "Beginning script - mono interview audio preprocessing for:"
echo "$study"
echo "with data root:"
echo "$data_root"
# Give additional info about this preprocess run.
# [[ ]] with a quoted expansion is used so that an unset, empty or multi-word
# setting is simply treated as "off" instead of making the test itself error.
if [[ "$auto_send_on" == [Yy] ]]; then
  echo "Automatically sending all qualifying audio to TranscribeMe, at username:"
  echo "$transcribeme_username"
  echo "qualifying audio have a duration (in seconds) of at least:"
  echo "$length_cutoff"
  echo "and db level of at least:"
  echo "$db_cutoff"
  if [[ "$auto_send_limit_bool" == [Yy] ]]; then
    echo "If the total minutes of audio across patients exceeds:"
    echo "$auto_send_limit"
    echo "instead of being uploaded to TranscribeMe, all decrypted files will be left in the audio_to_send subfolder of offsite_interview/processed for each patient"
  else
    echo "All acceptable audio will be sent regardless of total amount"
  fi
else
  echo "Audio will not be automatically sent to TranscribeMe"
  echo "all decrypted files will be left in the audio_to_send subfolder under PROTECTED side processed for each patient"
fi
echo ""
# Set up the processed folders (if necessary for new studies/patients) and, when
# auto send is on, stop the run if any patient already has files waiting in an
# audio_to_send or temp_audio folder.
# Expects the current directory to be the study's raw folder, so that the glob
# below enumerates the candidate patient IDs.
for p in *; do
  # only treat the entry as a patient ID if it has an interviews folder under raw
  if [[ ! -d "${data_root}/PROTECTED/${study}/raw/${p}/interviews" ]]; then
    continue
  fi

  # Make sure the processed tree exists for this patient on both the PROTECTED
  # and GENERAL side: processed/<patient>/interviews/{open,psychs}.
  # mkdir -p creates whichever levels are missing and leaves existing ones alone.
  # TODO - confirm desired folder permissions situation here for prod, add appropriate checks if needed (also in the other top level bash scripts), or could potentially delete this section
  for data_side in PROTECTED GENERAL; do
    for interview_type in open psychs; do
      mkdir -p "${data_root}/${data_side}/${study}/processed/${p}/interviews/${interview_type}"
    done
  done

  # If auto send is on, preexisting contents of audio_to_send (or temp_audio)
  # would also end up uploaded to TranscribeMe, which is probably not what
  # someone running the pipeline expects - a folder left behind by a run with
  # auto send off could otherwise send a forgotten backlog later.
  # So the run stops if even one patient has such files, and the user is told to
  # deal with them outside of the main pipeline.
  if [[ "$auto_send_on" == [Yy] ]]; then
    # same order as before: audio_to_send for open then psychs, then temp_audio
    for backlog_folder in audio_to_send temp_audio; do
      for interview_type in open psychs; do
        backlog_path="${data_root}/PROTECTED/${study}/processed/${p}/interviews/${interview_type}/${backlog_folder}"
        # a missing folder is fine; an existing one must be empty
        [[ -d "$backlog_path" ]] || continue
        # the folder is inspected through its full path rather than after a cd,
        # so a failed directory change can never make this look at another folder
        if [[ -n "$(ls -A "$backlog_path")" ]]; then
          # the two folder kinds keep their original wording (would/could)
          if [[ "$backlog_folder" == "audio_to_send" ]]; then
            risk_word="would"
          else
            risk_word="could"
          fi
          # the message names the folder that was actually checked, matching
          # the ls command printed below
          echo "Automatic transcription was selected, but there are preexisting audio files in ${backlog_folder} folder(s) under this study"
          echo "As those ${risk_word} get sent potentially unintentionally by auto transcription, please handle the backlog outside of the main pipeline"
          echo "The files that need to be addressed can be listed with the following command:"
          echo "ls ${data_root}/PROTECTED/${study}/processed/*/interviews/${interview_type}/${backlog_folder}"
          echo ""
          echo "Exiting, please requeue once the above has been addressed"
          # stop on the first problem patient, with a failure status for the caller
          exit 1
        fi
      done
    done
  fi
done
cd "$data_root"/PROTECTED/"$study"/raw # ensure the study's raw directory is current after the loop
# add current time for runtime tracking purposes
now=$(date +"%T")
echo "Current time: ${now}"
echo ""
bash "$repo_root"/individual_modules/run_new_audio_conversion.sh "$data_root" "$study"
echo ""
# add current time for runtime tracking purposes
now=$(date +"%T")
echo "Current time: ${now}"
echo ""
# now run audio QC
bash "$repo_root"/individual_modules/run_audio_qc.sh "$data_root" "$study"
echo ""
# add current time for runtime tracking purposes
now=$(date +"%T")
echo "Current time: ${now}"
echo ""
# finally can set aside the audio files to be sent for transcription
echo "Setting aside files to be sent for transcription"
echo "(if auto transcription is off, all decrypted files will be moved to the audio_to_send subfolder, left there)"
# run script
bash "$repo_root"/individual_modules/run_audio_selection.sh "$data_root" "$study" "$length_cutoff" "$db_cutoff"
echo ""
# add current time for runtime tracking purposes
now=$(date +"%T")
echo "Current time: ${now}"
echo ""
# If auto send is on, push the set aside audio files to TranscribeMe and then
# prepare/send the email that alerts the relevant lab members.
# Per the original notes, the push script also moves successfully sent files
# from the to_send subfolder to the pending_audio subfolder, removes to_send if
# it ends up empty, and otherwise flags the leftovers in the email.
# [[ ]] with a quoted expansion is used so an unset or multi-word setting is
# treated as "off" rather than making the test itself error.
if [[ "$auto_send_on" == [Yy] ]]; then
  echo "Sending files for transcription"
  # run push script
  # NOTE(review): the TranscribeMe password is handed over as a command line
  # argument, which may be visible in process listings while the push runs.
  # Changing that needs a matching change in run_transcription_push.sh, so the
  # call is left as is here.
  bash "$repo_root"/individual_modules/run_transcription_push.sh "$data_root" "$study" "$transcribeme_username" "$transcribeme_password" "$transcription_language" "$auto_send_limit_bool" "$auto_send_limit" "$log_file"
  echo ""
  # add current time for runtime tracking purposes
  now=$(date +"%T")
  echo "Current time: ${now}"
  echo ""
  # call script to fill in rest of email bodies
  echo "Preparing information for automated emails"
  bash "$repo_root"/individual_modules/run_email_writer.sh "$data_root" "$study"
  echo ""
  # Only send an email if there is an update to report on: the lab email body
  # file is only expected to exist when there was something to say.
  if [[ -e "$repo_root"/audio_lab_email_body.txt ]]; then
    echo "Emailing status update to lab"
    # notify lab members about audio files successfully pushed, with info about any errors or excluded files
    mail -s "[${study} ${server_version} Interview Pipeline Updates] New Audio Processed" "$lab_email_list" <"$repo_root"/audio_lab_email_body.txt
    # keep a copy of what was sent in the logs folder, named with this run's timestamp
    mv "$repo_root"/audio_lab_email_body.txt "$repo_root"/logs/"$study"/emails_sent/audio_lab_email_body_"$log_timestamp".txt
  else
    echo "No new audio updates for this study, so no email to send"
  fi
  echo ""
  # add current time for runtime tracking purposes
  now=$(date +"%T")
  echo "Current time: ${now}"
  echo ""
fi
# Last step: run the file accounting update for this study, then print the
# finishing time and the completion message.
echo "Compiling/updating file lists with processing date"
accounting_script="${repo_root}/individual_modules/run_final_audio_accounting.sh"
bash "${accounting_script}" "${data_root}" "${study}"
echo
# current time, for runtime tracking purposes
now=$(date +"%T")
echo "Current time: ${now}"
echo
echo "Script completed!"