-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathinterview_transcript_process.sh
executable file
·188 lines (165 loc) · 7.69 KB
/
interview_transcript_process.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/bin/bash
# Top level pipeline script for transcript side processing.
# Usage: call with the path to the settings (config) file as the first argument.
# Without that path there is nothing to source later on, so report the problem
# on stderr and stop with a failing status rather than exiting as if successful.
if [[ -z "${1}" ]]; then
  echo "Please provide a path to settings file" >&2
  exit 1
fi
config_path=$1
# Resolve the absolute path of the directory this script lives in, which is the top level of the repo.
# Deriving it from $0 instead of hard coding it lets the script work wherever the repo is downloaded.
# The expansions are quoted so that a repo location containing spaces or glob characters still resolves correctly.
full_path=$(realpath "$0")
repo_root=$(dirname "$full_path")
# export the path to the repo so that scripts called by this script can also use it
export repo_root
# sourcing the config file sets up the variables the rest of this script reads (e.g. data_root and study)
source "$config_path"
# Confirm the study folder exists where it should, and has the expected GENERAL/PROTECTED and raw/processed folder paths.
# Every check below uses a path relative to PROTECTED, so stop right away if that directory cannot be entered;
# otherwise the checks would be evaluated against whatever the current directory happens to be.
# All failures are reported on stderr and end the script with a non-zero status.
if ! cd "$data_root"/PROTECTED; then
  echo "Invalid data root path ${data_root} or study ID ${study}" >&2
  exit 1
fi
# the study must be present on both the PROTECTED and GENERAL sides
if [[ ! -d "$study" || ! -d ../GENERAL/"$study" ]]; then
  echo "Invalid data root path ${data_root} or study ID ${study}" >&2
  exit 1
fi
# raw is only required under PROTECTED - a raw folder in GENERAL is never used here, so it is deliberately not checked
if [[ ! -d "$study"/raw || ! -d "$study"/processed || ! -d ../GENERAL/"$study"/processed ]]; then
  echo "Study ${study} folder improperly set up" >&2
  exit 1
fi
# the study level metadata CSV must exist in the PROTECTED study folder
if [[ ! -e ../PROTECTED/"$study"/"$study"_metadata.csv ]]; then
  echo "Study ${study} missing metadata" >&2
  exit 1
fi
# further folder setup is not repeated here, because the audio side of the pipeline should really always be run before transcript
# these are just basic checks to make sure invalid site IDs don't get traversed
# Make sure the log folders exist: logs are kept in an individual directory per study,
# with a separate emails_sent subfolder so that copies of sent emails are easier to locate.
# mkdir -p creates any missing parent folders in the same call and does not complain when
# they already exist, so there is no gap between checking for a folder and creating it
# (which matters if runs for several studies start at the same moment).
mkdir -p "$repo_root"/logs/"$study"/emails_sent
# Unique identifier for this run (unix seconds); used in the log filename here and in archived email filenames later.
log_timestamp=$(date +%s)
# Send output to the console and to the per-study log file simultaneously.
# stdout and stderr each get their own tee, both appending (-a) to the same log file;
# the stderr copy is written back to stderr so the two streams stay separate on the console.
exec 1> >(tee -ia "${repo_root}/logs/${study}/transcript_process_logging_${log_timestamp}.txt")
exec 2> >(tee -ia "${repo_root}/logs/${study}/transcript_process_logging_${log_timestamp}.txt" 1>&2)
# Announce the start of the run along with the settings it will use:
# the study, the data root, and the TranscribeMe username transcripts are pulled from.
printf '\n%s\n%s\n%s\n%s\n%s\n%s\n\n' \
  "Beginning script - interview transcript preprocessing for:" \
  "$study" \
  "with data root:" \
  "$data_root" \
  "Automatically pulling all new transcripts from TranscribeMe username:" \
  "$transcribeme_username"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Stage: transcript pull. The pull module also puts together the bulk of the lab email update.
# It is sourced rather than run with bash so that the transcript update boolean it sets
# (presumably trans_updates, which is checked before the lab email is sent) stays visible to this script.
source "${repo_root}/individual_modules/run_transcription_pull.sh" "$data_root" "$study" "$transcribeme_username" "$transcribeme_password" "$transcription_language"
printf '\n%s\n\n' "Transcript pull complete"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Stage: review updates. Copies of any newly reviewed transcripts (returned via Lochness) are put
# from raw into the appropriate processed folder location, and info on them is appended to the email.
# Sourced rather than run with bash so that the update boolean it sets
# (presumably review_updates, which is checked before the lab email is sent) stays visible to this script.
source "${repo_root}/individual_modules/run_transcription_review_update.sh" "$data_root" "$study"
# as part of the same stage, check for transcripts set aside for prescreening that the sites have not yet returned
bash "${repo_root}/individual_modules/run_transcription_review_alerts.sh" "$data_root" "$study"
printf '\n%s\n\n' "Transcript review updates complete"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Stage: redaction. Gets redacted copies of the transcripts into GENERAL.
bash "${repo_root}/individual_modules/run_transcript_redaction.sh" "$data_root" "$study"
printf '\n%s\n\n' "Any newly reviewed transcripts have been redacted"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Stage: CSV conversion. Converts new transcripts in GENERAL to CSV.
bash "${repo_root}/individual_modules/run_transcript_csv_conversion.sh" "$data_root" "$study"
printf '\n%s\n\n' "CSV conversion completed for newly redacted transcripts"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Stage: transcript QC.
bash "${repo_root}/individual_modules/run_transcript_qc.sh" "$data_root" "$study"
printf '\n%s\n\n' "Transcript QC completed"
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Email lab members about transcripts successfully pulled/processed, and those still being waited on.
# The email is only sent when there is something relevant for this study: trans_updates and review_updates
# are expected to have been set by the modules sourced earlier in this script.
lab_email_body="${repo_root}/transcript_lab_email_body.txt"
if [[ "$trans_updates" != 1 && "$review_updates" != 1 ]]; then
  # nothing to report, so discard the drafted email body instead of sending it
  echo "No new transcript updates for this study, so no email to send"
  rm "$lab_email_body"
else
  echo "Emailing status update to lab"
  mail -s "[${study} ${server_version} Interview Pipeline Updates] Transcription Status Update" "$lab_email_list" < "$lab_email_body"
  # the email had real content, so keep a copy in the logs folder under this run's timestamp for reference
  mv "$lab_email_body" "${repo_root}/logs/${study}/emails_sent/transcript_lab_email_body_${log_timestamp}.txt"
fi
echo ""
# Email the site about transcripts that remain to be reviewed.
# The presence of the drafted body file is what signals that there is something to send.
site_email_body="${repo_root}/site_review_email_body.txt"
if [[ ! -e "$site_email_body" ]]; then
  echo "No transcriptions requiring review currently for this site"
else
  echo "Emailing review prompt to site"
  # append a blank line followed by the standard review instructions to the drafted body before sending
  printf '%s\n' \
    "" \
    "These files are in Box under the 'for review' subfolder of transcripts, which can be found in your site interviews folder. Please review each file for redaction correctness, ensuring all (and only) words that should be redacted are encased in curly braces." \
    "In order for your review to register as complete so that the transcript can finish processing, you must then move the transcript text file (with filename unchanged!) into the appropriate part of the 'approved' subfolder you will find in Box, also under 'transcripts'. This means that each completed transcript should be placed directly under the subfolder for the corresponding subject ID that is found in 'approved'." \
    "If you have additional questions, please see the SOP." \
    >> "$site_email_body"
  # send, then keep a copy in the logs folder under this run's timestamp
  mail -s "[Action Required] ${server_version} Interview Pipeline Transcripts to Review" "$site_email_list" < "$site_email_body"
  mv "$site_email_body" "${repo_root}/logs/${study}/emails_sent/site_review_email_body_${log_timestamp}.txt"
fi
echo ""
# record the current time so runtime can be tracked from the log
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
# Final stage: run the file accounting updates for this study.
printf '%s\n' "Compiling/updating file lists with processing date"
bash "${repo_root}/individual_modules/run_final_transcript_accounting.sh" "$data_root" "$study" "$transcription_language"
printf '\n'
# record the finishing time, then signal that the whole pipeline has run through
now=$(date +"%T")
printf 'Current time: %s\n\n' "$now"
printf '%s\n' "Script completed!"