-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathgenome_updater.sh
executable file
·1580 lines (1439 loc) · 68.3 KB
/
genome_updater.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
set -euo pipefail
IFS=$' '
# The MIT License (MIT)
# Copyright (c) 2024 - Vitor C. Piro - pirovc.github.io
# All rights reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
version="0.6.4"
# Define ncbi_base_url or use local files (for testing)
local_dir=${local_dir:-}
if [[ ! -z "${local_dir}" ]]; then
# set local dir with absulute path and "file://"
local_dir="file://$(cd "${local_dir}" && pwd)"
fi
ncbi_base_url=${ncbi_base_url:-ftp://ftp.ncbi.nlm.nih.gov/} #Alternative ftp://ftp.ncbi.nih.gov/
gtdb_base_url="https://data.gtdb.ecogenomic.org/releases/latest/"
retries=${retries:-3}
timeout=${timeout:-120}
export retries timeout ncbi_base_url gtdb_base_url local_dir
# Export locale numeric to avoid errors on printf in different setups
export LC_NUMERIC="en_US.UTF-8"
#activate aliases in the script
shopt -s expand_aliases
alias sort="sort --field-separator=$'\t'"
join_as_fields1="1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,1.26,1.27,1.28,1.29,1.30,1.31,1.32,1.33,1.34,1.35,1.36,1.37,1.38"
join_as_fields2="1.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19,2.20,2.21,2.22,2.23,2.24,2.25,2.26,2.27,2.28,2.29,2.30,2.31,2.32,2.33,2.34,2.35,2.36,2.37,2.38"
download_url() # parameter: ${1} url, ${2} output file/directory (omit/empty to STDOUT)
{
url="${1}"
outfiledir="${2:-}"
if [[ ! -z "${outfiledir}" ]]; then
if [[ -d "${outfiledir}" ]]; then
outfile="${outfiledir}/${1##*/}" # based on given output dir and file to download
else
outfile="${outfiledir}"
fi
else
outfile="-" # STDOUT
fi
# Replace base url with local directory if provided
if [[ ! -z "${local_dir}" ]]; then
url="${local_dir}/${url#*://*/}";
fi
downloader "${outfile}" "${url}"
}
export -f download_url #export it to be accessible to the parallel call
download_retry_md5(){ # parameter: ${1} url, ${2} output file, ${3} url MD5 (empty to skip), ${4} re-tries
for (( att=1; att<=${4:-1}; att++ )); do
if [ "${att}" -gt 1 ]; then
echolog " - Failed to download ${url}. Trying again #${att}" "1"
fi
download_url "${1}" "${2}"
# No md5 file to check
if [[ -z "${3}" ]]; then
return 0;
else
real_md5=$(download_url "${3}" | grep "${1##*/}" | cut -f1 -d' ')
if [ -z "${real_md5}" ]; then
continue; # did not find url file on md5 file (or empty), try again
else
file_md5=$(md5sum ${2} | cut -f1 -d' ')
if [ "${file_md5}" != "${real_md5}" ]; then
continue; # md5 didn't match, try again
else
return 0; # md5 matched, return success
fi
fi
fi
done
return 1; # failed to check md5 after all attempts
}
path_output() # parameter: ${1} file/url
{
f=$(basename ${1});
path="${files_dir}";
if [[ "${ncbi_folders}" -eq 1 ]]; then
path="${path}${f:0:3}/${f:4:3}/${f:7:3}/${f:10:3}/";
fi
echo "${path}";
}
export -f path_output
link_version() # parameter: ${1} current_output_prefix, ${2} new_output_prefix, ${3} file
{
path_out=$(path_output ${3})
if [[ -f "${1}${path_out}${3}" ]]; then
mkdir -p "${2}${path_out}";
ln -s -r "${1}${path_out}${3}" "${2}${path_out}";
fi
}
export -f link_version #export it to be accessible to the parallel call
list_local_files() # parameter: ${1} prefix, ${2} 1 to list list all, "" list only '-not -empty'
{
# Returns list of local files, without folder structure
if [[ "${ncbi_folders}" -eq 0 ]]; then
depth="-maxdepth 1";
else
depth="-mindepth 4";
fi
param="-not -empty"
if [[ ! -z "${2:-}" ]]; then
param=""
fi
find "${1}${files_dir}" ${depth} ${param} \( -type f -o -type l \) -printf "%f\n"
}
unpack() # parameter: ${1} file, ${2} output folder[, ${3} files to unpack]
{
tar xf "${1}" -C "${2}" "${3}"
}
count_lines() # parameter: ${1} file - return number of lines
{
echo ${1:-} | sed '/^\s*$/d' | wc -l | cut -f1 -d' '
}
count_lines_file() # parameter: ${1} file - return number of lines
{
sed '/^\s*$/d' ${1:-} | wc -l | cut -f1 -d' '
}
check_assembly_summary() # parameter: ${1} assembly_summary file - return 0 true 1 false
{
# file exists and it's not empty
if [ ! -s "${1}" ]; then return 1; fi
# Last char is empty (line break)
if [ ! -z $(tail -c -1 "${1}") ]; then return 1; fi
# if contains header char parts of the header anywhere besides starting lines
grep -m 1 "^#" "${1}" > /dev/null 2>&1
if [ $? -eq 0 ]; then return 1; fi
# if contains parts of the header anywhere
## See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
grep -m 1 "ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt" "${1}" > /dev/null 2>&1
if [ $? -eq 0 ]; then return 1; fi
# assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate version_status assembly_levelrelease_type genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date
grep -m 1 " assembly_accession" "${1}" > /dev/null 2>&1
if [ $? -eq 0 ]; then return 1; fi
# if every line has same number of cols (besides headers)
ncols=$(grep -v "^#" "${1}" | awk 'BEGIN{FS=OFS="\t"}{print NF}' | uniq | wc -l)
if [[ ${ncols} -gt 1 ]]; then return 1; fi
# if every line starts with GCF_ or GCA_
grep -v "^GC[FA]_" "${1}" > /dev/null 2>&1
if [ $? -eq 0 ]; then return 1; fi
return 0;
}
get_assembly_summary() # parameter: ${1} assembly_summary file, ${2} database, ${3} organism_group - return number of lines
{
# Collect urls to download
as_to_download=()
for d in ${2//,/ }
do
# If no organism group is chosen, get complete assembly_summary for the database
if [[ -z "${3}" ]]; then
as_to_download+=("${ncbi_base_url}genomes/${d}/assembly_summary_${d}.txt")
if [[ "${tax_mode}" == "gtdb" ]]; then
as_to_download+=("${ncbi_base_url}genomes/${d}/assembly_summary_${d}_historical.txt")
fi
else
for og in ${3//,/ }
do
#special case: human
if [[ "${og}" == "human" ]]; then og="vertebrate_mammalian/Homo_sapiens"; fi
as_to_download+=("${ncbi_base_url}genomes/${d}/${og}/assembly_summary.txt")
if [[ "${tax_mode}" == "gtdb" ]]; then
as_to_download+=("${ncbi_base_url}genomes/${d}/${og}/assembly_summary_historical.txt")
fi
done
fi
done
# Download files with retry attempts, checking consistency of assembly_summary after every download
for as in "${as_to_download[@]}"
do
for (( att=1; att<=${retry_download_batch}; att++ )); do
if [ "${att}" -gt 1 ]; then
echolog " - Failed to download ${as}. Trying again #${att}" "1"
fi
download_url "${as}" 2> /dev/null | tail -n+3 > "${1}.tmp"
if check_assembly_summary "${1}.tmp"; then
cat "${1}.tmp" >> "${1}"
break;
elif [ ${att} -eq ${retry_download_batch} ]; then
return 1; # failed to download after all attempts
fi
done
done
rm -f "${1}.tmp"
# Final check full file
if check_assembly_summary "${1}"; then
return 0;
else
return 1;
fi
}
write_history(){ # parameter: ${1} current label, ${2} new label, ${3} new timestamp, ${4} assembly_summary file
# if current label is the same as new label (new)
# reading the history
# Only new_label = NEW
# both current and new_label = UPDATE
# only current_label = FIX
if [[ "${1}" == "${2}" ]]; then
echo -e "#current_label\tnew_label\ttimestamp\tassembly_summary_entries\targuments" > ${history_file}
echo -n -e "\t" >> ${history_file}
else
echo -n -e "${1}\t" >> ${history_file}
fi
echo -n -e "${2}\t" >> ${history_file}
echo -n -e "${3}\t" >> ${history_file}
echo -n -e "$(count_lines_file ${4})\t" >> ${history_file}
echo -e "${genome_updater_args}" >> ${history_file}
}
filter_assembly_summary() # parameter: ${1} assembly_summary file, ${2} number of lines - return 1 if no lines or failed, 0 success
{
assembly_summary="${1}"
filtered_lines=${2}
if [[ "${filtered_lines}" -eq 0 ]]; then return 1; fi
gtdb_tax=""
ncbi_tax=""
ncbi_rank_tax=""
tmp_new_taxdump=""
if [[ "${tax_mode}" == "gtdb" ]]; then
echolog " - Downloading taxonomy (gtdb)" "1"
# Download and parse GTDB tax
gtdb_tax=$(tmp_file "gtdb_tax.tmp")
for url in "${gtdb_urls[@]}"; do
tmp_tax=$(tmp_file "gtdb_tax.tmp.gz")
#if ! download_retry_md5 "${url}" "${tmp_tax}" "${gtdb_base_url}MD5SUM.txt" "${retry_download_batch}"; then
if ! download_retry_md5 "${url}" "${tmp_tax}" "" "${retry_download_batch}"; then
return 1;
else
# awk to remove prefix RS_ or GB_
zcat "${tmp_tax}" | awk -F "\t" '{print substr($1, 4, length($1))"\t"$2}' >> "${gtdb_tax}"
fi
rm -f "${tmp_tax}"
done
elif [[ "${tax_mode}" == "ncbi" && ( ! -z "${taxids}" || ( ! -z "${top_assemblies_rank}" && "${top_assemblies_rank}" != "species" ) ) ]]; then
echolog " - Downloading taxonomy (ncbi)" "1"
tmp_new_taxdump="${working_dir}new_taxdump.tar.gz"
if ! download_retry_md5 "${ncbi_base_url}pub/taxonomy/new_taxdump/new_taxdump.tar.gz" "${tmp_new_taxdump}" "${ncbi_base_url}pub/taxonomy/new_taxdump/new_taxdump.tar.gz.md5" "${retry_download_batch}"; then
return 1;
fi
fi
if [[ "${tax_mode}" == "gtdb" ]]; then
tmp_gtdb_missing=$(tmp_file "gtdb_missing")
gtdb_lines=$(filter_gtdb "${assembly_summary}" "${gtdb_tax}" "${tmp_gtdb_missing}")
echolog " - $((filtered_lines-gtdb_lines)) assemblies removed not in GTDB" "1"
# If missing file has entries, report on log
gtdb_missing_lines=$(count_lines_file "${tmp_gtdb_missing}")
if [[ "${gtdb_missing_lines}" -gt 0 ]]; then
echolog " - Could not retrieve "${gtdb_missing_lines}" GTDB assemblies" "1"
cat "${tmp_gtdb_missing}" >> "${log_file}"
fi
rm "${tmp_gtdb_missing}"
filtered_lines=${gtdb_lines}
if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi
fi
# DATE
if [[ ! -z "${date_start}" || ! -z "${date_end}" ]]; then
date_lines=$(filter_date "${assembly_summary}")
echolog " - $((filtered_lines-date_lines)) assemblies removed not in the date range [ ${date_start} .. ${date_end} ]" "1"
filtered_lines=${date_lines}
if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi
fi
# TAXIDS
if [[ ! -z "${taxids}" ]]; then
if [[ "${tax_mode}" == "ncbi" ]]; then
unpack "${tmp_new_taxdump}" "${working_dir}" "taxidlineage.dmp"
ncbi_tax="${working_dir}taxidlineage.dmp"
taxids_lines=$(filter_taxids_ncbi "${assembly_summary}" "${ncbi_tax}")
else
taxids_lines=$(filter_taxids_gtdb "${assembly_summary}" "${gtdb_tax}")
fi
echolog " - $((filtered_lines-taxids_lines)) assemblies removed not in taxids [${taxids}]" "1"
filtered_lines=${taxids_lines}
if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi
fi
# Filter columns
columns_lines=$(filter_columns "${assembly_summary}")
if [ "$((filtered_lines-columns_lines))" -gt 0 ]; then
echolog " - $((filtered_lines-columns_lines)) assemblies removed based on filters:" "1"
echolog " valid URLs" "1"
if [[ "${tax_mode}" == "ncbi" ]]; then echolog " version status=latest" "1"; fi
if [ ! -z "${refseq_category}" ]; then echolog " refseq category=${refseq_category}" "1"; fi
if [ ! -z "${assembly_level}" ]; then echolog " assembly level=${assembly_level}" "1"; fi
if [ ! -z "${custom_filter}" ]; then echolog " custom filter=${custom_filter}" "1"; fi
filtered_lines=${columns_lines}
if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi
fi
#TOP ASSEMBLIES
if [ "${top_assemblies_num}" -gt 0 ]; then
# Add chosen rank as first col of a temporary assembly_summary
if [[ "${tax_mode}" == "ncbi" ]]; then
if [[ ! -z "${top_assemblies_rank}" && "${top_assemblies_rank}" != "species" ]]; then
unpack "${tmp_new_taxdump}" "${working_dir}" "rankedlineage.dmp"
ncbi_rank_tax="${working_dir}rankedlineage.dmp"
fi
ranked_lines=$(add_rank_ncbi "${assembly_summary}" "${assembly_summary}_rank" "${ncbi_rank_tax}")
else
ranked_lines=$(add_rank_gtdb "${assembly_summary}" "${assembly_summary}_rank" "${gtdb_tax}")
fi
if [ $((filtered_lines-ranked_lines)) -gt 0 ]; then
echolog " - Failed to match all entries to taxonomic identifiers with ${top_assemblies}" "1"
fi
top_lines=$(filter_top_assemblies "${assembly_summary}" "${assembly_summary}_rank")
echolog " - $((filtered_lines-top_lines)) entries removed with top ${top_assemblies}" "1"
rm -f "${assembly_summary}_rank"
filtered_lines=${top_lines}
if [[ "${filtered_lines}" -eq 0 ]]; then return 0; fi
fi
rm -f "${ncbi_tax}" "${ncbi_rank_tax}" "${gtdb_tax}" "${tmp_new_taxdump}"
return 0;
}
filter_taxids_ncbi() # parameter: ${1} assembly_summary file, ${2} ncbi_tax file - return number of lines
{
# Keep only selected taxid lineage, removing at the end duplicated entries from duplicates on taxids
tmp_lineage=$(tmp_file "lineage.tmp")
for tx in ${taxids//,/ }; do
txids_lin=$(grep "[^0-9]${tx}[^0-9]" "${2}" | cut -f 1) #get only taxids in the lineage section
echolog " - $(count_lines "${txids_lin}") children taxids in the lineage of ${tx}" "0"
echo "${txids_lin}" >> "${tmp_lineage}"
done
lineage_taxids=$(sort ${tmp_lineage} | uniq | tr '\n' ',')${taxids} # put lineage back into the taxids variable with the provided taxids
rm "${tmp_lineage}"
# Join with assembly_summary based on taxid field 6
join -1 6 -2 1 <(sort -k 6,6 "${1}") <(echo "${lineage_taxids//,/$'\n'}" | sort -k 1,1) -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_taxids"
mv "${1}_taxids" "${1}"
count_lines_file "${1}"
}
filter_taxids_gtdb() # parameter: ${1} assembly_summary file, ${2} gtdb_tax file return number of lines
{
tmp_gtdb_acc=$(tmp_file "gtdb_acc.tmp")
IFS=","
for tx in ${taxids}; do
sed -e 's/\t/\t;/g' -e 's/$/;/p' ${2} | grep ";${tx};" | cut -f 1 >> "${tmp_gtdb_acc}"
done
IFS=$' '
join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${tmp_gtdb_acc}" | uniq) -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_taxids"
mv "${1}_taxids" "${1}"
rm "${tmp_gtdb_acc}"
count_lines_file "${1}"
}
filter_date() # parameter: ${1} assembly_summary file - return number of lines
{
awk -v dstart="${date_start}" -v dend="${date_end}" 'BEGIN{FS=OFS="\t"}{date=$15; gsub("/","",date); if((date>=dstart || dstart=="") && (date<=dend || dend=="")) print $0}' "${1}" > "${1}_date"
mv "${1}_date" "${1}"
count_lines_file "${1}"
}
filter_columns() # parameter: ${1} assembly_summary file - return number of lines
{
# Build string to filter file by columns in the format
# colA:val1,val2|colB:val3
# AND between cols, OR between values
# Valid URLs (not na)
awk -F "\t" '{if($20!="na"){print $0}}' "${1}" > "${1}_valid"
colfilter=""
if [[ "${tax_mode}" == "ncbi" ]]; then
colfilter="11:latest|"
fi
if [[ ! -z "${refseq_category}" ]]; then
colfilter="${colfilter}5:${refseq_category}|"
fi
if [[ ! -z "${assembly_level}" ]]; then
colfilter="${colfilter}12:${assembly_level}|"
fi
if [[ ! -z "${custom_filter}" ]]; then
colfilter="${colfilter}${custom_filter}|"
fi
if [[ ! -z "${colfilter}" ]]; then
awk -F "\t" -v colfilter="${colfilter%?}" '
function ltrim(s) { sub(/^[ \t\r\n]+/, "", s); return s }
function rtrim(s) { sub(/[ \t\r\n]+$/, "", s); return s }
function trim(s) { return rtrim(ltrim(s)); }
BEGIN{
split(colfilter, fields, "|");
for(f in fields){
split(fields[f], keyvals, ":");
filter[keyvals[1]]=keyvals[2];}
}{
k=0;
for(f in filter){
split(filter[f], v, ","); for (i in v) vals[tolower(trim(v[i]))]="";
if(tolower($f) in vals){
k+=1;
}
};
if(k==length(filter)){
print $0;
}
}' "${1}_valid" > "${1}"
rm -f "${1}_valid"
else
mv "${1}_valid" "${1}"
fi
count_lines_file "${1}"
}
filter_gtdb() # parameter: ${1} assembly_summary file, ${2} gtdb_tax file, ${3} gtdb_missing file - return number of lines
{
# Check for missing entries
join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${2}") -v 2 > ${3}
# Match entries
join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${2}") -t$'\t' -o ${join_as_fields1} | sort | uniq > "${1}_gtdb"
mv "${1}_gtdb" "${1}"
count_lines_file "${1}"
}
add_rank_ncbi(){ # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as first col, ${3} ncbi_tax file - return number of lines
# rankedlineage.dmp cols (sep tab|tab):
# $1=taxid, $3=name, $5=species, $7=genus, $9=family, $11=order, $13=class, $15=phylum, $17=kingdom, $19=superkingdom
if [[ -z "${top_assemblies_rank}" ]]; then
# Repeat leaf taxid
awk 'BEGIN{FS=OFS="\t"}{print $6,$0}' "${1}" > "${2}"
elif [[ "${top_assemblies_rank}" == "species" ]]; then
# Repeat species taxid
awk 'BEGIN{FS=OFS="\t"}{print $7,$0}' "${1}" > "${2}"
else
# export taxid <tab> ranked name
tmp_ranked_taxids=$(tmp_file "ranked_taxids.tmp")
awk -v rank="${top_assemblies_rank}" 'BEGIN{
FS=OFS="\t";
r["genus"]=7;
r["family"]=9;
r["order"]=11;
r["class"]=13;
r["phylum"]=15;
r["superkingdom"]=19;
}{
print $1, $r[rank] ? $r[rank] : $1;
}' "${3}" > "${tmp_ranked_taxids}"
# Join ranked name by taxid col
join -1 6 -2 1 <(sort -k 6,6 "${1}") <(sort -k 1,1 "${tmp_ranked_taxids}") -t$'\t' -o "2.2,${join_as_fields1}" > "${2}"
rm -f "${tmp_ranked_taxids}"
fi
count_lines_file "${2}"
}
add_rank_gtdb(){ # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as first col, ${3} gtdb_tax file - return number of lines
# gtdb taxonomy (RS_ and GB_ already stripped)
# accession.version <tab> d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales;f__Staphylococcaceae;g__Staphylococcus;s__Staphylococcus aureus
# export accession <tab> ranked name
#if top_assemblies_rank empty, default to species (leaves on gtdb)
tmp_ranked_accessions=$(tmp_file "ranked_accessions.tmp")
cat "${3}" | tr ';' '\t' | awk -v rank="${top_assemblies_rank:-species}" 'BEGIN{
FS=OFS="\t";
r["species"]=8;
r["genus"]=7;
r["family"]=6;
r["order"]=5;
r["class"]=4;
r["phylum"]=3;
r["superkingdom"]=2;
}{
print $1, $r[rank] ? $r[rank] : $1;
}' > "${tmp_ranked_accessions}"
# Join ranked taxid by accession
join -1 1 -2 1 <(sort -k 1,1 "${1}") <(sort -k 1,1 "${tmp_ranked_accessions}") -t$'\t' -o "2.2,${join_as_fields1}" > "${2}"
rm -f "${tmp_ranked_accessions}"
count_lines_file "${2}"
}
filter_top_assemblies() # parameter: ${1} assembly_summary file, ${2} modified assembly_summary file with rank as first col - return number of lines
{
# First col contains rank info (all other get shifted with +1)
awk -v taxcol="1" 'BEGIN{
FS=OFS="\t";
col5["reference genome"]=1;
col5["na"]=2;
col12["Complete Genome"]=1;
col12["Chromosome"]=2;
col12["Scaffold"]=3;
col12["Contig"]=4;
col22["assembly from type material"]=1;
col22["assembly from synonym type material"]=2;
col22["assembly from pathotype material"]=3;
col22["assembly designated as neotype"]=4;
col22["assembly designated as reftype"]=5;
col22["ICTV species exemplar"]=6;
col22["ICTV additional isolate"]=7;
max_val=9;
}{
gsub("/","",$(15+1));
print $(1+1), $taxcol, $(5+1) in col5 ? col5[$(5+1)] : max_val, $(12+1) in col12 ? col12[$(12+1)] : max_val, $(22+1) in col22 ? col22[$(22+1)] : max_val, $(15+1);
}' "${2}" | sort -t$'\t' -k 2,2 -k 3,3 -k 4,4 -k 5,5 -k 6nr,6 -k 1,1 | awk -v top="${top_assemblies_num}" 'BEGIN{FS=OFS="\t"}{if(cnt[$2]<top){print $1;cnt[$2]+=1}}' > "${2}_top_acc"
join <(sort -k 1,1 "${2}_top_acc") <(sort -k 1,1 "${1}") -t$'\t' -o ${join_as_fields2} > "${1}_top"
mv "${1}_top" "${1}"
rm "${2}_top_acc"
count_lines_file "${1}"
}
list_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url], ${3} extensions - returns assembly accession, url and filename (for all selected extensions)
{
# Given an url returns the url and the filename for all extensions
for extension in ${3//,/ }
do
cut --fields="${2}" ${1} | awk -F "\t" -v ext="${extension}" '{url_count=split($2,url,"/"); print $1 "\t" $2 "\t" url[url_count] "_" ext}'
done
}
tmp_file(){ # parameter: ${1} filename - return full path of created file
f="${working_dir}${1}"
rm -f "${f}"
touch "${f}"
echo "${f}"
}
print_progress() # parameter: ${1} file number, ${2} total number of files
{
if [ "${silent_progress}" -eq 1 ] || [ "${silent}" -eq 0 ] ; then
printf "%5d/%d - " ${1} ${2}
printf "%2.2f%%\r" $(bc -l <<< "scale=4;(${1}/${2})*100")
fi
}
export -f print_progress #export it to be accessible to the parallel call
check_file_folder() # parameter: ${1} url, ${2} log (0->before download/1->after download) - returns 0 (ok) / 1 (error)
{
file_name=$(basename ${1})
path_name="${target_output_prefix}$(path_output ${file_name})${file_name}"
# Check if file exists and if it has a size greater than zero (-s)
if [ ! -s "${path_name}" ]; then
if [ "${2}" -eq 1 ]; then echolog "${file_name} download failed [${1}]" "0"; fi
# Remove file if exists (only zero-sized files)
rm -vf "${path_name}" >> "${log_file}" 2>&1
return 1
else
if [ "${verbose_log}" -eq 1 ]; then
if [ "${2}" -eq 0 ]; then
echolog "${file_name} file found on the output folder [${path_name}]" "0"
else
echolog "${file_name} downloaded successfully [${1} -> ${path_name}]" "0"
fi
fi
return 0
fi
}
export -f check_file_folder #export it to be accessible to the parallel call
check_md5_ftp() # parameter: ${1} url - returns 0 (ok) / 1 (error)
{
if [ "${check_md5}" -eq 1 ]; then # Only if md5 checking is activated
md5checksums_url="$(dirname ${1})/md5checksums.txt" # ftp directory
file_name=$(basename ${1}) # downloaded file name
md5checksums_file=$(download_url "${md5checksums_url}")
if [ -z "${md5checksums_file}" ]; then
echolog "${file_name} MD5checksum file download failed [${md5checksums_url}] - FILE KEPT" "0"
return 0
else
ftp_md5=$(echo "${md5checksums_file}" | grep "${file_name}" | cut -f1 -d' ')
if [ -z "${ftp_md5}" ]; then
echolog "${file_name} MD5checksum file not available [${md5checksums_url}] - FILE KEPT" "0"
return 1
else
path_name="${target_output_prefix}$(path_output ${file_name})${file_name}" # local file path and name
file_md5=$(md5sum ${path_name} | cut -f1 -d' ')
if [ "${file_md5}" != "${ftp_md5}" ]; then
echolog "${file_name} MD5 not matching [${md5checksums_url}] - FILE REMOVED" "0"
# Remove file only when MD5 doesn't match
rm -v "${path_name}" >> ${log_file} 2>&1
return 1
else
if [ "${verbose_log}" -eq 1 ]; then
echolog "${file_name} MD5 successfully checked ${file_md5} [${md5checksums_url}]" "0"
fi
return 0
fi
fi
fi
else
return 0
fi
}
export -f check_md5_ftp #export it to be accessible to the parallel call
download() # parameter: ${1} url, ${2} job number, ${3} total files, ${4} url_success_download (append)
{
ex=0
dl=0
if ! check_file_folder ${1} "0"; then # Check if the file is already on the output folder (avoid redundant download)
dl=1
elif ! check_md5_ftp ${1}; then # Check if the file already on folder has matching md5
dl=1
fi
if [ "${dl}" -eq 1 ]; then # If file is not yet on folder, download it
path_out="${target_output_prefix}$(path_output ${1})"
mkdir -p "${path_out}"
download_url "${1}" "${path_out}"
if ! check_file_folder ${1} "1"; then # Check if file was downloaded
ex=1
elif ! check_md5_ftp ${1}; then # Check file md5
ex=1
fi
fi
print_progress ${2} ${3}
if [ "${ex}" -eq 0 ]; then
echo ${1} >> ${4}
fi
}
export -f download
download_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] or field [url,filename], ${3} extension
{
url_list_download=$(tmp_file "url_list_download.tmp") #Temporary url list of files to download in this call
# sort files to get all files for the same entry in sequence, in case of failure
if [ -z ${3:-} ]; then #direct download (url+file)
cut --fields="${2}" ${1} | tr '\t' '/' | sort > "${url_list_download}"
else
list_files ${1} ${2} ${3} | cut -f 2,3 | tr '\t' '/' | sort > "${url_list_download}"
fi
total_files=$(count_lines_file "${url_list_download}")
url_success_download=$(tmp_file "url_success_download.tmp") #Temporary url list of downloaded files
# Retry download in batches
for (( att=1; att<=${retry_download_batch}; att++ )); do
if [ "${att}" -gt 1 ]; then
echolog " - Failed download - ${failed_count} files. Trying again #${att}" "1"
# Make a new list to download without entres already successfuly downloaded
join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 > "${url_list_download}_2"
mv "${url_list_download}_2" "${url_list_download}"
total_to_download=$(count_lines_file "${url_list_download}")
else
total_to_download=${total_files}
fi
# send url, job number and total files (to print progress)
# successfuly files are appended to the $url_success_download
parallel --gnu --tmpdir ${working_dir} -a ${url_list_download} -j ${threads} download "{}" "{#}" "${total_to_download}" "${url_success_download}"
downloaded_count=$(count_lines_file "${url_success_download}")
failed_count=$(( total_files - downloaded_count ))
echolog " - $(( total_files-failed_count ))/${total_files} files successfully downloaded" "1"
# If no failures, break
if [ "${failed_count}" -eq 0 ]; then
break;
fi
done
# Output URL reports
if [ "${url_list}" -eq 1 ]; then
# add left overs of the list to the failed urls
join <(sort "${url_list_download}") <(sort "${url_success_download}") -v 1 >> "${target_output_prefix}${timestamp}_url_failed.txt"
# add successful downloads the the downloaded urls
cat "${url_success_download}" >> "${target_output_prefix}${timestamp}_url_downloaded.txt"
fi
rm -f ${url_list_download} ${url_success_download}
}
remove_files() # parameter: ${1} file, ${2} fields [assembly_accesion,url] OR field [filename], ${3} extension - returns number of deleted files
{
if [ -z ${3:-} ]; then
# direct remove (filename)
filelist=$(cut --fields="${2}" ${1});
else
# generate files
filelist=$(list_files ${1} ${2} ${3} | cut -f 3);
fi
deleted_files=0
while read f; do
path_name="${target_output_prefix}$(path_output ${f})${f}"
# Only delete if delete option is enable or if it's a symbolic link (from updates)
if [[ -L "${path_name}" || "${delete_extra_files}" -eq 1 ]]; then
rm "${path_name}" -v >> ${log_file}
deleted_files=$((deleted_files + 1))
else
echolog "kept '${path_name}'" "0"
fi
done <<< "${filelist}"
echo ${deleted_files}
}
check_missing_files() # ${1} file, ${2} fields [assembly_accesion,url], ${3} extension - returns assembly accession, url and filename
{
join -1 3 -2 1 <(list_files ${1} ${2} ${3} | sort -k 3,3 -t$'\t') <(list_local_files "${target_output_prefix}" | sort) -t$'\t' -v 1 -o "1.1,1.2,1.3"
}
check_complete_record() # parameters: ${1} file, ${2} field [assembly accession, url], ${3} extension - returns assembly accession, url
{
expected_files=$(list_files ${1} ${2} ${3} | sort -k 3,3)
join -1 3 -2 1 <(echo "${expected_files}" | sort -k 3,3) <(list_local_files "${target_output_prefix}" | sort) -t$'\t' -o "1.1" -v 1 | sort | uniq | # Check for accessions with at least one missing file
join -1 1 -2 1 <(echo "${expected_files}" | cut -f 1,2 | sort | uniq) - -t$'\t' -v 1 # Extract just assembly accession and url for complete entries (no missing files)
}
output_assembly_accession() # parameters: ${1} file, ${2} field [assembly accession, url], ${3} extension, ${4} mode (A/R) - returns assembly accession, url and mode
{
check_complete_record ${1} ${2} ${3} | sed "s/^/${4}\t/" # add mode
}
output_sequence_accession() # parameters: ${1} file, ${2} field [assembly accession, url], ${3} extension, ${4} mode (A/R), ${5} assembly_summary (for taxid)
{
join <(list_files ${1} ${2} "assembly_report.txt" | sort -k 1,1) <(check_complete_record ${1} ${2} ${3} | sort -k 1,1) -t$'\t' -o "1.1,1.3" | # List assembly accession and filename for all assembly_report.txt with complete record (no missing files) - returns assembly accesion, filename
join - <(sort -k 1,1 ${5}) -t$'\t' -o "1.1,1.2,2.6" | # Get taxid {1} assembly accession, {2} filename {3} taxid
parallel --tmpdir ${working_dir} --colsep "\t" -j ${threads} -k 'grep "^[^#]" "${target_output_prefix}$(path_output {2}){2}" | tr -d "\r" | cut -f 5,7,9 | sed "s/^/{1}\\t/" | sed "s/$/\\t{3}/"' | # Retrieve info from assembly_report.txt and add assemby accession in the beggining and taxid at the end
sed "s/^/${4}\t/" # Add mode A/R at the end
}
exit_status() # parameters: ${1} # expected files, ${2} # current files
{
if [[ ${conditional_exit} =~ ^[+-]?[0-9]*$ ]] ; then # INTEGER
if [[ ${conditional_exit} -eq 0 ]] ; then # Condition off
return 0
elif [[ $(( $1-$2 )) -ge ${conditional_exit} ]] ; then
return 1
else
return 0
fi
elif [[ ${conditional_exit} =~ ^[+-]?[0-9]+\.?[0-9]*$ ]] ; then # FLOAT
if (( $(echo "((${1}-${2})/${1}) >= ${conditional_exit} " | bc -l) )); then
return 1
else
return 0
fi
elif [ $1 -gt 0 ] && [ $2 -eq 0 ]; then # all failed
return 1
else
return 0
fi
}
echolog() # parameters: ${1} text, ${2} STDOUT (0->no/1->yes)
{
if [[ "${2:-0}" -eq "1" ]] && [ "${silent}" -eq 0 ]; then
echo "${1}" # STDOUT
fi
echo "${1}" >> "${log_file}" # LOG
}
export -f echolog #export it to be accessible to the parallel call
print_debug() # parameters: ${1} tools
{
echo "========================================================";
echo "genome_updater version ${version}"
echo "========================================================";
bash --version
echo "========================================================";
locale
for t in "${tools[@]}"
do
echo "========================================================";
tool=$(command -v "${t}");
echo "${t} => ${tool}";
echo "========================================================";
${tool} --version;
done
echo "========================================================";
}
function print_logo {
echo "┌─┐┌─┐┌┐┌┌─┐┌┬┐┌─┐ ┬ ┬┌─┐┌┬┐┌─┐┌┬┐┌─┐┬─┐";
echo "│ ┬├┤ ││││ ││││├┤ │ │├─┘ ││├─┤ │ ├┤ ├┬┘";
echo "└─┘└─┘┘└┘└─┘┴ ┴└─┘────└─┘┴ ─┴┘┴ ┴ ┴ └─┘┴└─";
echo " v${version} ";
}
function print_line {
echo "-------------------------------------------"
}
function showhelp {
echo
print_logo
echo
echo $'Database options:'
echo $' -d Database (comma-separated entries)\n\t[genbank, refseq]'
echo
echo $'Organism options:'
echo $' -g Organism group(s) (comma-separated entries, empty for all)\n\t[archaea, bacteria, fungi, human, invertebrate, metagenomes, \n\tother, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]\n\tDefault: ""'
echo $' -T Taxonomic identifier(s) (comma-separated entries, empty for all).\n\tExample: "562" (for -M ncbi) or "s__Escherichia coli" (for -M gtdb)\n\tDefault: ""'
echo
echo $'File options:'
echo $' -f file type(s) (comma-separated entries)\n\t[genomic.fna.gz, assembly_report.txt, protein.faa.gz, genomic.gbff.gz]\n\tMore formats at https://ftp.ncbi.nlm.nih.gov/genomes/all/README.txt\n\tDefault: assembly_report.txt'
echo
echo $'Filter options:'
echo $' -c refseq category (comma-separated entries, empty for all)\n\t[reference genome, na]\n\tDefault: ""'
echo $' -l assembly level (comma-separated entries, empty for all)\n\t[complete genome, chromosome, scaffold, contig]\n\tDefault: ""'
echo $' -D Start date (>=), based on the sequence release date. Format YYYYMMDD.\n\tDefault: ""'
echo $' -E End date (<=), based on the sequence release date. Format YYYYMMDD.\n\tDefault: ""'
echo $' -F custom filter for the assembly summary in the format colA:val1|colB:valX,valY (case insensitive).\n\tExample: -F "2:PRJNA12377,PRJNA670754|14:Partial" (AND between cols, OR between values)\n\tColumn info at https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt\n\tDefault: ""'
echo
echo $'Taxonomy options:'
echo $' -M Taxonomy. gtdb keeps only assemblies in GTDB (latest). ncbi keeps only latest assemblies (version_status). \n\t[ncbi, gtdb]\n\tDefault: "ncbi"'
echo $' -A Keep a limited number of assemblies for each selected taxa (leaf nodes). 0 for all. \n\tSelection by ranks are also supported with rank:number (e.g genus:3)\n\t[species, genus, family, order, class, phylum, kingdom, superkingdom]\n\tSelection order based on: RefSeq Category, Assembly level, Relation to type material, Date.\n\tDefault: 0'
echo $' -a Keep the current version of the taxonomy database in the output folder'
echo
echo $'Run options:'
echo $' -o Output/Working directory \n\tDefault: ./tmp.XXXXXXXXXX'
echo $' -t Threads to parallelize download and some file operations\n\tDefault: 1'
echo $' -k Dry-run mode. No sequence data is downloaded or updated - just checks for available sequences and changes'
echo $' -i Fix only mode. Re-downloads incomplete or failed data from a previous run. Can also be used to change files (-f).'
echo $' -m Check MD5 of downloaded files'
echo
echo $'Report options:'
echo $' -u Updated assembly accessions report\n\t(Added/Removed, assembly accession, url)'
echo $' -r Updated sequence accessions report\n\t(Added/Removed, assembly accession, genbank accession, refseq accession, sequence length, taxid)\n\tOnly available when file format assembly_report.txt is selected and successfully downloaded'
echo $' -p Reports URLs successfuly downloaded and failed (url_failed.txt url_downloaded.txt)'
echo
echo $'Misc. options:'
echo $' -b Version label\n\tDefault: current timestamp (YYYY-MM-DD_HH-MM-SS)'
echo $' -e External "assembly_summary.txt" file to recover data from. Mutually exclusive with -d / -g \n\tDefault: ""'
echo $' -B Alternative version label to use as the current version. Mutually exclusive with -i.\n\tCan be used to rollback to an older version or to create multiple branches from a base version.\n\tDefault: ""'
echo $' -R Number of attempts to retry to download files in batches \n\tDefault: 3'
echo $' -n Conditional exit status based on number of failures accepted, otherwise will Exit Code = 1.\n\tExample: -n 10 will exit code 1 if 10 or more files failed to download\n\t[integer for file number, float for percentage, 0 = off]\n\tDefault: 0'
echo $' -N Output files in folders like NCBI ftp structure (e.g. files/GCF/000/499/605/GCF_000499605.1_EMW001_assembly_report.txt)'
echo $' -L Downloader\n\t[wget, curl]\n\tDefault: wget'
echo $' -x Allow the deletion of regular extra files (not symbolic links) found in the output folder'
echo $' -s Silent output'
echo $' -w Silent output with download progress only'
echo $' -V Verbose log'
echo $' -Z Print debug information and run in debug mode'
echo
}
# Defaults
database=""
organism_group=""
taxids=""
refseq_category=""
assembly_level=""
custom_filter=""
file_formats="assembly_report.txt"
top_assemblies=0
date_start=""
date_end=""
tax_mode="ncbi"
download_taxonomy=0
delete_extra_files=0
check_md5=0
updated_assembly_accession=0
updated_sequence_accession=0
url_list=0
dry_run=0
just_fix=0
conditional_exit=0
ncbi_folders=0
silent=0
silent_progress=0
debug_mode=0
working_dir=""
external_assembly_summary=""
retry_download_batch=3
label=""
rollback_label=""
threads=1
verbose_log=0
downloader_tool="wget"
# Check for required tools
tool_not_found=0
tools=( "awk" "bc" "find" "join" "md5sum" "parallel" "sed" "tar" "wget" )
for t in "${tools[@]}"
do
if [ ! -x "$(command -v ${t})" ]; then
echo "${t} not found";
tool_not_found=1;
fi
done
if [ "${tool_not_found}" -eq 1 ]; then exit 1; fi
# Parse -o and -B first to detect possible updates
getopts_list="aA:b:B:c:d:D:e:E:f:F:g:hikl:L:mM:n:No:prR:st:T:uVwxZ"
OPTIND=1 # Reset getopts
# Parses working_dir from "$@"
while getopts "${getopts_list}" opt; do
case ${opt} in
o) working_dir=${OPTARG} ;;
B) rollback_label=${OPTARG} ;;
\?) echo "Invalid options" >&2; exit 1 ;;
:) echo "Option -$OPTARG requires an argument." >&2; exit 1 ;;
esac
done
# If workingdir exists and there's a history file, grab and inject params
if [[ ! -z "${working_dir}" && -s "${working_dir}/history.tsv" ]]; then
if [[ ! -z "${rollback_label}" ]]; then
# If rolling back, get specific parameters of that version
rollback_assembly_summary="${working_dir}/${rollback_label}/assembly_summary.txt"
if [[ -f "${rollback_assembly_summary}" ]]; then
declare -a "args=($(awk -F '\t' '$2 == "'${rollback_label}'"' "${working_dir}/history.tsv" | cut -f 5))"
else
echo "Rollback label/assembly_summary.txt not found ["${rollback_assembly_summary}"]"; exit 1
fi
else
# Parse arguments into associative array
# automatically detecting and replacing the escaped non-printable characters (e.g.: complete\ genome)
declare -a "args=($(cut -f 5 "${working_dir}/history.tsv" | tail -n 1))"
fi
# For each entry of the current argument list $@
# add to the end of the array to have priority
c=${#args[@]}
for f in "$@"; do
args[$c]="${f}"
c=$((c+1))
done
else
# parse command line arguments by default
declare -a "args=($( printf "%q " "$@" ))"
fi
declare -A new_args
bool_args=""
OPTIND=1 # Reset getopts
while getopts "${getopts_list}" opt "${args[@]}"; do
case ${opt} in
a) download_taxonomy=1 ;;
A) top_assemblies=${OPTARG} ;;
b) label=${OPTARG} ;;
B) rollback_label=${OPTARG} ;;
c) refseq_category=${OPTARG} ;;
d) database=${OPTARG} ;;
D) date_start=${OPTARG} ;;
e) external_assembly_summary=${OPTARG} ;;
E) date_end=${OPTARG} ;;
f) file_formats=${OPTARG// } ;; #remove spaces
F) custom_filter=${OPTARG} ;;
g) organism_group=${OPTARG// } ;; #remove spaces
h) showhelp; exit 0 ;;
i) just_fix=1 ;;
k) dry_run=1 ;;
l) assembly_level=${OPTARG} ;;
L) downloader_tool=${OPTARG} ;;
m) check_md5=1 ;;
M) tax_mode=${OPTARG} ;;
n) conditional_exit=${OPTARG} ;;
N) ncbi_folders=1 ;;
o) working_dir=${OPTARG} ;;
p) url_list=1 ;;
r) updated_sequence_accession=1 ;;
R) retry_download_batch=${OPTARG} ;;
s) silent=1 ;;
t) threads=${OPTARG} ;;
T) taxids=${OPTARG} ;;
u) updated_assembly_accession=1 ;;
V) verbose_log=1 ;;
w) silent_progress=1 ;;
x) delete_extra_files=1 ;;
Z) debug_mode=1 ;;
\?) echo "Invalid options" >&2; exit 1 ;;
:) echo "Option -$OPTARG requires an argument." >&2; exit 1 ;;
esac
# Colect parsed args in an associative array for each opt
# the args added later have precedence