diff --git a/CHANGELOG.md b/CHANGELOG.md index d706f7ca..2fd6cb6e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [2.1.0dev] - 2024-0X-0X : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.1.X +## [2.2.Xdev] - 2024-0X-XX : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.2.X ### Credits -Code contributions to the hotfix: +Code contributions to the new version: ### Template fixes and updates @@ -44,6 +44,50 @@ Code contributions to the hotfix: ### Requirements +## [2.1.0] - 2024-04-19 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.1.0 + +### Credits + +Code contributions to the new version: +- [Sarai Varona](https://github.com/svarona) +- [Pablo Mata](https://github.com/Shettland) +- [Daniel Valle](https://github.com/Daniel-VM) + +### Template fixes and updates + +- Added blast_nt template to services.json [#208](https://github.com/BU-ISCIII/buisciii-tools/pull/208) +- Included new user to sftp_user.json +- Included a missing sed inside IRMA's 04-irma/lablog [#213](https://github.com/BU-ISCIII/buisciii-tools/pull/213) +- Changed singularity mount options in Viralrecon template to fix errors with Nextflow v23.10.0 +- excel_generator.py reverted to last state, now lineage tables are merged when argument -l is given +- Adapted viralrecon_results lablog to new excel_generator.py argument +- IRMA/RESULTS now creates a summary of the different types of flu found in irma_stats.txt +- Updated IRMA to v1.1.4 date 02-2024 and reduced threads to 16 +- IRMA 04-irma/lablog now creates B and C dirs only if those flu-types are present +- Fixed characterization template [#220](https://github.com/BU-ISCIII/buisciii-tools/pull/220) +- Created Chewbbaca template 
[#230](https://github.com/BU-ISCIII/buisciii-tools/pull/230) + +### Modules + +#### Added enhancements + +- [#207](https://github.com/BU-ISCIII/buisciii-tools/pull/207) - Bioinfo-doc updates: email password can be given in buisciii_config.yml and delivery notes in a text file + +#### Fixes + +- Added missing url for service assembly_annotation in module list +- Autoclean-sftp refined folder name parsing with regex label adjustment +- Autoclean_sftp does not crash anymore. New argument from 'utils.prompt_yn_question()' in v2.0.0 was missing: 'dflt' +- Bioinfo-doc now sends email correctly to multiple CCs + +#### Changed + +#### Removed + +- Removed empty strings from services.json + +### Requirements + ## [2.0.0] - 2024-03-01 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.0.0 ### Credits diff --git a/README.md b/README.md index 96310623..9d597a7c 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ Output: ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Service name ┃ Description ┃ Github ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ assembly_annotation │ Nextflow assembly pipeline to assemble │ │ +│ assembly_annotation │ Nextflow assembly pipeline to assemble │ https://github.com/Daniel-VM/bacass/... 
│ │ │ bacterial genomes │ │ │ mtbseq_assembly │ Mycobacterium tuberculosis mapping, │ https://github.com/ngs-fzb/MTBseq_source │ │ │ variant calling and detection of │ │ diff --git a/bu_isciii/__main__.py b/bu_isciii/__main__.py old mode 100644 new mode 100755 index 9dcbbb25..50be620b --- a/bu_isciii/__main__.py +++ b/bu_isciii/__main__.py @@ -55,7 +55,7 @@ def run_bu_isciii(): ) # stderr.print("[green] `._,._,'\n", highlight=False) - __version__ = "1.0.1" + __version__ = "2.1.0" stderr.print( "[grey39] BU-ISCIII-tools version {}".format(__version__), highlight=False ) @@ -507,6 +507,7 @@ def bioinfo_doc( """ Create the folder documentation structure in bioinfo_doc server """ + email_pass = email_psswd if email_psswd else ctx.obj.get("email_password") new_doc = bu_isciii.bioinfo_doc.BioinfoDoc( type, resolution, @@ -517,7 +518,7 @@ def bioinfo_doc( results_md, ctx.obj["api_user"], ctx.obj["api_password"], - email_psswd, + email_pass, ) new_doc.create_documentation() @@ -564,6 +565,7 @@ def bioinfo_doc( default=None, help="Tsv output path + filename with archive stats and info", ) +@click.pass_context def archive( ctx, service_id, diff --git a/bu_isciii/autoclean_sftp.py b/bu_isciii/autoclean_sftp.py old mode 100644 new mode 100755 index 274a1515..43d2e76d --- a/bu_isciii/autoclean_sftp.py +++ b/bu_isciii/autoclean_sftp.py @@ -68,7 +68,9 @@ class AutoremoveSftpService: def __init__(self, path=None, days=14): # Parse input path if path is None: - use_default = bu_isciii.utils.prompt_yn_question("Use default path?: ") + use_default = bu_isciii.utils.prompt_yn_question( + "Use default path?: ", dflt=False + ) if use_default: data_path = bu_isciii.config_json.ConfigJson().get_configuration( "global" @@ -107,7 +109,7 @@ def check_path_exists(self): def get_sftp_services(self): self.sftp_services = {} # {sftp-service_path : last_update} service_pattern = ( - r"^[SRV][A-Z]+[0-9]+_\d{8}_[A-Z0-9]+_[a-zA-Z]+(?:\.[a-zA-Z]+)?_[a-zA-Z]$" + 
r"^[SRV][A-Z]+[0-9]+_\d{8}_[A-Z0-9.-]+_[a-zA-Z]+(?:\.[a-zA-Z]+)?_[a-zA-Z]$" ) stderr.print("[blue]Scanning " + self.path + "...") @@ -149,7 +151,9 @@ def remove_oldservice(self): "The following services are going to be deleted from the sftp:\n" + service_elements ) - confirm_sftp_delete = bu_isciii.utils.prompt_yn_question("Are you sure?: ") + confirm_sftp_delete = bu_isciii.utils.prompt_yn_question( + "Are you sure?: ", dflt=False + ) if confirm_sftp_delete: for service in self.marked_services: try: diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py old mode 100644 new mode 100755 index f6385457..52e72dfc --- a/bu_isciii/bioinfo_doc.py +++ b/bu_isciii/bioinfo_doc.py @@ -262,13 +262,34 @@ def create_structure(self): return def post_delivery_info(self): - delivery_notes = bu_isciii.utils.ask_for_some_text( - msg="Write some delivery notes:" - ) + if bu_isciii.utils.prompt_yn_question( + msg="Do you wish to provide a text file for delivery notes?", dflt=False + ): + for i in range(3, -1, -1): + self.provided_txt = bu_isciii.utils.prompt_path( + msg="Write the path to the file with RAW text as delivery notes" + ) + if not os.path.isfile(os.path.expanduser(self.provided_txt)): + stderr.print(f"Provided file doesn't exist. Attempts left: {i}") + else: + stderr.print(f"File selected: {self.provided_txt}") + break + else: + stderr.print("No more attempts. 
Delivery notes will be given by prompt") + self.provided_txt = None + else: + self.provided_txt = None + if self.provided_txt: + with open(os.path.expanduser(self.provided_txt)) as f: + self.delivery_notes = " ".join([x.strip() for x in f.readlines()]) + else: + self.delivery_notes = bu_isciii.utils.ask_for_some_text( + msg="Write some delivery notes:" + ) delivery_dict = { "resolution_number": self.resolution_id, - "delivery_notes": delivery_notes, + "delivery_notes": self.delivery_notes, } # How json should be fully formatted: @@ -568,9 +589,15 @@ def email_creation(self): if bu_isciii.utils.prompt_yn_question( "Do you want to add some delivery notes to the e-mail?", dflt=False ): - email_data["email_notes"] = bu_isciii.utils.ask_for_some_text( - msg="Write email notes" - ) + if self.provided_txt: + if bu_isciii.utils.prompt_yn_question( + f"Do you want to use notes from {self.provided_txt}?", dflt=False + ): + email_data["email_notes"] = self.delivery_notes + else: + email_data["email_notes"] = bu_isciii.utils.ask_for_some_text( + msg="Write email notes" + ) email_data["user_data"] = self.resolution_info["service_user_id"] email_data["service_id"] = self.service_name.split("_", 5)[0] @@ -604,7 +631,7 @@ def send_email(self, html_text, results_pdf_file): server.login(user=email_host_user, password=email_host_password) except Exception as e: stderr.print("[red] Unable to send e-mail" + e) - + default_cc = "bioinformatica@isciii.es" msg = MIMEMultipart("alternative") msg["To"] = self.resolution_info["service_user_id"]["email"] msg["From"] = email_host_user @@ -617,18 +644,21 @@ def send_email(self, html_text, results_pdf_file): + self.service_name.split("_", 5)[2] ) if bu_isciii.utils.prompt_yn_question( - "Do you want to add any other sender? appart from " - + self.resolution_info["service_user_id"]["email"], + "Do you want to add any other sender? apart from %s. Note: %s is the default CC." 
+ % (self.resolution_info["service_user_id"]["email"], default_cc), dflt=False, ): stderr.print( - "[red] Write emails to be added in semicolon separated format: bioinformatica@isciii.es;icuesta@isciii.es" + "[red] Write emails to be added in semicolon separated format: icuesta@isciii.es;user2@isciii.es" ) - msg["CC"] = bu_isciii.utils.ask_for_some_text(msg="E-mails:") - rcpt = msg["CC"].split(";") + [msg["To"]] + cc_address = bu_isciii.utils.ask_for_some_text(msg="E-mails:") else: - rcpt = self.resolution_info["service_user_id"]["email"] - + cc_address = str() + if cc_address: + msg["CC"] = str(default_cc + ";" + str(cc_address)) + else: + msg["CC"] = default_cc + rcpt = msg["CC"].split(";") + [msg["To"]] html = MIMEText(html_text, "html") msg.attach(html) with open(results_pdf_file, "rb") as f: @@ -639,7 +669,6 @@ def send_email(self, html_text, results_pdf_file): filename=str(os.path.basename(results_pdf_file)), ) msg.attach(attach) - server.sendmail( email_host_user, rcpt, diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog index 540640fe..43af890a 100755 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -5,7 +5,7 @@ mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} &"; done > _01_irma.sh +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 16 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd-202402/IRMA FLU_AD 
../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} &"; done > _01_irma.sh echo 'bash create_irma_stats.sh' > _02_create_stats.sh @@ -13,9 +13,9 @@ echo "ls */*HA*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | sort -u | cut -d '_' - echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh -echo "mkdir B" >> _03_post_processing.sh +echo "if grep -qw 'B__' irma_stats.txt; then mkdir B; fi" >> _03_post_processing.sh -echo "mkdir C" >> _03_post_processing.sh +echo "if grep -qw 'C__' irma_stats.txt; then mkdir C; fi" >> _03_post_processing.sh echo "ls */*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,2 | sort -u | grep 'A_' > A_fragment_list.txt" >> _03_post_processing.sh @@ -29,7 +29,7 @@ echo 'grep -w 'B__' irma_stats.txt | cut -f1 | while read sample; do cat B_fragm echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragment_list.txt | while read fragment; do if test -f ${sample}/${fragment}*.fasta; then cat ${sample}/${fragment}*.fasta | sed "s/^>/\>${sample}_/g" | sed s/_H1//g | sed s/_H3//g | sed s/_N1//g | sed s/_N2//g | sed s@-@/@g | sed s/_C_/_/g ; fi >> C/${fragment}.txt; done; done' >> _03_post_processing.sh -echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh +echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh -echo 'sed -i "s/__//g" irma_stats.txt' >> _03_post_processing.sh -echo 'sed -i "s/_\t/\t/g" irma_stats.txt' >> _03_post_processing.sh \ No newline at end of file +echo 
'sed "s/__//g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh +echo 'sed -i "s/_\t/\t/g" clean_irma_stats.txt' >> _03_post_processing.sh \ No newline at end of file diff --git a/bu_isciii/templates/IRMA/RESULTS/irma_results b/bu_isciii/templates/IRMA/RESULTS/irma_results index a2a5bb33..eee33aa6 100755 --- a/bu_isciii/templates/IRMA/RESULTS/irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/irma_results @@ -7,4 +7,5 @@ ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . -ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C . \ No newline at end of file +ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C . +tail -n +2 ../../ANALYSIS/*_FLU_IRMA/04-irma/clean_irma_stats.txt | cut -f4 | sort | uniq -c > flu_type_summary.txt \ No newline at end of file diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog index 83679f26..0c86d16d 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog @@ -1,6 +1,6 @@ # module load fastp # if assembly pipeline was performed first and the trimmed sequences were saved, this should work: -# cat ../samples_id | xargs -I mkdir @@; cd $_; ln -s ../../*/01-preprocessing/trimmed_sequences/@@*.gz @@; cd - +# cat ../samples_id.txt | xargs -I @@ mkdir @@; cd @@; ln -s ../../../*/01-processing/fastp/@@_1.fastp.fastq.gz ./@@_R1_filtered.fastq.gz; ln -s ../../../*/01-processing/fastp/@@_2.fastp.fastq.gz ./@@_R2_filtered.fastq.gz ; cd - # else: mkdir logs scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') diff --git 
a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog index c67caced..b942fba8 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog @@ -1,13 +1,16 @@ # conda activate ariba -# ARIBA runs local assembli/processing_Data/bioinformatics/services_and_colaborations/CNM/bacteriologia/20190821_QCASSEMBLT_s.gonzalez_T/RAW/fastqc_2/. mkdir logs scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') +downloaded_ref=$(find ../../../../REFERENCES/ -type d -name 'ref_db') # Cartesian product of the two files to avoid double looping join -j 2 ../../samples_id.txt ../databases.txt | sed 's/^ //g' > sample_database.txt # col 1 (arr[0]): sample # col 2 (arr[1]): database -cat sample_database.txt | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 ariba run /data/bi/references/ariba/20211216/${arr[1]}/out.${arr[1]}.prepareref ../../../*ASSEMBLY/01-preprocessing/trimmed_sequences/${arr[0]}_1.trim.fastq.gz ../../../*ASSEMBLY/01-preprocessing/trimmed_sequences/${arr[0]}_2.trim.fastq.gz ${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh -cat sample_database.txt | while read in; do arr=($in); echo "mv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/report.tsv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/${arr[0]}_${arr[1]}_report.tsv"; done > _02_fix_tsvreport.sh +cat sample_database.txt | grep -v 'pubmlst' | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition 
short_idx --time 02:00:00 ariba run /data/bi/references/ariba/20211216/${arr[1]}/out.${arr[1]}.prepareref ../../01-preprocessing/${arr[0]}/${arr[0]}_R1_filtered.fastq.gz ../../01-preprocessing/${arr[0]}/${arr[0]}_R2_filtered.fastq.gz ${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh + +cat ../../samples_id.txt | while read in; do echo "mkdir -p ${in}; srun --chdir $scratch_dir --output logs/ARIBA_${in}_pubmlst.%j.log --job-name ARIBA_${in}_pubmlst --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 ariba run ${downloaded_ref} ../../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in}/out_pubmlst_${in}_run &"; done >> _01_ariba.sh + +cat sample_database.txt | while read in; do arr=($in); echo "mv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/report.tsv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/${arr[0]}_${arr[1]}_report.tsv"; done > _02_fix_tsvreport.sh \ No newline at end of file diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog index 694f2c0a..be959a97 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog @@ -8,4 +8,4 @@ scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') # 1 - Use the ls in parenthesis to find the reports for a certain db, and xargs to make it into a single line # 2 - Integrate this into the ariba summary command -cat ../databases.txt | while read in; do echo "srun --chdir $scratch_dir --output logs/ARIBA_SUMMARY_${in}.log --job-name ARIBA_${in} --cpus-per-task 5 --mem 5G --partition short_idx --time 00:30:00 ariba summary --cluster_cols ref_seq,match out_summary_${in} $(ls ../run/*/out*_${in}*/*${in}*_report.tsv | xargs)"; done > 
_01_ariba_summary_prueba.sh +cat ../databases.txt | while read in; do echo "srun --chdir $scratch_dir --output logs/ARIBA_SUMMARY_${in}.log --job-name ARIBA_${in} --cpus-per-task 5 --mem 5G --partition short_idx --time 00:30:00 ariba summary --cluster_cols ref_seq,match out_summary_${in} $(ls ../run/*/out*_${in}*/*${in}*_report.tsv | xargs) &"; done > _01_ariba_summary_prueba.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog index 29bc7b29..9ab99ce9 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog @@ -2,3 +2,5 @@ python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_card.csv --database card --output_bn ariba_card.bn --output_csv ariba_card.csv python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_plasmidfinder.csv --database plasmidfinder --output_bn ariba_plasmidfinder.bn --output_csv ariba_plasmidfinder.csv python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_vfdb_full.csv --database vfdb_full --output_bn ariba_vfdb_full.bn --output_csv ariba_vfdb_full.csv + +paste <(echo "sample_id") <(cat ../02-ariba/run/*/out_pubmlst_*_run/mlst_report.tsv | head -n1) > ariba_mlst_full.tsv; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(tail -n1 ../02-ariba/run/${in}/out_pubmlst_${in}_run/mlst_report.tsv); done >> ariba_mlst_full.tsv \ No newline at end of file diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/lablog index dc6d483c..97055cdf 100644 --- 
a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/lablog @@ -1,6 +1,2 @@ ln -s ../samples_id.txt . ln -s ../00-reads . -mkdir 01-preprocessing -mkdir 02-srst2 -mkdir 03-ariba -mkdir 99-stats diff --git a/bu_isciii/templates/characterization/RAW/REFERENCES/README b/bu_isciii/templates/characterization/REFERENCES/README similarity index 100% rename from bu_isciii/templates/characterization/RAW/REFERENCES/README rename to bu_isciii/templates/characterization/REFERENCES/README diff --git a/bu_isciii/templates/characterization/REFERENCES/lablog b/bu_isciii/templates/characterization/REFERENCES/lablog new file mode 100644 index 00000000..ad9f4010 --- /dev/null +++ b/bu_isciii/templates/characterization/REFERENCES/lablog @@ -0,0 +1,44 @@ +# conda activate ariba + +mkdir logs + +scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') + +# Function to print colored text +print_color() { + case "$2" in + "red") + echo -e "\e[1;31m$1\e[0m" + ;; + "green") + echo -e "\e[1;32m$1\e[0m" + ;; + "blue") + echo -e "\e[1;34m$1\e[0m" + ;; + *) + echo "$1" + ;; + esac +} + +# Function to prompt with color +prompt_with_color() { + read -p "$(print_color $1 'blue') $2" response +} + +print_color "This will take some seconds to display, please wait" 'blue' + +# Select genome from PMLST +IFS=$'\n' +bacterial_options=( $(ariba pubmlstspecies | sed 's/^/"/g' | sed 's/$/"/g') ) +print_color "Indicate the preferred bacterial genome:" 'blue' +select BACTERIA in "${bacterial_options[@]}"; do + if [ -n "$BACTERIA" ]; then + print_color "Selected bacteria: $BACTERIA" 'green' + echo "srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name PUBMLSTGET --output logs/PUBMLSTGET.%j.log --partition short_idx --cpus-per-task 5 ariba pubmlstget $BACTERIA $(date '+%Y%m%d') &" > _01_download_pubmlst.sh + break + else + print_color "Invalid input. 
Please select a valid option." 'red' + fi +done \ No newline at end of file diff --git a/bu_isciii/templates/characterization/RESULTS/README b/bu_isciii/templates/characterization/RESULTS/README deleted file mode 100644 index 37dc4134..00000000 --- a/bu_isciii/templates/characterization/RESULTS/README +++ /dev/null @@ -1 +0,0 @@ -##Folder to hold relevant results of the service diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results index 323dc0dd..8879a843 100644 --- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results +++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results @@ -1,11 +1,12 @@ DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" -mkdir $DELIVERY_FOLDER +mkdir -p $DELIVERY_FOLDER mkdir "${DELIVERY_FOLDER}/characterization" # ARIBA characterization service cd $DELIVERY_FOLDER/characterization -ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/*.csv . -ln -s ../../../ANALYSIS/*CHARACTERIZATION/02-ariba/summary/*summary*.csv . -rm *phandango* +ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.tsv . +ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.csv . +find . 
-xtype l -delete + cd - diff --git a/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/02-chewbbaca/lablog b/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/02-chewbbaca/lablog new file mode 100644 index 00000000..d74a4f22 --- /dev/null +++ b/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/02-chewbbaca/lablog @@ -0,0 +1,60 @@ +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +# module load singularity + +mkdir logs + +# Function to print colored text +print_color() { + case "$2" in + "red") + echo -e "\e[1;31m$1\e[0m" + ;; + "green") + echo -e "\e[1;32m$1\e[0m" + ;; + "blue") + echo -e "\e[1;34m$1\e[0m" + ;; + *) + echo "$1" + ;; + esac +} + +# Function to prompt with color +prompt_with_color() { + read -p "$(print_color $1 'blue') $2" response +} + +while true; do + read -p "Is your schema already prepared for ChewBBACA? (yes/no)" yesno + case $yesno in + [Yy]* ) + print_color "Indicate the path to the schema:" 'blue' + read -e schema_path + print_color "Using schema from path: $schema_path" 'green' + + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-CALLING.%j.log --job-name CHEWBBACA-CALLING --cpus-per-task 20 --mem 20G --partition middle_idx --time 05:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py AlleleCall -i ../01-assemblies/ -g $schema_path -o allele_calling/ --cpu 20 &" > _01_chewbbaca_calling.sh + + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-ALLELECALL-EVALUATOR.%j.log --job-name CHEWBBACA-ALLELECALL-EVALUATOR --cpus-per-task 1 --mem 20G --partition middle_idx --time 05:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py AlleleCallEvaluator -i ./allele_calling/ -g $schema_path -o ./allele_calling_evaluation &" > _02_chewbacca_allelecall_evaluator.sh + + break + ;; + [Nn]* ) + print_color "Indicate the path to the schema:" 
'blue' + read -e schema_path + print_color "Using schema from path: $schema_path" 'green' + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-PREPARE-SCHEMA.%j.log --job-name CHEWBBACA-PREPARE-SCHEMA --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py PrepExternalSchema --schema-directory $schema_path --output-directory prep_schema &" > _01_prep_schema.sh + + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-SCHEMA-EVALUATOR.%j.log --job-name CHEWBBACA-SCHEMA-EVALUATOR --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py SchemaEvaluator --schema-directory $schema_path --output-directory analyze_schema &" > _02_analyze_schema.sh + + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-ALLELECALL.%j.log --job-name CHEWBBACA-ALLELECALL --cpus-per-task 4 --mem 64G --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py AlleleCall -i ../01-assemblies/ -g ./prep_schema/ -o ./allele_calling --cpu 4 &" > _03_allele_calling.sh + + echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-ALLELECALL-EVALUATOR.%j.log --job-name CHEWBBACA-ALLELECALL-EVALUATOR --cpus-per-task 1 --mem 20G --partition middle_idx --time 05:00:00 singularity exec --bind ${scratch_dir}/../../../ /scratch/bi/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py AlleleCallEvaluator -i ./allele_calling/ -g ./prep_schema/ -o ./allele_calling_evaluation &" > _04_chewbacca_allelecall_evaluator.sh + + break + ;; + * ) echo "Answer either yes or no!";; + esac +done \ No newline at end of file diff --git a/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/lablog b/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/lablog new file mode 100644 index 
00000000..bf94164a --- /dev/null +++ b/bu_isciii/templates/chewbbaca/ANALYSIS/ANALYSIS01_CHEWBBACA/lablog @@ -0,0 +1,6 @@ +ln -s ../samples_id.txt . +ln -s ../00-reads . + +mkdir 01-assemblies 03-grapetree + +cd 01-assemblies; echo "rsync -rlv ../../*ASSEMBLY*/03-assembly/unicycler/*.fasta.gz ." | bash; gunzip *; cd - diff --git a/bu_isciii/templates/chewbbaca/ANALYSIS/lablog_chewbbaca b/bu_isciii/templates/chewbbaca/ANALYSIS/lablog_chewbbaca new file mode 100644 index 00000000..bd209b18 --- /dev/null +++ b/bu_isciii/templates/chewbbaca/ANALYSIS/lablog_chewbbaca @@ -0,0 +1,6 @@ +mkdir -p 00-reads + +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - + +mv ANALYSIS01_CHEWBBACA "$(date '+%Y%m%d')_ANALYSIS03_CHEWBBACA" \ No newline at end of file diff --git a/bu_isciii/templates/chewbbaca/DOC/README b/bu_isciii/templates/chewbbaca/DOC/README new file mode 100644 index 00000000..353f2e2c --- /dev/null +++ b/bu_isciii/templates/chewbbaca/DOC/README @@ -0,0 +1 @@ +##Folder to hold DOC folder diff --git a/bu_isciii/templates/chewbbaca/RAW/README b/bu_isciii/templates/chewbbaca/RAW/README new file mode 100644 index 00000000..a774e7bb --- /dev/null +++ b/bu_isciii/templates/chewbbaca/RAW/README @@ -0,0 +1 @@ +##Folder to hold raw reads to analyze in the service diff --git a/bu_isciii/templates/chewbbaca/REFERENCES/README b/bu_isciii/templates/chewbbaca/REFERENCES/README new file mode 100644 index 00000000..eaa651aa --- /dev/null +++ b/bu_isciii/templates/chewbbaca/REFERENCES/README @@ -0,0 +1 @@ +##Folder to hold REFERENCE folder diff --git a/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results new file mode 100644 index 00000000..650d3d22 --- /dev/null +++ b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results @@ -0,0 +1,13 @@ 
+DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" + +mkdir -p $DELIVERY_FOLDER +mkdir $DELIVERY_FOLDER/mlst + +# Assembly service +cd $DELIVERY_FOLDER/mlst + +# Links to reports +ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/allelecall_report.html . +ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/distance_matrix_symmetric.tsv . +ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.nwk +ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.svg diff --git a/bu_isciii/templates/chewbbaca/TMP/README b/bu_isciii/templates/chewbbaca/TMP/README new file mode 100644 index 00000000..ba322945 --- /dev/null +++ b/bu_isciii/templates/chewbbaca/TMP/README @@ -0,0 +1 @@ +##Folder to hold temporary files diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index a2007838..46795322 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -2,7 +2,7 @@ "assembly_annotation": { "label": "", "template": "assembly", - "url": "", + "url": "https://github.com/Daniel-VM/bacass/tree/buisciii-develop", "order": 1, "begin": "", "end": "", @@ -196,7 +196,7 @@ "url": "https://github.com/GuilleGorines/Seek-Destroy", "description": "Simple pipeline for basic quality control, host removal and exploratory analysis of samples.", "clean": { - "folders":[""], + "folders":[], "files":[] }, "no_copy": ["RAW", "TMP", "00-reads"], @@ -212,7 +212,7 @@ "end": "", "description": "", "clean": { - "folders":[""], + "folders":[], "files":[] }, "no_copy": ["RAW", "TMP", "00-reads"], @@ -229,7 +229,7 @@ "url": "https://github.com/nf-core/mag", "description": "Bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes.", "clean": { - "folders":[""], + "folders":[], "files":[] }, "no_copy": ["RAW", "TMP"], @@ -321,5 +321,22 @@ "last_folder":"REFERENCES", "delivery_md": "", "results_md": "" + }, + "blast_nt": { + "label": "", + "template": "blast_nt", + "url": "", + "order": 
1, + "begin": "", + "end": "", + "description": "", + "clean": { + "folders":[], + "files":[] + }, + "no_copy": ["RAW", "TMP"], + "last_folder":"REFERENCES", + "delivery_md": "", + "results_md": "" } } diff --git a/bu_isciii/templates/sftp_user.json b/bu_isciii/templates/sftp_user.json old mode 100644 new mode 100755 index 05736dba..bdf82822 --- a/bu_isciii/templates/sftp_user.json +++ b/bu_isciii/templates/sftp_user.json @@ -48,5 +48,6 @@ "sresino": ["Labvirushep"], "svaldezate": ["Labtaxonomia"], "svazquez": ["Labvirusres"], - "ycampos": ["LabUfiecMithocondrial"] + "ycampos": ["LabUfiecMithocondrial"], + "anadonoso": ["Labenterovirus"] } diff --git a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars.config b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars.config old mode 100644 new mode 100755 index a4936716..1e36576e --- a/bu_isciii/templates/viralrecon/DOC/viralrecon_sars.config +++ b/bu_isciii/templates/viralrecon/DOC/viralrecon_sars.config @@ -1,7 +1,7 @@ singularity { enabled = true autoMounts = true - runOptions = '-B /data/bi/references/' + runOptions = '-B /data/bi/references/ -B /data/bi/pipelines/artic-ncov2019/ -B "$HOME"' } process { diff --git a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py index b33c761c..8467c414 100755 --- a/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py +++ b/bu_isciii/templates/viralrecon/RESULTS/excel_generator.py @@ -4,7 +4,7 @@ from typing import List, Dict # conda activate viralrecon_report -"""Standard usage: python excel_generator.py -r ./reference.tmp""" +"""Standard usage: python excel_generator.py -r ./reference.tmp --merge_lineage_files""" """Single csv to excel usage: python excel_generator.py -s csv_file.csv""" parser = argparse.ArgumentParser( description="Generate excel files from viralrecon results" @@ -22,6 +22,12 @@ default="", help="Transform a single csv file to excel format. 
Omit rest of processes", ) +parser.add_argument( + "-l", + "--merge_lineage_files", + action="store_true", + help="Merge pangolin and nextclade lineage tables", +) args = parser.parse_args() @@ -135,7 +141,7 @@ def main(args): ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references } - if len(references) > 1: + if args.merge_lineage_files: # Merge pangolin and nextclade csv files separatedly and create excel files for them merge_lineage_tables(reference_folders, samples_ref_files) for reference, folder in reference_folders.items(): diff --git a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results old mode 100644 new mode 100755 index 1b9f2275..e71f4294 --- a/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results +++ b/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results @@ -26,7 +26,7 @@ ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv #conda activate viralrecon_report -echo "python ./excel_generator.py -r ./references.tmp" > _01_generate_excel_files.sh +echo "python ./excel_generator.py -r ./references.tmp --merge_lineage_files" > _01_generate_excel_files.sh #Cleaning temp files and broken symbolic links echo "find . -xtype l -delete" > _02_clean_folders.sh echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh @@ -46,3 +46,9 @@ cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do a cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd - cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd - ln -s ../../ANALYSIS/*/all_samples_filtered_BLAST_results.xlsx . 
+ +#Auxiliary script useful for multiple hosts if samples have pattern to distinguish: e.g. HOSTPATTERN1 = 'MONKEY' + +#cat ../viralrecon_results | sed 's@../../@../../../@g' | sed 's/entrega01/HOSTPATTERN1_results/g' > results_wgs +#sed -i 's@ANALYSIS/\*/@ANALYSIS/*HOSTPATTERN1*/@g' results_wgs.sh +#sed 's/HOSTPATTERN1/HOSTPATTERN2/g' results_wgs > results_gf.sh diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index ba13fa2f..dcda6436 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = "2.0.0" +version = "2.1.0" with open("README.md") as f: readme = f.read()