diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml index 9d043bbd..b373db7a 100644 --- a/.github/workflows/python_lint.yml +++ b/.github/workflows/python_lint.yml @@ -2,11 +2,10 @@ name: python_lint on: push: - paths: - - '**.py' + branches: "**" pull_request: - paths: - - '**.py' + types: [opened, reopened, synchronize, closed] + branches: "**" jobs: flake8_py3: @@ -18,18 +17,43 @@ jobs: python-version: 3.9.x architecture: x64 - name: Checkout PyTorch - uses: actions/checkout@master + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Install flake8 run: pip install flake8 + - name: Check for Python file changes + id: file_check + uses: tj-actions/changed-files@v44 + with: + sha: ${{ github.event.pull_request.head.sha }} + files: | + **.py - name: Run flake8 + if: steps.file_check.outputs.any_changed == 'true' run: flake8 --ignore E501,W503,E203,W605 + - name: No Python files changed + if: steps.file_check.outputs.any_changed != 'true' + run: echo "No Python files have been changed." black_lint: runs-on: ubuntu-latest steps: - name: Setup uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Install black in jupyter run: pip install black[jupyter] + - name: Check for Python file changes + id: file_check + uses: tj-actions/changed-files@v44 + with: + sha: ${{ github.event.pull_request.head.sha }} + files: '**.py' - name: Check code lints with Black + if: steps.file_check.outputs.any_changed == 'true' uses: psf/black@stable + - name: No Python files changed + if: steps.file_check.outputs.any_changed != 'true' + run: echo "No Python files have been changed." diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fd6cb6e..4ee3575e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [2.2.Xdev] - 2024-0X-XX : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.2.X
+## [2.X.Xdev] - 2024-0X-XX : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.X.X

 ### Credits

@@ -24,7 +24,7 @@ Code contributions to the new version:

 ### Requirements

-## [2.X.1hot] - 2024-0X-0X : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.X.1
+## [2.X.Xhot] - 2024-0X-0X : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.X.X

 ### Credits

@@ -44,6 +44,99 @@ Code contributions to the hotfix:

 ### Requirements

+## [2.2.0] - 2024-09-12 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.2.0
+
+### Credits
+
+Code contributions to the new version:
+- [Pablo Mata](https://github.com/Shettland)
+- [Jaime Ozáez](https://github.com/jaimeozaez)
+- [Sara Monzón](https://github.com/saramonzon)
+- [Sarai Varona](https://github.com/svarona)
+- [Daniel Valle](https://github.com/Daniel-VM)
+- [Víctor López](https://github.com/victor5lm)
+- [Juan Ledesma](https://github.com/juanledesma78)
+
+### Template fixes and updates
+
+- Updated documentation and results markdown for viralrecon, pikavirus and MAG [#247](https://github.com/BU-ISCIII/buisciii-tools/pull/247)
+- Added documentation and results markdown for RNAseq [#248](https://github.com/BU-ISCIII/buisciii-tools/pull/248)
+- Added documentation, both output and results, for plasmidID [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258)
+- Added markdown of assembly analysis procedure [#244](https://github.com/BU-ISCIII/buisciii-tools/pull/244)
+- Added output and results markdowns for ExomeEB, ExomeTrio and WGStrio [#249](https://github.com/BU-ISCIII/buisciii-tools/pull/249)
+- Added markdown of assembly results folder [#250](https://github.com/BU-ISCIII/buisciii-tools/pull/250)
+- Updated lablog results filenames where necessary (IRMA, seekndestroy, viralrecon and genomeev) [#253](https://github.com/BU-ISCIII/buisciii-tools/pull/253)
+- Added output and results markdowns for cgMLST/wgMLST [#255](https://github.com/BU-ISCIII/buisciii-tools/pull/255)
+- Added markdown for IRMA [#256](https://github.com/BU-ISCIII/buisciii-tools/pull/256)
+- Included RESULTS/lablog for exomeeb, exometrio and wgstrio templates and updated files to clean [#260](https://github.com/BU-ISCIII/buisciii-tools/pull/260)
+- Changed scratch copy queue to middle_obx
+- Included missing folders in wgstrio template
+- Changed exomiser-html-description to png format and fixed location of irma-sarek markdowns [#261](https://github.com/BU-ISCIII/buisciii-tools/pull/261)
+- Updated configuration.json so that either idx or obx is used in case one of these queues is full [#263](https://github.com/BU-ISCIII/buisciii-tools/pull/263)
+- Updated lablog_viralrecon script to automate the setup of viralrecon services. [#264](https://github.com/BU-ISCIII/buisciii-tools/pull/264)
+- Included MultiQC v1.19 in viralrecon.config to fix an error with string and numeric sample names [#267](https://github.com/BU-ISCIII/buisciii-tools/pull/267)
+- Updated MTBSeq template to fit the bacass pipeline. [#268](https://github.com/BU-ISCIII/buisciii-tools/pull/268)
+- Modified IRMA template to avoid average overload.
+- Added "01" to results folder creation in assembly template.
+- Limited some prompt answers to 1 character in lablog_viralrecon.
+- Created lablog_mtbseq_results. [#270](https://github.com/BU-ISCIII/buisciii-tools/pull/270)
+- PR #271.
Closes [#235](https://github.com/BU-ISCIII/buisciii-tools/issues/235), [#228](https://github.com/BU-ISCIII/buisciii-tools/issues/228) and [#196](https://github.com/BU-ISCIII/buisciii-tools/issues/196)
+- Included annotated tab description in exome-trios markdowns [#273](https://github.com/BU-ISCIII/buisciii-tools/pull/273)
+- Installed all necessary singularity images and modified all templates so that, instead of using conda environments or loaded modules, the corresponding singularity images are used [#272](https://github.com/BU-ISCIII/buisciii-tools/pull/272)
+- Updated sarek version in exomeeb, exometrio and wgstrio templates [#277](https://github.com/BU-ISCIII/buisciii-tools/pull/277)
+- Changed the file extension of all_samples_virus_table_filtered (from csv to tsv) in lablog_viralrecon_results [#278](https://github.com/BU-ISCIII/buisciii-tools/pull/278)
+- Fixed singularity-images path when updating pangolin database in lablog_viralrecon. Added line break after prompted input. [#282](https://github.com/BU-ISCIII/buisciii-tools/pull/282)
+- Updated characterization and snippy templates to fit the bacass pipeline. Corrected path in 05-iqtree in snippy template. [#283](https://github.com/BU-ISCIII/buisciii-tools/pull/283)
+- Included multiqc_report.html in RESULTS folder in every service, where necessary [#265](https://github.com/BU-ISCIII/buisciii-tools/pull/265)
+- Added MAG template and removed MAG from other templates [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288)
+- Added amrfinderplus to characterization template. [#289](https://github.com/BU-ISCIII/buisciii-tools/pull/289)
+- Updated all files so that paths referring to /pipelines/ are updated according to the new structure [#287](https://github.com/BU-ISCIII/buisciii-tools/pull/287)
+- Updated assembly, ariba, snippy, amrfinderplus and iqtree templates, removed genomeev and mtbseq_assembly templates and updated services.json [#295](https://github.com/BU-ISCIII/buisciii-tools/pull/295)
+- Changed viralrecon's lablog so that references are available within refgenie [#296](https://github.com/BU-ISCIII/buisciii-tools/pull/296)
+- Updated services.json, mtbseq's lablog, viralrecon's lablog and assembly's config file [#299](https://github.com/BU-ISCIII/buisciii-tools/pull/299)
+- Added lablog to automate gene characterization with emmtyper, including unzipping assemblies. [#300](https://github.com/BU-ISCIII/buisciii-tools/pull/300)
+- Fixed 99-stats (MAG) template. [#301](https://github.com/BU-ISCIII/buisciii-tools/pull/301)
+- Created a Python script to process IRMA's results and create a standard vcf file against the reference. [#304](https://github.com/BU-ISCIII/buisciii-tools/pull/304)
+- Fixed IRMA's lablog so that sample sequences are not displayed several times, either in the .txt files of each influenza type or in all_samples_completo.txt [#305](https://github.com/BU-ISCIII/buisciii-tools/pull/305)
+- Modified bioinfo_doc.py so that new lines in the delivery message are applied in the email [#307](https://github.com/BU-ISCIII/buisciii-tools/pull/307)
+- Added several improvements to lablog_viralrecon (created log files, modified check_references function behaviour, enabled config file regeneration) [#306](https://github.com/BU-ISCIII/buisciii-tools/pull/306)
+- Fixed a bug when lablog_viralrecon tries to download references that don't belong to any family. [#310](https://github.com/BU-ISCIII/buisciii-tools/pull/310)
+- Added mvmoneo to SFTP users.
[#317](https://github.com/BU-ISCIII/buisciii-tools/pull/317)
+- Added scripts for time series RNAseq and updated differential expression code for differentially expressed transcripts [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316).
+- Added bbaladron to SFTP users [#316](https://github.com/BU-ISCIII/buisciii-tools/pull/316).
+- Added new template for comprehensive taxonomy profiling using the nf-core/taxprofiler pipeline [#320](https://github.com/BU-ISCIII/buisciii-tools/pull/320).
+- Added full execution support for the MAG template [#321](https://github.com/BU-ISCIII/buisciii-tools/pull/321).
+- Added labels to services.json and updated bioinfo_doc.py and jinja_template_delivery.j2 so that software version data is displayed in the delivery pdf [#330](https://github.com/BU-ISCIII/buisciii-tools/pull/330).
+- Updated several templates (singularity images, outdated paths, improvements, etc.) [#331](https://github.com/BU-ISCIII/buisciii-tools/pull/331)
+- Added permissions fixing after running scratch_copy, as well as a new fix-permissions module in the tools [#332](https://github.com/BU-ISCIII/buisciii-tools/pull/332).
+- Updated MAG lablogs and utils.py [#334](https://github.com/BU-ISCIII/buisciii-tools/pull/334).
+- Updated some files (setup.py, __main__.py, README, etc.) for the 2.2.0 release [#335](https://github.com/BU-ISCIII/buisciii-tools/pull/335).
+
+### Modules
+
+#### Added enhancements
+
+- PR [#274](https://github.com/BU-ISCIII/buisciii-tools/pull/274): added `--dev` option, dev configuration and test folder structure.
+- PR [#276](https://github.com/BU-ISCIII/buisciii-tools/pull/276): wkhtmltopdf no longer needs an absolute path to the executable. Added better error handling when the executable does not exist.
+- PR [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288): allowed handling more than one service at a time, related to issue [#217](https://github.com/BU-ISCIII/buisciii-tools/issues/217)
+
+#### Fixes
+
+- Fixed archive module. Updated correct header for scout tsv [#258](https://github.com/BU-ISCIII/buisciii-tools/pull/258).
+- Fixed clean module. Corrected purge_files function. Renaming stage moved from clean to rename_nocopy option. Updated services.json file with correct paths for some services. [#280](https://github.com/BU-ISCIII/buisciii-tools/pull/280)
+- Fixed autoclean-sftp function. [#281](https://github.com/BU-ISCIII/buisciii-tools/pull/281)
+- Fixed bioinfo_doc.py. Modified it so that this module creates a .pdf file including new-line characters, without merging lines into one single line [#259](https://github.com/BU-ISCIII/buisciii-tools/pull/259).
+- PR [#288](https://github.com/BU-ISCIII/buisciii-tools/pull/288): fixed updating the service's state to in_progress multiple times, related to issue [#285](https://github.com/BU-ISCIII/buisciii-tools/issues/285)
+- Reviewed and updated services.json for files and folders cleaning [#318](https://github.com/BU-ISCIII/buisciii-tools/pull/318).
+
+#### Changed
+
+- Forcing python lint to succeed when no .py files are in the PR [#279](https://github.com/BU-ISCIII/buisciii-tools/pull/279)
+
+#### Removed
+
+### Requirements
+
 ## [2.1.0] - 2024-04-19 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.1.0

 ### Credits
diff --git a/README.md b/README.md
index 9d597a7c..da5522a9 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,8 @@ BU-ISCIII provides a series of services in its portfolio for supporting bioinformatics analysis

 - [buisciii-tools](#buisciii-tools)
   - [Installation](#installation)
-    - [Bioconda](#bioconda)
-    - [Pip](#pip)
-    - [Development version](#development-version)
+    - [Micromamba and pip](#micromamba-and-pip)
+    - [Dev version](#dev-version)
   - [Usage](#usage)
     - [Command-line](#command-line)
       - [list](#list)
@@ -22,34 +21,47 @@ BU-ISCIII provides a series of services in its portfolio for supporting bioinformatics analysis
       - [bioinfo\_doc](#bioinfo_doc)
       - [archive](#archive)
       - [autoclean\_sftp](#autoclean_sftp)
+      - [fix-permissions](#fix-permissions)
   - [Acknowledgements](#acknowledgements)

 ## Installation

-### Bioconda
+### Micromamba and pip

 ```bash
-conda create -n buisciii-tools pip
-conda activate
-conda env update --file environment.yml
+micromamba create -n buisciii -f environment.yml
+micromamba activate buisciii
+pip install --force-reinstall --upgrade git+https://github.com/bu-isciii/buisciii-tools.git@main
 ```

-### Pip
+or

 ```bash
+git checkout main
+conda env create -n buisciii -f environment.yml
 conda activate buisciii
 pip install .
 ```

-### Development version
+### Dev version

 If you want to install the latest code in the repository:

 ```bash
-conda create -n buisciii_dev pip
+micromamba create -n buisciii_dev -f environment.yml
+micromamba activate buisciii_dev
 pip install --force-reinstall --upgrade git+https://github.com/bu-isciii/buisciii-tools.git@develop
 ```

+or locally:
+
+```bash
+git checkout develop
+micromamba create -n buisciii_dev -f environment.yml
+micromamba activate buisciii_dev
+pip install .
+```
+
 ## Usage

 ### Command-line

@@ -72,7 +84,7 @@ Options:
   -u, --api_user TEXT      User for the API logging
   -p, --api_password TEXT  Password for the API logging
   -c, --cred_file TEXT     Config file with API logging credentials
-  --help                   Show this message and exit.
+  --help                   Show this message and exit

 Commands:
   list            List available bu-isciii services.
@@ -83,6 +95,8 @@ Commands:
   finish          Service cleaning, remove big files, rename folders before...
   bioinfo-doc     Create the folder documentation structure in bioinfo_doc...
   archive         Archive services or retrieve services from archive
+  autoclean-sftp  Clean old sftp services
+  fix-permissions Fix permissions
 ```

 #### list

@@ -137,9 +151,10 @@ Output:
 │                        │ control, host removal and exploratory     │                                            │
 │                        │ analysis of samples.                      │                                            │
 │ ariba_characterization │                                           │                                            │
-│ mag_met                │ Bioinformatics best-practise analysis     │ https://github.com/nf-core/mag            │
-│                        │ pipeline for assembly, binning and        │                                            │
-│                        │ annotation of metagenomes.                │                                            │
+│ mag_met                │ 1- Bioinformatics best-practise analysis  │ https://github.com/nf-core/mag or         │
+│                        │ for taxonomic classification and          │ https://github.com/nf-core/taxprofiler    │
+│                        │ profiling; 2- Bioinformatics best-practise│                                            │
+│                        │ analysis pipeline for assembly, binning   │                                            │
 └────────────────────────┴───────────────────────────────────────────┴────────────────────────────────────────────┘
 ```

@@ -377,6 +392,26 @@ Options:
   --help  Show this message and exit.
``` +#### fix-permissions + +Example of usage: + +```bash +bu-isciii fix-permissions -d /data/bi +``` + +Help: + +```bash +Usage: bu-isciii fix-permissions [OPTIONS] + + Fix permissions + +Options: + -d, --input_directory PATH Input directory to fix permissions (absolute path) [required] + --help Show this message and exit. +``` + ## Acknowledgements Python package idea and design is really inspired in [nf-core/tools](https://github.com/nf-core/tools). diff --git a/bu_isciii/__main__.py b/bu_isciii/__main__.py index 50be620b..3933a027 100755 --- a/bu_isciii/__main__.py +++ b/bu_isciii/__main__.py @@ -2,6 +2,7 @@ # import sys import logging +import os import click import rich.console @@ -9,6 +10,7 @@ import rich.traceback import bu_isciii +import bu_isciii.config_json import bu_isciii.utils import bu_isciii.new_service import bu_isciii.scratch @@ -55,7 +57,7 @@ def run_bu_isciii(): ) # stderr.print("[green] `._,._,'\n", highlight=False) - __version__ = "2.0.0" + __version__ = "2.2.0" stderr.print( "[grey39] BU-ISCIII-tools version {}".format(__version__), highlight=False ) @@ -133,8 +135,9 @@ def decorator(f): required=False, default=None, ) +@click.option("-d", "--dev", help="Develop settings", is_flag=True, default=False) @click.pass_context -def bu_isciii_cli(ctx, verbose, log_file, api_user, api_password, cred_file): +def bu_isciii_cli(ctx, verbose, log_file, api_user, api_password, cred_file, dev): # Set the base logger to output DEBUG log.setLevel(logging.INFO) # Initialize context @@ -150,7 +153,18 @@ def bu_isciii_cli(ctx, verbose, log_file, api_user, api_password, cred_file): ) log.addHandler(log_fh) - ctx.obj = bu_isciii.utils.get_yaml_config() + if dev: + conf = bu_isciii.config_json.ConfigJson( + json_file=os.path.join( + os.path.dirname(__file__), "conf", "configuration_dev.json" + ) + ) + else: + conf = bu_isciii.config_json.ConfigJson() + + ctx.obj = bu_isciii.utils.get_yaml_config(conf, cred_file) + ctx.obj["conf"] = conf + if bu_isciii.utils.validate_api_credentials(ctx.obj): print("API credentials successfully extracted from yaml config file") else: @@ -212,6 +226,7 @@ def new_service(ctx, resolution, path, no_create_folder, ask_path): ask_path, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) new_ser.create_new_service() @@ -264,6 +279,7 @@ def scratch(ctx, resolution, path, tmp_dir, direction, ask_path): ask_path, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) scratch_copy.handle_scratch() @@ -291,7 +307,7 @@ def scratch(ctx, resolution, path, tmp_dir, direction, ask_path): type=click.Choice( [ "full_clean", - "rename_nocopy", + "rename", "clean", "revert_renaming", "show_removable", @@ -301,7 +317,7 @@ def scratch(ctx, resolution, path, tmp_dir, direction, ask_path): multiple=False, help=( "Select what to do inside the cleanning step: full_clean: delete files and folders to clean," - " rename no copy and deleted folders, rename_nocopy: just rename no copy folders, clean: " + " rename no copy and deleted folders, rename: just rename folders, clean: " "delete files and folders to clean," "revert_renaming: remove no_copy and delete tags," "show_removable: list folders and files to remove " @@ -315,7 +331,13 @@ def clean(ctx, resolution, path, ask_path, option): show removable files or show folders for no copy. 
""" clean = bu_isciii.clean.CleanUp( - resolution, path, ask_path, option, ctx.obj["api_user"], ctx.obj["api_password"] + resolution, + path, + ask_path, + option, + ctx.obj["api_user"], + ctx.obj["api_password"], + ctx.obj["conf"], ) clean.handle_clean() @@ -356,6 +378,7 @@ def copy_sftp(ctx, resolution, path, ask_path, sftp_folder): sftp_folder, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) new_del.copy_sftp() @@ -404,6 +427,7 @@ def finish(ctx, resolution, path, ask_path, sftp_folder, tmp_dir): "clean", ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) clean_scratch.handle_clean() print("Starting copy from scratch directory: " + tmp_dir + " to service directory.") @@ -415,6 +439,7 @@ def finish(ctx, resolution, path, ask_path, sftp_folder, tmp_dir): ask_path, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) copy_scratch2service.handle_scratch() print("Starting renaming of the service directory.") @@ -422,9 +447,10 @@ def finish(ctx, resolution, path, ask_path, sftp_folder, tmp_dir): resolution, path, ask_path, - "rename_nocopy", + "rename", ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) rename_databi.handle_clean() print("Starting copy of the service directory to the SFTP folder") @@ -435,6 +461,7 @@ def finish(ctx, resolution, path, ask_path, sftp_folder, tmp_dir): sftp_folder, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], ) copy_sftp.copy_sftp() print("Service correctly in SFTP folder") @@ -518,6 +545,7 @@ def bioinfo_doc( results_md, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], email_pass, ) new_doc.create_documentation() @@ -587,6 +615,7 @@ def archive( option, ctx.obj["api_user"], ctx.obj["api_password"], + ctx.obj["conf"], skip_prompts, date_from, date_until, @@ -611,11 +640,40 @@ def archive( default=14, help="Integer, remove files older than a window of `-d [int]` days. 
Default 14 days.", ) -def autoclean_sftp(sftp_folder, days): +@click.pass_context +def autoclean_sftp(ctx, sftp_folder, days): """Clean old sftp services""" - sftp_clean = bu_isciii.autoclean_sftp.AutoremoveSftpService(sftp_folder, days) + sftp_clean = bu_isciii.autoclean_sftp.AutoremoveSftpService( + sftp_folder, days, ctx.obj["conf"] + ) sftp_clean.handle_autoclean_sftp() +# FIX PERMISSIONS +@bu_isciii_cli.command(help_priority=9) +@click.option( + "-d", + "--input_directory", + type=click.Path(), + default=None, + required=True, + help="Input directory to fix permissions (absolute path)", +) +@click.pass_context +def fix_permissions(ctx, input_directory): + """ + Fix permissions + """ + if not os.path.isdir(input_directory): + exit("Invalid input directory") + conf = bu_isciii.config_json.ConfigJson() + permissions = conf.get_configuration("global").get("permissions") + bu_isciii.utils.remake_permissions(input_directory, permissions) + stderr = rich.console.Console( + stderr=True, force_terminal=bu_isciii.utils.rich_force_colors() + ) + stderr.print(f"[green]Correct permissions were applied to {input_directory}") + + if __name__ == "__main__": run_bu_isciii() diff --git a/bu_isciii/archive.py b/bu_isciii/archive.py index 09b65f24..d89f36e4 100644 --- a/bu_isciii/archive.py +++ b/bu_isciii/archive.py @@ -37,6 +37,7 @@ def __init__( option=None, api_user=None, api_password=None, + conf=None, skip_prompts=False, date_from=None, date_until=None, @@ -74,8 +75,8 @@ def __init__( # Get configuration params from configuration.json # Get data to connect to the API - self.conf = bu_isciii.config_json.ConfigJson().get_configuration("archive") - conf_api = bu_isciii.config_json.ConfigJson().get_configuration("api_settings") + self.conf = conf.get_configuration("archive") + conf_api = conf.get_configuration("api_settings") # Initiate API rest_api = bu_isciii.drylab_api.RestServiceApi( @@ -223,7 +224,7 @@ def __init__( try: for service in rest_api.get_request( request_info="services", - safe=False, + safe=True, state="delivered", date_from=str(self.date_from), date_until=str(self.date_until), @@ -258,7 +259,7 @@ def __init__( if isinstance( ( service_data := rest_api.get_request( - request_info="service-data", safe=False, service=service + request_info="service-data", safe=True, service=service ) ), int, @@ -273,30 +274,26 @@ def __init__( self.services[service]["found_in_system"] = True self.services[service]["archived_path"] = os.path.join( bu_isciii.utils.get_service_paths( - self.ser_type, service_data, "archived_path" + conf, self.ser_type, service_data, "archived_path" ), service_data["resolutions"][0]["resolution_full_number"], ) self.services[service]["non_archived_path"] = os.path.join( bu_isciii.utils.get_service_paths( - self.ser_type, service_data, "non_archived_path" + conf, self.ser_type, service_data, "non_archived_path" ), service_data["resolutions"][0]["resolution_full_number"], ) else: self.services[service]["found_in_system"] = True self.services[service]["archived_path"] = os.path.join( - bu_isciii.config_json.ConfigJson().get_configuration("global")[ - "archived_path" - ], + conf.get_configuration("global")["archived_path"], self.ser_type, service_id, ) self.services[service]["non_archived_path"] = os.path.join( - bu_isciii.config_json.ConfigJson().get_configuration("global")[ - "data_path" - ], + conf.get_configuration("global")["data_path"], self.ser_type, service_id, ) @@ -1037,7 +1034,7 @@ def generate_tsv_table(self, filename): if self.services[service]["found_in_system"] 
else "NOT found on iSkyLIMS" ) - csv_dict["Delivery date"] = "" + csv_dict["Delivery date"] = self.services[service]["delivery_date"] # Fields for archive csv_dict["Path in archive"] = ( @@ -1045,7 +1042,7 @@ def generate_tsv_table(self, filename): if self.services[service]["archived_path"] is not None else "Archived path could not be generated" ) - csv_dict["Found in archive"] = ( + csv_dict["Found on archive"] = ( "Yes" if "Archive" in self.services[service]["found"] else "Not found in archive" @@ -1077,12 +1074,12 @@ def generate_tsv_table(self, filename): if "Data dir" in self.services[service]["found"] else "Not found in data dir" ) - csv_dict["Compressed size in data directory"] = ( + csv_dict["Uncompressed size in data directory"] = ( self.services[service]["non_archived_size"] if self.services[service]["non_archived_size"] != 0 else "Not calculated" ) - csv_dict["Uncompressed size in data directory"] = ( + csv_dict["Compressed size in data directory"] = ( self.services[service]["non_archived_compressed_size"] if self.services[service]["non_archived_compressed_size"] != 0 else "Not calculated" diff --git a/bu_isciii/assets/reports/md/assembly.md b/bu_isciii/assets/reports/md/assembly.md new file mode 100644 index 00000000..d8d586cd --- /dev/null +++ b/bu_isciii/assets/reports/md/assembly.md @@ -0,0 +1,279 @@ +# nf-core/bacass: Output + +## Introduction + +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. + +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. + +## Pipeline overview + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +- [Quality trimming and QC](#quality-trimming-and-qc) + - [Short Read Trimming](#short-read-trimming) + - [Short Read RAW QC](#short-read-raw-qc) + - [Long Read Trimming](#long-read-trimming) + - [Long Read RAW QC](#long-read-raw-qc) +- [Taxonomic classification](#taxonomic-classification) +- [Assembly Output](#assembly-output) + - [Polished assemblies](#polished-assemblies) +- [Assembly QC with QUAST](#assembly-qc-with-quast) +- [Annotation](#annotation) +- [Report](#report) +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + +## Quality trimming and QC + +### Short Read Trimming + +This step quality trims the end of reads, removes degenerate or too short reads and if needed, +combines reads coming from multiple sequencing runs. + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `trimming/shortreads/`
+  - `*.fastp.fastq.gz`: The trimmed/modified/unmerged fastq reads
+
+</details>
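+
+Short-read trimming is done with [fastp](https://github.com/OpenGene/fastp) (the `*.fastp.fastq.gz` names above come from it). As a minimal sketch of an equivalent standalone call (sample file names here are hypothetical; the pipeline sets its own parameters):
+
+```bash
+# Trim a paired-end sample; output names follow the *.fastp.fastq.gz convention above
+fastp \
+  --in1 sample_R1.fastq.gz --in2 sample_R2.fastq.gz \
+  --out1 sample_R1.fastp.fastq.gz --out2 sample_R2.fastp.fastq.gz \
+  --json sample.fastp.json --html sample.fastp.html
+```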
+
+### Short Read RAW QC
+
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
+![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+
+![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_plot.png)
+
+![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+
+:::note
+The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+:::
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `FastQC/`
+  - `*.html`: FastQC report containing quality metrics.
+  - `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+
+![FastQC report](images/fastqc.png)
+
+</details>
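+
+If you want to regenerate these reports outside the pipeline, a plain FastQC run is enough (output directory and file names here are just an assumption):
+
+```bash
+# Write one HTML/zip report per input file into FastQC/
+mkdir -p FastQC
+fastqc --outdir FastQC sample_R1.fastp.fastq.gz sample_R2.fastp.fastq.gz
+```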
+ +### Long Read Trimming + +This step performs long read trimming on Nanopore input (if provided). + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `trimming/longreads/`
+  - `*.fastq.gz`: The trimmed FASTQ file
+  - `*.log*`: Log file
+
+</details>
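+
+The trimming tool is not named in this section; assuming a Porechop-style adapter trimmer (a common choice for Nanopore reads), an equivalent standalone call might look like this (file names hypothetical):
+
+```bash
+# Remove adapters from Nanopore reads, keeping a log as in trimming/longreads/
+porechop -i sample_nanopore.fastq.gz -o sample_nanopore.trimmed.fastq.gz > sample_nanopore.porechop.log
+```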
+ +### Long Read RAW QC + +These steps perform long read QC for input data (if provided). + +Please refer to the documentation of [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://a-slide.github.io/pycoQC/) if you want to know more about the plots created by these tools. + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `QC_Longreads/NanoPlot`: Various plots in HTML and PNG format
+
+- `QC_Longreads/PycoQC`
+  - `*_pycoqc.html`: QC report in HTML format
+  - `*_pycoqc.json`: QC report in JSON format
+
+Example plot from NanoPlot:
+
+![Nanoplot](images/nanoplot.png)
+
+</details>
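+
+A minimal standalone NanoPlot run that produces the same kind of plots (file names hypothetical):
+
+```bash
+# Generate long-read QC plots into the directory used by the pipeline
+NanoPlot --fastq sample_nanopore.trimmed.fastq.gz --outdir QC_Longreads/NanoPlot
+```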
+
+## Taxonomic classification
+
+This QC step classifies your reads using [Kraken2](https://ccb.jhu.edu/software/kraken2/), a k-mer based approach. This helps to identify samples that have purity issues. Ideally you will not want to assemble reads from samples that are contaminated or contain multiple species. If you would like to visualize the report, try [Pavian](https://github.com/fbreitwieser/pavian) or [Krakey](http://krakey.info/).
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `Kraken2/`
+  - `*.kraken2.report.txt`: Classification of short reads in the Kraken(1) report format.
+  - `*_longreads.kraken2.report.txt`: Classification of long reads in the Kraken(1) report format.
+
+See the [Kraken manual](http://ccb.jhu.edu/software/kraken/MANUAL.html#sample-reports) for more details.
+
+Example Kraken2 report screenshot:
+
+![Kraken2 report](images/kraken2.png)
+
+</details>
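+
+For a quick look from the command line, the first column of the Kraken report format is the percentage of reads assigned to each clade, so sorting on it surfaces the dominant taxa (file name hypothetical):
+
+```bash
+# Show the 15 most abundant clades in a sample's report
+sort -t$'\t' -k1,1nr sample.kraken2.report.txt | head -n 15
+```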
+ +## Reads QC and Sample purity + +The pipeline includes a dedicated step for short and long reads QC as well as contamination analysis using [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/). This process helps assess the quality and purity of the samples. + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `Kmerfinder/{ID}/`
+  - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information.
+
+- `Kmerfinder/`
+  - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder.
+
+</details>
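+
+A convenient way to skim the per-sample summary in a terminal, using only standard coreutils:
+
+```bash
+# Align the CSV columns for readability
+column -t -s, Kmerfinder/kmerfinder_summary.csv | less -S
+```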
+ +## Assembly Output + +Trimmed reads are assembled with [Unicycler](https://github.com/rrwick/Unicycler) in `short` or `hybrid` assembly modes. For long-read assembly, there are also `canu` and `miniasm` available. +Unicycler is a pipeline on its own, which at least for Illumina reads mainly acts as a frontend to Spades with added polishing steps. + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `Unicycler/`
+  - `*.scaffolds.fa`: Final assembly in fasta format
+  - `*.assembly.gfa`: Final assembly in Graphical Fragment Assembly (GFA) format
+  - `*.unicycler.log`: Log file summarizing steps and intermediate results of the Unicycler execution
+
+Check out the [Unicycler documentation](https://github.com/rrwick/Unicycler) for more information on Unicycler output.
+
+- `Canu/`
+  - `*.contigs.fasta.gz`: Final assembly in fasta format
+  - `*.report`: Log file summarizing steps and intermediate results
+
+Check out the [Canu documentation](https://canu.readthedocs.io/en/latest/index.html) for more information on Canu output.
+
+- `Miniasm/`
+  - `*.fasta.gz`: Assembly in fasta format
+  - `*_assembly_consensus.fasta.gz`: Consensus assembly in fasta format (polished by Racon)
+
+Check out the [Miniasm documentation](https://github.com/lh3/miniasm) for more information on Miniasm output.
+
+- `Dragonflye/`
+  - `*.contigs.fa`: Assembly in fasta format
+  - `*.dragonflye.log`: Log file containing the report of the Dragonflye process
+
+Check out the [Dragonflye documentation](https://github.com/rpetit3/dragonflye) for more information on Dragonflye output.
+
+</details>
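+
+As an illustration of the hybrid mode described above, a standalone Unicycler call combining both read types might look like this (file names hypothetical; the pipeline supplies its own parameters):
+
+```bash
+# Hybrid assembly: short reads drive base accuracy, long reads resolve repeats
+unicycler \
+  -1 sample_R1.fastp.fastq.gz -2 sample_R2.fastp.fastq.gz \
+  -l sample_nanopore.trimmed.fastq.gz \
+  -o Unicycler/sample
+```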
+
+### Polished assemblies
+
+Long-read assemblies can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `Medaka/`
+  - `*_polished_genome.fa`: Polished consensus assembly in fasta format
+  - `calls_to_draft.bam`: Alignment in bam format
+  - `calls_to_draft.bam.bai`: Index of alignment
+  - `consensus.fasta.gaps_in_draft_coords.bed`
+  - `consensus_probs.hdf`
+
+- `Nanopolish/`
+  - `polished_genome.fa`: Polished consensus assembly in fasta format
+
+</details>
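+
+For reference, a minimal Medaka polishing run over a draft assembly (file names hypothetical; the basecaller model should match your data, which the pipeline handles for you):
+
+```bash
+# Polish a draft assembly with the long reads that produced it
+medaka_consensus -i sample_nanopore.trimmed.fastq.gz -d draft_assembly.fasta -o Medaka/sample
+```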
+
+## Assembly QC with QUAST
+
+The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) for all assemblies in one report. It reports multiple metrics, including number of contigs, N50 and assembly length, in the form of an HTML report. It further creates an HTML file with an integrated contig viewer (Icarus).
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `QUAST/report/`
+  - `icarus.html`: QUAST's contig browser as HTML
+  - `report.html`: QUAST assembly QC as HTML report
+  - `report.pdf`: QUAST assembly QC as pdf
+
+- `QUAST/runs_per_reference/{reference_assembly}/`
+  - `icarus.html`: QUAST's contig browser as HTML
+  - `report.html`: QUAST assembly QC as HTML report
+  - `report.pdf`: QUAST assembly QC as pdf
+
+![QUAST QC](images/quast.png)
+
+![Icarus](images/icarus.png)
+
+</details>
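+
+QUAST can also be re-run by hand on any of the assemblies, e.g. (file name hypothetical):
+
+```bash
+# Compare one or more assemblies in a single report
+quast.py -o QUAST/report Unicycler/sample.scaffolds.fa
+```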
+
+## Annotation
+
+By default, the assembly is annotated with [Prokka](https://github.com/tseemann/prokka), which acts as a frontend for several annotation tools and includes rRNA and ORF predictions. Alternatively, on request, the assembly is annotated with [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core).
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `Prokka/{ID}/`
+  - `*.gff`: Annotation in gff format
+  - `*.txt`: Annotation in text format
+  - `*.faa`: Protein sequences in fasta format
+
+See [Prokka's documentation](https://github.com/tseemann/prokka#output-files) for a full description of all output files.
+
+![Prokka annotation](images/prokka.png)
+
+- `Bakta/{ID}/`
+  - `*.gff3`: Annotations in gff3 format
+  - `*.txt`: Summary in txt format
+  - `*.faa`: CDS/sORF amino acid sequences in fasta format
+
+See [Bakta's documentation](https://github.com/oschwengers/bakta#output) for a full description of all output files.
+
+- `DFAST/{ID}_results/`
+  - `genome.gff`: Annotation in gff format
+  - `statistics.txt`: Annotation statistics in text format
+  - `protein.faa`: Protein sequences in fasta format
+
+</details>
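+
+A bare-bones Prokka call equivalent to this step (sample name hypothetical; the pipeline adds organism-specific options):
+
+```bash
+# Annotate an assembly; writes the .gff/.txt/.faa files listed above
+prokka --outdir Prokka/sample --prefix sample sample.scaffolds.fa
+```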
+
+## Report
+
+[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report, and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools, e.g. FastQC. The pipeline also has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `multiqc/`
+  - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+  - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+  - `multiqc_plots/`: directory containing static images from the report in various formats.
+  - `summary_assembly_metrics_mqc.csv`: custom table containing the most relevant assembly QC metrics.
+
+</details>
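+
+If you need to rebuild the report after moving results around, MultiQC only has to be pointed at the directory containing the collected logs:
+
+```bash
+# Scan the results directory and regenerate multiqc_report.html
+multiqc --force .
+```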
+ +### Pipeline information + +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +
+<details markdown="1">
+<summary>Output files</summary>
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
+
+</details>
\ No newline at end of file diff --git a/bu_isciii/assets/reports/md/images/ALM-ASM.png b/bu_isciii/assets/reports/md/images/ALM-ASM.png new file mode 100644 index 00000000..ba29587c Binary files /dev/null and b/bu_isciii/assets/reports/md/images/ALM-ASM.png differ diff --git a/bu_isciii/assets/reports/md/images/IRMA_workflow.png b/bu_isciii/assets/reports/md/images/IRMA_workflow.png new file mode 100644 index 00000000..b68fd5d8 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/IRMA_workflow.png differ diff --git a/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png b/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png new file mode 100644 index 00000000..e49bc9fc Binary files /dev/null and b/bu_isciii/assets/reports/md/images/KPN30_000240185_summary.png differ diff --git a/bu_isciii/assets/reports/md/images/NIPH-NIPHEM.png b/bu_isciii/assets/reports/md/images/NIPH-NIPHEM.png new file mode 100644 index 00000000..a22457d4 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/NIPH-NIPHEM.png differ diff --git a/bu_isciii/assets/reports/md/images/PLOT.png b/bu_isciii/assets/reports/md/images/PLOT.png new file mode 100644 index 00000000..33178cb9 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/PLOT.png differ diff --git a/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png b/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png new file mode 100644 index 00000000..86a24bd2 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/SEN30_000195995_NC_013365.1.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_boxplot.png b/bu_isciii/assets/reports/md/images/deseq2_boxplot.png new file mode 100644 index 00000000..ca705948 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_boxplot.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_cluster_dendogram.png b/bu_isciii/assets/reports/md/images/deseq2_cluster_dendogram.png new file mode 100644 index 00000000..2916aee1 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_cluster_dendogram.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_dispersion-estimate.png b/bu_isciii/assets/reports/md/images/deseq2_dispersion-estimate.png new file mode 100644 index 00000000..8927bcf9 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_dispersion-estimate.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_heatmap-top-20-genes.png b/bu_isciii/assets/reports/md/images/deseq2_heatmap-top-20-genes.png new file mode 100644 index 00000000..8aa61ec8 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_heatmap-top-20-genes.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_heatmap_all.png b/bu_isciii/assets/reports/md/images/deseq2_heatmap_all.png new file mode 100644 index 00000000..476d6c0b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_heatmap_all.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_maplot.png b/bu_isciii/assets/reports/md/images/deseq2_maplot.png new file mode 100644 index 00000000..c1ebdea0 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_maplot.png differ diff --git a/bu_isciii/assets/reports/md/images/deseq2_pca.png b/bu_isciii/assets/reports/md/images/deseq2_pca.png new file mode 100644 index 00000000..3e31268b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_pca.png differ diff --git 
a/bu_isciii/assets/reports/md/images/deseq2_plotSD.png b/bu_isciii/assets/reports/md/images/deseq2_plotSD.png new file mode 100644 index 00000000..67733bb3 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_plotSD.png differ
diff --git a/bu_isciii/assets/reports/md/images/deseq2_pvalue-hist.png b/bu_isciii/assets/reports/md/images/deseq2_pvalue-hist.png new file mode 100644 index 00000000..830b5e54 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_pvalue-hist.png differ
diff --git a/bu_isciii/assets/reports/md/images/deseq2_qc_plots.png b/bu_isciii/assets/reports/md/images/deseq2_qc_plots.png new file mode 100755 index 00000000..bd19f1fd Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_qc_plots.png differ
diff --git a/bu_isciii/assets/reports/md/images/deseq2_sample-to-sample.png b/bu_isciii/assets/reports/md/images/deseq2_sample-to-sample.png new file mode 100644 index 00000000..98e05c8c Binary files /dev/null and b/bu_isciii/assets/reports/md/images/deseq2_sample-to-sample.png differ
diff --git a/bu_isciii/assets/reports/md/images/dupradar_example_plot.png b/bu_isciii/assets/reports/md/images/dupradar_example_plot.png new file mode 100644 index 00000000..72db5d45 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/dupradar_example_plot.png differ
diff --git a/bu_isciii/assets/reports/md/images/exomiser-html-description-1.png b/bu_isciii/assets/reports/md/images/exomiser-html-description-1.png new file mode 100755 index 00000000..6e60a97b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/exomiser-html-description-1.png differ
diff --git a/bu_isciii/assets/reports/md/images/exomiser-html-description-2.png b/bu_isciii/assets/reports/md/images/exomiser-html-description-2.png new file mode 100755 index 00000000..a43a5693 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/exomiser-html-description-2.png differ
diff --git a/bu_isciii/assets/reports/md/images/fastqc.png b/bu_isciii/assets/reports/md/images/fastqc.png new file mode 100644 index 00000000..8abb95f2 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/fastqc.png differ
diff --git a/bu_isciii/assets/reports/md/images/icarus.png b/bu_isciii/assets/reports/md/images/icarus.png new file mode 100644 index 00000000..3396e239 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/icarus.png differ
diff --git a/bu_isciii/assets/reports/md/images/kraken2.png b/bu_isciii/assets/reports/md/images/kraken2.png new file mode 100644 index 00000000..0f0802d4 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/kraken2.png differ
diff --git a/bu_isciii/assets/reports/md/images/mag_workflow.png b/bu_isciii/assets/reports/md/images/mag_workflow.png index 69c93297..d4cda1a0 100755 Binary files a/bu_isciii/assets/reports/md/images/mag_workflow.png and b/bu_isciii/assets/reports/md/images/mag_workflow.png differ
diff --git a/bu_isciii/assets/reports/md/images/mag_workflow.svg b/bu_isciii/assets/reports/md/images/mag_workflow.svg
index 7f78e0fc..293354db 100755
--- a/bu_isciii/assets/reports/md/images/mag_workflow.svg
+++ b/bu_isciii/assets/reports/md/images/mag_workflow.svg
[SVG source diff not shown. Recoverable text from the updated figure: the nf-core/mag workflow diagram is updated to v2.5.0, adding panels for taxonomic and domain classification (Centrifuge, Kraken2, Krona, GTDB-Tk, Tiara, MetaEuk), virus identification (geNomad), aDNA validation (pyDamage, Freebayes, BCFTools), binning (MetaBAT2, MaxBin2, CONCOCT), binning refinement (DAS Tool), bin post-processing and evaluation (BUSCO, CheckM, GUNC, QUAST), and abundance estimation and visualization. Design credit: CC-BY 4.0, originally by Zandra Fagernäs.]
diff --git a/bu_isciii/assets/reports/md/images/mqc_alignment_check.png b/bu_isciii/assets/reports/md/images/mqc_alignment_check.png new file mode 100755 index 00000000..507e6496 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_alignment_check.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_cutadapt_trimmed.png b/bu_isciii/assets/reports/md/images/mqc_cutadapt_trimmed.png new file mode 100644 index 00000000..279eb4e5 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_cutadapt_trimmed.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_deseq2_clustering.png b/bu_isciii/assets/reports/md/images/mqc_deseq2_clustering.png new file mode 100755 index 00000000..f44010a9 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_deseq2_clustering.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_deseq2_pca.png b/bu_isciii/assets/reports/md/images/mqc_deseq2_pca.png new file mode 100755 index 00000000..25e86ba6 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_deseq2_pca.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_dupradar.png b/bu_isciii/assets/reports/md/images/mqc_dupradar.png new file mode 100755 index 00000000..7dd66a1b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_dupradar.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_featurecounts_biotype.png b/bu_isciii/assets/reports/md/images/mqc_featurecounts_biotype.png new file mode 100755 index 00000000..652ca836 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_featurecounts_biotype.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_hisat2.png b/bu_isciii/assets/reports/md/images/mqc_hisat2.png new file mode 100755 index 00000000..eea1c99e Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_hisat2.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_picard_markduplicates.png b/bu_isciii/assets/reports/md/images/mqc_picard_markduplicates.png new file mode 100755 index 00000000..33b4e3d7 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_picard_markduplicates.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_preseq_plot.png b/bu_isciii/assets/reports/md/images/mqc_preseq_plot.png new file mode 100755 index 00000000..c4c98f17 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_preseq_plot.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_qualimap_coverage.png b/bu_isciii/assets/reports/md/images/mqc_qualimap_coverage.png new file mode 100755 index 00000000..3696115c Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_qualimap_coverage.png differ
diff --git a/bu_isciii/assets/reports/md/images/mqc_qualimap_features.png b/bu_isciii/assets/reports/md/images/mqc_qualimap_features.png new file mode 100755 index 00000000..b2853314 Binary files
/dev/null and b/bu_isciii/assets/reports/md/images/mqc_qualimap_features.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rsem_mapped.png b/bu_isciii/assets/reports/md/images/mqc_rsem_mapped.png new file mode 100755 index 00000000..da7449d7 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rsem_mapped.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rsem_multimapped.png b/bu_isciii/assets/reports/md/images/mqc_rsem_multimapped.png new file mode 100755 index 00000000..b4d0548b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rsem_multimapped.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_inferexperiment.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_inferexperiment.png new file mode 100755 index 00000000..c020564a Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_inferexperiment.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_innerdistance.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_innerdistance.png new file mode 100755 index 00000000..0b3d5679 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_innerdistance.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionannotation.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionannotation.png new file mode 100755 index 00000000..76ec230f Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionannotation.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionsaturation.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionsaturation.png new file mode 100755 index 00000000..b19080a0 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_junctionsaturation.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_readdistribution.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_readdistribution.png new file mode 100755 index 00000000..29bee6ce Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_readdistribution.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_rseqc_readduplication.png b/bu_isciii/assets/reports/md/images/mqc_rseqc_readduplication.png new file mode 100755 index 00000000..588f0935 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_rseqc_readduplication.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_salmon.png b/bu_isciii/assets/reports/md/images/mqc_salmon.png new file mode 100755 index 00000000..baa0ac6b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_salmon.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_samtools_idxstats.png b/bu_isciii/assets/reports/md/images/mqc_samtools_idxstats.png new file mode 100755 index 00000000..a3eff4c3 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_samtools_idxstats.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_samtools_mapped.png b/bu_isciii/assets/reports/md/images/mqc_samtools_mapped.png new file mode 100755 index 00000000..33376009 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_samtools_mapped.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_sortmerna.png b/bu_isciii/assets/reports/md/images/mqc_sortmerna.png new file mode 100755 index 00000000..04c9e505 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_sortmerna.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_star.png 
b/bu_isciii/assets/reports/md/images/mqc_star.png new file mode 100755 index 00000000..b9675cea Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_star.png differ diff --git a/bu_isciii/assets/reports/md/images/mqc_strand_check.png b/bu_isciii/assets/reports/md/images/mqc_strand_check.png new file mode 100755 index 00000000..7b163f9e Binary files /dev/null and b/bu_isciii/assets/reports/md/images/mqc_strand_check.png differ diff --git a/bu_isciii/assets/reports/md/images/nanoplot.png b/bu_isciii/assets/reports/md/images/nanoplot.png new file mode 100644 index 00000000..dc755a73 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nanoplot.png differ diff --git a/bu_isciii/assets/reports/md/images/nf-core-mag_logo_dark.png b/bu_isciii/assets/reports/md/images/nf-core-mag_logo_dark.png new file mode 100755 index 00000000..fbeb3e81 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nf-core-mag_logo_dark.png differ diff --git a/bu_isciii/assets/reports/md/images/nf-core-mag_logo_light.png b/bu_isciii/assets/reports/md/images/nf-core-mag_logo_light.png new file mode 100755 index 00000000..f377e05b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nf-core-mag_logo_light.png differ diff --git a/bu_isciii/assets/reports/md/images/nf-core-rnaseq_metro_map_grey.png b/bu_isciii/assets/reports/md/images/nf-core-rnaseq_metro_map_grey.png new file mode 100644 index 00000000..0dbf23f8 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nf-core-rnaseq_metro_map_grey.png differ diff --git a/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map.svg b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map.svg new file mode 100755 index 00000000..38e6792c --- /dev/null +++ b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map.svg @@ -0,0 +1,7257 @@ [SVG markup elided (7257 added lines): nf-core/viralrecon metro map. Illumina stages: 1. Pre-processing, 2. Alignment & BAM post-processing, 3. Variant calling, 4. Consensus calling, 5. De novo assembly, 6. Final QC; tools shown include cat fastq, FastQC, fastp, Kraken2, Bowtie2, iVar trim, picard MarkDuplicates/CollectMultipleMetrics, mosdepth, iVar variants, BCFTools, SnpEff, SnpSift, iVar consensus, Pangolin, Nextclade, QUAST, cutadapt, SPAdes, minia, Unicycler, BLAST, ABACAS, PlasmidID, Bandage, ASCIIGenome, SAMtools and MultiQC, plus the four variant-calling method combinations (variants and consensus each via iVar or BCFTools). Nanopore stages: 1. Pre-processing, 2. Alignment, variant & consensus calling, 3. Consensus analysis, 4. Variant analysis, 5. Final QC; tools shown include pycoQC, NanoPlot, artic guppyplex, artic minion, vcflib vcfuniq, SAMtools view, mosdepth, QUAST, SnpEff, SnpSift, Pangolin, Nextclade, ASCIIGenome and MultiQC.]
diff --git a/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_illumina.png b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_illumina.png new file mode 100755 index 00000000..fd5f2928 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_illumina.png differ diff --git a/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_nanopore.png b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_nanopore.png new file mode 100755 index 00000000..5fe0cdf1 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/nf-core-viralrecon_metro_map_nanopore.png differ diff --git a/bu_isciii/assets/reports/md/images/prokka.png b/bu_isciii/assets/reports/md/images/prokka.png new file mode 100644 index 00000000..4719ee86 Binary files /dev/null and b/bu_isciii/assets/reports/md/images/prokka.png differ diff --git a/bu_isciii/assets/reports/md/images/quast.png b/bu_isciii/assets/reports/md/images/quast.png new file mode 100644 index 00000000..4c33b02b Binary files /dev/null and b/bu_isciii/assets/reports/md/images/quast.png differ diff --git a/bu_isciii/assets/reports/md/irma.md b/bu_isciii/assets/reports/md/irma.md new file mode 100644 index 00000000..1ed531bd --- /dev/null +++ b/bu_isciii/assets/reports/md/irma.md @@ -0,0 +1,144 @@ +# IRMA + +IRMA does not include quality trimming of the reads by itself. Therefore, before the execution of the pipeline the samples are analysed with FastQC for quality control and pre-processed with fastp. + +## FASTQC + +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). A representative command is sketched after the output listing below. + +
+Output files + +- `fastqc/raw/` + - `*_fastqc.html`: FastQC report containing quality metrics. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +**NB:** The FastQC plots in this directory are generated relative to the raw input reads. They may contain adapter sequence and regions of low quality. To see how your reads look after trimming, please refer to the FastQC reports in the `fastqc/trim/` directory. + +
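For orientation, the raw-read QC above boils down to a FastQC call along these lines; this is a minimal sketch, and the sample file names and output directory are placeholders rather than values taken from the service templates.

```bash
# Hypothetical sketch: run FastQC on the raw reads, reports written to fastqc/raw/
mkdir -p fastqc/raw
fastqc --outdir fastqc/raw sample_R1.fastq.gz sample_R2.fastq.gz
```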
+ +![](images/mqc_fastqc_plot.png) + +## FASTP + +[Fastp](https://github.com/OpenGene/fastp?tab=readme-ov-file#fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. It has been developed in C++ with multithreading support to achieve higher performance. This tool is used to pre-process the reads with multiple filters: in this workflow it performs quality trimming and filters out short reads, adapters, and polyG/polyX tails. Its output includes an HTML report with statistics describing the state of the reads before and after processing. A representative invocation is sketched after the output listing below. + +
+Output files + +- `fastp/` + - `*.fastp.html`: Trimming report in html format. + - `*.fastp.json`: Trimming report in json format. +- `fastp/log/` + - `*.fastp.log`: Trimming log file. +- `fastqc/trim/` + - `*_fastqc.html`: FastQC report of the trimmed reads. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +
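A fastp invocation consistent with the filters described above might look like the following sketch; all file names and cutoff values are placeholders, not the exact parameters used by the service lablog.

```bash
# Hypothetical sketch: paired-end pre-processing with fastp.
# Quality-trims reads, drops short reads, removes adapters,
# and trims polyG/polyX tails; all cutoffs are placeholder values.
fastp \
  --in1 sample_R1.fastq.gz --in2 sample_R2.fastq.gz \
  --out1 sample_R1.trim.fastq.gz --out2 sample_R2.trim.fastq.gz \
  --qualified_quality_phred 20 \
  --length_required 50 \
  --detect_adapter_for_pe \
  --trim_poly_g --trim_poly_x \
  --json sample.fastp.json --html sample.fastp.html
```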
+ +![](images/mqc_fastp_plot.png) + +## IRMA workflow + +[IRMA (Iterative Refinement Meta-Assembler)](https://wonder.cdc.gov/amd/flu/irma/) was designed for the robust assembly, variant calling, and phasing of highly variable RNA viruses. IRMA is deployed for this service to analyse Influenza viruses. IRMA is free to use and parallelizes computations for both cluster computing and single-computer multi-core setups. You can read the [IRMA manuscript](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6#Sec9) for more background on the methodology. + +![](images/IRMA_workflow.png) + +**IRMA** uses several auxiliary programs in its workflow: + +- [BLAT](http://www.kentinformatics.com/products.html) for the match step +- [LABEL](https://wonder.cdc.gov/amd/flu/label), which also packages certain resources used by IRMA: + - [Sequence Alignment and Modeling System (SAM)](http://www.ncbi.nlm.nih.gov/pubmed/9927713) for both the rough align and sort steps + - [Shogun Toolbox](https://github.com/shogun-toolbox/shogun), which is an essential part of LABEL, is used in the sort step. +- [SSW](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0082138) for the final assembly step. +- [MINIMAP2](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) for the final assembly step (an alternative that may be useful for long-read assemblies). +- [samtools](http://samtools.sourceforge.net/) for BAM-SAM conversion as well as BAM sorting and indexing +- [GNU Parallel](http://www.gnu.org/software/parallel/) for single-node parallelization + +### IRMA output directory structure + +**_gene\_segment_** in the case of influenza can be any of the following: [HA, NA, NS, MP, NP, PA, PB1, PB2], plus HE for Influenza C + +### **Final files:** + +- **_gene\_segment_.bam** Sorted BAM file for the final gene_segment assembly (merged if applicable) +- **_gene\_segment_.bam.bai** BAM file index for gene_segment assembly +- **_gene\_segment_.fasta** Final assembled plurality consensus (no mixed base calls) for gene_segment +- **_gene\_segment_.a2m** Optional file: Plurality consensus aligned to profile HMM +- **_gene\_segment_.vcf** Custom variant call file for called IRMA variants for each gene segment + +### **Folders:** + +**amended_consensus/**: Assembled consensus per gene segment with mixed base calls.
Numbers correspond to fragments in $SEG_NUMBERS +- **_sample\_name_\__fragment\_number_.fa** Amended consensus +- **_sample\_name_\_7.a2m** Optional output: amended global alignment to profile HMM +- **_sample\_name_\_7.pad.fa** Optional output: N-padded consensus for amplicon dropouts + +**figures/** + + - **_gene\_segment_-coverageDiagram.pdf** Shows coverage and variant calls + - **_gene\_segment_-heuristics.pdf** Heuristic graphs for gene segment + - **_gene\_segment_-EXPENRD.pdf** gene segment variant phasing using experimental enrichment distances + - **_gene\_segment_-JACCARD.pdf** gene segment variant phasing using modified Jaccard distances + - **_gene\_segment_-MUTUALD.pdf** gene segment variant phasing using mutual association distances + - **_gene\_segment_-NJOINTP.pdf** gene segment variant phasing using normalized joint probability distances + - **READ_PERCENTAGES.pdf** Breakdown of assembled reads + +**intermediate/** Intermediate data for each step + + - **0-ITERATIVE-REFERENCES/** + - **R0-_gene\_segment_.ref** Starting reference library sequence for segment + - **R1-_gene\_segment_.ref** Gene segment working reference after round 1, template for round 2 + - **R2-_gene\_segment_.ref** Working reference for gene segment after round 2 + - **1-MATCH_BLAT/** + - **R1.tar.gz** + - **R2.tar.gz** Archive of BLAT results for the MATCH step + - **R3.tar.gz** + - **2-SORT_BLAT/** + - **R1.tar.gz** Classification/sorting intermediate files for round 1 + - **R1.txt** Summary statistics of sorting results for round 1 + - **R2.tar.gz** Classification/sorting intermediate files for round 2 + - **R2.txt** Summary statistics of sorting results for round 2 + - **3-ALIGN_SAM/BLAT/** + - **storedCounts.tar.gz** Statistic files used to create rough consensus sequences + - **4-ASSEMBLE_SSW/** Final assembly phase intermediate iterative results + - **F1-_gene\_segment_.bam** Unsorted BAM file for gene segment assembly, iteration 1 + - **F1-_gene\_segment_.ref** Reference for final assembly, gene segment, iteration 1 + - **F2-_gene\_segment_.bam** Unsorted BAM file for gene segment assembly, iteration 2 + - **F2-_gene\_segment_.ref** Reference for final assembly, gene segment, iteration 2 + - **reads.tar.gz** Archive of sorted, unmerged reads by gene segment + +**logs/** + + - **ASSEMBLY_log.txt** SSW scores for all rounds tried in the iterative refinement + - **NR_COUNTS_log.txt** Read pattern counts at various stages + - **QC_log.txt** Quality control output + - **READ_log.txt** Counts of assembled reads from BAM files + - **FLU-_sample\_name_.sh** Configuration file corresponding to this IRMA run + - **run_info.txt** Table of parameters used by the IRMA run + +**matrices/** Phasing matrices used to generate heat maps + + - **_gene\_segment_-EXPENRD.sqm** + - **_gene\_segment_-JACCARD.sqm** + - **_gene\_segment_-MUTUALD.sqm** + - **_gene\_segment_-NJOINTP.sqm** + +**secondary/** + + - **R1-A_NA_N1.fa** Trace A_NA_N1 sorted into secondary status + - **R1-UNRECOGNIZABLE.fa** Read patterns that matched flu but had poor signal according to LABEL + - **R2-UNRECOGNIZABLE.fa** + - **unmatched_read_patterns.tar.gz** Archive of leftover read patterns that did not match FLU + +**tables/** + - **_gene\_segment_-pairingStats.txt** Summary of paired-end merging statistics, if applicable, gene segment + - **_gene\_segment_-coverage.txt** Summary coverage statistics for assembly, gene segment + - **_gene\_segment_-coverage.a2m.txt** Optional file: Coverage statistics for plurality consensus globally aligned to
profile HMM + - **_gene\_segment_-coverage.pad.txt** Optional file: Coverage statistics for padded plurality consensus globally aligned to profile HMM + - **_gene\_segment_-allAlleles.txt** Statistics for every position & allele in the assembly, gene segment + - **_gene\_segment_-insertions.txt** Called insertion variants for gene segment + - **_gene\_segment_-deletions.txt** Called deletion variants for gene segment + - **_gene\_segment_-variants.txt** Called single nucleotide variants for gene segment + - **READ_COUNTS.txt** Read counts for various points in the assembly process \ No newline at end of file diff --git a/bu_isciii/assets/reports/md/mag.md b/bu_isciii/assets/reports/md/mag.md index 644efb1d..67ef902a 100644 --- a/bu_isciii/assets/reports/md/mag.md +++ b/bu_isciii/assets/reports/md/mag.md @@ -10,16 +10,19 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [Quality control](#quality-control) of input reads - trimming and contaminant removal -* [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads) -* [Assembly](#assembly) of trimmed reads -* [Protein-coding gene prediction](#gene-prediction) of assemblies -* [Binning](#binning) of assembled contigs -* [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes) -* [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes) -* [Additional summary for binned genomes](#additional-summary-for-binned-genomes) -* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline -* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Quality control](#quality-control) of input reads - trimming and contaminant removal +- [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads) +- [Digital sequencing normalisation](#digital-normalization-with-BBnorm) +- [Assembly](#assembly) of trimmed reads +- [Protein-coding gene prediction](#gene-prediction) of assemblies +- [Virus identification](#virus-identification-in-assemblies) of assemblies +- [Binning and binning refinement](#binning-and-binning-refinement) of assembled contigs +- [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes) +- [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes) +- [Additional summary for binned genomes](#additional-summary-for-binned-genomes) +- [Ancient DNA](#ancient-dna) +- [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Note that when specifying the parameter `--coassemble_group`, for the corresponding output filenames/directories of the assembly or downstream processes the group ID, or more precisely the term `group-[group_id]`, will be used instead of the sample ID. @@ -36,9 +39,9 @@ FastQC is run for visualising the general quality metrics of the sequencing runs
Output files -* `QC_shortreads/fastqc/` - * `[sample]_[1/2]_fastqc.html`: FastQC report, containing quality metrics for your untrimmed raw fastq files - * `[sample].trimmed_[1/2]_fastqc.html`: FastQC report, containing quality metrics for trimmed and, if specified, filtered read files +- `QC_shortreads/fastqc/` + - `[sample]_[1/2]_fastqc.html`: FastQC report, containing quality metrics for your untrimmed raw fastq files + - `[sample].trimmed_[1/2]_fastqc.html`: FastQC report, containing quality metrics for trimmed and, if specified, filtered read files
@@ -51,9 +54,21 @@ FastQC is run for visualising the general quality metrics of the sequencing runs
Output files -* `QC_shortreads/fastp/[sample]/` - * `fastp.html`: Interactive report - * `fastp.json`: Report in json format +- `QC_shortreads/fastp/[sample]/` + - `fastp.html`: Interactive report + - `fastp.json`: Report in json format + +
+ +### AdapterRemoval2 + +[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. + +
+Output files + +- `QC_shortreads/adapterremoval/[sample]/` + - `[sample]_ar2.settings`: AdapterRemoval log file.
@@ -64,8 +79,8 @@ The pipeline uses bowtie2 to map the reads against PhiX and removes mapped reads
Output files -* `QC_shortreads/remove_phix/` - * `[sample].phix_removed.bowtie2.log`: Contains a brief log file indicating how many reads have been retained. +- `QC_shortreads/remove_phix/` + - `[sample].phix_removed.bowtie2.log`: Contains a brief log file indicating how many reads have been retained.
@@ -76,8 +91,9 @@ The pipeline uses bowtie2 to map short reads against the host reference genome s
Output files -* `QC_shortreads/remove_host/` - * `[sample].host_removed.bowtie2.log`: Contains the bowtie2 log file indicating how many reads have been mapped as well as a file listing the read ids of discarded reads. +- `QC_shortreads/remove_host/` + - `[sample].host_removed.bowtie2.log`: Contains the bowtie2 log file indicating how many reads have been mapped. + - `[sample].host_removed.mapped*.read_ids.txt`: Contains a file listing the read ids of discarded reads.
@@ -88,8 +104,8 @@ The pipeline uses Nanolyse to map the reads against the Lambda phage and removes
Output files -* `QC_longreads/NanoLyse/` - * `[sample]_nanolyse.log`: Contains a brief log file indicating how many reads have been retained. +- `QC_longreads/NanoLyse/` + - `[sample]_nanolyse.log`: Contains a brief log file indicating how many reads have been retained.
@@ -109,9 +125,23 @@ NanoPlot is used to calculate various metrics and plots about the quality and le
Output files -* `QC_longreads/NanoPlot/[sample]/` - * `raw_*.[png/html/txt]`: Plots and reports for raw data - * `filtered_*.[png/html/txt]`: Plots and reports for filtered data +- `QC_longreads/NanoPlot/[sample]/` + - `raw_*.[png/html/txt]`: Plots and reports for raw data + - `filtered_*.[png/html/txt]`: Plots and reports for filtered data + +
+ +## Digital normalization with BBnorm + +If the pipeline is called with the `--bbnorm` option, it will normalize the sequencing depth of libraries prior to assembly by removing reads to 1) reduce coverage of very abundant kmers and 2) delete very rare kmers (see the `--bbnorm_target` and `--bbnorm_min` parameters). +When called in conjunction with `--coassemble_group`, BBnorm will operate on interleaved (merged) FastQ files, producing only a single output file. +If the `--save_bbnorm_reads` parameter is set, the resulting FastQ files are saved together with log output. A sketch of the underlying `bbnorm.sh` call follows the output listing below. + +
+Output files + +- `bbmap/bbnorm/[sample]\*.fastq.gz` +- `bbmap/bbnorm/log/[sample].bbnorm.log`
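Outside Nextflow, the normalization step corresponds roughly to a `bbnorm.sh` call like the sketch below, where `target` and `min` play the roles of `--bbnorm_target` and `--bbnorm_min`; file names and values are placeholders.

```bash
# Hypothetical sketch: depth normalization with BBNorm
# target= caps coverage of abundant kmers; min= drops very rare kmers
bbnorm.sh \
  in=sample_R1.fastq.gz in2=sample_R2.fastq.gz \
  out=sample_R1.norm.fastq.gz out2=sample_R2.norm.fastq.gz \
  target=100 min=5 \
  2> sample.bbnorm.log
```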
@@ -124,9 +154,9 @@ Kraken2 classifies reads using a k-mer based approach as well as assigns taxonom
Output files -* `Taxonomy/kraken2/[sample]/` - * `kraken2.report`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details - * `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) +- `Taxonomy/kraken2/[sample]/` + - `kraken2.report`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details + - `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki)
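In standalone form, the classification step is roughly the following (database path and read files are placeholders); the `taxonomy.krona.html` chart is subsequently rendered from the classification output with KronaTools.

```bash
# Hypothetical sketch: taxonomic classification of trimmed reads with Kraken2
kraken2 \
  --db /path/to/kraken2_db \
  --paired sample_R1.trim.fastq.gz sample_R2.trim.fastq.gz \
  --report kraken2.report \
  --output kraken2.classified.txt
```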
@@ -139,10 +169,10 @@ More information on the [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) w
Output files -* `Taxonomy/centrifuge/[sample]/` - * `report.txt`: Tab-delimited result file. See the [centrifuge manual](https://ccb.jhu.edu/software/centrifuge/manual.shtml#centrifuge-classification-output) for information about the fields - * `kreport.txt`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details - * `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) +- `Taxonomy/centrifuge/[sample]/` + - `report.txt`: Tab-delimited result file. See the [centrifuge manual](https://ccb.jhu.edu/software/centrifuge/manual.shtml#centrifuge-classification-output) for information about the fields + - `kreport.txt`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details + - `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki)
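A comparable standalone Centrifuge run would be along these lines; the index and file names are placeholders.

```bash
# Hypothetical sketch: read classification with Centrifuge
centrifuge \
  -x /path/to/centrifuge_index \
  -1 sample_R1.trim.fastq.gz -2 sample_R2.trim.fastq.gz \
  -S centrifuge_results.txt \
  --report-file report.txt
# A Kraken-style report (kreport.txt) can then be derived with centrifuge-kreport.
```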
@@ -157,12 +187,13 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl
Output files -* `Assembly/MEGAHIT/` - * `[sample/group].contigs.fa.gz`: Compressed metagenome assembly in fasta format - * `[sample/group].log`: Log file - * `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - * `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - * `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). +- `Assembly/MEGAHIT/` + - `[sample/group].contigs.fa.gz`: Compressed metagenome assembly in fasta format + - `[sample/group].log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -173,14 +204,15 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl
Output files -* `Assembly/SPAdes/` - * `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - * `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - * `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - * `[sample/group].log`: Log file - * `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - * `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - * `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). +- `Assembly/SPAdes/` + - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format + - `[sample/group].log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdes-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -191,14 +223,15 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft
Output files -* `Assembly/SPAdesHybrid/` - * `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - * `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - * `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - * `[sample/group].log`: Log file - * `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - * `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - * `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). +- `Assembly/SPAdesHybrid/` + - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format + - `[sample/group].log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdesHybrid-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -209,10 +242,19 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft
Output files -* `Assembly/[assembler]/QC/[sample/group]/` - * `report.*`: QUAST report in various formats, such as html, txt, tsv or tex - * `quast.log`: QUAST log file - * `predicted_genes/[assembler]-[sample/group].rna.gff`: Contig positions for rRNA genes in gff version 3 format +- `Assembly/[assembler]/QC/[sample/group]/QUAST/` + - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt + - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) + - `quast.log`: QUAST log file + - `metaquast.log`: MetaQUAST log file + - `icarus.html`: Icarus main menu with links to interactive viewers + - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest + - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) + - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs + - `basic_stats/[assembler]-[sample/group]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs + - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. + - `predicted_genes/[assembler]-[sample/group].rna.gff`: Contig positions for rRNA genes in gff version 3 format + - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
@@ -223,25 +265,56 @@ Protein-coding genes are predicted for each assembly.
Output files -* `Prodigal/` - * `[sample/group].gff`: Gene Coordinates in GFF format - * `[sample/group].faa`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. - * `[sample/group].fna`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). - * `[sample/group]_all.txt`: Information about start positions of genes. +- `Annotation/Prodigal/` + - `[assembler]-[sample/group].gff.gz`: Gene Coordinates in GFF format + - `[assembler]-[sample/group].faa.gz`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. + - `[assembler]-[sample/group].fna.gz`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). + - `[assembler]-[sample/group]_all.txt.gz`: Information about start positions of genes. + +
+ +## Virus identification in assemblies + +### geNomad + +[geNomad](https://github.com/apcamargo/genomad) identifies viruses and plasmids in sequencing data (isolates, metagenomes, and metatranscriptomes). A sketch of its `end-to-end` command follows the output listing below. + +
+Output files + +- `VirusIdentification/geNomad/[assembler]-[sample/group]*/` + - `[assembler]-[sample/group]*_annotate` + - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data + - `[assembler]-[sample/group]*_aggregated_classification` + - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format + - `[assembler]-[sample/group]*_find_proviruses` + - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad + - `[assembler]-[sample/group]*_summary` + - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format + - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format + - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format + - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format + - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format + - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary)
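geNomad's `end-to-end` command runs all of the modules listed above in sequence; a minimal sketch, with placeholder paths:

```bash
# Hypothetical sketch: virus and plasmid identification with geNomad
genomad end-to-end \
  assembly.contigs.fna \
  genomad_output/ \
  /path/to/genomad_db
```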
-## Binning +## Binning and binning refinement ### Contig sequencing depth -Sequencing depth per contig and sample is generated by `jgi_summarize_bam_contig_depths --outputDepth`. The values correspond to `(sum of exactely aligned bases) / ((contig length)-2*75)`. For example, for two reads aligned exactly with `10` and `9` bases on a 1000 bp long contig the depth is calculated by `(10+9)/(1000-2*75)` (1000bp length of contig minus 75bp from each end, which is excluded). +Sequencing depth per contig and sample is generated by MetaBAT2's `jgi_summarize_bam_contig_depths --outputDepth`. The values correspond to `(sum of exactly aligned bases) / ((contig length)-2*75)`. For example, for two reads aligned exactly with `10` and `9` bases on a 1000 bp long contig the depth is calculated by `(10+9)/(1000-2*75)` (1000bp length of contig minus 75bp from each end, which is excluded). + +These depth files are used for downstream binning steps.
Output files -* `GenomeBinning/` - * `[assembler]-[sample/group]-depth.txt.gz`: Sequencing depth for each contig and sample or group, only for short reads. +- `GenomeBinning/depths/contigs/` + - `[assembler]-[sample/group]-depth.txt.gz`: Sequencing depth for each contig and sample or group, only for short reads.
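Plugging the worked example above into the formula gives `(10+9)/(1000-2*75) = 19/850 ≈ 0.022`. The depth table itself comes from MetaBAT2's helper script, roughly as sketched here; the BAM file names are placeholders.

```bash
# Hypothetical sketch: per-contig depth table from sorted BAM files
jgi_summarize_bam_contig_depths \
  --outputDepth assembler-sample-depth.txt \
  sample1.sorted.bam sample2.sorted.bam
```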
@@ -252,22 +325,25 @@ Sequencing depth per contig and sample is generated by `jgi_summarize_bam_contig
Output files -* `GenomeBinning/MetaBAT2/` - * `[assembler]-[sample/group].*.fa`: Genome bins retrieved from input assembly - * `[assembler]-[sample/group].unbinned.*.fa`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported +- `GenomeBinning/MetaBAT2/` + - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly + - `unbinned/[assembler]-[binner]-[sample/group].unbinned.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported
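For reference, MetaBAT2 consumes the assembly plus the depth table from the previous section; a standalone sketch with placeholder paths, where `-m 1500` mirrors the 1500 bp minimum contig size mentioned below:

```bash
# Hypothetical sketch: contig binning with MetaBAT2
metabat2 \
  -i assembly.contigs.fa \
  -a assembler-sample-depth.txt \
  -o GenomeBinning/MetaBAT2/bins/assembler-MetaBAT2-sample \
  -m 1500
```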
-All the files and contigs in this folder will be assessed by QUAST and BUSCO. +All the files and contigs in these folders will be assessed by QUAST and BUSCO. + +All other files, i.e. those discarded by the tool or containing low-quality unbinned contigs, can be found here. + +
Output files -* `GenomeBinning/MetaBAT2/discarded/` - * `*.lowDepth.fa.gz`: Low depth contigs that are filtered by MetaBat2 - * `*.tooShort.fa.gz`: Too short contigs that are filtered by MetaBat2 - * `*.unbinned.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. - * `*.unbinned.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. +- `GenomeBinning/MetaBAT2/discarded/` + - `*.lowDepth.fa.gz`: Low depth contigs that are filtered by MetaBAT2 + - `*.tooShort.fa.gz`: Too short contigs that are filtered by MetaBAT2 +- `GenomeBinning/MetaBAT2/unbinned/discarded/` + - `*.unbinned.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. + - `*.unbinned.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file.
@@ -275,61 +351,166 @@ All the files in this folder contain small and/or unbinned contigs that are not Files in these two folders contain all contigs of an assembly. +### MaxBin2 + +[MaxBin2](https://sourceforge.net/projects/maxbin2/) recovers genome bins (that is, contigs/scaffolds that all belong to the same organism) from metagenome assemblies. + +
+Output files + +- `GenomeBinning/MaxBin2/` + - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly + - `unbinned/[assembler]-[binner]-[sample/group].noclass.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported. + +
+ +All the files and contigs in these folders will be assessed by QUAST and BUSCO. + +
+Output files + +- `GenomeBinning/MaxBin2/discarded/` + - `*.tooshort.gz`: Too short contigs that are filtered by MaxBin2 +- `GenomeBinning/MaxBin2/unbinned/discarded/` + - `*.noclass.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. + - `*.noclass.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. + +
+ +All the files in this folder contain small and/or unbinned contigs that are not further processed. + +Files in these two folders contain all contigs of an assembly. + +### CONCOCT + +[CONCOCT](https://github.com/BinPro/CONCOCT) performs unsupervised binning of metagenomic contigs by using nucleotide composition, coverage data in multiple samples and linkage data from paired end reads. + +
+Output files + +- `GenomeBinning/CONCOCT/` + - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly + - `stats/[assembler]-[binner]-[sample/group].csv`: Table indicating which contig goes with which cluster bin. + - `stats/[assembler]-[binner]-[sample/group]*_gt1000.csv`: Various intermediate PCA statistics used for clustering. + - `stats/[assembler]-[binner]-[sample/group]_*.tsv`: Coverage statistics of each sub-contig cut up by CONCOCT in an intermediate step prior to binning. Likely not useful in most cases. + - `stats/[assembler]-[binner]-[sample/group].log.txt`: CONCOCT execution log file. + - `stats/[assembler]-[binner]-[sample/group]_*.args`: List of arguments used in CONCOCT execution. + -
+ +All the files and contigs in these folders will be assessed by QUAST and BUSCO, if the parameter `--postbinning_input` is not set to `refined_bins_only`. + +Note that CONCOCT does not output what it considers 'unbinned' contigs; therefore, no 'discarded' contigs are produced here. You may still need to do your own manual curation of the resulting bins. + +### DAS Tool + +[DAS Tool](https://github.com/cmks/DAS_Tool) is an automated binning refinement method that integrates the results of a flexible number of binning algorithms to calculate an optimized, non-redundant set of bins from a single assembly. nf-core/mag uses this tool to attempt to further improve bins based on combining the MetaBAT2 and MaxBin2 binning output, assuming sufficient quality is met for those bins. + +DAS Tool will remove contigs from bins that do not pass additional filtering criteria, and will discard redundant lower-quality output from binners that represent the same estimated 'organism', until the single highest-quality bin is represented. A representative standalone invocation is sketched after the output listing below. + +> ⚠️ If DAS Tool does not find any bins passing your selected threshold it will exit with an error. Such an error is 'ignored' by nf-core/mag, therefore you will not find files in the `GenomeBinning/DASTool/` results directory for that particular sample. + +
+Output files + +- `GenomeBinning/DASTool/` + - `[assembler]-[sample/group]_allBins.eval`: Tab-delimited description with quality and completeness metrics for the input bin sets. Quality and completeness are estimated by DAS TOOL using a scoring function based on the frequency of bacterial or archaeal reference single-copy genes (SCG). Please see note at the bottom of this section on file names. + - `[assembler]-[sample/group]_DASTool_summary.tsv`: Tab-delimited description with quality and completeness metrics for the refined output bin sets. + - `[assembler]-[sample/group]_DASTool_contig2bin.tsv`: File describing which contig is associated to which bin from the input binners. + - `[assembler]-[sample/group]_DASTool.log`: Log file from the DAS Tool run describing the command executed and additional runtime information. + - `[assembler]-[sample/group].seqlength`: Tab-delimited file describing the length of each contig. + - `bins/[assembler]-[binner]Refined-[sample/group].*.fa`: Refined bins in fasta format. + - `unbinned/[assembler]-DASToolUnbinned-[sample/group].*.fa`: Unbinned contigs from bin refinement in fasta format. + +
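A standalone refinement equivalent to the step described above might look like this; the contig-to-bin tables would first be derived from the MetaBAT2 and MaxBin2 bin FASTAs (for example with the `Fasta_to_Contig2Bin.sh` helper shipped with DAS Tool), and all paths are placeholders.

```bash
# Hypothetical sketch: refine MetaBAT2 + MaxBin2 bins with DAS Tool
DAS_Tool \
  -i metabat2_contigs2bin.tsv,maxbin2_contigs2bin.tsv \
  -l MetaBAT2,MaxBin2 \
  -c assembly.contigs.fa \
  -o GenomeBinning/DASTool/assembler-sample
```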
+ +By default, only the raw bins (and unbinned contigs) from the actual binning methods, but not from the binning refinement with DAS Tool, will be used for downstream bin quality control, annotation and taxonomic classification. The parameter `--postbinning_input` can be used to change this behaviour. + +⚠️ Due to the ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input`), bin names in DAS Tool's `*_allBins.eval` file will include `Refined`. However, for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DAS Tool to ensure they can be disambiguated from the original bin files in the downstream QC steps. + +### Tiara + +Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. It is used in this pipeline to identify, rapidly and with few resources, the most likely domain classification of each bin (or set of unbinned contigs) based on the classifications of its contigs. A minimal invocation is sketched after the output listing below. + +
+Output files + +- `Taxonomy/Tiara/` + - `[assembler]-[sample/group].tiara.txt` - Tiara output classifications (with probabilities) for all contigs within the specified sample/group assembly + - `log_[assembler]-[sample/group].txt` - log file detailing the parameters used by the Tiara model for contig classification. +- `GenomeBinning/tiara_summary.tsv` - Summary of Tiara domain classification for all bins. + +
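Tiara's interface is small; a run consistent with the outputs above would be roughly as follows, with placeholder file names.

```bash
# Hypothetical sketch: domain classification of contigs with Tiara
tiara -i assembler-sample.contigs.fasta -o assembler-sample.tiara.txt
```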
+ +Typically, you would use `tiara_summary.tsv` as the primary file to see which bins or unbins have been classified to which domains at a glance, whereas the files in `Taxonomy/Tiara` provide classifications for each contig. + ### Bin sequencing depth -For each genome bin the median sequencing depth is computed based on the corresponding contig depths given in `GenomeBinning/[assembler]-[sample/group]-depth.txt.gz`. +For each bin or refined bin the median sequencing depth is computed based on the corresponding contig depths.
Output files -* `GenomeBinning/` - * `bin_depths_summary.tsv`: Summary of bin sequencing depths for all samples. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. - * `[assembler]-[sample/group]-binDepths.heatmap.png`: Clustered heatmap showing bin abundances of the assembly across samples. Bin depths are transformed to centered log-ratios and bins as well as samples are clustered by Euclidean distance. Again, sample depths are available according to the mapping strategy specified with `--binning_map_mode`. +- `GenomeBinning/depths/bins/` + - `bin_depths_summary.tsv`: Summary of bin sequencing depths for all samples. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. + - `bin_refined_depths_summary.tsv`: Summary of sequencing depths for refined bins for all samples, if refinement was performed. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. + - `[assembler]-[binner]-[sample/group]-binDepths.heatmap.png`: Clustered heatmap showing bin abundances of the assembly across samples. Bin depths are transformed to centered log-ratios and bins as well as samples are clustered by Euclidean distance. Again, sample depths are available according to the mapping strategy specified with `--binning_map_mode`.
### QC for metagenome assembled genomes with QUAST -[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates genome assemblies by computing various metrics. The QUAST output is also included in the MultiQC report, as well as in the assembly directories themselves. +[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates genome assemblies by computing various metrics. The QUAST output is in the bin directories shown below. This QUAST output is not shown in the MultiQC report.
Output files -* `GenomeBinning/QC/QUAST/[assembler]-[bin]/` - * `report.*`: QUAST report in various formats, such as html, txt, tsv or tex - * `quast.log`: QUAST log file - * `predicted_genes/[assembler]-[sample/group].rna.gff`: Contig positions for rRNA genes in gff version 3 format -* `GenomeBinning/QC/` - * `quast_summary.tsv`: QUAST output for all bins summarized +- `GenomeBinning/QC/QUAST/[assembler]-[bin]/` + - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt + - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) + - `quast.log`: QUAST log file + - `metaquast.log`: MetaQUAST log file + - `icarus.html`: Icarus main menu with links to interactive viewers + - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest + - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) + - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs + - `basic_stats/[assembler]-[bin]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs + - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. + - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format + - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) +- `GenomeBinning/QC/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition. + - `quast_summary.tsv`: QUAST output for all bins summarized
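Each per-bin report above can be reproduced with a plain QUAST call along these lines; the paths are placeholders.

```bash
# Hypothetical sketch: assembly metrics for a single bin with QUAST
quast.py \
  -o GenomeBinning/QC/QUAST/assembler-bin1 \
  assembler-bin1.fa
```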
-### QC for metagenome assembled genomes with BUSCO +### QC for metagenome assembled genomes + +#### BUSCO -[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2. By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated. +[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_db`, only results for this specific lineage will be generated.
Output files -* `GenomeBinning/QC/BUSCO/` - * `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. - * `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. - * `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. - * `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`. - * `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). - * `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). - * `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal. +- `GenomeBinning/QC/BUSCO/` + - `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. + - `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. + - `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. + - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_db`. + - `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). + - `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). + - `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal.
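In automated lineage selection mode, the equivalent standalone BUSCO call is roughly the following; the bin file name and output name are placeholders.

```bash
# Hypothetical sketch: completeness assessment of one bin with BUSCO
busco \
  -i assembler-bin1.fa \
  -m genome \
  --auto-lineage \
  -o assembler-bin1_busco
```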
-If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directy. +If the parameter `--save_busco_db` is set, additionally the used BUSCO lineage datasets are stored in the output directory.
Output files -* `GenomeBinning/QC/BUSCO/` - * `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). - * `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`. +- `GenomeBinning/QC/BUSCO/` + - `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). + - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_db`.
@@ -338,29 +519,79 @@ Besides the reference files or output files created by BUSCO, the following summ
Output files -* `GenomeBinning/QC/` - * `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available. +- `GenomeBinning/QC/` + - `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available. + +
+ +#### CheckM + +[CheckM](https://ecogenomics.github.io/CheckM/) provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage. + +By default, nf-core/mag runs CheckM with the `check_lineage` workflow, which places genome bins on a reference tree to define lineage-marker sets and to check for completeness and contamination based on lineage-specific marker genes, and then subsequently runs `qa` to generate the summary files. A sketch of the equivalent standalone commands follows the output listing below. + +
+Output files + +- `GenomeBinning/QC/CheckM/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`). + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. + - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
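The two-stage flow described above maps onto standalone CheckM commands roughly as follows; thread count, extension and directories are placeholders.

```bash
# Hypothetical sketch: CheckM lineage workflow, then qa to summarise
checkm lineage_wf -t 8 -x fa bins/ checkm_out/
checkm qa --tab_table -f checkm_qa.txt checkm_out/lineage.ms checkm_out/
```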
+If the parameter `--save_checkm_reference` is set, the CheckM reference datasets used are additionally stored in the output directory. + +
+Output files + +- `GenomeBinning/QC/CheckM/` + - `checkm_downloads/`: All CheckM reference files downloaded from the CheckM FTP server, when not supplied by the user. + - `checkm_data_2015_01_16/*`: a range of directories and files required for CheckM to run. + +
+ +#### GUNC + +[Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for detection of chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy-based score to the taxonomic assignment and contig location of all genes in a genome. It is generally considered an additional complement to CheckM results. A sketch of a standalone run follows the output listing below. + +
+Output files + +- `GenomeBinning/QC/gunc_summary.tsv` +- `GenomeBinning/QC/gunc_checkm_summary.tsv` +- `[gunc-database].dmnd` +- `GUNC/` + - `raw/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/GUNC_checkM.merged.tsv`: Per sample GUNC [output](https://grp-bork.embl-community.io/gunc/output.html) containing taxonomic and completeness QC statistics. + - `checkmmerged/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/GUNC.progenomes_2.1.maxCSS_level.tsv`: Per sample GUNC output merged with output from [CheckM](#checkm) + +
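A standalone GUNC run corresponding to the `raw/` output above might look like the sketch below; the flag names follow the GUNC documentation as I understand it and all paths are placeholders, so check `gunc run --help` before relying on them.

```bash
# Hypothetical sketch: chimerism screening of bins with GUNC
gunc run \
  --input_dir bins/ \
  --db_file /path/to/gunc_db_progenomes2.1.dmnd \
  --out_dir GUNC/raw/
```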
+ +GUNC will be run if specified with `--run_gunc` as a standalone, unless CheckM is also activated via `--qc_tool 'checkm'`, in which case GUNC output will be merged with the CheckM output using `gunc merge_checkm`. + +If `--gunc_save_db` is specified, the output directory will also contain the requested database (progenomes, or GTDB) in DIAMOND format. + ## Taxonomic classification of binned genomes ### CAT -[CAT](https://github.com/dutilh/CAT) is a toolkit for annotating contigs and bins from metagenome-assembled-genomes. The MAG pipeline uses CAT to assign taxonomy to genome bins based on the taxnomy of the contigs. +[CAT](https://github.com/dutilh/CAT) is a toolkit for annotating contigs and bins from metagenome-assembled-genomes. The nf-core/mag pipeline uses CAT to assign taxonomy to genome bins based on the taxonomy of the contigs. A sketch of a direct CAT run follows the output listing below. + +
Output files -* `Taxonomy/CAT/[assembler]/` - * `[assembler]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names - * `[assembler]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names -* `Taxonomy/CAT/[assembler]/raw/` - * `[assembler]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format - * `[assembler]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format - * `[assembler]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig - * `[assembler]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins - * `[assembler]-[sample/group].log`: Log files +- `Taxonomy/CAT/[assembler]/[binner]/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names +- `Taxonomy/CAT/[assembler]/[binner]/raw/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
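A direct CAT run matching the outputs above would be along the lines of this sketch, with placeholder database and taxonomy folders; `CAT add_names` then attaches the full lineage names to the classification tables.

```bash
# Hypothetical sketch: taxonomic classification of bins with CAT
CAT bins \
  -b bins/ \
  -d CAT_database/ \
  -t CAT_taxonomy/ \
  -s .fa
```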
@@ -369,7 +600,7 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
Output files -* `Taxonomy/CAT/CAT_prepare_*.tar.gz`: Generated and used CAT database. +- `Taxonomy/CAT/CAT_prepare_*.tar.gz`: Generated and used CAT database.
@@ -380,15 +611,15 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
Output files -* `Taxonomy/GTDB-Tk/[assembler]/[sample/group]/` - * `gtdbtk.[assembler]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html). - * `gtdbtk.[assembler]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. - * `gtdbtk.[assembler]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. - * `gtdbtk.[assembler]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. - * `gtdbtk.[assembler]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. - * `gtdbtk.[assembler]-[sample/group].*.log`: Log files. - * `gtdbtk.[assembler]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. -* `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`). +- `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/` + - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)). + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. +- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
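The classification step corresponds roughly to GTDB-Tk's `classify_wf`, sketched below with placeholder directories and extension; note that recent GTDB-Tk releases may require additional ANI-screening options.

```bash
# Hypothetical sketch: place bins in the GTDB taxonomy with GTDB-Tk
gtdbtk classify_wf \
  --genome_dir bins/ \
  --extension fa \
  --out_dir Taxonomy/GTDB-Tk/assembler-sample
```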
@@ -401,19 +632,34 @@ Whole genome annotation is the process of identifying features of interest in a
Output files -* `Prokka/[assembler]/[bin]/` - * `[bin].gff`: annotation in GFF3 format, containing both sequences and annotations - * `[bin].gbk`: annotation in GenBank format, containing both sequences and annotations - * `[bin].fna`: nucleotide FASTA file of the input contig sequences - * `[bin].faa`: protein FASTA file of the translated CDS sequences - * `[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) - * `[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank - * `[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file - * `[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file - * `[bin].err`: unacceptable annotations - the NCBI discrepancy report. - * `[bin].log`: contains all the output that Prokka produced during its run - * `[bin].txt`: statistics relating to the annotated features found - * `[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) +- `Annotation/Prokka/[assembler]/[bin]/` + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].gbk`: annotation in GenBank format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].fna`: nucleotide FASTA file of the input contig sequences + - `[assembler]-[binner]-[bin].faa`: protein FASTA file of the translated CDS sequences + - `[assembler]-[binner]-[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) + - `[assembler]-[binner]-[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank + - `[assembler]-[binner]-[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].err`: unacceptable annotations - the NCBI discrepancy report. + - `[assembler]-[binner]-[bin].log`: contains all the output that Prokka produced during its run + - `[assembler]-[binner]-[bin].txt`: statistics relating to the annotated features found + - `[assembler]-[binner]-[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) + +
+ +### MetaEuk + +In cases where eukaryotic genomes are recovered in binning, [MetaEuk](https://github.com/soedinglab/metaeuk) is also available to annotate eukaryotic genomes quickly with standards-compliant output files. + +
+Output files + +- `Annotation/MetaEuk/[assembler]/[bin]` + - `[assembler]-[binner]-[bin].fas`: fasta file of protein sequences identified by MetaEuk + - `[assembler]-[binner]-[bin].codon.fas`: fasta file of nucleotide sequences corresponding to the protein sequences fasta + - `[assembler]-[binner]-[bin].headersMap.tsv`: tab-separated table containing the information from each header in the fasta files + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format
@@ -422,19 +668,725 @@ Whole genome annotation is the process of identifying features of interest in a
Output files
-* `GenomeBinning/bin_summary.tsv`: Summary of bin sequencing depths together with BUSCO, QUAST and GTDB-Tk results, if at least one of the later was generated.
+- `GenomeBinning/bin_summary.tsv`: Summary of bin sequencing depths together with BUSCO, CheckM, QUAST and GTDB-Tk results, if at least one of the latter was generated. This will also include refined bins if `--refine_bins_dastool` binning refinement is performed. Note that in contrast to the other tools, for CheckM the bin name given in the column "Bin Id" does not contain the ".fa" extension.
+
+
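+
+As a concrete illustration of the CheckM naming caveat, a minimal pandas sketch is shown below; the `checkm_results.tsv` path and the `bin` column name are hypothetical, chosen only for the example:
+
+```python
+# Hypothetical sketch: re-attach the ".fa" extension to CheckM's "Bin Id"
+# values so they match the bin names used by the other tools before a join.
+import pandas as pd
+
+summary = pd.read_csv("GenomeBinning/bin_summary.tsv", sep="\t")
+checkm = pd.read_csv("checkm_results.tsv", sep="\t")  # hypothetical extra table
+
+checkm["bin"] = checkm["Bin Id"] + ".fa"  # "bin" column name is an assumption
+merged = summary.merge(checkm, on="bin", how="left")
+merged.to_csv("bin_summary_with_checkm.tsv", sep="\t", index=False)
+```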
+
+
+## Ancient DNA
+
+Optional, only running when the parameter `-profile ancient_dna` is specified.
+
+### `PyDamage`
+
+[Pydamage](https://github.com/maxibor/pydamage) is a tool to automate the process of ancient DNA damage identification and estimation from contigs. After modelling the ancient DNA damage using the C to T transitions, Pydamage uses a likelihood ratio test to discriminate between truly ancient contigs and modern contigs originating from sample contamination.
+
+
+Output files + +- `Ancient_DNA/pydamage/analyze` + - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage raw result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) +- `Ancient_DNA/pydamage/filter` + - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage filtered result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) + +
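+
+As a rough sketch of consuming these CSVs downstream (the `reference`, `qvalue` and `predicted_accuracy` column names come from the PyDamage output documentation linked above; the path and thresholds are placeholders):
+
+```python
+# Sketch: keep contigs that PyDamage flags as plausibly ancient.
+import pandas as pd
+
+results = pd.read_csv(
+    "Ancient_DNA/pydamage/analyze/megahit_sample1/pydamage_results/pydamage_results.csv"
+)
+ancient = results[(results["qvalue"] < 0.05) & (results["predicted_accuracy"] >= 0.5)]
+print(ancient["reference"].tolist())
+```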
+
+
+### `variant_calling`
+
+Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is optionally re-called with variant-calling software using the reads aligned back to the contigs when `--run_ancient_damagecorrection` is supplied.
+
+
+Output files + +- `variant_calling/consensus` + - `[assembler]_[sample/group].fa`: contigs sequence with re-called consensus from read-to-contig alignment +- `variant_calling/unfiltered` + - `[assembler]_[sample/group].vcf.gz`: raw variant calls of the reads aligned back to the contigs. +- `variant_calling/filtered` + - `[assembler]_[sample/group].filtered.vcf.gz`: quality filtered variant calls of the reads aligned back to the contigs. + +
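+
+Conceptually, the re-called consensus amounts to substituting the filtered variants back into the contig sequences. The following is a simplified, illustrative sketch only - substitutions only, ignoring indels and genotype information, with placeholder file names - not the pipeline's actual consensus caller:
+
+```python
+# Simplified consensus re-calling: apply SNVs from the filtered VCF to contigs.
+import gzip
+
+contigs = {}
+with open("megahit_sample1.contigs.fa") as fh:  # hypothetical contig FASTA
+    name = None
+    for line in fh:
+        if line.startswith(">"):
+            name = line[1:].split()[0]
+            contigs[name] = []
+        else:
+            contigs[name].extend(line.strip())
+
+with gzip.open("variant_calling/filtered/megahit_sample1.filtered.vcf.gz", "rt") as vcf:
+    for line in vcf:
+        if line.startswith("#"):
+            continue
+        chrom, pos, _id, ref, alt = line.split("\t")[:5]
+        if len(ref) == 1 and len(alt) == 1:  # substitutions only
+            contigs[chrom][int(pos) - 1] = alt  # VCF coordinates are 1-based
+
+with open("megahit_sample1.consensus.fa", "w") as out:
+    for name, seq in contigs.items():
+        out.write(f">{name}\n{''.join(seq)}\n")
+```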
+ +### MultiQC + +
+Output files + +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `multiqc_plots/`: directory containing static images from the report in various formats. + +
+
+
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+The general stats table at the top of the report will by default only display the most relevant pre- and post-processing statistics prior to assembly, i.e., FastQC, fastp/Adapter removal, and Bowtie2 PhiX and host removal mapping results.
+
+Note that the FastQC raw and processed columns are right next to each other for improved visual comparability; however, the processed columns represent the input reads _after_ fastp/Adapter Removal processing (the dedicated columns of which come directly after the two FastQC sets of columns). Hover your cursor over each column name to see which tool the column is derived from.
+
+Summary tool-specific plots and tables of the following tools are currently displayed (if activated):
+
+- FastQC (pre- and post-trimming)
+- fastp
+- Adapter Removal
+- bowtie2
+- BUSCO
+- QUAST
+- Kraken2 / Centrifuge
+- PROKKA
+
+### Pipeline information
+
+Output files
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Parameters used by the pipeline run: `params.json`.
+
+
+
+
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
+# nf-core/taxprofiler: Output
+
+## Introduction
+
+This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.
+
+The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
+
+## Pipeline overview
+
+The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+
+- [UNTAR](#untar) - Optionally saved decompressed input databases
+- [FastQC](#fastqc) - Raw read QC
+- [falco](#fastqc) - Alternative to FastQC for raw read QC
+- [fastp](#fastp) - Adapter trimming for Illumina data
+- [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data
+- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data
+- [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data
+- [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illumina data
+- [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data
+- [Bowtie2](#bowtie2) - Host removal for Illumina reads
+- [minimap2](#minimap2) - Host removal for Nanopore reads
+- [SAMtools stats](#samtools-stats) - Statistics from host removal
+- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only)
+- [Analysis Ready Reads](#analysis-ready-reads) - Optional results directory containing the final processed reads used as input for classification/profiling.
+- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations
+- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches
+- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species
+- [Centrifuge](#centrifuge) - Taxonomic classifier that uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index.
+- [Kaiju](#kaiju) - Taxonomic classifier that finds maximum (in-)exact matches on the protein-level.
+- [Diamond](#diamond) - Sequence aligner for protein and translated DNA searches.
+- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics
+- [MetaPhlAn](#metaphlan) - Genome-level marker gene based taxonomic classifier
+- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling.
+- [KMCP](#kmcp) - Taxonomic classifier that utilizes genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimized COBS index for fast alignment-free sequence searching.
+- [ganon](#ganon) - Taxonomic classifier and profiler that uses Interleaved Bloom Filters as indices based on k-mers/minimizers.
+- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler.
+- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + +![](images/taxprofiler_tube.png) + +### untar + +untar is used in nf-core/taxprofiler to decompress various input files ending in `.tar.gz`. This process is mainly used for decompressing input database archive files. + +
+Output files + +- `untar/` + - `database/` + - ``: directory containing contents of the decompressed archive + +
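+
+A minimal sketch of the same decompress-once idea in Python (the archive name and cache location are placeholders):
+
+```python
+# Unpack a database archive into a cache directory only if not already there.
+import tarfile
+from pathlib import Path
+
+archive = Path("k2_standard.tar.gz")  # hypothetical database archive
+cache = Path("db_cache") / archive.name.removesuffix(".tar.gz")
+
+if not cache.exists():
+    cache.mkdir(parents=True)
+    with tarfile.open(archive, "r:gz") as tar:
+        tar.extractall(cache)  # later runs re-use the decompressed copy
+```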
+
+
+This directory will only be present if `--save_untarred_databases` is supplied. The contained directories can be useful for moving the decompressed directories to a central 'cache' location allowing users to re-use the same databases. This avoids the unnecessary computational cost of decompressing the archives on every run.
+
+### FastQC or Falco
+
+Output files
+
+- `{fastqc,falco}/`
+  - `{raw,preprocessed}/`
+    - `*html`: FastQC or Falco report containing quality metrics in HTML format.
+    - `*.txt`: FastQC or Falco report containing quality metrics in TXT format.
+    - `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images (FastQC only).
+
+
+
+
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
+If preprocessing is turned on, nf-core/taxprofiler runs FastQC/Falco twice - once before and once after adapter removal/read merging, to allow evaluation of the performance of these preprocessing steps. Note in the General Stats table, the columns of these two instances of FastQC/Falco are placed next to each other to make it easier to evaluate. However, the columns of the actual preprocessing steps (i.e., fastp, AdapterRemoval, and Porechop) will be displayed _after_ the two FastQC/Falco columns, even if they were run 'between' the two FastQC/Falco jobs in the pipeline itself.
+
+:::info
+Falco produces identical output to FastQC but in the `falco/` directory.
+:::
+
+![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+
+![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+
+![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+
+:::note
+The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
+:::
+
+### fastp
+
+[fastp](https://github.com/OpenGene/fastp) is a FASTQ pre-processing tool for quality control, trimming of adapters, quality filtering and other features.
+
+It is used in nf-core/taxprofiler for adapter trimming of short-reads.
+
+Output files + +- `fastp/` + - `.fastp.fastq.gz`: File with the trimmed unmerged fastq reads. + - `.merged.fastq.gz`: File with the reads that were successfully merged. + - `.*{log,html,json}`: Log files in different formats. + +
+
+
+By default nf-core/taxprofiler will only provide the `.fastp.fastq.gz` file if fastp is selected. The file `.merged.fastq.gz` will be available in the output folder if you provide the argument `--shortread_qc_mergepairs` (optionally retaining un-merged pairs when used in combination with `--shortread_qc_includeunmerged`).
+
+You can change the default value for low complexity filtering by using the argument `--shortread_complexityfilter_fastp_threshold`.
+
+### AdapterRemoval
+
+[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report.
+
+Output files
+
+- `adapterremoval/`
+  - `.settings`: AdapterRemoval log file containing general adapter removal, read trimming and merging statistics
+  - `.collapsed.fastq.gz` - read-pairs that merged and did not undergo trimming (only when `--shortread_qc_mergepairs` supplied)
+  - `.collapsed.truncated.fastq.gz` - read-pairs that merged and underwent quality trimming (only when `--shortread_qc_mergepairs` supplied)
+  - `.pair1.truncated.fastq.gz` - read 1 of pairs that underwent quality trimming
+  - `.pair2.truncated.fastq.gz` - read 2 of pairs that underwent quality trimming (and could not merge if `--shortread_qc_mergepairs` supplied)
+  - `.singleton.truncated.fastq.gz` - orphaned read pairs where one of the pair was discarded
+  - `.discard.fastq.gz` - reads that were discarded due to length or quality filtering
+
+
+
+
+By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected.
+
+You will only find the `.fastq` files in the results directory if you provide `--save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc.
+:::
+
+### Porechop
+
+[Porechop](https://github.com/rrwick/Porechop) is a tool for finding and removing adapters from Oxford Nanopore reads. Adapters on the ends of reads are trimmed and if a read has an adapter in its middle, it is considered chimeric and is chopped into separate reads.
+
+Output files + +- `porechop/` + - `.log`: Log file containing trimming statistics + - `.fastq.gz`: Adapter-trimmed file + +
+
+
+The output logs are saved in the output folder and are part of the MultiQC report. You do not normally need to check these manually.
+
+You will only find the `.fastq` files in the results directory if you provide `--save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy.
+:::
+
+### BBDuk
+
+[BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool.
+
+It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
+
+Output files + +- `bbduk/` + - `.bbduk.log`: log file containing filtering statistics + - `.fastq.gz`: resulting FASTQ file without low-complexity reads + +
+
+
+By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide `--save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.
+:::
+
+### PRINSEQ++
+
+[PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is a C++ implementation of the [prinseq-lite.pl](https://prinseq.sourceforge.net/) program. It can be used to filter, reformat or trim genomic and metagenomic sequence data.
+
+It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats).
+
+Output files
+
+- `prinseqplusplus/`
+  - `.log`: log file containing the number of reads. Row IDs correspond to: `min_len, max_len, min_gc, max_gc, min_qual_score, min_qual_mean, ns_max_n, noiupac, derep, lc_entropy, lc_dust, trim_tail_left, trim_tail_right, trim_qual_left, trim_qual_right, trim_left, trim_right`
+  - `_good_out.fastq.gz`: resulting FASTQ file without low-complexity reads
+
+
+
+
+By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply `--save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.
+:::
+
+### Filtlong
+
+[Filtlong](https://github.com/rrwick/Filtlong) is a quality filtering tool for long reads. It can take a set of long reads and produce a smaller, better subset.
+
+Output files
+
+- `filtlong/`
+  - `_filtered.fastq.gz`: FASTQ file containing the reads retained after quality and length filtering
+  - `_filtered.log`: log file containing summary statistics
+
+
+
+
+You will only find the `.fastq` files in the results directory if you provide `--save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::warning
+We do _not_ recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy.
+:::
+
+### Bowtie2
+
+[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes.
+
+It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling.
+
+Output files
+
+- `bowtie2/`
+  - `build/`
+    - `*.bt2`: Bowtie2 indices of reference genome, only if `--save_hostremoval_index` supplied.
+  - `align/`
+    - `.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads
+    - `.bowtie2.log`: log file about the mapped reads
+    - `.unmapped.fastq.gz`: the off-target reads from the mapping that are used in downstream steps.
+
+
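+
+For example, the unmapped (non-host) reads can be pulled back out of the saved BAM with pysam - a sketch, not part of the pipeline, with placeholder file names:
+
+```python
+# Extract unmapped reads from the host-removal BAM and write them as FASTQ.
+import pysam
+
+with pysam.AlignmentFile("bowtie2/align/sample1.bam", "rb") as bam, open(
+    "sample1.unmapped.fastq", "w"
+) as out:
+    for read in bam.fetch(until_eof=True):  # no index needed with until_eof
+        if read.is_unmapped:
+            qual = pysam.qualities_to_qualitystring(read.query_qualities)
+            out.write(f"@{read.query_name}\n{read.query_sequence}\n+\n{qual}\n")
+```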
+
+
+By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::info
+Unmapped reads in FASTQ are only found in this directory for short reads; for long reads, see [`samtools/fastq/`](#samtools-fastq).
+:::
+
+:::info
+The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc.
+:::
+
+:::info
+While there is a dedicated section in the MultiQC HTML for Bowtie2, these values are not displayed by default in the General Stats table. Rather, alignment statistics against the host genome are reported via the samtools stats module in the MultiQC report for direct comparison with minimap2 (see below).
+:::
+
+### minimap2
+
+[minimap2](https://github.com/lh3/minimap2) is an alignment tool suited to mapping long reads to reference sequences.
+
+It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or other possible contaminant reads from long-read `.fastq` files prior to taxonomic classification/profiling.
+
+Output files + +- `minimap2/` + - `build/` + - `*.mmi2`: minimap2 indices of reference genome, only if `--save_hostremoval_index` supplied. + - `align/` + - `.bam`: Alignment file in BAM format containing both mapped and unmapped reads. + +
+
+
+By default, nf-core/taxprofiler will only provide the `.bam` file containing mapped and unmapped reads if saving of host removal for long reads is turned on via `--save_hostremoval_bam`.
+
+:::info
+minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics against the host genome are reported via the samtools stats module in the MultiQC report.
+:::
+
+:::info
+Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/fastq`](#samtools-fastq).
+:::
+
+### SAMtools fastq
+
+[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format.
+
+Output files
+
+- `samtools/fastq/`
+  - `_interleaved.fq.gz`: Unmapped reads only in FASTQ gzip format
+
+
+
+
+This directory will be present and contain the unmapped reads in `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+:::info
+For short-read unmapped reads, see [bowtie2](#bowtie2).
+:::
+
+### Analysis Ready Reads
+
+:::info
+This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`.
+:::
+
+Output files
+
+- `analysis_ready_reads/`
+  - `_{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling.
+
+
+
+
+The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory.
+
+:::warning
+If you turn off all preprocessing steps, then no results will be present in this directory. This happens independently for short- and long-reads, e.g., if you skip all long-read preprocessing, this directory will only contain the FASTQ files for short reads.
+:::
+
+### SAMtools stats
+
+[SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs them in a text format.
+
+Output files + +- `samtools/stats/` + - `.stats`: File containing samtools stats output. + +
+ +In most cases you do not need to check this file, as it is rendered in the MultiQC run report. + +### Run Merging + +nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet. + +This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps. + +
+Output files + +- `run_merging/` + - `*.fastq.gz`: Concatenated FASTQ files on a per-sample basis + +
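+
+Conceptually the merge is a plain concatenation - gzip streams remain valid when appended back to back - as in this sketch with placeholder file names:
+
+```python
+# Concatenate per-run FASTQ files into a single per-sample file.
+import shutil
+
+runs = ["sample1_run1.fastq.gz", "sample1_run2.fastq.gz"]  # hypothetical inputs
+with open("sample1.merged.fastq.gz", "wb") as merged:
+    for run in runs:
+        with open(run, "rb") as part:
+            shutil.copyfileobj(part, merged)
+```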
+
+
+Note that you will only find samples that went through the run merging step in this directory. Samples with a single run or library will not go through this step of the pipeline and thus will not be present in this directory.
+
+This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`.
+
+### Bracken
+
+[Bracken](https://ccb.jhu.edu/software/bracken/) (Bayesian Reestimation of Abundance with Kraken) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. Bracken uses the taxonomy labels assigned by Kraken, a highly accurate metagenomics classification algorithm, to estimate the number of reads originating from each species present in a sample.
+
+:::info
+The first step of using Bracken requires running Kraken2, therefore the initial results before abundance estimation will be found in `/kraken2/`.
+:::
+
+Output files + +- `bracken/` + - `/` + - `bracken__combined_reports.txt`: combined bracken results as output from Bracken's `combine_bracken_outputs.py` script + - `/` + - `_.tsv`: TSV file containing per-sample summary of Bracken results with abundance information + - `_.report_bracken_species.txt`: Kraken2 style report with Bracken abundance information + +
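+
+A hedged sketch of working with the per-sample TSV listed above (the `name`, `new_est_reads` and `fraction_total_reads` columns follow standard Bracken output; the path is a placeholder):
+
+```python
+# Rank taxa by Bracken's re-estimated read counts.
+import pandas as pd
+
+bracken = pd.read_csv("bracken/db1/sample1_db1.tsv", sep="\t")
+top10 = bracken.sort_values("new_est_reads", ascending=False).head(10)
+print(top10[["name", "new_est_reads", "fraction_total_reads"]])
+```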
+
+
+The main taxonomic profiling file from Bracken is the `*.tsv` file. This provides the basic results from Kraken2 but with the corrected abundance information. Note that the raw Kraken2 version of the upstream step of Bracken can be found in the `kraken2/` directory with the suffix of `_.bracken.report.txt` (with an 8-column variant when `--save_minimizers` is specified).
+
+### Kraken2
+
+[Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.
+
+Output files
+
+- `kraken2/`
+  - `_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`)
+    - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name if you supply `--bracken_save_intermediatekraken2` to the run. For example: `kraken2--bracken.tsv`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`).
+  - `/`
+    - `_.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
+    - `_.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
+    - `_.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will have 8 columns rather than 6 if `--save_minimizers` is specified. This report will **only** be included if you supply `--bracken_save_intermediatekraken2` to the run.
+    - `_.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample
+
+
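+
+The `*.report.txt` files listed above follow Kraken2's standard report layout (percentage, clade reads, directly assigned reads, rank code, taxonomy ID, indented name), so a small parser suffices - a sketch with a placeholder path that assumes the default 6-column format:
+
+```python
+def read_kraken2_report(path):
+    """Parse a default 6-column Kraken2 report (minimizer data adds columns)."""
+    rows = []
+    with open(path) as fh:
+        for line in fh:
+            pct, clade_reads, direct_reads, rank, taxid, name = line.rstrip("\n").split("\t")
+            rows.append({
+                "percent": float(pct),
+                "clade_reads": int(clade_reads),
+                "direct_assigned_reads": int(direct_reads),
+                "rank": rank,
+                "taxid": int(taxid),
+                "name": name.strip(),  # names are indented by taxonomic depth
+            })
+    return rows
+
+species = [r for r in read_kraken2_report("kraken2/db1/sample1_db1.report.txt")
+           if r["rank"] == "S"]
+```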
+
+
+The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former gives you the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as summary columns across all samples, `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step.
+
+You will only receive the `.fastq` and `*classifiedreads.txt` file if you supply `--kraken2_save_reads` and/or `--kraken2_save_readclassifications` parameters to the pipeline.
+
+When running Bracken, you will only get the 'intermediate' Kraken2 report files in this directory if you supply `--bracken_save_intermediatekraken2` to the run.
+
+### KrakenUniq
+
+[KrakenUniq](https://github.com/fbreitwieser/krakenuniq) (formerly KrakenHLL) is an extension to the fast k-mer-based classification performed by [Kraken](https://github.com/DerrickWood/kraken) with an efficient algorithm for additionally assessing the coverage of unique k-mers found in each species in a dataset.
+
+Output files
+
+- `krakenuniq/`
+  - `/`
+    - `_[.merged].classified.fast{a,q}.gz`: Optional FASTA file containing all reads that had a hit against a reference in the database for a given sample. Paired-end input reads are merged in this output.
+    - `_[.merged].unclassified.fast{a,q}.gz`: Optional FASTA file containing all reads that did not have a hit in the database for a given sample. Paired-end input reads are merged in this output.
+    - `_.krakenuniq.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits, with an additional column for k-mer coverage, which allows more accurate distinguishing between false-positive/true-positive hits.
+    - `_.krakenuniq.classified.txt`: An optional list of read IDs and the hits each read had against each database for a given sample.
+
+
+
+
+The main taxonomic classification file from KrakenUniq is the `*.krakenuniq.report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits.
+
+You will only receive the `.fasta.gz` and `*.krakenuniq.classified.txt` file if you supply `--krakenuniq_save_reads` and/or `--krakenuniq_save_readclassification` parameters to the pipeline.
+
+:::info
+The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use!
+:::
+
+### Centrifuge
+
+[Centrifuge](https://github.com/DaehwanKimLab/centrifuge) is a taxonomic sequence classifier that uses a Burrows-Wheeler transform and Ferragina-Manzini index for storing and mapping sequences.
+
+Output files + +- `centrifuge/` + - `/` + - `.centrifuge.mapped.fastq.gz`: `FASTQ` files containing all mapped reads + - `.centrifuge.report.txt`: A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads + - `.centrifuge.results.txt`: A file that summarises the classification assignment for a read, i.e read ID, sequence ID, score for the classification, score for the next best classification, number of classifications for this read + - `.centrifuge.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample + - `.centrifuge.unmapped.fastq.gz`: FASTQ file containing all unmapped reads + +
+ +The main taxonomic classification files from Centrifuge are the `_combined_reports.txt`, `*report.txt`, `*results.txt` and the `*centrifuge.txt`. The latter is used by the taxpasta step. You will receive the `.fastq` files if you supply `--centrifuge_save_reads`. + +### Kaiju + +[Kaiju](https://github.com/bioinformatics-centre/kaiju) is a taxonomic classifier that finds maximum exact matches on the protein-level using the Burrows-Wheeler transform. + +
+Output files
+
+- `kaiju/`
+  - `kaiju__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by kaiju2table)
+  - `/`
+    - `_.kaiju.tsv`: Raw output from Kaiju with taxonomic rank, read ID and taxonomic ID
+    - `_.kaijutable.txt`: Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by `kaiju2table`)
+
+
+
+
+The most useful summary file is the `_combined_reports.txt` file, which summarises hits across all reads and samples. Separate per-sample summaries can be seen in `/*.txt`. However, if you wish to look at more precise information on a per-read basis, see the `*tsv` file. The default taxonomic rank is `species`. You can provide a different one by updating the argument `--kaiju_taxon_rank`.
+
+### DIAMOND
+
+[DIAMOND](https://github.com/bbuchfink/diamond) is a sequence aligner for translated DNA searches or protein sequences against a protein reference database such as NR. It is a replacement for the NCBI BLAST software tools. It has many key features and is used as a taxonomic classifier in nf-core/taxprofiler.
+
+Output files + +- `diamond/` + - `/` + - `.log`: A log file containing stdout information + - `*.{blast,xml,txt,daa,sam,tsv,paf}`: A file containing alignment information in various formats, or taxonomic information in a text-based format. Exact output depends on user choice. + +
+
+
+By default you will receive a TSV output. Alternatively, you will receive a `*.sam` file if you provide the parameter `--diamond_save_reads`, but in this case no taxonomic classification will be available (!), only the aligned reads in SAM format.
+
+:::info
+DIAMOND has many output formats, so depending on your [choice](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options) with `--diamond_output_format` you will receive the taxonomic information in a different format.
+:::
+
+### MALT
+
+[MALT](https://software-ab.cs.uni-tuebingen.de/download/malt) is a fast replacement for BLASTX, BLASTP and BLASTN, and provides both local and semi-global alignment capabilities.
+
+Output files + +- `malt/` + - `/` + - `.blastn.sam`: sparse SAM file containing alignments of each hit + - `.megan`: summary file that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer. Generated by MEGAN6 companion tool `rma2info` + - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer + - `.txt.gz`: text file containing taxonomic IDs and read counts against each taxon. Generated by MEGAN6 companion tool `rma2info` + +
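+
+Since the `rma2info`-derived `.txt.gz` files hold plain taxonomic-ID/read-count pairs, they are easy to load directly - a sketch with a placeholder path:
+
+```python
+# Load taxon read counts from a MEGAN6 rma2info text export.
+import gzip
+
+counts = {}
+with gzip.open("malt/db1/sample1.txt.gz", "rt") as fh:
+    for line in fh:
+        if line.startswith("#"):  # skip any header lines
+            continue
+        taxon_id, count = line.rstrip("\n").split("\t")[:2]
+        counts[taxon_id] = int(count)
+```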
+
+
+The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools. We provide the `rma2info` text files for improved compatibility with spreadsheet programs and other programmatic data manipulation tools; however, these have only limited information compared to the 'binary' RMA6 file format (the `.txt` file only contains taxonomic ID and count, whereas RMA6 has taxonomic lineage information).
+
+You will only receive the `.sam` and `.megan` files if you supply `--malt_save_reads` and/or `--malt_generate_megansummary` parameters to the pipeline.
+
+### MetaPhlAn
+
+[MetaPhlAn](https://github.com/biobakery/metaphlan) is a computational tool for profiling the composition of microbial communities (Bacteria, Archaea and Eukaryotes) from metagenomic shotgun sequencing data (i.e. not 16S) with species-level resolution via marker genes.
+
+Output files + +- `metaphlan/` + - `metaphlan__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `metaphlan_merge_tables`) + - `/` + - `.biom`: taxonomic profile in BIOM format + - `.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters) + - `_profile.txt`: MetaPhlAn taxonomic profile including abundance estimates + +
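+
+A hedged sketch of pulling species-level relative abundances out of the `_profile.txt` file; the `#` comment convention and the column order (clade name, taxonomic ID lineage, relative abundance) are assumptions based on recent MetaPhlAn versions:
+
+```python
+# Collect species-level relative abundances from a MetaPhlAn profile.
+species = {}
+with open("metaphlan/db1/sample1_profile.txt") as fh:
+    for line in fh:
+        if line.startswith("#"):  # header/comment lines
+            continue
+        fields = line.rstrip("\n").split("\t")
+        leaf = fields[0].split("|")[-1]
+        if leaf.startswith("s__"):  # keep species-level rows only
+            species[leaf] = float(fields[2])
+```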
+
+
+The output contains a file named `*_combined_reports.txt`, which provides an overview of the classification results for all samples. The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn; however, it does not include raw counts by default. Additionally, it contains the intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line.
+
+### mOTUs
+
+[mOTUs](https://github.com/motu-tool/mOTUs) is a taxonomic profiler that maps reads to a unique marker-specific database and estimates the relative abundance of known and unknown species.
+
+Output files + +- `motus/` + - `/` + - `.log`: A log file that contains summary statistics + - `.out`: A classification file that summarises taxonomic identifiers, by default at the rank of mOTUs (i.e., species level), and their relative abundances in the profiled sample. + - `motus__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `motus_merge`)
+
+
+
+Normally `*_combined_reports.txt` is the most useful file for downstream analyses, but the per-sample `.out` files can provide additional, more specific information. By default, nf-core/taxprofiler provides a column with the NCBI taxonomic ID, as this is used in the taxpasta step. You can disable this column by activating the argument `--motus_remove_ncbi_ids`.
+
+You will receive the relative abundance instead of read counts if you provide the argument `--motus_use_relative_abundance`.
+
+### KMCP
+
+[KMCP](https://github.com/shenwei356/kmcp) utilises genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimised COBS index for fast alignment-free sequence searching. KMCP combines k-mer similarity and genome coverage information to reduce the false positive rate of k-mer-based taxonomic classification and profiling methods.
+
+Output files + +- `kmcp/` + + - `/` + - `.gz`: output of `kmcp_search` containing search sequences against a database in tab-delimited format with 15 columns. + - `_kmcp.profile`: output of `kmcp_profile` containing the taxonomic profile from search results. + +
+ +You will receive the `.gz` file if you supply `--kmcp_save_search`. Please note that there is no taxonomic label assignment in this output file. + +The main taxonomic classification file from KMCP is the `*kmcp.profile` which is also used by the taxpasta step. + +### ganon + +[ganon](https://pirovc.github.io/ganon/) is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported. + +
+Output files + +- `ganon/` + + - `/` + + - `_report.tre`: output of `ganon report` containing taxonomic classifications with possible formatting and/or filtering depending on options specified. + - ``.tre: output of `ganon classify` containing raw taxonomic classifications and abundance estimations with no additional formatting or filtering. + - ``.rep: 'raw' report of counts against each taxon. + - ``.all: per-read summary of all hits of each reads. + - ``.lca: per-read summary of the best single hit after LCA for each read. + - ``.unc: list of read IDs with no hits. + - ``.log: the stdout console messages printed by `ganon classify`, containing some classification summary information + + - `ganon__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `ganon table`) + +
+ +Generally you will want to refer to the `combined_reports.txt` or `_report.tre` file. For further descriptions of the contents of each file, see the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/). + +You will only receive the `.all`, `.lca`, and `.unc` files if you supply the `--ganon_save_readclassifications` parameter to the pipeline. + +### Krona + +[Krona](https://github.com/marbl/Krona) allows the exploration of (metagenomic) hierarchical data with interactive zooming, multi-layered pie charts. + +Krona charts will be generated by the pipeline for supported tools (Kraken2, Centrifuge, Kaiju, and MALT) + +
+Output files + +- `krona/` + - `_.html`: per-tool/per-database interactive HTML file containing hierarchical piecharts + +
+ +The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool. + +### TAXPASTA + +[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and optionally merges two or more taxonomic profiles across samples into one single table. It supports multiple different classifiers simplifying comparison of taxonomic classification results between tools and databases. + +
+Output files
+
+- `taxpasta/`
+
+  - `_*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The default format is `tsv`.
+    - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample.
+    - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) is executed.
+    - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`).
+
+
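+
+Because the standardised table described above is simply taxonomy IDs plus one read-count column per sample, downstream manipulation is straightforward - a sketch with a placeholder file name:
+
+```python
+# Convert a merged taxpasta table to per-sample relative abundances.
+import pandas as pd
+
+table = pd.read_csv("taxpasta/kraken2_db1.tsv", sep="\t", index_col=0)
+relative = table.div(table.sum(axis=0), axis=1)  # fraction of reads per sample
+```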
+
+
+By providing the path to a directory containing taxdump files to `--taxpasta_taxonomy_dir`, the taxon name, the taxon rank, the taxon's entire lineage including taxon names and/or the taxon's entire lineage including taxon identifiers can also be added to the output in addition to just the taxon ID. Addition of this extra information can be turned on using the parameters `--taxpasta_add_name`, `--taxpasta_add_rank`, `--taxpasta_add_lineage` and `--taxpasta_add_idlineage` respectively.
+
+These files will likely be the most useful files for the comparison of differences in classification between different tools or for building consensuses, with the caveat that they have slightly less information than the actual output from each tool (which may have non-standard information, e.g. taxonomic rank, percentage of hits, abundance estimations).
+
+The following report files are used for the taxpasta step:
+
+- Bracken: `_.tsv` Taxpasta uses the `new_est_reads` column for the standardised profile.
+- Centrifuge: `.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
+- Diamond: `` Taxpasta summarises the number of reads per NCBI taxonomy ID for the standardised profile.
+- Kaiju: `_.kaijutable.txt` Taxpasta uses the `reads` column from the kaiju2table output for the standardised profile.
+- KrakenUniq: `_.report.txt` Taxpasta uses the `reads` column for the standardised profile.
+- Kraken2: `_.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile.
+- MALT: `.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile.
+- MetaPhlAn: `_profile.txt` Taxpasta uses the `relative_abundance` column multiplied by a fixed number to yield an integer for the standardised profile.
+- mOTUs: `.out` Taxpasta uses the `read_count` column for the standardised profile.
+
+:::warning
+Please be aware that the outputs of each tool's standardised profile _may not_ be directly comparable between tools. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above for which information is used for each tool.
+:::
+
### MultiQC
Output files -* `multiqc/` - * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - * `multiqc_plots/`: directory containing static images from the report in various formats. +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `multiqc_plots/`: directory containing static images from the report in various formats.
@@ -442,15 +1394,44 @@ Whole genome annotation is the process of identifying features of interest in a

Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

+All tools in taxprofiler supported by MultiQC will have a dedicated section showing summary statistics of each tool based on information stored in log files.
+
+You can expect in the MultiQC reports either sections and/or general stats columns for the following tools:
+
+- fastqc
+- adapterRemoval
+- fastp
+- bbduk
+- prinseqplusplus
+- porechop
+- filtlong
+- bowtie2
+- minimap2
+- samtools (stats)
+- kraken
+- bracken
+- centrifuge
+- kaiju
+- diamond
+- malt
+- motus
+
+:::info
+The 'General Stats' table by default will only show statistics referring to pre-processing steps, and will not display possible values from each classifier/profiler, unless turned on by the user within the 'Configure Columns' menu or via a custom MultiQC config file (`--multiqc_config`).
+:::
+
### Pipeline information
Output files
-* `pipeline_info/`
-  * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
-  * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`.
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
diff --git a/bu_isciii/assets/reports/md/plasmidid.md b/bu_isciii/assets/reports/md/plasmidid.md
new file mode 100755
index 00000000..90e371d7
--- /dev/null
+++ b/bu_isciii/assets/reports/md/plasmidid.md
@@ -0,0 +1,169 @@
+# PlasmidID
+
+This document describes the output produced by the pipeline.
+
+The directories listed below will be created in the analysis directory after the pipeline has finished. All paths are relative to the top-level results directory.
+
+- [PlasmidID](#plasmidid)
+  - [Preprocessing](#preprocessing)
+  - [Assembly](#assembly)
+  - [PlasmidID](#plasmidid-1)
+    - [Clustering (mash) results](#clustering-mash-results)
+    - [Prokka annotation](#prokka-annotation)
+    - [Mapping against found plasmids](#mapping-against-found-plasmids)
+    - [PlasmidID data for circos](#plasmidid-data-for-circos)
+    - [Circos images](#circos-images)
+    - [Reconstructed plasmid sequences](#reconstructed-plasmid-sequences)
+    - [Summary report](#summary-report)
+
+## Preprocessing
+
+Preprocessing is performed using the assembly template. Check that doc for reference.
+
+## Assembly
+
+Assembly steps are done using the assembly template. Check that doc for reference.
+
+## PlasmidID
+
+[PlasmidID](https://github.com/BU-ISCIII/plasmidID) v1.6.5 is a mapping-based, assembly-assisted plasmid identification tool that analyzes and gives a graphic solution for plasmid identification.
+
+PlasmidID is a computational pipeline that maps Illumina reads over plasmid database sequences. The k-mer filtered, most covered sequences are clustered by identity to avoid redundancy and the longest are used as scaffolds for plasmid reconstruction. Reads are assembled and annotated by automatic and specific annotation. All information generated from mapping, assembly, annotation and local alignment analyses is gathered and accurately represented in a circular image, which allows the user to determine the plasmid composition of any bacterial sample.
+
+### Clustering (mash) results
+
+Mash is employed to pinpoint the plasmids present in the sample using a specific database. Subsequently, Mash calculates the genetic distances between these identified plasmids. These distances are then used to group similar plasmids into clusters. From each cluster, the longest plasmid is selected to represent the group in subsequent analyses.
+
+
+Output files description
+
+`NO_GROUP/kmer`
+  - database.filtered_XX: Contains identifiers for plasmids identified by Mash that exhibit an identity value greater than a specified threshold (e.g., 0.95).
+  - database.filtered_XX_term.XX.clusters.tab: A tabulated file listing the clusters of plasmids grouped based on their genetic similarities.
+  - database.filtered_XX_term.XX.representative.fasta: The FASTA formatted sequence file of the longest plasmids selected as representatives for each cluster.
+  - database.filtered_XX_term.XX.representative.fasta.*.bt2: Bowtie2 index files for the representative FASTA sequences.
+  - database.filtered_XX_term.fasta: The FASTA formatted sequences from the plasmids in database.filtered_XX.
+  - database.filtered_XX_term.mash.distances.tab: Tabulated data of Mash-calculated distances between the filtered plasmid sequences, used for clustering.
+  - database.msh: The Mash sketch file of the database, which is a compact binary representation of the set of plasmids used for quick distance estimation.
+  - database.screen.tab: Output file listing the results of the Mash screen operation, which compares the sample against the database to find matching plasmids.
+
+
+### Prokka annotation
+
+Provided assemblies are automatically annotated using Prokka.
+
+
+Output files description + +`NO_GROUP/database` +Prokka output files can be found [here](https://github.com/tseemann/prokka?tab=readme-ov-file#output-files) + - SAMPLE_NAME.err + - SAMPLE_NAME.fna + - SAMPLE_NAME.gff + - SAMPLE_NAME.gff.renamed + - SAMPLE_NAME.gff.bed: gff in bed format + - SAMPLE_NAME.gff.reverse.bed: only reverse genes + - SAMPLE_NAME.gff.forward.bed: only forward genes + - SAMPLE_NAME.sqn + - SAMPLE_NAME.txt + - SAMPLE_NAME.faa + - SAMPLE_NAME.fsa + - SAMPLE_NAME.tbl + - SAMPLE_NAME.ffn + - SAMPLE_NAME.gbk + - SAMPLE_NAME.log + - SAMPLE_NAME.tsv +
+
+### Mapping against found plasmids
+
+Once we have selected the representative plasmids that may be present in the sample, Bowtie2 is employed to map the raw sequencing reads against these plasmid sequences in FASTA format. Plasmids that achieve more than 80% coverage are retained for further analysis. Coverage metrics are calculated and recorded in temporary output files.
+
+
+Output files description
+
+`NO_GROUP/mapping`
+  - SAMPLE_NAME.coverage: Contains initial coverage bedgraph data for each plasmid
+  - SAMPLE_NAME.coverage_adapted: Adjusted coverage mean for each plasmid
+  - SAMPLE_NAME.coverage_adapted_clustered: Adjusted coverage mean, filtered to plasmids with more than 80% coverage.
+  - SAMPLE_NAME.coverage_adapted_clustered_ac: Identifiers of the filtered plasmids
+  - SAMPLE_NAME.coverage_adapted_clustered_percentage: Coverage data for each plasmid as a percentage (1 - value)
+  - SAMPLE_NAME.coverage_adapted_filtered_80: Lists plasmids with coverage exceeding 80%, selected for subsequent analysis.
+  - SAMPLE_NAME.coverage_adapted_filtered_80_term.fasta: FASTA formatted file containing sequences of plasmids with more than 80% coverage.
+  - SAMPLE_NAME.coverage_adapted_filtered_80_term.fasta.blast.tmp.*: Temporary BLAST files for sequences that have passed the 80% coverage threshold, used for further comparative analysis.
+  - SAMPLE_NAME.sorted.bam: BAM file of aligned reads sorted by coordinates.
+  - SAMPLE_NAME.sorted.bam.bai: Index file for the sorted BAM file, facilitating faster data retrieval.
+
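+
+As an illustration of the 80% rule (not PlasmidID's actual implementation), the breadth of coverage can be computed from a bedgraph, assuming it reports every interval of each plasmid including zero-depth ones:
+
+```python
+# Keep plasmids whose fraction of covered positions is at least 80%.
+from collections import defaultdict
+
+covered = defaultdict(int)
+total = defaultdict(int)
+with open("NO_GROUP/mapping/SAMPLE_NAME.coverage") as fh:
+    for line in fh:
+        plasmid, start, end, depth = line.split()[:4]
+        span = int(end) - int(start)
+        total[plasmid] += span
+        if float(depth) > 0:
+            covered[plasmid] += span
+
+kept = [p for p in total if covered[p] / total[p] >= 0.8]
+```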
+
+### PlasmidID data for circos
+
+BLAST is employed to annotate the assembly by comparing it against the selected annotation databases. Additionally, each contig within the assembly is aligned with the identified plasmids. The necessary files for Circos visualization are created in the data folder.
+
+Output files description
+
+`NO_GROUP/SAMPLE_NAME/data`
+  - pID_highlights.conf: Gene highlights for circos.
+  - pID_text_annotation.coordinates: Text annotation coordinates for circos.
+  - SAMPLE_NAME.bedgraph: Bedgraph coverage for each plasmid.
+  - SAMPLE_NAME.bedgraph_term: Filtered bedgraph coverage for each plasmid.
+  - SAMPLE_NAME.DB.bed: BLAST results in BED format.
+  - SAMPLE_NAME.DB.blast: BLAST results against the assembly, for each annotation database.
+  - SAMPLE_NAME.DB.coordinates: BLAST results with the coordinates needed for the circos image.
+  - SAMPLE_NAME.fna.blast.tmp.*: Temporary BLAST database files.
+  - SAMPLE_NAME.gff.forward.coordinates: GFF coordinates of forward genes for the annotation track.
+  - SAMPLE_NAME.gff.reverse.coordinates: GFF coordinates of reverse genes for the annotation track.
+  - SAMPLE_NAME.karyotype_individual.txt: Karyotype template for each individual plasmid image.
+  - SAMPLE_NAME.karyotype_summary.txt: Karyotype circos file for the summary image.
+  - SAMPLE_NAME.plasmids.bed: BLAST results of plasmids in BED format.
+  - SAMPLE_NAME.plasmids.blast: BLAST results of contigs against identified plasmids.
+  - SAMPLE_NAME.plasmids.blast.links: BLAST results of links for contigs that match different plasmids.
+  - SAMPLE_NAME.plasmids.complete: Complete track information for circos.
+  - SAMPLE_NAME.plasmids.links: Links for contigs that match different plasmids.
+ +### Circos images + +Circos is used for creating one image for each identified plasmid and a summary image with all the plasmids identified in one figure. A manual for image interpretation can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track) and a manual about how to select the correct plasmid can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/How-to-chose-the-right-plasmids). + +![plasmidid_image](./images/SEN30_000195995_NC_013365.1.png) +![summary_image](./images/KPN30_000240185_summary.png) + +
+Output files description
+
+`NO_GROUP/images`
+
+- SAMPLE_NAME_PLASMID_individual.circos.conf: Circos conf file used for generating the individual image.
+- SAMPLE_NAME_PLASMID.png: Circos image for each individual plasmid.
+- SAMPLE_NAME_summary.circos.conf: Circos conf file used for generating the summary image.
+- SAMPLE_NAME_summary.png: Summary image.
+
+
+### Reconstructed plasmid sequences
+
+A multifasta file is created for each plasmid, including all the contig sequences that have matched the identified plasmid.
+
+Output files description + +`NO_GROUP/fasta_files` + +- PLASMID_term.fasta: multifasta file for each plasmid identified in the sample. + +
+ +### Summary report + +A summary report consolidating all samples in the analysis is created. + +
+Output files description
+
+`NO_GROUP`
+
+- `NO_GROUP_final_results.html`: Report with the same information as the `.tab` file below; it can be viewed using Chrome.
+- `NO_GROUP_final_results.tab`: Plasmid information for each sample.
+
diff --git a/bu_isciii/assets/reports/md/rnaseq_deg.md b/bu_isciii/assets/reports/md/rnaseq_deg.md
new file mode 100755
index 00000000..c3157323
--- /dev/null
+++ b/bu_isciii/assets/reports/md/rnaseq_deg.md
@@ -0,0 +1,781 @@
+
+# mRNAseq (DEG): Output
+
+## Introduction
+
+This document describes the output produced by the pipeline.
+
+The directories listed below will be created in the results directory (`01-${DATE}_rnaseq`) after the pipeline has finished. All paths are relative to the top-level results directory.
+
+## Pipeline overview
+
+The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+
+- [Preprocessing](#preprocessing)
+  - [cat](#cat) - Merge re-sequenced FastQ files
+  - [FastQC](#fastqc) - Raw read QC
+  - [UMI-tools extract](#umi-tools-extract) - UMI barcode extraction
+  - [TrimGalore](#trimgalore) - Adapter and quality trimming
+  - [BBSplit](#bbsplit) - Removal of genome contaminants
+  - [SortMeRNA](#sortmerna) - Removal of ribosomal RNA
+- [Alignment and quantification](#alignment-and-quantification)
+  - [STAR and Salmon](#star-and-salmon) - Fast spliced aware genome alignment and transcriptome quantification
+  - [STAR via RSEM](#star-via-rsem) - Alignment and quantification of expression levels
+  - [HISAT2](#hisat2) - Memory efficient splice aware alignment to a reference
+- [Alignment post-processing](#alignment-post-processing)
+  - [SAMtools](#samtools) - Sort and index alignments
+  - [UMI-tools dedup](#umi-tools-dedup) - UMI-based deduplication
+  - [picard MarkDuplicates](#picard-markduplicates) - Duplicate read marking
+- [Other steps](#other-steps)
+  - [StringTie](#stringtie) - Transcript assembly and quantification
+  - [BEDTools and bedGraphToBigWig](#bedtools-and-bedgraphtobigwig) - Create bigWig coverage files
+- [Quality control](#quality-control)
+  - [RSeQC](#rseqc) - Various RNA-seq QC metrics
+  - [Qualimap](#qualimap) - Various RNA-seq QC metrics
+  - [dupRadar](#dupradar) - Assessment of technical / biological read duplication
+  - [Preseq](#preseq) - Estimation of library complexity
+  - [featureCounts](#featurecounts) - Read counting relative to gene biotype
+  - [DESeq2](#deseq2) - PCA plot and sample pairwise distance heatmap and dendrogram
+  - [MultiQC](#multiqc) - Present QC for raw reads, alignment, read counting and sample similarity
+- [Pseudo-alignment and quantification](#pseudo-alignment-and-quantification)
+  - [Salmon](#salmon) - Wicked fast gene and isoform quantification relative to the transcriptome
+- [Workflow reporting and genomes](#workflow-reporting-and-genomes)
+  - [Reference genome files](#reference-genome-files) - Saving reference genome indices/files
+  - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+
+## Preprocessing
+
+### cat
+
+Output files + +- `fastq/` + - `*.merged.fastq.gz`: If `--save_merged_fastq` is specified, concatenated FastQ files will be placed in this directory. + +
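+To illustrate when this step applies (see the explanation below), a hypothetical samplesheet in which the same sample was sequenced twice; the two runs are simply concatenated:
+
+```bash
+cat <<'EOF' > samplesheet.csv
+sample,fastq_1,fastq_2,strandedness
+CONTROL_REP1,run1_R1.fastq.gz,run1_R2.fastq.gz,unstranded
+CONTROL_REP1,run2_R1.fastq.gz,run2_R2.fastq.gz,unstranded
+EOF
+
+# Internally, the merge is equivalent to (per read orientation):
+cat run1_R1.fastq.gz run2_R1.fastq.gz > CONTROL_REP1.merged.fastq.gz
+```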
+ +If multiple libraries/runs have been provided for the same sample in the input samplesheet (e.g. to increase sequencing depth) then these will be merged at the very beginning of the pipeline in order to have consistent sample naming throughout the pipeline. Please refer to the [usage documentation](https://nf-co.re/rnaseq/usage#samplesheet-input) to see how to specify these samples in the input samplesheet. + +### FastQC + +
+Output files + +- `fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +> **NB:** The FastQC plots in this directory are generated relative to the raw, input reads. They may contain adapter sequence and regions of low quality. To see how your reads look after adapter and quality trimming please refer to the FastQC reports in the `trimgalore/fastqc/` directory. + +
+ +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) + +![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) + +![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) + +### UMI-tools extract + +
+Output files + +- `umitools/` + - `*.fastq.gz`: If `--save_umi_intermeds` is specified, FastQ files **after** UMI extraction will be placed in this directory. + - `*.log`: Log file generated by the UMI-tools `extract` command. + +
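+A minimal sketch of the underlying command, assuming a hypothetical 10 bp UMI at the start of read 1 (file names are illustrative):
+
+```bash
+umi_tools extract \
+  --bc-pattern=NNNNNNNNNN \
+  -I sample_R1.fastq.gz --read2-in=sample_R2.fastq.gz \
+  -S sample_R1.umi.fastq.gz --read2-out=sample_R2.umi.fastq.gz \
+  -L sample.umi_extract.log
+```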
+ +[UMI-tools](https://github.com/CGATOxford/UMI-tools) deduplicates reads based on unique molecular identifiers (UMIs) to address PCR-bias. Firstly, the UMI-tools `extract` command removes the UMI barcode information from the read sequence and adds it to the read name. Secondly, reads are deduplicated based on UMI identifier after mapping as highlighted in the [UMI-tools dedup](#umi-tools-dedup) section. + +To facilitate processing of input data which has the UMI barcode already embedded in the read name from the start, `--skip_umi_extract` can be specified in conjunction with `--with_umi`. + +### TrimGalore + +
+Output files + +- `trimgalore/` + - `*.fq.gz`: If `--save_trimmed` is specified, FastQ files **after** adapter trimming will be placed in this directory. + - `*_trimming_report.txt`: Log file generated by Trim Galore!. +- `trimgalore/fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics for read 1 (_and read2 if paired-end_) **after** adapter trimming. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +
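+A minimal sketch of the underlying command for a paired-end sample (hypothetical file names; the pipeline sets its own options):
+
+```bash
+trim_galore --paired --fastqc --gzip \
+  sample_R1.fastq.gz sample_R2.fastq.gz
+```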
+
+[Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) is a wrapper tool around Cutadapt and FastQC to perform quality and adapter trimming on FastQ files. By default, Trim Galore! will automatically detect and trim the appropriate adapter sequence.
+
+> **NB:** TrimGalore! will only run using multiple cores if you are able to use more than 5 and 6 CPUs for single- and paired-end data, respectively. The total cores available to TrimGalore! will also be capped at 4 (7 and 8 CPUs in total for single- and paired-end data, respectively) because there is no longer a run-time benefit. See [release notes](https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019) and [discussion whilst adding this logic to the nf-core/atacseq pipeline](https://github.com/nf-core/atacseq/pull/65).
+
+![MultiQC - cutadapt trimmed sequence length plot](images/mqc_cutadapt_trimmed.png)
+
+### BBSplit
+
+Output files
+
+- `bbsplit/`
+  - `*.fastq.gz`: If `--save_bbsplit_reads` is specified, FastQ files split by reference will be saved to the results directory. Reads from the main reference genome will be named `*primary*.fastq.gz`. Reads from contaminating genomes will be named `*<SHORT_NAME>*.fastq.gz`, where `<SHORT_NAME>` is the first column in `--bbsplit_fasta_list` that needs to be provided to initially build the index.
+  - `*.txt`: File containing statistics on how many reads were assigned to each reference.
+
+ +[BBSplit](http://seqanswers.com/forums/showthread.php?t=41288) is a tool that bins reads by mapping to multiple references simultaneously, using BBMap. The reads go to the bin of the reference they map to best. There are also disambiguation options, such that reads that map to multiple references can be binned with all of them, none of them, one of them, or put in a special "ambiguous" file for each of them. + +This functionality would be especially useful, for example, if you have [mouse PDX](https://en.wikipedia.org/wiki/Patient_derived_xenograft) samples that contain a mixture of human and mouse genomic DNA/RNA and you would like to filter out any mouse derived reads. + +The BBSplit index will have to be built at least once with this pipeline by providing [`--bbsplit_fasta_list`](https://nf-co.re/rnaseq/parameters#bbsplit_fasta_list) which has to be a file containing 2 columns: short name and full path to reference genome(s): + +```bash +mm10,/path/to/mm10.fa +ecoli,/path/to/ecoli.fa +sarscov2,/path/to/sarscov2.fa +``` + +You can save the index by using the [`--save_reference`](https://nf-co.re/rnaseq/parameters#save_reference) parameter and then provide it via [`--bbsplit_index`](https://nf-co.re/rnaseq/parameters#bbsplit_index) for future runs. As described in the `Output files` dropdown box above the FastQ files relative to the main reference genome will always be called `*primary*.fastq.gz`. + +### SortMeRNA + +
+Output files + +- `sortmerna/` + - `*.fastq.gz`: If `--save_non_ribo_reads` is specified, FastQ files containing non-rRNA reads will be placed in this directory. + - `*.log`: Log file generated by SortMeRNA with information regarding reads that matched the reference database(s). + +
+ +When `--remove_ribo_rna` is specified, the pipeline uses [SortMeRNA](https://github.com/biocore/sortmerna) for the removal of ribosomal RNA. By default, [rRNA databases](https://github.com/biocore/sortmerna/tree/master/data/rRNA_databases) defined in the SortMeRNA GitHub repo are used. You can see an example in the pipeline Github repository in `assets/rrna-default-dbs.txt` which is used by default via the `--ribo_database_manifest` parameter. Please note that commercial/non-academic entities require [`licensing for SILVA`](https://www.arb-silva.de/silva-license-information) for these default databases. + +![MultiQC - SortMeRNA hit count plot](images/mqc_sortmerna.png) + +## Alignment and quantification + +### STAR and Salmon + +
+Output files + +- `star_salmon/` + - `*.Aligned.out.bam`: If `--save_align_intermeds` is specified the original BAM file containing read alignments to the reference genome will be placed in this directory. + - `*.Aligned.toTranscriptome.out.bam`: If `--save_align_intermeds` is specified the original BAM file containing read alignments to the transcriptome will be placed in this directory. +- `star_salmon/log/` + - `*.SJ.out.tab`: File containing filtered splice junctions detected after mapping the reads. + - `*.Log.final.out`: STAR alignment report containing the mapping results summary. + - `*.Log.out` and `*.Log.progress.out`: STAR log files containing detailed information about the run. Typically only useful for debugging purposes. +- `star_salmon/unmapped/` + - `*.fastq.gz`: If `--save_unaligned` is specified, FastQ files containing unmapped reads will be placed in this directory. + +
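+A hedged example of launching the pipeline with this route, which is the default (hypothetical inputs and profile; adjust to your setup, see the description below):
+
+```bash
+nextflow run nf-core/rnaseq \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --fasta genome.fa \
+  --gtf genes.gtf \
+  --aligner star_salmon
+```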
+ +[STAR](https://github.com/alexdobin/STAR) is a read aligner designed for splice aware mapping typical of RNA sequencing data. STAR stands for *S*pliced *T*ranscripts *A*lignment to a *R*eference, and has been shown to have high accuracy and outperforms other aligners by more than a factor of 50 in mapping speed, but it is memory intensive. Using `--aligner star_salmon` is the default alignment and quantification option. + +[Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) from [Ocean Genomics](https://oceangenomics.com/) is a tool for wicked-fast transcript quantification from RNA-seq data. It requires a set of target transcripts (either from a reference or de-novo assembly) in order to perform quantification. All you need to run Salmon is a FASTA file containing your reference transcripts and a set of FASTA/FASTQ/BAM file(s) containing your reads. The transcriptome-level BAM files generated by STAR are provided to Salmon for downstream quantification. You can of course also provide FASTQ files directly as input to Salmon in order to pseudo-align and quantify your data by providing the `--pseudo_aligner salmon` parameter. The results generated by the pipeline are exactly the same whether you provide BAM or FASTQ input so please see the [Salmon](#salmon) results section for more details. + +The STAR section of the MultiQC report shows a bar plot with alignment rates: good samples should have most reads as _Uniquely mapped_ and few _Unmapped_ reads. + +![MultiQC - STAR alignment scores plot](images/mqc_star.png) + +### STAR via RSEM + +
+Output files
+
+- `star_rsem/`
+  - `rsem.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples.
+  - `rsem.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples.
+  - `rsem.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples.
+  - `rsem.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples.
+  - `*.genes.results`: RSEM gene-level quantification results for each sample.
+  - `*.isoforms.results`: RSEM isoform-level quantification results for each sample.
+  - `*.STAR.genome.bam`: If `--save_align_intermeds` is specified the original BAM file containing read alignments to the reference genome will be placed in this directory.
+  - `*.transcript.bam`: If `--save_align_intermeds` is specified the original BAM file containing read alignments to the transcriptome will be placed in this directory.
+- `star_rsem/<SAMPLE>.stat/`
+  - `*.cnt`, `*.model`, `*.theta`: RSEM counts and statistics for each sample.
+- `star_rsem/log/`
+  - `*.log`: STAR alignment report containing the mapping results summary.
+
+
+[RSEM](https://github.com/deweylab/RSEM) is a software package for estimating gene and isoform expression levels from RNA-seq data. It has been widely touted as one of the most accurate quantification tools for RNA-seq analysis. RSEM wraps other popular tools to map the reads to the genome (i.e. STAR, Bowtie2, HISAT2; STAR is used in this pipeline) which are then subsequently filtered relative to a transcriptome before quantifying at the gene- and isoform-level. Other advantages of using RSEM are that it performs both the alignment and quantification in a single package and that it can effectively use ambiguously-mapping reads.
+
+You can choose to align and quantify your data with RSEM by providing the `--aligner star_rsem` parameter.
+
+![MultiQC - RSEM alignment scores plot](images/mqc_rsem_mapped.png)
+
+![MultiQC - RSEM uniquely mapped plot](images/mqc_rsem_multimapped.png)
+
+### HISAT2
+
+Output files
+
+- `hisat2/`
+  - `<SAMPLE>.bam`: If `--save_align_intermeds` is specified the original BAM file containing read alignments to the reference genome will be placed in this directory.
+- `hisat2/log/`
+  - `*.log`: HISAT2 alignment report containing the mapping results summary.
+- `hisat2/unmapped/`
+  - `*.fastq.gz`: If `--save_unaligned` is specified, FastQ files containing unmapped reads will be placed in this directory.
+
+
+[HISAT2](http://daehwankimlab.github.io/hisat2/) is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. It introduced a new indexing scheme called a Hierarchical Graph FM index (HGFM) which, when combined with several alignment strategies, enables rapid and accurate alignment of sequencing reads. The HISAT2 route through the pipeline is a good option if you have memory limitations on your compute. However, quantification isn't performed if using `--aligner hisat2` due to the lack of an appropriate option to calculate accurate expression estimates from HISAT2 derived genomic alignments. You can still use this route if you have a preference for the alignment, QC and other types of downstream analysis compatible with the output of HISAT2.
+
+You can choose to align your data with HISAT2 by providing the `--aligner hisat2` parameter.
+
+![MultiQC - HISAT2 alignment scores plot](images/mqc_hisat2.png)
+
+## Alignment post-processing
+
+The pipeline has been written in a way where all the files generated downstream of the alignment are placed in the same directory as specified by `--aligner` e.g. if `--aligner star_salmon` is specified then all the downstream results will be placed in the `star_salmon/` directory. This helps with organising the directory structure and more importantly, allows the end-user to get the results from multiple aligners by simply re-running the pipeline with a different `--aligner` option along with the `-resume` parameter. It also means that results won't be overwritten when resuming the pipeline and can be used for benchmarking between alignment algorithms if required.
+
+### SAMtools
+
+Output files
+
+- `<ALIGNER>/`
+  - `<SAMPLE>.sorted.bam`: If `--save_align_intermeds` is specified the original coordinate sorted BAM file containing read alignments will be placed in this directory.
+  - `<SAMPLE>.sorted.bam.bai`: If `--save_align_intermeds` is specified the BAI index file for the original coordinate sorted BAM file will be placed in this directory.
+  - `<SAMPLE>.sorted.bam.csi`: If `--save_align_intermeds --bam_csi_index` is specified the CSI index file for the original coordinate sorted BAM file will be placed in this directory.
+- `<ALIGNER>/samtools_stats/`
+  - SAMtools `<SAMPLE>.sorted.bam.flagstat`, `<SAMPLE>.sorted.bam.idxstats` and `<SAMPLE>.sorted.bam.stats` files generated from the alignment files.
+
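+A minimal sketch of what this post-processing amounts to (hypothetical file names):
+
+```bash
+samtools sort -o sample.sorted.bam sample.bam
+samtools index sample.sorted.bam
+
+# Read mapping statistics, as collected under <ALIGNER>/samtools_stats/
+samtools flagstat sample.sorted.bam > sample.sorted.bam.flagstat
+samtools idxstats sample.sorted.bam > sample.sorted.bam.idxstats
+samtools stats sample.sorted.bam > sample.sorted.bam.stats
+```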
+ +The original BAM files generated by the selected alignment algorithm are further processed with [SAMtools](http://samtools.sourceforge.net/) to sort them by coordinate, for indexing, as well as to generate read mapping statistics. + +![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_mapped.png) + +![MultiQC - SAMtools mapped reads per contig plot](images/mqc_samtools_idxstats.png) + +### UMI-tools dedup + +
+Output files
+
+- `<ALIGNER>/`
+  - `<SAMPLE>.umi_dedup.sorted.bam`: If `--save_umi_intermeds` is specified the UMI deduplicated, coordinate sorted BAM file containing read alignments will be placed in this directory.
+  - `<SAMPLE>.umi_dedup.sorted.bam.bai`: If `--save_umi_intermeds` is specified the BAI index file for the UMI deduplicated, coordinate sorted BAM file will be placed in this directory.
+  - `<SAMPLE>.umi_dedup.sorted.bam.csi`: If `--save_umi_intermeds --bam_csi_index` is specified the CSI index file for the UMI deduplicated, coordinate sorted BAM file will be placed in this directory.
+- `<ALIGNER>/umitools/`
+  - `*_edit_distance.tsv`: Reports the (binned) average edit distance between the UMIs at each position.
+  - `*_per_umi.tsv`: UMI-level summary statistics.
+  - `*_per_umi_per_position.tsv`: Tabulates the counts for unique combinations of UMI and position.
+
+The content of the files above is explained in more detail in the [UMI-tools documentation](https://umi-tools.readthedocs.io/en/latest/reference/dedup.html#dedup-specific-options).
+
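+A minimal sketch of the underlying command (hypothetical file names; requires an indexed, coordinate sorted BAM, and `--paired` for paired-end data):
+
+```bash
+umi_tools dedup \
+  -I sample.sorted.bam \
+  -S sample.umi_dedup.sorted.bam \
+  --output-stats=sample
+```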
+ +After extracting the UMI information from the read sequence (see [UMI-tools extract](#umi-tools-extract)), the second step in the removal of UMI barcodes involves deduplicating the reads based on both mapping and UMI barcode information using the UMI-tools `dedup` command. This will generate a filtered BAM file after the removal of PCR duplicates. + +### picard MarkDuplicates + +
+Output files
+
+- `<ALIGNER>/`
+  - `<SAMPLE>.markdup.sorted.bam`: Coordinate sorted BAM file after duplicate marking. This is the final post-processed BAM file and so will be saved by default in the results directory.
+  - `<SAMPLE>.markdup.sorted.bam.bai`: BAI index file for coordinate sorted BAM file after duplicate marking. This is the final post-processed BAM index file and so will be saved by default in the results directory.
+  - `<SAMPLE>.markdup.sorted.bam.csi`: CSI index file for coordinate sorted BAM file after duplicate marking. This is the final post-processed BAM index file and so will be saved by default in the results directory. Only generated if `--bam_csi_index` is specified as a parameter.
+- `<ALIGNER>/samtools_stats/`
+  - SAMtools `<SAMPLE>.markdup.sorted.bam.flagstat`, `<SAMPLE>.markdup.sorted.bam.idxstats` and `<SAMPLE>.markdup.sorted.bam.stats` files generated from the duplicate marked alignment files.
+- `<ALIGNER>/picard_metrics/`
+  - `<SAMPLE>.markdup.sorted.MarkDuplicates.metrics.txt`: Metrics file from MarkDuplicates.
+
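+A minimal sketch of the underlying command (hypothetical file names; duplicates are marked, not removed):
+
+```bash
+picard MarkDuplicates \
+  I=sample.sorted.bam \
+  O=sample.markdup.sorted.bam \
+  M=sample.markdup.sorted.MarkDuplicates.metrics.txt
+samtools index sample.markdup.sorted.bam
+```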
+
+Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses [picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) to _mark_ the duplicate reads identified amongst the alignments to allow you to gauge the overall level of duplication in your samples. However, for RNA-seq data it is not recommended to physically remove duplicate reads from the alignments (unless you are using UMIs) because you expect a significant level of true biological duplication that arises from the same fragments being sequenced, for example from highly expressed genes. This step will be skipped automatically when using the `--with_umi` option or explicitly via the `--skip_markduplicates` parameter.
+
+![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_markduplicates.png)
+
+## Other steps
+
+### StringTie
+
+Output files
+
+- `<ALIGNER>/stringtie/`
+  - `*.coverage.gtf`: GTF file containing transcripts that are fully covered by reads.
+  - `*.transcripts.gtf`: GTF file containing all of the assembled transcripts from StringTie.
+  - `*.gene_abundance.txt`: Text file containing gene abundances and FPKM values.
+- `<ALIGNER>/stringtie/<SAMPLE>.ballgown/`: Ballgown output directory.
+
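+A minimal sketch of the underlying command, mirroring the outputs listed above (hypothetical file names):
+
+```bash
+stringtie sample.markdup.sorted.bam \
+  -G genes.gtf \
+  -o sample.transcripts.gtf \
+  -A sample.gene_abundance.txt \
+  -C sample.coverage.gtf \
+  -b sample.ballgown \
+  -e
+```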
+ +[StringTie](https://ccb.jhu.edu/software/stringtie/) is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional de novo assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like [Ballgown](https://github.com/alyssafrazee/ballgown), [Cuffdiff](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/index.html) or other programs ([DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html), [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), etc.). + +### BEDTools and bedGraphToBigWig + +
+Output files
+
+- `<ALIGNER>/bigwig/`
+  - `*.forward.bigWig`: bigWig coverage file relative to genes on the forward DNA strand.
+  - `*.reverse.bigWig`: bigWig coverage file relative to genes on the reverse DNA strand.
+
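+A minimal sketch of this conversion for one strand (hypothetical file names; `chrom.sizes` lists chromosome lengths):
+
+```bash
+bedtools genomecov -ibam sample.markdup.sorted.bam -bg -split -strand + \
+  | LC_ALL=C sort -k1,1 -k2,2n > sample.forward.bedGraph
+bedGraphToBigWig sample.forward.bedGraph chrom.sizes sample.forward.bigWig
+```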
+
+The [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format is an indexed binary format useful for displaying dense, continuous data in Genome Browsers such as the [UCSC](https://genome.ucsc.edu/cgi-bin/hgTracks) and [IGV](http://software.broadinstitute.org/software/igv/). This mitigates the need to load the much larger BAM files for data visualisation purposes which will be slower and result in memory issues. The bigWig format is also supported by various bioinformatics software for downstream processing such as meta-profile plotting.
+
+## Quality control
+
+### RSeQC
+
+[RSeQC](http://rseqc.sourceforge.net/) is a package of scripts designed to evaluate the quality of RNA-seq data. This pipeline runs several, but not all RSeQC scripts. You can tweak the supported scripts you would like to run by adjusting the `--rseqc_modules` parameter which by default will run all of the following: `bam_stat.py`, `inner_distance.py`, `infer_experiment.py`, `junction_annotation.py`, `junction_saturation.py`, `read_distribution.py` and `read_duplication.py`.
+
+The majority of RSeQC scripts generate output files which can be plotted and summarised in the MultiQC report.
+
+#### Infer experiment
+
+Output files
+
+- `<ALIGNER>/rseqc/infer_experiment/`
+  - `*.infer_experiment.txt`: File containing fraction of reads mapping to given strandedness configurations.
+
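+A minimal sketch of the underlying command (hypothetical file names; `-r` takes the gene model in BED format):
+
+```bash
+infer_experiment.py -i sample.markdup.sorted.bam -r genes.bed
+```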
+ +This script predicts the "strandedness" of the protocol (i.e. unstranded, sense or antisense) that was used to prepare the sample for sequencing by assessing the orientation in which aligned reads overlay gene features in the reference genome. The strandedness of each sample has to be provided to the pipeline in the input samplesheet (see [usage docs](https://nf-co.re/rnaseq/usage#samplesheet-input)). However, this information is not always available, especially for public datasets. As a result, additional features have been incorporated into this pipeline to auto-detect whether you have provided the correct information in the samplesheet, and if this is not the case then a warning table will be placed at the top of the MultiQC report highlighting the offending samples (see image below). If required, this will allow you to correct the input samplesheet and rerun the pipeline with the accurate strand information. Note, it is important to get this information right because it can affect the final results. + +RSeQC documentation: [infer_experiment.py](http://rseqc.sourceforge.net/#infer-experiment-py) + +![MultiQC - Strand check table](images/mqc_strand_check.png) + +![MultiQC - RSeQC infer experiment plot](images/mqc_rseqc_inferexperiment.png) + +#### Read distribution + +
+Output files
+
+- `<ALIGNER>/rseqc/read_distribution/`
+  - `*.read_distribution.txt`: File containing fraction of reads mapping to genome feature e.g. CDS exon, 5’UTR exon, 3’ UTR exon, Intron, Intergenic regions etc.
+
+
+This tool calculates how mapped reads are distributed over genomic features. A good result for a standard RNA-seq experiment is generally to have as many exonic reads as possible (`CDS_Exons`). A large amount of intronic reads could be indicative of DNA contamination in your sample but may be expected for a total RNA preparation.
+
+RSeQC documentation: [read_distribution.py](http://rseqc.sourceforge.net/#read-distribution-py)
+
+![MultiQC - RSeQC read distribution plot](images/mqc_rseqc_readdistribution.png)
+
+#### Junction annotation
+
+Output files
+
+- `<ALIGNER>/rseqc/junction_annotation/bed/`
+  - `*.junction.bed`: BED file containing splice junctions.
+  - `*.junction.Interact.bed`: BED file containing interacting splice junctions.
+- `<ALIGNER>/rseqc/junction_annotation/log/`
+  - `*.junction_annotation.log`: Log file generated by the program.
+- `<ALIGNER>/rseqc/junction_annotation/pdf/`
+  - `*.splice_events.pdf`: PDF file containing splicing events plot.
+  - `*.splice_junction.pdf`: PDF file containing splice junctions plot.
+- `<ALIGNER>/rseqc/junction_annotation/rscript/`
+  - `*.junction_plot.r`: R script used to generate pdf plots above.
+- `<ALIGNER>/rseqc/junction_annotation/xls/`
+  - `*.junction.xls`: Excel spreadsheet with junction information.
+
+ +Junction annotation compares detected splice junctions to a reference gene model. Splicing annotation is performed in two levels: splice event level and splice junction level. + +RSeQC documentation: [junction_annotation.py](http://rseqc.sourceforge.net/#junction-annotation-py) + +![MultiQC - RSeQC junction annotation plot](images/mqc_rseqc_junctionannotation.png) + +#### Inner distance + +
+Output files
+
+- `<ALIGNER>/rseqc/inner_distance/pdf/`
+  - `*.inner_distance_plot.pdf`: PDF file containing inner distance plot.
+- `<ALIGNER>/rseqc/inner_distance/rscript/`
+  - `*.inner_distance_plot.r`: R script used to generate pdf plot above.
+- `<ALIGNER>/rseqc/inner_distance/txt/`
+  - `*.inner_distance_freq.txt`: File containing frequency of insert sizes.
+  - `*.inner_distance_mean.txt`: File containing mean, median and standard deviation of insert sizes.
+
+ +The inner distance script tries to calculate the inner distance between two paired-end reads. It is the distance between the end of read 1 to the start of read 2, and it is sometimes confused with the insert size (see [this blog post](http://thegenomefactory.blogspot.com.au/2013/08/paired-end-read-confusion-library.html) for disambiguation): + +This plot will not be generated for single-end data. Very short inner distances are often seen in old or degraded samples (_eg._ FFPE) and values can be negative if the reads overlap consistently. + +RSeQC documentation: [inner_distance.py](http://rseqc.sourceforge.net/#inner-distance-py) + +![MultiQC - RSeQC inner distance plot](images/mqc_rseqc_innerdistance.png) + +#### Junction saturation + +
+Output files
+
+- `<ALIGNER>/rseqc/junction_saturation/pdf/`
+  - `*.junctionSaturation_plot.pdf`: PDF file containing junction saturation plot.
+- `<ALIGNER>/rseqc/junction_saturation/rscript/`
+  - `*.junctionSaturation_plot.r`: R script used to generate pdf plot above.
+
+ +This script shows the number of splice sites detected within the data at various levels of subsampling. A sample that reaches a plateau before getting to 100% data indicates that all junctions in the library have been detected, and that further sequencing will not yield any more observations. A good sample should approach such a plateau of _Known junctions_, however, very deep sequencing is typically required to saturate all _Novel Junctions_ in a sample. + +RSeQC documentation: [junction_saturation.py](http://rseqc.sourceforge.net/#junction-saturation-py) + +![MultiQC - RSeQC junction saturation plot](images/mqc_rseqc_junctionsaturation.png) + +#### Read duplication + +
+Output files
+
+- `<ALIGNER>/rseqc/read_duplication/pdf/`
+  - `*.DupRate_plot.pdf`: PDF file containing read duplication plot.
+- `<ALIGNER>/rseqc/read_duplication/rscript/`
+  - `*.DupRate_plot.r`: R script used to generate pdf plot above.
+- `<ALIGNER>/rseqc/read_duplication/xls/`
+  - `*.pos.DupRate.xls`: Read duplication rate determined from mapping position of read. First column is “occurrence” or duplication times, second column is number of uniquely mapped reads.
+  - `*.seq.DupRate.xls`: Read duplication rate determined from sequence of read. First column is “occurrence” or duplication times, second column is number of uniquely mapped reads.
+
+ +This plot shows the number of reads (y-axis) with a given number of exact duplicates (x-axis). Most reads in an RNA-seq library should have a low number of exact duplicates. Samples which have many reads with many duplicates (a large area under the curve) may be suffering excessive technical duplication. + +RSeQC documentation: [read_duplication.py](http://rseqc.sourceforge.net/#read-duplication-py) + +![MultiQC - RSeQC read duplication plot](images/mqc_rseqc_readduplication.png) + +#### BAM stat + +
+Output files
+
+- `<ALIGNER>/rseqc/bam_stat/`
+  - `*.bam_stat.txt`: Mapping statistics for the BAM file.
+
+ +This script gives numerous statistics about the aligned BAM files. A typical output looks as follows: + +```txt +#Output (all numbers are read count) +#================================================== +Total records: 41465027 +QC failed: 0 +Optical/PCR duplicate: 0 +Non Primary Hits 8720455 +Unmapped reads: 0 + +mapq < mapq_cut (non-unique): 3127757 +mapq >= mapq_cut (unique): 29616815 +Read-1: 14841738 +Read-2: 14775077 +Reads map to '+': 14805391 +Reads map to '-': 14811424 +Non-splice reads: 25455360 +Splice reads: 4161455 +Reads mapped in proper pairs: 21856264 +Proper-paired reads map to different chrom: 7648 +``` + +MultiQC plots each of these statistics in a dot plot. Each sample in the project is a dot - hover to see the sample highlighted across all fields. + +RSeQC documentation: [bam_stat.py](http://rseqc.sourceforge.net/#bam-stat-py) + +#### TIN + +
+Output files
+
+- `<ALIGNER>/rseqc/tin/`
+  - `*.summary.txt`: File containing TIN results summary.
+  - `*.tin.xls`: XLS file containing TIN results.
+
+
+This script is designed to evaluate RNA integrity at the transcript level. TIN (transcript integrity number) is named in analogy to RIN (RNA integrity number), the most widely used metric to evaluate RNA integrity at the sample (or transcriptome) level. RIN is a very useful preventive measure to ensure good RNA quality and robust, reproducible RNA sequencing. This process isn't run by default - please see [this issue](https://github.com/nf-core/rnaseq/issues/769).
+
+RSeQC documentation: [tin.py](http://rseqc.sourceforge.net/#tin-py)
+
+### Qualimap
+
+Output files
+
+- `<ALIGNER>/qualimap/<SAMPLE>/`
+  - `qualimapReport.html`: Qualimap HTML report that can be viewed in a web browser.
+  - `rnaseq_qc_results.txt`: Textual results output.
+- `<ALIGNER>/qualimap/<SAMPLE>/images_qualimapReport/`: Images required for the HTML report.
+- `<ALIGNER>/qualimap/<SAMPLE>/raw_data_qualimapReport/`: Raw data required for the HTML report.
+- `<ALIGNER>/qualimap/<SAMPLE>/css/`: CSS files required for the HTML report.
+
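+A minimal sketch of the underlying command (hypothetical file names; the pipeline sets strandedness and paired-end options itself):
+
+```bash
+qualimap rnaseq \
+  -bam sample.markdup.sorted.bam \
+  -gtf genes.gtf \
+  -outdir sample_qualimap
+```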
+
+[Qualimap](http://qualimap.bioinfo.cipf.es/) is a platform-independent application written in Java and R that provides both a Graphical User Interface (GUI) and a command-line interface to facilitate the quality control of alignment sequencing data. In short, Qualimap:
+
+- Examines sequencing alignment data according to the features of the mapped reads and their genomic properties.
+- Provides an overall view of the data that helps to detect biases in the sequencing and/or mapping of the data and eases decision-making for further analysis.
+
+The [Qualimap RNA-seq QC module](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#rna-seq-qc) is used within this pipeline to assess the overall mapping and coverage relative to gene features.
+
+![MultiQC - Qualimap gene coverage plot](images/mqc_qualimap_coverage.png)
+
+![MultiQC - Qualimap genomic origin plot](images/mqc_qualimap_features.png)
+
+### dupRadar
+
+Output files
+
+- `<ALIGNER>/dupradar/box_plot/`
+  - `*_duprateExpBoxplot.pdf`: PDF file containing box plot for duplicate rate relative to mean expression.
+- `<ALIGNER>/dupradar/gene_data/`
+  - `*_dupMatrix.txt`: Text file containing duplicate metrics per gene.
+- `<ALIGNER>/dupradar/histogram/`
+  - `*_expressionHist.pdf`: PDF file containing histogram of reads per kilobase values per gene.
+- `<ALIGNER>/dupradar/intercepts_slope/`
+  - `*_intercept_slope.txt`: Text file containing intercept slope values.
+- `<ALIGNER>/dupradar/scatter_plot/`
+  - `*_duprateExpDens.pdf`: PDF file containing typical dupRadar 2D density scatter plot.
+
+See [dupRadar docs](https://www.bioconductor.org/packages/devel/bioc/vignettes/dupRadar/inst/doc/dupRadar.html) for further information regarding the content of these files.
+
+ +[dupRadar](https://www.bioconductor.org/packages/release/bioc/html/dupRadar.html) is a Bioconductor library written in the R programming language. It generates various QC metrics and plots that relate duplication rate with gene expression levels in order to identify experiments with high technical duplication. A good sample with little technical duplication will only show high numbers of duplicates for highly expressed genes. Samples with technical duplication will have high duplication for all genes, irrespective of transcription level. + +![dupRadar - Example good and bad experiment plot](images/dupradar_example_plot.png) + +> _Credit: [dupRadar documentation](https://www.bioconductor.org/packages/devel/bioc/vignettes/dupRadar/inst/doc/dupRadar.html)_ + +### Preseq + +
+Output files
+
+- `<ALIGNER>/preseq/`
+  - `*.lc_extrap.txt`: Preseq expected future yield file.
+- `<ALIGNER>/preseq/log/`
+  - `*.command.log`: Standard error output from command.
+
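+A minimal sketch of the underlying command (hypothetical file names; `-B` reads a sorted BAM and `-pe` indicates paired-end data):
+
+```bash
+preseq lc_extrap -B -pe -o sample.lc_extrap.txt sample.sorted.bam
+```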
+ +The [Preseq](http://smithlabresearch.org/software/preseq/) package is aimed at predicting and estimating the complexity of a genomic sequencing library, equivalent to predicting and estimating the number of redundant reads from a given sequencing depth and how many will be expected from additional sequencing using an initial sequencing experiment. The estimates can then be used to examine the utility of further sequencing, optimize the sequencing depth, or to screen multiple libraries to avoid low complexity samples. A shallow curve indicates that the library has reached complexity saturation and further sequencing would likely not add further unique reads. The dashed line shows a perfectly complex library where total reads = unique reads. Note that these are predictive numbers only, not absolute. The MultiQC plot can sometimes give extreme sequencing depth on the X axis - click and drag from the left side of the plot to zoom in on more realistic numbers. + +![MultiQC - Preseq library complexity plot](images/mqc_preseq_plot.png) + +### featureCounts + +
+Output files
+
+- `<ALIGNER>/featurecounts/`
+  - `*.featureCounts.txt`: featureCounts biotype-level quantification results for each sample.
+  - `*.featureCounts.txt.summary`: featureCounts summary file containing overall statistics about the counts.
+  - `*_mqc.tsv`: MultiQC custom content files used to plot biotypes in report.
+
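+A minimal sketch of the biotype-level counting (hypothetical file names; note `-g gene_biotype` rather than the usual `-g gene_id`):
+
+```bash
+featureCounts \
+  -a genes.gtf \
+  -g gene_biotype \
+  -o sample.featureCounts.txt \
+  sample.markdup.sorted.bam
+```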
+
+[featureCounts](http://bioinf.wehi.edu.au/featureCounts/) from the [Subread](http://subread.sourceforge.net/) package is a quantification tool used to summarise the mapped read distribution over genomic features such as genes, exons, promoters, gene bodies, genomic bins and chromosomal locations. We can also use featureCounts to count overlaps with different classes of genomic features. This provides an additional QC to check which features are most abundant in the sample, and to highlight potential problems such as rRNA contamination.
+
+![MultiQC - featureCounts biotypes plot](images/mqc_featurecounts_biotype.png)
+
+### DESeq2
+
+Output files
+
+- `<ALIGNER>/deseq2_qc/`
+  - `*.plots.pdf`: File containing PCA and hierarchical clustering plots.
+  - `*.dds.RData`: File containing R `DESeqDataSet` object generated by DESeq2, with either an rlog or vst `assay` storing the variance-stabilised data.
+  - `*.rds`: Alternative version of the RData file suitable for `readRDS` to give user control of the eventual object name.
+  - `*pca.vals.txt`: Matrix of values for the first 2 principal components.
+  - `*sample.dists.txt`: Sample distance matrix.
+  - `R_sessionInfo.log`: File containing information about R, the OS and attached or loaded packages.
+- `<ALIGNER>/deseq2_qc/size_factors/`
+  - `*.txt`, `*.RData`: Files containing DESeq2 sizeFactors per sample.
+
+
+[DESeq2](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) is one of the most commonly used software packages to perform differential expression analysis for RNA-seq datasets.
+
+**This pipeline uses a standardised DESeq2 analysis script to get an idea of the reproducibility across samples within the experiment. Please note that this will not suit every experimental design, and if there are other problems with the experiment then it may not work as well as expected.**
+
+The script included in the pipeline uses DESeq2 to normalise read counts across all of the provided samples in order to create a PCA plot and a clustered heatmap showing pairwise Euclidean distances between the samples in the experiment. These help to show the similarity between groups of samples and can reveal batch effects and other potential issues with the experiment.
+
+By default, the pipeline uses the `vst` transformation which is more suited to larger experiments. You can set the parameter `--deseq2_vst false` if you wish to use the DESeq2 native `rlog` option. See [DESeq2 docs](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for a more detailed explanation.
+
+The PCA plots are generated based alternately on the top five hundred most variable genes, or all genes. The former is the conventional approach that is more likely to pick up strong effects (i.e. the biological signal) and the latter, when different, is picking up a weaker but consistent effect that is synchronised across many transcripts. We project both of these onto the first two PCs (shown in the top row of the figure below), which is the best two dimensional representation of the variation between samples.
+
+We also explore higher components in terms of experimental factors inferred from sample names. If your sample naming convention follows a strict policy of using underscores to delimit values of experimental factors (for example `WT_UNTREATED_REP1`) and all names have the same number of underscores (so excluding `WT_TREATED_10ml_REP1` from being compatible with the previous label), then for any of these factors that are informative (i.e. label some but not all samples the same) we individually plot up to the first five PCs, per experimental level, for each of the experimental factors.
+
+The plot on the left hand side shows the standard PC plot - notice the variable number of underscores, meaning that the central plot would not be produced: here we have changed the underscore that is hyphenating the treatment to a '-' character. This allows the central plot to be generated, and we can see that replicate (the 2nd part of the sample name) seems to be affecting the 3rd principal component, but the treatment factor is affecting the more important first two components. The right-most plot shows all pairwise euclidean distances between the samples.
+

+_DESeq2 PCA plots_

+ +![MultiQC - DESeq2 PCA plot](images/mqc_deseq2_pca.png) + +

+_MultiQC - DESeq2 sample similarity plot_

+ +### MultiQC + +
+Output files
+
+- `multiqc/<ALIGNER>/`
+  - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
+  - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
+
+
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+
+Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQC, Cutadapt, SortMeRNA, STAR, RSEM, HISAT2, Salmon, SAMtools, Picard, RSeQC, Qualimap, Preseq and featureCounts. Additionally, various custom content has been added to the report to assess the output of dupRadar, DESeq2 and featureCounts biotypes, and to highlight samples failing a minimum mapping threshold or those that failed to match the strand-specificity provided in the input samplesheet. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+## Pseudo-alignment and quantification
+
+### Salmon
+
+Output files
+
+- `salmon/`
+  - `salmon.merged.gene_counts.tsv`: Matrix of gene-level raw counts across all samples.
+  - `salmon.merged.gene_tpm.tsv`: Matrix of gene-level TPM values across all samples.
+  - `salmon.merged.gene_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `salmon.merged.gene_counts_scaled.tsv`: Matrix of gene-level library size-scaled counts across all samples.
+  - `salmon.merged.gene_counts_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated library size-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `salmon.merged.gene_counts_length_scaled.tsv`: Matrix of gene-level length-scaled counts across all samples.
+  - `salmon.merged.gene_counts_length_scaled.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated length-scaled counts (`counts`) and transcript length (`length`) in the assays slot for genes.
+  - `salmon.merged.transcript_counts.tsv`: Matrix of isoform-level raw counts across all samples.
+  - `salmon.merged.transcript_tpm.tsv`: Matrix of isoform-level TPM values across all samples.
+  - `salmon.merged.transcript_counts.rds`: RDS object that can be loaded in R that contains a [SummarizedExperiment](https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html) container with the TPM (`abundance`), estimated isoform-level raw counts (`counts`) and transcript length (`length`) in the assays slot for transcripts.
+  - `salmon_tx2gene.tsv`: Tab-delimited file containing transcript-to-gene ID mappings.
+- `salmon/<SAMPLE>/`
+  - `aux_info/`: Auxiliary info e.g. versions and number of mapped reads.
+  - `cmd_info.json`: Information about the Salmon quantification command, version and options.
+  - `lib_format_counts.json`: Number of fragments assigned, unassigned and incompatible.
+  - `libParams/`: Contains the file `flenDist.txt` for the fragment length distribution.
+  - `logs/`: Contains the file `salmon_quant.log` giving a record of Salmon's quantification.
+  - `quant.genes.sf`: Salmon _gene_-level quantification of the sample, including feature length, effective length, TPM, and number of reads.
+  - `quant.sf`: Salmon _transcript_-level quantification of the sample, including feature length, effective length, TPM, and number of reads.
+
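+A minimal sketch of the quantification call in alignment mode, as used downstream of STAR (hypothetical file names; the pipeline assembles the actual command):
+
+```bash
+salmon quant \
+  -t transcripts.fa \
+  -l A \
+  -a sample.Aligned.toTranscriptome.out.bam \
+  -o star_salmon/sample
+```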
+
+As described in the [STAR and Salmon](#star-and-salmon) section, you can choose to pseudo-align and quantify your data with [Salmon](https://salmon.readthedocs.io/en/latest/salmon.html) by providing the `--pseudo_aligner salmon` parameter. By default, Salmon is run in addition to the standard alignment workflow defined by `--aligner`, mainly because it allows you to obtain QC metrics with respect to the genomic alignments. However, you can provide the `--skip_alignment` parameter if you would like to run Salmon in isolation. If Salmon is run in isolation, the outputs mentioned above will be found in a folder named `salmon`. If Salmon is run alongside STAR, the folder will be named `star_salmon`.
+
+Transcripts with large inferential uncertainty won't be assigned the exact number of reads reproducibly, every time Salmon is run. Read more about this on the [nf-core/rnaseq](https://github.com/nf-core/rnaseq/issues/585) and [salmon](https://github.com/COMBINE-lab/salmon/issues/613) Github repos.
+
+The [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html) package is used in this pipeline to summarise the results generated by Salmon into matrices for use with downstream differential analysis packages. We use tximport with different options to summarize count and TPM quantifications at the gene- and transcript-level. Please see [#499](https://github.com/nf-core/rnaseq/issues/499) for discussion and links regarding which counts are suitable for different types of analysis.
+
+According to the `tximport` documentation you can do one of the following:
+
+- Use bias corrected counts with an offset: import all the salmon files with `tximport` and then use `DESeq2` with `dds <- DESeqDataSetFromTximport(txi, sampleTable, ~condition)` to correct for changes to the average transcript length across samples.
+- Use bias corrected counts without an offset: load and use `salmon.merged.gene_counts_length_scaled.tsv` or `salmon.merged.gene_counts_scaled.tsv` directly as you would with a regular counts matrix.
+- Use bias uncorrected counts: load and use the `txi$counts` matrix (or `salmon.merged.gene_counts.tsv`) with `DESeq2`. This does not correct for potential differential isoform usage. Alternatively, if you have 3’ tagged RNA-seq data this is the most suitable method.
+
+> **NB:** The default Salmon parameters and a k-mer size of 31 are used to create the index. As [documented here](https://salmon.readthedocs.io/en/latest/salmon.html#preparing-transcriptome-indices-mapping-based-mode) and [discussed here](https://github.com/COMBINE-lab/salmon/issues/482#issuecomment-583799668), a k-mer size of 31 works well with reads that are 75bp or longer.
+
+![MultiQC - Salmon fragment length distribution plot](images/mqc_salmon.png)
+
+## Workflow reporting and genomes
+
+### Reference genome files
+
+Output files
+
+- `genome/`
+  - `*.fa`, `*.gtf`, `*.gff`, `*.bed`, `*.tsv`: If the `--save_reference` parameter is provided then all of the genome reference files will be placed in this directory.
+- `genome/index/`
+  - `star/`: Directory containing STAR indices.
+  - `hisat2/`: Directory containing HISAT2 indices.
+  - `rsem/`: Directory containing STAR and RSEM indices.
+  - `salmon/`: Directory containing Salmon indices.
+
+
+A number of genome-specific files are generated by the pipeline because they are required for the downstream processing of the results. If the `--save_reference` parameter is provided then these will be saved in the `genome/` directory. The index building step can be quite time-consuming, so it is recommended to use `--save_reference` when building new indices: the saved indices can then be reused in future runs of the pipeline.
+
+### Pipeline information
+
+Output files
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+
+
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
+# Differential expression analysis with DESeq2: Output
+
+After the nf-core/rnaseq pipeline has completed, a custom differential expression analysis with DESeq2 is performed.
+
+The directories listed below will be created in the differential expression analysis results directory (`02-differential_expression`). All paths are relative to the top-level results directory.
+
+## Output files
+
+Output files
+
+- `/Differential_expression/DESeq2/Differential_expression.csv`: This file contains the results of the differential expression analysis performed using DESeq2, including information on differentially expressed genes and associated statistical metrics such as fold change, p-values, and adjusted p-values.
+
+- `/Differential_expression/DESeq2/heatmapCount_top20_differentially_expressed.pdf`: This PDF file presents a heatmap visualization displaying the expression patterns of the top 20 differentially expressed genes, clustered by sample distance, as determined by the DESeq2 analysis.
+  ![](./images/deseq2_heatmap-top-20-genes.png)
+
+- `/Differential_expression/DESeq2/maPlot_all.pdf`: This PDF file illustrates MA plots depicting the log fold changes (M) versus the mean average (A) expression levels of all genes analyzed in the DESeq2 differential expression analysis.
+
+  ![](./images/deseq2_maplot.png)
+
+- `/Differential_expression/DESeq2/pvalues.pdf`: This PDF file provides graphical representations, such as histograms or scatter plots, illustrating the distribution and significance of p-values calculated during the DESeq2 analysis.
+  ![](./images/deseq2_pvalue-hist.png)
+
+- `/Quality_plots/DESeq2/boxplot.pdf`: This PDF file displays boxplots depicting the distribution of normalized count expression values across samples, allowing for the assessment of data variability and potential batch effects.
+  ![](./images/deseq2_boxplot.png)
+
+- `/Quality_plots/DESeq2/cluster_dendrogram.pdf`: This PDF file presents a dendrogram visualization illustrating the hierarchical clustering of samples based on gene expression profiles, enabling the identification of sample similarities and differences.
+  ![](./images/deseq2_cluster_dendogram.png)
+
+- `/Quality_plots/DESeq2/heatmapCount_all_genes.pdf`: This PDF file contains a heatmap visualization showing the expression patterns of all genes analyzed in the experiment, facilitating the identification of gene expression trends and patterns.
+  ![](./images/deseq2_heatmap_all.png)
+
+- `/Quality_plots/DESeq2/heatmapCount_top20_highest_expression.pdf`: This PDF file presents a heatmap visualization highlighting the expression patterns of the top 20 genes with the highest expression levels across samples, aiding in the identification of highly expressed genes.
+
+  ![](./images/deseq2_heatmap-top-20-genes.png)
+
+- `/Quality_plots/DESeq2/heatmap_sample_to_sample.pdf`: This PDF file contains a heatmap visualization illustrating the pairwise sample-to-sample correlation matrix based on gene expression profiles, enabling the assessment of sample similarities and reproducibility.
+
+  ![](./images/deseq2_sample-to-sample.png)
+
+- `/Quality_plots/DESeq2/plotDispersions.pdf`: This PDF file displays dispersion plots showing the relationship between the mean expression levels and the dispersion estimates for each gene, allowing for the assessment of data variability and the adequacy of the statistical model.
+
+  ![](./images/deseq2_dispersion-estimate.png)
+
+- `/Quality_plots/DESeq2/plotPCA.pdf`: This PDF file presents a PCA (Principal Component Analysis) plot visualizing the distribution of samples in a multidimensional space based on their gene expression profiles, allowing for the exploration of sample relationships and potential batch effects.
+ ![](./images/deseq2_pca.png) + +- `/Quality_plots/DESeq2/plotSD.pdf`: The standard deviation of the transformed data across samples, plotted against the mean, using the shifted logarithm transformation, the regularized log transformation and the variance stabilizing transformation. This plot enables the assessment of data variability and the identification of potential outliers. + + ![](./images/deseq2_plotSD.png) + +- `99-stats/Quality_plots/`: This folder contains the same quality plots as described above, but they are generated considering all samples in the service without accounting for the experimental design specified in DESeq2. This allows for a general overview of the data in the service without incorporating the experimental design. + +
diff --git a/bu_isciii/assets/reports/md/sarek.md b/bu_isciii/assets/reports/md/sarek.md new file mode 100755 index 00000000..783ff1eb --- /dev/null +++ b/bu_isciii/assets/reports/md/sarek.md @@ -0,0 +1,1318 @@ +# nf-core/sarek: Output + +## Introduction + +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. + +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. + +## Pipeline overview + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +- [Directory Structure](#directory-structure) +- [Preprocessing](#preprocessing) + - [Preparation of input files (FastQ or (u)BAM)](#preparation-of-input-files-fastq-or-ubam) + - [Trim adapters](#trim-adapters) + - [Split FastQ files](#split-fastq-files) + - [UMI consensus](#umi-consensus) + - [Map to Reference](#map-to-reference) + - [BWA](#bwa) + - [BWA-mem2](#bwa-mem2) + - [DragMap](#dragmap) + - [Sentieon BWA mem](#sentieon-bwa-mem) + - [Mark Duplicates](#mark-duplicates) + - [GATK MarkDuplicates (Spark)](#gatk-markduplicates-spark) + - [Sentieon LocusCollector and Dedup](#sentieon-locuscollector-and-dedup) + - [Base Quality Score Recalibration](#base-quality-score-recalibration) + - [GATK BaseRecalibrator (Spark)](#gatk-baserecalibrator-spark) + - [GATK ApplyBQSR (Spark)](#gatk-applybqsr-spark) + - [CSV files](#csv-files) +- [Variant Calling](#variant-calling) + - [SNVs and small indels](#snvs-and-small-indels) + - [bcftools](#bcftools) + - [DeepVariant](#deepvariant) + - [FreeBayes](#freebayes) + - [GATK HaplotypeCaller](#gatk-haplotypecaller) + - [GATK Germline Single Sample Variant Calling](#gatk-germline-single-sample-variant-calling) + - [GATK Joint Germline Variant Calling](#gatk-joint-germline-variant-calling) + - [GATK Mutect2](#gatk-mutect2) + - [Sentieon DNAscope](#sentieon-dnascope) + - [Sentieon DNAscope joint germline variant calling](#sentieon-dnascope-joint-germline-variant-calling) + - [Sentieon Haplotyper](#sentieon-haplotyper) + - [Sentieon Haplotyper joint germline variant calling](#sentieon-haplotyper-joint-germline-variant-calling) + - [Strelka2](#strelka2) + - [Structural Variants](#structural-variants) + - [Manta](#manta) + - [TIDDIT](#tiddit) + - [Sample heterogeneity, ploidy and CNVs](#sample-heterogeneity-ploidy-and-cnvs) + - [ASCAT](#ascat) + - [CNVKit](#cnvkit) + - [Control-FREEC](#control-freec) + - [Microsatellite instability (MSI)](#microsatellite-instability-msi) + - [MSIsensorPro](#msisensorpro) + - [Concatenation](#concatenation) +- [Variant annotation](#variant-annotation) + - [snpEff](#snpeff) + - [VEP](#vep) + - [BCFtools annotate](#bcftools-annotate) +- [Quality control and reporting](#quality-control-and-reporting) + - [Quality control](#quality-control) + - [FastQC](#fastqc) + - [FastP](#fastp) + - [Mosdepth](#mosdepth) + - [NGSCheckMate](#ngscheckmate) + - [GATK MarkDuplicates reports](#gatk-markduplicates-reports) + - [Sentieon Dedup reports](#sentieon-dedup-reports) + - [samtools stats](#samtools-stats) + - [bcftools stats](#bcftools-stats) + - [VCFtools](#vcftools) + - [snpEff reports](#snpeff-reports) + - [VEP reports](#vep-reports) + - [Reporting](#reporting) + - [MultiQC](#multiqc) + - [Pipeline information](#pipeline-information) +- [Reference files](#reference-files) + +## Directory Structure + +The default directory 
structure is as follows + +``` +{outdir} +├── csv +├── multiqc +├── pipeline_info +├── preprocessing +│ ├── markduplicates +│ └── +│ ├── recal_table +│ └── +│ └── recalibrated +│ └── +├── reference +└── reports + ├── + └── +work/ +.nextflow.log +``` + +## Preprocessing + +Sarek pre-processes raw FastQ files or unmapped BAM files, based on [GATK best practices](https://gatk.broadinstitute.org/hc/en-us/sections/360007226651-Best-Practices-Workflows). + +### Preparation of input files (FastQ or (u)BAM) + +[FastP](https://github.com/OpenGene/fastp) is a tool designed to provide all-in-one preprocessing for FastQ files and as such is used for trimming and splitting. By default, these files are not published. However, if publishing is enabled, please be aware that these files are only published once, meaning if trimming and splitting is enabled, then the resulting files will be sharded FastQ files with trimmed reads. If only one of them is enabled then the files contain either trimmed or split reads, respectively. + +#### Trim adapters + +[FastP](https://github.com/OpenGene/fastp) supports global trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek +`--detect_adapter_for_pe` is set by default which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look into the parameter docs. + +The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in: + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/fastp/`** + +- `__{1,2}.fastp.fastq.gz` + - Bgzipped FastQ file + +
+ +#### Split FastQ files + +[FastP](https://github.com/OpenGene/fastp) supports splitting of one FastQ file into multiple files, allowing parallel alignment of sharded FastQ files. To enable splitting, the number of reads per output file can be specified. For more information, take a look at the parameter `--split_fastq` in the parameter docs. + +These files are intermediate and by default neither placed in the output folder nor kept in the final files delivered to users. Set `--save_split` to enable publishing of these files to: + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/fastp//`** + +- `` + - Bgzipped FastQ file + +
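+For example, a minimal launch publishing both trimmed and split FastQ files might look like the sketch below, using the `--save_trimmed`, `--split_fastq` and `--save_split` parameters described above (the input, outdir and profile values are placeholders): + +```bash +# Hypothetical launch publishing trimmed and split FastQ files; +# samplesheet.csv and results/ are placeholder paths. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --save_trimmed --split_fastq 50000000 --save_split +``` + +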
+ +#### UMI consensus + +Sarek can process UMI reads using [fgbio](http://fulcrumgenomics.github.io/fgbio/tools/latest/) tools. + +These files are intermediate and by default neither placed in the output folder nor kept in the final files delivered to users. Set `--save_split` to enable publishing of these files to: + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/umi//`** + +- `` + +**Output directory: `{outdir}/reports/umi/`** + +- `` + +
+ +### Map to Reference + +#### BWA + +[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html). + +#### BWA-mem2 + +[BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html). + +#### DragMap + +[DragMap](https://github.com/Illumina/dragmap) is an open-source software implementation of the DRAGEN mapper, which the Illumina team created to provide an open-source way to produce the same results as their proprietary DRAGEN hardware. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html). + +These files are intermediate and by default neither placed in the output folder nor kept in the final files delivered to users. Set `--save_mapped` to enable publishing; additionally, add the flag `--save_output_as_bam` for publishing in BAM format. + +#### Sentieon BWA mem + +Sentieon [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax) is a subroutine for mapping low-divergent sequences against a large reference genome. It is part of the proprietary software package [DNAseq](https://www.sentieon.com/detailed-description-of-pipelines/#dnaseq) from [Sentieon](https://www.sentieon.com/). + +The aligned reads are coordinate-sorted with Sentieon. + +
+Output files for all mappers and samples + +The alignment files (BAM or CRAM) produced by the chosen aligner are not published by default. CRAM output files will not be saved in the output-folder (`outdir`), unless the flag `--save_mapped` is used. BAM output can be selected by setting the flag `--save_output_as_bam`. + +**Output directory: `{outdir}/preprocessing/mapped//`** + +- if `--save_mapped`: `.sorted.cram` and `.sorted.cram.crai` + + - CRAM file and index + +- if `--save_mapped --save_output_as_bam`: `.sorted.bam` and `.sorted.bam.bai` + - BAM file and index +
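+As a sketch, a run selecting an aligner and publishing the mapped reads in BAM format could look as follows (assuming the `--aligner` parameter from the parameter docs; input and outdir values are placeholders): + +```bash +# Hypothetical launch: map with BWA-mem2 and publish mapped reads as BAM. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --aligner bwa-mem2 --save_mapped --save_output_as_bam +``` + +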
+ +### Mark Duplicates + +During duplicate marking, read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, causing the marked pairs to be ignored by default during the variant discovery process. + +For further reading and documentation see the [data pre-processing for variant discovery from the GATK best practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery). + +#### GATK MarkDuplicates (Spark) + +By default, Sarek will use [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/5358880192027-MarkDuplicates-Picard-). + +To use the corresponding spark implementation [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark), please specify `--use_gatk_spark markduplicates`. The resulting files are converted to CRAM with either [samtools](https://www.htslib.org/doc/samtools.html), when GATK MarkDuplicates is used, or, implicitly, by GATK MarkDuplicatesSpark. + +The resulting CRAM files are delivered to the users. + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/markduplicates//`** + +- `.md.cram` and `.md.cram.crai` + - CRAM file and index +- if `--save_output_as_bam`: + - `.md.bam` and `.md.bam.bai` + +
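+For instance, a run using the Spark implementation for duplicate marking could be launched as in the sketch below (input and outdir values are placeholders): + +```bash +# Hypothetical launch using GATK MarkDuplicatesSpark via --use_gatk_spark. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --use_gatk_spark markduplicates +``` + +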
+ +#### Sentieon LocusCollector and Dedup + +The subroutines LocusCollector and Dedup are part of Sentieon's DNAseq package, which provides speedup versions of the standard GATK tools; together, these two subroutines correspond to GATK's MarkDuplicates. + +The subroutine [LocusCollector](https://support.sentieon.com/manual/usages/general/#driver-algorithm-syntax) collects read information that will be used for removing or tagging duplicate reads; its output is the score file indicating which reads are likely duplicates. + +The subroutine [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm) marks or removes duplicate reads based on the score file supplied by LocusCollector, and produces a BAM or CRAM file. + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/sentieon_dedup//`** + +- `.dedup.cram` and `.dedup.cram.crai` + - CRAM file and index +- if `--save_output_as_bam`: + - `.dedup.bam` and `.dedup.bam.bai` + +
+ +### Base Quality Score Recalibration + +During Base Quality Score Recalibration, systematic errors in the base quality scores are detected and corrected by applying machine learning. This is important for evaluating the correct call of a variant during the variant discovery process. However, this is not needed for all combinations of tools in Sarek. Notably, this should be turned off when working with UMI-tagged reads or when using DragMap (see [here](https://gatk.broadinstitute.org/hc/en-us/articles/4407897446939--How-to-Run-germline-single-sample-short-variant-discovery-in-DRAGEN-mode)) as the mapper. + +For further reading and documentation see the [technical documentation by GATK](https://gatk.broadinstitute.org/hc/en-us/articles/360035890531-Base-Quality-Score-Recalibration-BQSR-). + +#### GATK BaseRecalibrator (Spark) + +[GATK BaseRecalibrator](https://gatk.broadinstitute.org/hc/en-us/articles/360042477672-BaseRecalibrator) generates a recalibration table based on various co-variates. + +To use the corresponding spark implementation [GATK BaseRecalibratorSpark](https://gatk.broadinstitute.org/hc/en-us/articles/5358896138011-BaseRecalibrator), please specify `--use_gatk_spark baserecalibrator`. + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/recal_table//`** + +- `.recal.table` + - Recalibration table associated with the duplicate-marked CRAM file. + +
+ +#### GATK ApplyBQSR (Spark) + +[GATK ApplyBQSR](https://gatk.broadinstitute.org/hc/en-us/articles/5358826654875-ApplyBQSR) recalibrates the base qualities of the input reads based on the recalibration table produced by the [GATK BaseRecalibrator](#gatk-baserecalibrator) tool. + +Specify `--use_gatk_spark baserecalibrator` to instead use [GATK ApplyBQSRSpark](https://gatk.broadinstitute.org/hc/en-us/articles/5358898266011-ApplyBQSRSpark-BETA-), the corresponding Spark implementation. + +The resulting recalibrated CRAM files are delivered to the user. Recalibrated CRAM files are usually 2-3 times larger than the duplicate-marked CRAM files. + +
+Output files for all samples + +**Output directory: `{outdir}/preprocessing/recalibrated//`** + +- `.recal.cram` and `.recal.cram.crai` + - CRAM file and index +- if `--save_output_as_bam`: + - `.recal.bam` and `.recal.bam.bai` + - BAM file and index +
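+One way to turn BQSR off, e.g. for UMI-tagged reads or DragMap, is the `--skip_tools` option used elsewhere in this document; the sketch below assumes `baserecalibrator` is a valid value for it (input and outdir values are placeholders): + +```bash +# Hypothetical launch skipping Base Quality Score Recalibration; +# assumes baserecalibrator is a valid --skip_tools value. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --skip_tools baserecalibrator +``` + +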
+ +### CSV files + +The CSV files are auto-generated and can be used by Sarek for further processing and/or variant calling. + +See the [`input`](usage#input-sample-sheet-configurations) section in the usage documentation for further reading and documentation on how to make the most of them. + +
+Output files: + +**Output directory: `{outdir}/preprocessing/csv`** + +- `mapped.csv` + - if `--save_mapped` + - CSV containing an entry for each sample with the columns `patient,sample,sex,status,bam,bai` +- `markduplicates_no_table.csv` + - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai` +- `markduplicates.csv` + - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai,table` +- `recalibrated.csv` + - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai` +- `variantcalled.csv` + - CSV containing an entry for each sample with the columns `patient,sample,vcf` +
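+As a sketch of how these CSVs can be reused, the command below restarts the pipeline from the recalibrated CRAM files, assuming Sarek's `--step` parameter accepts `variant_calling` (paths and tool selection are placeholders): + +```bash +# Hypothetical restart from the auto-generated CSV at the variant calling step. +nextflow run nf-core/sarek -profile docker --input results/preprocessing/csv/recalibrated.csv --step variant_calling --tools strelka,manta --outdir results +``` + +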
+ +## Variant Calling + +The results regarding variant calling are collected in `{outdir}/variantcalling/`. +If some results from a variant caller do not appear here, please check out the `--tools` section in the parameter [documentation](https://nf-co.re/sarek/latest/parameters). + +(Recalibrated) CRAM files can be used as an input to start the variant calling. + +### SNVs and small indels + +For single nucleotide variants (SNVs) and small indels, multiple tools are available for normal (germline), tumor-only, and tumor-normal (somatic) paired data. For a list of the appropriate tool(s) for the data and sequencing type at hand, please check [here](usage#which-tool). + +#### bcftools + +[bcftools mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup) generates a pileup of a CRAM file, which is followed by [bcftools call](https://samtools.github.io/bcftools/bcftools.html#call); the resulting calls are filtered with `-i 'count(GT==\"RR\")==0'`. +For further reading and documentation see the [bcftools manual](https://samtools.github.io/bcftools/howtos/variant-calling.html). + +
+Output files for all samples + +**Output directory: `{outdir}/variantcalling/bcftools//`** + +- `.bcftools.vcf.gz` and `.bcftools.vcf.gz.tbi` + - VCF with tabix index + +
+ +#### DeepVariant + +[DeepVariant](https://github.com/google/deepvariant) is a deep learning-based variant caller that takes aligned reads, produces pileup image tensors from them, classifies each tensor using a convolutional neural network and finally reports the results in a standard VCF or gVCF file. For further documentation take a look [here](https://github.com/google/deepvariant/tree/r1.4/docs). + +
+Output files for normal samples + +**Output directory: `{outdir}/variantcalling/deepvariant//`** + +- `.deepvariant.vcf.gz` and `.deepvariant.vcf.gz.tbi` + - VCF with tabix index +- `.deepvariant.g.vcf.gz` and `.deepvariant.g.vcf.gz.tbi` + - gVCF with tabix index +
+ +#### FreeBayes + +[FreeBayes](https://github.com/ekg/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment. For further reading and documentation see the [FreeBayes manual](https://github.com/ekg/freebayes/blob/master/README.md#user-manual-and-guide). + +
+Output files for all samples + +**Output directory: `{outdir}/variantcalling/freebayes/{sample,normalsample_vs_tumorsample}/`** + +- `.freebayes.vcf.gz` and `.freebayes.vcf.gz.tbi` + - VCF with tabix index + +
+ +#### GATK HaplotypeCaller + +[GATK HaplotypeCaller](https://gatk.broadinstitute.org/hc/en-us/articles/5358864757787-HaplotypeCaller) calls germline SNPs and indels via local re-assembly of haplotypes. + +
+Output files for normal samples + +**Output directory: `{outdir}/variantcalling/haplotypecaller//`** + +- `.haplotypecaller.vcf.gz` and `.haplotypecaller.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### GATK Germline Single Sample Variant Calling + +[GATK Single Sample Variant Calling](https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-) +uses HaplotypeCaller in its default single-sample mode to call variants. The VCF that HaplotypeCaller emits errs on the side of sensitivity; therefore, the variants are filtered by first running the [CNNScoreVariants](https://gatk.broadinstitute.org/hc/en-us/articles/5358904862107-CNNScoreVariants) tool. This tool annotates each variant with a score indicating the model's prediction of the quality of each variant. To apply filters based on those scores, run the [FilterVariantTranches](https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches) tool with SNP and INDEL sensitivity tranches appropriate for your task. + +If the haplotype-called VCF files are not filtered, then Sarek should be run with at least one of the options `--dbsnp` or `--known_indels`. + +
+Output files for normal samples + +**Output directory: `{outdir}/variantcalling/haplotypecaller//`** + +- `.haplotypecaller.filtered.vcf.gz` and `.haplotypecaller.filtered.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### GATK Joint Germline Variant Calling + +[GATK Joint germline Variant Calling](https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-) uses HaplotypeCaller per sample in `gvcf` mode. Next, the gVCFs are consolidated from multiple samples into a [GenomicsDB](https://gatk.broadinstitute.org/hc/en-us/articles/5358869876891-GenomicsDBImport) datastore. After joint [genotyping](https://gatk.broadinstitute.org/hc/en-us/articles/5358906861083-GenotypeGVCFs), [VQSR](https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. + +
+Output files from joint germline variant calling + +**Output directory: `{outdir}/variantcalling/haplotypecaller//`** + +- `.haplotypecaller.g.vcf.gz` and `.haplotypecaller.g.vcf.gz.tbi` + - gVCF with tabix index + +**Output directory: `{outdir}/variantcalling/haplotypecaller/joint_variant_calling/`** + +- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi` + - VCF with tabix index +- `joint_germline_recalibrated.vcf.gz` and `joint_germline_recalibrated.vcf.gz.tbi` + - variant recalibrated VCF with tabix index (if VQSR is applied) + +
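+A joint germline run could be sketched as follows, assuming `--joint_germline` is the parameter enabling this subworkflow (input and outdir values are placeholders): + +```bash +# Hypothetical joint germline calling with HaplotypeCaller across all samples; +# assumes the --joint_germline flag enables this subworkflow. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --tools haplotypecaller --joint_germline +``` + +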
+ +#### GATK Mutect2 + +[GATK Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2) calls somatic SNVs and indels via local assembly of haplotypes. +When `--joint_mutect2` is used, Mutect2 subworkflow outputs will be saved in a subfolder named with the patient ID and `{patient}.mutect2.vcf.gz` file will contain variant calls from all of the normal and tumor samples of the patient. +For further reading and documentation see the [Mutect2 manual](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132). +It is not required, but recommended to have a [panel of normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) using at least 40 normal samples to get filtered somatic calls. When using `--genome GATK.GRCh38`, a panel-of-normals file is available. However, it is _highly_ recommended to create one matching your tumor samples. Creating your own panel-of-normals is currently not natively supported by the pipeline. See [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132) for how to create one manually. + +
+Output files for tumor-only and tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/mutect2/{sample,tumorsample_vs_normalsample,patient}/`** + +Files created: + +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz` and `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz.tbi` + - unfiltered (raw) Mutect2 calls VCF with tabix index +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz.stats` + - a stats file generated during calling of raw variants (needed for filtering) +- `{sample,tumorsample_vs_normalsample}.mutect2.contamination.table` + - table calculating the fraction of reads coming from cross-sample contamination +- `{sample,tumorsample_vs_normalsample}.mutect2.segmentation.table` + - table containing segmentation of the tumor by minor allele fraction +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.artifactprior.tar.gz` + - prior probabilities for read orientation artifacts +- `{sample,tumorsample,normalsample}.mutect2.pileups.table` + - tabulates pileup metrics for inferring contamination +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz` and `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz.tbi` + - filtered Mutect2 calls VCF with tabix index based on the probability that a variant is somatic +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz.filteringStats.tsv` + - a stats file generated during the filtering of Mutect2 called variants + +
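+Using the `--joint_mutect2` option described above, a somatic run could be sketched as follows (input and outdir values are placeholders): + +```bash +# Hypothetical per-patient multi-sample somatic calling with Mutect2. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --tools mutect2 --joint_mutect2 +``` + +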
+ +#### Sentieon DNAscope + +[Sentieon DNAscope](https://support.sentieon.com/appnotes/dnascope_ml/#dnascope-germline-variant-calling-with-a-machine-learning-model) is a variant-caller which aims at outperforming GATK's HaplotypeCaller in terms of both speed and accuracy. DNAscope allows you to use a machine learning model to perform variant calling with higher accuracy by improving the candidate detection and filtering. + +
+Unfiltered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope//`** + +- `.dnascope.unfiltered.vcf.gz` and `.dnascope.unfiltered.vcf.gz.tbi` + - VCF with tabix index + +
+ +The output from Sentieon's DNAscope can be controlled through the option `--sentieon_dnascope_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). + +Unless `dnascope_filter` is listed under `--skip_tools` in the nextflow command, Sentieon's [DNAModelApply](https://support.sentieon.com/manual/usages/general/#dnamodelapply-algorithm) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files. + +
+Filtered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope//`** + +- `.dnascope.filtered.vcf.gz` and `.dnascope.filtered.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### Sentieon DNAscope joint germline variant calling + +In Sentieon's package DNAscope, joint germline variant calling is done by first running Sentieon's DNAscope in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAscope. + +
+Output files from joint germline variant calling + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope//`** + +- `.dnascope.g.vcf.gz` and `.dnascope.g.vcf.gz.tbi` + - gVCF with tabix index + +**Output directory: `{outdir}/variantcalling/sentieon_dnascope/joint_variant_calling/`** + +- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi` + - VCF with tabix index + +
+ +#### Sentieon Haplotyper + +[Sentieon Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) is Sentieon's speedup version of GATK's HaplotypeCaller (see above). + +
+Unfiltered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_haplotyper//`** + +- `.haplotyper.unfiltered.vcf.gz` and `.haplotyper.unfiltered.vcf.gz.tbi` + - VCF with tabix index + +
+ +The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions). + +Unless `haplotyper_filter` is listed under `--skip_tools` in the nextflow command, GATK's CNNScoreVariants and FilterVariantTranches (see above) are applied to the unfiltered VCF-files in order to obtain filtered VCF-files. + +
+Filtered VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/sentieon_haplotyper//`** + +- `.haplotyper.filtered.vcf.gz` and `.haplotyper.filtered.vcf.gz.tbi` + - VCF with tabix index + +
+ +##### Sentieon Haplotyper joint germline variant calling + +In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. + +
+Output files from joint germline variant calling + +**Output directory: `{outdir}/variantcalling/sentieon_haplotyper//`** + +- `.haplotyper.g.vcf.gz` and `.haplotyper.g.vcf.gz.tbi` + - gVCF with tabix index + +**Output directory: `{outdir}/variantcalling/sentieon_haplotyper/joint_variant_calling/`** + +- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi` + - VCF with tabix index +- `joint_germline_recalibrated.vcf.gz` and `joint_germline_recalibrated.vcf.gz.tbi` + - variant recalibrated VCF with tabix index (if VarCal is applied) + +
+ +#### Strelka2 + +[Strelka2](https://github.com/Illumina/strelka) is a fast and accurate small variant caller optimized for analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs. For further reading and documentation see the [Strelka2 user guide](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md). If [Strelka2](https://github.com/Illumina/strelka) is used for somatic variant calling and [Manta](https://github.com/Illumina/manta) is also specified in tools, the output candidate indels from [Manta](https://github.com/Illumina/manta) are used according to [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example). +For further downstream analysis, take a look [here](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md#interpreting-the-germline-multi-sample-variants-vcf). + +
+Output files for all single samples (normal or tumor-only) + +**Output directory: `{outdir}/variantcalling/strelka//`** + +- `.strelka.genome.vcf.gz` and `.strelka.genome.vcf.gz.tbi` + - genome VCF with tabix index +- `.strelka.variants.vcf.gz` and `.strelka.variants.vcf.gz.tbi` + - VCF with tabix index with all potential variant loci across the sample. Note this file includes non-variant loci if they have a non-trivial level of variant evidence or contain one or more alleles for which genotyping has been forced. +
+ +
+Output files for tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/strelka//`** + +- `.strelka.somatic_indels.vcf.gz` and `.strelka.somatic_indels.vcf.gz.tbi` + - VCF with tabix index with all somatic indels inferred in the tumor sample. +- `.strelka.somatic_snvs.vcf.gz` and `.strelka.somatic_snvs.vcf.gz.tbi` + - VCF with tabix index with all somatic SNVs inferred in the tumor sample. + +
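+To follow the Strelka Best Practices mentioned above, both callers can be listed in `--tools` so that Manta's candidate indels are passed to Strelka2 (input and outdir values are placeholders): + +```bash +# Hypothetical somatic run combining Manta candidate indels with Strelka2. +nextflow run nf-core/sarek -profile docker --input samplesheet.csv --outdir results --tools strelka,manta +``` + +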
+ +### Structural Variants + +#### Manta + +[Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads. +It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. +[Manta](https://github.com/Illumina/manta) provides a candidate list for small indels that can be fed to [Strelka2](https://github.com/Illumina/strelka) following [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example). For further reading and documentation see the [Manta user guide](https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md). + +
+Output files for normal samples + +**Output directory: `{outdir}/variantcalling/manta//`** + +- `.manta.diploid_sv.vcf.gz` and `.manta.diploid_sv.vcf.gz.tbi` + - VCF with tabix index containing SVs and indels scored and genotyped under a diploid model for the sample. +
+ +
+Output files for tumor-only samples + +**Output directory: `{outdir}/variantcalling/manta//`** + +- `.manta.tumor_sv.vcf.gz` and `.manta.tumor_sv.vcf.gz.tbi` + - VCF with tabix index containing a subset of the candidateSV.vcf.gz file after removing redundant candidates and small indels less than the minimum scored variant size (50 by default). The SVs are not scored, but include additional details: (1) paired and split read supporting evidence counts for each allele (2) a subset of the filters from the scored tumor-normal model are applied to the single tumor case to improve precision. +
+ +
+Output files for tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/manta//`** + +- `.manta.diploid_sv.vcf.gz` and `.manta.diploid_sv.vcf.gz.tbi` + - VCF with tabix index containing SVs and indels scored and genotyped under a diploid model for the sample. In the case of a tumor/normal subtraction, the scores in this file do not reflect any information from the tumor sample. +- `.manta.somatic_sv.vcf.gz` and `.manta.somatic_sv.vcf.gz.tbi` + - VCF with tabix index containing SVs and indels scored under a somatic variant model. +
+ +#### TIDDIT + +[TIDDIT](https://github.com/SciLifeLab/TIDDIT) identifies intra- and inter-chromosomal translocations, deletions, tandem-duplications and inversions. For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md). + +
+Output files for normal and tumor-only samples + +**Output directory: `{outdir}/variantcalling/tiddit//`** + +- `.tiddit.vcf.gz` and `.tiddit.vcf.gz.tbi` + - VCF with tabix index containing SV calls +- `.tiddit.ploidies.tab` + - tab file describing the estimated ploidy and coverage across each contig + +
+ +
+Output files for tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/tiddit//`** + +- `.tiddit.normal.vcf.gz` and `.tiddit.normal.vcf.gz.tbi` + - VCF with tabix index containing SV calls +- `.tiddit.tumor.vcf.gz` and `.tiddit.tumor.vcf.gz.tbi` + - VCF with tabix index containing SV calls +- `_sv_merge.tiddit.vcf.gz` and `_sv_merge.tiddit.vcf.gz.tbi` + - merged tumor/normal VCF with tabix index +- `.tiddit.ploidies.tab` + - tab file describing the estimated ploidy and coverage across each contig + +
+ +### Sample heterogeneity, ploidy and CNVs + +#### ASCAT + +[ASCAT](https://github.com/VanLoo-lab/ascat) is a software tool for performing allele-specific copy number analysis of tumor samples and for estimating tumor ploidy and purity (normal contamination). +It infers tumor purity and ploidy and calculates whole-genome allele-specific copy number profiles. +The [ASCAT](https://github.com/VanLoo-lab/ascat) process gives several images as output, described in detail in this [book chapter](http://www.ncbi.nlm.nih.gov/pubmed/22130873). +Running ASCAT on NGS data requires that the BAM files are converted into BAF and LogR values. +This is done internally using the software [AlleleCount](https://github.com/cancerit/alleleCount). For further reading and documentation see the [ASCAT manual](https://www.crick.ac.uk/research/labs/peter-van-loo/software). + +
+Output files for tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/ascat//`** + +- `.tumour.ASCATprofile.png` + - image with information about allele-specific copy number profile +- `.tumour.ASPCF.png` + - image with information about allele-specific copy number segmentation +- `.before_correction_Tumour..tumour.png` + - image with information about raw profile of tumor sample of logR and BAF values before GC correction +- `.before_correction_Tumour..germline.png` + - image with information about raw profile of normal sample of logR and BAF values before GC correction +- `.after_correction_GC_Tumour..tumour.png` + - image with information about GC and RT corrected logR and BAF values of tumor sample after GC correction +- `.after_correction_GC_Tumour..germline.png` + - image with information about GC and RT corrected logR and BAF values of normal sample after GC correction +- `.tumour.sunrise.png` + - image visualising the range of ploidy and tumor percentage values +- `.metrics.txt` + - file with information about different metrics from ASCAT profiles +- `.cnvs.txt` + - file with information about CNVs +- `.purityploidy.txt` + - file with information about purity and ploidy +- `.segments.txt` + - file with information about copy number segments +- `.tumour_tumourBAF.txt` and `.tumour_normalBAF.txt` + - file with beta allele frequencies +- `.tumour_tumourLogR.txt` and `.tumour_normalLogR.txt` + - file with total copy number on a logarithmic scale + +The text file `.cnvs.txt` contains predictions about copy number state for all the segments. +The output is a tab delimited text file with the following columns: + +- _chr_: chromosome number +- _startpos_: start position of the segment +- _endpos_: end position of the segment +- _nMajor_: number of copies of one of the alleles (for example the chromosome inherited from one parent) +- _nMinor_: number of copies of the other allele (for example the chromosome inherited from the other parent) + +The file `.cnvs.txt` contains all segments predicted by ASCAT, both those with normal copy number (nMinor = 1 and nMajor = 1) and those corresponding to copy number aberrations. + +
+ +#### CNVKit + +[CNVKit](https://cnvkit.readthedocs.io/en/stable/) is a toolkit to infer and visualize copy number from high-throughput DNA sequencing data. It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina. For further reading and documentation, see the [CNVKit Documentation](https://cnvkit.readthedocs.io/en/stable/plots.html) + +
+Output files for normal and tumor-only samples + +**Output directory: `{outdir}/variantcalling/cnvkit//`** + +- `.antitargetcoverage.cnn` + - file containing coverage information +- `.targetcoverage.cnn` + - file containing coverage information +- `-diagram.pdf` + - file with plot of copy numbers or segments on chromosomes +- `-scatter.png` + - file with plot of bin-level log2 coverages and segmentation calls +- `.bintest.cns` + - file containing copy number segment information +- `.cnr` + - file containing copy number ratio information +- `.cns` + - file containing copy number segment information +- `.call.cns` + - file containing copy number segment information +- `.genemetrics.tsv` + - file containing per gene copy number information (if input files are annotated) +
+ +
+Output files for tumor/normal samples + +**Output directory: `{outdir}/variantcalling/cnvkit//`** + +- `.antitargetcoverage.cnn` + - file containing coverage information +- `.targetcoverage.cnn` + - file containing coverage information +- `.antitargetcoverage.cnn` + - file containing coverage information +- `.targetcoverage.cnn` + - file containing coverage information +- `.bintest.cns` + - file containing copy number segment information +- `-scatter.png` + - file with plot of bin-level log2 coverages and segmentation calls +- `-diagram.pdf` + - file with plot of copy numbers or segments on chromosomes +- `.cnr` + - file containing copy number ratio information +- `.cns` + - file containing copy number segment information +- `.call.cns` + - file containing copy number segment information +- `.genemetrics.tsv` + - file containing per gene copy number information (if input files are annotated) +
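+Outside of Sarek, the corresponding standalone CNVKit command is roughly the sketch below (all file names are placeholders; the pipeline wires this step up internally and its exact invocation may differ): + +```bash +# Hypothetical standalone CNVKit tumor/normal batch run; all paths are placeholders. +cnvkit.py batch tumor.bam --normal normal.bam --fasta genome.fa --targets targets.bed --output-dir cnvkit/ +``` + +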
+ +#### Control-FREEC + +[Control-FREEC](https://github.com/BoevaLab/FREEC) is a tool for detection of copy-number changes and allelic imbalances (including loss of heterozygosity (LOH)) using deep-sequencing data. +[Control-FREEC](https://github.com/BoevaLab/FREEC) automatically computes, normalizes, segments copy number and beta allele frequency profiles, then calls copy number alterations and LOH. +It also detects subclonal gains and losses and evaluates the most likely average ploidy of the sample. For further reading and documentation see the [Control-FREEC Documentation](http://boevalab.inf.ethz.ch/FREEC/tutorial.html). + +
+Output files for tumor-only and tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/controlfreec/{tumorsample,tumorsample_vs_normalsample}/`** + +- `config.txt` + - Configuration file used to run Control-FREEC +- `_BAF.png` and `_BAF.png` + - image of BAF plot +- `_ratio.log2.png` and `_ratio.log2.png` + - image of ratio log2 plot +- `_ratio.png` and `_ratio.png` + - image of ratio plot +- `.bed` and `.bed` + - translated output to a .BED file (for viewing in the UCSC Genome Browser) +- `.circos.txt` and `.circos.txt` + - translated output to the Circos format +- `.p.value.txt` and `.p.value.txt` + - CNV file containing p_values for each call +- `_BAF.txt` and `.mpileup.gz_BAF.txt` + - file with beta allele frequencies for each possibly heterozygous SNP position +- `.tumor.mpileup.gz_CNVs` + - file with coordinates of predicted copy number alterations +- `_info.txt` and `.tumor.mpileup.gz_info.txt` + - parsable file with information about FREEC run +- `_ratio.BedGraph` and `.tumor.mpileup.gz_ratio.BedGraph` + - file with ratios in BedGraph format for visualization in the UCSC genome browser. The file contains tracks for normal copy number, gains and losses, and copy neutral LOH (\*). +- `_ratio.txt` and `.tumor.mpileup.gz_ratio.txt` + - file with ratios and predicted copy number alterations for each window +- `_sample.cpn` and `.tumor.mpileup.gz_sample.cpn` + - files with raw copy number profiles for the tumor sample +- `.normal.mpileup.gz_control.cpn` + - files with raw copy number profiles for the control sample +- `.cpn` + - file with GC-content profile + +
+ +### Microsatellite instability (MSI) + +[Microsatellite instability](https://en.wikipedia.org/wiki/Microsatellite_instability) is a genetic condition associated with deficiencies in the mismatch repair (MMR) system which causes a tendency to accumulate a high number of mutations (SNVs and indels). +An altered distribution of microsatellite length is associated with a missed replication slippage which would be corrected under normal MMR conditions. + +#### MSIsensorPro + +[MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) is a tool to detect the MSI status of a tumor scanning the length of the microsatellite regions. +It requires a normal sample for each tumour to differentiate the somatic and germline cases. For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). + +
+Output files for tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/msisensor//`** + +- `` + - MSI score output, contains information about the number of somatic sites. +- `_dis` + - The normal and tumor length distribution for each microsatellite position. +- `_germline` + - Germline sites detected. +- `_somatic` + - Somatic sites detected. +
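+The underlying two-step MSIsensorPro workflow can be sketched roughly as follows (file names are placeholders; within Sarek the reference scan is handled for you, see [Reference files](#reference-files)): + +```bash +# Hypothetical standalone MSIsensorPro run: scan the reference for microsatellites, +# then score a tumor/normal pair; all paths are placeholders. +msisensor-pro scan -d genome.fa -o microsatellites.list +msisensor-pro msi -d microsatellites.list -n normal.bam -t tumor.bam -o sample_msi +``` + +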
+ +### Concatenation + +Germline VCFs from `DeepVariant`, `FreeBayes`, `HaplotypeCaller`, `Haplotyper`, `Manta`, `bcftools mpileup`, `Strelka2`, or `Tiddit` are concatenated with `bcftools concat`. The field `SOURCE` is added to the VCF header to report the variant caller. + +
+Concatenated VCF-files for normal samples + +**Output directory: `{outdir}/variantcalling/concat//`** + +- `.germline.vcf.gz` and `.germline.vcf.gz.tbi` + - VCF with tabix index + +
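+A rough standalone equivalent of this concatenation step is sketched below (file names are placeholders; the pipeline's exact command, including how the `SOURCE` field is added to the header, may differ): + +```bash +# Hypothetical concatenation of per-caller germline VCFs for one sample; +# -a allows overlapping records and requires bgzipped, indexed inputs. +bcftools concat -a sample.strelka.vcf.gz sample.tiddit.vcf.gz -Oz -o sample.germline.vcf.gz +tabix -p vcf sample.germline.vcf.gz +``` + +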
+ +## Variant annotation + +This directory contains results from the final annotation steps: two tools are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html). Both results can also be combined by setting `--tools merge`. +All variants present in the called VCF files are annotated. For some variant callers this can mean that the variants are already filtered by `PASS`, for some this needs to be done during post-processing. + +### snpEff + +[snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox. +It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations. +The generated VCF header contains the software version and the used command line. For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary). + +
+Output files for all samples + +**Output directory: `{outdir}/annotation/{sample,tumorsample_vs_normalsample}`** + +- `{sample,tumorsample_vs_normalsample}._snpEff.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}._snpEff.ann.vcf.gz.tbi` + - VCF with tabix index +
+ +### VEP + +[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants, CNVs. +The generated VCF header contains the software version, as well as the version numbers for additional databases like [Clinvar](https://www.ncbi.nlm.nih.gov/clinvar/) or [dbSNP](https://www.ncbi.nlm.nih.gov/snp/) used in the [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html) line. +The format of the [consequence annotations](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) is also in the VCF header describing the `INFO` field. +For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html). + +Currently, it contains: + +- _Consequence_: impact of the variation, if there is any +- _Codons_: the codon change, i.e. cGt/cAt +- _Amino_acids_: change in amino acids, i.e. R/H if there is any +- _Gene_: ENSEMBL gene name +- _SYMBOL_: gene symbol +- _Feature_: actual transcript name +- _EXON_: affected exon +- _PolyPhen_: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/) +- _SIFT_: prediction by [SIFT](http://sift.bii.a-star.edu.sg/) +- _Protein_position_: Relative position of amino acid in protein +- _BIOTYPE_: Biotype of transcript or regulatory feature + +plus any additional fields selected via the plugins: [dbNSFP](https://sites.google.com/site/jpopgen/dbNSFP), [LOFTEE](https://github.com/konradjk/loftee), [SpliceAI](https://spliceailookup.broadinstitute.org/), [SpliceRegion](https://www.ensembl.info/2018/10/26/cool-stuff-the-vep-can-do-splice-site-variant-annotation/). + +
+Output files for all samples + +**Output directory: `{outdir}/annotation/{sample,tumorsample_vs_normalsample}`** + +- `{sample,tumorsample_vs_normalsample}._VEP.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}._VEP.ann.vcf.gz.tbi` + - VCF with tabix index + +
+ +### BCFtools annotate + +[BCFtools annotate](https://samtools.github.io/bcftools/bcftools.html#annotate) is used to add annotations to VCF files. The annotations are added to the INFO column of the VCF file. The annotations are added to the VCF header and the VCF header is updated with the new annotations. For further reading and documentation see the [BCFtools annotate manual](https://samtools.github.io/bcftools/bcftools.html#annotate). + +
+Output files for all samples + +- `{sample,tumorsample_vs_normalsample}._bcf.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}._bcf.ann.vcf.gz.tbi` + - VCF with tabix index + +
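+Generic `bcftools annotate` usage looks roughly like the sketch below (the annotation file and the `INFO/TAG` name are placeholders; the pipeline's exact invocation may differ): + +```bash +# Hypothetical annotation transfer: copy the placeholder INFO/TAG field from a +# tabix-indexed annotation VCF into the sample VCF. +bcftools annotate -a annotations.vcf.gz -c INFO/TAG -Oz -o sample.ann.vcf.gz sample.vcf.gz +``` + +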
+ +## Quality control and reporting + +### Quality control + +#### FastQC + +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +The plots display: + +- Sequence counts for each sample. +- Sequence Quality Histograms: The mean quality value across each base position in the read. +- Per Sequence Quality Scores: The number of reads with average quality scores. Shows if a subset of reads has poor quality. +- Per Base Sequence Content: The proportion of each base position for which each of the four normal DNA bases has been called. +- Per Sequence GC Content: The average GC content of reads. A normal random library typically has a roughly normal distribution of GC content. +- Per Base N Content: The percentage of base calls at each position for which an N was called. +- Sequence Length Distribution. +- Sequence Duplication Levels: The relative level of duplication found for each sequence. +- Overrepresented sequences: The total amount of overrepresented sequences found in each library. +- Adapter Content: The cumulative percentage count of the proportion of your library which has seen each of the adapter sequences at each position. + +
+Output files for all samples + +:::note +The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: + +**Output directory: `{outdir}/reports/fastqc/`** + +- `_fastqc.html` and `_fastqc.html` + - [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report containing quality metrics for your untrimmed raw FastQ files +- `_fastqc.zip` and `_fastqc.zip` + - Zip archive containing the [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report, tab-delimited data file and plot images + +
+ +#### FastP + +[FastP](https://github.com/OpenGene/fastp) is a tool designed to provide all-in-one preprocessing for FastQ files and is used for trimming and splitting. The tool then determines QC metrics for the processed reads. + +
+Output files for all samples + +**Output directory: `{outdir}/reports/fastp/`** + +- `_fastp.html` + - report in HTML format +- `_fastp.json` + - report in JSON format +- `_fastp.log` + - FastQ log file + +
+ +#### Mosdepth + +[Mosdepth](https://github.com/brentp/mosdepth) reports information for the evaluation of the quality of the provided alignment data. +In short, the basic statistics of the alignment (number of reads, coverage, GC-content, etc.) are summarized and a number of useful graphs are produced. +For further reading and documentation see the [Mosdepth documentation](https://github.com/brentp/mosdepth). + +Plots will show: + +- cumulative coverage distribution +- absolute coverage distribution +- average coverage per contig/chromosome + +
+Output files for all samples + +**Output directory: `{outdir}/reports/mosdepth/`** + +- `.{sorted,md,recal}.mosdepth.global.dist.txt` + - file used by [MultiQC](https://multiqc.info/), if `.region` file does not exist +- `.{sorted,md,recal}.mosdepth.region.dist.txt` + - file used by [MultiQC](https://multiqc.info/) +- `.{sorted,md,recal}.mosdepth.summary.txt` + - A summary of mean depths per chromosome and within specified regions per chromosome. +- `.{sorted,md,recal}.{per-base,regions}.bed.gz` + - per-base depth for targeted data, per-window (500bp) depth of WGS +- `.{sorted,md,recal}.regions.bed.gz.csi` + - CSI index for per-base depth for targeted data, per-window (500bp) depth of WGS +
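+A standalone run approximating this QC step, with the 500 bp WGS windows mentioned above, might look like the sketch below (file names are placeholders): + +```bash +# Hypothetical standalone mosdepth run on a recalibrated CRAM file; +# --by 500 computes depth in 500 bp windows, --fasta is needed for CRAM input. +mosdepth --by 500 --fasta genome.fa sample.recal sample.recal.cram +``` + +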
+ +#### NGSCheckMate + +[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool for determining whether samples come from the same genetic individual, using a set of commonly heterozygous SNPs. This enables the detection of sample mislabelling events. The output includes a text file indicating whether samples have matched or not according to the algorithm, as well as a dendrogram visualising these results. + +
+Output files for all samples + +**Output directory: `{outdir}/reports/ngscheckmate/`** + +- `ngscheckmate_all.txt` + - Tab delimited text file listing all the comparisons made, whether they were considered as a match, with the correlation and a normalised depth. +- `ngscheckmate_matched.txt` + - Tab delimited text file listing only the comparisons that were considered to match, with the correlation and a normalised depth. +- `ngscheckmate_output_corr_matrix.txt` + - Tab delimited text file containing a matrix of all correlations for all comparisons made. +- `vcfs/.vcf.gz` + - Set of vcf files for each sample. Contains calls for the set of SNP positions used to calculate sample relatedness. +
+ +#### GATK MarkDuplicates reports + +More information in the [GATK MarkDuplicates section](#gatk-markduplicates) + +Duplicates can arise during sample preparation _e.g._ library construction using PCR. +Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. +These duplication artifacts are referred to as optical duplicates. If [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/5358880192027-MarkDuplicates-Picard-) is used, the metrics file generated by the tool is used; if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used, the report is generated by [GATK4 EstimateLibraryComplexity](https://gatk.broadinstitute.org/hc/en-us/articles/5358838684187-EstimateLibraryComplexity-Picard-) on the mapped BAM files. +For further reading and documentation see the [MarkDuplicates manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/picard_sam_markduplicates_MarkDuplicates.php). + +The plot will show: + +- duplication statistics + +
+Output files for all samples + +**Output directory: `{outdir}/reports/markduplicates/`** + +- `.md.cram.metrics` + - file used by [MultiQC](https://multiqc.info/) +
+ +#### Sentieon Dedup reports + +Sentieon's DNAseq subroutine Dedup produces a metrics report much like the one produced by GATK's MarkDuplicates. The Dedup metrics are imported into MultiQC as custom content and displayed in a table. + +
+Output files for all samples + +**Output directory: `{outdir}/reports/sentieon_dedup/`** + +- `.dedup.cram.metrics` + - file used by [MultiQC](https://multiqc.info/). +
+ +#### samtools stats + +[samtools stats](https://www.htslib.org/doc/samtools.html) collects statistics from CRAM files and outputs in a text format. +For further reading and documentation see the [`samtools` manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS). + +The plots will show: + +- Alignment metrics. + +
+Output files for all samples + +**Output directory: `{outdir}/reports/samtools/`** + +- `.{sorted,md,recal}.samtools.stats.out` + - Raw statistics used by `MultiQC` + +
+ +#### bcftools stats + +[bcftools stats](https://samtools.github.io/bcftools/bcftools.html#stats) produces a statistics text file which is suitable for machine processing and can be plotted using plot-vcfstats. +For further reading and documentation see the [bcftools stats manual](https://samtools.github.io/bcftools/bcftools.html#stats). + +Plots will show: + +- Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc. +- Note: When using [Strelka2](https://github.com/Illumina/strelka), there will be no depth distribution plot, as Strelka2 does not report the INFO/DP field + +
+Output files for all samples + +**Output directory: `{outdir}/reports/bcftools/`** + +- `..bcftools_stats.txt` + - Raw statistics used by `MultiQC` +
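+The statistics file can also be generated and plotted outside of MultiQC, roughly as follows (file names are placeholders): + +```bash +# Hypothetical standalone bcftools stats run, rendered with plot-vcfstats. +bcftools stats sample.strelka.vcf.gz > sample.strelka.bcftools_stats.txt +plot-vcfstats -p plots/ sample.strelka.bcftools_stats.txt +``` + +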
+ +#### VCFtools + +[VCFtools](https://vcftools.github.io/) is a program package designed for working with VCF files. For further reading and documentation see the [VCFtools manual](https://vcftools.github.io/man_latest.html#OUTPUT%20OPTIONS). + +Plots will show: + +- the summary counts of each type of transition to transversion ratio for each `FILTER` category. +- the transition to transversion ratio as a function of alternative allele count (using only bi-allelic SNPs). +- the transition to transversion ratio as a function of SNP quality threshold (using only bi-allelic SNPs). + +
+Output files for all samples + +**Output directory: `{outdir}/reports/vcftools/`** + +- `..FILTER.summary` + - Raw statistics used by `MultiQC` with a summary of the number of SNPs and Ts/Tv ratio for each FILTER category +- `..TsTv.count` + - Raw statistics used by `MultiQC` with the Transition / Transversion ratio as a function of alternative allele count. Only uses bi-allelic SNPs. +- `..TsTv.qual` + - Raw statistics used by `MultiQC` with Transition / Transversion ratio as a function of SNP quality threshold. Only uses bi-allelic SNPs. +
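+The three files listed above correspond to standard VCFtools output options, roughly as in the sketch below (file names are placeholders): + +```bash +# Hypothetical standalone VCFtools runs producing the summaries listed above. +vcftools --gzvcf sample.vcf.gz --FILTER-summary --out sample +vcftools --gzvcf sample.vcf.gz --TsTv-by-count --out sample +vcftools --gzvcf sample.vcf.gz --TsTv-by-qual --out sample +``` + +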
+ +#### snpEff reports + +[snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox. +It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations. For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary). + +The plots will show: + +- locations of detected variants in the genome and the number of variants for each location. +- the putative impact of detected variants and the number of variants for each impact. +- the effect of variants at protein level and the number of variants for each effect type. +- the quantity as a function of the variant quality score. + +
+Output files for all samples + +**Output directory: `{outdir}/reports/SnpEff/{sample,tumorsample_vs_normalsample}//`** + +- `._snpEff.csv` + - Raw statistics used by [MultiQC](http://multiqc.info) +- `._snpEff.html` + - Statistics to be visualised with a web browser +- `._snpEff.genes.txt` + - TXT (tab separated) summary counts for variants affecting each transcript and gene +
+
+#### VEP reports
+
+[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants and CNVs. For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html).
+
+Output files for all samples
+
+**Output directory: `{outdir}/reports/EnsemblVEP/{sample,tumorsample_vs_normalsample}/<variant_caller>/`**
+
+- `<sample>.<variant_caller>_VEP.summary.html`
+  - Summary of the VEP run to be visualised with a web browser
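+
+A minimal, illustrative VEP invocation producing such a summary (cache directory and species are assumptions):
+
+```shell
+vep --input_file sample.vcf.gz --output_file sample_VEP.ann.vcf --vcf \
+    --offline --cache --dir_cache /path/to/vep_cache --species homo_sapiens \
+    --stats_file sample_VEP.summary.html
+```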
+
+### Reporting
+
+#### MultiQC
+
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project.
+Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
+The report collates pipeline QC from supported tools, e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+Output files + +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `multiqc_plots/`: directory containing static images from the report in various formats. +
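+
+The report can also be regenerated by hand from the published reports, e.g.:
+
+```shell
+# MultiQC scans the given directory for recognised tool outputs and writes
+# multiqc_report.html plus the data/ and plots/ directories
+multiqc --outdir multiqc/ {outdir}/reports/
+```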
+ +### Pipeline information + +
+Output files
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report_<timestamp>.html`, `execution_timeline_<timestamp>.html`, `execution_trace_<timestamp>.txt`, `pipeline_dag_<timestamp>.dot`/`pipeline_dag_<timestamp>.svg` and `manifest_<timestamp>.bco.json`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Parameters used by the pipeline run: `params_<timestamp>.json`.
+
+
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
+## Reference files
+
+Contains reference folders generated by the pipeline. These files are only published if `--save_reference` is set.
+
+Output files
+
+- `bwa/`
+  - Index corresponding to the [BWA](https://github.com/lh3/bwa) aligner
+- `bwamem2/`
+  - Index corresponding to the [BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) aligner
+- `cnvkit/`
+  - Reference files generated by [CNVkit](https://cnvkit.readthedocs.io/en/stable/)
+- `dragmap/`
+  - Index corresponding to the [DragMap](https://github.com/Illumina/dragmap) aligner
+- `dbsnp/`
+  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given dbsnp file
+- `dict/`
+  - Sequence dictionary generated by [GATK4 CreateSequenceDictionary](https://gatk.broadinstitute.org/hc/en-us/articles/5358872471963-CreateSequenceDictionary-Picard-) from the given fasta
+- `fai/`
+  - Fasta index generated with [samtools faidx](http://www.htslib.org/doc/samtools-faidx.html) from the given fasta
+- `germline_resource/`
+  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given germline resource file
+- `intervals/`
+  - Bed files in various stages: .bed, .bed.gz, .bed per chromosome, .bed.gz per chromosome
+- `known_indels/`
+  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given known indels file
+- `msi/`
+  - [MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) scan of the reference genome to get microsatellites information
+- `pon/`
+  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given panel-of-normals file
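+
+For reference, the Tabix indices listed above are built from bgzip-compressed files along these lines (file names are illustrative):
+
+```shell
+bgzip dbsnp.vcf            # produces dbsnp.vcf.gz
+tabix -p vcf dbsnp.vcf.gz  # produces dbsnp.vcf.gz.tbi
+```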
+
+# Mapping stats with Picard
+
+[Picard](https://broadinstitute.github.io/picard/) is used to gather mapping quality metrics from sarek's output into a single table that can be easily inspected. The module used for this task is CollectHsMetrics. You may find more information regarding this module and its output [here](http://broadinstitute.github.io/picard/picard-metric-definitions.html#HsMetrics).
+
+# Post-processing and Annotation
+
+[GATK toolkit](https://gatk.broadinstitute.org/hc/en-us) is used to separate SNPs and indels, and to apply several filters to the VCF files generated by sarek.
+
+[AWK](https://www.gnu.org/software/gawk/manual/gawk.html) and [BCFtools query](https://samtools.github.io/bcftools/bcftools.html#query) are used to modify the VCF previously processed with the GATK toolkit, creating a new variants table that is easier to merge with the annotation data from VEP and Exomiser.
+
+The annotation step is performed separately from sarek in order to provide a more thorough annotation of the variants, which includes prediction of effect and correlation with the disease thanks to [Ensembl's Variant Effect Predictor (VEP)](https://www.ensembl.org/info/docs/tools/vep/index.html) and [Exomiser](https://exomiser.readthedocs.io/en/latest/advanced_analysis.html). The latter also includes inheritance typing.
+
+# VEP documentation
+
+You can find information on how to interpret results from the VEP toolkit [here](https://www.ensembl.org/info/docs/tools/vep/vep_formats.html#output).
+
+# The Exomiser - A Tool to Annotate and Prioritize Exome Variants
+
+[![GitHub release](https://img.shields.io/github/release/exomiser/Exomiser.svg)](https://github.com/exomiser/Exomiser/releases)
+[![CircleCI](https://circleci.com/gh/exomiser/Exomiser/tree/development.svg?style=shield)](https://circleci.com/gh/exomiser/Exomiser/tree/development)
+[![Codacy Badge](https://api.codacy.com/project/badge/Grade/b518a9448b5b4889a40b3dc660ef585a)](https://www.codacy.com/app/monarch-initiative/Exomiser?utm_source=github.com&utm_medium=referral&utm_content=exomiser/Exomiser&utm_campaign=Badge_Grade)
+[![Documentation](https://readthedocs.org/projects/exomiser/badge/?version=latest)](http://exomiser.readthedocs.io/en/latest)
+
+#### Overview:
+
+The Exomiser is a Java program that finds potential disease-causing variants from whole-exome or whole-genome sequencing data.
+
+Starting from a [VCF](https://samtools.github.io/hts-specs/VCFv4.3.pdf) file and a set of phenotypes encoded using the [Human Phenotype Ontology](http://www.human-phenotype-ontology.org) (HPO), it will annotate, filter and prioritise likely causative variants. The program does this based on user-defined criteria such as a variant's predicted pathogenicity, frequency of occurrence in a population and also how closely the given phenotype matches the known phenotype of diseased genes from human and model organism data.
+
+The functional annotation of variants is handled by [Jannovar](https://github.com/charite/jannovar) and uses any of [UCSC](http://genome.ucsc.edu), [RefSeq](https://www.ncbi.nlm.nih.gov/refseq/) or [Ensembl](https://www.ensembl.org/Homo_sapiens/Info/Index) KnownGene transcript definitions and hg19 or hg38 genomic coordinates.
+
+Variants are prioritised according to user-defined criteria on variant frequency, pathogenicity, quality, inheritance pattern, and model organism phenotype data. Predicted pathogenicity data is extracted from the [dbNSFP](http://www.ncbi.nlm.nih.gov/pubmed/21520341) resource.
+Variant frequency data is taken from the [1000 Genomes](http://www.1000genomes.org/), [ESP](http://evs.gs.washington.edu/EVS), [TOPMed](https://topmed.nhlbi.nih.gov/), [UK10K](http://www.uk10k.org/studies/cohorts.html), [ExAC](http://exac.broadinstitute.org) and [gnomAD](http://gnomad.broadinstitute.org/) datasets. Subsets of these frequency and pathogenicity data can be defined to further tune the analysis. Cross-species phenotype comparisons come from our PhenoDigm tool powered by the OWLTools [OWLSim](https://github.com/owlcollab/owltools) algorithm.
+
+The Exomiser was developed by the Computational Biology and Bioinformatics group at the Institute for Medical Genetics and Human Genetics of the Charité - Universitätsmedizin Berlin, the Mouse Informatics Group at the Sanger Institute and other members of the [Monarch initiative](https://monarchinitiative.org).
+
+# Interpreting the Results
+
+Depending on the output options provided, Exomiser will write out at least an HTML results file in the `exomiser` sub-directory of the Exomiser installation.
+
+As a general rule, all output files contain a ranked list of genes and/or variants, with the top-ranked gene/variant displayed first. The exception is the VCF output, which, since version 13.1.0, is sorted according to VCF convention and tabix-indexed.
+
+Exomiser attempts to predict the variant or variants likely to be causative of a patient's phenotype and does so by associating them with the gene (or genes in the case of large structural variations) they intersect with on the genomic sequence. Variants occurring in intergenic regions are associated with the closest gene and those overlapping two genes are associated with the gene in which they are predicted to have the largest consequence.
+
+Once associated with a gene, Exomiser uses the compatible modes of inheritance for a variant to assess it in the context of any diseases associated with the gene or any mouse knockout models of that gene. These are all bundled together into a `GeneScore` which features filtered variants located in that gene compatible with a given mode of inheritance. After the filtering steps Exomiser ranks these GeneScores according to descending combined score. The results are then written out to the files and formats specified in the output settings.
+
+As of release 13.2.0, only a single, combined output file of ranked genes/variants is written per output format. For example, when supplying the output options (via an output-options.yaml file) `outputFileName: Pfeiffer-hiphive-exome-PASS_ONLY` and `outputFormats: [TSV_VARIANT, TSV_GENE, VCF]`, the following files will be written out: `Pfeiffer-hiphive-exome-PASS_ONLY.variants.tsv`, `Pfeiffer-hiphive-exome-PASS_ONLY.genes.tsv` and `Pfeiffer-hiphive-exome-PASS_ONLY.vcf`.
+
+These formats are detailed below.
+
+## HTML
+
+![HTML Description 1](images/exomiser-html-description-1.png)
+
+![HTML Description 2](images/exomiser-html-description-2.png)
+
+## JSON
+
+The JSON file represents the most accurate representation of the data, as it is referenced internally by Exomiser. As such, we don't provide a schema for this, but it has been pretty stable and breaking changes will only occur with major version changes to the software. Minor additions are to be expected for minor releases, as per the [SemVer](https://semver.org) specification.
+
+We recommend using [Python](https://docs.python.org/3/library/json.html?highlight=json#module-json) or [JQ](https://stedolan.github.io/jq/) to extract data from this file.
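+
+For instance, a quick extraction with `jq` might look like this; the sketch assumes the top level of the JSON is the ranked gene list with `geneSymbol` and `combinedScore` keys, as in recent Exomiser releases:
+
+```shell
+# Print gene symbol and combined score for the ranked genes, best first
+jq -r '.[] | [.geneSymbol, .combinedScore] | @tsv' Pfeiffer-hiphive-exome-PASS_ONLY.json | head
+```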
+ +## TSV GENES + +In the genes.tsv file it is possible for a gene to appear multiple times, depending on the MOI it is compatible with, given the filtered variants. For example in the example below MUC6 is ranked 7th under the AD model and 8th under an AR model. + +```tsv +#RANK ID GENE_SYMBOL ENTREZ_GENE_ID MOI P-VALUE EXOMISER_GENE_COMBINED_SCORE EXOMISER_GENE_PHENO_SCORE EXOMISER_GENE_VARIANT_SCORE HUMAN_PHENO_SCORE MOUSE_PHENO_SCORE FISH_PHENO_SCORE WALKER_SCORE PHIVE_ALL_SPECIES_SCORE OMIM_SCORE MATCHES_CANDIDATE_GENE HUMAN_PHENO_EVIDENCE MOUSE_PHENO_EVIDENCE FISH_PHENO_EVIDENCE HUMAN_PPI_EVIDENCE MOUSE_PPI_EVIDENCE FISH_PPI_EVIDENCE +1 FGFR2_AD FGFR2 2263 AD 0.0000 0.9981 1.0000 1.0000 0.8808 1.0000 0.0000 0.5095 1.0000 1.0000 0 Jackson-Weiss syndrome (OMIM:123150): Brachydactyly (HP:0001156)-Broad hallux (HP:0010055), Craniosynostosis (HP:0001363)-Craniosynostosis (HP:0001363), Broad thumb (HP:0011304)-Broad metatarsal (HP:0001783), Broad hallux (HP:0010055)-Broad hallux (HP:0010055), Brachydactyly (HP:0001156)-abnormal sternum morphology (MP:0000157), Craniosynostosis (HP:0001363)-premature cranial suture closure (MP:0000081), Broad thumb (HP:0011304)-abnormal sternum morphology (MP:0000157), Broad hallux (HP:0010055)-abnormal sternum morphology (MP:0000157), Proximity to FGF14 associated with Spinocerebellar ataxia 27 (OMIM:609307): Broad hallux (HP:0010055)-Pes cavus (HP:0001761), Proximity to FGF14 Brachydactyly (HP:0001156)-abnormal digit morphology (MP:0002110), Broad thumb (HP:0011304)-abnormal digit morphology (MP:0002110), Broad hallux (HP:0010055)-abnormal digit morphology (MP:0002110), +2 ENPP1_AD ENPP1 5167 AD 0.0049 0.8690 0.5773 0.9996 0.6972 0.5773 0.5237 0.5066 0.6972 1.0000 0 Autosomal recessive hypophosphatemic rickets (ORPHA:289176): Brachydactyly (HP:0001156)-Genu varum (HP:0002970), Craniosynostosis (HP:0001363)-Craniosynostosis (HP:0001363), Broad thumb (HP:0011304)-Tibial bowing (HP:0002982), Broad hallux (HP:0010055)-Genu varum (HP:0002970), Brachydactyly (HP:0001156)-fused carpal bones (MP:0008915), Craniosynostosis (HP:0001363)-abnormal nucleus pulposus morphology (MP:0006392), Broad thumb (HP:0011304)-fused carpal bones (MP:0008915), Broad hallux (HP:0010055)-fused carpal bones (MP:0008915), Craniosynostosis (HP:0001363)-ceratohyal cartilage premature perichondral ossification, abnormal (ZP:0012007), Broad thumb (HP:0011304)-cleithrum nodular, abnormal (ZP:0006782), Proximity to PAPSS2 associated with Brachyolmia 4 with mild epiphyseal and metaphyseal changes (OMIM:612847): Brachydactyly (HP:0001156)-Brachydactyly (HP:0001156), Broad thumb (HP:0011304)-Brachydactyly (HP:0001156), Broad hallux (HP:0010055)-Brachydactyly (HP:0001156), Proximity to PAPSS2 Brachydactyly (HP:0001156)-abnormal long bone epiphyseal plate morphology (MP:0003055), Craniosynostosis (HP:0001363)-domed cranium (MP:0000440), Broad thumb (HP:0011304)-abnormal long bone epiphyseal plate morphology (MP:0003055), Broad hallux (HP:0010055)-abnormal long bone epiphyseal plate morphology (MP:0003055), +// +7 MUC6_AD MUC6 4588 AD 0.0096 0.7532 0.5030 0.9990 0.0000 0.0000 0.0000 0.5030 0.5030 1.0000 0 Proximity to GKN2 Brachydactyly (HP:0001156)-brachydactyly (MP:0002544), Broad thumb (HP:0011304)-brachydactyly (MP:0002544), Broad hallux (HP:0010055)-brachydactyly (MP:0002544), +8 MUC6_AR MUC6 4588 AR 0.0096 0.7531 0.5030 0.9990 0.0000 0.0000 0.0000 0.5030 0.5030 1.0000 0 Proximity to GKN2 Brachydactyly (HP:0001156)-brachydactyly (MP:0002544), Broad thumb (HP:0011304)-brachydactyly 
(MP:0002544), Broad hallux (HP:0010055)-brachydactyly (MP:0002544), +``` + + +## TSV VARIANTS + +In the variants.tsv file it is possible for a variant, like a gene, to appear multiple times, depending on the MOI it is +compatible with. For example in the example below MUC6 has two variants ranked 7th under the AD model and two ranked 8th +under an AR (compound heterozygous) model. In the AD case the CONTRIBUTING_VARIANT column indicates whether the variant +was (1) or wasn't (0) used for calculating the EXOMISER_GENE_COMBINED_SCORE and EXOMISER_GENE_VARIANT_SCORE. + +``` tsv + + #RANK ID GENE_SYMBOL ENTREZ_GENE_ID MOI P-VALUE EXOMISER_GENE_COMBINED_SCORE EXOMISER_GENE_PHENO_SCORE EXOMISER_GENE_VARIANT_SCORE EXOMISER_VARIANT_SCORE CONTRIBUTING_VARIANT WHITELIST_VARIANT VCF_ID RS_ID CONTIG START END REF ALT CHANGE_LENGTH QUAL FILTER GENOTYPE FUNCTIONAL_CLASS HGVS EXOMISER_ACMG_CLASSIFICATION EXOMISER_ACMG_EVIDENCE EXOMISER_ACMG_DISEASE_ID EXOMISER_ACMG_DISEASE_NAME CLINVAR_VARIANT_ID CLINVAR_PRIMARY_INTERPRETATION CLINVAR_STAR_RATING GENE_CONSTRAINT_LOEUF GENE_CONSTRAINT_LOEUF_LOWER GENE_CONSTRAINT_LOEUF_UPPER MAX_FREQ_SOURCE MAX_FREQ ALL_FREQ MAX_PATH_SOURCE MAX_PATH ALL_PATH + 1 10-123256215-T-G_AD FGFR2 2263 AD 0.0000 0.9981 1.0000 1.0000 1.0000 1 1 rs121918506 10 123256215 123256215 T G 0 100.0000 PASS 1|0 missense_variant FGFR2:ENST00000346997.2:c.1688A>C:p.(Glu563Ala) LIKELY_PATHOGENIC PM2,PP3_Strong,PP4,PP5 OMIM:123150 Jackson-Weiss syndrome 28333 LIKELY_PATHOGENIC 1 0.13692 0.074 0.27 REVEL 0.965 REVEL=0.965,MVP=0.9517972 + 2 6-132203615-G-A_AD ENPP1 5167 AD 0.0049 0.8690 0.5773 0.9996 0.9996 1 0 rs770775549 6 132203615 132203615 G A 0 922.9800 PASS 0/1 splice_donor_variant ENPP1:ENST00000360971.2:c.2230+1G>A:p.? UNCERTAIN_SIGNIFICANCE PVS1_Strong OMIM:615522 Cole disease NOT_PROVIDED 0 0.41042 0.292 0.586 GNOMAD_E_SAS 0.0032486517 TOPMED=7.556E-4,EXAC_NON_FINNISH_EUROPEAN=0.0014985314,GNOMAD_E_NFE=0.0017907989,GNOMAD_E_SAS=0.0032486517 + // + 7 11-1018088-TG-T_AD MUC6 4588 AD 0.0096 0.7532 0.5030 0.9990 0.9990 1 0 rs765231061 11 1018088 1018089 TG T -1 441.8100 PASS 0/1 frameshift_variant MUC6:ENST00000421673.2:c.4712del:p.(Pro1571Hisfs*21) UNCERTAIN_SIGNIFICANCE NOT_PROVIDED 0 0.79622 0.656 0.971 GNOMAD_G_NFE 0.0070363074 GNOMAD_E_AMR=0.0030803352,GNOMAD_G_NFE=0.0070363074 + 7 11-1018093-G-GT_AD MUC6 4588 AD 0.0096 0.7532 0.5030 0.9990 0.9989 0 0 rs376177791 11 1018093 1018093 G GT 1 592.4500 PASS 0/1 frameshift_elongation MUC6:ENST00000421673.2:c.4707dup:p.(Pro1570Thrfs*136) NOT_AVAILABLE NOT_PROVIDED 0 0.79622 0.656 0.971 GNOMAD_G_NFE 0.007835763 GNOMAD_G_NFE=0.007835763 + 8 11-1018088-TG-T_AR MUC6 4588 AR 0.0096 0.7531 0.5030 0.9990 0.9990 1 0 rs765231061 11 1018088 1018089 TG T -1 441.8100 PASS 0/1 frameshift_variant MUC6:ENST00000421673.2:c.4712del:p.(Pro1571Hisfs*21) UNCERTAIN_SIGNIFICANCE NOT_PROVIDED 0 0.79622 0.656 0.971 GNOMAD_G_NFE 0.0070363074 GNOMAD_E_AMR=0.0030803352,GNOMAD_G_NFE=0.0070363074 + 8 11-1018093-G-GT_AR MUC6 4588 AR 0.0096 0.7531 0.5030 0.9990 0.9989 1 0 rs376177791 11 1018093 1018093 G GT 1 592.4500 PASS 0/1 frameshift_elongation MUC6:ENST00000421673.2:c.4707dup:p.(Pro1570Thrfs*136) UNCERTAIN_SIGNIFICANCE NOT_PROVIDED 0 0.79622 0.656 0.971 GNOMAD_G_NFE 0.007835763 GNOMAD_G_NFE=0.007835763 +``` + +## VCF + +In the VCF file it is possible for a variant, like a gene, to appear multiple times, depending on the MOI it is compatible with. 
For example, in the example below, MUC6 has two variants ranked 7th under the AD model and two ranked 8th under an AR (compound heterozygous) model. In the AD case, the CONTRIBUTING_VARIANT column indicates whether the variant was (1) or wasn't (0) used for calculating the EXOMISER_GENE_COMBINED_SCORE and EXOMISER_GENE_VARIANT_SCORE.
+The ``INFO`` header line with ``ID=Exomiser`` describes the internal format of this sub-field. Be aware that for multi-allelic sites, Exomiser will decompose and trim them for the proband sample and this is what will be displayed in the Exomiser ``ID`` sub-field e.g. ``11-1018088-TG-T_AD``.
+
+``` vcf
+
+ ##INFO=<ID=Exomiser,Number=.,Type=String,Description="A pipe-separated set of values for the proband allele(s) from the record with one per compatible MOI following the format: '{RANK|ID|GENE_SYMBOL|ENTREZ_GENE_ID|MOI|P-VALUE|EXOMISER_GENE_COMBINED_SCORE|EXOMISER_GENE_PHENO_SCORE|EXOMISER_GENE_VARIANT_SCORE|EXOMISER_VARIANT_SCORE|CONTRIBUTING_VARIANT|WHITELIST_VARIANT|FUNCTIONAL_CLASS|HGVS|EXOMISER_ACMG_CLASSIFICATION|EXOMISER_ACMG_EVIDENCE|EXOMISER_ACMG_DISEASE_ID|EXOMISER_ACMG_DISEASE_NAME}'">
+ #CHROM POS ID REF ALT QUAL FILTER INFO sample
+ 10 123256215 . T G 100 PASS Exomiser={1|10-123256215-T-G_AD|FGFR2|2263|AD|0.0000|0.9981|1.0000|1.0000|1.0000|1|1|missense_variant|FGFR2:ENST00000346997.2:c.1688A>C:p.(Glu563Ala)|LIKELY_PATHOGENIC|PM2,PP3_Strong,PP4,PP5|OMIM:123150|"Jackson-Weiss syndrome"};GENE=FGFR2;INHERITANCE=AD;MIM=101600 GT:DS:PL 1|0:2.000:50,11,0
+ 11 1018088 . TG T 441.81 PASS AC=1;AF=0.50;AN=2;BaseQRankSum=7.677;DP=162;DS;Exomiser={7|11-1018088-TG-T_AD|MUC6|4588|AD|0.0096|0.7532|0.5030|0.9990|0.9990|1|0|frameshift_variant|MUC6:ENST00000421673.2:c.4712del:p.(Pro1571Hisfs*21)|UNCERTAIN_SIGNIFICANCE|||""},{8|11-1018088-TG-T_AR|MUC6|4588|AR|0.0096|0.7531|0.5030|0.9990|0.9990|1|0|frameshift_variant|MUC6:ENST00000421673.2:c.4712del:p.(Pro1571Hisfs*21)|UNCERTAIN_SIGNIFICANCE|||""};FS=25.935;HRun=3;HaplotypeScore=1327.2952;MQ=43.58;MQ0=6;MQRankSum=-5.112;QD=2.31;ReadPosRankSum=2.472;set=variant GT:AD:DP:GQ:PL 0/1:146,45:162:99:481,0,5488
+ 11 1018093 . G GT 592.45 PASS AC=1;AF=0.50;AN=2;BaseQRankSum=8.019;DP=157;Exomiser={7|11-1018093-G-GT_AD|MUC6|4588|AD|0.0096|0.7532|0.5030|0.9990|0.9989|0|0|frameshift_elongation|MUC6:ENST00000421673.2:c.4707dup:p.(Pro1570Thrfs*136)|NOT_AVAILABLE|||""},{8|11-1018093-G-GT_AR|MUC6|4588|AR|0.0096|0.7531|0.5030|0.9990|0.9989|1|0|frameshift_elongation|MUC6:ENST00000421673.2:c.4707dup:p.(Pro1570Thrfs*136)|UNCERTAIN_SIGNIFICANCE|||""};FS=28.574;HRun=1;HaplotypeScore=1267.6968;MQ=44.06;MQ0=4;MQRankSum=-5.166;QD=3.26;ReadPosRankSum=1.328;set=variant GT:AD:DP:GQ:PL 0/1:140,42:157:99:631,0,4411
+ 6 132203615 . G A 922.98 PASS AC=1;AF=0.50;AN=2;BaseQRankSum=-0.671;DP=94;Dels=0.00;Exomiser={2|6-132203615-G-A_AD|ENPP1|5167|AD|0.0049|0.8690|0.5773|0.9996|0.9996|1|0|splice_donor_variant|ENPP1:ENST00000360971.2:c.2230+1G>A:p.?|UNCERTAIN_SIGNIFICANCE|PVS1_Strong|OMIM:615522|"Cole disease"};FS=0.805;HRun=0;HaplotypeScore=3.5646;MQ=56.63;MQ0=0;MQRankSum=1.807;QD=9.82;ReadPosRankSum=-0.900;set=variant2 GT:AD:DP:GQ:PL 0/1:53,41:94:99:953,0,1075
+```
+
+The VCF file is tabix-indexed and exomiser ranked alleles can be extracted using ``grep``. For example, to display the top 5 ranked variants, you would issue the command:
+
+``` shell
+ zgrep -E '\{[1-5]{1}\|' Pfeiffer-hiphive-exome-PASS_ONLY.vcf.gz
+```
diff --git a/bu_isciii/assets/reports/md/viralrecon.md b/bu_isciii/assets/reports/md/viralrecon.md
index 7d3fd83e..6e14c057 100644
--- a/bu_isciii/assets/reports/md/viralrecon.md
+++ b/bu_isciii/assets/reports/md/viralrecon.md
@@ -1,156 +1,252 @@
-# Viralrecon
+# nf-core/viralrecon Description

 This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.

 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-# Nanopore: Pipeline overview +# Illumina: Pipeline overview -- [Preprocessing](#nanopore-preprocessing) - - [pycoQC](#nanopore-pycoqc) - Sequencing QC - - [artic guppyplex](#nanopore-artic-guppyplex) - Aggregate pre-demultiplexed reads from MinKNOW/Guppy - - [NanoPlot](#nanopore-nanoplot) - Read QC -- [Variant calling](#nanopore-variant-calling) - - [artic minion](#nanopore-artic-minion) - Align reads, call variants and generate consensus sequence -- [Downstream analysis](#nanopore-downstream-analysis) - - [SAMtools](#nanopore-samtools) - Remove unmapped reads and obtain alignment metrics - - [mosdepth](#nanopore-mosdepth) - Genome-wide and amplicon coverage QC plots - - [BCFTools](#nanopore-bcftools) - Variant count metrics - - [SnpEff and SnpSift](#nanopore-snpeff-and-snpsift) - Genetic variant annotation and functional effect prediction - - [QUAST](#nanopore-quast) - Consensus assessment report - - [Pangolin](#nanopore-pangolin) - Lineage analysis - - [Nextclade](#nanopore-nextclade) - Clade assignment, mutation calling and sequence quality checks - - [ASCIIGenome](#nanopore-asciigenome) - Individual variant screenshots with annotation tracks - - [Variants long table](#nanopore-variants-long-table) - Collate per-sample information for individual variants, functional effect prediction and lineage analysis -- [Workflow reporting](#nanopore-workflow-reporting) - - [MultiQC](#nanopore-multiqc) - Present QC, visualisation and custom reporting for sequencing, raw reads, alignment and variant calling results +- [nf-core/viralrecon Description](#nf-coreviralrecon-description) +- [Illumina: Pipeline overview](#illumina-pipeline-overview) + - [Illumina: Preprocessing](#illumina-preprocessing) + - [cat](#cat) + - [FastQC](#fastqc) + - [fastp](#fastp) + - [Kraken 2](#kraken-2) + - [Illumina: Variant calling](#illumina-variant-calling) + - [Bowtie 2](#bowtie-2) + - [SAMtools](#samtools) + - [iVar trim](#ivar-trim) + - [picard MarkDuplicates](#picard-markduplicates) + - [picard CollectMultipleMetrics](#picard-collectmultiplemetrics) + - [mosdepth](#mosdepth) + - [iVar variants](#ivar-variants) + - [BCFTools call](#bcftools-call) + - [SnpEff and SnpSift](#snpeff-and-snpsift) + - [ASCIIGenome](#asciigenome) + - [iVar consensus](#ivar-consensus) + - [BCFTools and BEDTools](#bcftools-and-bedtools) + - [QUAST](#quast) + - [Pangolin](#pangolin) + - [Nextclade](#nextclade) + - [Variants long table](#variants-long-table) + - [Illumina: De novo assembly](#illumina-de-novo-assembly) + - [Cutadapt](#cutadapt) + - [SPAdes](#spades) + - [Unicycler](#unicycler) + - [minia](#minia) + - [BLAST](#blast) + - [ABACAS](#abacas) + - [PlasmidID](#plasmidid) + - [Assembly QUAST](#assembly-quast) + - [Illumina: Workflow reporting and genomes](#illumina-workflow-reporting-and-genomes) + - [MultiQC](#multiqc) +- [Nanopore: Pipeline overview](#nanopore-pipeline-overview) + - [Nanopore: Preprocessing](#nanopore-preprocessing) + - [Nanopore: pycoQC](#nanopore-pycoqc) + - [Nanopore: artic guppyplex](#nanopore-artic-guppyplex) + - [Nanopore: NanoPlot](#nanopore-nanoplot) + - [Nanopore: Variant calling](#nanopore-variant-calling) + - [Nanopore: artic minion](#nanopore-artic-minion) + - [Nanopore: Downstream analysis](#nanopore-downstream-analysis) + - [Nanopore: SAMtools](#nanopore-samtools) + - [Nanopore: mosdepth](#nanopore-mosdepth) + - [Nanopore: BCFTools](#nanopore-bcftools) + - [Nanopore: SnpEff and SnpSift](#nanopore-snpeff-and-snpsift) + - [Nanopore: QUAST](#nanopore-quast) + - [Nanopore: Pangolin](#nanopore-pangolin) + - 
[Nanopore: Nextclade](#nanopore-nextclade) + - [Nanopore: ASCIIGenome](#nanopore-asciigenome) + - [Nanopore: Variants long table](#nanopore-variants-long-table) + - [Nanopore: Workflow reporting](#nanopore-workflow-reporting) + - [Nanopore: MultiQC](#nanopore-multiqc) + - [Reference genome files](#reference-genome-files) +- [Pipeline information](#pipeline-information) -## Nanopore: Preprocessing +## Illumina: Preprocessing -A file called `summary_variants_metrics_mqc.csv` containing a selection of read alignment and variant calling metrics will be saved in the `multiqc//` output directory which is determined by the `--artic_minion_caller` parameter (Default: `nanopolish/`). The same metrics will also be added to the top of the MultiQC report. +### cat -### Nanopore: pycoQC +
+Output files + +- `fastq/` + - `*.merged.fastq.gz`: These files are not saved by default but can be via a custom config file such as the one below. + +```nextflow +params { + modules { + 'illumina_cat_fastq' { + publish_files = null + } + } +} +``` + +
+ +If multiple libraries/runs have been provided for the same sample in the input samplesheet (e.g. to increase sequencing depth) then these will be merged at the very beginning of the pipeline in order to have consistent sample naming throughout the pipeline. Please refer to the [usage documentation](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) to see how to specify these samples in the input samplesheet. + +### FastQC
Output files -- `pycoqc/` - - `*.html` and `.json` file that includes a run summary and graphical representation of various QC metrics including distribution of read length, distribution of read quality scores, mean read quality per sequence length, output per channel over experiment time and percentage of reads per barcode. +- `fastqc/raw/` + - `*_fastqc.html`: FastQC report containing quality metrics. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +**NB:** The FastQC plots in this directory are generated relative to the raw, input reads. They may contain adapter sequence and regions of low quality. To see how your reads look after trimming please refer to the FastQC reports in the `fastqc/trim/` directory.
-[PycoQC](https://github.com/a-slide/pycoQC) compute metrics and generate QC plots using the sequencing summary information generated by basecalling/demultiplexing tools such as Guppy e.g. distribution of read length, read length over time, number of reads per barcode and other general stats. +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -
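+
+For reference, these reports are what FastQC produces when run directly on the input reads (file names are illustrative):
+
+```shell
+# One HTML report and one zip archive are written per FastQ file
+fastqc --outdir fastqc/raw/ sample_R1.fastq.gz sample_R2.fastq.gz
+```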
-*(Image: PycoQC - Number of reads per barcode)*
+![MultiQC - FastQC per base sequence plot](images/mqc_fastqc_plot.png) -### Nanopore: artic guppyplex +### fastp
Output files -- `guppyplex/` - - `*.fastq.gz` files generated by aggregate pre-demultiplexed reads from MinKNOW/Guppy. These files are not saved by default but can be via a custom config file such as the one below. - -```nextflow -params { - modules { - 'nanopore_artic_guppyplex' { - publish_files = ['fastq.gz':''] - } - } -} -``` +- `fastp/` + - `*.fastp.html`: Trimming report in html format. + - `*.fastp.json`: Trimming report in json format. +- `fastp/log/` + - `*.fastp.log`: Trimming log file. +- `fastqc/trim/` + - `*_fastqc.html`: FastQC report of the trimmed reads. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
-The [artic guppyplex](https://artic.readthedocs.io/en/latest/commands/) tool from the [ARTIC field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics) is used to perform length filtering of the demultiplexed Nanopore reads obtained per barcode. This essentially filters out chimeric reads that may be generated by the ARTIC protocol. The pipeline uses a default minimum and maximum read length of 400 and 700, respectively as tailored for the [nCoV-2019 primer set](https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html). However, you may need to adjust these for different primer schemes e.g. by using the minimum length of the amplicons (`--min-length`) as well as the maximum length plus 200 (`--max-length`). +[fastp](https://github.com/OpenGene/fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. It has been developed in C++ with multithreading support to achieve higher performance. fastp is used in this pipeline for standard adapter trimming and quality filtering. -### Nanopore: NanoPlot +![MultiQC - fastp filtered reads plot](images/mqc_fastp_plot.png) + +### Kraken 2
Output files -- `nanoplot//` - - Per-sample `*.html` files for QC metrics and individual `*.png` image files for plots. +- `kraken2/` + - `*.kraken2.report.txt`: Kraken 2 taxonomic report. See [here](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual#sample-report-output-format) for a detailed description of the format.
-[NanoPlot](https://github.com/wdecoster/NanoPlot) it a tool that can be used to produce general quality metrics from various Nanopore-based input files including fastq files e.g. quality score distribution, read lengths and other general stats. +[Kraken 2](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual) is a sequence classifier that assigns taxonomic labels to DNA sequences. Kraken 2 examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer. -
-*(Image: NanoPlot - Read quality vs read length)*
+We use a Kraken 2 database in this workflow to filter out reads specific to the host genome before performing the _de novo_ assembly steps in the pipeline. This filtering is not performed in the variant calling arm of the pipeline by default, but Kraken 2 is still run to obtain an estimate of host reads; this behaviour can be amended via the `--kraken2_variants_host_filter` parameter.

-## Nanopore: Variant calling
+![MultiQC - Kraken 2 classification plot](images/mqc_kraken2_plot.png)

-### Nanopore: artic minion
+## Illumina: Variant calling
+
+A file called `summary_variants_metrics_mqc.csv` containing a selection of read alignment and variant calling metrics will be saved in the `multiqc/` results directory. The same metrics will also be added to the top of the MultiQC report.
+
+### Bowtie 2
Output files -- `/` - - `*.consensus.fasta`: Consensus fasta file generated by artic minion. - - `*.pass.unique.vcf.gz`: VCF file containing unique variants passing quality filters. - - `*.pass.unique.vcf.gz.tbi`: VCF index file containing unique variants passing quality filters. - - `*.pass.vcf.gz`: VCF file containing variants passing quality filters. - - `*.pass.vcf.gz.tbi`: VCF index file containing variants passing quality filters. - - `*.primers.vcf`: VCF file containing variants found in primer-binding regions. - - `*.merged.vcf`: VCF file containing all detected variants. - - `*.fail.vcf`: VCF file containing variants failing quality filters. - - `*.sorted.bam`: BAM file generated by initial alignment. - - `*.sorted.bam.bai`: BAM index file generated by initial alignment. - - `*.trimmed.rg.sorted.bam`: BAM file without primer-binding site trimming. - - `*.trimmed.rg.sorted.bam.bai`: BAM index file without primer-binding site trimming. - - `*.primertrimmed.rg.sorted.bam`: BAM file generated after primer-binding site trimming. - - `*.primertrimmed.rg.sorted.bam.bai`: BAM index file generated after primer-binding site trimming. +- `variants/bowtie2/log/` + - `*.bowtie2.log`: Bowtie 2 mapping log file. -**NB:** The value of `` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish'). +
+
+[Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. Bowtie 2 supports gapped, local, and paired-end alignment modes.
+
+![MultiQC - Bowtie2 alignment score plot](images/mqc_bowtie2_plot.png)

+### SAMtools
+
+Output files
+
+- `variants/bowtie2/`
+  - `<sample>.sorted.bam`: Coordinate sorted BAM file containing read alignment information.
+  - `<sample>.sorted.bam.bai`: Index file for coordinate sorted BAM file.
+- `variants/bowtie2/samtools_stats/`
+  - SAMtools `<sample>.sorted.bam.flagstat`, `<sample>.sorted.bam.idxstats` and `<sample>.sorted.bam.stats` files generated from the alignment files.
-The [artic minion](https://artic.readthedocs.io/en/latest/commands/) tool from the [ARTIC field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics) is used to align reads, call variants and to generate the consensus sequence. By default, artic minion uses [Minimap2](https://github.com/lh3/minimap2) to align the reads to the viral genome, however you can use [BWA](https://github.com/lh3/bwa) instead using the `--artic_minion_aligner bwa` parameter. Similarly, the default variant caller used by artic minion is [Nanopolish](https://github.com/jts/nanopolish), however, you can use [Medaka](https://github.com/nanoporetech/medaka) instead via the `--artic_minion_caller medaka` parameter. Medaka is faster than Nanopolish, performs mostly the same and can be run directly from `fastq` input files as opposed to requiring the `fastq`, `fast5` and `sequencing_summary.txt` files required to run Nanopolish. You must provide the appropriate [Medaka model](https://github.com/nanoporetech/medaka#models) via the `--artic_minion_medaka_model` parameter if using `--artic_minion_caller medaka`. +Bowtie 2 BAM files are further processed with [SAMtools](http://samtools.sourceforge.net/) to sort them by coordinate, for indexing, as well as to generate read mapping statistics. -## Nanopore: Downstream analysis +![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_stats_plot.png) -### Nanopore: SAMtools +### iVar trim
Output files -- `/` - - `*.mapped.sorted.bam`: Coordinate sorted BAM file containing read alignment information. - - `*.mapped.sorted.bam.bai`: Index file for coordinate sorted BAM file. -- `/samtools_stats/` - - SAMtools `*.mapped.sorted.bam.flagstat`, `*.mapped.sorted.bam.idxstats` and `*.mapped.sorted.bam.stats` files generated from the alignment files. +- `variants/bowtie2/` + - `*.ivar_trim.sorted.bam`: Coordinate sorted BAM file after primer trimming. + - `*.ivar_trim.sorted.bam.bai`: Index file for coordinate sorted BAM file after primer trimming. +- `variants/bowtie2/samtools_stats/` + - SAMtools `*.ivar_trim.sorted.bam.flagstat`, `*.ivar_trim.sorted.bam.idxstats` and `*.ivar_trim.sorted.bam.stats` files generated from the primer trimmed alignment files. +- `variants/bowtie2/log/` + - `*.ivar_trim.ivar.log`: iVar trim log file obtained from stdout. -**NB:** The value of `` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish'). +
+ +If the `--protocol amplicon` parameter is provided then [iVar](http://gensoft.pasteur.fr/docs/ivar/1.0/manualpage.html) is used to trim amplicon primer sequences from the aligned reads. iVar uses the primer positions supplied in `--primer_bed` to soft clip primer sequences from a coordinate sorted BAM file. + +### picard MarkDuplicates + +
+Output files + +- `variants/bowtie2/` + - `*.markduplicates.sorted.bam`: Coordinate sorted BAM file after duplicate marking. + - `*.markduplicates.sorted.bam.bai`: Index file for coordinate sorted BAM file after duplicate marking. +- `variants/bowtie2/samtools_stats/` + - SAMtools `*.markduplicates.sorted.bam.flagstat`, `*.markduplicates.sorted.bam.idxstats` and `*.markduplicates.sorted.bam.stats` files generated from the duplicate marked alignment files. +- `variants/bowtie2/picard_metrics/` + - `*.markduplicates.sorted.MarkDuplicates.metrics.txt`: Metrics file from MarkDuplicates.
+
-BAM files containing the original alignments from either Minimap2 or BWA are further processed with [SAMtools](http://samtools.sourceforge.net/) to remove unmapped reads as well as to generate read mapping statistics.
+Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. [picard MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-) isn't run by default because high levels of duplication are expected with viral data due to the small size of the genome; however, you can activate it by adding `--skip_markduplicates false` to the command you use to run the pipeline. This will only _mark_ the duplicate reads identified amongst the alignments to allow you to gauge the overall level of duplication in your samples. You can also choose to remove any reads identified as duplicates via the `--filter_duplicates` parameter.

-![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_stats_plot.png)
+![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_duplicates_plot.png)

-### Nanopore: mosdepth
+### picard CollectMultipleMetrics
Output files -- `/mosdepth/genome/` +- `variants/bowtie2/picard_metrics/` + - `*.CollectMultipleMetrics.*`: Alignment QC files from picard CollectMultipleMetrics in `*_metrics` textual format. +- `variants/bowtie2/picard_metrics/pdf/` + - `*.pdf` plots for metrics obtained from CollectMultipleMetrics. + +
+ +[picard-tools](https://broadinstitute.github.io/picard/command-line-overview.html) is a set of command-line tools for manipulating high-throughput sequencing data. We use picard-tools in this pipeline to obtain mapping and coverage metrics. + +![MultiQC - Picard insert size plot](images/mqc_picard_insert_size_plot.png) + +### mosdepth + +
+Output files + +- `variants/bowtie2/mosdepth/genome/` - `all_samples.mosdepth.coverage.tsv`: File aggregating genome-wide coverage values across all samples used for plotting. - `*.mosdepth.coverage.pdf`: Whole-genome coverage plot. - `*.mosdepth.coverage.tsv`: File containing coverage values for the above plot. - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. -- `/mosdepth/amplicon/` +- `variants/bowtie2/mosdepth/amplicon/` - `all_samples.mosdepth.coverage.tsv`: File aggregating per-amplicon coverage values across all samples used for plotting. - `all_samples.mosdepth.heatmap.pdf`: Heatmap showing per-amplicon coverage across all samples. - `*.mosdepth.coverage.pdf`: Bar plot showing per-amplicon coverage for an individual sample. - `*.mosdepth.coverage.tsv`: File containing per-amplicon coverage values for the above plot. - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. -**NB:** The value of `` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish'). -
+
-[mosdepth](mosdepth) is a fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. mosdepth is used in this pipeline to obtain genome-wide coverage values in 200bp windows and to obtain amplicon/region-specific coverage metrics. The results are then either rendered in MultiQC (genome-wide coverage) or are plotted using custom `R` scripts.
+[mosdepth](https://github.com/brentp/mosdepth) is a fast BAM/CRAM depth calculation tool for WGS, exome, or targeted sequencing. mosdepth is used in this pipeline to obtain genome-wide coverage values in 200bp windows and for `--protocol amplicon` to obtain amplicon/region-specific coverage metrics. The results are then either rendered in MultiQC (genome-wide coverage) or are plotted using custom `R` scripts.

![R - Samples amplicon coverage heatmap ](images/r_amplicon_heatmap.png)

@@ -158,38 +254,61 @@ BAM files containing the original alignments from either Minimap2 or BWA are fur
 *(Image: R - Sample per-amplicon coverage plot)*
-### Nanopore: BCFTools +### iVar variants
Output files -- `/bcftools_stats/` - - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file. +- `variants/ivar/` + - `*.tsv`: Original iVar variants in TSV format. + - `*.vcf.gz`: iVar variants in VCF format. Converted using custom `ivar_variants_to_vcf.py` python script. + - `*.vcf.gz.tbi`: iVar variants VCF index file. +- `variants/ivar/log/` + - `*.variant_counts.log`: Counts for type of variants called by iVar. +- `variants/ivar/bcftools_stats/` + - `*.bcftools_stats.txt`: Statistics and counts obtained from iVar variants VCF file. -**NB:** The value of `` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish'). +
+
+[iVar](https://github.com/andersen-lab/ivar/blob/master/docs/MANUAL.md) is a computational package that contains functions broadly useful for viral amplicon-based sequencing. We use iVar in this pipeline to [trim primer sequences](#ivar-trim) for amplicon input data as well as to call variants.
+
+iVar outputs a TSV format that is not compatible with downstream analysis such as annotation using SnpEff. Moreover, some issues need to be addressed such as [strand-bias filtering](https://github.com/andersen-lab/ivar/issues/5) and [the consecutive reporting of variants belonging to the same codon](https://github.com/andersen-lab/ivar/issues/92). This pipeline uses a custom Python script [ivar_variants_to_vcf.py](https://github.com/nf-core/viralrecon/blob/master/bin/ivar_variants_to_vcf.py) to convert the default iVar output to VCF whilst also addressing both of these issues.
+
+![MultiQC - iVar variants called plot](images/mqc_ivar_variants_plot.png)
+
+### BCFTools call
+
+Output files + +- `variants/bcftools/` + - `*.vcf.gz`: Variants VCF file. + - `*.vcf.gz.tbi`: Variants VCF index file. +- `variants/bcftools/bcftools_stats/` + - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file.
-[BCFtools](http://samtools.github.io/bcftools/bcftools.html) is a set of utilities that manipulate variant calls in [VCF](https://vcftools.github.io/specs.html) and its binary counterpart BCF format. It can also used be used to generate statistics and counts obtained from VCF files as used here. +[BCFtools](http://samtools.github.io/bcftools/bcftools.html) can be used to call variants directly from BAM alignment files. It is a set of utilities that manipulate variant calls in [VCF](https://vcftools.github.io/specs.html) and its binary counterpart BCF format. BCFTools is used in the variant calling and _de novo_ assembly steps of this pipeline to obtain basic statistics from the VCF output. ![MultiQC - BCFTools variant counts](images/mqc_bcftools_stats_plot.png) -### Nanopore: SnpEff and SnpSift +### SnpEff and SnpSift
+Output files

-- `<caller>/snpeff/`
+- `variants/<variant_caller>/snpeff/`
  - `*.snpeff.csv`: Variant annotation csv file.
  - `*.snpeff.genes.txt`: Gene table for annotated variants.
  - `*.snpeff.summary.html`: Summary html file for variants.
  - `*.snpeff.vcf.gz`: VCF file with variant annotations.
  - `*.snpeff.vcf.gz.tbi`: Index for VCF file with variant annotations.
  - `*.snpsift.txt`: SnpSift summary table.
-- `<caller>/snpeff/bcftools_stats/`
-  - `*.snpeff.bcftools_stats.txt`: Statistics and counts obtained from SnpEff VCF file.
+- `variants/<variant_caller>/snpeff/bcftools_stats/`
+  - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
@@ -199,690 +318,635 @@ BAM files containing the original alignments from either Minimap2 or BWA are fur ![MultiQC - SnpEff annotation counts](images/mqc_snpeff_plot.png) -### Nanopore: QUAST +### ASCIIGenome
+Output files

-- `<caller>/quast/`
-  - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`.
+- `variants/<variant_caller>/asciigenome/<sample>/`
+  - `*.pdf`: Individual variant screenshots with annotation tracks in PDF format.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
+
-[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the consensus sequence across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files.
+As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs.io/en/latest/) is a command-line genome browser that can be run from a terminal window and is solely based on ASCII characters. The closest program to ASCIIGenome is probably [samtools tview](http://www.htslib.org/doc/samtools-tview.html) but ASCIIGenome offers much more flexibility, similar to popular GUI viewers like the [IGV](https://software.broadinstitute.org/software/igv/) browser. We are using the batch processing mode of ASCIIGenome in this pipeline to generate individual screenshots for all of the variant sites reported for each sample in the VCF files. This is incredibly useful to be able to quickly QC the variants called by the pipeline without having to tediously load all of the relevant tracks into a conventional genome browser. Where possible, the BAM read alignments, VCF variant file, primer BED file and GFF annotation track will be represented in the screenshot for contextual purposes. The screenshot below shows a SNP called relative to the MN908947.3 SARS-CoV-2 reference genome that overlaps the ORF7a protein and the nCoV-2019_91_LEFT primer from the ARTIC v3 protocol.

-### Nanopore: Pangolin
+
+*(Image: ASCIIGenome screenshot)*
+ +### iVar consensus
+Output files

-- `<caller>/pangolin/`
-  - `*.pangolin.csv`: Lineage analysis results from Pangolin.
+- `variants/<variant_caller>/consensus/ivar/`
+  - `*.consensus.fa`: Consensus Fasta file generated by iVar.
+  - `*.consensus.qual.txt`: File with the average quality of each base in the consensus sequence.
+- `variants/<variant_caller>/consensus/ivar/base_qc/`
+  - `*.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
+  - `*.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
+  - `*.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
+  - `*.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
+  - `*.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
+
-Phylogenetic Assignment of Named Global Outbreak LINeages ([Pangolin](https://github.com/cov-lineages/pangolin)) has been used extensively during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. A [web application](https://pangolin.cog-uk.io/) also exists that allows users to upload genome sequences via a web browser to assign lineages to genome sequences of SARS-CoV-2, view descriptive characteristics of the assigned lineage(s), view the placement of the lineage in a phylogeny of global samples, and view the temporal and geographic distribution of the assigned lineage(s).
+As described in the [iVar variants](#ivar-variants) section, iVar can be used in this pipeline to call variants and to generate the consensus sequence.

-### Nanopore: Nextclade
+### BCFTools and BEDTools
+Output files

-- `<caller>/nextclade/`
-  - `*.csv`: Analysis results from Nextlade containing genome clade assignment, mutation calling and sequence quality checks.
+- `variants/<variant_caller>/consensus/bcftools/`
+  - `*.consensus.fa`: Consensus fasta file generated by integrating the high allele-frequency variants called by iVar/BCFTools into the reference genome.
+  - `*.filtered.vcf.gz`: VCF file containing high allele-frequency variants (default: `>= 0.75`) that were integrated into the consensus sequence.
+  - `*.filtered.vcf.gz.tbi`: Variants VCF index file for high allele frequency variants.
+- `variants/<variant_caller>/consensus/bcftools/base_qc/`
+  - `*.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence.
+  - `*.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence.
+  - `*.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence.
+  - `*.N_density.pdf`: Plot showing density of N bases within the consensus sequence.
+  - `*.N_run.tsv`: File containing start positions and width of N bases in consensus sequence.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
+
-[Nextclade](https://github.com/nextstrain/nextclade) performs viral genome clade assignment, mutation calling and sequence quality checks for the consensus sequences generated in this pipeline. Similar to Pangolin, it has been used extensively during the COVID-19 pandemic. A [web application](https://clades.nextstrain.org/) also exists that allows users to upload genome sequences via a web browser.
+[BCFTools](http://samtools.github.io/bcftools/bcftools.html) is used in the variant calling and _de novo_ assembly steps of this pipeline to obtain basic statistics from the VCF output. It can also be used to generate a consensus sequence by integrating variant calls into the reference genome. In this pipeline, we use `samtools mpileup` to create a mask using low coverage positions, and `bedtools maskfasta` to mask the genome sequences based on these intervals. Finally, `bcftools consensus` is used to generate the consensus by projecting the high allele frequency variants onto the masked genome reference sequence.

-### Nanopore: ASCIIGenome
+### QUAST
+Output files

-- `<caller>/asciigenome/<sample>/`
-  - `*.pdf`: Individual variant screenshots with annotation tracks in PDF format.
+- `variants/<variant_caller>/consensus/<consensus_caller>/quast/`
+  - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
+**NB:** The value of `<consensus_caller>` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic').
-As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs.io/en/latest/) is a command-line genome browser that can be run from a terminal window and is solely based on ASCII characters. The closest program to ASCIIGenome is probably [samtools tview](http://www.htslib.org/doc/samtools-tview.html) but ASCIIGenome offers much more flexibility, similar to popular GUI viewers like the [IGV](https://software.broadinstitute.org/software/igv/) browser. We are using the batch processing mode of ASCIIGenome in this pipeline to generate individual screenshots for all of the variant sites reported for each sample in the VCF files. This is incredibly useful to be able to quickly QC the variants called by the pipeline without having to tediously load all of the relevant tracks into a conventional genome browser. Where possible, the BAM read alignments, VCF variant file, primer BED file and GFF annotation track will be represented in the screenshot for contextual purposes. The screenshot below shows a SNP called relative to the MN908947.3 SARS-CoV-2 reference genome that overlaps the ORF7a protein and the nCoV-2019_91_LEFT primer from the ARIC v3 protocol. - -
-*(Image: ASCIIGenome screenshot)*
+[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the consensus sequence across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. -### Nanopore: Variants long table +### Pangolin
+Output files

-- `<caller>/`
-  - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis.
+- `variants/<variant_caller>/consensus/<consensus_caller>/pangolin/`
+  - `*.pangolin.csv`: Lineage analysis results from Pangolin.

-**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
+**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
+**NB:** The value of `<consensus_caller>` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic').
+
-Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)).
-
-The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table are shown below:
-
-```bash
-SAMPLE,CHROM,POS,REF,ALT,FILTER,DP,REF_DP,ALT_DP,AF,GENE,EFFECT,HGVS_C,HGVS_P,HGVS_P_1LETTER,CALLER,LINEAGE
-SAMPLE1_PE,MN908947.3,241,C,T,PASS,489,4,483,0.99,orf1ab,upstream_gene_variant,c.-25C>T,.,.,ivar,B.1
-SAMPLE1_PE,MN908947.3,1875,C,T,PASS,92,62,29,0.32,orf1ab,missense_variant,c.1610C>T,p.Ala537Val,p.A537V,ivar,B.1
-SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.2772C>T,p.Phe924Phe,p.F924F,ivar,B.1
-SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1
-```
-
-## Nanopore: Workflow reporting
+Phylogenetic Assignment of Named Global Outbreak LINeages ([Pangolin](https://github.com/cov-lineages/pangolin)) has been used extensively during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. A [web application](https://pangolin.cog-uk.io/) also exists that allows users to upload genome sequences via a web browser to assign lineages to genome sequences of SARS-CoV-2, view descriptive characteristics of the assigned lineage(s), view the placement of the lineage in a phylogeny of global samples, and view the temporal and geographic distribution of the assigned lineage(s).

-### Nanopore: MultiQC
+### Nextclade
Output files -- `multiqc//` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `summary_variants_metrics_mqc.csv`: file containing a selection of read alignmnet and variant calling metrics. The same metrics will also be added to the top of the MultiQC report. - -
- -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) -Results generated by MultiQC collate pipeline QC from pycoQC, samtools, mosdepth, BCFTools, SnpEff and QUAST. -The default [`multiqc config file`](https://github.com/nf-core/viralrecon/blob/master/assets/multiqc_config_nanopore.yaml) has been written in a way in which to structure these QC metrics to make them more interpretable in the final report. -The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. -An example MultiQC report generated from a full-sized dataset can be viewed on the [nf-core website](https://nf-co.re/viralrecon/results). +- `variants/<variant_caller>/consensus/<consensus_caller>/nextclade/` + - `*.csv`: Analysis results from Nextclade containing genome clade assignment, mutation calling and sequence quality checks. -# Illumina: Pipeline overview +**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). +**NB:** The value of `<consensus_caller>` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic'). -- [Preprocessing](#illumina-preprocessing) - - [cat](#cat) - Merge re-sequenced FastQ files - - [FastQC](#fastqc) - Raw read QC - - [fastp](#fastp) - Adapter and quality trimming - - [Kraken 2](#kraken-2) - Removal/QC for host reads -- [Variant calling](#illumina-variant-calling) - - [Bowtie 2](#bowtie-2) - Read alignment relative to reference genome - - [SAMtools](#samtools) - Sort, index and generate metrics for alignments - - [iVar trim](#ivar-trim) - Primer sequence removal for amplicon data - - [picard MarkDuplicates](#picard-markduplicates) - Duplicate read marking and removal - - [picard CollectMultipleMetrics](#picard-collectmultiplemetrics) - Alignment metrics - - [mosdepth](#mosdepth) - Whole-genome and amplicon coverage metrics - - [iVar variants](#ivar-variants) _||_ [BCFTools call](#bcftools-call) - Variant calling - - [SnpEff and SnpSift](#snpeff-and-snpsift) - Genetic variant annotation and functional effect prediction - - [ASCIIGenome](#asciigenome) - Individual variant screenshots with annotation tracks - - [iVar consensus](#ivar-consensus) _||_ [BCFTools and BEDTools](#bcftools-and-bedtools) - Consensus sequence generation - - [QUAST](#quast) - Consensus assessment report - - [Pangolin](#pangolin) - Lineage analysis - - [Nextclade](#nextclade) - Clade assignment, mutation calling and sequence quality checks - - [Variants long table](#variants-long-table) - Collate per-sample information for individual variants, functional effect prediction and lineage analysis -- [De novo assembly](#illumina-de-novo-assembly) - - [Cutadapt](#cutadapt) - Primer trimming for amplicon data - - [SPAdes](#spades) _||_ [Unicycler](#unicycler) _||_ [minia](#minia) - Viral genome assembly - - [BLAST](#blast) - Blast to reference assembly - - [ABACAS](#abacas) - Order contigs according to reference genome - - [PlasmidID](#plasmidid) - Assembly report and visualisation - - [Assembly QUAST](#assembly-quast) - Assembly quality assessment -- [Workflow reporting and genomes](#illumina-workflow-reporting-and-genomes) - - [MultiQC](#multiqc) - Present QC for raw reads, alignment, assembly and variant calling - - [Reference genome files](#reference-genome-files) - Save
reference genome indices/files + -## Illumina: Preprocessing +[Nextclade](https://github.com/nextstrain/nextclade) performs viral genome clade assignment, mutation calling and sequence quality checks for the consensus sequences generated in this pipeline. Similar to Pangolin, it has been used extensively during the COVID-19 pandemic. A [web application](https://clades.nextstrain.org/) also exists that allows users to upload genome sequences via a web browser. -### cat +### Variants long table
Output files -- `fastq/` - - `*.merged.fastq.gz`: These files are not saved by default but can be via a custom config file such as the one below. +- `variants/<variant_caller>/` + - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis. -```nextflow -params { - modules { - 'illumina_cat_fastq' { - publish_files = null - } - } -} -``` +**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic').
-If multiple libraries/runs have been provided for the same sample in the input samplesheet (e.g. to increase sequencing depth) then these will be merged at the very beginning of the pipeline in order to have consistent sample naming throughout the pipeline. Please refer to the [usage documentation](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) to see how to specify these samples in the input samplesheet. - -### FastQC - -
-Output files - -- `fastqc/raw/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)). -**NB:** The FastQC plots in this directory are generated relative to the raw, input reads. They may contain adapter sequence and regions of low quality. To see how your reads look after trimming please refer to the FastQC reports in the `fastqc/trim/` directory. +The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table is shown below: -
+```bash +SAMPLE,CHROM,POS,REF,ALT,FILTER,DP,REF_DP,ALT_DP,AF,GENE,EFFECT,HGVS_C,HGVS_P,HGVS_P_1LETTER,CALLER,LINEAGE +SAMPLE1_PE,MN908947.3,241,C,T,PASS,489,4,483,0.99,orf1ab,upstream_gene_variant,c.-25C>T,.,.,ivar,B.1 +SAMPLE1_PE,MN908947.3,1875,C,T,PASS,92,62,29,0.32,orf1ab,missense_variant,c.1610C>T,p.Ala537Val,p.A537V,ivar,B.1 +SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.2772C>T,p.Phe924Phe,p.F924F,ivar,B.1 +SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1 +``` -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +## Illumina: De novo assembly -![MultiQC - FastQC per base sequence plot](images/mqc_fastqc_plot.png) +A file called `summary_assembly_metrics_mqc.csv` containing a selection of read alignment and _de novo_ assembly related metrics will be saved in the `multiqc/` results directory. The same metrics will also be added to the top of the MultiQC report. -### fastp +### Cutadapt
Output files -- `fastp/` - - `*.fastp.html`: Trimming report in html format. - - `*.fastp.json`: Trimming report in json format. -- `fastp/log/` - - `*.fastp.log`: Trimming log file. -- `fastqc/trim/` +- `assembly/cutadapt/log/` + - `*.cutadapt.log`: Cutadapt log file generated from stdout. +- `assembly/cutadapt/fastqc/` - `*_fastqc.html`: FastQC report of the trimmed reads. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + - `*_fastqc.zip`: Zip archive containing the FastQC report.
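+As a point of reference for the primer-trimming step described below, a Cutadapt invocation for one paired-end sample might look like this (a sketch; the primer FASTA and read file names are illustrative assumptions):
+
+```bash
+# Trim primer sequences from both reads using 5' adapters supplied in a FASTA file (illustrative names).
+cutadapt \
+    -g file:primers.fasta -G file:primers.fasta \
+    -o SAMPLE1_1.trim.fastq.gz -p SAMPLE1_2.trim.fastq.gz \
+    SAMPLE1_1.fastq.gz SAMPLE1_2.fastq.gz > SAMPLE1.cutadapt.log
+```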
-[fastp](https://github.com/OpenGene/fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. It has been developed in C++ with multithreading support to achieve higher performance. fastp is used in this pipeline for standard adapter trimming and quality filtering. +In the variant calling branch of the pipeline we are using [iVar trim](#ivar-trim) to remove primer sequences from the aligned BAM files for amplicon data. Since in the _de novo_ assembly branch we don't align the reads, we use [Cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) as an alternative option to remove and clean the primer sequences directly from FastQ files. -![MultiQC - fastp filtered reads plot](images/mqc_fastp_plot.png) +![MultiQC - Cutadapt filtered reads plot](images/mqc_cutadapt_plot.png) -### Kraken 2 +### SPAdes
Output files -- `kraken2/` - - `*.kraken2.report.txt`: Kraken 2 taxonomic report. See [here](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual#sample-report-output-format) for a detailed description of the format. - -
- -[Kraken 2](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual) is a sequence classifier that assigns taxonomic labels to DNA sequences. Kraken 2 examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer. - -We use a Kraken 2 database in this workflow to filter out reads specific to the host genome before performing the _de novo_ assembly steps in the pipeline. This filtering is not performed in the variant calling arm of the pipeline by default but Kraken 2 is still run to obtain an estimate of host reads, however, the filtering can be amended via the `--kraken2_variants_host_filter` parameter. - -![MultiQC - Kraken 2 classification plot](images/mqc_kraken2_plot.png) - -## Illumina: Variant calling - -A file called `summary_variants_metrics_mqc.csv` containing a selection of read alignment and variant calling metrics will be saved in the `multiqc/` results directory. The same metrics will also be added to the top of the MultiQC report. - -### Bowtie 2 - -
-Output files +- `assembly/spades/<spades_mode>/` + - `*.scaffolds.fa.gz`: SPAdes scaffold assembly. + - `*.contigs.fa.gz`: SPAdes assembly contigs. + - `*.assembly.gfa.gz`: SPAdes assembly graph in [GFA](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) format. +- `assembly/spades/<spades_mode>/bandage/` + - `*.png`: Bandage visualisation for SPAdes assembly graph in PNG format. + - `*.svg`: Bandage visualisation for SPAdes assembly graph in SVG format. -- `variants/bowtie2/log/` - - `*.bowtie2.log`: Bowtie 2 mapping log file. +**NB:** The value of `<spades_mode>` in the output directory name above is determined by the `--spades_mode` parameter (Default: 'rnaviral').
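+For orientation, an assembly in the default 'rnaviral' mode corresponds to a SPAdes call along these lines (a sketch; read file names are illustrative):
+
+```bash
+# De novo assembly of one sample with SPAdes in rnaviral mode (illustrative names).
+spades.py --rnaviral \
+    -1 SAMPLE1_1.trim.fastq.gz -2 SAMPLE1_2.trim.fastq.gz \
+    -o SAMPLE1_spades
+```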
-[Bowtie 2](http://bio-bwa.sourceforge.net/) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. Bowtie 2 supports gapped, local, and paired-end alignment modes. +[SPAdes](http://cab.spbu.ru/software/spades/) is an assembly toolkit containing various assembly pipelines. Generally speaking, SPAdes is one of the most popular de Bruijn graph-based assembly algorithms used for bacterial/viral genome reconstruction. -![MultiQC - Bowtie2 alignment score plot](images/mqc_bowtie2_plot.png) +[Bandage](https://rrwick.github.io/Bandage/) is a program for visualising _de novo_ assembly graphs. By displaying connections which are not present in the contigs file, Bandage opens up new possibilities for analysing _de novo_ assemblies. -### SAMtools +### Unicycler
Output files -- `variants/bowtie2/` - - `.sorted.bam`: Coordinate sorted BAM file containing read alignment information. - - `.sorted.bam.bai`: Index file for coordinate sorted BAM file. -- `variants/bowtie2/samtools_stats/` - - SAMtools `.sorted.bam.flagstat`, `.sorted.bam.idxstats` and `.sorted.bam.stats` files generated from the alignment files. +- `assembly/unicycler/` + - `*.scaffolds.fa.gz`: Unicycler scaffold assembly. + - `*.assembly.gfa.gz`: Unicycler assembly graph in GFA format. +- `assembly/unicycler/bandage/` + - `*.png`: Bandage visualisation for Unicycler assembly graph in PNG format. + - `*.svg`: Bandage visualisation for Unicycler assembly graph in SVG format.
-Bowtie 2 BAM files are further processed with [SAMtools](http://samtools.sourceforge.net/) to sort them by coordinate, for indexing, as well as to generate read mapping statistics. - -![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_stats_plot.png) +[Unicycler](https://github.com/rrwick/Unicycler) is an assembly pipeline for bacterial genomes. It can assemble Illumina-only read sets where it functions as a SPAdes-optimiser. -### iVar trim +### minia
Output files -- `variants/bowtie2/` - - `*.ivar_trim.sorted.bam`: Coordinate sorted BAM file after primer trimming. - - `*.ivar_trim.sorted.bam.bai`: Index file for coordinate sorted BAM file after primer trimming. -- `variants/bowtie2/samtools_stats/` - - SAMtools `*.ivar_trim.sorted.bam.flagstat`, `*.ivar_trim.sorted.bam.idxstats` and `*.ivar_trim.sorted.bam.stats` files generated from the primer trimmed alignment files. -- `variants/bowtie2/log/` - - `*.ivar_trim.ivar.log`: iVar trim log file obtained from stdout. +- `assembly/minia/` + - `*.contigs.fa`: Minia scaffold assembly. + - `*.unitigs.fa`: Minia unitigs fasta file. + - `*.h5`: Minia h5 output file.
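+A minimal sketch of a corresponding Minia call, assuming GATB-style options (`-in`, `-kmer-size`, `-out`); the exact options and k-mer size used by the pipeline may differ, and file names are illustrative:
+
+```bash
+# Assemble one sample with Minia using a k-mer size of 31 (illustrative names and options).
+minia -in SAMPLE1_1.trim.fastq.gz,SAMPLE1_2.trim.fastq.gz -kmer-size 31 -out SAMPLE1.minia
+```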
-If the `--protocol amplicon` parameter is provided then [iVar](http://gensoft.pasteur.fr/docs/ivar/1.0/manualpage.html) is used to trim amplicon primer sequences from the aligned reads. iVar uses the primer positions supplied in `--primer_bed` to soft clip primer sequences from a coordinate sorted BAM file. +[Minia](https://github.com/GATB/minia) is a short-read assembler based on a de Bruijn graph, capable of assembling a human genome on a desktop computer in a day. The output of Minia is a set of contigs. Minia produces results of similar contiguity and accuracy to other de Bruijn assemblers. -### picard MarkDuplicates +### BLAST
Output files -- `variants/bowtie2/` - - `*.markduplicates.sorted.bam`: Coordinate sorted BAM file after duplicate marking. - - `*.markduplicates.sorted.bam.bai`: Index file for coordinate sorted BAM file after duplicate marking. -- `variants/bowtie2/samtools_stats/` - - SAMtools `*.markduplicates.sorted.bam.flagstat`, `*.markduplicates.sorted.bam.idxstats` and `*.markduplicates.sorted.bam.stats` files generated from the duplicate marked alignment files. -- `variants/bowtie2/picard_metrics/` - - `*.markduplicates.sorted.MarkDuplicates.metrics.txt`: Metrics file from MarkDuplicates. - -
- -Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. [picard MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-) isn't run by default because you anticipate high levels of duplication with viral data due to the size of the genome, however, you can activate it by adding `--skip_markduplicates false` to the command you use to run the pipeline. This will only _mark_ the duplicate reads identified amongst the alignments to allow you to guage the overall level of duplication in your samples. You can also choose to remove any reads identified as duplicates via the `--filter_duplicates` parameter. - -![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_duplicates_plot.png) - -### picard CollectMultipleMetrics - -
-Output files +- `assembly/<assembler>/blastn/` + - `*.blastn.txt`: BLAST results against the target virus. + - `*.filter.blastn.txt`: Filtered BLAST results. -- `variants/bowtie2/picard_metrics/` - - `*.CollectMultipleMetrics.*`: Alignment QC files from picard CollectMultipleMetrics in `*_metrics` textual format. -- `variants/bowtie2/picard_metrics/pdf/` - - `*.pdf` plots for metrics obtained from CollectMultipleMetrics. +**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades').
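+The underlying comparison is a standard nucleotide BLAST, roughly along these lines (a sketch; file names are illustrative):
+
+```bash
+# Build a nucleotide BLAST database from the reference and align the assembled contigs against it.
+makeblastdb -in reference.fasta -dbtype nucl
+blastn -query SAMPLE1.contigs.fa -db reference.fasta -outfmt 6 -out SAMPLE1.blastn.txt
+```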
-[picard-tools](https://broadinstitute.github.io/picard/command-line-overview.html) is a set of command-line tools for manipulating high-throughput sequencing data. We use picard-tools in this pipeline to obtain mapping and coverage metrics. - -![MultiQC - Picard insert size plot](images/mqc_picard_insert_size_plot.png) +[blastn](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch) is used to align the assembled contigs against the virus reference genome. -### mosdepth +### ABACAS
Output files -- `variants/bowtie2/mosdepth/genome/` - - `all_samples.mosdepth.coverage.tsv`: File aggregating genome-wide coverage values across all samples used for plotting. - - `*.mosdepth.coverage.pdf`: Whole-genome coverage plot. - - `*.mosdepth.coverage.tsv`: File containing coverage values for the above plot. - - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. -- `variants/bowtie2/mosdepth/amplicon/` - - `all_samples.mosdepth.coverage.tsv`: File aggregating per-amplicon coverage values across all samples used for plotting. - - `all_samples.mosdepth.heatmap.pdf`: Heatmap showing per-amplicon coverage across all samples. - - `*.mosdepth.coverage.pdf`: Bar plot showing per-amplicon coverage for an individual sample. - - `*.mosdepth.coverage.tsv`: File containing per-amplicon coverage values for the above plot. - - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. - -
- -[mosdepth](mosdepth) is a fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. mosdepth is used in this pipeline to obtain genome-wide coverage values in 200bp windows and for `--protocol amplicon` to obtain amplicon/region-specific coverage metrics. The results are then either rendered in MultiQC (genome-wide coverage) or are plotted using custom `R` scripts. - -![R - Samples amplicon coverage heatmap ](images/r_amplicon_heatmap.png) - -![R - Sample genome-wide coverage plot](images/r_genome_coverage.png) - -

*R - Sample per-amplicon coverage plot*

- -### iVar variants - -
-Output files +- `assembly/<assembler>/abacas/` + - `*.abacas.bin`: Bin file that contains contigs that are not used in ordering. + - `*.abacas.crunch`: Comparison file. + - `*.abacas.fasta`: Ordered and orientated sequence file. + - `*.abacas.gaps`: Gap information. + - `*.abacas.gaps.tab`: Gap information in tab-delimited format. + - `*.abacas.MULTIFASTA.fa`: A list of ordered and orientated contigs in a multi-fasta format. + - `*.abacas.tab`: Feature file. + - `*.unused_contigs.out`: Information on contigs that have mapping information but could not be used in the ordering. +- `assembly/<assembler>/abacas/nucmer/`: Folder containing the files generated by the NUCmer algorithm used by ABACAS. -- `variants/ivar/` - - `*.tsv`: Original iVar variants in TSV format. - - `*.vcf.gz`: iVar variants in VCF format. Converted using custom `ivar_variants_to_vcf.py` python script. - - `*.vcf.gz.tbi`: iVar variants VCF index file. -- `variants/ivar/log/` - - `*.variant_counts.log`: Counts for type of variants called by iVar. -- `variants/ivar/bcftools_stats/` - - `*.bcftools_stats.txt`: Statistics and counts obtained from iVar variants VCF file. +**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades').
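+A minimal sketch of an equivalent ABACAS call using NUCmer for the comparison step (the script name and flags follow the ABACAS manual but should be treated as assumptions; file names are illustrative):
+
+```bash
+# Order and orientate contigs against the reference using NUCmer (illustrative names).
+# The ABACAS script name varies by install (e.g. abacas.1.3.1.pl).
+abacas.pl -r reference.fasta -q SAMPLE1.contigs.fa -p nucmer -m -o SAMPLE1.abacas
+```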
-[iVar](https://github.com/andersen-lab/ivar/blob/master/docs/MANUAL.md) is a computational package that contains functions broadly useful for viral amplicon-based sequencing. We use iVar in this pipeline to [trim primer sequences](#ivar-trim) for amplicon input data as well as to call variants. - -iVar outputs a tsv format which is not compatible with downstream analysis such as annotation using SnpEff. Moreover some issues need to be addressed such as [strand-bias filtering](https://github.com/andersen-lab/ivar/issues/5) and [the consecutive reporting of variants belonging to the same codon](https://github.com/andersen-lab/ivar/issues/92). This pipeline uses a custom Python script [ivar_variants_to_vcf.py](https://github.com/nf-core/viralrecon/blob/master/bin/ivar_variants_to_vcf.py) to convert the default iVar output to VCF whilst also addressing both of these issues. - -![MultiQC - iVar variants called plot](images/mqc_ivar_variants_plot.png) +[ABACAS](https://www.sanger.ac.uk/science/tools/pagit) was developed to rapidly contiguate (align, order, orientate), visualize and design primers to close gaps on shotgun assembled contigs based on a reference sequence. -### BCFTools call +### PlasmidID
Output files -- `variants/bcftools/` - - `*.vcf.gz`: Variants VCF file. - - `*.vcf.gz.tbi`: Variants VCF index file. -- `variants/bcftools/bcftools_stats/` - - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file. +- `assembly/<assembler>/plasmidid/<sample>/` + - `*_final_results.html`: Summary file with reference coverage stats and contigs for visualization. + - `*_final_results.tab`: Summary file with reference coverage stats and contigs. + - `images/_.png`: PNG file with the visualization of the alignment between the viral assembly and the reference viral genome. + - `logs/`: Log files. -
+**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). -[BCFtools](http://samtools.github.io/bcftools/bcftools.html) can be used to call variants directly from BAM alignment files. It is a set of utilities that manipulate variant calls in [VCF](https://vcftools.github.io/specs.html) and its binary counterpart BCF format. BCFTools is used in the variant calling and _de novo_ assembly steps of this pipeline to obtain basic statistics from the VCF output. + -![MultiQC - BCFTools variant counts](images/mqc_bcftools_stats_plot.png) +[PlasmidID](https://github.com/BU-ISCIII/plasmidID) was used to graphically represent the alignment of the reference genome relative to a given assembly. This helps to visualize the coverage of the reference genome in the assembly. To find more information about the output files refer to the [documentation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track). -### SnpEff and SnpSift +### Assembly QUAST
Output files -- `variants/<variant_caller>/snpeff/` - - `*.snpeff.csv`: Variant annotation csv file. - - `*.snpeff.genes.txt`: Gene table for annotated variants. - - `*.snpeff.summary.html`: Summary html file for variants. - - `*.snpeff.vcf.gz`: VCF file with variant annotations. - - `*.snpeff.vcf.gz.tbi`: Index for VCF file with variant annotations. - - `*.snpsift.txt`: SnpSift summary table. -- `variants/<variant_caller>/snpeff/bcftools_stats/` - - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file. +- `assembly/<assembler>/quast/` + - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. -**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). +**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades').
-[SnpEff](http://snpeff.sourceforge.net/SnpEff.html) is a genetic variant annotation and functional effect prediction toolbox. It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). +[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the _de novo_ assemblies across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. -[SnpSift](http://snpeff.sourceforge.net/SnpSift.html) annotates genomic variants using databases, filters, and manipulates genomic annotated variants. After annotation with SnpEff, you can use SnpSift to help filter large genomic datasets in order to find the most significant variants. +![MultiQC - QUAST contig counts](images/mqc_quast_plot.png) -![MultiQC - SnpEff annotation counts](images/mqc_snpeff_plot.png) +## Illumina: Workflow reporting and genomes -### ASCIIGenome +### MultiQC
Output files -- `variants//asciigenome//` - - `*.pdf`: Individual variant screenshots with annotation tracks in PDF format. - -**NB:** The value of `` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `summary_variants_metrics_mqc.csv`: file containing a selection of read alignment and variant calling metrics. The same metrics will also be added to the top of the MultiQC report. + - `summary_assembly_metrics_mqc.csv`: file containing a selection of read alignment and _de novo_ assembly related metrics. The same metrics will also be added to the top of the MultiQC report.
-As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs.io/en/latest/) is a command-line genome browser that can be run from a terminal window and is solely based on ASCII characters. The closest program to ASCIIGenome is probably [samtools tview](http://www.htslib.org/doc/samtools-tview.html) but ASCIIGenome offers much more flexibility, similar to popular GUI viewers like the [IGV](https://software.broadinstitute.org/software/igv/) browser. We are using the batch processing mode of ASCIIGenome in this pipeline to generate individual screenshots for all of the variant sites reported for each sample in the VCF files. This is incredibly useful to be able to quickly QC the variants called by the pipeline without having to tediously load all of the relevant tracks into a conventional genome browser. Where possible, the BAM read alignments, VCF variant file, primer BED file and GFF annotation track will be represented in the screenshot for contextual purposes. The screenshot below shows a SNP called relative to the MN908947.3 SARS-CoV-2 reference genome that overlaps the ORF7a protein and the nCoV-2019_91_LEFT primer from the ARIC v3 protocol. +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. -

*ASCIIGenome screenshot*

+Results generated by MultiQC collate pipeline QC from FastQC, fastp, Cutadapt, Bowtie 2, Kraken 2, samtools, picard CollectMultipleMetrics, BCFTools, SnpEff and QUAST. -### iVar consensus +The default [`multiqc config file`](https://github.com/nf-core/viralrecon/blob/master/assets/multiqc_config_illumina.yaml) has been written in a way in which to structure these QC metrics to make them more interpretable in the final report. -
-Output files +The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. -- `variants/<variant_caller>/consensus/ivar/` - - `*.consensus.fa`: Consensus Fasta file generated by iVar. - - `*.consensus.qual.txt`: File with the average quality of each base in the consensus sequence. -- `variants/<variant_caller>/consensus/ivar/base_qc/` - - `*.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence. - - `*.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence. - - `*.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence. - - `*.N_density.pdf`: Plot showing density of N bases within the consensus sequence. - - `*.N_run.tsv`: File containing start positions and width of N bases in consensus sequence. +An example MultiQC report generated from a full-sized dataset can be viewed on the [nf-core website](https://nf-co.re/viralrecon/results). -**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). -
+- [nf-core/viralrecon Description](#nf-coreviralrecon-description) +- [Illumina: Pipeline overview](#illumina-pipeline-overview) + - [Illumina: Preprocessing](#illumina-preprocessing) + - [cat](#cat) + - [FastQC](#fastqc) + - [fastp](#fastp) + - [Kraken 2](#kraken-2) + - [Illumina: Variant calling](#illumina-variant-calling) + - [Bowtie 2](#bowtie-2) + - [SAMtools](#samtools) + - [iVar trim](#ivar-trim) + - [picard MarkDuplicates](#picard-markduplicates) + - [picard CollectMultipleMetrics](#picard-collectmultiplemetrics) + - [mosdepth](#mosdepth) + - [iVar variants](#ivar-variants) + - [BCFTools call](#bcftools-call) + - [SnpEff and SnpSift](#snpeff-and-snpsift) + - [ASCIIGenome](#asciigenome) + - [iVar consensus](#ivar-consensus) + - [BCFTools and BEDTools](#bcftools-and-bedtools) + - [QUAST](#quast) + - [Pangolin](#pangolin) + - [Nextclade](#nextclade) + - [Variants long table](#variants-long-table) + - [Illumina: De novo assembly](#illumina-de-novo-assembly) + - [Cutadapt](#cutadapt) + - [SPAdes](#spades) + - [Unicycler](#unicycler) + - [minia](#minia) + - [BLAST](#blast) + - [ABACAS](#abacas) + - [PlasmidID](#plasmidid) + - [Assembly QUAST](#assembly-quast) + - [Illumina: Workflow reporting and genomes](#illumina-workflow-reporting-and-genomes) + - [MultiQC](#multiqc) +- [Nanopore: Pipeline overview](#nanopore-pipeline-overview) + - [Nanopore: Preprocessing](#nanopore-preprocessing) + - [Nanopore: pycoQC](#nanopore-pycoqc) + - [Nanopore: artic guppyplex](#nanopore-artic-guppyplex) + - [Nanopore: NanoPlot](#nanopore-nanoplot) + - [Nanopore: Variant calling](#nanopore-variant-calling) + - [Nanopore: artic minion](#nanopore-artic-minion) + - [Nanopore: Downstream analysis](#nanopore-downstream-analysis) + - [Nanopore: SAMtools](#nanopore-samtools) + - [Nanopore: mosdepth](#nanopore-mosdepth) + - [Nanopore: BCFTools](#nanopore-bcftools) + - [Nanopore: SnpEff and SnpSift](#nanopore-snpeff-and-snpsift) + - [Nanopore: QUAST](#nanopore-quast) + - [Nanopore: Pangolin](#nanopore-pangolin) + - [Nanopore: Nextclade](#nanopore-nextclade) + - [Nanopore: ASCIIGenome](#nanopore-asciigenome) + - [Nanopore: Variants long table](#nanopore-variants-long-table) + - [Nanopore: Workflow reporting](#nanopore-workflow-reporting) + - [Nanopore: MultiQC](#nanopore-multiqc) + - [Reference genome files](#reference-genome-files) +- [Pipeline information](#pipeline-information) -As described in the [iVar variants](#ivar-variants) section, iVar can be used in this pipeline to call variants and for the consensus sequence generation. +## Nanopore: Preprocessing -### BCFTools and BEDTools +A file called `summary_variants_metrics_mqc.csv` containing a selection of read alignment and variant calling metrics will be saved in the `multiqc//` output directory which is determined by the `--artic_minion_caller` parameter (Default: `nanopolish/`). The same metrics will also be added to the top of the MultiQC report. -
-Output files +### Nanopore: pycoQC -- `variants//consensus/bcftools/` - - `*.consensus.fa`: Consensus fasta file generated by integrating the high allele-frequency variants called by iVar/BCFTools into the reference genome. - - `*.filtered.vcf.gz`: VCF file containing high allele-frequency variants (default: `>= 0.75`) that were integrated into the consensus sequence. - - `*.filtered.vcf.gz.tbi`: Variants VCF index file for high allele frequency variants. -- `variants//consensus/bcftools/base_qc/` - - `*.ACTG_density.pdf`: Plot showing density of ACGT bases within the consensus sequence. - - `*.base_counts.pdf`: Plot showing frequency and percentages of all bases in consensus sequence. - - `*.base_counts.tsv`: File containing frequency and percentages of all bases in consensus sequence. - - `*.N_density.pdf`: Plot showing density of N bases within the consensus sequence. - - `*.N_run.tsv`: File containing start positions and width of N bases in consensus sequence. +
+Output files -**NB:** The value of `` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). +- `pycoqc/` + - `*.html` and `.json` file that includes a run summary and graphical representation of various QC metrics including distribution of read length, distribution of read quality scores, mean read quality per sequence length, output per channel over experiment time and percentage of reads per barcode.
-[BCFTools](http://samtools.github.io/bcftools/bcftools.html) is used in the variant calling and _de novo_ assembly steps of this pipeline to obtain basic statistics from the VCF output. It can also used be used to generate a consensus sequence by integrating variant calls into the reference genome. In this pipeline, we use `samtools mpileup` to create a mask using low coverage positions, and `bedtools maskfasta` to mask the genome sequences based on these intervals. Finally, `bcftools consensus` is used to generate the consensus by projecting the high allele frequency variants onto the masked genome reference sequence. +[PycoQC](https://github.com/a-slide/pycoQC) computes metrics and generates QC plots using the sequencing summary information generated by basecalling/demultiplexing tools such as Guppy, e.g. distribution of read length, read length over time, number of reads per barcode and other general stats. -### QUAST +

*PycoQC - Number of reads per barcode*
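+For reference, a report like the one above can be generated directly from the basecaller's sequencing summary (a sketch; file names are illustrative):
+
+```bash
+# Build HTML and JSON QC reports from a Guppy sequencing summary (illustrative names).
+pycoQC -f sequencing_summary.txt -o SAMPLE1.pycoqc.html -j SAMPLE1.pycoqc.json
+```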

+ +### Nanopore: artic guppyplex
Output files -- `variants/<variant_caller>/consensus/<consensus_caller>/quast/` - - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. +- `guppyplex/` + - `*.fastq.gz` files generated by aggregating pre-demultiplexed reads from MinKNOW/Guppy. These files are not saved by default but can be via a custom config file such as the one below. -**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). -**NB:** The value of `<consensus_caller>` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic'). +```nextflow +params { + modules { + 'nanopore_artic_guppyplex' { + publish_files = ['fastq.gz':''] + } + } +} +```
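+The per-barcode aggregation and length filtering described below boils down to a command along these lines (a sketch; directory and file names are illustrative, and the length bounds shown are the pipeline defaults):
+
+```bash
+# Aggregate and length-filter reads for one barcode (illustrative names).
+artic guppyplex \
+    --min-length 400 --max-length 700 \
+    --directory fastq_pass/barcode01 \
+    --output barcode01.fastq
+```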
-[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the consensus sequence across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. +The [artic guppyplex](https://artic.readthedocs.io/en/latest/commands/) tool from the [ARTIC field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics) is used to perform length filtering of the demultiplexed Nanopore reads obtained per barcode. This essentially filters out chimeric reads that may be generated by the ARTIC protocol. The pipeline uses a default minimum and maximum read length of 400 and 700, respectively, as tailored for the [nCoV-2019 primer set](https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html). However, you may need to adjust these for other primer schemes, e.g. by setting `--min-length` to the minimum amplicon length and `--max-length` to the maximum amplicon length plus 200. -### Pangolin +### Nanopore: NanoPlot
Output files -- `variants//consensus//pangolin/` - - `*.pangolin.csv`: Lineage analysis results from Pangolin. - -**NB:** The value of `` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). -**NB:** The value of `` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic'). +- `nanoplot//` + - Per-sample `*.html` files for QC metrics and individual `*.png` image files for plots.
-Phylogenetic Assignment of Named Global Outbreak LINeages ([Pangolin](https://github.com/cov-lineages/pangolin)) has been used extensively during the COVID-19 pandemic in order to to assign lineages to SARS-CoV-2 genome sequenced samples. A [web application](https://pangolin.cog-uk.io/) also exists that allows users to upload genome sequences via a web browser to assign lineages to genome sequences of SARS-CoV-2, view descriptive characteristics of the assigned lineage(s), view the placement of the lineage in a phylogeny of global samples, and view the temporal and geographic distribution of the assigned lineage(s). +[NanoPlot](https://github.com/wdecoster/NanoPlot) it a tool that can be used to produce general quality metrics from various Nanopore-based input files including fastq files e.g. quality score distribution, read lengths and other general stats. -### Nextclade +

*Nanoplot - Read quality vs read length*

+ +## Nanopore: Variant calling + +### Nanopore: artic minion
Output files -- `variants/<variant_caller>/consensus/<consensus_caller>/nextclade/` - - `*.csv`: Analysis results from Nextlade containing genome clade assignment, mutation calling and sequence quality checks. +- `<caller>/` + - `*.consensus.fasta`: Consensus fasta file generated by artic minion. + - `*.pass.unique.vcf.gz`: VCF file containing unique variants passing quality filters. + - `*.pass.unique.vcf.gz.tbi`: VCF index file containing unique variants passing quality filters. + - `*.pass.vcf.gz`: VCF file containing variants passing quality filters. + - `*.pass.vcf.gz.tbi`: VCF index file containing variants passing quality filters. + - `*.primers.vcf`: VCF file containing variants found in primer-binding regions. + - `*.merged.vcf`: VCF file containing all detected variants. + - `*.fail.vcf`: VCF file containing variants failing quality filters. + - `*.sorted.bam`: BAM file generated by initial alignment. + - `*.sorted.bam.bai`: BAM index file generated by initial alignment. + - `*.trimmed.rg.sorted.bam`: BAM file without primer-binding site trimming. + - `*.trimmed.rg.sorted.bam.bai`: BAM index file without primer-binding site trimming. + - `*.primertrimmed.rg.sorted.bam`: BAM file generated after primer-binding site trimming. + - `*.primertrimmed.rg.sorted.bam.bai`: BAM index file generated after primer-binding site trimming. -**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). -**NB:** The value of `<consensus_caller>` in the output directory name above is determined by the `--consensus_caller` parameter (Default: 'bcftools' for both '--protocol amplicon' and '--protocol metagenomic'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
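+The aligner and variant caller used at this step are selected from the pipeline command line; an illustrative invocation switching to Medaka might look as follows (a sketch; the sample sheet, directories and Medaka model are assumptions for the example):
+
+```bash
+nextflow run nf-core/viralrecon \
+    --input samplesheet.csv \
+    --platform nanopore \
+    --genome 'MN908947.3' \
+    --fastq_dir fastq_pass/ \
+    --artic_minion_caller medaka \
+    --artic_minion_medaka_model r941_min_high_g360 \
+    -profile docker
+```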
-[Nextclade](https://github.com/nextstrain/nextclade) performs viral genome clade assignment, mutation calling and sequence quality checks for the consensus sequences generated in this pipeline. Similar to Pangolin, it has been used extensively during the COVID-19 pandemic. A [web application](https://clades.nextstrain.org/) also exists that allows users to upload genome sequences via a web browser. +The [artic minion](https://artic.readthedocs.io/en/latest/commands/) tool from the [ARTIC field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics) is used to align reads, call variants and generate the consensus sequence. By default, artic minion uses [Minimap2](https://github.com/lh3/minimap2) to align the reads to the viral genome; however, you can use [BWA](https://github.com/lh3/bwa) instead via the `--artic_minion_aligner bwa` parameter. Similarly, the default variant caller used by artic minion is [Nanopolish](https://github.com/jts/nanopolish); however, you can use [Medaka](https://github.com/nanoporetech/medaka) instead via the `--artic_minion_caller medaka` parameter. Medaka is faster than Nanopolish and performs comparably; it can also be run directly from `fastq` input files, whereas Nanopolish additionally requires the `fast5` and `sequencing_summary.txt` files. You must provide the appropriate [Medaka model](https://github.com/nanoporetech/medaka#models) via the `--artic_minion_medaka_model` parameter if using `--artic_minion_caller medaka`. -### Variants long table +## Nanopore: Downstream analysis + +### Nanopore: SAMtools
Output files -- `variants/<variant_caller>/` - - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis. +- `<caller>/` + - `*.mapped.sorted.bam`: Coordinate sorted BAM file containing read alignment information. + - `*.mapped.sorted.bam.bai`: Index file for coordinate sorted BAM file. +- `<caller>/samtools_stats/` + - SAMtools `*.mapped.sorted.bam.flagstat`, `*.mapped.sorted.bam.idxstats` and `*.mapped.sorted.bam.stats` files generated from the alignment files. -**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--variant_caller` parameter (Default: 'ivar' for '--protocol amplicon' and 'bcftools' for '--protocol metagenomic'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
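+For reference, the filtering and statistics described in this section map onto standard SAMtools commands along these lines (a sketch; file names are illustrative):
+
+```bash
+# Drop unmapped reads (-F 4), then index and summarise the result (illustrative names).
+samtools view -b -F 4 SAMPLE1.sorted.bam > SAMPLE1.mapped.sorted.bam
+samtools index SAMPLE1.mapped.sorted.bam
+samtools flagstat SAMPLE1.mapped.sorted.bam > SAMPLE1.mapped.sorted.bam.flagstat
+```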
-Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)). - -The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table are shown below: - -```bash -SAMPLE,CHROM,POS,REF,ALT,FILTER,DP,REF_DP,ALT_DP,AF,GENE,EFFECT,HGVS_C,HGVS_P,HGVS_P_1LETTER,CALLER,LINEAGE -SAMPLE1_PE,MN908947.3,241,C,T,PASS,489,4,483,0.99,orf1ab,upstream_gene_variant,c.-25C>T,.,.,ivar,B.1 -SAMPLE1_PE,MN908947.3,1875,C,T,PASS,92,62,29,0.32,orf1ab,missense_variant,c.1610C>T,p.Ala537Val,p.A537V,ivar,B.1 -SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.2772C>T,p.Phe924Phe,p.F924F,ivar,B.1 -SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1 -``` - -## Illumina: De novo assembly +BAM files containing the original alignments from either Minimap2 or BWA are further processed with [SAMtools](http://samtools.sourceforge.net/) to remove unmapped reads as well as to generate read mapping statistics. -A file called `summary_assembly_metrics_mqc.csv` containing a selection of read alignment and _de novo_ assembly related metrics will be saved in the `multiqc/` results directory. The same metrics will also be added to the top of the MultiQC report. +![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_stats_plot.png) -### Cutadapt +### Nanopore: mosdepth
Output files -- `assembly/cutadapt/log/` - - `*.cutadapt.log`: Cutadapt log file generated from stdout. -- `assembly/cutadapt/fastqc/` - - `*_fastqc.html`: FastQC report of the trimmed reads. - - `*_fastqc.zip`: Zip archive containing the FastQC report. +- `<caller>/mosdepth/genome/` + - `all_samples.mosdepth.coverage.tsv`: File aggregating genome-wide coverage values across all samples used for plotting. + - `*.mosdepth.coverage.pdf`: Whole-genome coverage plot. + - `*.mosdepth.coverage.tsv`: File containing coverage values for the above plot. + - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. +- `<caller>/mosdepth/amplicon/` + - `all_samples.mosdepth.coverage.tsv`: File aggregating per-amplicon coverage values across all samples used for plotting. + - `all_samples.mosdepth.heatmap.pdf`: Heatmap showing per-amplicon coverage across all samples. + - `*.mosdepth.coverage.pdf`: Bar plot showing per-amplicon coverage for an individual sample. + - `*.mosdepth.coverage.tsv`: File containing per-amplicon coverage values for the above plot. + - `*.mosdepth.summary.txt`: Summary metrics including mean, min and max coverage values. + +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
-In the variant calling branch of the pipeline we are using [iVar trim](#ivar-trim) to remove primer sequences from the aligned BAM files for amplicon data. Since in the _de novo_ assembly branch we don't align the reads, we use [Cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) as an alternative option to remove and clean the primer sequences directly from FastQ files. +[mosdepth](https://github.com/brentp/mosdepth) is a fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. mosdepth is used in this pipeline to obtain genome-wide coverage values in 200bp windows and to obtain amplicon/region-specific coverage metrics. The results are then either rendered in MultiQC (genome-wide coverage) or are plotted using custom `R` scripts. -![MultiQC - Cutadapt filtered reads plot](images/mqc_cutadapt_plot.png) +![R - Samples amplicon coverage heatmap ](images/r_amplicon_heatmap.png) -### SPAdes +![R - Sample genome-wide coverage plot](images/r_genome_coverage.png) + +

*R - Sample per-amplicon coverage plot*
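+The two coverage views above correspond to two mosdepth runs, roughly as follows (a sketch; file names and the amplicon BED file are illustrative):
+
+```bash
+# Genome-wide coverage in 200 bp windows (illustrative names).
+mosdepth --by 200 --fast-mode SAMPLE1.genome SAMPLE1.primertrimmed.rg.sorted.bam
+# Per-amplicon coverage over the primer scheme's amplicon intervals.
+mosdepth --by amplicon.bed --use-median SAMPLE1.amplicon SAMPLE1.primertrimmed.rg.sorted.bam
+```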

+ +### Nanopore: BCFTools
Output files -- `assembly/spades/<spades_mode>/` - - `*.scaffolds.fa.gz`: SPAdes scaffold assembly. - - `*.contigs.fa.gz`: SPAdes assembly contigs. - - `*.assembly.gfa.gz`: SPAdes assembly graph in [GFA](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) format. -- `assembly/spades/<spades_mode>/bandage/` - - `*.png`: Bandage visualisation for SPAdes assembly graph in PNG format. - - `*.svg`: Bandage visualisation for SPAdes assembly graph in SVG format. +- `<caller>/bcftools_stats/` + - `*.bcftools_stats.txt`: Statistics and counts obtained from VCF file. -**NB:** The value of `<spades_mode>` in the output directory name above is determined by the `--spades_mode` parameter (Default: 'rnaviral'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
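+The statistics file listed above comes from a single BCFtools call (a sketch; file names are illustrative):
+
+```bash
+# Summarise counts and quality metrics for the variants that passed filtering (illustrative names).
+bcftools stats SAMPLE1.pass.vcf.gz > SAMPLE1.bcftools_stats.txt
+```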
-[SPAdes](http://cab.spbu.ru/software/spades/) is an assembly toolkit containing various assembly pipelines. Generically speaking, SPAdes is one of the most popular de Bruijn graph-based assembly algorithms used for bacterial/viral genome reconstruction. +[BCFtools](http://samtools.github.io/bcftools/bcftools.html) is a set of utilities that manipulate variant calls in [VCF](https://vcftools.github.io/specs.html) and its binary counterpart BCF format. It can also be used to generate statistics and counts from VCF files, as is done here. -[Bandage](https://rrwick.github.io/Bandage/) is a program for visualising _de novo_ assembly graphs. By displaying connections which are not present in the contigs file, Bandage opens up new possibilities for analysing _de novo_ assemblies. +![MultiQC - BCFTools variant counts](images/mqc_bcftools_stats_plot.png) -### Unicycler +### Nanopore: SnpEff and SnpSift
Output files -- `assembly/unicycler/` - - `*.scaffolds.fa.gz`: Unicycler scaffold assembly. - - `*.assembly.gfa.gz`: Unicycler assembly graph in GFA format. -- `assembly/unicycler/bandage/` - - `*.png`: Bandage visualisation for Unicycler assembly graph in PNG format. - - `*.svg`: Bandage visualisation for Unicycler assembly graph in SVG format. +- `<caller>/snpeff/` + - `*.snpeff.csv`: Variant annotation csv file. + - `*.snpeff.genes.txt`: Gene table for annotated variants. + - `*.snpeff.summary.html`: Summary html file for variants. + - `*.snpeff.vcf.gz`: VCF file with variant annotations. + - `*.snpeff.vcf.gz.tbi`: Index for VCF file with variant annotations. + - `*.snpsift.txt`: SnpSift summary table. +- `<caller>/snpeff/bcftools_stats/` + - `*.snpeff.bcftools_stats.txt`: Statistics and counts obtained from SnpEff VCF file. + +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
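+A minimal sketch of this annotation step; the SnpEff database name and the extracted field list are assumptions for the example, and file names are illustrative:
+
+```bash
+# Annotate variants against a SARS-CoV-2 SnpEff database, then tabulate selected fields (illustrative names).
+snpEff MN908947.3 SAMPLE1.pass.vcf > SAMPLE1.snpeff.vcf
+SnpSift extractFields SAMPLE1.snpeff.vcf CHROM POS REF ALT "ANN[*].GENE" "ANN[*].EFFECT" > SAMPLE1.snpsift.txt
+```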
-[Unicycler](https://github.com/rrwick/Unicycler) is an assembly pipeline for bacterial genomes. It can assemble Illumina-only read sets where it functions as a SPAdes-optimiser. +[SnpEff](http://snpeff.sourceforge.net/SnpEff.html) is a genetic variant annotation and functional effect prediction toolbox. It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). -### minia +[SnpSift](http://snpeff.sourceforge.net/SnpSift.html) annotates genomic variants using databases, filters, and manipulates genomic annotated variants. After annotation with SnpEff, you can use SnpSift to help filter large genomic datasets in order to find the most significant variants. + +![MultiQC - SnpEff annotation counts](images/mqc_snpeff_plot.png) + +### Nanopore: QUAST
Output files -- `assembly/minia/` - - `*.contigs.fa`: Minia scaffold assembly. - - `*.unitigs.fa`: Minia unitigs fasta file. - - `*.h5`: Minia h5 output file. +- `<caller>/quast/` + - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. + +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
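+For reference, the report is produced by a single QUAST run over all consensus sequences (a sketch; the reference and annotation file names are illustrative):
+
+```bash
+# Evaluate all consensus sequences against the reference in one report (illustrative names).
+quast.py -r MN908947.3.fasta --features MN908947.3.gff -o quast *.consensus.fasta
+```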
-[Minia](https://github.com/GATB/minia) is a short-read assembler based on a de Bruijn graph, capable of assembling a human genome on a desktop computer in a day. The output of Minia is a set of contigs. Minia produces results of similar contiguity and accuracy to other de Bruijn assemblers. +[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the consensus sequence across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. -### BLAST +### Nanopore: Pangolin
Output files -- `assembly/<assembler>/blastn/` - - `*.blastn.txt`: BLAST results against the target virus. - - `*.filter.blastn.txt`: Filtered BLAST results. +- `<caller>/pangolin/` + - `*.pangolin.csv`: Lineage analysis results from Pangolin. -**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
-[blastn](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch) is used to align the assembled contigs against the virus reference genome. +Phylogenetic Assignment of Named Global Outbreak LINeages ([Pangolin](https://github.com/cov-lineages/pangolin)) has been used extensively during the COVID-19 pandemic to assign lineages to sequenced SARS-CoV-2 genomes. A [web application](https://pangolin.cog-uk.io/) also exists that allows users to upload genome sequences via a web browser to assign lineages to genome sequences of SARS-CoV-2, view descriptive characteristics of the assigned lineage(s), view the placement of the lineage in a phylogeny of global samples, and view the temporal and geographic distribution of the assigned lineage(s). -### ABACAS +### Nanopore: Nextclade
Output files -- `assembly/<assembler>/abacas/` - - `*.abacas.bin`: Bin file that contains contigs that are not used in ordering. - - `*.abacas.crunch`: Comparison file. - - `*.abacas.fasta`: Ordered and orientated sequence file. - - `*.abacas.gaps`: Gap information. - - `*.abacas.gaps.tab`: Gap information in tab-delimited format. - - `*.abacas.MULTIFASTA.fa`: A list of ordered and orientated contigs in a multi-fasta format. - - `*.abacas.tab`: Feature file - - `*.unused_contigs.out`: Information on contigs that have a mapping information but could not be used in the ordering. -- `assembly/<assembler>/abacas/nucmer/`: Folder containing the files generated by the NUCmer algorithm used by ABACAS. +- `<caller>/nextclade/` + - `*.csv`: Analysis results from Nextclade containing genome clade assignment, mutation calling and sequence quality checks. -**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
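+A minimal sketch using the Nextclade command-line interface (v2-style commands, shown here as an assumption; file names are illustrative): first fetch the SARS-CoV-2 dataset, then assign clades and run QC on a consensus sequence.
+
+```bash
+# Download the SARS-CoV-2 dataset, then analyse one consensus sequence (illustrative names).
+nextclade dataset get --name sars-cov-2 --output-dir nextclade_db
+nextclade run --input-dataset nextclade_db --output-csv SAMPLE1.csv SAMPLE1.consensus.fasta
+```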
-[ABACAS](https://www.sanger.ac.uk/science/tools/pagit) was developed to rapidly contiguate (align, order, orientate), visualize and design primers to close gaps on shotgun assembled contigs based on a reference sequence. +[Nextclade](https://github.com/nextstrain/nextclade) performs viral genome clade assignment, mutation calling and sequence quality checks for the consensus sequences generated in this pipeline. Similar to Pangolin, it has been used extensively during the COVID-19 pandemic. A [web application](https://clades.nextstrain.org/) also exists that allows users to upload genome sequences via a web browser. -### PlasmidID +### Nanopore: ASCIIGenome
Output files -- `assembly/<assembler>/plasmidid/<sample>/` - - `*_final_results.html`: Summary file with reference coverage stats and contigs for visualization. - - `*_final_results.tab`: Summary file with reference coverage stats and contigs. - - `images/_.png`: PNG file with the visualization of the alignment between the viral assembly and the reference viral genome. - - `logs/`: Log files. +- `<caller>/asciigenome/<sample>/` + - `*.pdf`: Individual variant screenshots with annotation tracks in PDF format. -**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). +**NB:** The value of `<caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
-[PlasmidID](https://github.com/BU-ISCIII/plasmidID) was used to graphically represent the alignment of the reference genome relative to a given assembly. This helps to visualize the coverage of the reference genome in the assembly. To find more information about the output files refer to the [documentation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track). +As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs.io/en/latest/) is a command-line genome browser that can be run from a terminal window and is solely based on ASCII characters. The closest program to ASCIIGenome is probably [samtools tview](http://www.htslib.org/doc/samtools-tview.html) but ASCIIGenome offers much more flexibility, similar to popular GUI viewers like the [IGV](https://software.broadinstitute.org/software/igv/) browser. We are using the batch processing mode of ASCIIGenome in this pipeline to generate individual screenshots for all of the variant sites reported for each sample in the VCF files. This makes it possible to quickly QC the variants called by the pipeline without the tedium of loading all of the relevant tracks into a conventional genome browser. Where possible, the BAM read alignments, VCF variant file, primer BED file and GFF annotation track will be represented in the screenshot for contextual purposes. The screenshot below shows a SNP called relative to the MN908947.3 SARS-CoV-2 reference genome that overlaps the ORF7a protein and the nCoV-2019_91_LEFT primer from the ARTIC V3 protocol. -### Assembly QUAST +

ASCIIGenome screenshot

+ +### Nanopore: Variants long table
Output files -- `assembly/<assembler>/quast/` - - `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. +- `<variant_caller>/` + - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis. -**NB:** The value of `<assembler>` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades'). +**NB:** The value of `<variant_caller>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
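Because `variants_long_table.csv` is plain CSV, it is easy to filter downstream. A minimal sketch, keyed to the column names shown in the format example further below (the 0.75 allele-frequency cut-off is illustrative only):

```python
# Yield PASS variants above an allele-frequency threshold from the
# long-format variants table (columns as in the example rows below).
import csv

def passing_variants(path, min_af=0.75):
    with open(path, newline="") as handle:
        for row in csv.DictReader(handle):
            if row["FILTER"] == "PASS" and float(row["AF"]) >= min_af:
                yield (row["SAMPLE"], row["POS"], row["REF"], row["ALT"], row["LINEAGE"])

# Example: list(passing_variants("variants_long_table.csv"))
```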
-[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the _de novo_ assemblies across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. +Create variants long format table collating per-sample information for individual variants ([`BCFTools`](http://samtools.github.io/bcftools/bcftools.html)), functional effect prediction ([`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) and lineage analysis ([`Pangolin`](https://github.com/cov-lineages/pangolin)). -![MultiQC - QUAST contig counts](images/mqc_quast_plot.png) +The more pertinent variant information is summarised in this table to make it easier for researchers to assess the impact of variants found amongst the sequenced sample(s). An example of the fields included in the table is shown below: -## Illumina: Workflow reporting and genomes +```bash +SAMPLE,CHROM,POS,REF,ALT,FILTER,DP,REF_DP,ALT_DP,AF,GENE,EFFECT,HGVS_C,HGVS_P,HGVS_P_1LETTER,CALLER,LINEAGE +SAMPLE1_PE,MN908947.3,241,C,T,PASS,489,4,483,0.99,orf1ab,upstream_gene_variant,c.-25C>T,.,.,ivar,B.1 +SAMPLE1_PE,MN908947.3,1875,C,T,PASS,92,62,29,0.32,orf1ab,missense_variant,c.1610C>T,p.Ala537Val,p.A537V,ivar,B.1 +SAMPLE1_PE,MN908947.3,3037,C,T,PASS,213,0,213,1.0,orf1ab,synonymous_variant,c.2772C>T,p.Phe924Phe,p.F924F,ivar,B.1 +SAMPLE1_PE,MN908947.3,11719,G,A,PASS,195,9,186,0.95,orf1ab,synonymous_variant,c.11454G>A,p.Gln3818Gln,p.Q3818Q,ivar,B.1 +``` -### MultiQC +## Nanopore: Workflow reporting + +### Nanopore: MultiQC
Output files -- `multiqc/` +- `multiqc/<variant_caller>/` - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `summary_variants_metrics_mqc.csv`: file containing a selection of read alignment and variant calling metrics. The same metrics will also be added to the top of the MultiQC report. - - `summary_assembly_metrics_mqc.csv`: file containing a selection of read alignment and _de novo_ assembly related metrics. The same metrics will also be added to the top of the MultiQC report. + - `summary_variants_metrics_mqc.csv`: file containing a selection of read alignment and variant calling metrics. The same metrics will also be added to the top of the MultiQC report.
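The custom metrics table above can also be inspected without opening the HTML report. A minimal sketch where only the path is an assumption; column names are discovered from the header at run time rather than hard-coded:

```python
# Load the custom metrics CSV shipped alongside the MultiQC report.
import csv

def load_metrics(path="multiqc_data/summary_variants_metrics_mqc.csv"):
    with open(path, newline="") as handle:
        reader = csv.DictReader(handle)
        print("Columns:", reader.fieldnames)
        return list(reader)
```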
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) -Results generated by MultiQC collate pipeline QC from FastQC, fastp, Cutadapt, Bowtie 2, Kraken 2, samtools, picard CollectMultipleMetrics, BCFTools, SnpEff and QUAST. +Results generated by MultiQC collate pipeline QC from pycoQC, samtools, mosdepth, BCFTools, SnpEff and QUAST. -The default [`multiqc config file`](https://github.com/nf-core/viralrecon/blob/master/assets/multiqc_config_illumina.yaml) has been written in a way in which to structure these QC metrics to make them more interpretable in the final report. +The default [`multiqc config file`](https://github.com/nf-core/viralrecon/blob/master/assets/multiqc_config_nanopore.yaml) has been written in a way in which to structure these QC metrics to make them more interpretable in the final report. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. @@ -918,4 +982,4 @@ A number of genome-specific files are generated by the pipeline because they are
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. \ No newline at end of file diff --git a/bu_isciii/assets/reports/md/wgmlst_chewbbaca.md b/bu_isciii/assets/reports/md/wgmlst_chewbbaca.md new file mode 100644 index 00000000..45e68e2d --- /dev/null +++ b/bu_isciii/assets/reports/md/wgmlst_chewbbaca.md @@ -0,0 +1,65 @@ +# Output description for cgMLST/wgMLST with ChewBBACA + +## Pipeline description + +The MLST service performs Multi-Locus Sequence Typing using the _de novo_ assembled genomes of the samples. It uses [ChewBBACA](https://chewbbaca.readthedocs.io/en/latest/index.html) to generate the schemas (if necessary) and perform the allele calling, and [GrapeTree](https://enterobase.readthedocs.io/en/latest/grapetree/grapetree-about.html) to generate the minimum spanning tree. + +### Pipeline overview + +- ChewBBACA - v3.3.3 - Schema generation and allele calling +- GrapeTree - v2.2 - Minimum spanning tree + +> [!WARNING] +> Needs the _de novo_ assembly of the samples to be performed. + +## Output directory + +- `02-chewbbaca`: Results from ChewBBACA analysis: + - `prep_schema`: Schema files prepared for chewbbaca. + - `analyze_schema/schema_report.html`: HTML report with the evaluation of the schema used for the analysis. + - `allele_calling`: + - `cds_coordinates.tsv`: Contains the coordinates (genome unique identifier, contig identifier, start position, stop position, protein identifier attributed by chewBBACA, and coding strand (chewBBACA<=3.2.0 assigns 1 to the forward strand and 0 to the reverse strand and chewBBACA>=3.3.0 assigns 1 and -1 to the forward and reverse strands, respectively)) of the CDSs identified in each genome. + - `loci_summary_stats.tsv`: Contains the classification type counts (EXC, INF, PLOT3, PLOT5, LOTSC, NIPH, NIPHEM, ALM, ASM, PAMA, LNF) and the total number of classified CDSs (non-LNF) per locus. + - `paralogous_counts.tsv`: Contains the list of paralogous loci and the number of times those loci matched a CDS that was also similar to other loci in the schema. + - `results_alleles.tsv`: Contains the allelic profiles determined for the input samples. The first column has the identifiers of the genome assemblies for which the allele call was performed. The remaining columns contain the allele call data for loci present in the schema, with the column headers being the locus identifiers. The INF- prefix in the allelic number indicates that such allele was newly inferred in that genome, and the number following the prefix is the ID attributed to such allele. For the PLOT classification, in the allelic profile output, a locus can be classified as PLOT5 or PLOT3 depending on whether the CDS in the genome under analysis matching the schema locus is located in the 5' end or 3' end (respectively) of the contig. All other annotations are identical to what was described above.
+ - `results_statistics.tsv`: Contains the classification type counts (EXC, INF, PLOT3, PLOT5, LOTSC, NIPH, NIPHEM, ALM, ASM, PAMA, LNF), the total number of invalid CDSs, the total number of classified CDSs (non-LNF) and the total number of predicted CDSs per genome. The column headers stand for: + - EXC - EXaCt matches (100% DNA identity) with previously identified alleles. + - INF - INFerred new alleles that had no exact match in the schema but are highly similar to loci in the schema. The INF- prefix in the allele identifier indicates that such allele was newly inferred in that genome, and the number following the prefix is the allele identifier attributed to such allele. Inferred alleles are added to the FASTA file of the locus they share high similarity with. + - LNF - Locus Not Found. No alleles were found for the number of loci in the schema shown. This means that, for those loci, there were no BLAST hits or they were not within the BSR threshold for allele assignment. + - PLNF - Probable Locus Not Found. Attributed when a locus is not found during execution modes 1, 2 and 3. Those modes do not perform the complete analysis, that is only performed in mode 4 (default), and the distinct classification indicates that a more thorough analysis might have found a match for the loci that were not found. + - PLOT3/PLOT5 - Possible Locus On the Tip of the query genome contigs (see image below). A locus is classified as PLOT when the CDS of the query genome has a BLAST hit with a known larger allele that covers the CDS sequence entirely and the unaligned regions of the larger allele exceed one of the query genome contigs ends (a locus can be classified as PLOT5 or PLOT3 depending on whether the CDS in the genome under analysis matching the schema locus is located in the 5’ end or 3’ end (respectively) of the contig). This could be an artifact caused by genome fragmentation resulting in a shorter CDS prediction by Prodigal. To avoid locus misclassification, loci in such situations are classified as PLOT. + - LOTSC - A locus is classified as LOTSC when the contig of the query genome is smaller than the matched allele. + - NIPH - Non-Informative Paralogous Hit (see image below). When ≥2 CDSs in the query genome match one locus in the schema with a BSR > 0.6, that locus is classified as NIPH. This suggests that such locus can have paralogous (or orthologous) loci in the query genome and should be removed from the analysis due to the potential uncertainty in allele assignment (for example, due to the presence of multiple copies of the same mobile genetic element (MGE) or as a consequence of gene duplication followed by pseudogenization). A high number of NIPH may also indicate a poorly assembled genome due to a high number of smaller contigs which result in partial CDS predictions. These partial CDSs may contain conserved domains that match multiple loci. + - NIPHEM - similar to the NIPH classification, but specifically referring to exact matches. Whenever several CDSs from the same genome match a single or multiple alleles of the same locus with 100% DNA similarity during the first DNA sequence comparison, the NIPHEM tag is attributed. + - PAMA - PAralogous MAtch. Attributed to CDSs that are highly similar to more than one locus. This type of classification allows the identification of groups of similar loci in the schema that are classified as paralogous loci and listed in the paralogous_counts.tsv and paralogous_loci.tsv files. 
+ - ALM - Alleles 20% Larger than the length Mode of the distribution of the matched loci (CDS length > (locus length mode + locus length mode * 0.2)) (see image below). This determination is based on the currently identified set of alleles for a given locus. It is important to remember that, although infrequently, the mode may change as more alleles for a given locus are called and added to a schema. + - ASM - similar to ALM but for Alleles 20% Smaller than the length Mode distribution of the matched loci (CDS length < (locus length mode - locus length mode * 0.2)). As with ALMs, it is important to remember that, although infrequently, the mode may change as more alleles for a given locus are called and added to a schema. +![PLOT](./images/PLOT.png) +![NIPH-NIPHEM](./images/NIPH-NIPHEM.png) +![ALM-ASM](./images/ALM-ASM.png) + - `invalid_cds.txt`: Contains the list of alleles predicted by Prodigal that were excluded based on the minimum sequence size value and presence of ambiguous bases. + - `logging_info.txt`: Contains summary information about the allele calling process. + - `paralogous_loci.tsv`: Contains the sets of paralogous loci identified per genome (genome identifier, identifiers of the paralogous loci and the coordinates of the CDS that is similar to the group of paralogous loci). + - `results_contigsInfo.tsv`: Contains the loci coordinates in the genomes analyzed. The first column contains the identifier of the genome used in the allele calling and the other columns (with loci names in the headers) the locus coordinate information or the classification attributed by chewBBACA if it was not an exact match or inferred allele. + - `allele_calling_evaluation`: + - `allelecall_report.html`: An HTML report that contains the following components: + - A table with the total number of samples, total number of loci, total number of coding sequences (CDSs) extracted from the samples, total number of CDSs classified and totals per classification type. + - A tab panel with stacked bar charts for the classification type counts per sample and per locus. + - A tab panel with detailed sample and locus statistics. + - If a TSV file with annotations is provided to the --annotations parameter, the report will also include a table with the provided annotations. Otherwise, it will display a warning informing that no annotations were provided. + - A Heatmap chart representing the loci presence-absence matrix for all samples in the dataset. + - A Heatmap chart representing the allelic distance matrix for all samples in the dataset. + - A tree drawn with Phylocanvas.gl based on the Neighbor-Joining (NJ) tree computed by FastTree. + - `cgMLST_MSA.fasta`: contains the MSA of the core loci. For each locus in the core genome, the alleles found in all samples are translated and aligned with MAFFT. The alignment files are concatenated to generate the full alignment. + - `cgMLST_profiles.tsv`: contains the allelic profiles for the set of core loci. + - `distance_matrix_symmetric.tsv`: contains the symmetric distance matrix. The distances are computed by determining the number of allelic differences from the set of core loci (shared by 100% of the samples) between each pair of samples. + - `masked_profiles.tsv`: contains the masked allelic profiles (results from masking the allelic profiles in the results_alleles.tsv file generated by the AlleleCall module). + - `presence_absence.tsv`: Contains the loci presence-absence matrix. + - `report_bundle.js`: A JavaScript bundle file necessary to visualize the report.
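The allelic distance computation described for `distance_matrix_symmetric.tsv` can be reproduced from `cgMLST_profiles.tsv` in a few lines. A minimal sketch, assuming a tab-separated profile matrix (first column sample ID, remaining columns allele calls) and ignoring chewBBACA's masking of special classifications:

```python
# Pairwise allelic distances: count loci with differing allele calls.
import csv
from itertools import combinations

def allelic_distances(profiles_tsv):
    with open(profiles_tsv, newline="") as handle:
        rows = list(csv.reader(handle, delimiter="\t"))
    profiles = {row[0]: row[1:] for row in rows[1:]}  # rows[0] is the header
    return {
        (a, b): sum(x != y for x, y in zip(profiles[a], profiles[b]))
        for a, b in combinations(sorted(profiles), 2)
    }
```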
+ +> [!NOTE] +> For more information, see the [ChewBBACA documentation](https://chewbbaca.readthedocs.io/en/latest/index.html) in the `User Guide` section. + +- `03-grapetree`: + - `tree.svg`: Minimum Spanning Tree plot in SVG (Scalable Vector Graphics) format. Branches longer than 700 are shown shortened. + - `tree.nwk`: Newick tree from the Minimum Spanning Tree. diff --git a/bu_isciii/assets/reports/results/assembly.md b/bu_isciii/assets/reports/results/assembly.md new file mode 100644 index 00000000..2685350a --- /dev/null +++ b/bu_isciii/assets/reports/results/assembly.md @@ -0,0 +1,68 @@ +# Assembly + +Here, we describe the results from the Assembly pipeline for de novo genome assembly and annotation. + +* **assemblies**: a symbolic link to the raw reads associated with the resolution. +* **kmerfinder_summary.csv**: a .csv file containing the main results from kmerfinder. For each sample, you should check that both the best hit and the second hit reported by kmerfinder correspond to the species name indicated by the researcher when requesting the service. If the second hit is associated with a different species, check other metrics like %GC or %genome fraction in the MultiQC report, since this might reveal a contamination in that sample. + * *sample_name*: sample name. + * *07-kmerfinder_best_hit_# Assembly*: RefSeq assembly accession ID. + * *07-kmerfinder_best_hit_Accession Number*: accession number of entry ID in fasta file. + * *07-kmerfinder_best_hit_Depth*: is the number of matched kmers in the query sequence divided by the total number of Kmers in the template. For read files this estimates the sequencing depth. + * *07-kmerfinder_best_hit_Description*: additional descriptions available in fasta file, or in the case of organism databases the identifier lines of fasta files. + * *07-kmerfinder_best_hit_Expected*: is the expected score, i.e. the expected total number of matching Kmers between query and template (randomly selected). + * *07-kmerfinder_best_hit_Num*: is the sequence number of accession entry in the KmerFinder database. + * *07-kmerfinder_best_hit_Query_Coverage*: is the percentage of input query/reads Kmers that match the template. + * *07-kmerfinder_best_hit_Score*: is the total number of matching Kmers between the query and the template. + * *07-kmerfinder_best_hit_Species*: Species name. + * *07-kmerfinder_best_hit_TAXID*: NCBI's TaxID number of the hit. + * *07-kmerfinder_best_hit_TAXID Species*: NCBI's species TaxID number of the hit (sometimes bacterial strain or substrain TaxIDs can be given above). + * *07-kmerfinder_best_hit_Taxonomy*: complete taxonomy of the hit. + * *07-kmerfinder_best_hit_Template_Coverage*: is the template/genome coverage. + * *07-kmerfinder_best_hit_Template_length*: is the number of Kmers in the template. + * *07-kmerfinder_best_hit_p_value*: is the p-value corresponding to the obtained q_value. + * *07-kmerfinder_best_hit_q_value*: is the quantile in a standard Pearson Chi-square test, to test whether the current template is a significant hit. + * *07-kmerfinder_best_hit_tot_depth*: depth value based on all query kmers that can be found in the template sequence. + * *07-kmerfinder_best_hit_tot_query_Coverage*: is calculated based on the ratio of the score and the number of kmers in the query sequence, where the score includes kmers matched before.
+ * *07-kmerfinder_best_hit_tot_template_Coverage*: is calculated based on the ratio of the score and the number of unique kmers in the template sequence, where the score includes kmers matched before. + * *07-kmerfinder_second_hit_# Assembly*: RefSeq assembly accession ID. + * *07-kmerfinder_second_hit_Accession Number*: accession number of entry ID in fasta file. + * *07-kmerfinder_second_hit_Depth*: is the number of matched kmers in the query sequence divided by the total number of Kmers in the template. For read files this estimates the sequencing depth. + * *07-kmerfinder_second_hit_Description*: additional descriptions available in fasta file, or in the case of organism databases the identifier lines of fasta files. + * *07-kmerfinder_second_hit_Expected*: is the expected score, i.e. the expected total number of matching Kmers between query and template (randomly selected). + * *07-kmerfinder_second_hit_Num*: is the sequence number of accession entry in the KmerFinder database. + * *07-kmerfinder_second_hit_Query_Coverage*: is the percentage of input query Kmers that match the template. + * *07-kmerfinder_second_hit_Score*: is the total number of matching Kmers between the query and the template. + * *07-kmerfinder_second_hit_Species*: Species name. + * *07-kmerfinder_second_hit_TAXID*: NCBI's TaxID number of the hit. + * *07-kmerfinder_second_hit_TAXID Species*: NCBI's species TaxID number of the hit (sometimes bacterial strain or substrain TaxIDs can be given above). + * *07-kmerfinder_second_hit_Taxonomy*: complete taxonomy of the hit. + * *07-kmerfinder_second_hit_Template_Coverage*: is the template coverage. + * *07-kmerfinder_second_hit_Template_length*: is the number of Kmers in the template. + * *07-kmerfinder_second_hit_p_value*: is the p-value corresponding to the obtained q_value. + * *07-kmerfinder_second_hit_q_value*: is the quantile in a standard Pearson Chi-square test, to test whether the current template is a significant hit. + * *07-kmerfinder_second_hit_tot_depth*: depth value based on all query kmers that can be found in the template sequence. + * *07-kmerfinder_second_hit_tot_query_Coverage*: is calculated based on the ratio of the score and the number of kmers in the query sequence, where the score includes kmers matched before. + * *07-kmerfinder_second_hit_tot_template_Coverage*: is calculated based on the ratio of the score and the number of unique kmers in the template sequence, where the score includes kmers matched before. + * *Total_hits_07_kmerfinder*: number of total hits. +* **multiqc_report.html**: an interactive report containing the results from MultiQC (kmerfinder, QUAST, quality control, etc.). +* **quast_GCF_XXXXXXXXX.X_ASMXXXXXv2_report.html**: an interactive report obtained after the execution of QUAST, providing different metrics about the assembly QC against the reference. +* **quast_global_report.html**: an interactive report obtained after the execution of QUAST, providing different metrics about the global assembly QC. +* **summary_assembly_metrics_mqc.csv**: a custom table containing the most relevant assembly QC metrics. + * *Sample*: sample ID. + * *Input reads*: number of input reads for each sample. + * *Trimmed reads (fastp)*: number of trimmed reads. + * *Contigs*: number of contigs. + * *Largest contig*: length of the largest contig. + * *N50*: is the contig length such that using longer or equal length contigs produces half of the bases of the assembly.
Usually there is no value that produces exactly 50%, so the technical definition is the maximum length x such that using contigs of length at least x accounts for at least 50% of the total assembly length. + * *% Genome fraction*: the total number of aligned bases in the reference, divided by the genome size. + * *Best hit (Kmerfinder)*: best hit species name. + * *Best hit assembly ID (Kmerfinder)*: best hit RefSeq assembly accession ID. + * *Best hit query coverage (Kmerfinder)*: best hit query coverage. + * *Best hit depth (Kmerfinder)*: best hit depth. + * *Second hit (Kmerfinder)*: second hit species name. + * *Second hit assembly ID (Kmerfinder)*: second hit RefSeq assembly accession ID. + * *Second hit query coverage (Kmerfinder)*: second hit query coverage. + * *Second hit depth (Kmerfinder)*: second hit depth. + +> [!WARNING] +> Software versions used in this analysis can be obtained from the `MultiQC` report. \ No newline at end of file diff --git a/bu_isciii/assets/reports/results/exomeeb.md b/bu_isciii/assets/reports/results/exomeeb.md new file mode 100644 index 00000000..5e9dab93 --- /dev/null +++ b/bu_isciii/assets/reports/results/exomeeb.md @@ -0,0 +1,154 @@ +# ExomeEB + +This markdown briefly describes the files found in the `RESULTS/` folder for the ExomeEB service. + +## exomiser.html + +This file includes information regarding variant annotation, effect prediction and inheritance typing: + +![HTML Description 1](images/exomiser-html-description-1.png) + +![HTML Description 2](images/exomiser-html-description-2.png) + +## picard_hsmetrics.csv + +This table includes mapping quality metrics from sarek's pipeline results, with the following columns: +- SAMPLE +- MEAN TARGET COVERAGE: The mean coverage of a target region. +- PCT USABLE BASES ON TARGET: The number of aligned, de-duped, on-bait bases out of the PF bases available (those that pass the vendor's filter). +- FOLD ENRICHMENT: The fold by which the baited region has been amplified above genomic background. +- PCT TARGET BASES 10X: The fraction of all target bases achieving 10X or greater coverage. +- PCT TARGET BASES 20X: The fraction of all target bases achieving 20X or greater coverage. +- PCT TARGET BASES 30X: The fraction of all target bases achieving 30X or greater coverage. +- PCT TARGET BASES 40X: The fraction of all target bases achieving 40X or greater coverage. +- PCT TARGET BASES 50X: The fraction of all target bases achieving 50X or greater coverage. + +You may find further documentation for the metrics in this table [here](http://broadinstitute.github.io/picard/picard-metric-definitions.html#HsMetrics). + +## variants_annot_highmoderate.tab + +This table includes all the variants from VEP and Exomiser annotation with a predicted high or moderate effect.
+ +ID: Variant identifier +Chrom: Chromosome number +Pos: Reference position according to hg19 +Ref: Nucleotides found in the reference genome +Alt: Nucleotides found in the individual's genome +Filter: Indicates whether it has passed the filter or not +Sample_GT: Genotype of sample +Sample_DP: Coverage depth of sample +Sample_AD: Allelic Depth of Genotype of sample +Sample_GQ: Quality of Genotype of sample +Gene: Gene affected by the variant +Location: Location of the variant in the genome +Allele: Alternative allele of the variant +Feature: Genomic element affected by the variant +Feature_type: Type of genomic element affected by the variant +Consequence: Functional consequence of the variant +cDNA_position: Position of the variant in the cDNA sequence +CDS_position: Position of the variant in the gene's coding sequence +Protein_position: Position of the variant in the protein encoded by the gene +Amino_acids: Amino acids affected by the variant +Codons: Codons affected by the variant +Existing_variation: Existing genetic variation at this position +Impact: Fundamental impact of the variant +Distance: Distance of the variant to the nearest exon +Strand: DNA strand on which the variant is located +Flags: Indicators of additional variant features +Variant_class: Variant class +Symbol: Gene symbol affected by the variant +Symbol_source: Source of the variant symbol +HGNC_ID: Unique identifier in the HGNC database +Biotype: Gene biotype +Canonical: Indicates if it is the canonical transcript of the gene +Mane_sel: Indicates if the transcript is identified by Mane +Mane_plus: Indicates if the transcript is identified by Mane+ +TSL: Indicates if the transcript is identified by TSL +Appris: Indicates if the transcript is identified by Appris +ENSP: Unique identifier of the transcript in Ensembl +Swissprot: Unique identifier of the transcript in Swissprot +TREMBL: Unique identifier of the transcript in TREMBL +Uniparc: Unique identifier of the protein in the Uniparc database +Uniparc_Isoform: Unique identifier of the protein isoform in Uniparc +Gene_pheno: Indicates if the gene is associated with a phenotype +SIFT: SIFT pathogenicity of the variant +PolyPhen: PolyPhen pathogenicity of the variant +Exon: Indicates if the variant is located in an exon +Intron: Indicates if the variant is located in an intron +Domains: Protein domains affected by the variant +miRNA: miRNA binding to the region affected by the variant +HGVSc: HGVS notation for the variant in cDNA sequence +HGVSp: HGVS notation for the variant in protein sequence +HGVS_offset: Offset of the variant in cDNA or protein sequence +AF: Allelic frequency of the variant in the general population +AFR_AF: Allelic frequency of the variant in the African population +AMR_AF: Allelic frequency of the variant in the American population +EAS_AF: Allelic frequency of the variant in the East Asian population +EUR_AF: Allelic frequency of the variant in the European population +SAS_AF: Allelic frequency of the variant in the South Asian population +AA_AF: Allelic frequency of the variant in the African American population +EA_AF: Allelic frequency of the variant in the European American population +gnomAD_AF: Allelic frequency of the variant in the gnomAD database +gnomAD_AFR_AF: Allelic frequency of the variant in the African population in gnomAD +gnomAD_AMR_AF: Allele frequency of the variant in the American population in gnomAD +gnomAD_ASJ_AF: Allele frequency of the variant in the Ashkenazi Jewish population in gnomAD +gnomAD_EAS_AF: Allele
frequency of the variant in the East Asian population in gnomAD +gnomAD_FIN_AF: Allele frequency of the variant in the Finnish population in gnomAD +gnomAD_NFE_AF: Allele frequency of the variant in the non-Finnish European population in gnomAD +gnomAD_OTH_AF: Allele frequency of the variant in other populations in gnomAD +gnomAD_SAS_AF: Allele frequency of the variant in the South Asian population in gnomAD +MAX_AF: Maximum allele frequency of the variant in all populations +MAX_AF_POPS: Population with the maximum allele frequency of the variant +CLIN_SIG: Clinical significance of the variant +SOMATIC: Indicates if the variant is somatic or germline +PHENO: Phenotype associated with the variant +HGNC_ID: Unique identifier of the gene affected by the variant +PUBMED: Scientific articles describing the variant +MOTIF_NAME: Transcription factor binding motif affected by the variant +MOTIF_POS: Position of the transcription factor binding motif affected +HIGH_INF_POS: High information position in the transcription factor binding motif +MOTIF_SCORE_CHANGE: Change in the transcription factor binding motif score caused by the variant +TRANSCRIPTION_FACTORS: Transcription factors that bind to the affected transcription factor binding motif +HGVSp_snpEff: HGVS notation for the variant in the protein sequence in snpEff +SIFT_score: SIFT score of the pathogenicity of the variant +SIFT_pred: SIFT prediction of the pathogenicity of the variant +Polyphen2_HDIV_score: PolyPhen2_HDIV score of the pathogenicity of the variant +Polyphen2_HDIV_pred: PolyPhen2_HDIV prediction of the pathogenicity of the variant +Polyphen2_HVAR_score: PolyPhen2_HVAR score of the pathogenicity of the variant +Polyphen2_HVAR_pred: PolyPhen2_HVAR prediction of the pathogenicity of the variant +MutationTaster_score: MutationTaster score of the pathogenicity of the variant +MutationTaster_pred: MutationTaster prediction of the pathogenicity of the variant +MutationAssessor_score: MutationAssessor score of the pathogenicity of the variant +MutationAssessor_pred: MutationAssessor prediction of the pathogenicity of the variant +FATHMM_score: FATHMM score of the pathogenicity of the variant +FATHMM_pred: FATHMM prediction of the pathogenicity of the variant +HGVSp: HGVS notation for the variant in the protein sequence +HGVS_offset: Displacement of the variant in the cDNA or protein sequence +PROVEAN_score: PROVEAN score of the pathogenicity of the variant +PROVEAN_pred: PROVEAN prediction of the pathogenicity of the variant +VEST4_score: VEST4 score of the pathogenicity of the variant +MetaSVM_score: MetaSVM score of the pathogenicity of the variant +MetaSVM_pred: MetaSVM prediction of the pathogenicity of the variant +MetaLR_score: MetaLR score of the pathogenicity of the variant +MetaLR_pred: MetaLR prediction of the pathogenicity of the variant +CADD_raw: Raw CADD score +CADD_phred: CADD Phred score +CADD_raw_hg19: Raw CADD score for the hg19 genome version +CADD_phred_hg19: CADD Phred score for the hg19 genome version +GERP++_NR: GERP++ neutral rate at the variant position +GERP++_RS: GERP++ rejected substitutions (conservation) score +phyloP100way_vertebrate: phyloP score for 100 vertebrate species +phastCons100way_vertebrate: phastCons score for 100 vertebrate species +clinvar_trait: Clinical trait or condition associated with the variant +clinvar_id: Unique identifier of the variant in ClinVar +clinvar_OMIM_id: Unique identifier of the variant in OMIM +OMIM_id: Unique identifier of the disease in OMIM +Function_description: Description of the
function of the gene affected by the variant +Disease_description: Description of the disease associated with the variant +HPO_id: Unique identifier of the phenotype in the HPO database +HPO_name: Name of the phenotype in the HPO database + +## multiqc_report.html + +Most of sarek's QC results are visualised in this report and further statistics are available in the report data directory. +Results generated by MultiQC collate pipeline QC from supported tools, e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see http://multiqc.info. diff --git a/bu_isciii/assets/reports/results/irma_output.md b/bu_isciii/assets/reports/results/irma_output.md new file mode 100644 index 00000000..ef4d79f3 --- /dev/null +++ b/bu_isciii/assets/reports/results/irma_output.md @@ -0,0 +1,27 @@ +# IRMA-output + +This markdown briefly describes the files found in the `RESULTS/` folder for IRMA services. As described [here]() + +## **`krona_results.html`** + +Includes the MultiQC HTML report from MAG; you can find a further description in [MAG](https://github.com/BU-ISCIII/buisciii-tools/blob/main/bu_isciii/assets/reports/md/mag.md). + +## Files in `fragment_name/` + +Depending on the virus types found in your samples, you will have one folder for each fragment found (e.g. A_H1 for Influenza A H1N1). In these folders you will have a multi-fasta file for each of the fragments found, which will include all the sequences for that specific fragment found by IRMA in the samples. Here's an example: +``` +A_H1/A_HA.txt +>Sample1_HA +XXXXXXXX-SEQUENCE_OF_FRAGMENT_HA_FOR_SAMPLE1-XXXXXXXX +>Sample2_HA +XXXXXXXX-SEQUENCE_OF_FRAGMENT_HA_FOR_SAMPLE2-XXXXXXXX +>Sample3_HA +XXXXXXXX-SEQUENCE_OF_FRAGMENT_HA_FOR_SAMPLE3-XXXXXXXX +``` + +## **`all_samples_completo.txt`** + +This file is a multi-fasta that includes all the assembly sequences generated by IRMA for all the fragments found in all the samples. The content of this file has the same structure as described above. + +In the case of Influenza services (FLU), you will find a file named `flu_type_summary.txt`, which will include a summary of the different types of influenza found in your samples. This is the most relevant information to be included in your report. + diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md index 1cfd12ca..ce968514 100644 --- a/bu_isciii/assets/reports/results/mag.md +++ b/bu_isciii/assets/reports/results/mag.md @@ -1,6 +1,26 @@ ## MAG + Here we describe the results from the MAG pipeline for multispecies metagenomic analysis. -* krona_results.html : Final HTML report with the top 5 species most present in all samples. +### MAG - TAXONOMIC ANALYSIS + +* `krona_results.html` : Final HTML report with the top 5 species most present in all samples. + +> [!WARNING] +> Software versions used in this analysis can be obtained from the `MultiQC` report. + +### MAG - COMPLETE ANALYSIS + +* `mag_all/krona/${sample_name}.${tool}.report.html`: A Krona interactive visualization report for each sample based on the Kraken2 (or other) taxonomic classification method. +* `mag_all/quast/${sample_name}.${tool}.report.html`: A Quast report for the assembly quality control of each sample assembled using MEGAHIT, SPAdes or other. +* `mag_all/multiqc_report.html`: A combined report generated by MultiQC summarizing various quality control results for all samples.
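Since the Krona HTML above is built from the underlying Kraken2 classification, the same "top species" summary can be pulled directly from a standard Kraken2 report. A minimal sketch (the report path is illustrative; the six-column tab-separated layout is the standard Kraken2 report format):

```python
# Top-N species from a Kraken2 report: tab-separated columns are
# percentage, clade reads, direct reads, rank code, taxid, indented name.
def top_species(report_path, n=5):
    hits = []
    with open(report_path) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            if len(fields) >= 6 and fields[3] == "S":  # species-rank rows only
                hits.append((int(fields[1]), fields[5].strip()))
    return sorted(hits, reverse=True)[:n]

# Example (hypothetical path): top_species("sample1.kraken2.report.txt")
```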
+ +## Taxprofiler + +Here we describe the results from the [nf-core/taxprofiler](https://nf-co.re/taxprofiler/1.1.8) pipeline for multispecies taxonomic classification and profiling of shotgun short- and long-read data. + +* `taxprofiler/multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline. +* `taxprofiler/krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT). -*Warning:* Software's versions used in this analysis can be obtained from the `MultiQC` report. +> [!WARNING] +> Software versions used in this analysis can be obtained from the `MultiQC` report. \ No newline at end of file diff --git a/bu_isciii/assets/reports/results/pikavirus.md b/bu_isciii/assets/reports/results/pikavirus.md index 59f8faab..a5565e91 100644 --- a/bu_isciii/assets/reports/results/pikavirus.md +++ b/bu_isciii/assets/reports/results/pikavirus.md @@ -1,4 +1,5 @@ ## PikaVirus + Here we describe the results from the PikaVirus pipeline for viral presence discovery. * filtered_all_samples_virus_table.xlsx: Results from PikaVirus. diff --git a/bu_isciii/assets/reports/results/plasmidid.md b/bu_isciii/assets/reports/results/plasmidid.md new file mode 100644 index 00000000..047dddc5 --- /dev/null +++ b/bu_isciii/assets/reports/results/plasmidid.md @@ -0,0 +1,27 @@ +# PlasmidID + +Here we describe the results from the PlasmidID pipeline for plasmid identification and visualization. + +> [!WARNING] +> Some of the files listed here may not be in your `RESULTS` folder. It will depend on the analysis you requested. + +## Summary report + +A summary report consolidating all samples in the analysis is created. + +- `NO_GROUP_final_results.html`: report with the same info as the table below that can be viewed using Chrome. +- `NO_GROUP_final_results.tab`: plasmid info for each sample. Header columns are described here: + - id: plasmid unique identifier for each entry. + - length: The length of the plasmid sequence. + - species description: A description of the species from which the sequence originates. + - fraction_covered: The fraction of the sequence that is covered by alignments. + - contig_name: The name of the contigs associated with the sequence. + - percentage: The percentage of the genome or sequence that is covered. + - images: Links or references to related images or visual data. + +## Circos images + +Circos is used for creating one image for each identified plasmid and a summary image with all the plasmids identified in one figure. A manual for image interpretation can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track) and a manual on how to select the correct plasmid can be found [here](https://github.com/BU-ISCIII/plasmidID/wiki/How-to-chose-the-right-plasmids). + +- `images/SAMPLE_NAME_PLASMID_individual.png`: Circos image for each individual plasmid +- `images/SAMPLE_NAME_summary_image.png`: summary image \ No newline at end of file diff --git a/bu_isciii/assets/reports/results/rnaseq_deg.md b/bu_isciii/assets/reports/results/rnaseq_deg.md new file mode 100644 index 00000000..8c39926f --- /dev/null +++ b/bu_isciii/assets/reports/results/rnaseq_deg.md @@ -0,0 +1,68 @@ +# mRNAseq (DEG): results + +Here we describe the results from the mRNAseq pipeline for transcriptomic analysis and differential gene expression. + +> [!WARNING] +> Some of the files listed here may not be in your `RESULTS` folder.
It will depend on the analysis you requested. + +## Alignment and quantification + + +> [!WARNING] +> Please note that the files in the RESULTS directory are still pending determination by our administrative team. We are currently discussing which generic/common files will be included in this section to ensure that you receive the most relevant and useful information for your analysis. + +## Differential expression analysis with DESeq2 + + +> [!WARNING] +> Please note that the files in the RESULTS directory are still pending determination by our administrative team. We are currently discussing which generic/common files will be included in this section to ensure that you receive the most relevant and useful information for your analysis. + +## Interpretation of Differential Expression Results + +For each comparison conducted, a separate folder has been created, following the naming convention outlined below: + + 1_Treatment1_Control1/ + 2_Treatment2_Control2/ + 3_Treatment3_Control3/ + 4_Treatment1_Treatment2_Treatment3_Control1_Control2_Control3/ + +Within each folder, you will find the results of the differential expression analysis. When interpreting the results, it's important to note that: + + A positive log2FC indicates that the mRNA is overexpressed in the Treatment group compared to the Control group. + Conversely, a negative log2FC suggests that the mRNA is downregulated in the Treatment group compared to the Control group. + +Additionally, the file `normalized_expression.xlsx` contains the normalized expression counts matrix, providing further insight into the expression levels of the mRNAs across the experimental conditions. + +> [!WARNING] +> Software versions used in this analysis can be obtained from the `MultiQC` report. + +### Description of output files +- `/Differential_expression/DESeq2/Differential_expression.csv`: This file contains the results of the differential expression analysis performed using DESeq2, including information on differentially expressed genes and associated statistical metrics such as fold change, p-values, and adjusted p-values. + +- `/Differential_expression/DESeq2/heatmapCount_top20_differentially_expressed.pdf`: This PDF file presents a heatmap visualization displaying the expression patterns of the top 20 differentially expressed genes, clustered by sample distance, as determined by the DESeq2 analysis. + +- `/Differential_expression/DESeq2/maPlot_all.pdf`: This PDF file illustrates MA plots depicting the log fold changes (M) versus the mean average (A) expression levels of all genes analyzed in the DESeq2 differential expression analysis. + +- `/Differential_expression/DESeq2/pvalues.pdf`: This PDF file provides graphical representations, such as histograms or scatter plots, illustrating the distribution and significance of p-values calculated during the DESeq2 analysis. + +- `/Quality_plots/DESeq2/boxplot.pdf`: This PDF file displays boxplots depicting the distribution of normalized count expression values across samples, allowing for the assessment of data variability and potential batch effects. + +- `/Quality_plots/DESeq2/cluster_dendrogram.pdf`: This PDF file presents a dendrogram visualization illustrating the hierarchical clustering of samples based on gene expression profiles, enabling the identification of sample similarities and differences.
+ +- `/Quality_plots/DESeq2/heatmapCount_all_genes.pdf`: This PDF file contains a heatmap visualization showing the expression patterns of all genes analyzed in the experiment, facilitating the identification of gene expression trends and patterns. + +- `/Quality_plots/DESeq2/heatmapCount_top20_highest_expression.pdf`: This PDF file presents a heatmap visualization highlighting the expression patterns of the top 20 genes with the highest expression levels across samples, aiding in the identification of highly expressed genes. + +- `/Quality_plots/DESeq2/heatmap_sample_to_sample.pdf`: This PDF file contains a heatmap visualization illustrating the pairwise sample-to-sample correlation matrix based on gene expression profiles, enabling the assessment of sample similarities and reproducibility. + +- `/Quality_plots/DESeq2/plotDispersions.pdf`: This PDF file displays dispersion plots showing the relationship between the mean expression levels and the dispersion estimates for each gene, allowing for the assessment of data variability and the adequacy of the statistical model. + +- `/Quality_plots/DESeq2/plotPCA.pdf`: This PDF file presents a PCA (Principal Component Analysis) plot visualizing the distribution of samples in a multidimensional space based on their gene expression profiles, allowing for the exploration of sample relationships and potential batch effects. + +- `/Quality_plots/DESeq2/plotSD.pdf`: The standard deviation of the transformed data, across samples, against the mean, using the shifted logarithm transformation, the regularized log transformation and the variance stabilizing transformation. This plot enables the assessment of data variability and the identification of potential outliers. + +- `99-stats/Quality_plots/`: This folder contains the same quality plots as described above, but they are generated considering all samples in the service without accounting for the experimental design specified in DESeq2. This allows for a general overview of the data in the service without incorporating the experimental design. diff --git a/bu_isciii/assets/reports/results/trios.md b/bu_isciii/assets/reports/results/trios.md new file mode 100755 index 00000000..2d966b7d --- /dev/null +++ b/bu_isciii/assets/reports/results/trios.md @@ -0,0 +1,168 @@ +# ExomeTrio & WGStrio + +This file briefly describes the files in the `RESULTS/` folder from the ExomeTrio and WGStrio services. + +## exomiser.html + +This file includes information regarding variant annotation, effect prediction and inheritance typing: + +![HTML Description 1](images/exomiser-html-description-1.png) + +![HTML Description 2](images/exomiser-html-description-2.png) + +## picard_hsmetrics.csv + +This table includes mapping quality metrics from sarek's pipeline results, with the following columns: +- SAMPLE +- MEAN TARGET COVERAGE: The mean coverage of a target region. +- PCT USABLE BASES ON TARGET: The number of aligned, de-duped, on-bait bases out of the PF bases available (those that pass the vendor's filter). +- FOLD ENRICHMENT: The fold by which the baited region has been amplified above genomic background. +- PCT TARGET BASES 10X: The fraction of all target bases achieving 10X or greater coverage. +- PCT TARGET BASES 20X: The fraction of all target bases achieving 20X or greater coverage. +- PCT TARGET BASES 30X: The fraction of all target bases achieving 30X or greater coverage. +- PCT TARGET BASES 40X: The fraction of all target bases achieving 40X or greater coverage.
+- PCT TARGET BASES 50X: The fraction of all target bases achieving 50X or greater coverage. + +You may find further documentation for the metrics in this table [here](http://broadinstitute.github.io/picard/picard-metric-definitions.html#HsMetrics). + +## variants_annot_filterAF_head.tab + +This table includes all the variants from VEP and Exomiser annotation with a minimum Allele Frequency of **0.001** + +ID: Variant identifier +Chrom: Chromosome number +Pos: Reference position according to hg19 +Ref: Nucleotides found in the reference genome +Alt: Nucleotides found in the individual's genome +Filter: Indicates whether it has passed the filter or not +Parent1_GT: Genotype of Parent 1 +Parent1_DP: Coverage depth of Parent 1 +Parent1_GQ: Quality of Genotype of Parent 1 +Parent2_GT: Genotype of Parent 2 +Parent2_DP: Coverage depth of Parent 2 +Parent2_GQ: Quality of Genotype of Parent 2 +Child_GT: Genotype of Child +Child_DP: Coverage depth of Child +Child_GQ: Quality of Genotype of Child +Gene: Gene affected by the variant +Location: Location of the variant in the genome +Allele: Alternative allele of the variant +Feature: Genomic element affected by the variant +Feature_type: Type of genomic element affected by the variant +Consequence: Functional consequence of the variant +cDNA_position: Position of the variant in the cDNA sequence +CDS_position: Position of the variant in the gene's coding sequence +Protein_position: Position of the variant in the protein encoded by the gene +Amino_acids: Amino acids affected by the variant +Codons: Codons affected by the variant +Existing_variation: Existing genetic variation at this position +Impact: Fundamental impact of the variant +Distance: Distance of the variant to the nearest exon +Strand: DNA strand on which the variant is located +Flags: Indicators of additional variant features +Variant_class: Variant class +Symbol: Gene symbol affected by the variant +Symbol_source: Source of the variant symbol +HGNC_ID: Unique identifier in the HGNC database +Biotype: Gene biotype +Canonical: Indicates if it is the canonical transcript of the gene +Mane_sel: Indicates if the transcript is identified by Mane +Mane_plus: Indicates if the transcript is identified by Mane+ +TSL: Indicates if the transcript is identified by TSL +Appris: Indicates if the transcript is identified by Appris +ENSP: Unique identifier of the transcript in Ensembl +Swissprot: Unique identifier of the transcript in Swissprot +TREMBL: Unique identifier of the transcript in TREMBL +Uniparc: Unique identifier of the protein in the Uniparc database +Uniparc_Isoform: Unique identifier of the protein isoform in Uniparc +Gene_pheno: Indicates if the gene is associated with a phenotype +SIFT: SIFT pathogenicity of the variant +PolyPhen: PolyPhen pathogenicity of the variant +Exon: Indicates if the variant is located in an exon +Intron: Indicates if the variant is located in an intron +Domains: Protein domains affected by the variant +miRNA: miRNA binding to the region affected by the variant +HGVSc: HGVS notation for the variant in cDNA sequence +HGVSp: HGVS notation for the variant in protein sequence +HGVS_offset: Offset of the variant in cDNA or protein sequence +AF: Allelic frequency of the variant in the general population +AFR_AF: Allelic frequency of the variant in the African population +AMR_AF: Allelic frequency of the variant in the American population +EAS_AF: Allelic frequency of the variant in the East Asian population +EUR_AF: Allelic frequency of the variant in the European 
population +SAS_AF: Allelic frequency of the variant in the South Asian population +AA_AF: Allelic frequency of the variant in the African American population +EA_AF: Allelic frequency of the variant in the European American population +gnomAD_AF: Allelic frequency of the variant in the gnomAD database +gnomAD_AFR_AF: Allelic frequency of the variant in the African population in gnomAD +gnomAD_AMR_AF: Allele frequency of the variant in the American population in gnomAD +gnomAD_ASJ_AF: Allele frequency of the variant in the Ashkenazi Jewish population in gnomAD +gnomAD_EAS_AF: Allele frequency of the variant in the East Asian population in gnomAD +gnomAD_FIN_AF: Allele frequency of the variant in the Finnish population in gnomAD +gnomAD_NFE_AF: Allele frequency of the variant in the non-Finnish European population in gnomAD +gnomAD_OTH_AF: Allele frequency of the variant in other populations in gnomAD +gnomAD_SAS_AF: Allele frequency of the variant in the South Asian population in gnomAD +MAX_AF: Maximum allele frequency of the variant in all populations +MAX_AF_POPS: Population with the maximum allele frequency of the variant +CLIN_SIG: Clinical significance of the variant +SOMATIC: Indicates if the variant is somatic or germline +PHENO: Phenotype associated with the variant +HGNC_ID: Unique identifier of the gene affected by the variant +PUBMED: Scientific articles describing the variant +MOTIF_NAME: Transcription factor binding motif affected by the variant +MOTIF_POS: Position of the transcription factor binding motif affected +HIGH_INF_POS: High information position in the transcription factor binding motif +MOTIF_SCORE_CHANGE: Change in the transcription factor binding motif score caused by the variant +TRANSCRIPTION_FACTORS: Transcription factors that bind to the affected transcription factor binding motif +HGVSp_snpEff: HGVS notation for the variant in the protein sequence in snpEff +SIFT_score: SIFT score of the pathogenicity of the variant +SIFT_pred: SIFT prediction of the pathogenicity of the variant +Polyphen2_HDIV_score: PolyPhen2_HDIV score of the pathogenicity of the variant +Polyphen2_HDIV_pred: PolyPhen2_HDIV prediction of the pathogenicity of the variant +Polyphen2_HVAR_score: PolyPhen2_HVAR score of the pathogenicity of the variant +Polyphen2_HVAR_pred: PolyPhen2_HVAR prediction of the pathogenicity of the variant +MutationTaster_score: MutationTaster score of the pathogenicity of the variant +MutationTaster_pred: MutationTaster prediction of the pathogenicity of the variant +MutationAssessor_score: MutationAssessor score of the pathogenicity of the variant +MutationAssessor_pred: MutationAssessor prediction of the pathogenicity of the variant +FATHMM_score: FATHMM score of the pathogenicity of the variant +FATHMM_pred: FATHMM prediction of the pathogenicity of the variant +HGVSp: HGVS notation for the variant in the protein sequence +HGVS_offset: Displacement of the variant in the cDNA or protein sequence +PROVEAN_score: PROVEAN score of the pathogenicity of the variant +PROVEAN_pred: PROVEAN prediction of the pathogenicity of the variant +VEST4_score: VEST4 score of the pathogenicity of the variant +MetaSVM_score: MetaSVM score of the pathogenicity of the variant +MetaSVM_pred: MetaSVM prediction of the pathogenicity of the variant +MetaLR_score: MetaLR score of the pathogenicity of the variant +MetaLR_pred: MetaLR prediction of the pathogenicity of the variant +CADD_raw: Raw CADD score +CADD_phred: CADD Phred score +CADD_raw_hg19: Raw CADD score for the hg19 genome
version +CADD_phred_hg19: CADD Phred score for the hg19 genome version +GERP++_NR: GERP++ neutral rate at the variant position +GERP++_RS: GERP++ rejected substitutions (conservation) score +phyloP100way_vertebrate: phyloP score for 100 vertebrate species +phastCons100way_vertebrate: phastCons score for 100 vertebrate species +clinvar_trait: Clinical trait or condition associated with the variant +clinvar_id: Unique identifier of the variant in ClinVar +clinvar_OMIM_id: Unique identifier of the variant in OMIM +OMIM_id: Unique identifier of the disease in OMIM +Function_description: Description of the function of the gene affected by the variant +Disease_description: Description of the disease associated with the variant +HPO_id: Unique identifier of the phenotype in the HPO database +HPO_name: Name of the phenotype in the HPO database + +## multiqc_report.html + +Most of sarek's QC results are visualised in this report and further statistics are available in the report data directory. +Results generated by MultiQC collate pipeline QC from supported tools, e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see http://multiqc.info. + +## Tables in vep_annot folder + +These tables are obtained by splitting the variants depending on the mode of inheritance assigned by exomiser. The columns remain the same as the ones found in variants_annot_filterAF.tab. +- vep_annot_AD_final.txt: Autosomal Dominant +- vep_annot_AR_final.txt: Autosomal Recessive +- vep_annot_MT_final.txt: Mitochondrial +- vep_annot_XD_final.txt: X-linked Dominant +- vep_annot_XR_final.txt: X-linked Recessive diff --git a/bu_isciii/assets/reports/results/viralrecon.md b/bu_isciii/assets/reports/results/viralrecon.md index ac477fc1..48533c3d 100644 --- a/bu_isciii/assets/reports/results/viralrecon.md +++ b/bu_isciii/assets/reports/results/viralrecon.md @@ -1,66 +1,72 @@ -## Viralrecon +# Viralrecon + Here we describe the results from the Viralrecon pipeline for viral genome reconstruction. -*Warning:* Some of the files listed here may not be in your `RESULTS` folder. It will depend on the analysis you requested. +> [!WARNING] +> Some of the files listed here may not be in your `RESULTS` folder. It will depend on the analysis you requested. + +## Mapping approach results -### Mapping approach results * mapping_illumina.xlsx: statistics for mapped reads against viral and host genomes.
- - medianDPcoveragevirus: Median depth of coverage of the reference viral genome - - Coverage>10x(%): Percentage of viral reference genome coverage to more than 10X - - Variantsinconsensusx10: Number of variants included in the consensus after filtering: more than 10X and 0.75 AF - - %Ns10x: Percentage of consesus genome masked due to having less than 10X depth - - Lineage: Pangolin assigned lineage. *Warning: Only for SARS-CoV-2 sequencing data* - - Date: Analysis date. *Warning: Only for SARS-CoV-2 sequencing data* + * run: Run name + * user: User name + * host: Host name + * Virussequence: Reference virus used + * sample: Sample name + * totalreads: Total reads after trimming + * readshostR1: Total reads of host genome in R1 + * readshost: Total reads of host genome in R1 and R2 + * %readshost: Percentage of reads that correspond to the host genome + * readsvirus: Number of reference viral genome reads + * %readsvirus: Percentage of reference viral genome reads + * unmappedreads: number of reads that did not correspond to viral reference or host genome. + * %unmapedreads: Percentage of reads that did not correspond to viral reference or host genome. + * medianDPcoveragevirus: Median depth of coverage of the reference viral genome + * Coverage>10x(%): Percentage of viral reference genome coverage to more than 10X + * Variantsinconsensusx10: Number of variants included in the consensus after filtering: more than 10X and 0.75 AF + * %Ns10x: Percentage of consesus genome masked due to having less than 10X depth + * Lineage: Pangolin assigned lineage. *Warning: Only for SARS-CoV-2 sequencing data* + * Date: Analysis date. *Warning: Only for SARS-CoV-2 sequencing data* * mapping_consensus: this folder contains the masked (<10x) genomes obtained with consensus sequences using mapping and majority variant calling * variants_annot: table with all annotated variants. *Warning: Only when annotation .gff file was provided* * variants_long_table.xlsx: Table with variants for all the samples in long format. *Warning: Only when annotation .gff file was provided* - * SAMPLE: sample name - * CHROM: Reference ID - * POS: Position of the variant - * REF: Ref allele - * ALT: Alt allele - * FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. If not, the name of the filter that wasn't passed will appear - * DP: Position depth - * REF_DP: Ref allele depth - * ALT_DP: Alt allele depth - * AF: Allele frequency - * GENE: Gene name in annotation file​ - * EFFECT: Effect of the variant - * HGVS_C: Position annotation at CDS level - * HGVS_P: Position annotation at protein level - * HGVS_P_1LETTER: Position annotation at protein level with the aminoacid annotation in 1 letter format - * Caller: Variant caller used + * SAMPLE: sample name + * CHROM: Reference ID + * POS: Position of the variant + * REF: Ref allele + * ALT: Alt allele + * FILTER: Column indicating if the variant passed the filters. If PASS the variant passed all the filters. 
+  * DP: Position depth
+  * REF_DP: Ref allele depth
+  * ALT_DP: Alt allele depth
+  * AF: Allele frequency
+  * GENE: Gene name in annotation file
+  * EFFECT: Effect of the variant
+  * HGVS_C: Position annotation at CDS level
+  * HGVS_P: Position annotation at protein level
+  * HGVS_P_1LETTER: Position annotation at protein level with the amino acid annotation in 1 letter format
+  * Caller: Variant caller used
 * pangolin.xlsx: Pangolin complete results *Warning: Only for SARS-CoV-2 sequencing data*
 * nextclade.xlsx: Results from Nextclade *Warning: Only for SARS-CoV-2 sequencing data*
-### *de novo* assembly approach results
+## *de novo* assembly approach results
+
 * assembly_stats.xlsx: Stats of the *de novo* assembly steps. This table contains the following columns:
-  - run: Run name
-  - user: User name
-  - host: Host name
-  - Virussequence: Reference virus used
-  - sample: Sample name
-  - totalreads: Total reads after trimming
-  - readshostR1: Total reads of host genome in R1
-  - readshost: Total reads of host genome in R1 and R2
-  - %readshost: Percentage of reads that correspond to the host genome
-  - Non-host-reedas: Number of reads remaining after host removal
-  - \#Contigs: Number of contigs in the assembly
-  - Largest contig: Size in nucleotides of the larges contig in the assembly
-  - % Genome fraction: Percentage of the reference genome covered by the assembly. *Warning: Only when reference genome was provided*
+  * run: Run name
+  * user: User name
+  * host: Host name
+  * Virussequence: Reference virus used
+  * sample: Sample name
+  * totalreads: Total reads after trimming
+  * readshostR1: Total reads of host genome in R1
+  * readshost: Total reads of host genome in R1 and R2
+  * %readshost: Percentage of reads that correspond to the host genome
+  * Non-host-reedas: Number of reads remaining after host removal
+  * \#Contigs: Number of contigs in the assembly
+  * Largest contig: Size in nucleotides of the largest contig in the assembly
+  * % Genome fraction: Percentage of the reference genome covered by the assembly. *Warning: Only when reference genome was provided*
 * assembly_spades: Scaffolds fasta files with the spades de novo assembly. *Warning: Only when NO reference genome was provided, or reference genome didn't match*
 * abacas_assembly: spades de novo assembly where contigs were contiguated using ABACAS and the reference genome. *Warning: Only when reference genome was provided*
-*Warning:* Software's versions used in this analysis can be obtained from the `MultiQC` report.
+
+> [!WARNING]
+> Software versions used in this analysis can be obtained from the `MultiQC` report.
diff --git a/bu_isciii/assets/reports/results/wgmlst_chewbbaca.md b/bu_isciii/assets/reports/results/wgmlst_chewbbaca.md
new file mode 100644
index 00000000..62147f8f
--- /dev/null
+++ b/bu_isciii/assets/reports/results/wgmlst_chewbbaca.md
@@ -0,0 +1,16 @@
+## cgMLST/wgMLST
+
+Here we describe the results from the cgMLST/wgMLST service using ChewBBACA and GrapeTree.
+
+- `mlst/allelecall_report.html`: An HTML report that contains the following components:
+  - A table with the total number of samples, total number of loci, total number of coding sequences (CDSs) extracted from the samples, total number of CDSs classified and totals per classification type.
+  - A tab panel with stacked bar charts for the classification type counts per sample and per locus.
+  - A tab panel with detailed sample and locus statistics.
+  - If a TSV file with annotations is provided to the --annotations parameter, the report will also include a table with the provided annotations. Otherwise, it will display a warning informing that no annotations were provided.
+  - A Heatmap chart representing the loci presence-absence matrix for all samples in the dataset.
+  - A Heatmap chart representing the allelic distance matrix for all samples in the dataset.
+  - A tree drawn with Phylocanvas.gl based on the Neighbor-Joining (NJ) tree computed by FastTree.
+- `mlst/distance_matrix_symmetric.tsv`: Symmetric distance matrix. The distances are computed by determining the number of allelic differences from the set of core loci (shared by 100% of the samples) between each pair of samples.
+- `mlst/tree.nwk`: Newick tree from the Minimum Spanning Tree.
+- `mlst/tree.svg`: Minimum Spanning Tree SVG (Scalable Vector Graphics) plot. Branches longer than 700 are shown shortened.
+- `results_alleles.tsv`: Contains the allelic profiles determined for the input samples. The first column has the identifiers of the genome assemblies for which the allele call was performed. The remaining columns contain the allele call data for loci present in the schema, with the column headers being the locus identifiers. The INF- prefix in the allelic number indicates that such allele was newly inferred in that genome, and the number following the prefix is the ID attributed to such allele. For the PLOT classification, in the allelic profile output, a locus can be classified as PLOT5 or PLOT3 depending on whether the CDS in the genome under analysis matching the schema locus is located in the 5' end or 3' end (respectively) of the contig. All other annotations are identical to what was described above.
\ No newline at end of file
diff --git a/bu_isciii/autoclean_sftp.py b/bu_isciii/autoclean_sftp.py
index 43d2e76d..846eb003 100755
--- a/bu_isciii/autoclean_sftp.py
+++ b/bu_isciii/autoclean_sftp.py
@@ -65,16 +65,14 @@ class AutoremoveSftpService:
     within 14 days
     """

-    def __init__(self, path=None, days=14):
+    def __init__(self, path=None, days=14, conf=None):
         # Parse input path
         if path is None:
             use_default = bu_isciii.utils.prompt_yn_question(
                 "Use default path?: ", dflt=False
             )
             if use_default:
-                data_path = bu_isciii.config_json.ConfigJson().get_configuration(
-                    "global"
-                )["data_path"]
+                data_path = conf.get_configuration("global")["data_path"]
                 self.path = os.path.join(data_path, "sftp")
             else:
                 self.path = bu_isciii.utils.prompt_path(
diff --git a/bu_isciii/bioinfo_doc.py b/bu_isciii/bioinfo_doc.py
index 52e72dfc..4da84672 100755
--- a/bu_isciii/bioinfo_doc.py
+++ b/bu_isciii/bioinfo_doc.py
@@ -11,6 +11,7 @@
 import markdown
 import pdfkit
 import PyPDF2
+import yaml
 import subprocess
 import json
 import shutil
@@ -47,6 +48,7 @@ def __init__(
         results_md=False,
         api_user=None,
         api_password=None,
+        conf=None,
         email_psswd=None,
     ):
         if type is None:
@@ -54,7 +56,7 @@ def __init__(
             msg="Select the documentation type you want to create",
             choices=["service_info", "delivery"],
         )
-        self.conf = bu_isciii.config_json.ConfigJson().get_configuration("bioinfo_doc")
+        self.conf = conf.get_configuration("bioinfo_doc")
         if path is None:
             if ask_path:
                 self.path = bu_isciii.utils.prompt_path(
@@ -71,12 +73,12 @@ def __init__(
             self.resolution_id = bu_isciii.utils.prompt_resolution_id()
         else:
             self.resolution_id = resolution_id
-        conf_api = bu_isciii.config_json.ConfigJson().get_configuration("api_settings")
+        conf_api = conf.get_configuration("api_settings")
         self.rest_api = bu_isciii.drylab_api.RestServiceApi(
            conf_api["server"], conf_api["api_url"], api_user, api_password
         )
         self.resolution_info = self.rest_api.get_request(
-            request_info="service-data", safe=False, resolution=self.resolution_id
+            request_info="service-data", safe=True, resolution=self.resolution_id
         )
         if self.resolution_info == 404:
             print("Received Error 404 from Iskylims API. Aborting")
@@ -91,7 +93,7 @@ def __init__(
         else:
             self.post_delivery_info()
             self.resolution_info = self.rest_api.get_request(
-                request_info="service-data", safe=False, resolution=self.resolution_id
+                request_info="service-data", safe=True, resolution=self.resolution_id
             )
         self.services_requested = self.resolution_info["resolutions"][0][
             "available_services"
         ]
@@ -157,7 +159,7 @@ def __init__(
         )

         if self.type == "delivery":
-            self.sftp_data = bu_isciii.utils.get_sftp_folder(self.resolution_info)
+            self.sftp_data = bu_isciii.utils.get_sftp_folder(conf, self.resolution_info)
         if self.type == "delivery" and sftp_folder is None:
             self.sftp_folder = self.sftp_data[0]
         else:
@@ -183,9 +185,18 @@ def __init__(
             self.path, self.conf["services_path"], year, self.service_name
         )
         self.samples = self.resolution_info.get("samples", None)
+        self.versions = self.load_versions()
         self.handled_services = None
-        path_to_wkhtmltopdf = os.path.normpath(self.conf["wkhtmltopdf_path"])
-        self.config_pdfkit = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)
+        self.all_services = None
+        try:
+            self.config_pdfkit = pdfkit.configuration()
+        except OSError as e:
+            stderr.print(
+                "[red] wkhtmltopdf executable was not found. Install it using the conda environment."
+            )
+            stderr.print(f"[red] Error: {e}")
+            sys.exit()
+
         if self.type == "service_info":
             self.template_file = self.conf["service_info_template_path_file"]
         else:
@@ -198,6 +209,41 @@ def __init__(
         else:
             self.email_psswd = email_psswd

+        if self.type == "delivery":
+            service_list = {}
+            for service_id_requested in self.service_ids_requested_list:
+                service_list[
+                    service_id_requested
+                ] = bu_isciii.service_json.ServiceJson().get_find(
+                    service_id_requested, "label"
+                )
+            self.all_services = service_list
+
+    def load_versions(self):
+        """Load and parse the versions.yml file."""
+        result = subprocess.run(
+            f"find /data/bi/services_and_colaborations/*/*/{self.service_name} -name '*versions.yml'",
+            stdout=subprocess.PIPE,
+            text=True,
+            shell=True,
+        )
+        versions_files = result.stdout.strip().split("\n")
+        if versions_files == [""]:
+            stderr.print(
+                f"[red] No versions.yml files found for the service {self.service_name}!"
+ ) + return "No software versions data available for this service" + else: + versions_data = {} + loaded_contents = [] + for versions_file in versions_files: + with open(versions_file, "r") as f: + content = yaml.safe_load(f) + if content not in loaded_contents: + versions_data[versions_file] = content + loaded_contents.append(content) + return versions_data + def create_structure(self): if os.path.exists(self.service_folder): log.info("Already creted the service folder for %s", self.service_folder) @@ -282,7 +328,7 @@ def post_delivery_info(self): if self.provided_txt: with open(os.path.expanduser(self.provided_txt)) as f: - self.delivery_notes = " ".join([x.strip() for x in f.readlines()]) + self.delivery_notes = f.read() else: self.delivery_notes = bu_isciii.utils.ask_for_some_text( msg="Write some delivery notes:" @@ -324,6 +370,8 @@ def create_markdown(self, file_path): # service related information markdown_data["service"] = self.resolution_info markdown_data["user_data"] = self.resolution_info["service_user_id"] + markdown_data["software_versions"] = self.versions + markdown_data["services_list"] = self.all_services samples_in_service = {} if self.samples is not None: @@ -380,6 +428,7 @@ def convert_markdown_to_html(self, mk_text): "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", + "nl2br", ], extension_configs={ "pymdownx.b64": { @@ -415,6 +464,7 @@ def convert_to_pdf(self, html_file): ) except OSError as e: stderr.print("[red] Unable to convert to PDF") + stderr.print(f"[red] Error: {e}") log.exception("Unable to create pdf.", exc_info=e) return @@ -593,11 +643,13 @@ def email_creation(self): if bu_isciii.utils.prompt_yn_question( f"Do you want to use notes from {self.provided_txt}?", dflt=False ): - email_data["email_notes"] = self.delivery_notes + email_data["email_notes"] = self.delivery_notes.replace( + "\n", "
" + ) else: email_data["email_notes"] = bu_isciii.utils.ask_for_some_text( msg="Write email notes" - ) + ).replace("\n", "
") email_data["user_data"] = self.resolution_info["service_user_id"] email_data["service_id"] = self.service_name.split("_", 5)[0] diff --git a/bu_isciii/clean.py b/bu_isciii/clean.py index 70cb1c1d..f1f14fc6 100644 --- a/bu_isciii/clean.py +++ b/bu_isciii/clean.py @@ -31,6 +31,7 @@ def __init__( option=None, api_user=None, api_password=None, + conf=None, ): # access the api with the resolution name to obtain the data # ask away if no input given @@ -40,15 +41,13 @@ def __init__( self.resolution_id = resolution_id # Obtain info from iskylims api - self.conf = bu_isciii.config_json.ConfigJson().get_configuration("cleanning") - conf_api = bu_isciii.config_json.ConfigJson().get_configuration( - "xtutatis_api_settings" - ) + self.conf = conf.get_configuration("cleanning") + conf_api = conf.get_configuration("xtutatis_api_settings") rest_api = bu_isciii.drylab_api.RestServiceApi( conf_api["server"], conf_api["api_url"], api_user, api_password ) self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" @@ -56,7 +55,9 @@ def __init__( self.services_requested = self.resolution_info["resolutions"][0][ "available_services" ] - self.service_samples = self.resolution_info["samples"] + self.service_samples = [ + sample_id["sample_name"] for sample_id in self.resolution_info["samples"] + ] if ask_path and path is None: stderr.print( @@ -77,7 +78,10 @@ def __init__( sys.exit() else: self.path = bu_isciii.utils.get_service_paths( - "services_and_colaborations", self.resolution_info, "non_archived_path" + conf, + "services_and_colaborations", + self.resolution_info, + "non_archived_path", ) self.full_path = os.path.join(self.path, self.service_folder) @@ -92,14 +96,13 @@ def __init__( self.delete_files = self.get_clean_items(self.services_to_clean, type="files") # self.delete_list = [item for item in self.delete_list if item] self.nocopy = self.get_clean_items(self.services_to_clean, type="no_copy") - self.service_samples = self.resolution_info.get("Samples", None) if option is None: self.option = bu_isciii.utils.prompt_selection( "Options", [ "full_clean", - "rename_nocopy", + "rename", "clean", "revert_renaming", "show_removable", @@ -126,10 +129,16 @@ def get_clean_items(self, services_ids, type="files"): for service in services_ids: try: items = service_conf.get_find_deep(service, type) - if len(clean_items_list) == 0 and len(items) > 0: - clean_items_list = items - elif len(items) > 0: - clean_items_list.append(items) + if items is None: + stderr.print( + "[red]ERROR: Service type %s not found in services json file for service %s." + % (type, service) + ) + sys.exit() + else: + for item in items: + if item not in clean_items_list: + clean_items_list.append(item) except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." 
@@ -310,10 +319,9 @@ def purge_files(self): files_to_delete = [] for sample_info in self.service_samples: for file in self.delete_files: - file_to_delete = file.replace( - "sample_name", sample_info["sample_name"] - ) - files_to_delete.append(file_to_delete) + file_to_delete = file.replace("sample_name", sample_info) + if file_to_delete not in files_to_delete: + files_to_delete.append(file_to_delete) path_content = self.scan_dirs(to_find=files_to_delete) for file in path_content: os.remove(file) @@ -369,7 +377,7 @@ def delete_work(self): else: stderr.print("There is no work folder here") - def delete_rename(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL"): + def delete(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL"): """ Description: Remove both files and purge folders defined for the service, and rename to tag. @@ -390,10 +398,8 @@ def delete_rename(self, verbose=True, sacredtexts=["lablog", "logs"], add="_DEL" # Purge folders if self.delete_folders != "": self.purge_folders(sacredtexts=sacredtexts, add=add, verbose=verbose) - # Rename to tag. - self.rename(add=add, to_find=self.delete_folders, verbose=verbose) else: - stderr.print("No folders to remove or rename") + stderr.print("No folders to remove") # Purge work self.delete_work() # Delete files @@ -430,8 +436,10 @@ def full_clean(self): Perform and handle the whole cleaning of the service """ - self.delete_rename() + self.delete() self.rename(to_find=self.nocopy, add="_NC", verbose=True) + if self.delete_folders != "": + self.rename(add="_DEL", to_find=self.delete_folders, verbose=True) def handle_clean(self): """ @@ -443,9 +451,11 @@ def handle_clean(self): self.show_nocopy() if self.option == "full_clean": self.full_clean() - if self.option == "rename_nocopy": + if self.option == "rename": self.rename(to_find=self.nocopy, add="_NC", verbose=True) + if self.delete_folders != "": + self.rename(add="_DEL", to_find=self.delete_folders, verbose=True) if self.option == "clean": - self.delete_rename() + self.delete() if self.option == "revert_renaming": self.revert_renaming() diff --git a/bu_isciii/conf/configuration.json b/bu_isciii/conf/configuration.json old mode 100755 new mode 100644 index 7a39ff99..3551c276 --- a/bu_isciii/conf/configuration.json +++ b/bu_isciii/conf/configuration.json @@ -2,7 +2,11 @@ "global": { "data_path": "/data/bi", "archived_path": "/archived/bi", - "yaml_conf_path": "~/buisciii_config.yml" + "yaml_conf_path": "~/buisciii_config.yml", + "permissions": { + "directory_chmod": "2775", + "file_chmod": "664" + } }, "sftp_copy": { "protocol": "rsync", @@ -16,7 +20,8 @@ "'.nextflow*'", "'*_DEL'", "'*.R'", - "'*.py'" + "'*.py'", + "'*.sbatch'" ] }, "xtutatis_api_settings": { @@ -35,7 +40,6 @@ "delivery_template_path_file": "templates/jinja_template_delivery.j2", "html_template_path_file": "templates/html_service_template.html", "path_to_css": "assets/css", - "wkhtmltopdf_path": "/data/bi/pipelines/miniconda3/envs/buisciii-tools/bin/wkhtmltopdf", "email_host": "mx2.isciii.es", "email_port": "587", "email_host_user": "bioinformatica@isciii.es", @@ -54,7 +58,7 @@ ], "scratch_path": "/scratch/bi/", "srun_settings": { - "--partition": "middle_idx", + "--partition": "middle_obx,middle_idx", "--time": "24:00:00", "--chdir": "/scratch/bi/" } diff --git a/bu_isciii/conf/configuration_dev.json b/bu_isciii/conf/configuration_dev.json new file mode 100755 index 00000000..68e948cb --- /dev/null +++ b/bu_isciii/conf/configuration_dev.json @@ -0,0 +1,66 @@ +{ + "global": { + "data_path": 
"tests/data/bi", + "archived_path": "tests/archived/bi", + "yaml_conf_path": "~/buisciii_config.yml" + }, + "sftp_copy": { + "protocol": "rsync", + "options": ["-rlpv", "--update", "-L", "--inplace"], + "exclusions": [ + "'*_NC'", + "'*lablog*'", + "'work'", + "'00-reads'", + "'*.sh'", + "'.nextflow*'", + "'*_DEL'", + "'*.R'", + "'*.py'", + "'*.sbatch'" + ] + }, + "xtutatis_api_settings": { + "api_url": "/drylab/api/", + "server": "http://iskylims.isciiides.es" + }, + "api_settings": { + "server": "http://iskylims.isciiides.es", + "api_url": "/drylab/api/" + }, + "bioinfo_doc": { + "bioinfodoc_path": "tests/bioinfo_doc/", + "services_path": "services", + "service_folder": ["service_info", "result"], + "service_info_template_path_file": "templates/jinja_template_service_info.j2", + "delivery_template_path_file": "templates/jinja_template_delivery.j2", + "html_template_path_file": "templates/html_service_template.html", + "path_to_css": "assets/css", + "email_host": "mx2.isciii.es", + "email_port": "587", + "email_host_user": "bioinformatica@isciii.es", + "email_use_tls": "True" + }, + "new_service": { + "fastq_repo": "tests/fastq_repo" + }, + "scratch_copy": { + "protocol": "rsync", + "options": ["-rlpv"], + "exclusions": [ + "'*_NC'", + "'service_info.txt'", + "'work'" + ], + "scratch_path": "tests/scratch/bi/", + "srun_settings": { + "--partition": "middle_obx,middle_idx", + "--time": "24:00:00", + "--chdir": "tests/scratch/bi/" + } + }, + "archive": { + "protocol": "rsync", + "options": ["-rv"] + } +} diff --git a/bu_isciii/config_json.py b/bu_isciii/config_json.py index cafad194..5035ec93 100644 --- a/bu_isciii/config_json.py +++ b/bu_isciii/config_json.py @@ -24,7 +24,6 @@ def get_configuration(self, topic): def get_find(self, topic, found): """ - Owner: Pablo Description: Obtain from topic any forward items from json data """ diff --git a/bu_isciii/copy_sftp.py b/bu_isciii/copy_sftp.py index 13cd40f7..9474b59b 100644 --- a/bu_isciii/copy_sftp.py +++ b/bu_isciii/copy_sftp.py @@ -33,6 +33,7 @@ def __init__( sftp_folder=None, api_user=None, api_password=None, + conf=None, ): if resolution_id is None: self.resolution_id = bu_isciii.utils.prompt_resolution_id() @@ -40,10 +41,8 @@ def __init__( self.resolution_id = resolution_id # Load conf - self.conf = bu_isciii.config_json.ConfigJson().get_configuration("sftp_copy") - conf_api = bu_isciii.config_json.ConfigJson().get_configuration( - "xtutatis_api_settings" - ) + self.conf = conf.get_configuration("sftp_copy") + conf_api = conf.get_configuration("xtutatis_api_settings") # Obtain info from iskylims api rest_api = bu_isciii.drylab_api.RestServiceApi( @@ -51,10 +50,12 @@ def __init__( ) self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) if sftp_folder is None: - self.sftp_folder = bu_isciii.utils.get_sftp_folder(self.resolution_info)[0] + self.sftp_folder = bu_isciii.utils.get_sftp_folder( + conf, self.resolution_info + )[0] else: self.sftp_folder = sftp_folder @@ -64,9 +65,7 @@ def __init__( self.services_requested = self.resolution_info["resolutions"][0][ "available_services" ] - self.sftp_options = bu_isciii.config_json.ConfigJson().get_find( - "sftp_copy", "options" - ) + self.sftp_options = conf.get_find("sftp_copy", "options") self.services_to_copy = bu_isciii.utils.get_service_ids(self.services_requested) self.last_folders = self.get_last_folders( @@ -89,7 +88,10 @@ def __init__( sys.exit() 
else: self.path = bu_isciii.utils.get_service_paths( - "services_and_colaborations", self.resolution_info, "non_archived_path" + conf, + "services_and_colaborations", + self.resolution_info, + "non_archived_path", ) self.full_path = os.path.join(self.path, self.service_folder) @@ -110,8 +112,9 @@ def get_last_folders(self, services_ids, type="last_folder"): last_folders_list = [] for service in services_ids: try: - items = service_conf.get_find_deep(service, type) - last_folders_list.append(items) + item = service_conf.get_find_deep(service, type) + if item not in last_folders_list: + last_folders_list.append(item) except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." diff --git a/bu_isciii/new_service.py b/bu_isciii/new_service.py index c9eca73d..0bcf6556 100755 --- a/bu_isciii/new_service.py +++ b/bu_isciii/new_service.py @@ -34,6 +34,7 @@ def __init__( ask_path=False, api_user=None, api_password=None, + conf=None, ): if resolution_id is None: self.resolution_id = bu_isciii.utils.prompt_resolution_id() @@ -46,16 +47,14 @@ def __init__( self.no_create_folder = no_create_folder # Load conf - self.conf = bu_isciii.config_json.ConfigJson().get_configuration("new_service") - conf_api = bu_isciii.config_json.ConfigJson().get_configuration( - "xtutatis_api_settings" - ) + self.conf = conf.get_configuration("new_service") + conf_api = conf.get_configuration("xtutatis_api_settings") # Obtain info from iskylims api self.rest_api = bu_isciii.drylab_api.RestServiceApi( conf_api["server"], conf_api["api_url"], api_user, api_password ) self.resolution_info = self.rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" @@ -82,7 +81,10 @@ def __init__( sys.exit() else: self.path = bu_isciii.utils.get_service_paths( - "services_and_colaborations", self.resolution_info, "non_archived_path" + conf, + "services_and_colaborations", + self.resolution_info, + "non_archived_path", ) self.full_path = os.path.join(self.path, self.service_folder) @@ -123,13 +125,13 @@ def copy_template(self): ) services_ids = bu_isciii.utils.get_service_ids(self.services_requested) services_json = bu_isciii.service_json.ServiceJson() - if len(services_ids) == 1: + for service_id in services_ids: try: - service_template = services_json.get_find(services_ids[0], "template") + service_template = services_json.get_find(service_id, "template") except KeyError as e: stderr.print( "[red]ERROR: Service id %s not found in services json file." - % services_ids[0] + % service_id ) stderr.print("traceback error %s" % e) sys.exit() @@ -151,13 +153,6 @@ def copy_template(self): stderr.print("[red]ERROR: Copying template failed.") stderr.print("traceback error %s" % e) sys.exit() - else: - stderr.print( - "[red] ERROR: I'm not already prepared for handling more than one error at the same time, sorry!" - "Please re-run and select one of the service ids." - ) - sys.exit(1) - return False return True def create_samples_id(self): @@ -212,11 +207,9 @@ def create_symbolic_links(self): ) except OSError as e: stderr.print( - "[red]ERROR: Symbolic links creation failed for sample %s." - % sample["sampleName"] + "[red]ERROR: Symbolic links creation failed for file %s." 
% file ) stderr.print("Traceback: %s" % e) - sys.exit() def samples_json(self): json_samples = json.dumps(self.service_samples, indent=4) @@ -235,16 +228,7 @@ def create_new_service(self): self.create_samples_id() self.create_symbolic_links() self.samples_json() - self.rest_api.put_request( - "update-state", "resolution", self.resolution_id, "state", "in_progress" - ) - else: - stderr.print( - "[yellow]WARN: No samples recorded in service: " + self.resolution_id - ) - if bu_isciii.utils.prompt_yn_question("Do you want to proceed?: "): - self.create_folder() - self.copy_template() + if self.resolution_info["service_state"] != "in_progress": self.rest_api.put_request( "update-state", "resolution", @@ -252,6 +236,22 @@ def create_new_service(self): "state", "in_progress", ) + + else: + stderr.print( + "[yellow]WARN: No samples recorded in service: " + self.resolution_id + ) + if bu_isciii.utils.prompt_yn_question("Do you want to proceed?: "): + self.create_folder() + self.copy_template() + if self.resolution_info["service_state"] != "in_progress": + self.rest_api.put_request( + "update-state", + "resolution", + self.resolution_id, + "state", + "in_progress", + ) else: stderr.print("Directory not created. Bye!") sys.exit(1) diff --git a/bu_isciii/scratch.py b/bu_isciii/scratch.py index d31ca0cd..529493ab 100755 --- a/bu_isciii/scratch.py +++ b/bu_isciii/scratch.py @@ -36,6 +36,7 @@ def __init__( ask_path=False, api_user=None, api_password=None, + conf=None, ): if resolution_id is None: self.resolution_id = bu_isciii.utils.prompt_resolution_id() @@ -55,17 +56,15 @@ def __init__( else: self.direction = direction # Load conf - conf_api = bu_isciii.config_json.ConfigJson().get_configuration( - "xtutatis_api_settings" - ) + conf_api = conf.get_configuration("xtutatis_api_settings") # Obtain info from iskylims api rest_api = bu_isciii.drylab_api.RestServiceApi( conf_api["server"], conf_api["api_url"], api_user, api_password ) - self.conf = bu_isciii.config_json.ConfigJson().get_configuration("scratch_copy") + self.conf = conf.get_configuration("scratch_copy") self.resolution_info = rest_api.get_request( - request_info="service-data", safe=False, resolution=self.resolution_id + request_info="service-data", safe=True, resolution=self.resolution_id ) self.service_folder = self.resolution_info["resolutions"][0][ "resolution_full_number" @@ -91,7 +90,10 @@ def __init__( sys.exit() else: self.path = bu_isciii.utils.get_service_paths( - "services_and_colaborations", self.resolution_info, "non_archived_path" + conf, + "services_and_colaborations", + self.resolution_info, + "non_archived_path", ) self.full_path = os.path.join(self.path, self.service_folder) @@ -185,6 +187,16 @@ def revert_copy_scratch(self): sync_source_contents=False, ) self.srun_command(self.srun_settings, rsync_command) + + # After successful rsync, apply correct permissions + conf = bu_isciii.config_json.ConfigJson() + permissions_config = conf.get_configuration("global").get( + "permissions" + ) + bu_isciii.utils.remake_permissions( + self.full_path, permissions_config + ) + stderr.print( "[green]Successfully copied the directory to %s" % dest_folder, @@ -199,7 +211,7 @@ def revert_copy_scratch(self): except Exception as e: stderr.print(e) stderr.print( - "[red]ERROR: Copy of the directory %s failed" + "[red]ERROR: Copy of directory %s failed" % self.scratch_tmp_path, highlight=False, ) @@ -239,17 +251,17 @@ def remove_scratch(self): if self.service_folder in scratch_folder: shutil.rmtree(scratch_folder) stderr.print( - 
"[green]Successfully removed the directory %s" % scratch_folder, + "[green]Successfully removed directory %s" % scratch_folder, highlight=False, ) else: log.error( - f"Directory path not the same as service resolution. Skip folder copy '{scratch_folder}'" + f"Directory path is not the same as service resolution. Skip folder copy '{scratch_folder}'" ) stderr.print( "[red]ERROR: Directory " + scratch_folder - + " not the same as " + + " is not the same as " + self.scratch_tmp_path, highlight=False, ) diff --git a/bu_isciii/service_json.py b/bu_isciii/service_json.py index d9f2a280..bbe8a111 100644 --- a/bu_isciii/service_json.py +++ b/bu_isciii/service_json.py @@ -38,7 +38,6 @@ def get_service_configuration(self, service): def get_find(self, service, found): """ - Owner: Pablo Description: Obtain from service any forward items from json data """ diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog index 66e8db87..0062b7b6 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog @@ -1,7 +1,7 @@ -#module load FastQC +#module load singularity mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --cpus-per-task 8 --time 01:00:00 --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log fastqc -o $in --nogroup -t 8 -k 8 ../00-reads/"$in"_R1.fastq.gz ../00-reads/"$in"_R2.fastq.gz &"; done > _01_rawfastqc.sh +cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --cpus-per-task 8 --time 01:00:00 --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastqc:0.11.9--hdfd78af_1 fastqc -o ${scratch_dir}/$in --nogroup -t 8 -k 8 ${scratch_dir}/../00-reads/"$in"_R1.fastq.gz ${scratch_dir}/../00-reads/"$in"_R2.fastq.gz &"; done > _01_rawfastqc.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog index 46ab047b..4d355c58 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog @@ -1,4 +1,4 @@ -# module load fastp +# module load singularity mkdir logs scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') -cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 fastp --in1 ../00-reads/@@_R1.fastq.gz --in2 ../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json @@/@@_fastp.json --html @@/@@_fastp.html --out1 @@/@@_R1_filtered.fastq.gz --out2 @@/@@_R2_filtered.fastq.gz --unpaired1 @@/@@_R1_unpaired.fastq.gz --unpaired2 @@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh +cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ 
/data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz --unpaired1 ${scratch_dir}/@@/@@_R1_unpaired.fastq.gz --unpaired2 ${scratch_dir}/@@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh
diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog
index f0636764..c49d7076 100644
--- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog
+++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog
@@ -1,7 +1,7 @@
-#module load FastQC
+#module load singularity

 mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log fastqc -o $in --nogroup -t 8 -k 8 ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz &"; done > _01_rawfastqc.sh
+cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/fastqc:0.11.9--hdfd78af_1 fastqc -o ${scratch_dir}/$in --nogroup -t 8 -k 8 ${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz &"; done > _01_rawfastqc.sh
diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
new file mode 100644
index 00000000..1cd41672
--- /dev/null
+++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py
@@ -0,0 +1,1020 @@
+# imports
+from Bio import SeqIO
+import statistics
+import argparse
+import sys
+
+
+def parse_args(args=None):
+    Description = "Convert alignment between IRMA consensus and reference fasta to VCF file using IRMA stats"
+    Epilog = """Example usage: python create_irma_vcf.py -a <alignment> -i <irma_alleles> -o <out_vcf>"""
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument(
+        "-a",
+        "--alignment",
+        type=str,
+        required=True,
+        help="Alignment file",
+    )
+    parser.add_argument(
+        "-i",
+        "--irma_alleles",
+        type=str,
+        required=True,
+        help="IRMA allAlleles.txt file",
+    )
+    parser.add_argument(
+        "-o",
+        "--out_vcf",
+        type=str,
+        required=True,
+        help="Output vcf file",
+    )
+    parser.add_argument(
+        "-f",
+        "--frequency",
+        type=float,
+        default=0.25,
+        required=True,
+        help="Minimum Allele Frequency for a variant to be included in the .vcf file. Default 0.25.",
+    )
+    parser.add_argument(
+        "-d",
+        "--depth",
+        type=int,
+        default=10,
+        required=True,
+        help="Minimum depth for a variant to be included in the .vcf file. Default 10X.",
+    )
+    return parser.parse_args(args)
+
+
+def alleles_to_dict(alleles_file, frequency, depth):
+    """Convert IRMA's allAlleles file to dictionary.
+
+    Parameters
+    ----------
+    alleles_file : str
+        Path to the alleles file.
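+    frequency : float
+        Minimum allele frequency for an allele to be kept (--frequency).
+    depth : int
+        Minimum position depth for an allele to be kept (--depth).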
+ + Returns + ------- + alleles_dict + Dictionary containing alleles information with chrom+positions+allele as key. e.g. + { + "rsv_a2_1_A": { + "Reference_Name": "rsv_a2", + "Position": "1", + "Allele": "A", + "Count": "2", + "Total": "2", + "Frequency": "1", + "Average_Quality": "29.5", + "ConfidenceNotMacErr": "0.998877981545698", + "PairedUB": "1", + "QualityUB": "1", + "Allele_Type": "Consensus" + }, + "rsv_a2_2204_A": { + "Reference_Name": "rsv_a2", + "Position": "2204", + "Allele": "A", + "Count": "6532", + "Total": "15323", + "Frequency": "0.426287280558637", + "Average_Quality": "34.5708818126148", + "ConfidenceNotMacErr": "0.999181140401206", + "PairedUB": "0.00396999257813604", + "QualityUB": "0.0010642711614851", + "Allele_Type": "Minority" + }, + "rsv_a2_2204_G": { + "Reference_Name": "rsv_a2", + "Position": "2204", + "Allele": "G", + "Count": "8768", + "Total": "15323", + "Frequency": "0.5722117078901", + "Average_Quality": "35.0286268248175", + "ConfidenceNotMacErr": "0.999450989591763", + "PairedUB": "0.00396999257813604", + "QualityUB": "0.00100698799816366", + "Allele_Type": "Consensus" + }, + } + """ + + alleles_dict = {} + with open(alleles_file, "r") as file: + header = file.readline().strip().split("\t") + for line in file: + while line.count("\t") < len(header) - 1: + line += file.readline() + line_data = line.strip().split("\t") + position = int(line_data[1]) + variant_af = float(line_data[5]) + position_dp = float(line_data[4]) + if variant_af >= frequency and position_dp >= depth: + entry_dict = {header[i]: line_data[i] for i in range(len(header))} + variant = ( + str(line_data[0]) + "_" + str(position) + "_" + str(line_data[2]) + ) + alleles_dict[variant] = entry_dict + return alleles_dict + + +def align2dict(alignment_file): + """Convert alignment file to dictionary. + + Parameters + ---------- + alignment_file : str + Path to the alignment file in fasta format. + + Returns + ------- + vcf_dict + Dictionary containing alignment information with alignment positions as keys. 
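+        Each value records CHROM, REF_POS, SAMPLE_POS, REF, ALT and a TYPE of
+        SNP, INS, DEL or REF for the corresponding alignment column.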
+ E.g.: + { + "10": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 1, + "SAMPLE_POS": [ + 8, + 9 + ], + "REF": "A", + "ALT": "AAA", + "TYPE": "INS" + }, + "11": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 2, + "SAMPLE_POS": [ + 11 + ], + "REF": "A", + "ALT": "A", + "TYPE": "REF" + }, + "7542": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7542 + ], + "REF": "T", + "ALT": "TT", + "TYPE": "INS" + }, + "7543": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7543 + ], + "REF": "T", + "ALT": "TC", + "TYPE": "INS" + }, + "7544": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7544 + ], + "REF": "C", + "ALT": "CA", + "TYPE": "INS" + }, + "10081": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10068, + "SAMPLE_POS": [ + 10079 + ], + "REF": "AA", + "ALT": "A", + "TYPE": "DEL" + }, + "10082": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10069, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-C", + "ALT": "-", + "TYPE": "DEL" + }, + "10083": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10070, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-T", + "ALT": "-", + "TYPE": "DEL" + } + } + """ + sequences_dict = {} + with open(alignment_file, "r") as alignment: + for sequence in SeqIO.parse(alignment, "fasta"): + sequences_dict[sequence.id] = str(sequence.seq) + sample_id, sample_seq = list(sequences_dict.items())[0] + ref_id, ref_seq = list(sequences_dict.items())[1] + sample_position = 0 + ref_position = 0 + vcf_dict = {} + CHROM = ref_id + ALT = "" + SAMPLE_POS = [] + for i, (sample_base, ref_base) in enumerate(zip(sample_seq, ref_seq)): + align_position = i + 1 + if sample_base != "-": + sample_position += 1 + if ref_base != "-": + ref_position += 1 + if ref_base == "-" and sample_base != "N": + if ref_position == 0: + ALT += sample_base + SAMPLE_POS.append(sample_position) + else: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": sample_seq[i - 1], + "ALT": sample_seq[i - 1] + sample_base, + "TYPE": "INS", + } + vcf_dict[align_position] = content_dict + elif ref_position == 1 and len(SAMPLE_POS) > 1: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": SAMPLE_POS, + "REF": ref_base, + "ALT": ALT + sample_base, + "TYPE": "INS", + } + vcf_dict[align_position] = content_dict + elif sample_base == "-" and ref_base != "N": + if sample_position == 0: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base + ref_seq[i + 1], + "ALT": ref_seq[i + 1], + "TYPE": "DEL", + } + vcf_dict[align_position] = content_dict + else: + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position - 1, + "SAMPLE_POS": [sample_position], + "REF": sample_seq[i - 1] + ref_base, + "ALT": sample_seq[i - 1], + "TYPE": "DEL", + } + vcf_dict[align_position] = content_dict + elif ( + ref_base != sample_base + and ref_base != "N" + and ref_base != "-" + and sample_base != "N" + and sample_base != "-" + ): + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base, + "ALT": sample_base, + "TYPE": "SNP", + } + vcf_dict[align_position] = content_dict + elif ( + ref_base != "N" + and ref_base != "-" + and sample_base != "N" + and sample_base != "-" + ): + content_dict = { + "CHROM": CHROM, + "REF_POS": ref_position, + "SAMPLE_POS": [sample_position], + "REF": ref_base, + "ALT": sample_base, + "TYPE": "REF", + } + vcf_dict[align_position] = content_dict + return vcf_dict + + +def 
stats_vcf(vcf_dictionary, alleles_dictionary, last_pos, last_allele): + """Add stats to VCF dictionary. + + Parameters + ---------- + vcf_dictionary : dict + Dictionary containing VCF information. + alleles_dictionary : dict + Dictionary containing alleles information. + + Returns + ------- + af_vcf_dict + Updated dictionary with allele frequencies and other metrics. + E.g: + { + "EPI_ISL_18668201_1_AAA": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 1, + "SAMPLE_POS": [ + 8, + 9 + ], + "REF": "A", + "ALT": "AAA", + "TYPE": "INS", + "DP": [ + "9", + "10" + ], + "TOTAL_DP": [ + "9", + "10" + ], + "AF": [ + "1", + "1" + ], + "QUAL": [ + "33.7777777777778", + "34" + ] + }, + "EPI_ISL_18668201_10_A": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10, + "SAMPLE_POS": [ + 19 + ], + "REF": "T", + "ALT": "A", + "TYPE": "SNP", + "DP": [ + "60" + ], + "TOTAL_DP": [ + "72" + ], + "AF": [ + "0.833333333333333" + ], + "QUAL": [ + "34.0166666666667" + ] + }, + "EPI_ISL_18668201_7531_TT": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7542 + ], + "REF": "T", + "ALT": "TT", + "TYPE": "INS", + "DP": [ + "74" + ], + "TOTAL_DP": [ + "75" + ], + "AF": [ + "0.986666666666667" + ], + "QUAL": [ + "34.8648648648649" + ] + }, + "EPI_ISL_18668201_7531_TC": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7543 + ], + "REF": "T", + "ALT": "TC", + "TYPE": "INS", + "DP": [ + "75" + ], + "TOTAL_DP": [ + "75" + ], + "AF": [ + "1" + ], + "QUAL": [ + "35.04" + ] + }, + "EPI_ISL_18668201_7531_CA": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7544 + ], + "REF": "C", + "ALT": "CA", + "TYPE": "INS", + "DP": [ + "75" + ], + "TOTAL_DP": [ + "75" + ], + "AF": [ + "1" + ], + "QUAL": [ + "33.8533333333333" + ] + }, + "EPI_ISL_18668201_10067_A": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10067, + "SAMPLE_POS": [ + 10079 + ], + "REF": "AA", + "ALT": "A", + "TYPE": "DEL", + "DP": [ + "10" + ], + "TOTAL_DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + }, + "EPI_ISL_18668201_10068_-": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10068, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-C", + "ALT": "-", + "TYPE": "DEL", + "DP": [ + "10" + ], + "TOTAL_DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + }, + "EPI_ISL_18668201_10069_-": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10069, + "SAMPLE_POS": [ + 10079 + ], + "REF": "-T", + "ALT": "-", + "TYPE": "DEL", + "DP": [ + "10" + ], + "TOTAL_DP": [ + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ] + } + } + """ + + af_vcf_dict = {} + for _, value in alleles_dictionary.items(): + pos = value["Position"] + chrom = next(iter(vcf_dictionary.values()))["CHROM"] + + if int(pos) > last_pos and value["Allele_Type"] == "Minority": + content_dict = { + "CHROM": chrom, + "REF_POS": last_pos, + "SAMPLE_POS": [pos], + "REF": last_allele, + "ALT": last_allele + value["Allele"], + "TYPE": "INS", + "DP": [value["Count"]], + "TOTAL_DP": [value["Total"]], + "AF": [value["Frequency"]], + "QUAL": [value["Frequency"]], + } + + variant = ( + content_dict["CHROM"] + + "_" + + str(content_dict["REF_POS"]) + + "_" + + "final_ins" + ) + + if variant in af_vcf_dict: + af_vcf_dict[variant]["DP"] += content_dict["DP"] + af_vcf_dict[variant]["TOTAL_DP"] += content_dict["TOTAL_DP"] + af_vcf_dict[variant]["AF"] += content_dict["AF"] + af_vcf_dict[variant]["QUAL"] += content_dict["QUAL"] + af_vcf_dict[variant]["SAMPLE_POS"] += content_dict["SAMPLE_POS"] + af_vcf_dict[variant]["ALT"] += value["Allele"] + else: + 
af_vcf_dict[variant] = content_dict
+            pass
+
+        for align_pos, subdict in vcf_dictionary.items():
+            if (value["Allele_Type"] == "Consensus" and subdict["TYPE"] == "REF") or (
+                value["Allele"] == subdict["REF"]
+                and subdict["TYPE"] not in ["DEL", "INS"]
+            ):
+                continue
+            if 0 in subdict["SAMPLE_POS"] and len(subdict["SAMPLE_POS"]) == 1:
+                content_dict = {
+                    "CHROM": subdict["CHROM"],
+                    "REF_POS": subdict["REF_POS"],
+                    "SAMPLE_POS": subdict["SAMPLE_POS"],
+                    "REF": subdict["REF"],
+                    "ALT": subdict["ALT"],
+                    "TYPE": subdict["TYPE"],
+                    "DP": ["NA"],
+                    "TOTAL_DP": ["NA"],
+                    "AF": ["NA"],
+                    "QUAL": ["NA"],
+                }
+                variant = (
+                    content_dict["CHROM"]
+                    + "_"
+                    + str(content_dict["REF_POS"])
+                    + "_"
+                    + content_dict["ALT"]
+                )
+                af_vcf_dict[variant] = content_dict
+                pass
+
+            if "SAMPLE_POS" in subdict and int(pos) in subdict["SAMPLE_POS"]:
+                DP = []
+                TOTAL_DP = []
+                AF = []
+                QUAL = []
+                content_dict = {
+                    "CHROM": subdict["CHROM"],
+                    "REF_POS": subdict["REF_POS"],
+                    "SAMPLE_POS": subdict["SAMPLE_POS"],
+                    "REF": subdict["REF"],
+                    "ALT": subdict["ALT"],
+                    "TYPE": subdict["TYPE"],
+                }
+                if (
+                    value["Allele"] == content_dict["ALT"]
+                    or value["Allele_Type"] == "Minority"
+                    or content_dict["TYPE"] in ["INS", "DEL", "REF"]
+                ):
+                    if value["Allele_Type"] == "Minority":
+                        content_dict.update({"ALT": value["Allele"]})
+                        content_dict.update({"TYPE": "SNP"})
+                    if value["Allele"] == "-" and value["Allele_Type"] == "Minority":
+                        REF = vcf_dictionary[align_pos - 1]["REF"] + subdict["REF"]
+                        ALT = vcf_dictionary[align_pos - 1]["REF"]
+                        content_dict.update(
+                            {"REF_POS": vcf_dictionary[align_pos - 1]["REF_POS"]}
+                        )
+                        content_dict.update({"REF": REF})
+                        content_dict.update({"ALT": ALT})
+                        content_dict.update({"TYPE": "DEL"})
+                    DP.append(value["Count"])
+                    TOTAL_DP.append(value["Total"])
+                    AF.append(value["Frequency"])
+                    QUAL.append(value["Average_Quality"])
+                else:
+                    print("SNP not the same in .fasta file and alleles file")
+                    print(value)
+                    print(content_dict)
+                content_dict.update(
+                    {"DP": DP, "TOTAL_DP": TOTAL_DP, "AF": AF, "QUAL": QUAL}
+                )
+                variant = (
+                    content_dict["CHROM"]
+                    + "_"
+                    + str(content_dict["REF_POS"])
+                    + "_"
+                    + content_dict["ALT"]
+                )
+
+                if variant in af_vcf_dict:
+                    af_vcf_dict[variant]["DP"] += DP
+                    af_vcf_dict[variant]["TOTAL_DP"] += TOTAL_DP
+                    af_vcf_dict[variant]["AF"] += AF
+                    af_vcf_dict[variant]["QUAL"] += QUAL
+                else:
+                    af_vcf_dict[variant] = content_dict
+                pass
+
+    return af_vcf_dict
+
+
+def combine_indels(vcf_dictionary):
+    """Combine insertion and deletion positions in the VCF dictionary.
+
+    Parameters
+    ----------
+    vcf_dictionary : dict
+        Dictionary containing VCF information.
+
+    Returns
+    -------
+    combined_vcf_dict
+        Updated dictionary with combined insertion and deletion variants.
+ { + "1": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 1, + "SAMPLE_POS": [ + 8, + 9 + ], + "REF": "A", + "ALT": "AAA", + "DP": [ + "9" + ], + "TOTAL_DP": [ + "9", + "10" + ], + "AF": [ + "1" + ], + "QUAL": [ + "33.7777777777778" + ], + "TYPE": "INS" + }, + "10": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10, + "SAMPLE_POS": [ + 19 + ], + "REF": "T", + "ALT": "A", + "DP": [ + "72" + ], + "TOTAL_DP": [ + "10" + ], + "AF": [ + "0.833333333333333" + ], + "QUAL": [ + "34.0166666666667" + ], + "TYPE": "SNP" + }, + "7531": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 7531, + "SAMPLE_POS": [ + 7542, + 7543, + 7544 + ], + "REF": "T", + "ALT": "TTCA", + "DP": [ + "74", + "75", + "75" + ], + "TOTAL_DP": [ + "75", + "75", + "75" + ], + "AF": [ + "0.986666666666667", + "1", + "1" + ], + "QUAL": [ + "34.8648648648649", + "35.04", + "33.8533333333333" + ], + "TYPE": "INS" + }, + "10067": { + "CHROM": "EPI_ISL_18668201", + "REF_POS": 10067, + "SAMPLE_POS": [ + 10079 + ], + "REF": "AACT", + "ALT": "A", + "DP": [ + "10" + ], + "TOTAL_DP": [ + "10", + ], + "AF": [ + "1" + ], + "QUAL": [ + "34.3" + ], + "TYPE": "DEL" + } + } + + """ + + combined_vcf_dict = {} + for _, value in vcf_dictionary.items(): + content_dict = { + "CHROM": value["CHROM"], + "REF_POS": value["REF_POS"], + "SAMPLE_POS": value["SAMPLE_POS"], + "REF": value["REF"], + "ALT": value["ALT"], + "DP": value["DP"], + "TOTAL_DP": value["TOTAL_DP"], + "AF": value["AF"], + "QUAL": value["QUAL"], + "TYPE": value["TYPE"], + } + if value["TYPE"] == "INS": + if value["REF_POS"] in combined_vcf_dict: + if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]: + NEW_ALT = value["ALT"][len(value["REF"]) :] + combined_vcf_dict[value["REF_POS"]]["ALT"] += NEW_ALT + combined_vcf_dict[value["REF_POS"]]["SAMPLE_POS"].append( + value["SAMPLE_POS"][0] + ) + combined_vcf_dict[value["REF_POS"]]["DP"].append(value["DP"][0]) + combined_vcf_dict[value["REF_POS"]]["TOTAL_DP"].append( + value["TOTAL_DP"][0] + ) + combined_vcf_dict[value["REF_POS"]]["AF"].append(value["AF"][0]) + combined_vcf_dict[value["REF_POS"]]["QUAL"].append(value["QUAL"][0]) + else: + print("Same position annotated with multiple variant types") + print("value") + print(value) + print("combined_vcf_dict") + print(combined_vcf_dict[value["REF_POS"]]) + else: + combined_vcf_dict[value["REF_POS"]] = content_dict + elif value["TYPE"] == "DEL": + sample_found = False + minority = False + for af in value["AF"]: + if float(af) < 0.5: + minority = True + prev_sample_pos = "" + if minority and len(value["SAMPLE_POS"]) == 1: + sample_pos = value["SAMPLE_POS"][0] + prev_sample_pos = sample_pos - 1 + for _, data in combined_vcf_dict.items(): + if data["TYPE"] == "DEL": + if value["SAMPLE_POS"] == data["SAMPLE_POS"]: + if value["TYPE"] == data["TYPE"]: + sample_found = data["REF_POS"] + break + else: + print("Same position annotated with multiple variant types") + print("value") + print(value) + print("combined_vcf_dict") + print(combined_vcf_dict[value["REF_POS"]]) + elif minority and prev_sample_pos in data["SAMPLE_POS"]: + sample_found = data["REF_POS"] + break + if sample_found: + if 0 in value["SAMPLE_POS"] and len(value["SAMPLE_POS"]) == 1: + combined_vcf_dict[sample_found]["REF"] += value["ALT"] + combined_vcf_dict[sample_found]["ALT"] = value["ALT"] + else: + NEW_REF = value["REF"][len(value["ALT"]) :] + combined_vcf_dict[sample_found]["REF"] += NEW_REF + if minority: + combined_vcf_dict[sample_found]["SAMPLE_POS"] += value[ + "SAMPLE_POS" + ] + combined_vcf_dict[sample_found]["DP"] += 
value["DP"] + combined_vcf_dict[sample_found]["TOTAL_DP"] += value["TOTAL_DP"] + combined_vcf_dict[sample_found]["AF"] += value["AF"] + else: + combined_vcf_dict[value["REF_POS"]] = content_dict + elif value["TYPE"] == "SNP": + if value["REF_POS"] in combined_vcf_dict: + if value["TYPE"] == combined_vcf_dict[value["REF_POS"]]["TYPE"]: + print("Repeated SNP!!!") + else: + print("Same position annotated with multiple variant types") + print("value") + print(value) + print("combined_vcf_dict") + print(combined_vcf_dict[value["REF_POS"]]) + else: + combined_vcf_dict[value["REF_POS"]] = content_dict + else: + print("Different annotation type found") + return combined_vcf_dict + + +def get_vcf_header(chromosome, sample_name): + """Create the VCF header for VCFv4.2 + + Parameters + ---------- + chromosome : str + Chromosome name. + sample_name : str + Sample name. + + Returns + ------- + header + String containing all the VCF header lines separated by newline. + """ + + header_source = ["##fileformat=VCFv4.2", "##source=custom"] + header_contig = [] + if chromosome: + header_contig += ["##contig="] + header_source += header_contig + + header_info = [ + '##INFO=', + '##INFO=', + ] + header_filter = [ + '##FILTER=', + ] + header_format = [ + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + '##FORMAT=', + ] + columns = ["#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name] + header = header_source + header_info + header_filter + header_format + columns + return header + + +def create_vcf(variants_dict, out_vcf, alignment): + """Create VCF file from variants dictionary. + + Parameters + ---------- + variants_dict : dict + Dictionary containing variants information. + out_vcf : str + Path to the output VCF file. + alignment : str + Path to the alignment file. + + Returns + ------- + None + """ + + chrom = next(iter(variants_dict.values()))["CHROM"] + sample = alignment.replace(".align.fasta", "") + vcf_header = "\n".join(get_vcf_header(chrom, sample)) + FORMAT = "GT:ALT_DP:ALT_QUAL:ALT_FREQ" + ID = "." + QUAL = "." 
+ FILTER = "PASS" + GT = "1" + with open(out_vcf, "w") as file_out: + file_out.write(vcf_header + "\n") + for key, value in variants_dict.items(): + CHROM = value["CHROM"] + POS = value["REF_POS"] + REF = value["REF"] + ALT = value["ALT"] + TOTAL_DP_list = [] + for number in value["TOTAL_DP"]: + if number != "NA": + TOTAL_DP_list.append(int(number)) + if TOTAL_DP_list: + TOTAL_DP = str(round(statistics.mean(TOTAL_DP_list))) + else: + TOTAL_DP = "NA" + + INFO = "TYPE=" + value["TYPE"] + ";" + "DP=" + TOTAL_DP + ALT_QUAL_list = [] + for number in value["QUAL"]: + if number != "NA": + ALT_QUAL_list.append(float(number)) + if ALT_QUAL_list: + ALT_QUAL = str(round(statistics.mean(ALT_QUAL_list), 2)) + else: + ALT_QUAL = "NA" + + ALT_DP_list = [] + for number in value["DP"]: + if number != "NA": + ALT_DP_list.append(int(number)) + if ALT_DP_list: + ALT_DP = str(round(statistics.mean(ALT_DP_list), 0)) + else: + ALT_DP = "NA" + + AF_list = [] + for number in value["AF"]: + if number != "NA": + AF_list.append(float(number)) + if AF_list: + AF = str(round(statistics.mean(AF_list), 4)) + else: + AF = "NA" + + SAMPLE = GT + ":" + ALT_DP + ":" + ALT_QUAL + ":" + AF + oline = ( + CHROM + + "\t" + + str(POS) + + "\t" + + ID + + "\t" + + REF + + "\t" + + ALT + + "\t" + + QUAL + + "\t" + + FILTER + + "\t" + + INFO + + "\t" + + FORMAT + + "\t" + + SAMPLE + ) + file_out.write(oline + "\n") + + +def main(args=None): + # Process args + args = parse_args(args) + + # Initialize vars + alignment = args.alignment + all_alleles = args.irma_alleles + output_vcf = args.out_vcf + freq = args.frequency + dp = args.depth + + # Start analysis + alleles_dict = alleles_to_dict(all_alleles, freq, dp) + alignment_dict = align2dict(alignment) + last_ref_pos = max(position["REF_POS"] for position in alignment_dict.values()) + last_ref_allele = None + for _, value in alignment_dict.items(): + if value["REF_POS"] == last_ref_pos: + last_ref_allele = value["REF"] + break + af_vcf_dict = stats_vcf(alignment_dict, alleles_dict, last_ref_pos, last_ref_allele) + combined_vcf_dict = combine_indels(af_vcf_dict) + create_vcf(combined_vcf_dict, output_vcf, alignment) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog old mode 100755 new mode 100644 index 43af890a..5e9d933b --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog @@ -5,12 +5,16 @@ mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 16 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd-202402/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} &"; done > _01_irma.sh +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/bi/pipelines/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh echo 'bash create_irma_stats.sh' > _02_create_stats.sh echo "ls */*HA*.fasta | cut -d '/' -f2 | cut -d '.' 
-f1 | sort -u | cut -d '_' -f3 | sed '/^\$/d' | sed 's/^/A_/g' > HA_types.txt" > _03_post_processing.sh +echo 'cat HA_types.txt | while read type; do if test -d ${type}; then rm -rf ${type}; fi; done; if test -d B ; then rm -rf B; fi; if test -d C; then rm -rf C; fi' >> _03_post_processing.sh + +echo 'if test -f all_samples_completo.txt; then rm all_samples_completo.txt; fi' >> _03_post_processing.sh + echo "cat HA_types.txt | while read in; do mkdir \${in}; done" >> _03_post_processing.sh echo "if grep -qw 'B__' irma_stats.txt; then mkdir B; fi" >> _03_post_processing.sh @@ -32,4 +36,4 @@ echo 'grep -w 'C__' irma_stats.txt | cut -f1 | while read sample; do cat C_fragm echo 'cat ../samples_id.txt | while read in; do cat ${in}/*.fasta | sed "s/^>/\>${in}_/g" | sed 's/_H1//g' | sed 's/_H3//g' | sed 's/_N1//g' | sed 's/_N2//g' | sed 's@-@/@g' | sed 's/_A_/_/g' | sed 's/_B_/_/g' | sed 's/_C_/_/g' >> all_samples_completo.txt; done' >> _03_post_processing.sh echo 'sed "s/__//g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh -echo 'sed "s/_\t/\t/g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh \ No newline at end of file +echo 'sed "s/_\t/\t/g" irma_stats.txt > clean_irma_stats.txt' >> _03_post_processing.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog deleted file mode 100644 index c9c6a808..00000000 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS02_MET/99-stats/lablog +++ /dev/null @@ -1,24 +0,0 @@ -#module load MultiQC -cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done - -scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") - -cat < multiqc.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 00:30:00 -#SBATCH --partition short_idx -#SBATCH --output $(date '+%Y%m%d')_multiqc.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -multiqc -d . 
--config multiqc_config.yaml - -EOF - -echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh - -echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma index 3f99b6b0..798ee549 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma +++ b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma @@ -1,5 +1,4 @@ #ls ../RAW/* | tr '\/' '\t' | cut -f3 | cut -d "_" -f 1 | sort -u | grep -v "md5" > samples_id.txt mkdir -p 00-reads mv ANALYSIS01_FLU_IRMA $(date '+%Y%m%d')_ANALYSIS01_FLU_IRMA -mv ANALYSIS02_MET $(date '+%Y%m%d')_ANALYSIS02_MET cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - \ No newline at end of file diff --git a/bu_isciii/templates/IRMA/DOC/irma_config.sh b/bu_isciii/templates/IRMA/DOC/irma_config.sh new file mode 100644 index 00000000..145da9b3 --- /dev/null +++ b/bu_isciii/templates/IRMA/DOC/irma_config.sh @@ -0,0 +1,7 @@ +### PROC config ### +SINGLE_LOCAL_PROC=8 # local maximum processes +DOUBLE_LOCAL_PROC=4 # local maximum processes (double this number) +MATCH_PROC=8 # grid maximum processes for the MATCH +SORT_PROC=8 # currently not used +ALIGN_PROC=8 # grid maximum processes for the rough align +ASSEM_PROC=8 # grid maximum processes for assembly diff --git a/bu_isciii/templates/IRMA/DOC/mag.config b/bu_isciii/templates/IRMA/DOC/mag.config deleted file mode 100644 index 732980bf..00000000 --- a/bu_isciii/templates/IRMA/DOC/mag.config +++ /dev/null @@ -1,19 +0,0 @@ -singularity { - enabled = true - autoMounts = true -} - -process { - executor = 'slurm' - queue = 'middle_idx' - queue = 'middle_idx' - errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' } - maxRetries = 1 - maxErrors = '-1' -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} diff --git a/bu_isciii/templates/IRMA/RESULTS/irma_results b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results similarity index 85% rename from bu_isciii/templates/IRMA/RESULTS/irma_results rename to bu_isciii/templates/IRMA/RESULTS/lablog_irma_results index eee33aa6..5cb7c418 100755 --- a/bu_isciii/templates/IRMA/RESULTS/irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results @@ -3,7 +3,6 @@ cd $(date '+%Y%m%d')_entrega01 #Create symbolic links depending on the analysis #Individual files -ln -s ../../ANALYSIS/*_MET/99-stats/multiqc_report.html ./krona_results.html ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . 
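Note on the new DOC/irma_config.sh above: the --external-config flag points IRMA at a plain shell file, and its variable syntax suggests IRMA simply sources it, so the PROC limits can be sanity-checked locally before submitting the srun jobs. A minimal sketch, assuming the file is sourced from the template root:

    # sketch: print the process limits IRMA will pick up from the external config
    source bu_isciii/templates/IRMA/DOC/irma_config.sh
    echo "single/double local procs: ${SINGLE_LOCAL_PROC}/${DOUBLE_LOCAL_PROC}"
    echo "match/align/assembly procs: ${MATCH_PROC}/${ALIGN_PROC}/${ASSEM_PROC}"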
diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog index bd8f8549..38f5edd7 100644 --- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog +++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog @@ -107,7 +107,7 @@ cat <<EOF > assembly.sbatch # module load Nextflow/23.10.0 singularity export NXF_OPTS="-Xms500M -Xmx8G" -nextflow run /data/bi/pipelines/nf-core-bacass/main.nf \\ +nextflow run /data/bi/pipelines/nf-core-bacass/nf-core-bacass-2.3.1/main.nf \\ -c ../../DOC/hpc_slurm_assembly.config \\ -profile singularity \\ --input samplesheet.csv \\ @@ -119,13 +119,11 @@ nextflow run /data/bi/pipelines/nf-core-bacass/main.nf \\ --fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\ --skip_kraken2 true \\ --skip_kmerfinder false \\ - --kmerfinderdb /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria \\ - --ncbi_assembly_metadata /data/bi/references/bacteria/20191212/assembly_summary_bacteria.txt \\ + --kmerfinderdb /data/bi/references/kmerfinder/latest/bacteria \\ + --ncbi_assembly_metadata /data/bi/references/bacteria/20240626/assembly_summary_refseq.txt \\ ${PROKKA_ARGS} \\ -resume EOF echo "sbatch assembly.sbatch" > _01_nf_assembly.sh - - diff --git a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config index 04dddf4d..8325bcd5 100644 --- a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config +++ b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config @@ -38,6 +38,8 @@ params { params { publish_dir_mode = 'copy' } process { withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_RAW' { + maxRetries = 2 + memory = {12.GB * task.attempt} publishDir = [ [ path: { "${params.outdir}/01-processing/fastqc/raw" }, @@ -71,6 +73,8 @@ process { ] } withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_TRIM' { + maxRetries = 2 + memory = {12.GB * task.attempt} publishDir = [ [ path: { "${params.outdir}/01-processing/fastqc/trim" }, @@ -127,6 +131,9 @@ process { ] } withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' { + maxRetries = 2 + memory = {12.GB * task.attempt} + errorStrategy = { task.exitStatus in [1] ? 'retry' : 'finish'} publishDir = [ path: { "${params.outdir}/02-taxonomy_contamination/kmerfinder/${meta.id}" }, mode: params.publish_dir_mode @@ -145,7 +152,9 @@ process { ] } withName: 'UNICYCLER|CANU|MINIASM|DRAGONFLYE' { - publishDir = [ + maxRetries = 2 + memory = {64.GB * task.attempt} + publishDir = [ path: { "${params.outdir}/03-assembly/${params.assembler}" }, mode: params.publish_dir_mode, saveAs: { filename -> @@ -208,7 +217,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: 'MULTIQC' { + withName: 'MULTIQC_CUSTOM' { publishDir = [ [ path: { "${params.outdir}/99-stats/multiqc" }, diff --git a/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results b/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results index 508d1d55..7f2f96e0 100644 --- a/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results +++ b/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results @@ -1,7 +1,6 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" -mkdir $DELIVERY_FOLDER -mkdir $DELIVERY_FOLDER/assembly +mkdir -p $DELIVERY_FOLDER/assembly # Assembly service cd $DELIVERY_FOLDER/assembly diff --git a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog index c9bd56e6..0d5496a2 100644 --- a/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog +++ b/bu_isciii/templates/blast_nt/ANALYSIS/ANALYSIS02_BLAST/lablog @@ -1,4 +1,4 @@ -# module load BLAST+/2.11.0-gompi-2020b +# module load singularity scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir logs @@ -27,7 +27,7 @@ cat ../samples_id.txt | while read in; do done # NOTE3: change the -query flag to meet your requirements -cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% singularity exec -B ${scratch_dir}/../../ -B /data/bi/references/virus/BLAST/ /data/bi/pipelines/singularity-images/blast:2.11.0--pl5262h3289130_1 blastn -num_threads 10 -db ${BLAST_DATABASE} -query ${scratch_dir}/%%/%%.scaffolds.fa -out ${scratch_dir}/%%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh # Filtering criteria: # %refCovered > 0.7 @@ -71,5 +71,5 @@ echo "rm header" >> _03_gather_results_add_header.sh # 20: %refCovered: length/slen # conda activate 2excel -cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh -echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh \ No newline at end of file +cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /data/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename 
%%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh +echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /data/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog index 0c86d16d..d1d3eb73 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/01-preprocessing/lablog @@ -1,7 +1,27 @@ -# module load fastp +# module load singularity + # if assembly pipeline was performed first and the trimmed sequences were saved, this should work: -# cat ../samples_id.txt | xargs -I @@ mkdir @@; cd @@; ln -s ../../../*/01-processing/fastp/@@_1.fastp.fastq.gz ./@@_R1_filtered.fastq.gz; ln -s ../../../*/01-processing/fastp/@@_2.fastp.fastq.gz ./@@_R2_filtered.fastq.gz ; cd - -# else: -mkdir logs -scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') -cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 fastp --in1 ../00-reads/@@_R1.fastq.gz --in2 ../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --detect_adapter_for_pe --json @@/@@_fastp.json --html @@/@@_fastp.html --out1 @@/@@_R1_filtered.fastq.gz --out2 @@/@@_R2_filtered.fastq.gz &" > _01_fastp.sh +read -p $'\e[1;37mDid you save the trimmed reads from previous assembly pipeline? [y/N]: \e[1;38;5;220m' -n 1 answer; tput sgr0; echo + if [ "$answer" == "y" ]; then + echo "Creating links to trimmed reads..." + + while read in; do + mkdir ${in} + cd ${in} + ln -s ../../../*/01-processing/fastp/${in}_1.fastp.fastq.gz ${in}_R1_filtered.fastq.gz + ln -s ../../../*/01-processing/fastp/${in}_2.fastp.fastq.gz ${in}_R2_filtered.fastq.gz + cd - + done < ../samples_id.txt + + echo -e "\e[32mLinks for $(cat ../samples_id.txt | wc -l) samples successfully created.\e[0m" + + else + echo "Preparing _01_fastp.sh file for trimming..." 
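The prompt above only links pre-trimmed reads when a literal "y" is entered; any other key falls through to trimming, and the else branch continues below with the fastp setup. A hedged sketch of a non-interactive equivalent (the REUSE_TRIMMED variable is hypothetical, not part of the template):

    # sketch: let scripted runs skip the prompt; the default matches the template ([y/N] -> trim)
    answer="${REUSE_TRIMMED:-n}"
    if [ "$answer" == "y" ]; then
        echo "Creating links to trimmed reads..."
    else
        echo "Preparing _01_fastp.sh file for trimming..."
    fi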
+ + mkdir logs + scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') + cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 15 --qualified_quality_phred 15 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz --unpaired1 ${scratch_dir}/@@/@@_R1_unpaired.fastq.gz --unpaired2 ${scratch_dir}/@@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh + + echo -e "\e[32mFile _01_fastp.sh ready.\e[0m" + + fi \ No newline at end of file diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/lablog index 192960b3..4debfbd2 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/lablog @@ -1,3 +1,3 @@ -# conda activate ariba +# module load singularity cp /data/bi/references/ariba/databases.txt . diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog index b942fba8..1bfc20c0 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/run/lablog @@ -1,4 +1,4 @@ -# conda activate ariba +# module load singularity mkdir logs scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') @@ -9,8 +9,8 @@ join -j 2 ../../samples_id.txt ../databases.txt | sed 's/^ //g' > sample_databas # col 1 (arr[0]): sample # col 2 (arr[1]): database -cat sample_database.txt | grep -v 'pubmlst' | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 ariba run /data/bi/references/ariba/20211216/${arr[1]}/out.${arr[1]}.prepareref ../../01-preprocessing/${arr[0]}/${arr[0]}_R1_filtered.fastq.gz ../../01-preprocessing/${arr[0]}/${arr[0]}_R2_filtered.fastq.gz ${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh +cat sample_database.txt | grep -v 'pubmlst' | while read in; do arr=($in); echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${arr[0]}_${arr[1]}.%j.log --job-name ARIBA_${arr[0]}_${arr[1]} --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ -B /data/bi/references/ariba/ /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba run /data/bi/references/ariba/latest/${arr[1]}/out.${arr[1]}.prepareref ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${arr[0]}/${arr[0]}_R2_filtered.fastq.gz 
${scratch_dir}/${arr[0]}/out_${arr[1]}_${arr[0]}_run &"; done > _01_ariba.sh -cat ../samples_id.txt | while read in; echo "mkdir -p ${arr[0]}; srun --chdir $scratch_dir --output logs/ARIBA_${in}_pubmlst.%j.log --job-name ARIBA_${in}_pubmlst --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 ariba run ${downloaded_ref} ../../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in}/out_pubmlst_${in}_run &"; done > _01_ariba.sh +cat ../../../samples_id.txt | while read in; do echo "mkdir -p $in; srun --chdir $scratch_dir --output logs/ARIBA_${in}_pubmlst.%j.log --job-name ARIBA_${in}_pubmlst --cpus-per-task 5 --mem 5G --partition short_idx --time 02:00:00 singularity exec -B ${scratch_dir}/../../../../ /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba run ${scratch_dir}/${downloaded_ref} ${scratch_dir}/../../01-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../../01-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${scratch_dir}/${in}/out_pubmlst_${in}_run &"; done >> _01_ariba.sh -cat sample_database.txt | while read in; do arr=($in); echo "mv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/report.tsv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/${arr[0]}_${arr[1]}_report.tsv"; done > _02_fix_tsvreport.sh \ No newline at end of file +cat sample_database.txt | while read in; do arr=($in); echo "mv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/report.tsv ${arr[0]}/out_${arr[1]}_${arr[0]}_run/${arr[0]}_${arr[1]}_report.tsv"; done > _02_fix_tsvreport.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog index be959a97..af5821ad 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/02-ariba/summary/lablog @@ -1,4 +1,4 @@ -# conda activate ariba +# module load singularity mkdir -p logs scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') @@ -8,4 +8,4 @@ scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g') # 1 - Use the ls in parenthesis to find the reports for a certain db, and xargs to make it into a single line # 2 - Integrate this into the ariba summary command -cat ../databases.txt | while read in; do echo "srun --chdir $scratch_dir --output logs/ARIBA_SUMMARY_${in}.log --job-name ARIBA_${in} --cpus-per-task 5 --mem 5G --partition short_idx --time 00:30:00 ariba summary --cluster_cols ref_seq,match out_summary_${in} $(ls ../run/*/out*_${in}*/*${in}*_report.tsv | xargs) &"; done > _01_ariba_summary_prueba.sh +cat ../databases.txt | while read in; do echo "srun --chdir $scratch_dir --output logs/ARIBA_SUMMARY_${in}.log --job-name ARIBA_${in} --cpus-per-task 5 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../../ /data/bi/pipelines/singularity-images/ariba:2.14.6--py36h4aaaa08_3 ariba summary --cluster_cols ref_seq,match out_summary_${in} $(ls ${scratch_dir}/../run/*/out*_${in}*/*${in}*_report.tsv | xargs) &"; done > _01_ariba_summary_prueba.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog new file mode 100644 index 00000000..a8694b06 --- /dev/null +++ 
b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/03-amrfinderplus/lablog @@ -0,0 +1,25 @@ +# conda activate amrfinder + +# Only works with assemblies, prior to this you must have executed assembly pipeline + +scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g') + +mkdir logs + +possible_organisms=("Acinetobacter_baumannii" "Burkholderia_cepacia" "Burkholderia_pseudomallei" "Campylobacter" "Citrobacter_freundii" "Clostridioides_difficile" "Enterobacter_asburiae" "Enterobacter_cloacae" "Enterococcus_faecalis" "Enterococcus_faecium" "Escherichia" "Klebsiella_oxytoca" "Klebsiella_pneumoniae" "Neisseria_gonorrhoeae" "Neisseria_meningitidis" "Pseudomonas_aeruginosa" "Salmonella" "Serratia_marcescens" "Staphylococcus_aureus" "Staphylococcus_pseudintermedius" "Streptococcus_agalactiae" "Streptococcus_pneumoniae" "Streptococcus_pyogenes" "Vibrio_cholerae" "Vibrio_parahaemolyticus" "Vibrio_vulnificus" "OTHER") +echo +echo -e "\n\033[1;37mPlease select your bacteria from the following list:\033[0m" +PS3=$(echo -e "\n\033[1;37mSelect number:\033[0m ") +select bacteria in "${possible_organisms[@]}"; do + if [[ -n "$bacteria" ]]; then + echo -e "\033[0;32mOrganism selected: ${bacteria}\033[0m" + if [ $bacteria = "OTHER" ]; then + cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx --output logs/AMRFINDER_${in}.%j.log --job-name AMRFINDER_${in} amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done + else + cat ../samples_id.txt | while read in; do echo "srun --chdir $scratch_dir --partition middle_idx --output logs/AMRFINDER_${in}.%j.log --job-name AMRFINDER_${in} amrfinder -n $(ls ../../*ANALYSIS*ASSEMBLY/03-assembly/unicycler/${in}.fasta.gz) --organism ${bacteria} --name ${in} --plus -o ${in}_out.tsv &" >> _01_run_amrfinder.sh; done + fi + break + else + echo -e "\n\033[0;31mInvalid input.\033[0m" + fi +done diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog new file mode 100644 index 00000000..d5897933 --- /dev/null +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/04-emmtyper/lablog @@ -0,0 +1,83 @@ +#!/bin/sh + +# Create folders +mkdir -p fasta_inputs +mkdir -p logs + +# Find all .gz files and write them to a file list +# TODO: add if to check >1 fasta files are available in assembly results +find ../../*ANALYSIS*ASSEMBLY/*-assembly/unicycler/*.fasta.gz > fasta_inputs/assembly_file_list.txt +ASSEMBLY_LIST=fasta_inputs/assembly_file_list.txt + +# Get the number of files +num_files=$(wc -l < $ASSEMBLY_LIST) + +scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") + +# STEP 1: Set up jobarray to unzip fasta files +cat <<EOF > _00_unzip_jobarray.sbatch +#!/bin/bash +#SBATCH --job-name=unzip_fasta +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=2 +#SBATCH --mem=8G +#SBATCH --time=2:00:00 +#SBATCH --partition short_idx +#SBATCH --array=1-$num_files +#SBATCH --chdir $scratch_dir +#SBATCH --output logs/slurm-%A_%a.out +#SBATCH --error logs/slurm-%A_%a.err + +# Get the file to process +file=\$(sed -n "\${SLURM_ARRAY_TASK_ID}p" $ASSEMBLY_LIST) + +# Unzip the file to the destination directory +gzip -dkc \$file > fasta_inputs/\$(basename "\$file" .gz) + +EOF + +# STEP 2: Setup exe file to perform unzip and emmtyper. 
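Before the STEP 2 heredoc that follows, a note on the STEP 1 job array above: each array task selects its own line of the assembly file list through SLURM_ARRAY_TASK_ID. A standalone sketch of that indexing (the task id is normally exported by SLURM, here set by hand):

    # sketch: array task N processes line N of the list; sed -n "Np" prints only line N
    SLURM_ARRAY_TASK_ID=3
    file=$(sed -n "${SLURM_ARRAY_TASK_ID}p" fasta_inputs/assembly_file_list.txt)
    gzip -dkc "$file" > "fasta_inputs/$(basename "$file" .gz)"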
+cat <<EOF > _01_emmtyper.sbatch +#!/bin/bash +#SBATCH --job-name emmtyper +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 4 +#SBATCH --mem 24G +#SBATCH --time 4:00:00 +#SBATCH --partition short_idx +#SBATCH --chdir $scratch_dir +#SBATCH --output ./$(date '+%Y%m%d')_emmtyper.log + +# module load singularity + +# create results folder +mkdir -p 01-typing +mkdir -p 01-typing/tmps +blastdb_path=/data/bi/references/cdc_emm_blastdb + +# Run emmtyper +singularity exec \\ + --bind ${scratch_dir} \\ + --bind ${scratch_dir}/../../ \\ + --bind \$blastdb_path \\ + /data/bi/pipelines/singularity-images/singularity-emmtyper.0.2.0--py_0 emmtyper \\ + -w blast \\ + --keep \\ + --blast_db "\${blastdb_path}/cdc_emm_database29042024" \\ + --percent-identity 95 \\ + --culling-limit 5 \\ + --output 01-typing/results_emmtyper.out \\ + --output-format verbose \\ + ./fasta_inputs/*.fasta + +mv *.tmp 01-typing/tmps + +EOF + +# Bash script that performs all steps above +echo "#!/bin/bash" > _ALLSTEPS_emmtyper.sh +echo "# # module load singularity" >> _ALLSTEPS_emmtyper.sh +echo "unzip_job_id=\$(sbatch _00_unzip_jobarray.sbatch | awk '{print \$4}')" >> _ALLSTEPS_emmtyper.sh +echo "sbatch --dependency=afterok:\${unzip_job_id} _01_emmtyper.sbatch" >> _ALLSTEPS_emmtyper.sh + +chmod +x _ALLSTEPS_emmtyper.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/05-mlva/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/05-mlva/lablog new file mode 100644 index 00000000..fb6159b6 --- /dev/null +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/05-mlva/lablog @@ -0,0 +1,23 @@ +# micromamba activate buisciii-tools_2.1.0 + +mkdir logs +mkdir assemblies +mkdir MLVA_output +scratch_dir=$(pwd | sed 's|/data/bi/scratch_tmp|/scratch|g') +cp ../../*_ASSEMBLY/03-assembly/unicycler/*.fasta* assemblies/ +gzip -d assemblies/*.fasta.gz +available_primers=$(ls /data/bi/references/MLVA/*primer* | rev | cut -d "/" -f1 | rev | cut -d "_" -f1) + +echo "Available primers:" +select primer in $available_primers; do + if [ -n "$primer" ]; then + echo "You selected: $primer" + break + else + echo "Invalid selection. Please try again." 
+ fi +done + +primer_file=$(ls /data/bi/references/MLVA/${primer}*) + +echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/MLVA.log --job-name MLVA python /data/bi/pipelines/mlva/MLVA_finder.py -c -i assemblies -o MLVA_output -p ${primer_file} --full-locus-name --predicted-PCR-size-table --flanking-seq 20 &" > _01_mlva.sh diff --git a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog index 9ab99ce9..6e11ee3d 100644 --- a/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog +++ b/bu_isciii/templates/characterization/ANALYSIS/ANALYSIS01_CHARACTERIZATION/99-stats/lablog @@ -1,6 +1,6 @@ #conda activate python3 -python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_card.csv --database card --output_bn ariba_card.bn --output_csv ariba_card.csv -python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_plasmidfinder.csv --database plasmidfinder --output_bn ariba_plasmidfinder.bn --output_csv ariba_plasmidfinder.csv -python3 /data/bi/pipelines/bacterial_qc/parse_ariba.py --path ../02-ariba/summary/out_summary_vfdb_full.csv --database vfdb_full --output_bn ariba_vfdb_full.bn --output_csv ariba_vfdb_full.csv +python3 /data/bi/pipelines/bacterial-qc/parse_ariba.py --path ../02-ariba/summary/out_summary_card.csv --database card --output_bn ariba_card.bn --output_csv ariba_card.csv +python3 /data/bi/pipelines/bacterial-qc/parse_ariba.py --path ../02-ariba/summary/out_summary_plasmidfinder.csv --database plasmidfinder --output_bn ariba_plasmidfinder.bn --output_csv ariba_plasmidfinder.csv +python3 /data/bi/pipelines/bacterial-qc/parse_ariba.py --path ../02-ariba/summary/out_summary_vfdb_full.csv --database vfdb_full --output_bn ariba_vfdb_full.bn --output_csv ariba_vfdb_full.csv -paste <(echo "sample_id") <(cat ../02-ariba/run/*/out_pubmlst_*_run/mlst_report.tsv | head -n1) > ariba_mlst_full.tsv; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(tail -n1 ../02-ariba/run/${in}/out_pubmlst_${in}_run/mlst_report.tsv); done >> ariba_mlst_full.tsv \ No newline at end of file +paste <(echo "sample_id") <(cat ../02-ariba/run/*/out_pubmlst_*_run/mlst_report.tsv | head -n1) > ariba_mlst_full.tsv; cat ../samples_id.txt | while read in; do paste <(echo ${in}) <(tail -n1 ../02-ariba/run/${in}/out_pubmlst_${in}_run/mlst_report.tsv); done >> ariba_mlst_full.tsv diff --git a/bu_isciii/templates/characterization/REFERENCES/lablog b/bu_isciii/templates/characterization/REFERENCES/lablog index ad9f4010..673c7904 100644 --- a/bu_isciii/templates/characterization/REFERENCES/lablog +++ b/bu_isciii/templates/characterization/REFERENCES/lablog @@ -1,4 +1,4 @@ -# conda activate ariba +# module load singularity mkdir logs @@ -31,14 +31,14 @@ print_color "This will take some seconds to display, please wait" 'blue' # Select genome from PMLST IFS=$'\n' -bacterial_options=( $(ariba pubmlstspecies | sed 's/^/"/g' | sed 's/$/"/g') ) +bacterial_options=( $(singularity exec /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba pubmlstspecies | sed 's/^/"/g' | sed 's/$/"/g') ) print_color "Indicate the preferred bacterial genome:" 'blue' select BACTERIA in "${bacterial_options[@]}"; do if [ -n "$BACTERIA" ]; then print_color "Selected bacteria: $BACTERIA" 'green' - echo "srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name PUBMLSTGET 
--output logs/PUBMLSTGET.%j.log --partition short_idx --cpus-per-task 5 ariba pubmlstget $BACTERIA $(date '+%Y%m%d') &" > _01_download_pubmlst.sh + echo "srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name PUBMLSTGET --output logs/PUBMLSTGET.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir} /data/bi/pipelines/singularity-images/ariba:2.14.6--py39heaaa4ec_6 ariba pubmlstget $BACTERIA $(date '+%Y%m%d') &" > _01_download_pubmlst.sh break else print_color "Invalid input. Please select a valid option." 'red' fi -done \ No newline at end of file +done diff --git a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results index 8879a843..9d617543 100644 --- a/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results +++ b/bu_isciii/templates/characterization/RESULTS/lablog_characterization_results @@ -1,12 +1,19 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" -mkdir -p $DELIVERY_FOLDER -mkdir "${DELIVERY_FOLDER}/characterization" +mkdir -p "${DELIVERY_FOLDER}/characterization/amrfinderplus" +mkdir -p "${DELIVERY_FOLDER}/characterization/emmtyper" # ARIBA characterization service cd $DELIVERY_FOLDER/characterization ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.tsv . ln -s ../../../ANALYSIS/*CHARACTERIZATION/99-stats/ariba_*.csv . -find . -xtype l -delete -cd - +cd amrfinderplus +ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*amrfinderplus/*tsv . +find .. -xtype l -delete + +cd .. +cd emmtyper +ln -s ../../../../ANALYSIS/*CHARACTERIZATION/*emmtyper/01-typing/results_emmtyper.out . + +cd ../../ diff --git a/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results index 650d3d22..a8787519 100644 --- a/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results +++ b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results @@ -9,5 +9,6 @@ cd $DELIVERY_FOLDER/mlst # Links to reports ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/allelecall_report.html . ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/distance_matrix_symmetric.tsv . +ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/results_alleles.tsv . 
ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.nwk ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.svg diff --git a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog index c072b67d..32414d09 100644 --- a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog +++ b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog @@ -1,23 +1,23 @@ # Lablog to apply variant filters to combined GVCFs generated by SAREK (HaplotypeCaller) -# module load GATK/4.2.0.0-GCCcore-10.2.0-Java-11 +# module load singularity scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir -p logs -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTSNPS.log --job-name SELECTSNPS gatk SelectVariants \ - -V ../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTSNPS.log --job-name SELECTSNPS singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk SelectVariants \ + -V ${scratch_dir}/../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ -select-type SNP \ - -O snps.vcf.gz &" > _01_separate_snps_indels.sh + -O ${scratch_dir}/snps.vcf.gz &" > _01_separate_snps_indels.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTINDELS.log --job-name SELECTINDELS gatk SelectVariants \ - -V ../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTINDELS.log --job-name SELECTINDELS singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk SelectVariants \ + -V ${scratch_dir}/../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ -select-type INDEL \ - -O indels.vcf.gz &" >> _01_separate_snps_indels.sh + -O ${scratch_dir}/indels.vcf.gz &" >> _01_separate_snps_indels.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILSNP.log --job-name FILSNP gatk VariantFiltration \ - -V snps.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILSNP.log --job-name FILSNP singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk VariantFiltration \ + -V ${scratch_dir}/snps.vcf.gz \ -filter 'QD < 2.0' --filter-name 'QD2' \ -filter 'QUAL < 30.0' --filter-name 'QUAL30' \ -filter 'SOR > 3.0' --filter-name 'SOR3' \ @@ -25,20 +25,19 @@ echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output -filter 'MQ < 40.0' --filter-name 'MQ40' \ -filter 'MQRankSum < -12.5' --filter-name 'MQRankSum-12.5' \ -filter 'ReadPosRankSum < -8.0' --filter-name 'ReadPosRankSum-8' \ - -O snps_filtered.vcf.gz &" > _02_filter.sh + -O ${scratch_dir}/snps_filtered.vcf.gz &" > _02_filter.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILINDEL.log --job-name FILINDEL gatk VariantFiltration \ - -V indels.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILINDEL.log --job-name FILINDEL singularity exec -B ${scratch_dir}/../../../ 
/data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk VariantFiltration \ + -V ${scratch_dir}/indels.vcf.gz \ -filter 'QD < 2.0' --filter-name 'QD2' \ -filter 'QUAL < 30.0' --filter-name 'QUAL30' \ -filter 'FS > 200.0' --filter-name 'FS200' \ -filter 'ReadPosRankSum < -20.0' --filter-name 'ReadPosRankSum-20' \ - -O indels_filtered.vcf.gz &" >> _02_filter.sh + -O ${scratch_dir}/indels_filtered.vcf.gz &" >> _02_filter.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/MERGEVCF.log --job-name MERGEVCF gatk MergeVcfs \ - -I ./snps_filtered.vcf.gz \ - -I ./indels_filtered.vcf.gz \ - -O variants_fil.vcf.gz &" > _03_merge_vcfs.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/MERGEVCF.log --job-name MERGEVCF singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk MergeVcfs \ + -I ${scratch_dir}/snps_filtered.vcf.gz \ + -I ${scratch_dir}/indels_filtered.vcf.gz \ + -O ${scratch_dir}/variants_fil.vcf.gz &" > _03_merge_vcfs.sh echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/GZIP.log --job-name GZIP gzip -d variants_fil.vcf.gz &" > _04_gzip.sh - diff --git a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog old mode 100755 new mode 100644 index 130ca294..61892d9b --- a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -1,4 +1,5 @@ -# module load BCFtools/1.12-GCC-10.2.0 VEP/103.1-GCC-10.2.0 R/4.2.1 Java/17.0.2.lua +# module load singularity +# module load Java/17.0.2.lua R/4.2.1 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") @@ -14,14 +15,14 @@ echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> _01_bcftools_query.sh # 2-3. Create variant table. -echo "bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ./vep/variants_fil_mod.vcf > ./vep/variants.table" >> _01_bcftools_query.sh -echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> _01_bcftools_query.sh +echo "singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> _01_bcftools_query.sh +echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table" >> _01_bcftools_query.sh echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./_01_bcftools_query.sh &" > _01_run_bcftools_query.sh ## 4-5. Lablog for annotating whole genome samples using Variant Effect Predictor (VEP). 
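On the bcftools query format used a few lines above: the [%GT\t%DP\t%AD\t%GQ\t] block is expanded once per sample, so a multi-sample VCF yields one GT/DP/AD/GQ column group per sample in every row, which is what the later merge steps rely on. A small sketch (trio.vcf is a placeholder input):

    # sketch: with three samples in trio.vcf, each site prints three GT columns
    bcftools query -H -f '%CHROM\t%POS[\t%GT]\n' trio.vcf | head -n 3

The updated VEP command for the annotation step announced above follows.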
-echo "srun --partition short_idx --mem 100G --time 4:00:00 --chdir ${scratch_dir} --output logs/VEP.log --job-name VEP vep --fasta /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37.fasta -i ./vep/variants_fil_mod.vcf -o ./vep/vep_annot.vcf --cache --offline --dir_cache /data/bi/references/eukaria/homo_sapiens/cache_vep/ --everything --dir_plugins /data/bi/references/eukaria/homo_sapiens/cache_vep/Plugins/ --assembly GRCh37 --tab --plugin dbNSFP,/data/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/dbNSFP/GRCh37/dbNSFP4.1a_grch37.gz,clinvar_id,clinvar_trait,clinvar_OMIM_id,clinvar_Orphanet_id,HGVSc_snpEff,HGVSp_snpEff,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST4_score,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,CADD_raw,CADD_phred,CADD_raw_hg19,CADD_phred_hg19,GERP++_NR,GERP++_RS,phyloP100way_vertebrate,phastCons100way_vertebrate &" > _02_vep_annotation.sh +echo "srun --partition short_idx --mem 100G --time 4:00:00 --chdir ${scratch_dir} --output logs/VEP.log --job-name VEP singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/ensembl-vep:103.1--pl5262h4a94de4_2 vep --fasta /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37.fasta -i ${scratch_dir}/vep/variants_fil_mod.vcf -o ${scratch_dir}/vep/vep_annot.vcf --cache --offline --dir_cache /data/bi/references/eukaria/homo_sapiens/cache_vep/ --everything --dir_plugins /data/bi/references/eukaria/homo_sapiens/cache_vep/Plugins/ --assembly GRCh37 --tab --plugin dbNSFP,/data/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/dbNSFP/GRCh37/dbNSFP4.1a_grch37.gz,clinvar_id,clinvar_trait,clinvar_OMIM_id,clinvar_Orphanet_id,HGVSc_snpEff,HGVSp_snpEff,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST4_score,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,CADD_raw,CADD_phred,CADD_raw_hg19,CADD_phred_hg19,GERP++_NR,GERP++_RS,phyloP100way_vertebrate,phastCons100way_vertebrate &" > _02_vep_annotation.sh echo "grep -v '^##' ./vep/vep_annot.vcf > ./vep/vep_annot_head.txt" > _03_merge_data1.sh echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_merge_data1.sh diff --git a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog index 0607dcd4..5dd4320a 100644 --- a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog +++ b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog @@ -1,9 +1,9 @@ -# module load picard +# module load singularity scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir logs -cat ../samples_id.txt | xargs -I @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir ${scratch_dir} --output logs/PICARDHSMETRICS.@@.%j.log --job-name PICARDHSMETRICS java -Xmx10g -jar \$EBROOTPICARD/picard.jar CollectHsMetrics -R /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37_decoy.fasta -BI ../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -TI 
../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -I ../01-sarek/preprocessing/recalibrated/@@/@@.recal.cram -O @@_hsMetrics.out -VALIDATION_STRINGENCY LENIENT &" > _01_picardHsMetrics.sh +cat ../samples_id.txt | xargs -I @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir ${scratch_dir} --output logs/PICARDHSMETRICS.@@.%j.log --job-name PICARDHSMETRICS singularity exec -B ${scratch_dir}/../../../ -B /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard CollectHsMetrics -R /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37_decoy.fasta -BI ${scratch_dir}/../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -TI ${scratch_dir}/../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -I ${scratch_dir}/../01-sarek/preprocessing/recalibrated/@@/@@.recal.cram -O ${scratch_dir}/@@_hsMetrics.out -VALIDATION_STRINGENCY LENIENT &" > _01_picardHsMetrics.sh echo "echo "\"SAMPLE\",\"MEAN TARGET COVERAGE\", \"PCT USABLE BASES ON TARGET\",\"FOLD ENRICHMENT\",\"PCT TARGET BASES 10X\",\"PCT TARGET BASES 20X\",\"PCT TARGET BASES 30X\",\"PCT TARGET BASES 40X\",\"PCT TARGET BASES 50X\"" > hsMetrics_all.out" > _02_hsMetrics_all.sh cat ../samples_id.txt | xargs -I % echo "grep '^Illumin' %_hsMetrics.out | awk 'BEGIN{FS=\"\\t\";OFS=\",\"}{print \"%\",\$34,\$12,\$13,\$48,\$49,\$50,\$51,\$52}' >> hsMetrics_all.out" >> _02_hsMetrics_all.sh diff --git a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/lablog b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/lablog index 1780b477..22502631 100755 --- a/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/lablog +++ b/bu_isciii/templates/exomeeb/ANALYSIS/ANALYSIS01_EXOME/lablog @@ -28,7 +28,7 @@ cat <<EOF > sarek.sbatch export NXF_OPTS="-Xms500M -Xmx4G" -nextflow run /data/bi/pipelines/nf-core-sarek-3.1.1/workflow/main.nf \\ +nextflow run /data/bi/pipelines/nf-core-sarek/nf-core-sarek-3.4.2/workflow/main.nf \\ -c ../../DOC/hpc_slurm_sarek.config \\ --input 'samplesheet.csv' \\ --outdir 01-sarek \\ @@ -46,5 +46,5 @@ EOF echo "sbatch sarek.sbatch" > _01_run_sarek.sh -echo "srun --partition short_idx --chdir $scratch_dir rm -rf work &" > _02_clean.sh -echo "srun --partition short_idx --chdir $scratch_dir rm -rf 01-sarek/gatk4/ &" >> _02_clean.sh \ No newline at end of file +echo "srun --partition short_obx --chdir $scratch_dir rm -rf work &" > _02_clean.sh +echo "srun --partition short_obx --chdir $scratch_dir rm -rf 01-sarek/gatk4/ &" >> _02_clean.sh diff --git a/bu_isciii/templates/exomeeb/RESULTS/lablog_exomeeb_results b/bu_isciii/templates/exomeeb/RESULTS/lablog_exomeeb_results new file mode 100755 index 00000000..4a8326e7 --- /dev/null +++ b/bu_isciii/templates/exomeeb/RESULTS/lablog_exomeeb_results @@ -0,0 +1,12 @@ +mkdir $(date '+%Y%m%d')_entrega01 +cd $(date '+%Y%m%d')_entrega01 + +#Create symbolic links depending on the analysis +#Individual files + +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/99-stats/hsMetrics_all.out mapping_metrics.csv +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/01-sarek/multiqc/multiqc_report.html . +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/variants_annot_highModerate.tab . + +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/exomiser/exomiser.html . 
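On the hsMetrics parsing earlier in this hunk: the awk picks ($34, $12, $13, $48 to $52) are positional fields of picard CollectHsMetrics output and only hold for the picard version pinned in the container. A quick check, assuming a finished sample_hsMetrics.out file, is to number the metric columns and compare the names against the picks:

    # sketch: list CollectHsMetrics columns with their field numbers
    grep -A1 '^## METRICS CLASS' sample_hsMetrics.out | tail -n 1 | tr '\t' '\n' | nl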
+ diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog index 615a4690..e1afcc7f 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/02-postprocessing/lablog @@ -1,4 +1,4 @@ -# module load GATK/4.2.0.0-GCCcore-10.2.0-Java-11 +# module load singularity # Lablog to apply variant filters to combined GVCFs generated by SAREK (HaplotypeCaller) @@ -7,18 +7,18 @@ scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir -p logs -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTSNPS.log --job-name SELECTSNPS gatk SelectVariants \ - -V ../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTSNPS.log --job-name SELECTSNPS singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk SelectVariants \ + -V ${scratch_dir}/../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ -select-type SNP \ - -O snps.vcf.gz &" > _01_separate_snps_indels.sh + -O ${scratch_dir}/snps.vcf.gz &" > _01_separate_snps_indels.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTINDELS.log --job-name SELECTINDELS gatk SelectVariants \ - -V ../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/SELECTINDELS.log --job-name SELECTINDELS singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk SelectVariants \ + -V ${scratch_dir}/../01-sarek/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz \ -select-type INDEL \ - -O indels.vcf.gz &" >> _01_separate_snps_indels.sh + -O ${scratch_dir}/indels.vcf.gz &" >> _01_separate_snps_indels.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILSNP.log --job-name FILSNP gatk VariantFiltration \ - -V snps.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILSNP.log --job-name FILSNP singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk VariantFiltration \ + -V ${scratch_dir}/snps.vcf.gz \ -filter 'QD < 2.0' --filter-name 'QD2' \ -filter 'QUAL < 30.0' --filter-name 'QUAL30' \ -filter 'SOR > 3.0' --filter-name 'SOR3' \ @@ -26,20 +26,19 @@ echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output -filter 'MQ < 40.0' --filter-name 'MQ40' \ -filter 'MQRankSum < -12.5' --filter-name 'MQRankSum-12.5' \ -filter 'ReadPosRankSum < -8.0' --filter-name 'ReadPosRankSum-8' \ - -O snps_filtered.vcf.gz &" > _02_filter.sh + -O ${scratch_dir}/snps_filtered.vcf.gz &" > _02_filter.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILINDEL.log --job-name FILINDEL gatk VariantFiltration \ - -V indels.vcf.gz \ +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/FILINDEL.log --job-name FILINDEL singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk VariantFiltration \ 
+ -V ${scratch_dir}/indels.vcf.gz \ -filter 'QD < 2.0' --filter-name 'QD2' \ -filter 'QUAL < 30.0' --filter-name 'QUAL30' \ -filter 'FS > 200.0' --filter-name 'FS200' \ -filter 'ReadPosRankSum < -20.0' --filter-name 'ReadPosRankSum-20' \ - -O indels_filtered.vcf.gz &" >> _02_filter.sh + -O ${scratch_dir}/indels_filtered.vcf.gz &" >> _02_filter.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/MERGEVCF.log --job-name MERGEVCF gatk MergeVcfs \ - -I ./snps_filtered.vcf.gz \ - -I ./indels_filtered.vcf.gz \ - -O variants_fil.vcf.gz &" > _03_merge_vcfs.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/MERGEVCF.log --job-name MERGEVCF singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/gatk4-spark:4.2.0.0--hdfd78af_1 gatk MergeVcfs \ + -I ${scratch_dir}/snps_filtered.vcf.gz \ + -I ${scratch_dir}/indels_filtered.vcf.gz \ + -O ${scratch_dir}/variants_fil.vcf.gz &" > _03_merge_vcfs.sh echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/GZIP.log --job-name GZIP gzip -d variants_fil.vcf.gz &" > _04_gzip.sh - diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog index 2dd494ea..063393c2 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -1,4 +1,5 @@ -# module load BCFtools/1.12-GCC-10.2.0 VEP/103.1-GCC-10.2.0 R/4.2.1 Java/17.0.2.lua +# module load singularity +# module load Java/17.0.2.lua R/4.2.1 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") @@ -17,7 +18,7 @@ echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> aux_01_bcftools_query.sh # 2. Create variant table. -echo "bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ./vep/variants_fil_mod.vcf > ./vep/variants.table" >> aux_01_bcftools_query.sh +echo "singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> aux_01_bcftools_query.sh echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> aux_01_bcftools_query.sh echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./aux_01_bcftools_query.sh &" > _01_run_bcftools_query.sh @@ -27,7 +28,7 @@ echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output # 3. Lablog for annotating whole genome samples using Variant Effect Predictor (VEP). 
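A note on the aux_*/_NN_* convention used in this lablog: multi-command steps are first written into an aux_*.sh helper, and a one-line _NN_run_*.sh then submits that helper with a single srun, which keeps the quoting of the embedded sed and bcftools calls manageable. A generic sketch of the pattern (file names hypothetical):

    # sketch: stage the commands, then write the one-line srun dispatcher
    echo 'step_one; step_two' > aux_01_example.sh
    echo 'srun --partition short_idx bash ./aux_01_example.sh &' > _01_run_example.sh

The VEP invocation for step 3 above follows.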
# Run Vep without the plugin columns -echo "srun --partition short_idx --mem 100G --time 12:00:00 --chdir ${scratch_dir} --output logs/VEP.log --job-name VEP vep --fasta /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37.fasta -i ./vep/variants_fil_mod.vcf -o ./vep/vep_annot.vcf --cache --offline --dir_cache /data/bi/references/eukaria/homo_sapiens/cache_vep/ --everything --assembly GRCh37 --tab &" > _02_vep_annotation.sh +echo "srun --partition short_idx --mem 100G --time 12:00:00 --chdir ${scratch_dir} --output logs/VEP.log --job-name VEP singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/ensembl-vep:103.1--pl5262h4a94de4_2 vep --fasta /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37.fasta -i ${scratch_dir}/vep/variants_fil_mod.vcf -o ${scratch_dir}/vep/vep_annot.vcf --cache --offline --dir_cache /data/bi/references/eukaria/homo_sapiens/cache_vep/ --everything --assembly GRCh37 --tab &" > _02_vep_annotation.sh #-------------------------------------------------------------------------------------------------------------------- @@ -42,7 +43,7 @@ echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_pl # Merge vep_plugin.txt with dbNSFP_ENSG_gene_GRCh37.txt by "Gene" column, save as vep_dbNSFP.txt. # Merge vep_dbNSFP.txt with variants.table by "ID" column, save as variants_annot_all.tab -echo "srun --partition short_idx --nodelist ideafix04 --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh +echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog index 0607dcd4..3eda4044 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/99-stats/lablog @@ -1,9 +1,9 @@ -# module load picard +# module load singularity scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g") mkdir logs -cat ../samples_id.txt | xargs -I @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir ${scratch_dir} --output logs/PICARDHSMETRICS.@@.%j.log --job-name PICARDHSMETRICS java -Xmx10g -jar \$EBROOTPICARD/picard.jar CollectHsMetrics -R /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37_decoy.fasta -BI ../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -TI ../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -I ../01-sarek/preprocessing/recalibrated/@@/@@.recal.cram -O @@_hsMetrics.out -VALIDATION_STRINGENCY LENIENT &" > _01_picardHsMetrics.sh +cat ../samples_id.txt | xargs -I @@ echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir ${scratch_dir} --output logs/PICARDHSMETRICS.@@.%j.log --job-name PICARDHSMETRICS singularity exec -B ${scratch_dir}/../../../ -B /data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/ /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard CollectHsMetrics -R 
/data/bi/references/eukaria/homo_sapiens/hg19/1000genomes_b37/genome/human_g1k_v37_decoy.fasta -BI ${scratch_dir}/../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -TI ${scratch_dir}/../../../REFERENCES/Illumine_Exome_CEX_TargetedRegions_v1.2_modb37.interval_list -I ${scratch_dir}/../01-sarek/preprocessing/recalibrated/@@/@@.recal.cram -O ${scratch_dir}/@@_hsMetrics.out -VALIDATION_STRINGENCY LENIENT &" > _01_picardHsMetrics.sh echo "echo "\"SAMPLE\",\"MEAN TARGET COVERAGE\", \"PCT USABLE BASES ON TARGET\",\"FOLD ENRICHMENT\",\"PCT TARGET BASES 10X\",\"PCT TARGET BASES 20X\",\"PCT TARGET BASES 30X\",\"PCT TARGET BASES 40X\",\"PCT TARGET BASES 50X\"" > hsMetrics_all.out" > _02_hsMetrics_all.sh cat ../samples_id.txt | xargs -I % echo "grep '^Illumin' %_hsMetrics.out | awk 'BEGIN{FS=\"\\t\";OFS=\",\"}{print \"%\",\$34,\$12,\$13,\$48,\$49,\$50,\$51,\$52}' >> hsMetrics_all.out" >> _02_hsMetrics_all.sh diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/lablog old mode 100644 new mode 100755 index a5c360ae..179c7493 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/lablog @@ -42,7 +42,7 @@ cat <<EOF > sarek.sbatch export NXF_OPTS="-Xms500M -Xmx4G" -nextflow run /data/bi/pipelines/nf-core-sarek-3.1.1/workflow/main.nf \\ +nextflow run /data/bi/pipelines/nf-core-sarek/nf-core-sarek-3.4.2/workflow/main.nf \\ -c ../../DOC/hpc_slurm_sarek.config \\ --input 'samplesheet.csv' \\ --outdir 01-sarek \\ @@ -62,4 +62,6 @@ EOF echo "sbatch sarek.sbatch" > _01_run_sarek.sh -#nohup bash _00_sarek.sh &> $(date '+%Y%m%d')_sarek01.log & +echo "srun --partition short_obx --chdir $scratch_dir rm -rf work &" > _02_clean.sh +echo "srun --partition short_obx --chdir $scratch_dir rm -rf 01-sarek/gatk4/ &" >> _02_clean.sh +echo "srun --partition short_obx --chdir $scratch_dir rm -rf 01-sarek/preprocessing/markduplicates &" >> _02_clean.sh diff --git a/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results b/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results new file mode 100755 index 00000000..5425bdfc --- /dev/null +++ b/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results @@ -0,0 +1,16 @@ +mkdir $(date '+%Y%m%d')_entrega01 +cd $(date '+%Y%m%d')_entrega01 + +#Create symbolic links depending on the analysis +#Individual files + +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/99-stats/hsMetrics_all.out mapping_metrics.csv +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/01-sarek/multiqc/multiqc_report.html . +ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/variants_*filterAF*.tab . + +mkdir annotation_tables +cd annotation_tables; ln -s ../../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/vep_annot*.txt . ; cd - + +# For exomeEB services +# ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/exomiser/exomiser.html . 
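On the _02_clean.sh steps added to both sarek lablogs above: removing work/ deletes Nextflow's task cache, so -resume can no longer pick up a failed run once cleanup has been executed; it should only run after the delivery links have been verified. An alternative sketch that lets Nextflow prune its own cache (assumes it is invoked from the launch directory):

    # sketch: force-clean the work directories of previous runs via nextflow itself
    nextflow clean -f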
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/01-preproQC/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/01-preproQC/lablog
index 66e8db87..0062b7b6 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/01-preproQC/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/01-preproQC/lablog
@@ -1,7 +1,7 @@
-#module load FastQC
+#module load singularity

 mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --cpus-per-task 8 --time 01:00:00 --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log fastqc -o $in --nogroup -t 8 -k 8 ../00-reads/"$in"_R1.fastq.gz ../00-reads/"$in"_R2.fastq.gz &"; done > _01_rawfastqc.sh
+cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --cpus-per-task 8 --time 01:00:00 --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastqc:0.11.9--hdfd78af_1 fastqc -o ${scratch_dir}/$in --nogroup -t 8 -k 8 ${scratch_dir}/../00-reads/"$in"_R1.fastq.gz ${scratch_dir}/../00-reads/"$in"_R2.fastq.gz &"; done > _01_rawfastqc.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/02-preprocessing/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/02-preprocessing/lablog
index 240619b7..71204967 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/02-preprocessing/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/02-preprocessing/lablog
@@ -1,4 +1,4 @@
-# module load fastp
+# module load singularity
 mkdir logs
 scratch_dir=$(echo $(pwd) | sed 's@/data/bi/scratch_tmp/@/scratch/@g')
-cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 fastp --in1 ../00-reads/@@_R1.fastq.gz --in2 ../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 20 --qualified_quality_phred 20 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json @@/@@_fastp.json --html @@/@@_fastp.html --out1 @@/@@_R1_filtered.fastq.gz --out2 @@/@@_R2_filtered.fastq.gz --unpaired1 @@/@@_R1_unpaired.fastq.gz --unpaired2 @@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh
+cat ../samples_id.txt | xargs -I @@ echo "mkdir @@; srun --chdir ${scratch_dir} --mem 10G --time 1:00:00 --job-name FP.@@ --output logs/FP.@@.%j.log --partition short_idx --cpus-per-task 5 singularity exec -B ${scratch_dir}/../../../ -B /srv/fastq_repo/ /data/bi/pipelines/singularity-images/fastp:0.20.0--hdbcaa40_0 fastp --in1 ${scratch_dir}/../00-reads/@@_R1.fastq.gz --in2 ${scratch_dir}/../00-reads/@@_R2.fastq.gz --thread 5 --cut_front --cut_tail --cut_mean_quality 20 --qualified_quality_phred 20 --trim_poly_x --length_required 50 --detect_adapter_for_pe --json ${scratch_dir}/@@/@@_fastp.json --html ${scratch_dir}/@@/@@_fastp.html --out1 ${scratch_dir}/@@/@@_R1_filtered.fastq.gz --out2 ${scratch_dir}/@@/@@_R2_filtered.fastq.gz --unpaired1 ${scratch_dir}/@@/@@_R1_unpaired.fastq.gz --unpaired2 ${scratch_dir}/@@/@@_R2_unpaired.fastq.gz &" > _01_fastp.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/03-procQC/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/03-procQC/lablog
index ee6ff3a0..767eff88 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/03-procQC/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/03-procQC/lablog
@@ -1,7 +1,7 @@
-#module load FastQC
+#module load singularity

 mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log fastqc -o $in --nogroup -t 8 -k 8 ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz &"; done > _01_preprofastqc.sh
+cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/FASTQC.${in}.%j.log singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/fastqc:0.11.9--hdfd78af_1 fastqc -o ${scratch_dir}/$in --nogroup -t 8 -k 8 ${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz &"; done > _01_preprofastqc.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/04-Alignment/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/04-Alignment/lablog
index e347d126..8f61e3cf 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/04-Alignment/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/04-Alignment/lablog
@@ -1,18 +1,18 @@
-mkdir logs
+# module load singularity

-# module load BWA SAMtools picard
+mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/BWAMEM.${in}.%j.log --cpus-per-task 20 bwa mem -t 20 REFERENCE_GENOME ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz -o ${in}/${in}.sam &"; done >> _01_bwamem.sh
+cat ../samples_id.txt | while read in; do echo "mkdir $in; srun --partition short_idx --chdir $scratch_dir --output logs/BWAMEM.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/bwa:0.7.17--he4a0461_11 bwa mem -t 20 REFERENCE_GENOME ${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz -o ${scratch_dir}/${in}/${in}.sam &"; done >> _01_bwamem.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_VIEW.${in}.%j.log --cpus-per-task 20 samtools view -bS ${in}/${in}.sam -o ${in}/${in}.bam &"; done >> _02_samtools_view.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_VIEW.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools view -bS ${scratch_dir}/${in}/${in}.sam -o ${scratch_dir}/${in}/${in}.bam &"; done >> _02_samtools_view.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_SORT.${in}.%j.log --cpus-per-task 20 samtools sort -@ 20 -o ${in}/${in}_sorted.bam -T ${in}/${in}_sorted ${in}/${in}.bam &"; done >> _03_samtools_sort.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_SORT.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools sort -@ 20 -o ${scratch_dir}/${in}/${in}_sorted.bam -T ${scratch_dir}/${in}/${in}_sorted ${scratch_dir}/${in}/${in}.bam &"; done >> _03_samtools_sort.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_ADDORREPLACE.${in}.%j.log --mem 251346M --cpus-per-task 20 java -Xmx10g -jar \$EBROOTPICARD/picard.jar AddOrReplaceReadGroups VALIDATION_STRINGENCY=LENIENT INPUT=${in}/${in}_sorted.bam OUTPUT=${in}/${in}_sorted_rg.bam RGID=2022-NEXTERA-NOVASEQ-ILLUMINA-ISCIII RGLB=NEXTERA RGPL=ILLUMINA RGSM=${in} RGPU=A01158 RGDT=2022 RGCN=ISCIII TMP_DIR=../../../TMP/${in} &"; done >> _04_picard_replacegroups.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_ADDORREPLACE.${in}.%j.log --mem 251346M --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard AddOrReplaceReadGroups VALIDATION_STRINGENCY=LENIENT INPUT=${scratch_dir}/${in}/${in}_sorted.bam OUTPUT=${scratch_dir}/${in}/${in}_sorted_rg.bam RGID=2022-NEXTERA-NOVASEQ-ILLUMINA-ISCIII RGLB=NEXTERA RGPL=ILLUMINA RGSM=${in} RGPU=A01158 RGDT=2022 RGCN=ISCIII TMP_DIR=${scratch_dir}/../../../TMP/${in} &"; done >> _04_picard_replacegroups.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_INDEX.${in}.%j.log --cpus-per-task 20 samtools index ${in}/${in}_sorted_rg.bam &"; done >> _05_samtools_index.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_INDEX.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools index ${scratch_dir}/${in}/${in}_sorted_rg.bam &"; done >> _05_samtools_index.sh

 #clean
 #rm */*[0-9].sam
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/05-mark_dups/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/05-mark_dups/lablog
index 9bbf80d6..dbef85b2 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/05-mark_dups/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/05-mark_dups/lablog
@@ -1,13 +1,13 @@
-mkdir logs
+# module load singularity

-# module load SAMtools picard
+mkdir logs

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

-cat ../samples_id.txt | while read in; do mkdir ${in}; echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_DUPLICATES.${in}.%j.log --mem 251346M --cpus-per-task 20 java -Xmx10g -jar \$EBROOTPICARD/picard.jar MarkDuplicates ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false INPUT=../04-Alignment/${in}/${in}_sorted_rg.bam OUTPUT=${in}/${in}_woduplicates.bam METRICS_FILE=${in}/${in}_duplicates.stats TMP_DIR=../../../TMP/${in} &"; done > _01_picard_dups.sh
+cat ../samples_id.txt | while read in; do mkdir ${in}; echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_DUPLICATES.${in}.%j.log --mem 251346M --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard MarkDuplicates ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false INPUT=${scratch_dir}/../04-Alignment/${in}/${in}_sorted_rg.bam OUTPUT=${scratch_dir}/${in}/${in}_woduplicates.bam METRICS_FILE=${scratch_dir}/${in}/${in}_duplicates.stats TMP_DIR=${scratch_dir}/../../../TMP/${in} &"; done > _01_picard_dups.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_ADDORREPLACE.${in}.%j.log --mem 251346M --cpus-per-task 20 java -Xmx10g -jar \$EBROOTPICARD/picard.jar AddOrReplaceReadGroups VALIDATION_STRINGENCY=LENIENT INPUT=${in}/${in}_woduplicates.bam OUTPUT=${in}/${in}_rg.bam RGID=${in} RGLB=NEXTERA RGPL=ILLUMINA RGSM=${in} RGPU=A01158 RGDT=2022 RGCN=ISCIII TMP_DIR=../../../TMP/${in} &"; done > _02_picard_replacegroups.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/PICARD_ADDORREPLACE.${in}.%j.log --mem 251346M --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard AddOrReplaceReadGroups VALIDATION_STRINGENCY=LENIENT INPUT=${scratch_dir}/${in}/${in}_woduplicates.bam OUTPUT=${scratch_dir}/${in}/${in}_rg.bam RGID=${in} RGLB=NEXTERA RGPL=ILLUMINA RGSM=${in} RGPU=A01158 RGDT=2022 RGCN=ISCIII TMP_DIR=${scratch_dir}/../../../TMP/${in} &"; done > _02_picard_replacegroups.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_SORT.${in}.%j.log --cpus-per-task 20 samtools sort -@ 20 -o ${in}/${in}_sorted_rg.bam -T ${in}/${in}_sorted_rg ${in}/${in}_rg.bam &"; done > _03_samtools_sort.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_SORT.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools sort -@ 20 -o ${scratch_dir}/${in}/${in}_sorted_rg.bam -T ${scratch_dir}/${in}/${in}_sorted_rg ${scratch_dir}/${in}/${in}_rg.bam &"; done > _03_samtools_sort.sh

-cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_INDEX.${in}.%j.log --cpus-per-task 20 samtools index ${in}/${in}_sorted_rg.bam &"; done > _04_samtools_index.sh
+cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --chdir $scratch_dir --output logs/SAMTOOLS_INDEX.${in}.%j.log --cpus-per-task 20 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools index ${scratch_dir}/${in}/${in}_sorted_rg.bam &"; done > _04_samtools_index.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/06-freebayes/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/06-freebayes/lablog
index b68be3d1..05d95e1c 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/06-freebayes/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/06-freebayes/lablog
@@ -1,4 +1,4 @@
-# conda activate freebayes
+# module load singularity

 mkdir -p logs

@@ -14,7 +14,7 @@ cat <<EOF > freebayes.sbatch
 #SBATCH --output logs/freebayes.log
 #SBATCH --chdir $scratch_dir

-freebayes -f REFERENCE_GENOME --bam-list bam_list.txt --pooled-continuous -F 0.1 -C 1 | vcffilter -f "QUAL > 20" > all_samples.vcf
+singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/freebayes:1.3.6--hb0f3ef8_4 freebayes -f REFERENCE_GENOME --bam-list ${scratch_dir}/bam_list.txt --pooled-continuous -F 0.1 -C 1 | singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/freebayes:1.3.6--hb0f3ef8_4 vcffilter -f "QUAL > 20" > all_samples.vcf

 EOF
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/08-iqtree/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/08-iqtree/lablog
index 7761afdc..58ac9983 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/08-iqtree/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/08-iqtree/lablog
@@ -1,6 +1,6 @@
-# conda activate iqtree
+# module load singularity
 scratch_dir=$(echo $PWD | sed 's/\/data\/bi\/scratch_tmp/\/scratch/g')

-mkdir logs
-echo "srun --chdir ${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 iqtree -s ../07-snphylo/snphylo.output.fasta -m MFP &" > _00_iqtreemfp.sh
-echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 iqtree -s ../07-snphylo/snphylo.output.fasta -m PMB+F+R2 -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh
+mkdir logs
+echo "srun --chdir ${scratch_dir} --output logs/IQTREEMFP.%j.log --job-name IQTREEMFP --cpus-per-task 20 --mem 5G --partition short_idx --time 00:30:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../07-snphylo/snphylo.output.fasta -m MFP &" > _00_iqtreemfp.sh
+echo "srun --chdir ${scratch_dir} --output logs/IQTREEFULLALIGN.%j.log --job-name IQTREEFULLALIGN --cpus-per-task 20 --mem 15G --partition short_idx --time 08:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/iqtree:2.1.4_beta--hdcc8f71_0 iqtree -s ${scratch_dir}/../07-snphylo/snphylo.output.fasta -m PMB+F+R2 -T 20 -B 1000 -pre phylo.iqtree.bootstrap &" > _01_iqtreeall.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/99-stats/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/99-stats/lablog
index 2b97855d..384e269c 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/99-stats/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS01_OUTBREAK/99-stats/lablog
@@ -1,3 +1,5 @@
+# module load singularity
+
 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")

 mkdir -p logs
@@ -18,23 +20,19 @@ cat <<EOF > samtools_flagstats.sbatch
 #SBATCH --output logs/samtools_flagstats_%A_%a.log
 #SBATCH --chdir $scratch_dir

-module load SAMtools
-
 SAMPLE_LIST=(\$(<$samples_file))
 echo \$SAMPLE_LIST
 SAMPLE=\${SAMPLE_LIST[\${SLURM_ARRAY_TASK_ID}-1]}
 echo \$SAMPLE

-samtools flagstat ../04-Alignment/\${SAMPLE}/\${SAMPLE}_sorted_rg.bam > samtools_flagstats/\${SAMPLE}_flagstats.txt
+singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/samtools:1.16.1--h6899075_1 samtools flagstat ${scratch_dir}/../04-Alignment/\${SAMPLE}/\${SAMPLE}_sorted_rg.bam > samtools_flagstats/\${SAMPLE}_flagstats.txt

 EOF

 echo "sbatch samtools_flagstats.sbatch" > _01_samtools_flagstats.sh
 echo "Done"

-# module load picard
-
-cat ../samples_id.txt | xargs -I @@ echo "srun --chdir $scratch_dir --output logs/PICARD_@@.%j.log --job-name picard_@@ --partition short_idx --time 2:00:00 java -jar \$EBROOTPICARD/picard.jar CollectWgsMetrics -VALIDATION_STRINGENCY LENIENT -I ../04-Alignment/@@/@@_sorted_rg.bam -O picard/@@_collect_wgs_metrics.txt -R REFERENCE_GENOME &" > _02_wgsmetrics.sh
+cat ../samples_id.txt | xargs -I @@ echo "srun --chdir $scratch_dir --output logs/PICARD_@@.%j.log --job-name picard_@@ --partition short_idx --time 2:00:00 singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/picard:2.25.1--hdfd78af_1 picard CollectWgsMetrics -VALIDATION_STRINGENCY LENIENT -I ${scratch_dir}/../04-Alignment/@@/@@_sorted_rg.bam -O ${scratch_dir}/picard/@@_collect_wgs_metrics.txt -R REFERENCE_GENOME &" > _02_wgsmetrics.sh

 echo "printf \"SAMPLENAME\t\$(grep \"GENOME_TERRITORY\" */*collect_wgs_metrics.txt | cut -d\":\" -f2 | sed 's/ /_/g'| sort -u)\n\" > wgs_metrics_all.txt" > _03_gather_wgs_metrics.sh
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/99-stats/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/99-stats/lablog
index c9c6a808..246dae5d 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/99-stats/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/99-stats/lablog
@@ -1,4 +1,5 @@
-#module load MultiQC
+#module load singularity
+
 cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done

 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
@@ -15,7 +16,7 @@ cat <<EOF > multiqc.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-multiqc -d . --config multiqc_config.yaml
+singularity exec -B ${scratch_dir}/../../../ /data/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml

 EOF
diff --git a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
index 84c6c6df..0086364d 100644
--- a/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
+++ b/bu_isciii/templates/freebayes_outbreak/ANALYSIS/ANALYSIS02_MET/lablog
@@ -17,7 +17,7 @@ cat <<EOF > mag.sbatch

 export NXF_OPTS="-Xms500M -Xmx4G"

-nextflow run /scratch/bi/pipelines/nf-core-mag-2.1.1/workflow/main.nf \\
+nextflow run /data/bi/pipelines/nf-core-mag/nf-core-mag-2.5.3/workflow/main.nf \\
 -c ../../DOC/mag.config \\
 --input '00-reads/*_R{1,2}.fastq.gz' \\
 --outdir $(date '+%Y%m%d')_mag \\
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
deleted file mode 100644
index 7a8120a7..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS01_PIKAVIRUS/lablog
+++ /dev/null
@@ -1,38 +0,0 @@
-# module load Nextflow/21.10.6 singularity
-
-ln -s ../00-reads .
-ln -s ../samples_id.txt .
-echo "sample,fastq_1,fastq_2" > samplesheet.csv
-cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv
-
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-
-cat <<EOF > pikavirus.sbatch
-#!/bin/sh
-#SBATCH --ntasks 1
-#SBATCH --cpus-per-task 2
-#SBATCH --mem 4G
-#SBATCH --time 4:00:00
-#SBATCH --partition middle_idx
-#SBATCH --output $(date '+%Y%m%d')_pikavirus01.log
-#SBATCH --chdir $scratch_dir
-
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-nextflow run /scratch/bi/pipelines/PikaVirus/main.nf \\
-    -c ../../DOC/hpc_slurm_pikavirus.config \\
-    --input samplesheet.csv \\
-    --kraken_scouting false \\
-    --virus true \\
-    --bacteria false \\
-    --fungi false \\
-    --kaiju false \\
-    --mash_winner_strategy true \\
-    --mash_identitity_threshold 0.9 \\
-    --mash_shared_hashes_threshold 0.01 \\
-    --mash_pvalue_threshold 0.05 \\
-    -resume
-EOF
-
-echo "sbatch pikavirus.sbatch" > _01_nf_pikavirus.sh
\ No newline at end of file
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog
deleted file mode 100644
index 07773be8..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/lablog
+++ /dev/null
@@ -1,24 +0,0 @@
-# module load MultiQC
-cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/kraken2_report.txt ./${in}_kraken2_report.txt; done
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-
-cat <<EOF > multiqc.sbatch
-#!/bin/sh
-#SBATCH --ntasks 1
-#SBATCH --cpus-per-task 2
-#SBATCH --mem 4G
-#SBATCH --time 00:30:00
-#SBATCH --partition short_idx
-#SBATCH --output $(date '+%Y%m%d')_multiqc.log
-#SBATCH --chdir $scratch_dir
-
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-multiqc -d . --config multiqc_config.yaml
-
-EOF
-
-echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh
-
-echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml
deleted file mode 100644
index 96b7e613..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS03_MAG/99-stats/multiqc_config.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-extra_fn_clean_exts:
-  - _R1
-  - _R2
-  - .R1
-  - .R2
-  - .sort
-  - _sort
-  - .stats
-  - _bamstat
-  - _align
-  - .txt
-report_comment: >
-  This report has been generated by BU-ISCIII
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog b/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
deleted file mode 100644
index 7910e2cc..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/ANALYSIS04_BLAST/lablog
+++ /dev/null
@@ -1,75 +0,0 @@
-# module load BLAST+/2.11.0-gompi-2020b
-
-scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
-mkdir logs
-
-# Location of assemblies to a variable so it only has to be changed here
-LOCATION=../*/*/assembly/*/*
-# Other databases:
-# /data/bi/references/BLAST_dbs/nt_20211025/nt
-BLAST_DATABASE="/data/bi/references/virus/BLAST/all_virus.fasta"
-
-# if there are scaffolds, uncompress the scaffolds in its dir (zcat for decompression)
-# if there contigs and no scaffolds, uncompress the contigs as scaffolds in its dir
-echo "Samples that did not generate scaffolds:" > noscaffold.txt
-cat ../samples_id.txt | while read in; do
-    mkdir ${in}
-    # ls will return 0 if there are no scaffolds file
-    # NOTE: change extension and location at will
-    # NOTE2: zcat is only used in case of gzipped files, use a cp or ln -s if needed
-    if [ $(ls ${LOCATION}/${in}.scaffolds.fa.gz | wc -l) != 0 ]; then
-        zcat ${LOCATION}/${in}.scaffolds.fa.gz > ${in}/${in}.scaffolds.fa
-    else
-        # Note assemblies that did not make a scaffold
-        zcat ${LOCATION}/${in}.contigs.fa.gz > ${in}/${in}.scaffolds.fa
-        echo ${in} >> noscaffold.txt
-    fi
-done
-
-# NOTE3: change the -query flag to meet your requirements
-cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition middle_idx --mem 200G --time 48:00:00 --cpus-per-task 10 --output logs/BLASTN_%%_%j.log --job-name BLASTN_%% blastn -num_threads 10 -db ${BLAST_DATABASE} -query %%/%%.scaffolds.fa -out %%/%%_blast.tsv -outfmt '6 qseqid stitle qaccver saccver pident length mismatch gaps qstart qend sstart send evalue bitscore slen qlen qcovs' &" > _01_blast.sh
-
-# Filtering criteria:
-    # %refCovered > 0.7
-    # ref not a phage (stitle ~! /phage/)
-    # ref longer than 200 bp (slen > 200)
-
-# First awk: create the full table; second awk: filter it
-cat ../samples_id.txt | xargs -I %% echo "awk -v \"samplename=%%\" 'BEGIN{OFS=\"\t\";FS=\"\t\"}{print samplename,\$0,(\$6-\$8)/\$16,\$6/\$15}' %%/%%_blast.tsv | awk 'BEGIN{OFS=\"\t\";FS=\"\t\"} \$16 > 200 && \$17 > 0.7 && \$3 !~ /phage/ {print \$0}' > %%/%%_blast_filt.tsv" > _02_filter_blast.sh
-echo -e "echo \"samplename\tqseqid\tstitle\tqaccver\tsaccver\tpident\tlength\tmismatch\tgap\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tref_len\tquery_len\tqcovs\t%queryAligned\t%refCovered\" > header" > _03_gather_results_add_header.sh
-echo "cat header */*blast_filt.tsv > all_samples_filtered_BLAST_results.tsv" >> _03_gather_results_add_header.sh
-cat ../samples_id.txt | xargs -I %% echo "cat header %%/%%_blast_filt.tsv > tmp; rm %%/%%_blast_filt.tsv; mv tmp %%/%%_blast_filt.tsv" >> _03_gather_results_add_header.sh
-echo "rm header" >> _03_gather_results_add_header.sh
-
-# NOTES FOR FILTERING
-#
-# subject = reference
-#
-# COLS GENERATED BY US:
-# 1: samplename
-# GENERATED BY BLAST
-# 2: contigname - qseqid
-# 3: stitle
-# 4: qaccver
-# 5: saccver
-# 6: pident
-# 7: length (of alignment)
-# 8: mismatch
-# 9: gaps
-# 10: qstart
-# 11: qend
-# 12: sstart
-# 13: send
-# 14: evalue
-# 15: bitscore
-# 16: ref len - slen
-# 17: query len - qlen
-# 18: qcovs
-# MORE INFO: https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
-# GENERATED BY US:
-# 19: %queryAligned: (length-gaps)/qlen (if gaps are not deleted, then this would be bigger than 1 sometimes)
-# 20: %refCovered: length/slen
-
-# conda activate 2excel
-cat ../samples_id.txt | xargs -I %% echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_%%.log --job-name 2excel_%% python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file %%/%%_blast_filt.tsv --delimiter '\t' --output_filename %%/%%_blast_filt --it_has_index --it_has_header" > _04_to_excel.sh
-echo "srun --chdir ${scratch_dir} --partition short_idx --mem 10G --time 1:00:00 --output logs/2excel_all.log --job-name 2excel_all python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file all_samples_filtered_BLAST_results.tsv --delimiter '\t' --output_filename all_samples_filtered_BLAST_results --it_has_index --it_has_header" >> _04_to_excel.sh
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/README b/bu_isciii/templates/genomeev/ANALYSIS/README
deleted file mode 100644
index 0ecbe695..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/README
+++ /dev/null
@@ -1,25 +0,0 @@
-This document should be read as INSTRUCTIONS to perform the "genomeev" service, as created on 25 Sep 2023.
-The steps to follow to perform this service (which, by the way, can be done fairly quickly computationally speaking) are the following:
-
-- Load the samples into the RAW directory (manually or automatically using the BU-ISCIII tools)
-
-- Copy all files from this template (manually or automatically, make sure all files are there)
-
-- Copy the whole service folder to scratch_tmp (at least, we had to do that when this template was created)
-
-- First part is PikaVirus. Run PikaVirus by executing lablog_pikavirus, then enter the PikaVirus folder, execute the lablog (note that you need a samples_id.txt file, if you did not create it automatically, it has to be done manually), load the modules and do the thing. Feel free to change anything in PikaVirus through command or through the config (config is recommended so that any changes can be tracked). NOTE: wait for PikaVirus to end before you continue. Do something else in the meantime. read a paper or something dunno.
-
-- Once PikaVirus has ended, we have to dive into the results, particularly the "all_samples_virus_table.tsv" in the results dir. Here, we have to find the most abundant virus. I personally recommend opening this file in excel or similar, and find the virus that repeats the most in the samples using some formula such as "COUNTIF(range, value)". Make sure you are working with a genome and not with just a fragment of it.
-
-- Download said assembly locally, both its fna and its gff file. Make sure you store both files with the same name and different extension. The name SHOULD include the virus name, and the GCA/GCF code so its easier to identify (example: RotavirusG8_GCA_002669555_1.fasta; RotavirusG8_GCA_002669555_1.gff). Then, place it in the corresponding directory inside "/data/bi/references/virus".
-
-- Once the files have been placed, we have to modify the samples_ref.txt file.
-    First column will be the exact same as the samples_id.txt file.
-    Second column will be the name of the assemblies we downloaded in the previous step (example: RotavirusG8_GCA_002669555_1 ). Make sure that all the rows are the exact same.
-    Third column will be the name of the host (typically "human", but can be changed depending on the situation)
-
-- Execute the lablog_viralrecon. The ANALYSIS02 directory will be created and filled with the corresponding scripts. Load the modules and launch viralrecon.
-
-- Once it has ended, its time for MAG. Go to the ANALYSIS03 directory, execute the lablog, load the modules and run MAG with the specified params.
-
-- Last, but not least, go to the ANALYSIS04 directory and run the lablog, the lablog will check the assembly step in viralrecon, and will store the names of the samples that didnt assembly to the scaffold level in the noscaffold.txt file. Run normally the three scripts after loading the corresponding module, and that should be about everything there is to this service!
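The samples_ref.txt step described in this README does not have to be edited by hand; a small sketch, using the README's own RotavirusG8_GCA_002669555_1 / human example as the assumed reference and host:

# Build samples_ref.txt from samples_id.txt. The reference and host values are
# the README's example and must be adapted to the service at hand.
reference="RotavirusG8_GCA_002669555_1"
host="human"
while read -r sample; do
    printf "%s\t%s\t%s\n" "$sample" "$reference" "$host"
done < samples_id.txt > samples_ref.txt

Because every row must carry the same reference and host, generating the file this way avoids the copy-paste mismatches the README warns about.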
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh b/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh
deleted file mode 100644
index 4ed9b192..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/create_summary_report.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# Define fixed data variables
-RUN=$(ls -l ../../RAW/ | cut -d'/' -f4 | sort -u | grep -v 'total' | head -n1 | rev | cut -d " " -f 2- | rev)
-USER=$(pwd | cut -d '/' -f6 | cut -d '_' -f4)
-HOST=$(pwd | cut -d '/' -f8 | cut -d '_' -f4 | tr '[:upper:]' '[:lower:]' | sed 's/.*/\u&/')
-
-# Define header for output file
-HEADER="run\tuser\thost\tVirussequence\tsample\ttotalreads\treadshostR1\treadshost\t%readshost\treadsvirus\t%readsvirus\tunmappedreads\t%unmapedreads\tmedianDPcoveragevirus\tCoverage>10x(%)\tVariantsinconsensusx10\tMissenseVariants\t%Ns10x\tLineage\tread_length\tanalysis_date"
-
-# Print header to output file
-echo -e $HEADER > mapping_illumina_$(date '+%Y%m%d').tab
-
-# Loop through sample list and extract relevant data
-cat samples_ref.txt | while read in
-do
-    # Sample and virus reference names
-    arr=($in);
-
-    # Extract data for each column
-    total_reads=$(grep 'total_reads' ${arr[1]}*/fastp/${arr[0]}.fastp.json | head -n2 | tail -n1 | cut -d ':' -f2 | sed 's/,//g')
-
-    reads_hostR1=$(cat ${arr[1]}*/kraken2/${arr[0]}.kraken2.report.txt | grep -v 'unclassified' | cut -f3 | awk '{s+=$1}END{print s}')
-    reads_host_x2=$(echo $((reads_hostR1 * 2)) )
-    perc_mapped=$(echo $(awk -v v1=$total_reads -v v2=$reads_host_x2 'BEGIN {print (v2*100)/v1}') )
-
-    reads_virus=$(cat ${arr[1]}*/variants/bowtie2/samtools_stats/${arr[0]}.sorted.bam.flagstat | grep '+ 0 mapped' | cut -d ' ' -f1)
-
-    unmapped_reads=$(echo $((total_reads - (reads_host_x2+reads_virus))) )
-    perc_unmapped=$(echo $(awk -v v1=$total_reads -v v2=$unmapped_reads 'BEGIN {print (v2/v1)*100}') )
-
-    n_count=$(cat %Ns.tab | grep -w ${arr[0]} | grep ${arr[1]} | cut -f2)
-
-    missense=$(LC_ALL=C awk -F, '{if($10 >= 0.75)print $0}' ${arr[1]}*/variants/ivar/variants_long_table.csv | grep ^${arr[0]}, | grep 'missense' | wc -l)
-
-    Ns_10x_perc=$(zcat ${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.filtered.vcf.gz | grep -v '^#' | wc -l)
-
-    lineage=$(cat ${arr[1]}*/variants/ivar/consensus/bcftools/pangolin/${arr[0]}.pangolin.csv | tail -n1 | cut -d ',' -f2)
-
-    metrics=$(cat ${arr[1]}*/multiqc/summary_variants_metrics_mqc.csv | grep ^${arr[0]},)
-    reads_virus_perc=$(echo "$metrics" | cut -d ',' -f5)
-    medianDPcov=$(echo "$metrics" | cut -d ',' -f8)
-    cov10x=$(echo "$metrics" | cut -d ',' -f10)
-
-    read_length=$(cat ${arr[1]}*/multiqc/multiqc_data/multiqc_fastqc.yaml | grep -A5 "${arr[0]}_1:$" | grep "Sequence length:" | tr "-" " " | rev | cut -d " " -f1 | rev)
-
-    analysis_date=$(date '+%Y%m%d')
-
-    # Introduce data row into output file
-    echo -e "${RUN}\t${USER}\t${HOST}\t${arr[1]}\t${arr[0]}\t$total_reads\t$reads_hostR1\t$reads_host_x2\t$perc_mapped\t$reads_virus\t$reads_virus_perc\t$unmapped_reads\t$perc_unmapped\t$medianDPcov\t$cov10x\t$Ns_10x_perc\t$missense\t$n_count\t$lineage\t$read_length\t$analysis_date" >> mapping_illumina_$(date '+%Y%m%d').tab
-done
diff --git a/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt b/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt
deleted file mode 100644
index 5e3528b1..00000000
--- a/bu_isciii/templates/genomeev/ANALYSIS/samples_ref.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-SampleID Reference Host
-SampleID Reference Host
-SampleID Reference Host
-
diff --git a/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config b/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config
deleted file mode 100644
index 2517ff51..00000000
--- a/bu_isciii/templates/genomeev/DOC/hpc_slurm_pikavirus.config
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * ---------------------------------------------------------------
- * Nextflow config file for the ISCIII High Performance Computer
- * ---------------------------------------------------------------
- *
- * nextflow run PikaVirus/manin.nf -profile HPC_ISCIII,
- */
-
-process{
-    executor = 'slurm'
-    queue = 'middle_idx'
-    conda = '/data/bi/pipelines/miniconda3/envs/PikaVirus'
-    errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' }
-    maxRetries = 5
-    maxErrors = '-1'
-}
-
-params {
-    config_profile_name = 'ISCIII HPC profile'
-    config_profile_description = 'Profile designed for the High Performance Computer in the ISCIII'
-    kraken2_db = "/data/bi/references/kraken/minikraken_8GB_20200312"
-    vir_ref_dir = "/data/bi/references/PikaVirus/viral_assemblies_for_pikavirus"
-    vir_dir_repo = "/data/bi/references/PikaVirus/viral_assemblies.tsv"
-    bact_ref_dir = "/data/bi/references/PikaVirus/bacteria_assemblies_for_pikavirus"
-    bact_dir_repo = "/data/bi/references/PikaVirus/bacteria_assemblies.tsv"
-    fungi_ref_dir = "/data/bi/references/PikaVirus/fungi_assemblies_for_pikavirus"
-    fungi_dir_repo = "/data/bi/references/PikaVirus/fungi_assemblies.tsv"
-    outdir = "01-PikaVirus-results"
-    max_memory = 376.GB
-    max_cpus = 32
-    max_time = '48.h'
-}
diff --git a/bu_isciii/templates/genomeev/DOC/mag.config b/bu_isciii/templates/genomeev/DOC/mag.config
deleted file mode 100644
index 732980bf..00000000
--- a/bu_isciii/templates/genomeev/DOC/mag.config
+++ /dev/null
@@ -1,19 +0,0 @@
-singularity {
-    enabled = true
-    autoMounts = true
-}
-
-process {
-    executor = 'slurm'
-    queue = 'middle_idx'
-    queue = 'middle_idx'
-    errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' }
-    maxRetries = 1
-    maxErrors = '-1'
-}
-
-params {
-    max_memory = 376.GB
-    max_cpus = 32
-    max_time = '48.h'
-}
diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config
deleted file mode 100644
index bb575ee3..00000000
--- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_ignore_merge_codons.config
+++ /dev/null
@@ -1,42 +0,0 @@
-singularity {
-    enabled = true
-    autoMounts = true
-}
-
-process {
-    executor = 'slurm'
-    queue = 'middle_idx'
-    withName: 'FASTP' {
-        ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50'
-    }
-    withName: 'PANGOLIN' {
-        ext.args = '--datadir /scratch/bi/references/pangolin/20220322'
-        container = 'https://depot.galaxyproject.org/singularity/pangolin:3.1.20--pyhdfd78af_0'
-    }
-    withName: 'IVAR_VARIANTS_TO_VCF' {
-        ext.args = params.protocol == 'amplicon' ? '--ignore_strand_bias --ignore_merge_codons' : '--ignore_merge_codons'
-    }
-}
-
-params {
-    // Input options
-    platform=illumina
-    protocol=metagenomic
-
-    // Illumina QC, read trimming and filtering options
-    kraken2_db="/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz"
-
-    // Illumina variant calling options
-    variant_caller=ivar
-    consensus_caller=bcftools
-    skip_pangolin=true
-    skip_nextclade=true
-
-    // Illumina de novo assembly options
-    skip_assembly=true
-
-    // Max resource options
-    max_memory = 376.GB
-    max_cpus = 32
-    max_time = '48.h'
-}
diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml
deleted file mode 100644
index 7ef76186..00000000
--- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_params.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-platform: 'illumina'
-protocol: 'metagenomic'
-kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
-variant_caller: 'ivar'
-consensus_caller: 'bcftools'
-skip_pangolin: true
-skip_nextclade: true
-skip_variants: true
-skip_assembly: false
-skip_abacas: true
-skip_plasmidid: true
diff --git a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config b/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config
deleted file mode 100644
index a62b5ac4..00000000
--- a/bu_isciii/templates/genomeev/DOC/viralrecon_metagenomic_save_nohost.config
+++ /dev/null
@@ -1,40 +0,0 @@
-singularity {
-    enabled = true
-    autoMounts = true
-}
-
-process {
-    executor = 'slurm'
-    queue = 'middle_idx'
-    withName: 'FASTP' {
-        ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50'
-    }
-    withName: 'KRAKEN2_KRAKEN2' {
-        publishDir = [
-            pattern: "*.{unclassified_1.fastq.gz,unclassified_2.fastq.gz,txt}"
-        ]
-    }
-}
-
-params {
-    // Input options
-    platform=illumina
-    protocol=metagenomic
-
-    // Illumina QC, read trimming and filtering options
-    kraken2_db="/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz"
-
-    // Illumina variant calling options
-    variant_caller=ivar
-    consensus_caller=bcftools
-    skip_pangolin=true
-    skip_nextclade=true
-
-    // Illumina de novo assembly options
-    skip_assembly=true
-
-    // Max resource options
-    max_memory = 376.GB
-    max_cpus = 32
-    max_time = '48.h'
-}
diff --git a/bu_isciii/templates/genomeev/RAW/README b/bu_isciii/templates/genomeev/RAW/README
deleted file mode 100644
index deb5220b..00000000
--- a/bu_isciii/templates/genomeev/RAW/README
+++ /dev/null
@@ -1 +0,0 @@
-RAW template
diff --git a/bu_isciii/templates/genomeev/RESULTS/lablog b/bu_isciii/templates/genomeev/RESULTS/lablog
deleted file mode 100644
index c593c7ac..00000000
--- a/bu_isciii/templates/genomeev/RESULTS/lablog
+++ /dev/null
@@ -1,17 +0,0 @@
-# conda activate 2excel
-mkdir $(date '+%Y%m%d')_entrega01
-cd $(date '+%Y%m%d')_entrega01
-
-#Create directories depending on the analysis
-mkdir assembly_spades
-mkdir blast
-
-#Create symbolic links depending on the analysis
-#Individual files
-ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html
-ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx
-python /scratch/bi/pipelines/utilities/export_excel_from_csv.py --input_file ../../ANALYSIS/*PIKAVIRUS*/*/all_samples_virus_table_filtered.tsv --delimiter '\t' --output_filename filtered_all_samples_virus_table --it_has_index --it_has_header
-
-#Folders
-cd assembly_spades; ln -s ../../../ANALYSIS/*BLAST*/*/*scaffolds.fa .; cd -
-cd blast; ln -s ../../../ANALYSIS/*BLAST*/all_samples_filtered_BLAST_results.xlsx .; ln -s ../../../ANALYSIS/*BLAST*/*/*.xlsx .; cd -
diff --git a/bu_isciii/templates/genomeev/TMP/README b/bu_isciii/templates/genomeev/TMP/README
deleted file mode 100644
index 36ecd8dd..00000000
--- a/bu_isciii/templates/genomeev/TMP/README
+++ /dev/null
@@ -1 +0,0 @@
-TMP templates
diff --git a/bu_isciii/templates/jinja_template_delivery.j2 b/bu_isciii/templates/jinja_template_delivery.j2
index b55828d3..058b0790 100644
--- a/bu_isciii/templates/jinja_template_delivery.j2
+++ b/bu_isciii/templates/jinja_template_delivery.j2
@@ -74,9 +74,8 @@ Here we describe information about the resolution delivery.
 {% endif %}

 {% if samples %}
-## Samples sequenced at iSCIII:
-
-Here we describe information about the project associated to the service:
+## Samples sequenced at ISCIII:
+Here we describe information about the project associated to the service:
 {% if service_sequencing_center -%}
 * Sequencing center: {{ service_sequencing_center }}{% endif %}
 {% for run , projects in samples.items() %}
 * Run name: {{ run }}
@@ -88,6 +87,43 @@
 {% endfor %}
 {% endif %}

+
+
+
+## Software versions:
+
+{% if services_list is mapping and software_versions is mapping %}
+{%- set service_list = services_list.items() | list %}
+{%- set file_version_list = software_versions.items() | list %}
+
+{%- for index in range(service_list | length) %}
+    {%- if index < file_version_list | length %}
+    {%- set service_id, description = service_list[index] %}
+* {{ description }} ({{ service_id }}):
+    {%- set file_path, processes = file_version_list[index] %}
+    {%- if processes | length > 0 %}
+    {%- for process, tools in processes.items() %}
+    - {{ process }}:
+        {%- for tool, version in tools.items() %}
+        - {{ tool }}: {{ version }}
+        {%- endfor %}
+    {%- endfor %}
+    {%- else %}
+    - No software versions data available for this file path.
+    {%- endif %}
+    {%- else %}
+    {%- set service_id, description = service_list[index] %}
+* {{ description }} ({{ service_id }}):
+    - No software versions data available for this service.
+    {%- endif %}
+{%- endfor %}
+{% else %}
+No software versions data available for this service.
+{% endif %}
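The new block renders software versions positionally: the Nth entry of services_list (service id mapped to its description) is paired with the Nth entry of software_versions (a versions-file path mapped to process -> tool -> version tables). A quick way to sanity-check template edits like this one is to render the file standalone; a hedged sketch, where the sample data is invented, only the two mapping shapes come from the template, and rendering may additionally require whatever other variables the rest of the template references:

# Render the delivery template with toy data (requires the jinja2 package).
# The services_list / software_versions values below are invented examples.
python3 - <<'PY'
from jinja2 import Template

services_list = {"viralrecon": "Viral genome reconstruction"}
software_versions = {
    "01-viralrecon/software_versions.yml": {"FASTP": {"fastp": "0.23.4"}}
}
with open("bu_isciii/templates/jinja_template_delivery.j2") as fh:
    print(Template(fh.read()).render(services_list=services_list,
                                     software_versions=software_versions))
PY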