Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduced config files for RSV and a new lablog for results in viralrecon template #149

Merged
merged 14 commits into from
Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bu_isciii/templates/viralrecon/ANALYSIS/DATE_ANALYSIS0X_MAG/99-stats/lablog
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ EOF

echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh

cp /data/bi/services_and_colaborations/CNM/virology/SRVCNM585_20220223_SARSCOV279_icasas_S/ANALYSIS/20220223_ANALYSIS02_MET/99-stats/multiqc_config.yaml .
ln -s ../../../DOC/multiqc_config.yml .

echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh
4 changes: 4 additions & 0 deletions bu_isciii/templates/viralrecon/ANALYSIS/samples_ref.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
SampleID Reference Host
SampleID Reference Host
SampleID Reference Host

13 changes: 13 additions & 0 deletions bu_isciii/templates/viralrecon/DOC/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
extra_fn_clean_exts:
- _R1
- _R2
- .R1
- .R2
- .sort
- _sort
- .stats
- _bamstat
- _align
- .txt
report_comment: >
This report has been generated by BU-ISCIII
22 changes: 22 additions & 0 deletions bu_isciii/templates/viralrecon/DOC/viralrecon_rsv.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
singularity {
enabled = true
autoMounts = true
}

process {
executor = 'slurm'
queue = 'middle_idx'
withName: 'FASTP' {
ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50'
}
withName: 'NEXTCLADE_DATASETGET|NEXTCLADE_RUN' {
container = 'https://depot.galaxyproject.org/singularity/nextclade:2.14.0--h9ee0642_1'
}
}

params {
// Max resource options
max_memory = 376.GB
max_cpus = 32
max_time = '48.h'
}
8 changes: 8 additions & 0 deletions bu_isciii/templates/viralrecon/DOC/viralrecon_rsv_params.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
platform: 'illumina'
protocol: 'amplicon'
kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
variant_caller: 'ivar'
consensus_caller: 'bcftools'
skip_pangolin: true
skip_nextclade: false
skip_assembly: false
129 changes: 129 additions & 0 deletions bu_isciii/templates/viralrecon/RESULTS/excel_generator.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puedes añadir a los templates un template del samples_ref de ANALYSIS y un template del reference_file necesario para lanzar el excel_generator?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Si, les pongo ejemplos con nombres reales o algo como "sample" "reference" "host" ?

Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import os
import argparse
import pandas as pd
from typing import List, Dict

# conda activate viralrecon_report
# NOTE: the string below is a plain expression, not a module docstring
# (it follows other statements); kept as-is for byte-compatibility.
"""Usage: python excel_generator.py ./reference.tmp"""
# CLI: single positional argument pointing at the references file
# (one reference name per line, e.g. produced by viralrecon_results).
parser = argparse.ArgumentParser(
    description="Generate excel files from viralrecon results"
)
parser.add_argument(
    "reference_file",
    type=str,
    help="File containing the references used in the analysis",
)

args = parser.parse_args()

print(
    "Extracting references used for analysis and the samples associated with each reference\n"
)
# One reference per line; rstrip drops the trailing newline.
with open(args.reference_file, "r") as file:
    references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

# reference -> folder holding that reference's per-sample result CSVs.
reference_folders = {ref: str("excel_files_" + ref) for ref in references}
# reference -> file listing the samples analysed against that reference
# (created by the viralrecon_results lablog under ref_samples/).
samples_ref_files = {
    ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}


def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str):
"""Concatenate any tables that share the same header"""
if len(csvs_in_folder) == 0:
print(f"Could not find tables to merge over {merged_csv_name}")
return
with open(merged_csv_name, "wb") as merged_csv:
with open(csvs_in_folder[0], "rb") as f:
merged_csv.write(
f.read()
) # This is the fastest way to concatenate csv files
if len(csvs_in_folder) > 1:
for file in csvs_in_folder[1:]:
with open(file, "rb") as f:
next(f) # this is used to skip the header
merged_csv.write(f.read())
return merged_csv


def _remove_suffix(name: str, suffix: str) -> str:
    """Return *name* without *suffix* when it ends with it (safe removesuffix)."""
    return name[: -len(suffix)] if name.endswith(suffix) else name


def merge_lineage_tables(
    reference_folders: Dict[str, str], samples_ref_files: Dict[str, str]
):
    """Create the merged per-reference tables for pangolin and nextclade.

    For every reference, collects the per-sample CSVs found under
    ``<folder>/pangolin`` and ``<folder>/nextclade`` whose sample name is
    listed in that reference's samples file, and merges each set into
    ``<ref>_pangolin.csv`` / ``<ref>_nextclade.csv`` via
    ``concat_tables_and_write``.

    Args:
        reference_folders: reference -> folder holding its result subdirs.
        samples_ref_files: reference -> path of the file listing the samples
            analysed against that reference (one sample name per line).
    """
    for ref, folder in reference_folders.items():
        print("Merging results for either pangolin or nextclade in a single csv file")
        # Use a context manager so the handle is closed (the original
        # open(...).read() leaked the file object).
        with open(samples_ref_files[ref]) as handle:
            samples_for_ref = handle.read().splitlines()

        pango_dir = os.path.join(folder, "pangolin")
        if os.path.isdir(os.path.abspath(pango_dir)):
            # BUGFIX: str.strip(".pangolin.csv") removed any of those
            # *characters* from both ends of the filename (mangling sample
            # names such as "virus1"); remove the literal suffix instead.
            csvs_in_folder = [
                entry.path
                for entry in os.scandir(pango_dir)
                if _remove_suffix(os.path.basename(entry.path), ".pangolin.csv")
                in samples_for_ref
            ]
            merged_csv_name = os.path.join(folder, str(ref + "_pangolin.csv"))
            concat_tables_and_write(
                csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name
            )
        else:
            print(f"No pangolin folder could be found for {ref}, omitting")

        nextcl_dir = os.path.join(folder, "nextclade")
        if os.path.isdir(os.path.abspath(nextcl_dir)):
            # Same suffix-removal fix as above for nextclade result files.
            csvs_in_folder = [
                entry.path
                for entry in os.scandir(nextcl_dir)
                if _remove_suffix(os.path.basename(entry.path), ".csv")
                in samples_for_ref
            ]
            merged_csv_name = os.path.join(folder, str(ref + "_nextclade.csv"))
            concat_tables_and_write(
                csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name
            )
        else:
            print(f"No nextclade folder could be found for {ref}, omitting")

    return


def excel_generator(csv_files: List[str]):
    """Convert each existing CSV in *csv_files* to an ``.xlsx`` next to it.

    The separator and post-processing are chosen from the filename:
    nextclade output is ``;``-separated, illumina mapping tables are
    tab-separated with an ``analysis_date`` column normalised from
    YYYYMMDD integers to datetimes, assembly tables are tab-separated,
    anything else is read with pandas defaults. Missing files are
    reported and skipped.

    Args:
        csv_files: paths of the CSV files to convert.

    Returns:
        The last path iterated, or None for an empty input (the original
        raised UnboundLocalError on an empty list).
    """
    file = None  # BUGFIX: guard against UnboundLocalError when csv_files is empty
    for file in csv_files:
        if not os.path.exists(file):
            print(f"File {file} does not exist, omitting...")
            continue
        print(f"Generating excel file for {file}")
        output_name = str(file.split(".csv")[0] + ".xlsx")
        if "nextclade" in str(file):
            # nextclade emits semicolon-separated CSVs
            pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False)
        elif "illumina" in str(file):
            table = pd.read_csv(file, sep="\t", header=0)
            # analysis_date arrives as a YYYYMMDD integer; store a real datetime
            table["analysis_date"] = pd.to_datetime(
                table["analysis_date"].astype(str), format="%Y%m%d"
            )
            table.to_excel(output_name, index=False)
        elif "assembly" in str(file):
            pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False)
        else:
            pd.read_csv(file).to_excel(output_name, index=False)
    return file


# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
    print(f"Creating excel files for reference {reference}")
    # Every CSV in the reference folder (including the just-merged
    # <ref>_pangolin.csv / <ref>_nextclade.csv) gets an .xlsx twin.
    csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")]
    excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
    table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
concat_tables_and_write(
    csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False)

# Create excel files for individual tables
# NOTE(review): these are symlinks created by the viralrecon_results lablog;
# excel_generator skips any that were not produced by the analysis.
result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"]
excel_generator(result_tables)
48 changes: 35 additions & 13 deletions bu_isciii/templates/viralrecon/RESULTS/viralrecon_results
Original file line number Diff line number Diff line change
@@ -1,26 +1,48 @@
mkdir $(date '+%Y%m%d')_entrega01
cd $(date '+%Y%m%d')_entrega01

mv ../excel_generator.py ./

#Create directories depending on the analysis
mkdir mapping_consensus
mkdir variants_annot
mkdir assembly_spades
mkdir abacas_assembly
mkdir blast
mkdir ref_samples

#Setting up folder and files required for excel_generator.py
cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | sort -u > references.tmp
cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | grep ${in} | cut -f 1 > ref_samples/samples_${in}.tmp; done
cat references.tmp | while read in; do mkdir excel_files_${in}; done
cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done;
cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done;

#Create symbolic links to files that are going to be converted to excel
cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done

#Create symbolic links depending on the analysis
#Individual files
ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html
ln -s ../../ANALYSIS/*/mapping_illumina*.xlsx ./mapping_illumina.xlsx
ln -s ../../ANALYSIS/*/*/variants/ivar/variants_long_table*.xlsx ./
ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/pangolin/pangolin.xlsx ./pangolin.xlsx
ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/nextclade/nextclade.xlsx ./nextclade.xlsx
ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.xlsx ./filtered_all_samples_virus_table.xlsx
ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv
ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv

#conda activate viralrecon_report
echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh
#Cleaning temp files and broken symbolic links
echo "find . -xtype l -delete" > _02_clean_folders.sh
echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh
echo "find . -type d -empty -delete" >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh
echo "rm references.tmp" >> _02_clean_folders.sh
echo "rm -rf ref_samples/" >> _02_clean_folders.sh
echo "rm ./*.csv" >> _02_clean_folders.sh
echo "mkdir excel_files"
echo 'mv *.xlsx excel_files/'

#Folders
cd mapping_consensus;ln -s ../../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/*.consensus.fa .; cd -
cd variants_annot; ln -s ../../../ANALYSIS/*/*/variants/ivar/snpeff/*.snpsift.txt .; cd -
#Create symbolic links to results for every process of the pipeline
cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd -
cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd -
cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd -
cd abacas_assembly; ln -s ../../../ANALYSIS/*/*/assembly/spades/rnaviral/abacas/*.abacas.fasta .; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
Loading