Introduced config files for RSV and a new lablog for results in viralrecon template #149
Merged

Commits (14)
9d537cf  Introduced new location for multiqc_config.yml (Shettland)
b0a88dc  Included the multiqc_config.yml file used in viralrecon to its template (Shettland)
e86f3c7  Modified the existing results-lablog to work for more than 1 reference (Shettland)
3f967ad  Introduced a custom config file for Respiratory Syncytial Virus (Shettland)
2a8b52c  Introduced custom params used for Respiratory Syncytial Virus (Shettland)
a5070a9  Included an auxiliary script to generate excel files of the results f… (Shettland)
1416fad  Introduced new changes into results lablog (Shettland)
068e583  Included new changes in results lablog (Shettland)
a1cf59d  Included changes into excel_generator.py (Shettland)
494fef7  Deleted old results lablog (Shettland)
ed955d0  Introduced excel_generator.py, linting (Shettland)
bd9b881  Introduced excel_generator.py, linting2 (Shettland)
5d636ba  Changed results lablog name to viralrecon_results (Shettland)
212d8d8  Included samples_ref.txt template (Shettland)
@@ -0,0 +1,4 @@
SampleID	Reference	Host
SampleID	Reference	Host
SampleID	Reference	Host
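The lablog later splits this samples_ref table into per-reference sample lists (via `cut`/`grep`). A minimal Python sketch of that grouping step, assuming tab-separated columns as in the template above and hypothetical reference names for illustration:

```python
from collections import defaultdict


def samples_by_reference(lines):
    """Group sample IDs (column 1) by the reference in column 2 of samples_ref.txt."""
    groups = defaultdict(list)
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            continue  # skip malformed or empty lines
        sample, reference = fields[0], fields[1]
        groups[reference].append(sample)
    return dict(groups)


rows = ["sample1\tREF_A\thuman", "sample2\tREF_A\thuman", "sample3\tREF_B\thuman"]
print(samples_by_reference(rows))
# one list of samples per reference, mirroring ref_samples/samples_<ref>.tmp
```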
@@ -0,0 +1,13 @@
extra_fn_clean_exts:
  - _R1
  - _R2
  - .R1
  - .R2
  - .sort
  - _sort
  - .stats
  - _bamstat
  - _align
  - .txt
report_comment: >
  This report has been generated by BU-ISCIII
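The `extra_fn_clean_exts` list tells MultiQC which filename fragments to trim when deriving sample names. As an illustration only (the real cleanup logic lives inside MultiQC, not here), a rough approximation of the intended effect:

```python
# Patterns copied from the multiqc_config.yml above.
CLEAN_EXTS = ["_R1", "_R2", ".R1", ".R2", ".sort", "_sort",
              ".stats", "_bamstat", "_align", ".txt"]


def clean_sample_name(name: str) -> str:
    """Truncate a filename at the first matching cleanup pattern."""
    for ext in CLEAN_EXTS:
        idx = name.find(ext)
        if idx != -1:
            name = name[:idx]
    return name


print(clean_sample_name("sample1_R1.txt"))  # the _R1 suffix and beyond are dropped
```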
@@ -0,0 +1,22 @@
singularity {
    enabled = true
    autoMounts = true
}

process {
    executor = 'slurm'
    queue = 'middle_idx'
    withName: 'FASTP' {
        ext.args = '--cut_front --cut_tail --trim_poly_x --cut_mean_quality 20 --qualified_quality_phred 20 --unqualified_percent_limit 10 --length_required 50'
    }
    withName: 'NEXTCLADE_DATASETGET|NEXTCLADE_RUN' {
        container = 'https://depot.galaxyproject.org/singularity/nextclade:2.14.0--h9ee0642_1'
    }
}

params {
    // Max resource options
    max_memory = 376.GB
    max_cpus   = 32
    max_time   = '48.h'
}
@@ -0,0 +1,8 @@
platform: 'illumina'
protocol: 'amplicon'
kraken2_db: '/data/bi/references/eukaria/homo_sapiens/hg38/UCSC/kraken2/kraken2_human.tar.gz'
variant_caller: 'ivar'
consensus_caller: 'bcftools'
skip_pangolin: true
skip_nextclade: false
skip_assembly: false
bu_isciii/templates/viralrecon/RESULTS/excel_generator.py (129 additions, 0 deletions)

@@ -0,0 +1,129 @@
import os
import argparse
import pandas as pd
from typing import List, Dict

# conda activate viralrecon_report
"""Usage: python excel_generator.py ./reference.tmp"""
parser = argparse.ArgumentParser(
    description="Generate excel files from viralrecon results"
)
parser.add_argument(
    "reference_file",
    type=str,
    help="File containing the references used in the analysis",
)

args = parser.parse_args()

print(
    "Extracting references used for analysis and the samples associated with each reference\n"
)
with open(args.reference_file, "r") as file:
    references = [line.rstrip() for line in file]
print(f"\nFound {len(references)} references: {str(references).strip('[]')}")

reference_folders = {ref: str("excel_files_" + ref) for ref in references}
samples_ref_files = {
    ref: str("ref_samples/samples_" + ref + ".tmp") for ref in references
}


def concat_tables_and_write(csvs_in_folder: List[str], merged_csv_name: str):
    """Concatenate any tables that share the same header"""
    if len(csvs_in_folder) == 0:
        print(f"Could not find tables to merge for {merged_csv_name}")
        return
    with open(merged_csv_name, "wb") as merged_csv:
        with open(csvs_in_folder[0], "rb") as f:
            # Copying raw bytes is the fastest way to concatenate csv files
            merged_csv.write(f.read())
        for file in csvs_in_folder[1:]:
            with open(file, "rb") as f:
                next(f)  # skip the header; the first file already provided it
                merged_csv.write(f.read())
    return merged_csv_name


def merge_lineage_tables(
    reference_folders: Dict[str, str], samples_ref_files: Dict[str, str]
):
    """Create the merged tables for pangolin and nextclade"""
    for ref, folder in reference_folders.items():
        print("Merging results for either pangolin or nextclade in a single csv file")
        samples_for_ref = open(samples_ref_files[ref]).read().splitlines()
        if os.path.isdir(os.path.abspath(folder + "/pangolin")):
            pango_dir = os.path.join(folder, "pangolin")
            csvs_in_folder = [
                file.path
                for file in os.scandir(pango_dir)
                # removesuffix() extracts the exact sample name; str.strip()
                # would remove a character set, mangling some sample IDs
                if file.name.removesuffix(".pangolin.csv") in samples_for_ref
            ]
            merged_csv_name = os.path.join(folder, str(ref + "_pangolin.csv"))
            concat_tables_and_write(
                csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name
            )
        else:
            print(f"No pangolin folder could be found for {ref}, omitting")

        if os.path.isdir(os.path.abspath(folder + "/nextclade")):
            nextcl_dir = os.path.join(folder, "nextclade")
            csvs_in_folder = [
                file.path
                for file in os.scandir(nextcl_dir)
                if file.name.removesuffix(".csv") in samples_for_ref
            ]
            merged_csv_name = os.path.join(folder, str(ref + "_nextclade.csv"))
            concat_tables_and_write(
                csvs_in_folder=csvs_in_folder, merged_csv_name=merged_csv_name
            )
        else:
            print(f"No nextclade folder could be found for {ref}, omitting")

    return


def excel_generator(csv_files: List[str]):
    for file in csv_files:
        if not os.path.exists(file):
            print(f"File {file} does not exist, omitting...")
            continue
        print(f"Generating excel file for {file}")
        output_name = str(file.split(".csv")[0] + ".xlsx")
        if "nextclade" in str(file):
            pd.read_csv(file, sep=";", header=0).to_excel(output_name, index=False)
        elif "illumina" in str(file):
            table = pd.read_csv(file, sep="\t", header=0)
            table["analysis_date"] = pd.to_datetime(
                table["analysis_date"].astype(str), format="%Y%m%d"
            )
            table.to_excel(output_name, index=False)
        elif "assembly" in str(file):
            pd.read_csv(file, sep="\t", header=0).to_excel(output_name, index=False)
        else:
            pd.read_csv(file).to_excel(output_name, index=False)
    return


# Merge pangolin and nextclade csv files separately and create excel files for them
merge_lineage_tables(reference_folders, samples_ref_files)
for reference, folder in reference_folders.items():
    print(f"Creating excel files for reference {reference}")
    csv_files = [file.path for file in os.scandir(folder) if file.path.endswith(".csv")]
    excel_generator(csv_files)

# Merge all the variant long tables into one and convert to excel format
variants_tables = [
    table.path for table in os.scandir(".") if "variants_long_table" in table.path
]
concat_tables_and_write(
    csvs_in_folder=variants_tables, merged_csv_name="variants_long_table.csv"
)
pd.read_csv("variants_long_table.csv").to_excel("variants_long_table.xlsx", index=False)

# Create excel files for individual tables
result_tables = ["mapping_illumina.csv", "assembly_stats.csv", "pikavirus_table.csv"]
excel_generator(result_tables)
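One pitfall worth flagging when deriving sample IDs from result filenames, as the pangolin/nextclade merging above does: `str.strip(".pangolin.csv")` treats its argument as a set of characters to remove from both ends, not as a literal suffix, so sample IDs containing those letters get mangled. `str.removesuffix` (Python 3.9+) removes the exact suffix only. A minimal demonstration with a hypothetical filename:

```python
def sample_id(filename: str, suffix: str) -> str:
    """Return the sample ID by removing a literal suffix from a filename."""
    return filename.removesuffix(suffix)


# strip() removes any of the characters {. p a n g o l i c s v} from both
# ends, eating the leading "sa" of the sample name:
print("sample1.pangolin.csv".strip(".pangolin.csv"))  # -> mple1
# removesuffix() removes only the exact trailing string:
print(sample_id("sample1.pangolin.csv", ".pangolin.csv"))  # -> sample1
```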
bu_isciii/templates/viralrecon/RESULTS/viralrecon_results (35 additions, 13 deletions)

@@ -1,26 +1,48 @@
mkdir $(date '+%Y%m%d')_entrega01
cd $(date '+%Y%m%d')_entrega01

mv ../excel_generator.py ./

#Create directories depending on the analysis
mkdir mapping_consensus
mkdir variants_annot
mkdir assembly_spades
mkdir abacas_assembly
mkdir blast
mkdir ref_samples

#Setting up folder and files required for excel_generator.py
cat ../../ANALYSIS/*/samples_ref.txt | cut -f2 | sort -u > references.tmp
cat references.tmp | while read in; do cat ../../ANALYSIS/*/samples_ref.txt | grep ${in} | cut -f 1 > ref_samples/samples_${in}.tmp; done
cat references.tmp | while read in; do mkdir excel_files_${in}; done
cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/pangolin pangolin; cd -; done
cat references.tmp | while read in; do cd excel_files_${in}; ln -s ../../../ANALYSIS/*/*${in}*/variants/ivar/consensus/bcftools/nextclade nextclade; cd -; done

#Create symbolic links to files that are going to be converted to excel
cat references.tmp | while read in; do ln -s ../../ANALYSIS/*/*${in}*/variants/ivar/variants_long_table.csv ${in}_variants_long_table.csv; done

#Create symbolic links depending on the analysis
#Individual files
ln -s ../../ANALYSIS/*_MAG/99-stats/multiqc_report.html ./krona_results.html
ln -s ../../ANALYSIS/*/mapping_illumina*.xlsx ./mapping_illumina.xlsx
ln -s ../../ANALYSIS/*/*/variants/ivar/variants_long_table*.xlsx ./
ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/pangolin/pangolin.xlsx ./pangolin.xlsx
ln -s ../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/nextclade/nextclade.xlsx ./nextclade.xlsx
ln -s ../../ANALYSIS/*/assembly_stats.xlsx ./assembly_stats.xlsx
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.xlsx ./filtered_all_samples_virus_table.xlsx
ln -s ../../ANALYSIS/*/mapping_illumina*.tab ./mapping_illumina.csv
ln -s ../../ANALYSIS/*/assembly_stats.csv ./assembly_stats.csv
ln -s ../../ANALYSIS/*/01-PikaVirus-results/all_samples_virus_table_filtered.csv ./pikavirus_table.csv

#conda activate viralrecon_report
echo "python ./excel_generator.py ./references.tmp" > _01_generate_excel_files.sh
#Cleaning temp files and broken symbolic links
echo "find . -xtype l -delete" > _02_clean_folders.sh
echo 'for dir in */; do find ${dir} -xtype l -delete; done' >> _02_clean_folders.sh
echo "find . -type d -empty -delete" >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do cp excel_files_${in}/*.xlsx ./ ;done' >> _02_clean_folders.sh
echo 'cat references.tmp | while read in; do rm -rf excel_files_${in}; done' >> _02_clean_folders.sh
echo "rm references.tmp" >> _02_clean_folders.sh
echo "rm -rf ref_samples/" >> _02_clean_folders.sh
echo "rm ./*.csv" >> _02_clean_folders.sh
echo "mkdir excel_files"
echo 'mv *.xlsx excel_files/'

#Folders
cd mapping_consensus; ln -s ../../../ANALYSIS/*/*/variants/ivar/consensus/bcftools/*.consensus.fa .; cd -
cd variants_annot; ln -s ../../../ANALYSIS/*/*/variants/ivar/snpeff/*.snpsift.txt .; cd -
#Create symbolic links to results for every process of the pipeline
cd mapping_consensus; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/consensus/bcftools/${arr[0]}.consensus.fa ./${arr[0]}_${arr[1]}.consensus.fa; done; cd -
cd variants_annot; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/variants/ivar/snpeff/${arr[0]}.snpsift.txt ./${arr[0]}_${arr[1]}.snpsift.txt; done; cd -
cd assembly_spades; rsync -rlv ../../../ANALYSIS/*/*/assembly/spades/rnaviral/*.scaffolds.fa.gz .; gunzip *.scaffolds.fa.gz; cd -
cd abacas_assembly; ln -s ../../../ANALYSIS/*/*/assembly/spades/rnaviral/abacas/*.abacas.fasta .; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
cd abacas_assembly; cat ../../../ANALYSIS/*/samples_ref.txt | while read in; do arr=($in); ln -s ../../../ANALYSIS/*/*${arr[1]}*/assembly/spades/rnaviral/abacas/${arr[0]}.abacas.fasta ./${arr[0]}_${arr[1]}.abacas.fasta; done; cd -
cd blast; ln -s ../../../ANALYSIS/*/*.blast.filt.header.xlsx .; cd -
Review comment:
Could you add to the templates a template of the samples_ref from ANALYSIS and a template of the reference_file needed to launch the excel_generator?

Reply:
Yes, should I fill them with real names as examples, or with something like "sample" "reference" "host"?