Merge pull request #159 from eastgenomics/IN-384_v2.6.0

IN-384_v2.6.0 - Adding variant counts as DNAnexus file details
eastgenomics · Dec 18, 2023 · b1daf06 · b1daf06
2 parents 4b914db + 9cdfcf6
commit b1daf06
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 12 deletions.
diff --git a/Readme.md b/Readme.md
@@ -198,6 +198,16 @@ This is the source code for an app that runs on the DNAnexus Platform.
 For more information about how to run or modify it, see
 https://documentation.dnanexus.com/.
 
+
+## File details
+The app also adds `details` metadata, consisting of variant counts, to the output xlsx report file in DNAnexus. Example file details when `-isummary=Dias`, `-iclinical_indication=R208.1_Inherited breast cancer and ovarian cancer_P` and an `-ifilter` are provided, with `-ikeep_filtered=True`:
+```
+  "clinical_indication": "R208.1_Inherited breast cancer and ovarian cancer_P",
+  "included": 10,
+  "excluded": 255
+```
+Note: if `-isummary` is not Dias, `-ikeep_filtered=False`, and `-iclinical_indication` is not provided, then the only detail added to the file would be `"included": 10`. If, in addition, no filtering is performed, then only `"variants": 265` would be added as details.
+
 #### This app was made by EMEE GLH
 
 [bcftools]: https://samtools.github.io/bcftools/bcftools.html#filter

diff --git a/dxapp.json b/dxapp.json
@@ -3,8 +3,8 @@
   "title": "eggd_generate_variant_workbook",
   "summary": "Create Excel workbook from VEP annotated vcf",
   "dxapi": "1.0.0",
-  "version": "2.5.0",
-  "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet;  * v2.4.0 Added handling for duplicate annotation in VEP fields (i.e. cosmic, CGC, etc..); v2.5.0 Better parsing of CombinedVariantOutput files as additional files",
+  "version": "2.6.0",
+  "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet;  * v2.4.0 Added handling for duplicate annotation in VEP fields (i.e. cosmic, CGC, etc..); * v2.5.0 Better parsing of CombinedVariantOutput files as additional files; * v2.6.0 Add variant counts as DNAnexus file details to the .xlsx workbook",
   "authorizedUsers": [
     "org-emee_1"
   ],

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 colour==0.1.5
-openpyxl==3.0.9
+openpyxl==3.1.2
 pandas==1.3.5
 et-xmlfile==1.1.0
 filetype==1.1.0

diff --git a/resources/home/dnanexus/generate_workbook/utils/excel.py b/resources/home/dnanexus/generate_workbook/utils/excel.py
@@ -1,5 +1,7 @@
 from collections import defaultdict
+import json
 import operator
+import os
 from pathlib import Path
 import re
 from string import ascii_uppercase as uppercase
@@ -181,7 +183,7 @@ def helios_summary(self) -> None:
 
         self.summary.cell(9, 1).value = "Variant totals"
 
-        to_bold.extend(["A1", "A2", "A4", "A5", "A6", "A9"])      
+        to_bold.extend(["A1", "A2", "A4", "A5", "A6", "A9"])
 
         # get sample name from vcf, should only be one but handle everything
         # list-wise just in case
@@ -228,7 +230,7 @@ def helios_summary(self) -> None:
                     # not 4 cols => didn't parse out just sample values in
                     # utils.parse_metrics => skip
                     continue
-                
+
                 # specific metrics lines we want to parse out
                 idxs = []
                 idxs.append(df[0].eq('Metric (UOM)').idxmax())
@@ -249,7 +251,7 @@ def helios_summary(self) -> None:
                     self.summary.cell(row_count, 2).value = lsl
                     self.summary.cell(row_count, 3).value = usl
                     self.summary.cell(row_count, 4).value = sample
-                
+
                     # perform colouring like in self.colour_metrics(), lazily
                     # catch anything in case of weird values to not break
                     try:
@@ -278,10 +280,10 @@ def helios_summary(self) -> None:
                             "WARNING: error in colouring metrics values in "
                             f"summary sheet: {err}.\nContinuing without colouring"
                         )
-                    
+
                     to_bold.append(f"A{row_count}")
                     row_count += 1
-                
+
                 # do the colouring
                 for colour, idxs in colouring.items():
                     for idx in idxs:
@@ -301,7 +303,7 @@ def helios_summary(self) -> None:
                                 start_color='b30000'
                             )
         row_count += 2
-        
+
         # Parsing of TMB/MSI/Gene Amplifications into summary
         for _, df in self.additional_files.items():
             if df.empty:
@@ -327,7 +329,7 @@ def helios_summary(self) -> None:
             for ref in list(set(self.refs)):
                 self.summary.cell(row_count, 2).value = ref
                 row_count += 1
-            
+
             row_count += 2
 
         if self.args.human_filter:
@@ -374,6 +376,7 @@ def dias_summary(self) -> None:
             - sample ID, panel(s), run IDs etc.
             - formatted tables for them to fill in reporting
         """
+        details_dict = defaultdict()
         # write titles for summary values
         self.summary.cell(1, 1).value = "Sample ID:"
         self.summary.cell(1, 5).value = "Clinical Indication(s):"
@@ -394,6 +397,13 @@ def dias_summary(self) -> None:
         self.summary.cell(1, 6).value = self.args.clinical_indication
         self.summary.cell(2, 6).value = self.args.panel
 
+        # If clinical indication given as arg, add this to our dict
+        # Write out the dias clinical indication info to JSON file
+        if self.args.clinical_indication:
+            details_dict['clinical_indication'] = self.args.clinical_indication
+            with open('details.json', 'w', encoding='utf8') as details_json:
+                json.dump(details_dict, details_json)
+
         # write total rows in each sheet
         count = 34
 
@@ -406,6 +416,7 @@ def dias_summary(self) -> None:
             to_bold.append(f"A{count}")
             count += 1
 
+
         count += 5
 
         # write genome reference(s) parsed from vcf header
@@ -724,6 +735,16 @@ def write_variants(self) -> None:
                 "this may take a few minutes..."
             )
 
+        # If details.json already exists (dias summary page written and
+        # clinical indication is given as arg) then
+        # open it and read it in so we can add var counts to the dict
+        # otherwise just make a new empty dict
+        if os.path.isfile('details.json'):
+            with open('details.json', 'r', encoding='utf8') as details_json:
+                details_dict = json.load(details_json)
+        else:
+            details_dict = defaultdict()
+
         with self.writer:
             # add variants
             for sheet, vcf in zip(self.args.sheets, self.vcfs):
@@ -732,6 +753,7 @@ def write_variants(self) -> None:
                     f"\nWriting {len(vcf)} rows to {sheet} sheet "
                     f"({sheet_no}/{len(self.args.sheets)})"
                 )
+                details_dict[sheet] = len(vcf)
 
                 # timing how long it takes to write because it's slow
                 start = timer()
@@ -764,6 +786,10 @@ def write_variants(self) -> None:
                 self.set_types(curr_worksheet)
                 self.workbook.save(self.args.output)
 
+        # Write out dict to file
+        with open('details.json', 'w', encoding='utf8') as details_json:
+            json.dump(details_dict, details_json)
+
 
     def write_additional_files(self) -> None:
         """

diff --git a/src/code.sh b/src/code.sh
@@ -4,7 +4,7 @@ set -exo pipefail
 
 _dias_report_setup () {
     # function to handle parsing values and reading
-    # manifest / g2t etc. for Dias sampels
+    # manifest / g2t etc. for Dias samples
     mark-section "Getting output name for Dias"
 
     project_id=$DX_PROJECT_CONTEXT_ID
@@ -122,7 +122,8 @@ main() {
     fi
 
     mark-section "Uploading output"
-    output_xlsx=$(dx upload /home/dnanexus/out/xlsx_reports/* --brief)
+    JSON_DETAILS=$(cat details.json)
+    output_xlsx=$(dx upload /home/dnanexus/out/xlsx_reports/* --brief --details "$JSON_DETAILS")
     dx-jobutil-add-output xlsx_report "$output_xlsx" --class=file
 
     if [ "$keep_tmp" == true ]; then