update docs; add filtering

singleron-RD · Jun 21, 2024 · 1c5aa5a · 1c5aa5a
1 parent 9d107db
commit 1c5aa5a
Show file tree

Hide file tree

Showing 11 changed files with 91 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,8 +3,3 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## 1.1.0 - [2024-06-05]
-
-### `Added`
-- Make `--genes` and `--fasta` parameter required.
-- Update freebayes_args and bcftools_filter_args to filter more variants.
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,5 +1,5 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/singleron-RD/sccite/" target="_blank">singleron-RD/sccite/1.1.0/</a>
+  This report has been generated by the <a href="https://github.com/singleron-RD/sccite/" target="_blank">singleron-RD/sccite/1.0.0/</a>
   analysis pipeline.
 report_section_order:
   "singleron-RD-sccite-methods-description":

diff --git a/bin/tag_barcode.py b/bin/tag_barcode.py
@@ -10,6 +10,8 @@
 import utils
 from __init__ import ASSAY
 
+MAX_UMI_FRACTION = 0.01
+
 
 def get_tag_barcode_mismatch_dict(barcode_dict):
     mismatch_dict = {}
@@ -59,7 +61,7 @@ def run(self):
         total_reads = 0
         tag_reads = 0
         incell_reads = 0
-        bc_antibody_umi = utils.nested_defaultdict(dim=3)
+        bc_antibody_umi_read = utils.nested_defaultdict(dim=3)
         with pysam.FastxFile(self.args.fq) as infile:
             for record in infile:
                 total_reads += 1
@@ -70,34 +72,64 @@ def run(self):
                     tag_reads += 1
                     if bc in bcs:
                         antibody = mismatch_dict[tag_barcode]
-                        bc_antibody_umi[bc][antibody][umi] += 1
+                        bc_antibody_umi_read[bc][antibody][umi] += 1
                         incell_reads += 1
 
         # median umi per cell
         bc_umi = defaultdict(int)
-        bc_antibody = utils.nested_defaultdict(dim=2)
-        for bc in bc_antibody_umi:
-            for antibody in bc_antibody_umi[bc]:
-                umi_count = len(bc_antibody_umi[bc][antibody])
+        bc_antibody_umi = utils.nested_defaultdict(dim=2)
+        antibody_umi = defaultdict(int)
+        for bc in bc_antibody_umi_read:
+            for antibody in bc_antibody_umi_read[bc]:
+                umi_count = len(bc_antibody_umi_read[bc][antibody])
                 bc_umi[bc] += umi_count
-                bc_antibody[bc][antibody] = umi_count
+                bc_antibody_umi[bc][antibody] = umi_count
+                antibody_umi[antibody] += umi_count
         median_umi_per_cell = np.median(list(bc_umi.values()))
 
+        # write raw csv
+        df = pd.DataFrame.from_dict(bc_antibody_umi)
+        df = df.reindex(columns=bcs_list, fill_value=0)
+        df.fillna(0, inplace=True)
+        df = df.astype(int)
+        df.to_csv(f"{self.args.sample}.raw.tag_barcode.csv.gz")
+
+        # filter aggregate barcodes
+        rows = []
+        for bc in bc_antibody_umi:
+            for antibody in bc_antibody_umi[bc]:
+                cur = bc_antibody_umi[bc][antibody]
+                if cur > 10000:
+                    frac = cur / antibody_umi[antibody]
+                    if frac > MAX_UMI_FRACTION:
+                        rows.append(
+                            {
+                                "barcode": bc,
+                                "antibody": antibody,
+                                "umi_count": cur,
+                                "total_umi_count": antibody_umi[antibody],
+                                "fraction": round(frac, 3),
+                            }
+                        )
+                        bc_antibody_umi[bc][antibody] = 0
+        df = pd.DataFrame.from_dict(bc_antibody_umi)
+        df = df.reindex(columns=bcs_list, fill_value=0)
+        df.fillna(0, inplace=True)
+        df = df.astype(int)
+        df.to_csv(f"{self.args.sample}.filtered.tag_barcode.csv.gz")
+
+        df_remove = pd.DataFrame(rows)
+        df_remove.to_csv(f"{self.args.sample}.aggregate.csv", index=False)
+
         # metrics
         stats = {}
         stats["Number of Cells"] = len(bcs)
         stats["Fraction Tag Reads"] = utils.get_frac(tag_reads / total_reads)
         stats["Fraction Tag Reads in Cell"] = utils.get_frac(incell_reads / tag_reads)
         stats["Median UMI per Cell"] = median_umi_per_cell
+        stats["Aggregate Barcodes"] = len(set(df_remove["barcode"]))
         utils.write_multiqc(stats, self.args.sample, ASSAY, "tag_barcode" + ".stats")
 
-        # write csv
-        df = pd.DataFrame.from_dict(bc_antibody)
-        df = df.reindex(columns=bcs_list, fill_value=0)
-        df.fillna(0, inplace=True)
-        df = df.astype(int)
-        df.to_csv(f"{self.args.sample}.tag_barcode.csv.gz")
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

diff --git a/conf/modules.config b/conf/modules.config
@@ -22,6 +22,13 @@ process {
         ext.args = '--quiet'
     }
 
+    withName: TAG_BARCODE {
+        publishDir = [
+            path: { "${params.outdir}/tag_barcode" },
+            mode: 'copy'
+        ]
+    }
+
 
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [

diff --git a/docs/output.md b/docs/output.md
@@ -1,20 +1,23 @@
 - [Main output](#main-output)
 - [Modules](#modules)
-  - [fastqc](#fastqc)
+  - [extract](#extract)
+  - [tag\_barcode](#tag_barcode)
   - [multiqc-sgr](#multiqc-sgr)
   - [pipeline\_info](#pipeline_info)
+  - [fastqc(optional)](#fastqcoptional)
 
 # Main output
 
-- `tag_barcode/{sample}.tag_barcode.csv.gz` antibody-derived tags (ADT) UMI matrix.
+- `tag_barcode/{sample}.filtered.tag_barcode.csv.gz` antibody-derived tags (ADT) UMI matrix aftering filtering protein aggregation. Protein aggregates in antibody staining experiments may cause a significant increase in the UMI counts of a few cell barcodes. To address this issue, the `tag_barcode` module 
+removes any antibody UMI counts that exceed max(10000, total antibody UMI counts * 0.01).
 
-Code snippet using [Seurat V5](https://satijalab.org/seurat/articles/multimodal_vignette.html)
+Downstream analysis code snippet using [Seurat V5](https://satijalab.org/seurat/articles/multimodal_vignette.html)
 ```R
 library(Seurat)
 # Load in the RNA UMI matrix
 cbmc.rna = Read10X('data/match/X/filtered')
 # Load in the ADT UMI matrix
-cbmc.adt = as.sparse(read.csv(file = "sccite/test/outs/tag_barcode/X.tag_barcode.csv.gz",
+cbmc.adt = as.sparse(read.csv(file = "sccite/test/outs/tag_barcode/X.filtered.tag_barcode.csv.gz",
     sep = ",", header = TRUE, row.names = 1))
 all.equal(colnames(cbmc.rna), colnames(cbmc.adt))
 cbmc <- CreateSeuratObject(counts = cbmc.rna,names.delim = ' ')
@@ -27,15 +30,21 @@ cbmc[["ADT"]] <- adt_assay
 
 # Modules
 
-## fastqc
+## extract
 
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+Extract valid barcode and UMI from R1 fastqs and add them to the read name of R2 fastqs.
 
-**Output files**
+**Output files** 
+- `{sample}.{R2}.fq.gz` Fastq files with barcodes added.
 
-- `*_fastqc.html`: FastQC report containing quality metrics.
-- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+## tag_barcode
 
+Locate the tag barcode in the R2 reads, generate the ADT UMI matrix, and filter out the abnormally high UMI counts.
+
+**Output files** 
+- `{sample}.raw.tag_barcode.csv.gz` Raw ADT UMI matrix.
+- `{sample}.filtered.tag_barcode.csv.gz` Filtered ADT UMI matrix.
+- `{sample}.aggregate.csv` Barcode and antibody combinations removed due to abnormally high UMIs.
 
 ## multiqc-sgr
 
@@ -59,3 +68,12 @@ cbmc[["ADT"]] <- adt_assay
 - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline.
 - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
 - Parameters used by the pipeline run: `params.json`.
+
+## fastqc(optional)
+
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
+**Output files**
+
+- `*_fastqc.html`: FastQC report containing quality metrics.
+- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
diff --git a/docs/usage.md b/docs/usage.md
@@ -5,7 +5,7 @@
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. An example `samplesheet.csv` can be found [here](https://github.com/singleron-RD/sccite_test_data)
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. An example `samplesheet.csv` can be found [here](https://github.com/singleron-RD/sccite_test_data/test1).
 
 ```bash
 --input '[path to samplesheet file]'

diff --git a/modules/local/extract.nf b/modules/local/extract.nf
@@ -11,7 +11,7 @@ process EXTRACT {
     val protocol
 
     output:
-    tuple val(meta), path("${meta.id}_R*.fq*"),  emit: reads
+    tuple val(meta), path("${meta.id}_R2.fq.gz"),  emit: reads
     tuple val(meta), path("*.json"),  emit: json
     path  "versions.yml" , emit: versions
 

diff --git a/modules/local/tag_barcode.nf b/modules/local/tag_barcode.nf
@@ -12,7 +12,7 @@ process TAG_BARCODE {
 
     output:
     tuple val(meta), path("*.json"),  emit: json
-    tuple val(meta), path("*.csv.gz"),  emit: csv
+    tuple val(meta), path("*.csv*"),  emit: csv
     path  "versions.yml" , emit: versions
 
     script:

diff --git a/multiqc_sgr/multiqc_sgr/sccite.py b/multiqc_sgr/multiqc_sgr/sccite.py
@@ -113,6 +113,12 @@ def general_stats_table(self, summary_data):
                 "scale": "blue",
                 "format": "{:,.0f}",
             },
+            "Aggregate Barcodes": {
+                "title": "Aggregate Barcodes",
+                "description": "Number of cell barcodes with protein aggregation. Ideally < 10",
+                "scale": "blue",
+                "format": "{:,.0f}",
+            },
         }
         self.general_stats_addcols(summary_data, headers=headers)
 

diff --git a/nextflow.config b/nextflow.config
@@ -244,7 +244,7 @@ manifest {
     description     = """nextflow pipeline"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '1.1.0'
+    version         = '1.0.0'
     doi             = ''
 }
 

diff --git a/subworkflows/local/utils_nfcore_sccite_pipeline/main.nf b/subworkflows/local/utils_nfcore_sccite_pipeline/main.nf
@@ -177,8 +177,6 @@ def toolBibliographyText() {
     // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
     // Uncomment function in methodsDescriptionText to render in MultiQC report
     def reference_text = [
-            "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.</li>",
-            "<li>Dobin A, etc. Kaminow, Benjamin, Dinar Yunusov, and Alexander Dobin. STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data. Biorxiv (2021): 2021-05.</li>",
             "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>"
         ].join(' ').trim()