Skip to content

Commit

Permalink
update docs; add filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
zhouyiqi91 committed Jun 21, 2024
1 parent 9d107db commit 1c5aa5a
Show file tree
Hide file tree
Showing 11 changed files with 91 additions and 35 deletions.
5 changes: 0 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,3 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 1.1.0 - [2024-06-05]

### `Added`
- Make `--genes` and `--fasta` parameter required.
- Update freebayes_args and bcftools_filter_args to filter more variants.
2 changes: 1 addition & 1 deletion assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
report_comment: >
This report has been generated by the <a href="https://github.com/singleron-RD/sccite/" target="_blank">singleron-RD/sccite/1.1.0/</a>
This report has been generated by the <a href="https://github.com/singleron-RD/sccite/" target="_blank">singleron-RD/sccite/1.0.0/</a>
analysis pipeline.
report_section_order:
"singleron-RD-sccite-methods-description":
Expand Down
60 changes: 46 additions & 14 deletions bin/tag_barcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import utils
from __init__ import ASSAY

MAX_UMI_FRACTION = 0.01


def get_tag_barcode_mismatch_dict(barcode_dict):
mismatch_dict = {}
Expand Down Expand Up @@ -59,7 +61,7 @@ def run(self):
total_reads = 0
tag_reads = 0
incell_reads = 0
bc_antibody_umi = utils.nested_defaultdict(dim=3)
bc_antibody_umi_read = utils.nested_defaultdict(dim=3)
with pysam.FastxFile(self.args.fq) as infile:
for record in infile:
total_reads += 1
Expand All @@ -70,34 +72,64 @@ def run(self):
tag_reads += 1
if bc in bcs:
antibody = mismatch_dict[tag_barcode]
bc_antibody_umi[bc][antibody][umi] += 1
bc_antibody_umi_read[bc][antibody][umi] += 1
incell_reads += 1

# median umi per cell
bc_umi = defaultdict(int)
bc_antibody = utils.nested_defaultdict(dim=2)
for bc in bc_antibody_umi:
for antibody in bc_antibody_umi[bc]:
umi_count = len(bc_antibody_umi[bc][antibody])
bc_antibody_umi = utils.nested_defaultdict(dim=2)
antibody_umi = defaultdict(int)
for bc in bc_antibody_umi_read:
for antibody in bc_antibody_umi_read[bc]:
umi_count = len(bc_antibody_umi_read[bc][antibody])
bc_umi[bc] += umi_count
bc_antibody[bc][antibody] = umi_count
bc_antibody_umi[bc][antibody] = umi_count
antibody_umi[antibody] += umi_count
median_umi_per_cell = np.median(list(bc_umi.values()))

# write raw csv
df = pd.DataFrame.from_dict(bc_antibody_umi)
df = df.reindex(columns=bcs_list, fill_value=0)
df.fillna(0, inplace=True)
df = df.astype(int)
df.to_csv(f"{self.args.sample}.raw.tag_barcode.csv.gz")

# filter aggregate barcodes
rows = []
for bc in bc_antibody_umi:
for antibody in bc_antibody_umi[bc]:
cur = bc_antibody_umi[bc][antibody]
if cur > 10000:
frac = cur / antibody_umi[antibody]
if frac > MAX_UMI_FRACTION:
rows.append(
{
"barcode": bc,
"antibody": antibody,
"umi_count": cur,
"total_umi_count": antibody_umi[antibody],
"fraction": round(frac, 3),
}
)
bc_antibody_umi[bc][antibody] = 0
df = pd.DataFrame.from_dict(bc_antibody_umi)
df = df.reindex(columns=bcs_list, fill_value=0)
df.fillna(0, inplace=True)
df = df.astype(int)
df.to_csv(f"{self.args.sample}.filtered.tag_barcode.csv.gz")

df_remove = pd.DataFrame(rows)
df_remove.to_csv(f"{self.args.sample}.aggregate.csv", index=False)

# metrics
stats = {}
stats["Number of Cells"] = len(bcs)
stats["Fraction Tag Reads"] = utils.get_frac(tag_reads / total_reads)
stats["Fraction Tag Reads in Cell"] = utils.get_frac(incell_reads / tag_reads)
stats["Median UMI per Cell"] = median_umi_per_cell
stats["Aggregate Barcodes"] = len(set(df_remove["barcode"]))
utils.write_multiqc(stats, self.args.sample, ASSAY, "tag_barcode" + ".stats")

# write csv
df = pd.DataFrame.from_dict(bc_antibody)
df = df.reindex(columns=bcs_list, fill_value=0)
df.fillna(0, inplace=True)
df = df.astype(int)
df.to_csv(f"{self.args.sample}.tag_barcode.csv.gz")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand Down
7 changes: 7 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ process {
ext.args = '--quiet'
}

withName: TAG_BARCODE {
publishDir = [
path: { "${params.outdir}/tag_barcode" },
mode: 'copy'
]
}


withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
Expand Down
36 changes: 27 additions & 9 deletions docs/output.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
- [Main output](#main-output)
- [Modules](#modules)
- [fastqc](#fastqc)
- [extract](#extract)
- [tag\_barcode](#tag_barcode)
- [multiqc-sgr](#multiqc-sgr)
- [pipeline\_info](#pipeline_info)
- [fastqc(optional)](#fastqcoptional)

# Main output

- `tag_barcode/{sample}.tag_barcode.csv.gz` antibody-derived tags (ADT) UMI matrix.
- `tag_barcode/{sample}.filtered.tag_barcode.csv.gz` antibody-derived tags (ADT) UMI matrix aftering filtering protein aggregation. Protein aggregates in antibody staining experiments may cause a significant increase in the UMI counts of a few cell barcodes. To address this issue, the `tag_barcode` module
removes any antibody UMI counts that exceed max(10000, total antibody UMI counts * 0.01).

Code snippet using [Seurat V5](https://satijalab.org/seurat/articles/multimodal_vignette.html)
Downstream analysis code snippet using [Seurat V5](https://satijalab.org/seurat/articles/multimodal_vignette.html)
```R
library(Seurat)
# Load in the RNA UMI matrix
cbmc.rna = Read10X('data/match/X/filtered')
# Load in the ADT UMI matrix
cbmc.adt = as.sparse(read.csv(file = "sccite/test/outs/tag_barcode/X.tag_barcode.csv.gz",
cbmc.adt = as.sparse(read.csv(file = "sccite/test/outs/tag_barcode/X.filtered.tag_barcode.csv.gz",
sep = ",", header = TRUE, row.names = 1))
all.equal(colnames(cbmc.rna), colnames(cbmc.adt))
cbmc <- CreateSeuratObject(counts = cbmc.rna,names.delim = ' ')
Expand All @@ -27,15 +30,21 @@ cbmc[["ADT"]] <- adt_assay

# Modules

## fastqc
## extract

[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
Extract valid barcode and UMI from R1 fastqs and add them to the read name of R2 fastqs.

**Output files**
**Output files**
- `{sample}.{R2}.fq.gz` Fastq files with barcodes added.

- `*_fastqc.html`: FastQC report containing quality metrics.
- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
## tag_barcode

Locate the tag barcode in the R2 reads, generate the ADT UMI matrix, and filter out the abnormally high UMI counts.

**Output files**
- `{sample}.raw.tag_barcode.csv.gz` Raw ADT UMI matrix.
- `{sample}.filtered.tag_barcode.csv.gz` Filtered ADT UMI matrix.
- `{sample}.aggregate.csv` Barcode and antibody combinations removed due to abnormally high UMIs.

## multiqc-sgr

Expand All @@ -59,3 +68,12 @@ cbmc[["ADT"]] <- adt_assay
- Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline.
- Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
- Parameters used by the pipeline run: `params.json`.

## fastqc(optional)

[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).

**Output files**

- `*_fastqc.html`: FastQC report containing quality metrics.
- `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## Samplesheet input

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. An example `samplesheet.csv` can be found [here](https://github.com/singleron-RD/sccite_test_data)
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. An example `samplesheet.csv` can be found [here](https://github.com/singleron-RD/sccite_test_data/test1).

```bash
--input '[path to samplesheet file]'
Expand Down
2 changes: 1 addition & 1 deletion modules/local/extract.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process EXTRACT {
val protocol

output:
tuple val(meta), path("${meta.id}_R*.fq*"), emit: reads
tuple val(meta), path("${meta.id}_R2.fq.gz"), emit: reads
tuple val(meta), path("*.json"), emit: json
path "versions.yml" , emit: versions

Expand Down
2 changes: 1 addition & 1 deletion modules/local/tag_barcode.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process TAG_BARCODE {

output:
tuple val(meta), path("*.json"), emit: json
tuple val(meta), path("*.csv.gz"), emit: csv
tuple val(meta), path("*.csv*"), emit: csv
path "versions.yml" , emit: versions

script:
Expand Down
6 changes: 6 additions & 0 deletions multiqc_sgr/multiqc_sgr/sccite.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ def general_stats_table(self, summary_data):
"scale": "blue",
"format": "{:,.0f}",
},
"Aggregate Barcodes": {
"title": "Aggregate Barcodes",
"description": "Number of cell barcodes with protein aggregation. Ideally < 10",
"scale": "blue",
"format": "{:,.0f}",
},
}
self.general_stats_addcols(summary_data, headers=headers)

Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ manifest {
description = """nextflow pipeline"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '1.1.0'
version = '1.0.0'
doi = ''
}

Expand Down
2 changes: 0 additions & 2 deletions subworkflows/local/utils_nfcore_sccite_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,6 @@ def toolBibliographyText() {
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
// Uncomment function in methodsDescriptionText to render in MultiQC report
def reference_text = [
"<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.</li>",
"<li>Dobin A, etc. Kaminow, Benjamin, Dinar Yunusov, and Alexander Dobin. STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data. Biorxiv (2021): 2021-05.</li>",
"<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>"
].join(' ').trim()

Expand Down

0 comments on commit 1c5aa5a

Please sign in to comment.