Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rule for calculating CPM of charged / uncharged #28

Merged
merged 1 commit into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions workflow/rules/aatrnaseq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ rule get_final_bam_and_charg_prob:
output:
classified_bam=os.path.join(outdir, "classified_bams", "{sample}.bam"),
classified_bam_bai=os.path.join(outdir, "classified_bams", "{sample}.bam.bai"),
charging_tab=os.path.join(outdir, "tables", "{sample}.charging_prob.tsv.gz"),
charging_tab=os.path.join(
outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
),
log:
os.path.join(outdir, "logs", "final_bams_and_tabs", "{sample}"),
params:
Expand All @@ -186,6 +188,29 @@ rule get_final_bam_and_charg_prob:
"""


rule get_cca_trna_cpm:
"""
calculate cpm for cca classified trnas
"""
input:
charging_tab=rules.get_final_bam_and_charg_prob.output.charging_tab,
output:
cpm=os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
log:
os.path.join(outdir, "logs", "cca_trna_cpm", "{sample}"),
params:
src=SCRIPT_DIR,
# XXX move `ml_thresh` to config file
ml_thresh=200,
shell:
"""
python {params.src}/get_trna_charging_cpm.py \
--input {input.charging_tab} \
--output {output.cpm} \
--ml-threshold {params.ml_thresh}
"""


rule bcerror:
"""
extract base calling error metrics to tsv file
Expand All @@ -194,7 +219,7 @@ rule bcerror:
bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
output:
tsv=os.path.join(outdir, "tables", "{sample}.bcerror.tsv.gz"),
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
log:
os.path.join(outdir, "logs", "bcerror", "{sample}.bwa"),
params:
Expand All @@ -218,7 +243,7 @@ rule align_stats:
aligned=rules.bwa_align.output.bam,
classified=rules.get_final_bam_and_charg_prob.output.classified_bam,
output:
tsv=os.path.join(outdir, "tables", "{sample}.align_stats.tsv.gz"),
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
log:
os.path.join(outdir, "logs", "stats", "{sample}.align_stats"),
params:
Expand All @@ -240,10 +265,14 @@ rule bam_to_coverage:
bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
output:
counts_tmp=temp(os.path.join(outdir, "tables", "{sample}.counts.bg")),
cpm_tmp=temp(os.path.join(outdir, "tables", "{sample}.cpm.bg")),
counts=protected(os.path.join(outdir, "tables", "{sample}.counts.bg.gz")),
cpm=protected(os.path.join(outdir, "tables", "{sample}.cpm.bg.gz")),
counts_tmp=temp(
os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg")
),
cpm_tmp=temp(os.path.join(outdir, "tables", "{sample}", "{sample}.cpm.bg")),
counts=protected(
os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg.gz")
),
cpm=protected(os.path.join(outdir, "tables", "{sample}", "{sample}.cpm.bg.gz")),
params:
bg_opts=config["opts"]["coverage"],
log:
Expand Down Expand Up @@ -282,7 +311,7 @@ rule remora_signal_stats:
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
pod5=rules.merge_pods.output,
output:
tsv=os.path.join(outdir, "tables", "{sample}.remora.tsv.gz"),
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),
log:
os.path.join(outdir, "logs", "remora", "{sample}"),
params:
Expand Down
18 changes: 12 additions & 6 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,27 @@ samples = find_raw_inputs(samples)
# Define target files for rule all
def pipeline_outputs():
outs = expand(
os.path.join(outdir, "tables", "{sample}.charging_prob.tsv.gz"),
os.path.join(outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"),
sample=samples.keys(),
)

outs += expand(
os.path.join(outdir, "tables", "{sample}.bcerror.tsv.gz"), sample=samples.keys()
os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
sample=samples.keys(),
)

outs += expand(
os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
sample=samples.keys(),
)

outs += expand(
os.path.join(outdir, "tables", "{sample}.align_stats.tsv.gz"),
os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
sample=samples.keys(),
)

outs += expand(
os.path.join(outdir, "tables", "{sample}.{values}.bg.gz"),
os.path.join(outdir, "tables", "{sample}", "{sample}.{values}.bg.gz"),
sample=samples.keys(),
values=["cpm", "counts"],
)
Expand All @@ -127,12 +133,12 @@ def pipeline_outputs():
and config["remora_kmer_table"] is not None
):
outs += expand(
os.path.join(outdir, "tables", "{sample}.remora.tsv.gz"),
os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),
sample=samples.keys(),
)

# if "trna_table" in config and config["trna_table"] != "" and config["trna_table"] is not None:
# outs += expand(os.path.join(outdir, "tables", "{sample}.charging_status.tsv"),
# outs += expand(os.path.join(outdir, "tables", "{sample}", "{sample}.charging_status.tsv"),
# sample = samples.keys())

return outs
Expand Down
71 changes: 71 additions & 0 deletions workflow/scripts/get_trna_charging_cpm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#! /usr/bin/env python

"""
and collapses the output of running a Remora CCA model and extracting
per-read information on charging likelihood into an ML tag into
per-isodecoder counts (and CPM-normalized counts) of charged and uncharged
tRNAs as determined by the model with a ML >= 200 threshold.

tRNA-AA-anticodon-family-species-chargingref are all preserved from BWA alignment,
and can be further collapsed as desired in downstream analysis

CPM normalization reflects counts per million reads that passed alingnment and
the filtering parameters for Remora classification; these are full length tRNA
"""

import pandas as pd
import gzip


def per_read_charging(input, output, threshold):
# Read the TSV file into a DataFrame
df = pd.read_csv(input, sep="\t")

# Categorize tRNAs as charged or uncharged
df["status"] = df["charging_likelihood"].apply(
lambda x: "charged" if x >= 200 else "uncharged"
)

# Group by tRNA and status to get counts
count_data = df.groupby(["tRNA", "status"]).size().unstack(fill_value=0)

# Get total number of reads in the file
total_reads = len(df)

# Normalize counts by CPM
count_data["charged_CPM"] = (count_data["charged"] / total_reads) * 1e6
count_data["uncharged_CPM"] = (count_data["uncharged"] / total_reads) * 1e6

if output.endswith(".gz"):
output_file = gzip.open(output, "wt")
else:
output_file = open(output, "w")

# Write the results to a new file
count_data.to_csv(output_file, sep="\t")


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(
description="Process a TSV file to categorize tRNAs as charged or uncharged."
)
parser.add_argument(
"--input", type=str, help="Path to the input TSV file", required=True
)
parser.add_argument(
"--output", type=str, help="Path to the output TSV file", required=True
)
parser.add_argument(
"--ml-threshold",
type=int,
default=200,
help="Threshold for classifying tRNAs as charged (default: 200)",
)

# Parse arguments
args = parser.parse_args()

# Process the selected file with the given threshold
per_read_charging(args.input, args.output, args.ml_threshold)
Loading