rnabioco · jayhesselberth · Jan 8, 2025 · Jan 8, 2025
diff --git a/workflow/rules/aatrnaseq.smk b/workflow/rules/aatrnaseq.smk
@@ -163,7 +163,9 @@ rule get_final_bam_and_charg_prob:
     output:
         classified_bam=os.path.join(outdir, "classified_bams", "{sample}.bam"),
         classified_bam_bai=os.path.join(outdir, "classified_bams", "{sample}.bam.bai"),
-        charging_tab=os.path.join(outdir, "tables", "{sample}.charging_prob.tsv.gz"),
+        charging_tab=os.path.join(
+            outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
+        ),
     log:
         os.path.join(outdir, "logs", "final_bams_and_tabs", "{sample}"),
     params:
@@ -186,6 +188,29 @@ rule get_final_bam_and_charg_prob:
     """
 
 
+rule get_cca_trna_cpm:
+    """
+    calculate cpm for cca classified trnas
+    """
+    input:
+        charging_tab=rules.get_final_bam_and_charg_prob.output.charging_tab,
+    output:
+        cpm=os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
+    log:
+        os.path.join(outdir, "logs", "cca_trna_cpm", "{sample}"),
+    params:
+        src=SCRIPT_DIR,
+        # XXX move `ml_thresh` to config file
+        ml_thresh=200,
+    shell:
+        """
+    python {params.src}/get_trna_charging_cpm.py \
+      --input {input.charging_tab} \
+      --output {output.cpm} \
+      --ml-threshold {params.ml_thresh}
+    """
+
+
 rule bcerror:
     """
   extract base calling error metrics to tsv file
@@ -194,7 +219,7 @@ rule bcerror:
         bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
         bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
     output:
-        tsv=os.path.join(outdir, "tables", "{sample}.bcerror.tsv.gz"),
+        tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
     log:
         os.path.join(outdir, "logs", "bcerror", "{sample}.bwa"),
     params:
@@ -218,7 +243,7 @@ rule align_stats:
         aligned=rules.bwa_align.output.bam,
         classified=rules.get_final_bam_and_charg_prob.output.classified_bam,
     output:
-        tsv=os.path.join(outdir, "tables", "{sample}.align_stats.tsv.gz"),
+        tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
     log:
         os.path.join(outdir, "logs", "stats", "{sample}.align_stats"),
     params:
@@ -240,10 +265,14 @@ rule bam_to_coverage:
         bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
         bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
     output:
-        counts_tmp=temp(os.path.join(outdir, "tables", "{sample}.counts.bg")),
-        cpm_tmp=temp(os.path.join(outdir, "tables", "{sample}.cpm.bg")),
-        counts=protected(os.path.join(outdir, "tables", "{sample}.counts.bg.gz")),
-        cpm=protected(os.path.join(outdir, "tables", "{sample}.cpm.bg.gz")),
+        counts_tmp=temp(
+            os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg")
+        ),
+        cpm_tmp=temp(os.path.join(outdir, "tables", "{sample}", "{sample}.cpm.bg")),
+        counts=protected(
+            os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg.gz")
+        ),
+        cpm=protected(os.path.join(outdir, "tables", "{sample}", "{sample}.cpm.bg.gz")),
     params:
         bg_opts=config["opts"]["coverage"],
     log:
@@ -282,7 +311,7 @@ rule remora_signal_stats:
         bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
         pod5=rules.merge_pods.output,
     output:
-        tsv=os.path.join(outdir, "tables", "{sample}.remora.tsv.gz"),
+        tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),
     log:
         os.path.join(outdir, "logs", "remora", "{sample}"),
     params:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -102,21 +102,27 @@ samples = find_raw_inputs(samples)
 # Define target files for rule all
 def pipeline_outputs():
     outs = expand(
-        os.path.join(outdir, "tables", "{sample}.charging_prob.tsv.gz"),
+        os.path.join(outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"),
         sample=samples.keys(),
     )
 
     outs += expand(
-        os.path.join(outdir, "tables", "{sample}.bcerror.tsv.gz"), sample=samples.keys()
+        os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
+        sample=samples.keys(),
+    )
+
+    outs += expand(
+        os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
+        sample=samples.keys(),
     )
 
     outs += expand(
-        os.path.join(outdir, "tables", "{sample}.align_stats.tsv.gz"),
+        os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
         sample=samples.keys(),
     )
 
     outs += expand(
-        os.path.join(outdir, "tables", "{sample}.{values}.bg.gz"),
+        os.path.join(outdir, "tables", "{sample}", "{sample}.{values}.bg.gz"),
         sample=samples.keys(),
         values=["cpm", "counts"],
     )
@@ -127,12 +133,12 @@ def pipeline_outputs():
         and config["remora_kmer_table"] is not None
     ):
         outs += expand(
-            os.path.join(outdir, "tables", "{sample}.remora.tsv.gz"),
+            os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),
             sample=samples.keys(),
         )
 
     # if "trna_table" in config and config["trna_table"] != "" and config["trna_table"] is not None:
-    #    outs += expand(os.path.join(outdir, "tables", "{sample}.charging_status.tsv"),
+    #    outs += expand(os.path.join(outdir, "tables", "{sample}", "{sample}.charging_status.tsv"),
     #        sample = samples.keys())
 
     return outs

diff --git a/workflow/scripts/get_trna_charging_cpm.py b/workflow/scripts/get_trna_charging_cpm.py
@@ -0,0 +1,71 @@
+#! /usr/bin/env python
+
+"""
+and collapses the output of running a Remora CCA model and extracting
+per-read information on charging likelihood into an ML tag into
+per-isodecoder counts (and CPM-normalized counts) of charged and uncharged
+tRNAs as determined by the model with a ML >= 200 threshold.
+
+tRNA-AA-anticodon-family-species-chargingref are all preserved from BWA alignment,
+and can be further collapsed as desired in downstream analysis
+
+CPM normalization reflects counts per million reads that passed alingnment and
+the filtering parameters for Remora classification; these are full length tRNA
+"""
+
+import pandas as pd
+import gzip
+
+
+def per_read_charging(input, output, threshold):
+    # Read the TSV file into a DataFrame
+    df = pd.read_csv(input, sep="\t")
+
+    # Categorize tRNAs as charged or uncharged
+    df["status"] = df["charging_likelihood"].apply(
+        lambda x: "charged" if x >= 200 else "uncharged"
+    )
+
+    # Group by tRNA and status to get counts
+    count_data = df.groupby(["tRNA", "status"]).size().unstack(fill_value=0)
+
+    # Get total number of reads in the file
+    total_reads = len(df)
+
+    # Normalize counts by CPM
+    count_data["charged_CPM"] = (count_data["charged"] / total_reads) * 1e6
+    count_data["uncharged_CPM"] = (count_data["uncharged"] / total_reads) * 1e6
+
+    if output.endswith(".gz"):
+        output_file = gzip.open(output, "wt")
+    else:
+        output_file = open(output, "w")
+
+    # Write the results to a new file
+    count_data.to_csv(output_file, sep="\t")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Process a TSV file to categorize tRNAs as charged or uncharged."
+    )
+    parser.add_argument(
+        "--input", type=str, help="Path to the input TSV file", required=True
+    )
+    parser.add_argument(
+        "--output", type=str, help="Path to the output TSV file", required=True
+    )
+    parser.add_argument(
+        "--ml-threshold",
+        type=int,
+        default=200,
+        help="Threshold for classifying tRNAs as charged (default: 200)",
+    )
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Process the selected file with the given threshold
+    per_read_charging(args.input, args.output, args.ml_threshold)