Redo extraction of charging values (#46)

Closes #45 Uncovered an issue wherein a small number of reads have >= 1 value for ML (charging tag). Not sure why that is, but for now we ditch them (it was < 10 reads for a large run). Split get_final_bam_and_charg_prob into 2 rules: transfer_bam_tags get_cca_trna Replace the awk / sed strategy of extracting chargin values with a python script.
rnabioco · Jan 17, 2025 · a1e46ee · a1e46ee
1 parent 823219f
commit a1e46ee
Show file tree

Hide file tree

Showing 4 changed files with 87 additions and 21 deletions.
diff --git a/run-preprint.sh b/run-preprint.sh
@@ -9,4 +9,5 @@ mkdir -p logs
 
 snakemake \
     --configfile=config/config-preprint.yml \
-    --profile cluster
+    --profile cluster \
+    --rerun-incomplete
diff --git a/run-test.sh b/run-test.sh
@@ -8,5 +8,5 @@
 mkdir -p results/logs
 
 snakemake \
-  --configfile=config/config.yml \
+  --configfile=config/config-test.yml \
   --profile cluster
diff --git a/workflow/rules/aatrnaseq.smk b/workflow/rules/aatrnaseq.smk
@@ -153,7 +153,7 @@ rule cca_classify:
     """
 
 
-rule get_final_bam_and_charg_prob:
+rule transfer_bam_tags:
     """
   creates final bam with classified reads MM and ML tags and table with charging probability per read
   """
@@ -163,11 +163,8 @@ rule get_final_bam_and_charg_prob:
     output:
         classified_bam=os.path.join(outdir, "classified_bams", "{sample}.bam"),
         classified_bam_bai=os.path.join(outdir, "classified_bams", "{sample}.bam.bai"),
-        charging_tab=os.path.join(
-            outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
-        ),
     log:
-        os.path.join(outdir, "logs", "final_bams_and_tabs", "{sample}"),
+        os.path.join(outdir, "logs", "transfer_bam_tags", "{sample}"),
     params:
         src=SCRIPT_DIR,
     shell:
@@ -178,13 +175,28 @@ rule get_final_bam_and_charg_prob:
       -o {output.classified_bam}
 
     samtools index {output.classified_bam}
+    """
 
-    (echo -e "read_id\ttRNA\tcharging_likelihood"; \
-      samtools view {output.classified_bam} \
-      | awk '{{ml=""; for(i=1;i<=NF;i++) {{if($i ~ /^ML:/) ml=$i}}; if(ml!="") print $1 "\t" $3 "\t" ml}}' \
-      | sed 's/ML:B:C,//g') \
-      | gzip -c \
-      > {output.charging_tab}
+
+rule get_cca_trna:
+    """
+    extract and report charing probability (ML tag) per read
+    """
+    input:
+        bam=rules.transfer_bam_tags.output.classified_bam,
+    output:
+        charging_tab=os.path.join(
+            outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
+        ),
+    log:
+        os.path.join(outdir, "logs", "get_cca_trna", "{sample}"),
+    params:
+        src=SCRIPT_DIR,
+    shell:
+        """
+    python {params.src}/get_charging_table.py \
+      {input.bam} \
+      {output.charging_tab}
     """
 
 
@@ -193,7 +205,7 @@ rule get_cca_trna_cpm:
     calculate cpm for cca classified trnas
     """
     input:
-        charging_tab=rules.get_final_bam_and_charg_prob.output.charging_tab,
+        charging_tab=rules.get_cca_trna.output.charging_tab,
     output:
         cpm=os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
     log:
@@ -216,8 +228,8 @@ rule bcerror:
   extract base calling error metrics to tsv file
   """
     input:
-        bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
-        bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
+        bam=rules.transfer_bam_tags.output.classified_bam,
+        bai=rules.transfer_bam_tags.output.classified_bam_bai,
     output:
         tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
     log:
@@ -241,7 +253,7 @@ rule align_stats:
     input:
         unmapped=get_optional_bam_inputs,
         aligned=rules.bwa_align.output.bam,
-        classified=rules.get_final_bam_and_charg_prob.output.classified_bam,
+        classified=rules.transfer_bam_tags.output.classified_bam,
     output:
         tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
     log:
@@ -262,8 +274,8 @@ rule align_stats:
 
 rule bam_to_coverage:
     input:
-        bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
-        bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
+        bam=rules.transfer_bam_tags.output.classified_bam,
+        bai=rules.transfer_bam_tags.output.classified_bam_bai,
     output:
         counts_tmp=temp(
             os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg")
@@ -307,8 +319,8 @@ rule remora_signal_stats:
   run remora to get signal stats
   """
     input:
-        bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
-        bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
+        bam=rules.transfer_bam_tags.output.classified_bam,
+        bai=rules.transfer_bam_tags.output.classified_bam_bai,
         pod5=rules.merge_pods.output,
     output:
         tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),

diff --git a/workflow/scripts/get_charging_table.py b/workflow/scripts/get_charging_table.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python
+
+"""
+Generate table of read id, ref, value of charging tag
+"""
+
+import pysam
+import argparse
+import csv
+import gzip
+
+
+def extract_tag(bam_file, output_tsv, tag):
+
+    open_func = gzip.open if output_tsv.endswith(".gz") else open
+    mode = "wt" if output_tsv.endswith(".gz") else "w"
+
+    with (
+        pysam.AlignmentFile(bam_file, "rb") as bam,
+        open_func(output_tsv, mode) as tsvfile,
+    ):
+        writer = csv.writer(tsvfile, delimiter="\t")
+        writer.writerow(["read_id", "tRNA", "charging_likelihood"])
+
+        for read in bam.fetch():
+            read_id = read.query_name
+            reference = read.reference_name if read.reference_name else "*"
+            tag_array = dict(read.tags).get(tag, None)
+
+            # XXX: handle case where there are more than 1 tag value
+            # not clear why this is, but we skip for now as it's a small 
+            # number of reads affected
+            if len(tag_array) > 1:
+                continue
+
+            tag_value = tag_array[0]
+
+            if tag_value and reference != "*":
+                writer.writerow([read_id, reference, tag_value])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extract a specified tag from a BAM file and write to TSV."
+    )
+    parser.add_argument("bam_file", help="Input BAM file")
+    parser.add_argument(
+        "output_tsv", help="Output TSV file (can be .gz for compression)"
+    )
+    parser.add_argument("--tag", default="ML", help="BAM tag to extract (default: ML)")
+
+    args = parser.parse_args()
+    extract_tag(args.bam_file, args.output_tsv, args.tag)