Skip to content

Commit

Permalink
Redo extraction of charging values (#46)
Browse files Browse the repository at this point in the history
Closes #45

Uncovered an issue wherein a small number of reads have >= 1 value for ML (charging tag). Not sure why that is, but for now we ditch them (it was < 10 reads for a large run).

Split get_final_bam_and_charg_prob into 2 rules:

transfer_bam_tags
get_cca_trna
Replace the awk / sed strategy of extracting chargin values with a python script.
  • Loading branch information
jayhesselberth authored Jan 17, 2025
1 parent 823219f commit a1e46ee
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 21 deletions.
3 changes: 2 additions & 1 deletion run-preprint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ mkdir -p logs

snakemake \
--configfile=config/config-preprint.yml \
--profile cluster
--profile cluster \
--rerun-incomplete
2 changes: 1 addition & 1 deletion run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
mkdir -p results/logs

snakemake \
--configfile=config/config.yml \
--configfile=config/config-test.yml \
--profile cluster
50 changes: 31 additions & 19 deletions workflow/rules/aatrnaseq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ rule cca_classify:
"""


rule get_final_bam_and_charg_prob:
rule transfer_bam_tags:
"""
creates final bam with classified reads MM and ML tags and table with charging probability per read
"""
Expand All @@ -163,11 +163,8 @@ rule get_final_bam_and_charg_prob:
output:
classified_bam=os.path.join(outdir, "classified_bams", "{sample}.bam"),
classified_bam_bai=os.path.join(outdir, "classified_bams", "{sample}.bam.bai"),
charging_tab=os.path.join(
outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
),
log:
os.path.join(outdir, "logs", "final_bams_and_tabs", "{sample}"),
os.path.join(outdir, "logs", "transfer_bam_tags", "{sample}"),
params:
src=SCRIPT_DIR,
shell:
Expand All @@ -178,13 +175,28 @@ rule get_final_bam_and_charg_prob:
-o {output.classified_bam}
samtools index {output.classified_bam}
"""

(echo -e "read_id\ttRNA\tcharging_likelihood"; \
samtools view {output.classified_bam} \
| awk '{{ml=""; for(i=1;i<=NF;i++) {{if($i ~ /^ML:/) ml=$i}}; if(ml!="") print $1 "\t" $3 "\t" ml}}' \
| sed 's/ML:B:C,//g') \
| gzip -c \
> {output.charging_tab}

rule get_cca_trna:
"""
extract and report charing probability (ML tag) per read
"""
input:
bam=rules.transfer_bam_tags.output.classified_bam,
output:
charging_tab=os.path.join(
outdir, "tables", "{sample}", "{sample}.charging_prob.tsv.gz"
),
log:
os.path.join(outdir, "logs", "get_cca_trna", "{sample}"),
params:
src=SCRIPT_DIR,
shell:
"""
python {params.src}/get_charging_table.py \
{input.bam} \
{output.charging_tab}
"""


Expand All @@ -193,7 +205,7 @@ rule get_cca_trna_cpm:
calculate cpm for cca classified trnas
"""
input:
charging_tab=rules.get_final_bam_and_charg_prob.output.charging_tab,
charging_tab=rules.get_cca_trna.output.charging_tab,
output:
cpm=os.path.join(outdir, "tables", "{sample}", "{sample}.charging.cpm.tsv.gz"),
log:
Expand All @@ -216,8 +228,8 @@ rule bcerror:
extract base calling error metrics to tsv file
"""
input:
bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
bam=rules.transfer_bam_tags.output.classified_bam,
bai=rules.transfer_bam_tags.output.classified_bam_bai,
output:
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.bcerror.tsv.gz"),
log:
Expand All @@ -241,7 +253,7 @@ rule align_stats:
input:
unmapped=get_optional_bam_inputs,
aligned=rules.bwa_align.output.bam,
classified=rules.get_final_bam_and_charg_prob.output.classified_bam,
classified=rules.transfer_bam_tags.output.classified_bam,
output:
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.align_stats.tsv.gz"),
log:
Expand All @@ -262,8 +274,8 @@ rule align_stats:

rule bam_to_coverage:
input:
bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
bam=rules.transfer_bam_tags.output.classified_bam,
bai=rules.transfer_bam_tags.output.classified_bam_bai,
output:
counts_tmp=temp(
os.path.join(outdir, "tables", "{sample}", "{sample}.counts.bg")
Expand Down Expand Up @@ -307,8 +319,8 @@ rule remora_signal_stats:
run remora to get signal stats
"""
input:
bam=rules.get_final_bam_and_charg_prob.output.classified_bam,
bai=rules.get_final_bam_and_charg_prob.output.classified_bam_bai,
bam=rules.transfer_bam_tags.output.classified_bam,
bai=rules.transfer_bam_tags.output.classified_bam_bai,
pod5=rules.merge_pods.output,
output:
tsv=os.path.join(outdir, "tables", "{sample}", "{sample}.remora.tsv.gz"),
Expand Down
53 changes: 53 additions & 0 deletions workflow/scripts/get_charging_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#! /usr/bin/env python

"""
Generate table of read id, ref, value of charging tag
"""

import pysam
import argparse
import csv
import gzip


def extract_tag(bam_file, output_tsv, tag):

open_func = gzip.open if output_tsv.endswith(".gz") else open
mode = "wt" if output_tsv.endswith(".gz") else "w"

with (
pysam.AlignmentFile(bam_file, "rb") as bam,
open_func(output_tsv, mode) as tsvfile,
):
writer = csv.writer(tsvfile, delimiter="\t")
writer.writerow(["read_id", "tRNA", "charging_likelihood"])

for read in bam.fetch():
read_id = read.query_name
reference = read.reference_name if read.reference_name else "*"
tag_array = dict(read.tags).get(tag, None)

# XXX: handle case where there are more than 1 tag value
# not clear why this is, but we skip for now as it's a small
# number of reads affected
if len(tag_array) > 1:
continue

tag_value = tag_array[0]

if tag_value and reference != "*":
writer.writerow([read_id, reference, tag_value])


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Extract a specified tag from a BAM file and write to TSV."
)
parser.add_argument("bam_file", help="Input BAM file")
parser.add_argument(
"output_tsv", help="Output TSV file (can be .gz for compression)"
)
parser.add_argument("--tag", default="ML", help="BAM tag to extract (default: ML)")

args = parser.parse_args()
extract_tag(args.bam_file, args.output_tsv, args.tag)

0 comments on commit a1e46ee

Please sign in to comment.