Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add python CLI #14

Merged
merged 10 commits into from
Jan 15, 2025
33,171 changes: 33,171 additions & 0 deletions example-data/pgPEN-library/paralog_pgRNA_annotations.txt
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to have a README that explains about this example data. What it is and how it was subsetted. But this can also be in a future PR.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. I will address that in a future pr.

Large diffs are not rendered by default.

74 changes: 74 additions & 0 deletions src/pgmap/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import os
import sys

from pgmap.counter import counter
from pgmap.io import barcode_reader, library_reader, counts_writer
from pgmap.trimming import read_trimmer

TWO_READ_STRATEGY = "two-read"
THREE_READ_STRATEGY = "three-read"


def get_counts(args: argparse.Namespace):
barcodes = barcode_reader.read_barcodes(args.barcodes)
gRNA1s, gRNA2s, gRNA_mappings, id_mapping = library_reader.read_paired_guide_library_annotation(
args.library)

candidate_reads = None

if args.trim_strategy == TWO_READ_STRATEGY:
candidate_reads = read_trimmer.two_read_trim(*args.fastq)
elif args.trim_strategy == THREE_READ_STRATEGY:
candidate_reads = read_trimmer.two_read_trim(*args.fastq)

paired_guide_counts = counter.get_counts(
candidate_reads, gRNA_mappings, barcodes, gRNA2_error_tolerance=args.gRNA2_error, barcode_error_tolerance=args.barcode_error)

counts_writer.write_counts(
args.output, paired_guide_counts, barcodes, id_mapping)


def _parse_args(args: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="pgmap", description="A tool to count paired guides from CRISPR double knockout screens.", exit_on_error=False)
# TODO in general these file formats should be documented more
parser.add_argument("-f", "--fastq", nargs='+', required=True, type=_check_file_exists,
help="Fastq files to count from, separated by space. Can optionally be gzipped.")
parser.add_argument("-l", "--library", required=True, type=_check_file_exists,
help="File containing annotated pgRNA information including the pgRNA id and both guide sequences.")
# TODO support no barcodes?
parser.add_argument("-b", "--barcodes", required=True, type=_check_file_exists,
help="File containing sample barcodes including the barcode sequence and the sample id.")
# TODO check can write to this path?
parser.add_argument("-o", "--output", required=False,
help="Output file path to populate with the counts for each paired guide and sample. If not provided the counts will be output in STDOUT.")
# TODO support arbitrary trim strategies
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would probably be good to add each of these TODOs as GitHub issues so we don't lose track of them. Also could be on Basecamp. Wherever you prefer.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do! I have a lot of TODOs to go back and see if they are worth prioritizing. Often these are where I didn't want to pause at that moment to make an issue, or small enough issues that I wanted to acknowledge a change could be made by future authors without necessarily committing to it the way an issue implies.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally makes sense! It's good move!

parser.add_argument("--trim-strategy", required=True, choices=(TWO_READ_STRATEGY, THREE_READ_STRATEGY),
help="The trim strategy used to extract guides and barcodes. The two read strategy should have fastqs R1 and I1. The three read strategy should have fastqs R1, I1, and I2") # TODO extract consts
parser.add_argument("--gRNA2-error", required=False, default=2, type=_check_nonnegative,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these last ones need a "help" description.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are lines 50 and 52 not it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh so sorry! I missed that somehow!

help="The number of substituted base pairs to allow in gRNA2.")
parser.add_argument("--barcode-error", required=False, default=2, type=_check_nonnegative,
help="The number of insertions, deletions, and subsititions of base pairs to allow in the barcodes.")
return parser.parse_args(args)


def _check_nonnegative(value: str) -> int:
int_value = int(value)

if int_value < 0:
raise ValueError(f"Count must be nonnegative but was {value}")

return int_value


def _check_file_exists(path: str) -> str:
if os.path.exists(path) and os.access(path, os.R_OK):
return path
else:
raise argparse.ArgumentTypeError(
f"File path {path} does not exist or is not readable")


if __name__ == "__main__":
get_counts(_parse_args(sys.argv[1:]))
Empty file added src/pgmap/counter/__init__.py
Empty file.
File renamed without changes.
41 changes: 41 additions & 0 deletions src/pgmap/io/counts_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import csv
from typing import Counter, Optional, IO, Iterator
from contextlib import contextmanager
import sys


def write_counts(output_path: Optional[str], counts: Counter[tuple[str, str, str]], barcodes: dict[str, str], id_mappings: dict[str, str]):
"""
Write counts to an output file tsv with a header [id, seq_1, seq2, sample_id_1, ..., sample_id_n]. If no
output_path is specified, write to STDOUT.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could alternatively just have a default file name its saved to in the working directory.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But then again people could >. Hmm...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I felt pipe support was necessary based on the expectations set by other bioinformatics tools, though this function signature could be more agnostic of that fact and just take a file opened by the caller? Then again this is just a library function exposed for convenience, and I feel path is pretty convenient and conventional.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes path I think is definitely a good call. I was just thinking about the default for it.


Args
output_path (Optional[str]): The output path to write to. If none, then the counts will be written to STDOUT.
counts (Counter[tuple[str, str, str]]): The counts for each (gRNA1, gRNA2, barcode).
id_mappings (dict[str, str]): A mapping from (gRNA1, gRNA2) to the paired guide id.
"""
# TODO delete file if it already exists? manually test to see what happens if you don't do this
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like to opt for an "overwrite" parameter. Basically by default overwrite = FALSE. But if it is TRUE then we can delete the existing file of the same name and write a new one.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the convention for python libraries is to overwrite e.g. https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html and https://numpy.org/doc/2.1/reference/generated/numpy.save.html. The expectation is the caller can check before calling this if they desire that behavior?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of bioinformatics packages are used by people who don't have a lot of experience with command line tools and because we are working with huge data sometimes we want to decrease the probability that people will shoot themselves in the foot (aka accidentally overwrite a file they didn't mean to).

So I think having a default that spits out a warning that people can shut off if they agree feels better than having it automatically overwrite before people realize what they are doing.

Happy to discuss more!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm that is concerning. I think I have two users in mind. One is using pgMAP as a python library and I imagine they have the skills and expectations for this function signature. The second is the CLI user and I am concerned about them overwriting their files.

I wonder then if we should address that as part of the CLI (i.e. checking the output does not exist and erroring, and adding a flag to overwrite it if desired). Or do you still feel strongly that the library user should also have this warning?

Another concern is that taking a step back, we never really planned to support pgMAP as a python library, it more just was convenient enough to do it and it's been useful for some deep dive testing, but I wonder if it's something we say "use at your own risk". I will try to get answers about this prioritization of the library from the Berger lab folks.


with _open_file_or_stdout(output_path) as f:
writer = csv.writer(f, delimiter='\t')

sample_ids = list(barcodes.values())
sample_id_to_barcode = {v: k for k, v in barcodes.items()}

writer.writerow(["id", "seq_1", "seq_2"] + sample_ids)

for (gRNA1, gRNA2), pg_id in id_mappings.items():
row = [pg_id, gRNA1, gRNA2] + \
[counts[(gRNA1, gRNA2, sample_id_to_barcode[sample_id])]
for sample_id in sample_ids]

writer.writerow(row)


@contextmanager
def _open_file_or_stdout(path) -> Iterator[IO]:
if path is None:
yield sys.stdout
else:
with open(path, 'w', newline='') as f:
yield f
41 changes: 40 additions & 1 deletion src/pgmap/io/library_reader.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from collections import defaultdict
import csv

from pgmap.io import fastx_reader


def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
def read_paired_guide_library_fastas(R1_path: str, R2_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
"""
Reads a paired guide library from two fasta files. R1 and R2 are the first and second guide sequences
respectively.
Expand All @@ -18,6 +19,7 @@ def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set
gRNA_mappings (dict[str, set[str]]): A mapping from each gRNA1 to a set of all of it's paired gRNA2s.
"""
# TODO examine how other guide libraries are stored / shared. This might be niche
# TODO deprecate?
gRNA1s = set()
gRNA2s = set()

Expand All @@ -30,3 +32,40 @@ def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set
gRNA_mappings[gRNA1].add(gRNA2)

return gRNA1s, gRNA2s, gRNA_mappings


def read_paired_guide_library_annotation(annotation_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
"""
Reads a paired guide library from an annotation file. The file should be a tsv with a header of id, gRNA1, gRNA2.

Args:
annotation_path (str): The path to the library annotation file.

Returns:
gRNA1s (set[str]): The set of all gRNA1 sequences.
gRNA2s (set[str]): The set of all gRNA2 sequences.
gRNA_mappings (dict[str, set[str]]): A mapping from each gRNA1 to a set of all of it's paired gRNA2s.
id_mapping (dict[str, str]): A mapping from each paired (gRNA1, gRNA2) to the annotation id name.
"""
gRNA1s = set()
gRNA2s = set()

gRNA_mappings = defaultdict(set)

id_mapping = {}

with open(annotation_path, 'r') as file:
tsv_reader = csv.reader(file, delimiter='\t')

for i, (id, gRNA1, gRNA2) in enumerate(tsv_reader):
if i == 0:
continue # skip column headers

gRNA1s.add(gRNA1)
gRNA2s.add(gRNA2)

gRNA_mappings[gRNA1].add(gRNA2)

id_mapping[(gRNA1, gRNA2)] = id

return gRNA1s, gRNA2s, gRNA_mappings, id_mapping
Loading
Loading