FredHutch · cansavvy · Jan 15, 2025 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/example-data/pgPEN-library/paralog_pgRNA_annotations.txt b/example-data/pgPEN-library/paralog_pgRNA_annotations.txt
diff --git a/src/pgmap/cli.py b/src/pgmap/cli.py
@@ -0,0 +1,74 @@
+import argparse
+import os
+import sys
+
+from pgmap.counter import counter
+from pgmap.io import barcode_reader, library_reader, counts_writer
+from pgmap.trimming import read_trimmer
+
+TWO_READ_STRATEGY = "two-read"
+THREE_READ_STRATEGY = "three-read"
+
+
+def get_counts(args: argparse.Namespace):
+    barcodes = barcode_reader.read_barcodes(args.barcodes)
+    gRNA1s, gRNA2s, gRNA_mappings, id_mapping = library_reader.read_paired_guide_library_annotation(
+        args.library)
+
+    candidate_reads = None
+
+    if args.trim_strategy == TWO_READ_STRATEGY:
+        candidate_reads = read_trimmer.two_read_trim(*args.fastq)
+    elif args.trim_strategy == THREE_READ_STRATEGY:
+        candidate_reads = read_trimmer.two_read_trim(*args.fastq)
+
+    paired_guide_counts = counter.get_counts(
+        candidate_reads, gRNA_mappings, barcodes, gRNA2_error_tolerance=args.gRNA2_error, barcode_error_tolerance=args.barcode_error)
+
+    counts_writer.write_counts(
+        args.output, paired_guide_counts, barcodes, id_mapping)
+
+
+def _parse_args(args: list[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="pgmap", description="A tool to count paired guides from CRISPR double knockout screens.", exit_on_error=False)
+    # TODO in general these file formats should be documented more
+    parser.add_argument("-f", "--fastq", nargs='+', required=True, type=_check_file_exists,
+                        help="Fastq files to count from, separated by space. Can optionally be gzipped.")
+    parser.add_argument("-l", "--library", required=True, type=_check_file_exists,
+                        help="File containing annotated pgRNA information including the pgRNA id and both guide sequences.")
+    # TODO support no barcodes?
+    parser.add_argument("-b", "--barcodes", required=True, type=_check_file_exists,
+                        help="File containing sample barcodes including the barcode sequence and the sample id.")
+    # TODO check can write to this path?
+    parser.add_argument("-o", "--output", required=False,
+                        help="Output file path to populate with the counts for each paired guide and sample. If not provided the counts will be output in STDOUT.")
+    # TODO support arbitrary trim strategies
+    parser.add_argument("--trim-strategy", required=True, choices=(TWO_READ_STRATEGY, THREE_READ_STRATEGY),
+                        help="The trim strategy used to extract guides and barcodes. The two read strategy should have fastqs R1 and I1. The three read strategy should have fastqs R1, I1, and I2")  # TODO extract consts
+    parser.add_argument("--gRNA2-error", required=False, default=2, type=_check_nonnegative,
+                        help="The number of substituted base pairs to allow in gRNA2.")
+    parser.add_argument("--barcode-error", required=False, default=2, type=_check_nonnegative,
+                        help="The number of insertions, deletions, and subsititions of base pairs to allow in the barcodes.")
+    return parser.parse_args(args)
+
+
+def _check_nonnegative(value: str) -> int:
+    int_value = int(value)
+
+    if int_value < 0:
+        raise ValueError(f"Count must be nonnegative but was {value}")
+
+    return int_value
+
+
+def _check_file_exists(path: str) -> str:
+    if os.path.exists(path) and os.access(path, os.R_OK):
+        return path
+    else:
+        raise argparse.ArgumentTypeError(
+            f"File path {path} does not exist or is not readable")
+
+
+if __name__ == "__main__":
+    get_counts(_parse_args(sys.argv[1:]))
diff --git a/src/pgmap/counter/__init__.py b/src/pgmap/counter/__init__.py
diff --git a/src/pgmap/counter.py → src/pgmap/counter/counter.py b/src/pgmap/counter.py → src/pgmap/counter/counter.py
diff --git a/src/pgmap/io/counts_writer.py b/src/pgmap/io/counts_writer.py
@@ -0,0 +1,41 @@
+import csv
+from typing import Counter, Optional, IO, Iterator
+from contextlib import contextmanager
+import sys
+
+
+def write_counts(output_path: Optional[str], counts: Counter[tuple[str, str, str]], barcodes: dict[str, str], id_mappings: dict[str, str]):
+    """
+    Write counts to an output file tsv with a header [id, seq_1, seq2, sample_id_1, ..., sample_id_n]. If no
+    output_path is specified, write to STDOUT.
+
+    Args
+        output_path (Optional[str]): The output path to write to. If none, then the counts will be written to STDOUT.
+        counts (Counter[tuple[str, str, str]]): The counts for each (gRNA1, gRNA2, barcode).
+        id_mappings (dict[str, str]): A mapping from (gRNA1, gRNA2) to the paired guide id.
+    """
+    # TODO delete file if it already exists? manually test to see what happens if you don't do this
+
+    with _open_file_or_stdout(output_path) as f:
+        writer = csv.writer(f, delimiter='\t')
+
+        sample_ids = list(barcodes.values())
+        sample_id_to_barcode = {v: k for k, v in barcodes.items()}
+
+        writer.writerow(["id", "seq_1", "seq_2"] + sample_ids)
+
+        for (gRNA1, gRNA2), pg_id in id_mappings.items():
+            row = [pg_id, gRNA1, gRNA2] + \
+                [counts[(gRNA1, gRNA2, sample_id_to_barcode[sample_id])]
+                 for sample_id in sample_ids]
+
+            writer.writerow(row)
+
+
+@contextmanager
+def _open_file_or_stdout(path) -> Iterator[IO]:
+    if path is None:
+        yield sys.stdout
+    else:
+        with open(path, 'w', newline='') as f:
+            yield f
diff --git a/src/pgmap/io/library_reader.py b/src/pgmap/io/library_reader.py
@@ -1,9 +1,10 @@
 from collections import defaultdict
+import csv
 
 from pgmap.io import fastx_reader
 
 
-def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
+def read_paired_guide_library_fastas(R1_path: str, R2_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
     """
     Reads a paired guide library from two fasta files. R1 and R2 are the first and second guide sequences
     respectively.
@@ -18,6 +19,7 @@ def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set
         gRNA_mappings (dict[str, set[str]]): A mapping from each gRNA1 to a set of all of it's paired gRNA2s.
     """
     # TODO examine how other guide libraries are stored / shared. This might be niche
+    # TODO deprecate?
     gRNA1s = set()
     gRNA2s = set()
 
@@ -30,3 +32,40 @@ def read_paired_guide_library(R1_path: str, R2_path: str) -> tuple[set[str], set
         gRNA_mappings[gRNA1].add(gRNA2)
 
     return gRNA1s, gRNA2s, gRNA_mappings
+
+
+def read_paired_guide_library_annotation(annotation_path: str) -> tuple[set[str], set[str], dict[str, set[str]]]:
+    """
+    Reads a paired guide library from an annotation file. The file should be a tsv with a header of id, gRNA1, gRNA2.
+
+    Args:
+        annotation_path (str): The path to the library annotation file.
+
+    Returns:
+        gRNA1s (set[str]): The set of all gRNA1 sequences.
+        gRNA2s (set[str]): The set of all gRNA2 sequences.
+        gRNA_mappings (dict[str, set[str]]): A mapping from each gRNA1 to a set of all of it's paired gRNA2s.
+        id_mapping (dict[str, str]): A mapping from each paired (gRNA1, gRNA2) to the annotation id name.
+    """
+    gRNA1s = set()
+    gRNA2s = set()
+
+    gRNA_mappings = defaultdict(set)
+
+    id_mapping = {}
+
+    with open(annotation_path, 'r') as file:
+        tsv_reader = csv.reader(file, delimiter='\t')
+
+        for i, (id, gRNA1, gRNA2) in enumerate(tsv_reader):
+            if i == 0:
+                continue  # skip column headers
+
+            gRNA1s.add(gRNA1)
+            gRNA2s.add(gRNA2)
+
+            gRNA_mappings[gRNA1].add(gRNA2)
+
+            id_mapping[(gRNA1, gRNA2)] = id
+
+    return gRNA1s, gRNA2s, gRNA_mappings, id_mapping