feat: blast search dataset vs database (#99)

* style: fix deprecated dockerfile syntax * feat: add BLAST to backend container * wip: all-vs-all BLAST * wip: blastp * fix: empty state for blast results * fix: ci build of test env * fix: bytes decoding in exception message * ci: ignore flake8 false positives * fix: restore types-pytz in dev env
rosalindfranklininstitute · Oct 15, 2024 · 207cffd · 207cffd
1 parent 3c3bc22
commit 207cffd
Show file tree

Hide file tree

Showing 16 changed files with 642 additions and 195 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -37,6 +37,8 @@ jobs:
         with:
           context: ${{ matrix.component }}
           target: dev
+          build-args: |
+            PIPENV_SYNC_FLAGS=--dev
           platforms: linux/amd64
           load: true
           tags: antigen-app-dev-${{ matrix.component }}:latest

diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -1,29 +1,45 @@
-FROM python:3.11-bullseye AS builder
+FROM quay.io/rosalindfranklininstitute/blast:2.16.0 AS blast
 
-RUN pip install --user wheel pipenv
+FROM python:3.13-bookworm AS builder
+
+RUN pip install --upgrade pip wheel pipenv
+
+RUN adduser --uid 10191 --group --system --no-create-home nonroot
+RUN chown -R nonroot:nonroot /usr/src
+USER nonroot
+WORKDIR /usr/src
 
 # Tell pipenv to create venv in the current directory
 ENV PIPENV_VENV_IN_PROJECT=1
 
 COPY Pipfile Pipfile.lock pyproject.toml setup.cfg /usr/src/
-WORKDIR /usr/src
 
-RUN /root/.local/bin/pipenv sync
+ARG PIPENV_SYNC_FLAGS=
+RUN pipenv sync ${PIPENV_SYNC_FLAGS}
 
 COPY manage.py /usr/src/
 COPY antigendjango /usr/src/antigendjango
 COPY antigenapi /usr/src/antigenapi
 
 RUN DJANGO_CI=true .venv/bin/python manage.py collectstatic --noinput
 
-FROM python:3.11-slim-bullseye AS prod
+FROM python:3.13-slim-bookworm AS prod
 
+# liblmdb-dev required by BLAST
 RUN apt-get update && apt-get install -y \
     libxml2 \
     media-types \
+    liblmdb-dev \
     && rm -rf /var/lib/apt/lists/*
 
-RUN mkdir -v /usr/src/.venv
+RUN adduser --uid 10191 --group --system --no-create-home nonroot
+RUN chown -R nonroot:nonroot /usr/src
+USER nonroot
+WORKDIR /usr/src
+
+# blastp and makeblastdb commands
+COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
+COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb
 
 COPY --from=builder /usr/src/.venv/ /usr/src/.venv/
 COPY --from=builder /usr/src/static/ /api_data/static/
@@ -34,16 +50,17 @@ COPY uwsgi.ini /usr/src
 
 WORKDIR /usr/src
 
-RUN addgroup --gid 10191 nonroot
-RUN adduser --uid 10191 --gid 10191 --system --no-create-home nonroot
-USER nonroot
-
 CMD [".venv/bin/uwsgi", "--ini", "uwsgi.ini"]
 
 FROM builder AS dev
 
-ENV PATH="$PATH:/root/.local/bin:/usr/src/.venv/bin"
+# liblmdb-dev required by BLAST
+USER root
+RUN apt update && apt install -y liblmdb-dev && rm -rf /var/lib/apt/lists/*
+USER nonroot
 
-RUN pipenv sync --dev
+# blastp and makeblastdb commands
+COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
+COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb
 
 CMD ["pipenv", "run", "python", "manage.py", "runserver", "0.0.0.0:8080"]
diff --git a/backend/Pipfile.lock b/backend/Pipfile.lock
diff --git a/backend/antigenapi/bioinformatics/__init__.py b/backend/antigenapi/bioinformatics/__init__.py
diff --git a/backend/antigenapi/bioinformatics/blast.py b/backend/antigenapi/bioinformatics/blast.py
@@ -0,0 +1,137 @@
+import os
+import subprocess
+from tempfile import TemporaryDirectory
+from typing import Optional
+
+from ..models import SequencingRunResults
+from .imgt import read_airr_file
+
+# https://www.ncbi.nlm.nih.gov/books/NBK279684/table/appendices.T.options_common_to_all_blast/
+BLAST_FMT_MULTIPLE_FILE_BLAST_JSON = "15"
+BLAST_NUM_THREADS = 4
+
+
+def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] = None):
+    """Get the sequencing database in fasta format.
+
+    Args:
+        include_run (int, optional): Sequencing run ID to include.
+          Defaults to None.
+        exclude_run (int, optional): Sequencing run ID to exclude.
+          Defaults to None.
+
+    Returns:
+        str: Sequencing run as a FASTA format string
+    """
+    fasta_data = ""
+    query = SequencingRunResults.objects.all()
+    if include_run:
+        query = query.filter(sequencing_run_id=include_run)
+    if exclude_run:
+        query = query.exclude(sequencing_run_id=exclude_run)
+    for sr in query:
+        airr_file = read_airr_file(
+            sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
+        )
+        airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
+        if not airr_file.empty:
+            for _, row in airr_file.iterrows():
+                fasta_data += f"> {row.sequence_id}\n"
+                fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
+    return fasta_data
+
+
+def get_sequencing_run_fasta(sequencing_run_id: int):
+    """Get sequencing run in BLAST format.
+
+    Args:
+        sequencing_run_id (int): Sequencing run ID
+
+    Returns:
+        str: Sequencing run as a FASTA format string
+    """
+    return get_db_fasta(include_run=sequencing_run_id)
+
+
+def run_blastp(
+    sequencing_run_id: int, outfmt: str = BLAST_FMT_MULTIPLE_FILE_BLAST_JSON
+):
+    """Run blastp for a sequencing run vs rest of database.
+
+    Args:
+        sequencing_run_id (int): Sequencing run ID.
+
+    Returns:
+        JSONResponse: Single file BLAST JSON
+    """
+    db_data = get_db_fasta()
+    if not db_data:
+        return None
+    query_data = get_sequencing_run_fasta(sequencing_run_id)
+    if not query_data:
+        return None
+
+    # Write the DB to disk as .fasta format
+    with TemporaryDirectory() as tmp_dir:
+        fasta_filename = os.path.join(tmp_dir, "db.fasta")
+        with open(fasta_filename, "w") as f:
+            f.write(db_data)
+
+        # Run makeblastdb
+        mkdb_proc = subprocess.run(
+            [
+                "makeblastdb",
+                "-in",
+                "db.fasta",
+                "-dbtype",
+                "prot",
+                "-out",
+                "antigen.db",
+            ],
+            capture_output=True,
+            cwd=tmp_dir,
+        )
+
+        if mkdb_proc.returncode != 0:
+            raise Exception(
+                f"makeblastdb returned exit code of "
+                f"{mkdb_proc.returncode}\n\n"
+                f"STDOUT: {mkdb_proc.stdout.decode()}\n\n"
+                f"STDERR: {mkdb_proc.stderr.decode()}"
+            )
+
+        # Write out query file
+        fasta_filename = os.path.join(tmp_dir, "query.fasta")
+        with open(fasta_filename, "w") as f:
+            f.write(query_data)
+
+        # Run blastp
+        blastp_proc = subprocess.run(
+            [
+                "blastp",
+                "-db",
+                "antigen.db",
+                "-query",
+                "query.fasta",
+                "-outfmt",
+                outfmt,
+                "-out",
+                "antigen.results",
+                "-num_threads",
+                str(BLAST_NUM_THREADS),
+            ],
+            capture_output=True,
+            cwd=tmp_dir,
+        )
+
+        if blastp_proc.returncode != 0:
+            raise Exception(
+                f"blastp returned exit code of "
+                f"{blastp_proc.returncode}\n\n"
+                f"STDOUT: {blastp_proc.stdout.decode()}\n\n"
+                f"STDERR: {blastp_proc.stderr.decode()}"
+            )
+
+        # Read in the results file
+        with open(os.path.join(tmp_dir, "antigen.results"), "r") as f:
+            return f.read()
diff --git a/backend/antigenapi/bioinformatics.py → backend/antigenapi/bioinformatics/imgt.py b/backend/antigenapi/bioinformatics.py → backend/antigenapi/bioinformatics/imgt.py
diff --git a/backend/antigenapi/models.py b/backend/antigenapi/models.py
@@ -28,7 +28,7 @@
 )
 from django.db.models.signals import post_init, post_save
 
-from .bioinformatics import read_airr_file
+from .bioinformatics.imgt import read_airr_file
 
 
 class Project(Model):

diff --git a/backend/antigenapi/tests/test_parsers.py b/backend/antigenapi/tests/test_parsers.py
@@ -2,7 +2,7 @@
 import unittest
 import zipfile
 
-from antigenapi.bioinformatics import load_sequences, trim_sequence
+from antigenapi.bioinformatics.imgt import load_sequences, trim_sequence
 from antigenapi.views_old import _extract_well
 
 

diff --git a/backend/antigenapi/views_old.py b/backend/antigenapi/views_old.py
@@ -1,6 +1,7 @@
 import collections.abc
 import datetime
 import io
+import json
 import math
 import os
 import re
@@ -36,7 +37,8 @@
 from rest_framework.views import APIView
 from rest_framework.viewsets import ModelViewSet
 
-from antigenapi.bioinformatics import (
+from antigenapi.bioinformatics.blast import get_db_fasta, run_blastp
+from antigenapi.bioinformatics.imgt import (
     AIRR_IMPORTANT_COLUMNS,
     as_fasta_files,
     load_sequences,
@@ -1052,20 +1054,83 @@ def search_sequencing_run_results(self, request, query):
             {"matches": pd.concat(results).to_dict(orient="records") if results else []}
         )
 
+    @action(
+        detail=True,
+        methods=["GET"],
+        name="BLAST sequencing run results vs DB.",
+        url_path="blast",
+    )
+    def get_blast_sequencing_run(self, request, pk):
+        """BLAST sequencing run vs database."""
+        blast_str = run_blastp(pk)
+        if not blast_str:
+            return JsonResponse({"hits": []}, status=status.HTTP_404_NOT_FOUND)
+
+        # Read query AIRR files for CDRs
+        results = SequencingRunResults.objects.filter(sequencing_run_id=int(pk))
+
+        airr_df = pd.concat(read_airr_file(r.airr_file) for r in results)
+        airr_df = airr_df.set_index("sequence_id")
+
+        ALIGN_PERC_THRESHOLD = 90
+        E_VALUE_THRESHOLD = 0.05
+
+        # Parse the JSON and keep the important bits
+        blast_result = json.loads(blast_str)
+        res = []
+        for blast_run in blast_result["BlastOutput2"]:
+            run_res = blast_run["report"]["results"]["search"]
+            for blast_hit_set in run_res["hits"]:
+                subject_title = blast_hit_set["description"][0]["title"]
+                query_title = run_res["query_title"]
+                query_cdr3 = airr_df.at[query_title, "cdr3_aa"]
+                if pd.isna(query_cdr3):
+                    query_cdr3 = None
+                if subject_title.strip() == query_title.strip():
+                    continue
+                hsps = blast_hit_set["hsps"][0]
+                assert len(blast_hit_set["description"]) == 1
+                for hsps in blast_hit_set["hsps"]:
+                    align_perc = round(
+                        (hsps["align_len"] / run_res["query_len"]) * 100, 2
+                    )
+                    if align_perc < ALIGN_PERC_THRESHOLD:
+                        continue
+                    e_value = hsps["evalue"]
+                    if e_value > E_VALUE_THRESHOLD:
+                        continue
+                    res.append(
+                        {
+                            "query_title": query_title.strip(),
+                            "query_cdr3": query_cdr3,
+                            "subject_title": subject_title.strip(),
+                            "submatch_no": hsps["num"],
+                            "query_seq": hsps["qseq"],
+                            "subject_seq": hsps["hseq"],
+                            "midline": hsps["midline"],
+                            "bit_score": hsps["bit_score"],
+                            "e_value": e_value,
+                            "align_len": hsps["align_len"],
+                            "align_perc": align_perc,
+                            "ident_perc": round(
+                                (hsps["identity"] / hsps["align_len"]) * 100, 2
+                            ),
+                        }
+                    )
+
+        # Sort the results
+        res = sorted(
+            res,
+            key=lambda r: (r["query_cdr3"] or "", -r["align_perc"], -r["ident_perc"]),
+        )
+
+        return JsonResponse({"hits": res})
+
 
 class GlobalFastaView(APIView):
     def get(self, request, format=None):
         """Download entire database as .fasta file."""
-        fasta_data = ""
-        for sr in SequencingRunResults.objects.all():
-            airr_file = read_airr_file(
-                sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
-            )
-            airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
-            if not airr_file.empty:
-                for _, row in airr_file.iterrows():
-                    fasta_data += f"> {row.sequence_id}\n"
-                    fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
+        fasta_data = get_db_fasta()
 
         fasta_filename = (
             f"antigenapp_database_{datetime.datetime.now().isoformat()}.fasta"

diff --git a/backend/setup.cfg b/backend/setup.cfg
@@ -26,9 +26,9 @@ install_requires =
 
 [options.extras_require]
 dev =
-    black==24.3.0
-    isort>5.0
-    flake8<4
+    black
+    isort
+    flake8
     pytest
     pytest-cov
     pytest-black
@@ -37,6 +37,7 @@ dev =
     pytest-flake8
     pytest-black
     pytest-pydocstyle
+    types-pytz
     flake8-isort
 
 
@@ -57,6 +58,9 @@ float_to_top=true
 max-line-length = 88
 extend-ignore =
     E203,
+    E231, # false positive
+    E702, # false positive
+    E713, # false positive
     F811,
     F722,
 exclude =