Skip to content

Commit

Permalink
feat: blast search dataset vs database (#99)
Browse files Browse the repository at this point in the history
* style: fix deprecated dockerfile syntax

* feat: add BLAST to backend container

* wip: all-vs-all BLAST

* wip: blastp

* fix: empty state for blast results

* fix: ci build of test env

* fix: bytes decoding in exception message

* ci: ignore flake8 false positives

* fix: restore types-pytz in dev env
  • Loading branch information
alubbock authored Oct 15, 2024
1 parent 3c3bc22 commit 207cffd
Show file tree
Hide file tree
Showing 16 changed files with 642 additions and 195 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ jobs:
with:
context: ${{ matrix.component }}
target: dev
build-args: |
PIPENV_SYNC_FLAGS=--dev
platforms: linux/amd64
load: true
tags: antigen-app-dev-${{ matrix.component }}:latest
Expand Down
41 changes: 29 additions & 12 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,45 @@
FROM python:3.11-bullseye AS builder
FROM quay.io/rosalindfranklininstitute/blast:2.16.0 AS blast

RUN pip install --user wheel pipenv
FROM python:3.13-bookworm AS builder

RUN pip install --upgrade pip wheel pipenv

RUN adduser --uid 10191 --group --system --no-create-home nonroot
RUN chown -R nonroot:nonroot /usr/src
USER nonroot
WORKDIR /usr/src

# Tell pipenv to create venv in the current directory
ENV PIPENV_VENV_IN_PROJECT=1

COPY Pipfile Pipfile.lock pyproject.toml setup.cfg /usr/src/
WORKDIR /usr/src

RUN /root/.local/bin/pipenv sync
ARG PIPENV_SYNC_FLAGS=
RUN pipenv sync ${PIPENV_SYNC_FLAGS}

COPY manage.py /usr/src/
COPY antigendjango /usr/src/antigendjango
COPY antigenapi /usr/src/antigenapi

RUN DJANGO_CI=true .venv/bin/python manage.py collectstatic --noinput

FROM python:3.11-slim-bullseye AS prod
FROM python:3.13-slim-bookworm AS prod

# liblmdb-dev required by BLAST
RUN apt-get update && apt-get install -y \
libxml2 \
media-types \
liblmdb-dev \
&& rm -rf /var/lib/apt/lists/*

RUN mkdir -v /usr/src/.venv
RUN adduser --uid 10191 --group --system --no-create-home nonroot
RUN chown -R nonroot:nonroot /usr/src
USER nonroot
WORKDIR /usr/src

# blastp and makeblastdb commands
COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb

COPY --from=builder /usr/src/.venv/ /usr/src/.venv/
COPY --from=builder /usr/src/static/ /api_data/static/
Expand All @@ -34,16 +50,17 @@ COPY uwsgi.ini /usr/src

WORKDIR /usr/src

RUN addgroup --gid 10191 nonroot
RUN adduser --uid 10191 --gid 10191 --system --no-create-home nonroot
USER nonroot

CMD [".venv/bin/uwsgi", "--ini", "uwsgi.ini"]

FROM builder AS dev

ENV PATH="$PATH:/root/.local/bin:/usr/src/.venv/bin"
# liblmdb-dev required by BLAST
USER root
RUN apt update && apt install -y liblmdb-dev && rm -rf /var/lib/apt/lists/*
USER nonroot

RUN pipenv sync --dev
# blastp and makeblastdb commands
COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb

CMD ["pipenv", "run", "python", "manage.py", "runserver", "0.0.0.0:8080"]
328 changes: 163 additions & 165 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

Empty file.
137 changes: 137 additions & 0 deletions backend/antigenapi/bioinformatics/blast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import os
import subprocess
from tempfile import TemporaryDirectory
from typing import Optional

from ..models import SequencingRunResults
from .imgt import read_airr_file

# https://www.ncbi.nlm.nih.gov/books/NBK279684/table/appendices.T.options_common_to_all_blast/
BLAST_FMT_MULTIPLE_FILE_BLAST_JSON = "15"
BLAST_NUM_THREADS = 4


def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] = None):
"""Get the sequencing database in fasta format.
Args:
include_run (int, optional): Sequencing run ID to include.
Defaults to None.
exclude_run (int, optional): Sequencing run ID to exclude.
Defaults to None.
Returns:
str: Sequencing run as a FASTA format string
"""
fasta_data = ""
query = SequencingRunResults.objects.all()
if include_run:
query = query.filter(sequencing_run_id=include_run)
if exclude_run:
query = query.exclude(sequencing_run_id=exclude_run)
for sr in query:
airr_file = read_airr_file(
sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
)
airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
if not airr_file.empty:
for _, row in airr_file.iterrows():
fasta_data += f"> {row.sequence_id}\n"
fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
return fasta_data


def get_sequencing_run_fasta(sequencing_run_id: int):
"""Get sequencing run in BLAST format.
Args:
sequencing_run_id (int): Sequencing run ID
Returns:
str: Sequencing run as a FASTA format string
"""
return get_db_fasta(include_run=sequencing_run_id)


def run_blastp(
sequencing_run_id: int, outfmt: str = BLAST_FMT_MULTIPLE_FILE_BLAST_JSON
):
"""Run blastp for a sequencing run vs rest of database.
Args:
sequencing_run_id (int): Sequencing run ID.
Returns:
JSONResponse: Single file BLAST JSON
"""
db_data = get_db_fasta()
if not db_data:
return None
query_data = get_sequencing_run_fasta(sequencing_run_id)
if not query_data:
return None

# Write the DB to disk as .fasta format
with TemporaryDirectory() as tmp_dir:
fasta_filename = os.path.join(tmp_dir, "db.fasta")
with open(fasta_filename, "w") as f:
f.write(db_data)

# Run makeblastdb
mkdb_proc = subprocess.run(
[
"makeblastdb",
"-in",
"db.fasta",
"-dbtype",
"prot",
"-out",
"antigen.db",
],
capture_output=True,
cwd=tmp_dir,
)

if mkdb_proc.returncode != 0:
raise Exception(
f"makeblastdb returned exit code of "
f"{mkdb_proc.returncode}\n\n"
f"STDOUT: {mkdb_proc.stdout.decode()}\n\n"
f"STDERR: {mkdb_proc.stderr.decode()}"
)

# Write out query file
fasta_filename = os.path.join(tmp_dir, "query.fasta")
with open(fasta_filename, "w") as f:
f.write(query_data)

# Run blastp
blastp_proc = subprocess.run(
[
"blastp",
"-db",
"antigen.db",
"-query",
"query.fasta",
"-outfmt",
outfmt,
"-out",
"antigen.results",
"-num_threads",
str(BLAST_NUM_THREADS),
],
capture_output=True,
cwd=tmp_dir,
)

if blastp_proc.returncode != 0:
raise Exception(
f"blastp returned exit code of "
f"{blastp_proc.returncode}\n\n"
f"STDOUT: {blastp_proc.stdout.decode()}\n\n"
f"STDERR: {blastp_proc.stderr.decode()}"
)

# Read in the results file
with open(os.path.join(tmp_dir, "antigen.results"), "r") as f:
return f.read()
File renamed without changes.
2 changes: 1 addition & 1 deletion backend/antigenapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from django.db.models.signals import post_init, post_save

from .bioinformatics import read_airr_file
from .bioinformatics.imgt import read_airr_file


class Project(Model):
Expand Down
2 changes: 1 addition & 1 deletion backend/antigenapi/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
import zipfile

from antigenapi.bioinformatics import load_sequences, trim_sequence
from antigenapi.bioinformatics.imgt import load_sequences, trim_sequence
from antigenapi.views_old import _extract_well


Expand Down
87 changes: 76 additions & 11 deletions backend/antigenapi/views_old.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import collections.abc
import datetime
import io
import json
import math
import os
import re
Expand Down Expand Up @@ -36,7 +37,8 @@
from rest_framework.views import APIView
from rest_framework.viewsets import ModelViewSet

from antigenapi.bioinformatics import (
from antigenapi.bioinformatics.blast import get_db_fasta, run_blastp
from antigenapi.bioinformatics.imgt import (
AIRR_IMPORTANT_COLUMNS,
as_fasta_files,
load_sequences,
Expand Down Expand Up @@ -1052,20 +1054,83 @@ def search_sequencing_run_results(self, request, query):
{"matches": pd.concat(results).to_dict(orient="records") if results else []}
)

@action(
detail=True,
methods=["GET"],
name="BLAST sequencing run results vs DB.",
url_path="blast",
)
def get_blast_sequencing_run(self, request, pk):
"""BLAST sequencing run vs database."""
blast_str = run_blastp(pk)
if not blast_str:
return JsonResponse({"hits": []}, status=status.HTTP_404_NOT_FOUND)

# Read query AIRR files for CDRs
results = SequencingRunResults.objects.filter(sequencing_run_id=int(pk))

airr_df = pd.concat(read_airr_file(r.airr_file) for r in results)
airr_df = airr_df.set_index("sequence_id")

ALIGN_PERC_THRESHOLD = 90
E_VALUE_THRESHOLD = 0.05

# Parse the JSON and keep the important bits
blast_result = json.loads(blast_str)
res = []
for blast_run in blast_result["BlastOutput2"]:
run_res = blast_run["report"]["results"]["search"]
for blast_hit_set in run_res["hits"]:
subject_title = blast_hit_set["description"][0]["title"]
query_title = run_res["query_title"]
query_cdr3 = airr_df.at[query_title, "cdr3_aa"]
if pd.isna(query_cdr3):
query_cdr3 = None
if subject_title.strip() == query_title.strip():
continue
hsps = blast_hit_set["hsps"][0]
assert len(blast_hit_set["description"]) == 1
for hsps in blast_hit_set["hsps"]:
align_perc = round(
(hsps["align_len"] / run_res["query_len"]) * 100, 2
)
if align_perc < ALIGN_PERC_THRESHOLD:
continue
e_value = hsps["evalue"]
if e_value > E_VALUE_THRESHOLD:
continue
res.append(
{
"query_title": query_title.strip(),
"query_cdr3": query_cdr3,
"subject_title": subject_title.strip(),
"submatch_no": hsps["num"],
"query_seq": hsps["qseq"],
"subject_seq": hsps["hseq"],
"midline": hsps["midline"],
"bit_score": hsps["bit_score"],
"e_value": e_value,
"align_len": hsps["align_len"],
"align_perc": align_perc,
"ident_perc": round(
(hsps["identity"] / hsps["align_len"]) * 100, 2
),
}
)

# Sort the results
res = sorted(
res,
key=lambda r: (r["query_cdr3"] or "", -r["align_perc"], -r["ident_perc"]),
)

return JsonResponse({"hits": res})


class GlobalFastaView(APIView):
def get(self, request, format=None):
"""Download entire database as .fasta file."""
fasta_data = ""
for sr in SequencingRunResults.objects.all():
airr_file = read_airr_file(
sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
)
airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
if not airr_file.empty:
for _, row in airr_file.iterrows():
fasta_data += f"> {row.sequence_id}\n"
fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
fasta_data = get_db_fasta()

fasta_filename = (
f"antigenapp_database_{datetime.datetime.now().isoformat()}.fasta"
Expand Down
10 changes: 7 additions & 3 deletions backend/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ install_requires =

[options.extras_require]
dev =
black==24.3.0
isort>5.0
flake8<4
black
isort
flake8
pytest
pytest-cov
pytest-black
Expand All @@ -37,6 +37,7 @@ dev =
pytest-flake8
pytest-black
pytest-pydocstyle
types-pytz
flake8-isort


Expand All @@ -57,6 +58,9 @@ float_to_top=true
max-line-length = 88
extend-ignore =
E203,
E231, # false positive
E702, # false positive
E713, # false positive
F811,
F722,
exclude =
Expand Down
Loading

0 comments on commit 207cffd

Please sign in to comment.