Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: blast search dataset vs database #99

Merged
merged 9 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ jobs:
with:
context: ${{ matrix.component }}
target: dev
build-args: |
PIPENV_SYNC_FLAGS=--dev
platforms: linux/amd64
load: true
tags: antigen-app-dev-${{ matrix.component }}:latest
Expand Down
41 changes: 29 additions & 12 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,45 @@
FROM python:3.11-bullseye AS builder
FROM quay.io/rosalindfranklininstitute/blast:2.16.0 AS blast

RUN pip install --user wheel pipenv
FROM python:3.13-bookworm AS builder

RUN pip install --upgrade pip wheel pipenv

RUN adduser --uid 10191 --group --system --no-create-home nonroot
RUN chown -R nonroot:nonroot /usr/src
USER nonroot
WORKDIR /usr/src

# Tell pipenv to create venv in the current directory
ENV PIPENV_VENV_IN_PROJECT=1

COPY Pipfile Pipfile.lock pyproject.toml setup.cfg /usr/src/
WORKDIR /usr/src

RUN /root/.local/bin/pipenv sync
ARG PIPENV_SYNC_FLAGS=
RUN pipenv sync ${PIPENV_SYNC_FLAGS}

COPY manage.py /usr/src/
COPY antigendjango /usr/src/antigendjango
COPY antigenapi /usr/src/antigenapi

RUN DJANGO_CI=true .venv/bin/python manage.py collectstatic --noinput

FROM python:3.11-slim-bullseye AS prod
FROM python:3.13-slim-bookworm AS prod

# liblmdb-dev required by BLAST
RUN apt-get update && apt-get install -y \
libxml2 \
media-types \
liblmdb-dev \
&& rm -rf /var/lib/apt/lists/*

RUN mkdir -v /usr/src/.venv
RUN adduser --uid 10191 --group --system --no-create-home nonroot
RUN chown -R nonroot:nonroot /usr/src
USER nonroot
WORKDIR /usr/src

# blastp and makeblastdb commands
COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb

COPY --from=builder /usr/src/.venv/ /usr/src/.venv/
COPY --from=builder /usr/src/static/ /api_data/static/
Expand All @@ -34,16 +50,17 @@ COPY uwsgi.ini /usr/src

WORKDIR /usr/src

RUN addgroup --gid 10191 nonroot
RUN adduser --uid 10191 --gid 10191 --system --no-create-home nonroot
USER nonroot

CMD [".venv/bin/uwsgi", "--ini", "uwsgi.ini"]

FROM builder AS dev

ENV PATH="$PATH:/root/.local/bin:/usr/src/.venv/bin"
# liblmdb-dev required by BLAST
USER root
RUN apt update && apt install -y liblmdb-dev && rm -rf /var/lib/apt/lists/*
USER nonroot

RUN pipenv sync --dev
# blastp and makeblastdb commands
COPY --from=blast /blast/ReleaseMT/bin/blastp /usr/local/bin/blastp
COPY --from=blast /blast/ReleaseMT/bin/makeblastdb /usr/local/bin/makeblastdb

CMD ["pipenv", "run", "python", "manage.py", "runserver", "0.0.0.0:8080"]
328 changes: 163 additions & 165 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

Empty file.
137 changes: 137 additions & 0 deletions backend/antigenapi/bioinformatics/blast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import os
import subprocess
from tempfile import TemporaryDirectory
from typing import Optional

from ..models import SequencingRunResults
from .imgt import read_airr_file

# https://www.ncbi.nlm.nih.gov/books/NBK279684/table/appendices.T.options_common_to_all_blast/
BLAST_FMT_MULTIPLE_FILE_BLAST_JSON = "15"
BLAST_NUM_THREADS = 4


def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] = None):
"""Get the sequencing database in fasta format.

Args:
include_run (int, optional): Sequencing run ID to include.
Defaults to None.
exclude_run (int, optional): Sequencing run ID to exclude.
Defaults to None.

Returns:
str: Sequencing run as a FASTA format string
"""
fasta_data = ""
query = SequencingRunResults.objects.all()
if include_run:
query = query.filter(sequencing_run_id=include_run)
if exclude_run:
query = query.exclude(sequencing_run_id=exclude_run)
for sr in query:
airr_file = read_airr_file(
sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
)
airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
if not airr_file.empty:
for _, row in airr_file.iterrows():
fasta_data += f"> {row.sequence_id}\n"
fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
return fasta_data


def get_sequencing_run_fasta(sequencing_run_id: int):
"""Get sequencing run in BLAST format.

Args:
sequencing_run_id (int): Sequencing run ID

Returns:
str: Sequencing run as a FASTA format string
"""
return get_db_fasta(include_run=sequencing_run_id)


def run_blastp(
sequencing_run_id: int, outfmt: str = BLAST_FMT_MULTIPLE_FILE_BLAST_JSON
):
"""Run blastp for a sequencing run vs rest of database.

Args:
sequencing_run_id (int): Sequencing run ID.

Returns:
JSONResponse: Single file BLAST JSON
"""
db_data = get_db_fasta()
if not db_data:
return None
query_data = get_sequencing_run_fasta(sequencing_run_id)
if not query_data:
return None

# Write the DB to disk as .fasta format
with TemporaryDirectory() as tmp_dir:
fasta_filename = os.path.join(tmp_dir, "db.fasta")
with open(fasta_filename, "w") as f:
f.write(db_data)

# Run makeblastdb
mkdb_proc = subprocess.run(
[
"makeblastdb",
"-in",
"db.fasta",
"-dbtype",
"prot",
"-out",
"antigen.db",
],
capture_output=True,
cwd=tmp_dir,
)

if mkdb_proc.returncode != 0:
raise Exception(
f"makeblastdb returned exit code of "
f"{mkdb_proc.returncode}\n\n"
f"STDOUT: {mkdb_proc.stdout.decode()}\n\n"
f"STDERR: {mkdb_proc.stderr.decode()}"
)

# Write out query file
fasta_filename = os.path.join(tmp_dir, "query.fasta")
with open(fasta_filename, "w") as f:
f.write(query_data)

# Run blastp
blastp_proc = subprocess.run(
[
"blastp",
"-db",
"antigen.db",
"-query",
"query.fasta",
"-outfmt",
outfmt,
"-out",
"antigen.results",
"-num_threads",
str(BLAST_NUM_THREADS),
],
capture_output=True,
cwd=tmp_dir,
)

if blastp_proc.returncode != 0:
raise Exception(
f"blastp returned exit code of "
f"{blastp_proc.returncode}\n\n"
f"STDOUT: {blastp_proc.stdout.decode()}\n\n"
f"STDERR: {blastp_proc.stderr.decode()}"
)

# Read in the results file
with open(os.path.join(tmp_dir, "antigen.results"), "r") as f:
return f.read()
2 changes: 1 addition & 1 deletion backend/antigenapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from django.db.models.signals import post_init, post_save

from .bioinformatics import read_airr_file
from .bioinformatics.imgt import read_airr_file


class Project(Model):
Expand Down
2 changes: 1 addition & 1 deletion backend/antigenapi/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
import zipfile

from antigenapi.bioinformatics import load_sequences, trim_sequence
from antigenapi.bioinformatics.imgt import load_sequences, trim_sequence
from antigenapi.views_old import _extract_well


Expand Down
87 changes: 76 additions & 11 deletions backend/antigenapi/views_old.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import collections.abc
import datetime
import io
import json
import math
import os
import re
Expand Down Expand Up @@ -36,7 +37,8 @@
from rest_framework.views import APIView
from rest_framework.viewsets import ModelViewSet

from antigenapi.bioinformatics import (
from antigenapi.bioinformatics.blast import get_db_fasta, run_blastp
from antigenapi.bioinformatics.imgt import (
AIRR_IMPORTANT_COLUMNS,
as_fasta_files,
load_sequences,
Expand Down Expand Up @@ -1052,20 +1054,83 @@ def search_sequencing_run_results(self, request, query):
{"matches": pd.concat(results).to_dict(orient="records") if results else []}
)

@action(
detail=True,
methods=["GET"],
name="BLAST sequencing run results vs DB.",
url_path="blast",
)
def get_blast_sequencing_run(self, request, pk):
"""BLAST sequencing run vs database."""
blast_str = run_blastp(pk)
if not blast_str:
return JsonResponse({"hits": []}, status=status.HTTP_404_NOT_FOUND)

# Read query AIRR files for CDRs
results = SequencingRunResults.objects.filter(sequencing_run_id=int(pk))

airr_df = pd.concat(read_airr_file(r.airr_file) for r in results)
airr_df = airr_df.set_index("sequence_id")

ALIGN_PERC_THRESHOLD = 90
E_VALUE_THRESHOLD = 0.05

# Parse the JSON and keep the important bits
blast_result = json.loads(blast_str)
res = []
for blast_run in blast_result["BlastOutput2"]:
run_res = blast_run["report"]["results"]["search"]
for blast_hit_set in run_res["hits"]:
subject_title = blast_hit_set["description"][0]["title"]
query_title = run_res["query_title"]
query_cdr3 = airr_df.at[query_title, "cdr3_aa"]
if pd.isna(query_cdr3):
query_cdr3 = None
if subject_title.strip() == query_title.strip():
continue
hsps = blast_hit_set["hsps"][0]
assert len(blast_hit_set["description"]) == 1
for hsps in blast_hit_set["hsps"]:
align_perc = round(
(hsps["align_len"] / run_res["query_len"]) * 100, 2
)
if align_perc < ALIGN_PERC_THRESHOLD:
continue
e_value = hsps["evalue"]
if e_value > E_VALUE_THRESHOLD:
continue
res.append(
{
"query_title": query_title.strip(),
"query_cdr3": query_cdr3,
"subject_title": subject_title.strip(),
"submatch_no": hsps["num"],
"query_seq": hsps["qseq"],
"subject_seq": hsps["hseq"],
"midline": hsps["midline"],
"bit_score": hsps["bit_score"],
"e_value": e_value,
"align_len": hsps["align_len"],
"align_perc": align_perc,
"ident_perc": round(
(hsps["identity"] / hsps["align_len"]) * 100, 2
),
}
)

# Sort the results
res = sorted(
res,
key=lambda r: (r["query_cdr3"] or "", -r["align_perc"], -r["ident_perc"]),
)

return JsonResponse({"hits": res})


class GlobalFastaView(APIView):
def get(self, request, format=None):
"""Download entire database as .fasta file."""
fasta_data = ""
for sr in SequencingRunResults.objects.all():
airr_file = read_airr_file(
sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
)
airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
if not airr_file.empty:
for _, row in airr_file.iterrows():
fasta_data += f"> {row.sequence_id}\n"
fasta_data += f"{row.sequence_alignment_aa.replace('.', '')}\n"
fasta_data = get_db_fasta()

fasta_filename = (
f"antigenapp_database_{datetime.datetime.now().isoformat()}.fasta"
Expand Down
10 changes: 7 additions & 3 deletions backend/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ install_requires =

[options.extras_require]
dev =
black==24.3.0
isort>5.0
flake8<4
black
isort
flake8
pytest
pytest-cov
pytest-black
Expand All @@ -37,6 +37,7 @@ dev =
pytest-flake8
pytest-black
pytest-pydocstyle
types-pytz
flake8-isort


Expand All @@ -57,6 +58,9 @@ float_to_top=true
max-line-length = 88
extend-ignore =
E203,
E231, # false positive
E702, # false positive
E713, # false positive
F811,
F722,
exclude =
Expand Down
Loading