Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: blast by cdr3 #101

Merged
merged 3 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 42 additions & 17 deletions backend/antigenapi/bioinformatics/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,20 @@
BLAST_NUM_THREADS = 4


def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] = None):
def get_db_fasta(
include_run: Optional[int] = None,
exclude_run: Optional[int] = None,
query_type: str = "full",
):
"""Get the sequencing database in fasta format.

Args:
include_run (int, optional): Sequencing run ID to include.
Defaults to None.
exclude_run (int, optional): Sequencing run ID to exclude.
Defaults to None.
query_type (str): Query type - "full" sequence or "cdr3"
Defaults to "full".

Returns:
str: Sequencing run as a FASTA format string
Expand All @@ -31,20 +37,35 @@ def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] =
query = query.exclude(sequencing_run_id=exclude_run)
for sr in query:
airr_file = read_airr_file(
sr.airr_file, usecols=("sequence_id", "sequence_alignment_aa")
sr.airr_file,
usecols=(
"sequence_id",
"cdr3_aa" if query_type == "cdr3" else "sequence_alignment_aa",
),
)
airr_file = airr_file[airr_file.sequence_alignment_aa.notna()]
airr_file = airr_file[
(
airr_file.cdr3_aa.notna()
if query_type == "cdr3"
else airr_file.sequence_alignment_aa.notna()
)
]
if not airr_file.empty:
for _, row in airr_file.iterrows():
seq = row.sequence_alignment_aa.replace(".", "")
try:
if fasta_data[row.sequence_id] != seq:
raise ValueError(
f"Different sequences with same name! {row.sequence_id}"
)
continue
except KeyError:
fasta_data[row.sequence_id] = seq
if query_type == "cdr3":
cdr3s = set(airr_file.cdr3_aa.unique())
for cdr3 in cdr3s:
fasta_data[f"CDR3: {cdr3}"] = cdr3
else:
for _, row in airr_file.iterrows():
seq = row.sequence_alignment_aa.replace(".", "")
try:
if fasta_data[row.sequence_id] != seq:
raise ValueError(
f"Different sequences with same name! {row.sequence_id}"
)
continue
except KeyError:
fasta_data[row.sequence_id] = seq

fasta_files = as_fasta_files(fasta_data, max_file_size=None)
if fasta_files:
Expand All @@ -53,33 +74,37 @@ def get_db_fasta(include_run: Optional[int] = None, exclude_run: Optional[int] =
return ""


def get_sequencing_run_fasta(sequencing_run_id: int):
def get_sequencing_run_fasta(sequencing_run_id: int, query_type: str):
"""Get sequencing run in BLAST format.

Args:
sequencing_run_id (int): Sequencing run ID
query_type (str): Query type - "full" sequence or "cdr3"

Returns:
str: Sequencing run as a FASTA format string
"""
return get_db_fasta(include_run=sequencing_run_id)
return get_db_fasta(include_run=sequencing_run_id, query_type=query_type)


def run_blastp(
sequencing_run_id: int, outfmt: str = BLAST_FMT_MULTIPLE_FILE_BLAST_JSON
sequencing_run_id: int,
query_type: str = "full",
outfmt: str = BLAST_FMT_MULTIPLE_FILE_BLAST_JSON,
):
"""Run blastp for a sequencing run vs rest of database.

Args:
sequencing_run_id (int): Sequencing run ID.
query_type (str): Query type - "full" sequence or "cdr3"

Returns:
JSONResponse: Single file BLAST JSON
"""
db_data = get_db_fasta()
if not db_data:
return None
query_data = get_sequencing_run_fasta(sequencing_run_id)
query_data = get_sequencing_run_fasta(sequencing_run_id, query_type=query_type)
if not query_data:
return None

Expand Down
15 changes: 11 additions & 4 deletions backend/antigenapi/views_old.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,7 +1062,10 @@ def search_sequencing_run_results(self, request, query):
)
def get_blast_sequencing_run(self, request, pk):
"""BLAST sequencing run vs database."""
blast_str = run_blastp(pk)
query_type = self.request.query_params.get("queryType", "full")
if query_type not in ("full", "cdr3"):
raise ValueError(f"Unknown queryType: {query_type}")
blast_str = run_blastp(pk, query_type=query_type)
if not blast_str:
return JsonResponse({"hits": []}, status=status.HTTP_404_NOT_FOUND)

Expand All @@ -1083,9 +1086,13 @@ def get_blast_sequencing_run(self, request, pk):
for blast_hit_set in run_res["hits"]:
subject_title = blast_hit_set["description"][0]["title"]
query_title = run_res["query_title"]
query_cdr3 = airr_df.at[query_title, "cdr3_aa"]
if pd.isna(query_cdr3):
query_cdr3 = None
if query_type == "cdr3":
# TODO: Make more robust
query_cdr3 = query_title[6:]
else:
query_cdr3 = airr_df.at[query_title, "cdr3_aa"]
if pd.isna(query_cdr3):
query_cdr3 = None
if subject_title.strip() == query_title.strip():
continue
hsps = blast_hit_set["hsps"][0]
Expand Down
184 changes: 171 additions & 13 deletions frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "0.1.0",
"private": true,
"dependencies": {
"@headlessui/react": "^1.7.19",
"@headlessui/react": "2.1",
"@heroicons/react": "^2.1.5",
"@sentry/react": "^7.119.1",
"@sentry/tracing": "^7.114.0",
Expand Down
Loading