Skip to content

Commit

Permalink
WIP #25 (multiple SDFs to check)
Browse files Browse the repository at this point in the history
  • Loading branch information
Adafede committed Mar 13, 2024
1 parent 369df8f commit 398e3d7
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 13 deletions.
5 changes: 2 additions & 3 deletions api/queries.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from datetime import datetime
from typing import Any

from datetime import datetime
from fastapi import HTTPException

from api.models import (
Expand Down Expand Up @@ -91,7 +91,6 @@ def structures_from_structure_in_item(dm: DataModel, item: Item) -> set[int] | N
formula = item.structure.formula
sub = item.structure.option.substructure_search
sim = item.structure.option.similarity_level
sdf = item.structure.option.sdf
desc = item.structure.option.descriptors

args = len([param for param in [wid, molecule, formula] if param is not None])
Expand Down Expand Up @@ -279,7 +278,7 @@ def get_structures_for_item(item: Item, dm: DataModel) -> dict[int, str]:
)

return dm.get_structure_object_from_dict_of_sids(
ids, item.structure.option.descriptors
ids, item.structure.option.descriptors, item.structure.option.sdf
)


Expand Down
37 changes: 32 additions & 5 deletions model/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@
from rdkit import Chem, DataStructs
from rdkit.Chem import rdSubstructLibrary
from sqlalchemy.orm import aliased
from api.models import (
ReferenceObject,
StructureObject,
TaxonObject,
)

from api.models import ReferenceObject, StructureObject, TaxonObject
from chemistry_helpers import fingerprint, standardize
from sdf_helpers import find_structures_bytes_ranges, mmap_file, read_selected_ranges
from storage.models import (
Journals,
References,
Expand All @@ -38,6 +36,8 @@
class DataModel:
def __init__(self, path: Path = Path("./data")):
self.db = self.load_all_data(path)
self.sdf = self.load_sdf_data(path)
self.sdf_ranges = self.load_sdf_ranges(self.sdf)
self.storage = Storage(path)
self.taxa_name_db = self.preload_taxa()
self.path = path
Expand All @@ -57,6 +57,18 @@ def load_all_data(cls, path: Path):
data["structure_library_h"] = new_lib_h
return data

@classmethod
@functools.lru_cache(maxsize=None)
def load_sdf_data(cls, path: Path):
    """Memory-map the LOTUS SDF file located under *path*.

    Cached per ``path`` via ``lru_cache`` so the file is mapped only once
    for the lifetime of the process.
    """
    return mmap_file(path / "lotus.sdf")

@classmethod
@functools.lru_cache(maxsize=None)
def load_sdf_ranges(cls, sdf):
    """Index *sdf*: map each structure id to its (start, end) byte range.

    Cached per ``sdf`` object so the scan over the memory-mapped file
    happens only once.
    """
    return find_structures_bytes_ranges(sdf)

### Taxonomy
def get_taxon_object_from_dict_of_tids(
self, tids: Iterable[int]
Expand Down Expand Up @@ -141,10 +153,20 @@ def structures_set(self) -> set[int]:
def get_structure_object_from_sid(self, sid: int) -> dict | None:
    """Return the structure-object mapping for a single structure id.

    Thin convenience wrapper delegating to the batch lookup with a
    one-element list.
    """
    # NOTE(review): the batch lookup is annotated as returning
    # dict[int, StructureObject]; the `| None` here suggests it can also
    # return None (e.g. no rows found) — confirm against the batch method.
    return self.get_structure_object_from_dict_of_sids([sid])

def get_structure_sdf_from_dict_of_sids(self, sids: Iterable[int]) -> str:
    """Concatenate the SDF blocks for the given structure ids.

    Each sid's precomputed (start, end) byte range is looked up in
    ``self.sdf_ranges`` and read from the memory-mapped SDF file.

    Raises:
        KeyError: if a sid has no byte range in ``self.sdf_ranges``.
    """
    # Bug fix: the original annotated the return type as
    # Iterable[tuple[int, str]] while actually returning "".join(blocks),
    # i.e. a single concatenated string — the annotation now matches.
    ranges = self.sdf_ranges
    return "".join(
        read_selected_ranges(self.sdf, [ranges[sid]]) for sid in sids
    )

def get_structure_object_from_dict_of_sids(
self,
sids: Iterable[int],
descriptors: bool | dict = False,
sdf: bool = False,
) -> dict[int, StructureObject]:
with self.storage.session() as session:
if descriptors == True:
Expand Down Expand Up @@ -178,6 +200,10 @@ def get_structure_object_from_dict_of_sids(
.filter(Structures.id.in_(sids))
.all()
)
if sdf:
blocks = self.get_structure_sdf_from_dict_of_sids(sids)
else:
blocks = None
if result:
return {
row.id: StructureObject(
Expand All @@ -188,6 +214,7 @@ def get_structure_object_from_dict_of_sids(
inchikey=row.inchikey,
inchikey_no_stereo=row.inchikey_no_stereo,
formula=row.formula,
sdf=blocks,
)
for row in result
}
Expand Down
4 changes: 2 additions & 2 deletions sdf_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ def find_structures_bytes_ranges(mmapped_file: mmap.mmap) -> Dict[int, Tuple[int

def read_selected_ranges(
    mmapped_file: mmap.mmap, ranges_to_read: List[Tuple[int, int]]
) -> str:
    """Read the given byte ranges from *mmapped_file* and concatenate them.

    Args:
        mmapped_file: a memory-mapped file (any bytes-like sliceable works).
        ranges_to_read: (start, end) byte offsets, end-exclusive.

    Returns:
        The decoded (UTF-8) contents of all ranges joined into one string;
        the empty string when ``ranges_to_read`` is empty.
    """
    # The source span showed both the pre- and post-change lines of this
    # function (diff artifact); this is the resolved post-change version
    # returning a single joined string rather than a list.
    selected_lines: Deque[str] = deque()

    for start, end in ranges_to_read:
        selected_lines.append(mmapped_file[start:end].decode())

    return "".join(selected_lines)


def write_mols_to_sdf(path: Path, sdf_blocks: Iterable[Tuple[int, str]]) -> None:
Expand Down
21 changes: 21 additions & 0 deletions tests/test_api_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,27 @@ async def test_search_structures_by_similarity_explicit_h(self, data_model):
assert result.count == 1
assert result.objects[3].smiles == "C"

async def test_search_structures_sdf(self, data_model):
    """Searching methane with sdf=True attaches its V2000 block to the result object."""
    # Explicit-H SMILES for methane; substructure search disabled so this
    # is an exact-structure lookup.
    item = Item(
        structure={
            "molecule": "C([H])([H])([H])([H])",
            "option": {
                "sdf": True,
                "substructure_search": False,
            },
        },
        limit=10,
        modeEnum="objects",
    )
    result = await search_structures(item=item, dm=data_model)
    assert result.count == 1
    # sid 3 is methane in the test fixture; the expected block matches the
    # RDKit V2000 output checked in tests/test_update.py.
    assert (
        result.objects[3].sdf
        == "\n RDKit 2D\n\n 1 0 0 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\nM END\n> <WID> (3) \n3\n\n"
    )

# TODO test for multiple ones (SDF)

async def test_search_structures_by_substructure_limits(self, data_model):
item = Item(
structure={"molecule": "C", "option": {"substructure_search": True}},
Expand Down
4 changes: 1 addition & 3 deletions tests/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,7 @@ def test_sdf(self, data_model):

ranges_to_read = [ranges[key] for key in list(ranges.keys())]
block = read_selected_ranges(mmaped_sdf_generated, [ranges_to_read[2]])
block_expected = [
"""\n RDKit 2D\n\n 1 0 0 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\nM END\n> <WID> (3) \n3\n\n"""
]
block_expected = """\n RDKit 2D\n\n 1 0 0 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\nM END\n> <WID> (3) \n3\n\n"""
assert (
block == block_expected
), f"Content mismatch between {block} and {block_expected}"
Expand Down

0 comments on commit 398e3d7

Please sign in to comment.