Skip to content

Commit

Permalink
WIP #25
Browse files Browse the repository at this point in the history
  • Loading branch information
Adafede committed Jan 3, 2024
1 parent 20c488e commit a9f9909
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 21 deletions.
11 changes: 3 additions & 8 deletions chemistry_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,6 @@ def fingerprint(mol):
return fpgen.GetFingerprint(mol)


def process_smol_and_wid(smol_and_wid):
    """Convert (molecule, identifier) pairs into (identifier, mol block) pairs.

    Args:
        smol_and_wid: Iterable of ``(smol, wid)`` pairs. Each ``smol`` is
            passed to ``Chem.MolToMolBlock``; ``wid`` is carried through
            unchanged.

    Returns:
        A list of ``(wid, mol_block)`` tuples in input order. An empty
        input yields an empty list.
    """
    # NOTE(review): ``Chem`` is presumably ``rdkit.Chem`` imported at module
    # level -- the import is not visible in this excerpt; confirm.
    return [(wid, Chem.MolToMolBlock(smol)) for smol, wid in smol_and_wid]


def standardize(mol):
clean_mol = rdMolStandardize.Cleanup(mol)
bigger_clean = rdMolStandardize.FragmentParent(clean_mol)
Expand All @@ -50,8 +43,9 @@ def process_smiles(inp):
nid, smiles = inp
mol = Chem.MolFromSmiles(smiles)
smol = standardize(mol)
smiles_clean = Chem.MolToSmiles(smol)
if smol is not None:
smiles_clean = Chem.MolToSmiles(smol)
mol_block = Chem.MolToMolBlock(smol)
sim_fp = fingerprint(smol)
sub_fp = Chem.PatternFingerprint(smol)
smol_h = Chem.AddHs(smol)
Expand All @@ -62,6 +56,7 @@ def process_smiles(inp):
smiles,
smol,
smiles_clean,
mol_block,
sim_fp,
sub_fp,
smol_h.ToBinary(),
Expand Down
24 changes: 11 additions & 13 deletions update/generate_database_chemo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import multiprocessing
import pickle
from concurrent.futures import ProcessPoolExecutor
# from itertools import islice
from pathlib import Path

from rdkit import RDLogger
from rdkit.Chem import Mol, rdSubstructLibrary

from chemistry_helpers import process_smiles, process_smol_and_wid, write_mols_to_sdf
from chemistry_helpers import process_smiles, write_mols_to_sdf

RDLogger.DisableLog("rdApp.*")
logging.basicConfig(
Expand All @@ -23,6 +24,7 @@ def run(path: Path) -> None:
with open(path / "structures.csv", "r") as f:
reader = csv.reader(f)
next(reader)
# for x in islice(reader, 1000):
for x in reader:
c, smi, cano = x
if smi == "":
Expand All @@ -42,6 +44,7 @@ def run(path: Path) -> None:

library_h = rdSubstructLibrary.SubstructLibrary(mols_h, fps_h)

sdf_blocks = []
p_smileses = []
p_smols = []
p_sim_fps = []
Expand All @@ -53,10 +56,11 @@ def run(path: Path) -> None:
for result in results:
if result is not None:
(
mid,
nid,
smiles,
smol,
smiles_clean,
mol_block,
sim_fp,
sub_fp,
mol_h,
Expand All @@ -69,13 +73,14 @@ def run(path: Path) -> None:
p_sim_h_fps.append(sim_fp_h)

smis.AddSmiles(smiles_clean)
sdf_blocks.append((links[nid], mol_block))
fps.AddFingerprint(sub_fp)
p_sim_fps.append(sim_fp)

p_smols.append(smol)
p_smileses.append(smiles)

p_links.append(links[mid])
p_links.append(links[nid])

logging.info("Finished generating the chemical libraries")

Expand All @@ -88,6 +93,7 @@ def run(path: Path) -> None:

database = {
"structure_wid": p_links,
# TODO: add the mol blocks (sdf_blocks) to the database if needed
"structure_sim_fps": p_sim_fps,
"structure_sim_h_fps": p_sim_h_fps,
"structure_library": library.Serialize(),
Expand All @@ -99,16 +105,8 @@ def run(path: Path) -> None:
pickle.dump(database, f)
logging.info("Finished dumping")

logging.info("Starting SDF")
smols_and_wids = list(zip(p_smols, p_links))

with ProcessPoolExecutor(max_workers=max_workers) as executor:
chunks = [
smols_and_wids[i : i + 1000] for i in range(0, len(smols_and_wids), 1000)
]
sdf_blocks_list = list(executor.map(process_smol_and_wid, chunks))
sdf_blocks = [block for sublist in sdf_blocks_list for block in sublist]
write_mols_to_sdf(path, sdf_blocks)
logging.info("Exporting SDF")
write_mols_to_sdf(path, sdf_blocks)

logging.info("Finished exporting")

Expand Down

0 comments on commit a9f9909

Please sign in to comment.