Skip to content

Commit

Permalink
Gunzip agnostic (#34)
Browse files Browse the repository at this point in the history
* handling opening with hook_compressed

* swap everything to fileinput

* bugfix
  • Loading branch information
drewnutt authored Oct 20, 2023
1 parent 919c32b commit 7b87b62
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 82 deletions.
24 changes: 14 additions & 10 deletions open_combind/dock/ligand_handling.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import requests
# import re
import urllib.parse
from rdkit import Chem
from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate
from prody import writePDBStream
import gzip
# from bs4 import BeautifulSoup
import fileinput


class RDKitParseException(Exception):
Expand Down Expand Up @@ -211,14 +209,20 @@ def get_ligand_info_RCSB(pdb_id):
return entry_ligands_info

def mol_to_sdf(mol, path_to_sdf):
if path_to_sdf.endswith('.gz'):
with gzip.open(path_to_sdf,'wt') as f:
with Chem.SDWriter(f) as w:
w.write(mol)
else:
with Chem.SDWriter(path_to_sdf) as w:
w.write(mol)
"""
Writes an RDKit molecule to an SDF file.
Parameters
----------
mol : :class:`~rdkit.Chem.rdchem.Mol`
RDKit molecule to be written to an SDF file.
path_to_sdf : str
Path to the output SDF file. Can be gzipped.
"""
with fileinput.hook_compressed(path_to_sdf,'wt') as f:
with Chem.SDWriter(f) as w:
w.write(mol)

class DummyMolBlock():
def __init__(self):
Expand Down
9 changes: 3 additions & 6 deletions open_combind/dock/ligprep.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import argparse
import gzip
import fileinput
from rdkit.Chem import AllChem as Chem

def construct_set_conformers(mol, *, num_confs, confgen, seed=-1):
Expand Down Expand Up @@ -244,11 +244,8 @@ def ligsplit(big_sdf, root, multiplex=False, name_prop='BindingDB MonomerID',
The number of conformers to write to the file.
"""

if os.path.splitext(big_sdf)[-1] == ".gz":
big_sdf_data = gzip.open(big_sdf)
else:
big_sdf_data = open(big_sdf, 'rb')
ligands = Chem.ForwardSDMolSupplier(big_sdf_data)
with fileinput.hook_compressed(big_sdf_data, 'rb') as bsd:
ligands = Chem.ForwardSDMolSupplier(big_sdf_data)
unfinished = []
for count, ligand in enumerate(ligands):
het_id = ligand.GetProp('Ligand HET ID in PDB')
Expand Down
14 changes: 3 additions & 11 deletions open_combind/dock/postprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python3

import argparse
import gzip
import fileinput
import os
from rdkit.Chem import ForwardSDMolSupplier, SDWriter
from rdkit.Chem.rdMolAlign import CalcRMS
Expand Down Expand Up @@ -33,11 +33,7 @@ def coalesce_poses(docked_files, sort_by='CNNscore', filter_RMSD=None, reverse=T
if isinstance(docked_files, str):
docked_files = [docked_files]
for dock_file in docked_files:
if dock_file.endswith('.gz'):
openfile = gzip.open
else:
openfile = open
with openfile(dock_file,'rb') as gz:
with fileinput.hook_compressed(dock_file,'rb') as gz:
mols += [m for m in ForwardSDMolSupplier(gz)]
sorted_mols = sorted(mols, key=lambda x: float(x.GetProp(sort_by)),reverse=reverse)

Expand All @@ -61,11 +57,7 @@ def write_poses(sorted_mols, out_file):
Output file to write the poses to
"""

if out_file.endswith('.gz'):
openfile=gzip.open
else:
openfile=open
with openfile(out_file,'wt') as gz:
with fileinput.hook_compressed(out_file,'wt') as gz:
writer = SDWriter(gz)
for sm in sorted_mols:
writer.write(sm)
Expand Down
57 changes: 28 additions & 29 deletions open_combind/features/features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import fileinput
import os
import gzip
import numpy as np
import pandas as pd
from glob import glob
Expand Down Expand Up @@ -114,22 +114,20 @@ def get_molecules_from_files(self, pvs, native=False, center_ligand=None):
for pv in pvs:
# mol_bundle = Chem.FixedMolSizeMolBundle()
mol_bundle = []
pv_open = pv
if pv.endswith('.gz'):
pv_open = gzip.open(pv)
mol_suppl = Chem.ForwardSDMolSupplier(pv_open)
mol_count = 0
for mol in mol_suppl:
lig_centroid = ComputeCentroid(mol.GetConformer())
displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
# print(distance)
if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5):
print(f"skipped for {pv}")
continue
mol_bundle.append(mol)
mol_count += 1
if mol_count == self.max_poses:
break
with fileinput.hook_compressed(pv,'rb') as pv_open:
mol_suppl = Chem.ForwardSDMolSupplier(pv_open)
mol_count = 0
for mol in mol_suppl:
lig_centroid = ComputeCentroid(mol.GetConformer())
displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
# print(distance)
if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5):
print(f"skipped for {pv}")
continue
mol_bundle.append(mol)
mol_count += 1
if mol_count == self.max_poses:
break
if (native is False) and (mol_count != self.max_poses):
print(f"Did not get {self.max_poses} poses for {pv}, only {len(mol_bundle)} poses")
print(mol_count,pv)
Expand Down Expand Up @@ -269,18 +267,19 @@ def load_single_features(self, pvs, ligands=None, center_ligand=None):
_ifps = [_ifps.loc[_ifps.pose==p] for p in range(max(_ifps.pose)+1)]

#Need to check for if ligand is centered here.
sts = Chem.ForwardSDMolSupplier(gzip.open(pv))
_poses = []
for st in sts:
if center_ligand is not None:
lig_centroid = ComputeCentroid(st.GetConformer())
displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
# print(distance)
if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5:
print(f"skipped for {pv}")
continue
_poses.append(st)
print(len(poses))
with fileinput.hook_compressed(pv, 'rb') as pv_open:
sts = Chem.ForwardSDMolSupplier(pv_open)
_poses = []
for st in sts:
if center_ligand is not None:
lig_centroid = ComputeCentroid(st.GetConformer())
displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
# print(distance)
if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5:
print(f"skipped for {pv}")
continue
_poses.append(st)
print(len(poses))

keep = []
for i in range(len(_names)):
Expand Down
12 changes: 5 additions & 7 deletions open_combind/features/ifp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Compute interaction fingerprints for poseviewer files.
"""

import fileinput
import tempfile
import os
import re
Expand All @@ -10,7 +11,6 @@
import pandas as pd
from rdkit.Chem import MolFromSmarts,ForwardSDMolSupplier,MolFromPDBFile, AddHs
from rdkit.Chem.rdForceFieldHelpers import UFFGetMoleculeForceField, OptimizeMolecule
import gzip

def resname(atom):
"""
Expand Down Expand Up @@ -825,7 +825,7 @@ def fingerprint_poseviewer(input_file, poses, settings):
Parameters
----------
input_file : str
Path to the gzipped SDF file containing all of the ligand poses
Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not(`.sdf').
poses : int
Number of poses to read from `input_file`
settings : dict
Expand All @@ -839,13 +839,11 @@ def fingerprint_poseviewer(input_file, poses, settings):
"""

prot_bname = input_file.split('-to-')[-1]
prot_fname = re.sub('-docked.*\.sdf\.gz','_prot.pdb',prot_bname)
prot_fname = re.sub('-docked.*\.sdf(\.gz)?','_prot.pdb',prot_bname)
prot_file = f"structures/proteins/{prot_fname}"
# print(prot_file)

# print(input_file)
fps = []
with gzip.open(input_file) as fp:
with fileinput.hook_compressed(input_file,'rb') as fp:
mols = ForwardSDMolSupplier(fp, removeHs=False)
rdk_prot = MolFromPDBFile(prot_file,removeHs=False)
if rdk_prot is None:
Expand Down Expand Up @@ -882,7 +880,7 @@ def ifp(settings, input_file, output_file, poses):
settings : dict
Settings of the interaction fingerprint
input_file : str
Path to the gzipped SDF file containing all of the ligand poses
Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not (`.sdf').
output_file : str
Path to the output CSV file containing all the interactions
poses : int
Expand Down
7 changes: 4 additions & 3 deletions open_combind/open_combind.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,17 +384,18 @@ def extract_top_poses(scores, original_pvs):
Write top-scoring poses to a single file.
"""
from rdkit import Chem
import gzip
import fileinput

out = scores.replace('.csv', '.sdf.gz')
out = scores.replace('.csv', '.sdf').replace('.gz','')
scores = pd.read_csv(scores).set_index('ID')

writer = Chem.SDWriter(out)

counts = {}
written = []
for pv in original_pvs:
sts = Chem.ForwardSDMolSupplier(gzip.open(pv))
with fileinput.hook_compressed(pv,'rb') as f:
sts = Chem.ForwardSDMolSupplier(f)
for st in sts:
name = st.GetProp("_Name")
if name not in counts:
Expand Down
8 changes: 4 additions & 4 deletions open_combind/pymol/view_complexes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fileinput
from pymol import cmd
from rdkit import Chem
import gzip
import sys
import pandas as pd
from glob import glob
Expand Down Expand Up @@ -169,9 +169,9 @@ def show_interactions(ifp_file, interaction, protein, lig, ligand_file, pose, de
enable(lig, pose)

if lig_mol is None:
if isinstance(ligand_file,str) and ligand_file.endswith('.gz'):
ligand_file = gzip.open(ligand_file)
lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)]
if isinstance(ligand_file,str):
with fileinput.hook_compressed(ligand_file,'rb') as lf:
lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)]
assert len(lig_mols), "No ligands found in {}".format(ligand_file)
lig_mol = lig_mols[pose]

Expand Down
17 changes: 5 additions & 12 deletions open_combind/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from multiprocessing import Pool
import fileinput
import os
import numpy as np
from rdkit import Chem
Expand Down Expand Up @@ -75,12 +76,8 @@ def get_pose(pv, pose):
pose : :class:`~rdkit.Chem.rdchem.Mol`
The selected pose.
"""
if os.path.splitext(pv)[-1] == ".gz":
import gzip
pv = gzip.open(pv)
else:
pv = open(pv)
sts = Chem.ForwardSDMolSupplier(pv)
with fileinput.hook_compressed(pv,'rb') as pvf:
sts = Chem.ForwardSDMolSupplier(pv)
for i,st in enumerate(sts):
if i == pose:
break
Expand Down Expand Up @@ -159,10 +156,6 @@ def count_poses(pv):
num_poses : int
Number of poses in the file.
"""
if os.path.splitext(pv)[-1] == ".gz":
import gzip
pv = gzip.open(pv)
else:
pv = open(pv)
num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)]
with fileinput.hook_compressed(pv,'rb') as pvf:
num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)]
return sum(num_poses)

0 comments on commit 7b87b62

Please sign in to comment.