From 7b87b6209467bd54648a42d85b0635826d070fc5 Mon Sep 17 00:00:00 2001 From: Andrew McNutt <35881828+drewnutt@users.noreply.github.com> Date: Fri, 20 Oct 2023 15:07:54 -0400 Subject: [PATCH] Gunzip agnostic (#34) * handling opening with hook_compressed * swap everything to fileinput * bugfix --- open_combind/dock/ligand_handling.py | 24 +++++++----- open_combind/dock/ligprep.py | 9 ++--- open_combind/dock/postprocessing.py | 14 ++----- open_combind/features/features.py | 57 ++++++++++++++-------------- open_combind/features/ifp.py | 12 +++--- open_combind/open_combind.py | 7 ++-- open_combind/pymol/view_complexes.py | 8 ++-- open_combind/utils.py | 17 +++------ 8 files changed, 66 insertions(+), 82 deletions(-) diff --git a/open_combind/dock/ligand_handling.py b/open_combind/dock/ligand_handling.py index e281968..319e99d 100644 --- a/open_combind/dock/ligand_handling.py +++ b/open_combind/dock/ligand_handling.py @@ -1,11 +1,9 @@ import requests -# import re import urllib.parse from rdkit import Chem from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate from prody import writePDBStream -import gzip -# from bs4 import BeautifulSoup +import fileinput class RDKitParseException(Exception): @@ -211,14 +209,20 @@ def get_ligand_info_RCSB(pdb_id): return entry_ligands_info def mol_to_sdf(mol, path_to_sdf): - if path_to_sdf.endswith('.gz'): - with gzip.open(path_to_sdf,'wt') as f: - with Chem.SDWriter(f) as w: - w.write(mol) - else: - with Chem.SDWriter(path_to_sdf) as w: - w.write(mol) + """ + Writes an RDKit molecule to an SDF file. + Parameters + ---------- + mol : :class:`~rdkit.Chem.rdchem.Mol` + RDKit molecule to be written to an SDF file. + path_to_sdf : str + Path to the output SDF file. Can be gzipped. + + """ + with fileinput.hook_compressed(path_to_sdf,'wt') as f: + with Chem.SDWriter(f) as w: + w.write(mol) class DummyMolBlock(): def __init__(self): diff --git a/open_combind/dock/ligprep.py b/open_combind/dock/ligprep.py index 264ba75..8417793 100644 --- a/open_combind/dock/ligprep.py +++ b/open_combind/dock/ligprep.py @@ -1,6 +1,6 @@ import os import argparse -import gzip +import fileinput from rdkit.Chem import AllChem as Chem def construct_set_conformers(mol, *, num_confs, confgen, seed=-1): @@ -244,11 +244,8 @@ def ligsplit(big_sdf, root, multiplex=False, name_prop='BindingDB MonomerID', The number of conformers to write to the file. """ - if os.path.splitext(big_sdf)[-1] == ".gz": - big_sdf_data = gzip.open(big_sdf) - else: - big_sdf_data = open(big_sdf, 'rb') - ligands = Chem.ForwardSDMolSupplier(big_sdf_data) + with fileinput.hook_compressed(big_sdf_data, 'rb') as bsd: + ligands = Chem.ForwardSDMolSupplier(big_sdf_data) unfinished = [] for count, ligand in enumerate(ligands): het_id = ligand.GetProp('Ligand HET ID in PDB') diff --git a/open_combind/dock/postprocessing.py b/open_combind/dock/postprocessing.py index 63adac9..401e89a 100644 --- a/open_combind/dock/postprocessing.py +++ b/open_combind/dock/postprocessing.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 import argparse -import gzip +import fileinput import os from rdkit.Chem import ForwardSDMolSupplier, SDWriter from rdkit.Chem.rdMolAlign import CalcRMS @@ -33,11 +33,7 @@ def coalesce_poses(docked_files, sort_by='CNNscore', filter_RMSD=None, reverse=T if isinstance(docked_files, str): docked_files = [docked_files] for dock_file in docked_files: - if dock_file.endswith('.gz'): - openfile = gzip.open - else: - openfile = open - with openfile(dock_file,'rb') as gz: + with fileinput.hook_compressed(dock_file,'rb') as gz: mols += [m for m in ForwardSDMolSupplier(gz)] sorted_mols = sorted(mols, key=lambda x: float(x.GetProp(sort_by)),reverse=reverse) @@ -61,11 +57,7 @@ def write_poses(sorted_mols, out_file): Output file to write the poses to """ - if out_file.endswith('.gz'): - openfile=gzip.open - else: - openfile=open - with openfile(out_file,'wt') as gz: + with fileinput.hook_compressed(out_file,'wt') as gz: writer = SDWriter(gz) for sm in sorted_mols: writer.write(sm) diff --git a/open_combind/features/features.py b/open_combind/features/features.py index ad7f4f7..9f82042 100644 --- a/open_combind/features/features.py +++ b/open_combind/features/features.py @@ -1,5 +1,5 @@ +import fileinput import os -import gzip import numpy as np import pandas as pd from glob import glob @@ -114,22 +114,20 @@ def get_molecules_from_files(self, pvs, native=False, center_ligand=None): for pv in pvs: # mol_bundle = Chem.FixedMolSizeMolBundle() mol_bundle = [] - pv_open = pv - if pv.endswith('.gz'): - pv_open = gzip.open(pv) - mol_suppl = Chem.ForwardSDMolSupplier(pv_open) - mol_count = 0 - for mol in mol_suppl: - lig_centroid = ComputeCentroid(mol.GetConformer()) - displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center) - # print(distance) - if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5): - print(f"skipped for {pv}") - continue - mol_bundle.append(mol) - mol_count += 1 - if mol_count == self.max_poses: - break + with fileinput.hook_compressed(pv,'rb') as pv_open: + mol_suppl = Chem.ForwardSDMolSupplier(pv_open) + mol_count = 0 + for mol in mol_suppl: + lig_centroid = ComputeCentroid(mol.GetConformer()) + displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center) + # print(distance) + if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5): + print(f"skipped for {pv}") + continue + mol_bundle.append(mol) + mol_count += 1 + if mol_count == self.max_poses: + break if (native is False) and (mol_count != self.max_poses): print(f"Did not get {self.max_poses} poses for {pv}, only {len(mol_bundle)} poses") print(mol_count,pv) @@ -269,18 +267,19 @@ def load_single_features(self, pvs, ligands=None, center_ligand=None): _ifps = [_ifps.loc[_ifps.pose==p] for p in range(max(_ifps.pose)+1)] #Need to check for if ligand is centered here. - sts = Chem.ForwardSDMolSupplier(gzip.open(pv)) - _poses = [] - for st in sts: - if center_ligand is not None: - lig_centroid = ComputeCentroid(st.GetConformer()) - displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center) - # print(distance) - if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5: - print(f"skipped for {pv}") - continue - _poses.append(st) - print(len(poses)) + with fileinput.hook_compressed(pv, 'rb') as pv_open: + sts = Chem.ForwardSDMolSupplier(pv_open) + _poses = [] + for st in sts: + if center_ligand is not None: + lig_centroid = ComputeCentroid(st.GetConformer()) + displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center) + # print(distance) + if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5: + print(f"skipped for {pv}") + continue + _poses.append(st) + print(len(poses)) keep = [] for i in range(len(_names)): diff --git a/open_combind/features/ifp.py b/open_combind/features/ifp.py index d5f00bc..c116a11 100644 --- a/open_combind/features/ifp.py +++ b/open_combind/features/ifp.py @@ -2,6 +2,7 @@ Compute interaction fingerprints for poseviewer files. """ +import fileinput import tempfile import os import re @@ -10,7 +11,6 @@ import pandas as pd from rdkit.Chem import MolFromSmarts,ForwardSDMolSupplier,MolFromPDBFile, AddHs from rdkit.Chem.rdForceFieldHelpers import UFFGetMoleculeForceField, OptimizeMolecule -import gzip def resname(atom): """ @@ -825,7 +825,7 @@ def fingerprint_poseviewer(input_file, poses, settings): Parameters ---------- input_file : str - Path to the gzipped SDF file containing all of the ligand poses + Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not(`.sdf'). poses : int Number of poses to read from `input_file` settings : dict @@ -839,13 +839,11 @@ def fingerprint_poseviewer(input_file, poses, settings): """ prot_bname = input_file.split('-to-')[-1] - prot_fname = re.sub('-docked.*\.sdf\.gz','_prot.pdb',prot_bname) + prot_fname = re.sub('-docked.*\.sdf(\.gz)?','_prot.pdb',prot_bname) prot_file = f"structures/proteins/{prot_fname}" - # print(prot_file) - # print(input_file) fps = [] - with gzip.open(input_file) as fp: + with fileinput.hook_compressed(input_file,'rb') as fp: mols = ForwardSDMolSupplier(fp, removeHs=False) rdk_prot = MolFromPDBFile(prot_file,removeHs=False) if rdk_prot is None: @@ -882,7 +880,7 @@ def ifp(settings, input_file, output_file, poses): settings : dict Settings of the interaction fingerprint input_file : str - Path to the gzipped SDF file containing all of the ligand poses + Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not (`.sdf'). output_file : str Path to the output CSV file containing all the interactions poses : int diff --git a/open_combind/open_combind.py b/open_combind/open_combind.py index 5104321..e0a5d84 100755 --- a/open_combind/open_combind.py +++ b/open_combind/open_combind.py @@ -384,9 +384,9 @@ def extract_top_poses(scores, original_pvs): Write top-scoring poses to a single file. """ from rdkit import Chem - import gzip + import fileinput - out = scores.replace('.csv', '.sdf.gz') + out = scores.replace('.csv', '.sdf').replace('.gz','') scores = pd.read_csv(scores).set_index('ID') writer = Chem.SDWriter(out) @@ -394,7 +394,8 @@ def extract_top_poses(scores, original_pvs): counts = {} written = [] for pv in original_pvs: - sts = Chem.ForwardSDMolSupplier(gzip.open(pv)) + with fileinput.hook_compressed(pv,'rb') as f: + sts = Chem.ForwardSDMolSupplier(f) for st in sts: name = st.GetProp("_Name") if name not in counts: diff --git a/open_combind/pymol/view_complexes.py b/open_combind/pymol/view_complexes.py index e34ecbe..6f4833f 100644 --- a/open_combind/pymol/view_complexes.py +++ b/open_combind/pymol/view_complexes.py @@ -1,6 +1,6 @@ +import fileinput from pymol import cmd from rdkit import Chem -import gzip import sys import pandas as pd from glob import glob @@ -169,9 +169,9 @@ def show_interactions(ifp_file, interaction, protein, lig, ligand_file, pose, de enable(lig, pose) if lig_mol is None: - if isinstance(ligand_file,str) and ligand_file.endswith('.gz'): - ligand_file = gzip.open(ligand_file) - lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)] + if isinstance(ligand_file,str): + with fileinput.hook_compressed(ligand_file,'rb') as lf: + lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)] assert len(lig_mols), "No ligands found in {}".format(ligand_file) lig_mol = lig_mols[pose] diff --git a/open_combind/utils.py b/open_combind/utils.py index 68f461e..1a755a8 100644 --- a/open_combind/utils.py +++ b/open_combind/utils.py @@ -1,4 +1,5 @@ from multiprocessing import Pool +import fileinput import os import numpy as np from rdkit import Chem @@ -75,12 +76,8 @@ def get_pose(pv, pose): pose : :class:`~rdkit.Chem.rdchem.Mol` The selected pose. """ - if os.path.splitext(pv)[-1] == ".gz": - import gzip - pv = gzip.open(pv) - else: - pv = open(pv) - sts = Chem.ForwardSDMolSupplier(pv) + with fileinput.hook_compressed(pv,'rb') as pvf: + sts = Chem.ForwardSDMolSupplier(pv) for i,st in enumerate(sts): if i == pose: break @@ -159,10 +156,6 @@ def count_poses(pv): num_poses : int Number of poses in the file. """ - if os.path.splitext(pv)[-1] == ".gz": - import gzip - pv = gzip.open(pv) - else: - pv = open(pv) - num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)] + with fileinput.hook_compressed(pv,'rb') as pvf: + num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)] return sum(num_poses)