From 7b87b6209467bd54648a42d85b0635826d070fc5 Mon Sep 17 00:00:00 2001
From: Andrew McNutt <35881828+drewnutt@users.noreply.github.com>
Date: Fri, 20 Oct 2023 15:07:54 -0400
Subject: [PATCH] Gunzip agnostic (#34)

* handling opening with hook_compressed

* swap everything to fileinput

* bugfix
---
 open_combind/dock/ligand_handling.py | 24 +++++++-----
 open_combind/dock/ligprep.py         |  9 ++---
 open_combind/dock/postprocessing.py  | 14 ++-----
 open_combind/features/features.py    | 57 ++++++++++++++--------------
 open_combind/features/ifp.py         | 12 +++---
 open_combind/open_combind.py         |  7 ++--
 open_combind/pymol/view_complexes.py |  8 ++--
 open_combind/utils.py                | 17 +++------
 8 files changed, 66 insertions(+), 82 deletions(-)

diff --git a/open_combind/dock/ligand_handling.py b/open_combind/dock/ligand_handling.py
index e281968..319e99d 100644
--- a/open_combind/dock/ligand_handling.py
+++ b/open_combind/dock/ligand_handling.py
@@ -1,11 +1,9 @@
 import requests
-# import re
 import urllib.parse
 from rdkit import Chem
 from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate
 from prody import writePDBStream
-import gzip
-# from bs4 import BeautifulSoup
+import fileinput
 
 
 class RDKitParseException(Exception):
@@ -211,14 +209,20 @@ def get_ligand_info_RCSB(pdb_id):
     return entry_ligands_info
 
 def mol_to_sdf(mol, path_to_sdf):
-    if path_to_sdf.endswith('.gz'):
-        with gzip.open(path_to_sdf,'wt') as f:
-            with Chem.SDWriter(f) as w:
-                w.write(mol)
-    else:
-        with Chem.SDWriter(path_to_sdf) as w:
-                w.write(mol)
+    """
+    Writes an RDKit molecule to an SDF file.
 
+    Parameters
+    ----------
+    mol : :class:`~rdkit.Chem.rdchem.Mol`
+        RDKit molecule to be written to an SDF file.
+    path_to_sdf : str
+        Path to the output SDF file. Can be gzipped.
+
+    """
+    with fileinput.hook_compressed(path_to_sdf,'wt') as f:
+        with Chem.SDWriter(f) as w:
+            w.write(mol)
 
 class DummyMolBlock():
     def __init__(self):
diff --git a/open_combind/dock/ligprep.py b/open_combind/dock/ligprep.py
index 264ba75..8417793 100644
--- a/open_combind/dock/ligprep.py
+++ b/open_combind/dock/ligprep.py
@@ -1,6 +1,6 @@
 import os
 import argparse
-import gzip
+import fileinput
 from rdkit.Chem import AllChem as Chem
 
 def construct_set_conformers(mol, *, num_confs, confgen, seed=-1):
@@ -244,11 +244,8 @@ def ligsplit(big_sdf, root, multiplex=False, name_prop='BindingDB MonomerID',
         The number of conformers to write to the file.
     """
 
-    if os.path.splitext(big_sdf)[-1] == ".gz":
-        big_sdf_data = gzip.open(big_sdf)
-    else:
-        big_sdf_data = open(big_sdf, 'rb')
-    ligands = Chem.ForwardSDMolSupplier(big_sdf_data)
+    with fileinput.hook_compressed(big_sdf_data, 'rb') as bsd:
+        ligands = Chem.ForwardSDMolSupplier(big_sdf_data)
     unfinished = []
     for count, ligand in enumerate(ligands):
         het_id = ligand.GetProp('Ligand HET ID in PDB')
diff --git a/open_combind/dock/postprocessing.py b/open_combind/dock/postprocessing.py
index 63adac9..401e89a 100644
--- a/open_combind/dock/postprocessing.py
+++ b/open_combind/dock/postprocessing.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 
 import argparse
-import gzip
+import fileinput
 import os
 from rdkit.Chem import ForwardSDMolSupplier, SDWriter
 from rdkit.Chem.rdMolAlign import CalcRMS
@@ -33,11 +33,7 @@ def coalesce_poses(docked_files, sort_by='CNNscore', filter_RMSD=None, reverse=T
     if isinstance(docked_files, str):
         docked_files = [docked_files]
     for dock_file in docked_files:
-        if dock_file.endswith('.gz'):
-            openfile = gzip.open
-        else:
-            openfile = open
-        with openfile(dock_file,'rb') as gz:
+        with fileinput.hook_compressed(dock_file,'rb') as gz:
             mols += [m for m in ForwardSDMolSupplier(gz)]
     sorted_mols = sorted(mols, key=lambda x: float(x.GetProp(sort_by)),reverse=reverse)
 
@@ -61,11 +57,7 @@ def write_poses(sorted_mols, out_file):
         Output file to write the poses to
     """
 
-    if out_file.endswith('.gz'):
-        openfile=gzip.open
-    else:
-        openfile=open
-    with openfile(out_file,'wt') as gz:
+    with fileinput.hook_compressed(out_file,'wt') as gz:
         writer = SDWriter(gz)
         for sm in sorted_mols:
             writer.write(sm)
diff --git a/open_combind/features/features.py b/open_combind/features/features.py
index ad7f4f7..9f82042 100644
--- a/open_combind/features/features.py
+++ b/open_combind/features/features.py
@@ -1,5 +1,5 @@
+import fileinput
 import os
-import gzip
 import numpy as np
 import pandas as pd
 from glob import glob
@@ -114,22 +114,20 @@ def get_molecules_from_files(self, pvs, native=False, center_ligand=None):
         for pv in pvs:
             # mol_bundle = Chem.FixedMolSizeMolBundle()
             mol_bundle = []
-            pv_open = pv
-            if pv.endswith('.gz'):
-                pv_open = gzip.open(pv)
-            mol_suppl = Chem.ForwardSDMolSupplier(pv_open)
-            mol_count = 0
-            for mol in mol_suppl:
-                lig_centroid = ComputeCentroid(mol.GetConformer())
-                displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
-                # print(distance)
-                if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5):
-                    print(f"skipped for {pv}")
-                    continue
-                mol_bundle.append(mol)
-                mol_count += 1
-                if mol_count == self.max_poses:
-                    break
+            with fileinput.hook_compressed(pv,'rb') as pv_open:
+                mol_suppl = Chem.ForwardSDMolSupplier(pv_open)
+                mol_count = 0
+                for mol in mol_suppl:
+                    lig_centroid = ComputeCentroid(mol.GetConformer())
+                    displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
+                    # print(distance)
+                    if center_ligand is not None and (np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5):
+                        print(f"skipped for {pv}")
+                        continue
+                    mol_bundle.append(mol)
+                    mol_count += 1
+                    if mol_count == self.max_poses:
+                        break
             if (native is False) and (mol_count != self.max_poses):
                 print(f"Did not get {self.max_poses} poses for {pv}, only {len(mol_bundle)} poses")
             print(mol_count,pv)
@@ -269,18 +267,19 @@ def load_single_features(self, pvs, ligands=None, center_ligand=None):
             _ifps = [_ifps.loc[_ifps.pose==p] for p in range(max(_ifps.pose)+1)]
 
             #Need to check for if ligand is centered here.
-            sts = Chem.ForwardSDMolSupplier(gzip.open(pv))
-            _poses = []
-            for st in sts:
-                if center_ligand is not None:
-                    lig_centroid = ComputeCentroid(st.GetConformer())
-                    displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
-                    # print(distance)
-                    if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5:
-                        print(f"skipped for {pv}")
-                        continue
-                _poses.append(st)
-            print(len(poses))
+            with fileinput.hook_compressed(pv, 'rb') as pv_open:
+                sts = Chem.ForwardSDMolSupplier(pv_open)
+                _poses = []
+                for st in sts:
+                    if center_ligand is not None:
+                        lig_centroid = ComputeCentroid(st.GetConformer())
+                        displacement = lig_centroid.DirectionVector(center) * lig_centroid.Distance(center)
+                        # print(distance)
+                        if np.abs(displacement.x) > 7.5 or np.abs(displacement.y) > 7.5 or np.abs(displacement.z) > 7.5:
+                            print(f"skipped for {pv}")
+                            continue
+                    _poses.append(st)
+                print(len(poses))
 
             keep = []
             for i in range(len(_names)):
diff --git a/open_combind/features/ifp.py b/open_combind/features/ifp.py
index d5f00bc..c116a11 100644
--- a/open_combind/features/ifp.py
+++ b/open_combind/features/ifp.py
@@ -2,6 +2,7 @@
 Compute interaction fingerprints for poseviewer files.
 """
 
+import fileinput
 import tempfile
 import os
 import re
@@ -10,7 +11,6 @@
 import pandas as pd
 from rdkit.Chem import MolFromSmarts,ForwardSDMolSupplier,MolFromPDBFile, AddHs
 from rdkit.Chem.rdForceFieldHelpers import UFFGetMoleculeForceField, OptimizeMolecule
-import gzip
 
 def resname(atom):
     """
@@ -825,7 +825,7 @@ def fingerprint_poseviewer(input_file, poses, settings):
     Parameters
     ----------
     input_file : str
-        Path to the gzipped SDF file containing all of the ligand poses
+        Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not(`.sdf').
     poses : int
         Number of poses to read from `input_file`
     settings : dict
@@ -839,13 +839,11 @@ def fingerprint_poseviewer(input_file, poses, settings):
     """
     
     prot_bname = input_file.split('-to-')[-1]
-    prot_fname = re.sub('-docked.*\.sdf\.gz','_prot.pdb',prot_bname)
+    prot_fname = re.sub('-docked.*\.sdf(\.gz)?','_prot.pdb',prot_bname)
     prot_file = f"structures/proteins/{prot_fname}"
-    # print(prot_file)
 
-    # print(input_file)
     fps = []
-    with gzip.open(input_file) as fp:
+    with fileinput.hook_compressed(input_file,'rb') as fp:
         mols = ForwardSDMolSupplier(fp, removeHs=False)
         rdk_prot = MolFromPDBFile(prot_file,removeHs=False)
         if rdk_prot is None:
@@ -882,7 +880,7 @@ def ifp(settings, input_file, output_file, poses):
     settings : dict
         Settings of the interaction fingerprint
     input_file : str
-        Path to the gzipped SDF file containing all of the ligand poses
+        Path to the SDF file containing all of the ligand poses. Can be gzipped (`.sdf.gz') or not (`.sdf').
     output_file : str
         Path to the output CSV file containing all the interactions 
     poses : int
diff --git a/open_combind/open_combind.py b/open_combind/open_combind.py
index 5104321..e0a5d84 100755
--- a/open_combind/open_combind.py
+++ b/open_combind/open_combind.py
@@ -384,9 +384,9 @@ def extract_top_poses(scores, original_pvs):
     Write top-scoring poses to a single file.
     """
     from rdkit import Chem
-    import gzip
+    import fileinput
 
-    out = scores.replace('.csv', '.sdf.gz')
+    out = scores.replace('.csv', '.sdf').replace('.gz','')
     scores = pd.read_csv(scores).set_index('ID')
 
     writer = Chem.SDWriter(out)
@@ -394,7 +394,8 @@ def extract_top_poses(scores, original_pvs):
     counts = {}
     written = []
     for pv in original_pvs:
-        sts = Chem.ForwardSDMolSupplier(gzip.open(pv))
+        with fileinput.hook_compressed(pv,'rb') as f:
+            sts = Chem.ForwardSDMolSupplier(f)
         for st in sts:
             name = st.GetProp("_Name")
             if name not in counts:
diff --git a/open_combind/pymol/view_complexes.py b/open_combind/pymol/view_complexes.py
index e34ecbe..6f4833f 100644
--- a/open_combind/pymol/view_complexes.py
+++ b/open_combind/pymol/view_complexes.py
@@ -1,6 +1,6 @@
+import fileinput
 from pymol import cmd
 from rdkit import Chem
-import gzip
 import sys
 import pandas as pd
 from glob import glob
@@ -169,9 +169,9 @@ def show_interactions(ifp_file, interaction, protein, lig, ligand_file, pose, de
     enable(lig, pose)
 
     if lig_mol is None:
-        if isinstance(ligand_file,str) and ligand_file.endswith('.gz'):
-            ligand_file = gzip.open(ligand_file)
-        lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)]
+        if isinstance(ligand_file,str):
+            with fileinput.hook_compressed(ligand_file,'rb') as lf:
+                lig_mols = [mol for mol in Chem.ForwardSDMolSupplier(ligand_file)]
         assert len(lig_mols), "No ligands found in {}".format(ligand_file)
         lig_mol = lig_mols[pose]
 
diff --git a/open_combind/utils.py b/open_combind/utils.py
index 68f461e..1a755a8 100644
--- a/open_combind/utils.py
+++ b/open_combind/utils.py
@@ -1,4 +1,5 @@
 from multiprocessing import Pool
+import fileinput
 import os
 import numpy as np
 from rdkit import Chem
@@ -75,12 +76,8 @@ def get_pose(pv, pose):
     pose : :class:`~rdkit.Chem.rdchem.Mol`
         The selected pose.
     """
-    if os.path.splitext(pv)[-1] == ".gz":
-        import gzip
-        pv = gzip.open(pv)
-    else:
-        pv = open(pv)
-    sts = Chem.ForwardSDMolSupplier(pv)
+    with fileinput.hook_compressed(pv,'rb') as pvf:
+        sts = Chem.ForwardSDMolSupplier(pv)
     for i,st in enumerate(sts):
         if i == pose:
             break
@@ -159,10 +156,6 @@ def count_poses(pv):
     num_poses : int
         Number of poses in the file.
     """
-    if os.path.splitext(pv)[-1] == ".gz":
-        import gzip
-        pv = gzip.open(pv)
-    else:
-        pv = open(pv)
-    num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)]
+    with fileinput.hook_compressed(pv,'rb') as pvf:
+        num_poses = [1 for i in Chem.ForwardSDMolSupplier(pv)]
     return sum(num_poses)