Open-ComBind Defaults, CLI, and Documentation (#37)

* fix docs issue * install theme first * updating docking defaults * docstring updates * remove unecessary hierarchy * add preprint to README * fix docs issue * install theme first * updating docking defaults * docstring updates * remove unecessary hierarchy * add preprint to README * bugfix * bugfix with IFP parallel * prettify and consistency * prettify and fix docs * making the CLI nice
drewnutt · Nov 7, 2023 · 6655cd7 · 6655cd7
1 parent 463d90c
commit 6655cd7
Show file tree

Hide file tree

Showing 10 changed files with 151 additions and 230 deletions.
diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@
 
 Open-source docking pipeline leveraging pairwise statistics
 
+Open-ComBind is currently a [preprint](https://doi.org/10.26434/chemrxiv-2023-xjl84)
 
 This is a fork of [ComBind](https://github.com/drorlab/combind) that removes all uses of proprietary API calls or proprietary software. This will produce similar (if not the exact same) results
 as the original ComBind and it will be completely free to use.

diff --git a/docs/api/index.rst b/docs/api/index.rst
@@ -7,9 +7,6 @@ Open-Combind is primarily used through its command line interface. It provides a
 
 .. toctree::
 
-Main Combind Commands
----------------------
-
 .. autofunction:: open_combind.structprep
 
 .. autofunction:: open_combind.ligprep

diff --git a/docs/requirements.yaml b/docs/requirements.yaml
@@ -17,3 +17,4 @@ dependencies:
     - ProDy>=2.0
     - click
     - plumbum
+    - sphinx_rtd_theme
diff --git a/open_combind/cli/cli.py b/open_combind/cli/cli.py
@@ -5,10 +5,12 @@
 import click
 import os
 import sys
+from argparse import Namespace
 from glob import glob
 
 from open_combind.utils import *
 import open_combind as oc
+from open_combind.dock.ligprep import main as _single_ligprep
 ###############################################################################
 
 # Defaults
@@ -33,30 +35,23 @@ def cli(ctx):
 
 
 @cli.command()
-@click.argument('struct', default='')
-@click.option('--templ-struct')
-def structprep(templ_struct='', struct='', raw_dir='structures/raw',
+@click.option('--align-struct', default='', help='Structure to use for alignment. Defaults to alphabetically lowest.')
+@click.option('--templ-struct', help='Structure to use as template for alignment. Defaults to alphabetically lowest.')
+def structprep(templ_struct='', align_struct='', raw_dir='structures/raw',
         align_dir='structures/aligned', processed_dir='structures/processed',
         template_dir='structures/dir', ligand_dir='structures/ligands',
         protein_dir='structures/proteins' ):
     """
     Prepare structures and make a docking template file.
 
-    "struct" specifies the name of the structure for which to make a docking
-    template file. (Not the full path, generally just the PDB code.) Defaults to the
-    structure with alphabetically lowest name.
-
-    "templ_struct" specifies the name of the structure to use as a template for
-    docking. Defaults to the structure with alphabetically lowest name.
-
     raw_dir, align_dir, processed_dir, template_dir, ligand_dir, and protein_dir
     specify the directories where the raw structures, aligned structures,
     processed structures, docking templates, ligands, and proteins will be
-    written, respectively. Defaults to "structures/raw", "structures/aligned",
-    "structures/processed", "structures/templates", "structures/ligands", and
-    "structures/proteins", respectively.
+    written, respectively. 
+    Defaults to "structures/raw", "structures/aligned", "structures/processed", 
+    "structures/templates", "structures/ligands", and "structures/proteins", respectively.
 
-    The following directory structure is required:
+    The following directory structure is recommended:
 
             <raw_dir>/
                 structure_name.pdb
@@ -80,125 +75,148 @@ def structprep(templ_struct='', struct='', raw_dir='structures/raw',
 
     The process can be started from any step, e.g. if you have processed
     versions of your structures, you can place these in the processed directory.
-
-    Files ending with _lig contain only the small molecule ligand present in the
-    structure, and files ending with _prot contain everything else.
     """
-    oc.structprep(templ_struct=templ_struct, struct=struct, raw_dir=raw_dir,
+    oc.structprep(templ_struct=templ_struct, struct=align_struct, raw_dir=raw_dir,
             align_dir=align_dir, processed_dir=processed_dir,
             template_dir=template_dir, ligand_dir=ligand_dir,
             protein_dir=protein_dir)
 
 
 @cli.command()
-@click.argument('smiles')
-@click.option('--root', default='ligands')
-@click.option('--multiplex', is_flag=True)
-@click.option('--sdffile', is_flag=True)
-@click.option('--ligand-names', default='ID')
-@click.option('--ligand-smiles', default='SMILES')
-@click.option('--delim', default=',')
-@click.option('--num-out-confs', default=10)
-@click.option('--num-confs', default=50)
-@click.option('--confgen', default='etkdg_v2')
-@click.option('--ff', default="UFF")
-@click.option('--max-iterations', default=1000)
-@click.option('--seed', default=-1)
-@click.option('--processes', default=1)
+@click.argument('smiles', required=True)
+@click.option('--root', default='ligands',
+        help='Root directory for output of ligprep', type=click.Path(), show_default=True)
+@click.option('--multiplex', is_flag=True, help='Write all ligands to one file.')
+@click.option('--sdffile', is_flag=True,
+        help='`smiles` is a SDF file containing all the ligands')
+@click.option('--ligand-names', default='ID', help='Column name for ligand names',
+        show_default=True)
+@click.option('--ligand-smiles', default='SMILES', help='Column name for ligand smiles',
+        show_default=True)
+@click.option('--delim', default=',', help='Delimiter for smiles file', show_default=True)
+@click.option('--num-out-confs', default=10, help='Number of output conformers', show_default=True)
+@click.option('--num-confs', default=50,
+        help='Number of conformers to generate and query energy', show_default=True)
+@click.option('--confgen', default='etkdg_v2', help='Conformer generation method',
+        show_default=True)
+@click.option('--ff', default="UFF",
+        help='Force field for minimization and energy computation', show_default=True)
+@click.option('--max-iterations', default=1000,
+        help='Maximum number of iterations for minimization', show_default=True)
+@click.option('--seed', default=-1, help='Random seed for conformer generation')
+@click.option('--processes', default=1 , help='Number of processes to use', show_default=True)
 def ligprep(smiles, root, multiplex, ligand_names, ligand_smiles,
             delim, processes, sdffile, num_out_confs, num_confs, confgen,
             max_iterations, seed, ff):
     """
-    Prepare ligands for docking, from smiles or sdf.
+    Prepare ligands in `smiles` for docking, from SMILES or SDF.
 
-    Specifically, this will utilize RDKit to generate a 3D conformer for each of the ligands 
-
-    "smiles" should be a `delim` delimited file with columns "ligand-names"
-    and "ligand-smiles". Alternatively, "smiles" can be a SDF file with multiple
-    ligands as different entries, but you must specify "--sdffile".
-
-    "root" specifies where the processed ligands will be written.
-
-    By default, an individual file will be made for each ligand. If multiplex is
-    set, then only one file, containing all the ligands, will be produced.
-
-    Multiprocessing is only supported for non-multiplexed mode.
+    Specifically, this will utilize RDKit to generate a set of 3D conformers for each of the ligands 
     """
     oc.ligprep(smiles, root=root, multiplex=multiplex, ligand_names=ligand_names,
             ligand_smiles=ligand_smiles, delim=delim, sdffile=sdffile,
             num_out_confs=num_out_confs, num_confs=num_confs, confgen=confgen,
             max_iterations=max_iterations, ff=ff, processes=processes, seed=seed)
 
-
 @cli.command()
-@click.argument('ligands', nargs=-1)
-@click.option('--root', default='docking')
-@click.option('--template')
-@click.option('--screen', is_flag=True)
-@click.option('--slurm', is_flag=True)
-@click.option('--now', is_flag=True)
-@click.option('--dock-file')
-def dock_ligands(template, root, ligands, screen, slurm, now, dock_file):
+@click.argument('input-file', required=True)
+@click.argument('output-file', required=True)
+@click.option('--num-out-confs', default=10, help='Number of output conformers', show_default=True)
+@click.option('--num-confs', default=50,
+        help='Number of conformers to generate and query energy', show_default=True)
+@click.option('--confgen', default='etkdg_v2', help='Conformer generation method',
+        show_default=True)
+@click.option('--ff', default="UFF",
+        help='Force field for minimization and energy computation', show_default=True)
+@click.option('--max-iterations', default=1000,
+        help='Maximum number of iterations for minimization', show_default=True)
+@click.option('--seed', default=-1, help='Random seed for conformer generation')
+def singleligprep(input_file, output_file, num_out_confs, num_confs, confgen,
+            max_iterations, seed, ff):
+    """
+    Generate conformations for a single `input-file`, provided as a SMILES or SDF (detected via file extension). Output as an SDF to `output-file`
     """
-    Dock "ligands" to "grid".
-
-    "root" specifies where the docking results will be written.
 
-    Setting "screen" limits the thoroughness of the pose sampling. Recommended
-    for screening, but not pose prediction.
+    args = Namespace(input_file=input_file, output_file=output_file,
+            num_out_confs=num_out_confs, num_confs=num_confs, confgen=confgen,
+            maxIters=max_iterations, ff=ff, seed=seed)
+    _single_ligprep(args)
 
-    "ligands" are paths to prepared ligand files. Multiple can be specified.
+@cli.command()
+@click.argument('ligands', nargs=-1, required=True)
+@click.option('--root', default='docking',
+        help='Root directory for output of docking', type=click.Path(), show_default=True)
+@click.option('--template',
+        help='Template file for docking, defaults to the alphabetically first file in `structures/templates/*.template`',
+        type=click.Path())
+@click.option('--screen', is_flag=True,
+        help='Screening mode, for faster docking (i.e. less sampling).')
+@click.option('--slurm', is_flag=True,
+        help='If running on a cluster, outputs tarball of docking inputs and makes dockfile use paths that tarball has.')
+@click.option('--now', is_flag=True, help='Run docking now, instead of just writing dockfile.')
+@click.option('--processes', default=1, help='Number of processes to use for docking.', show_default=True, type=int)
+@click.option('--dock-file',
+        help='Format string for dockfile. Defaults to "-l {lig} -o {out} --exhaustiveness {exh} --num_modes 30 --min_rmsd_filter 0.01 > {log}", should include `{lig}` and `{out}` at a minimum.')
+def dock_ligands(template, root, ligands, screen, slurm, now, dock_file, processes):
+    """
+    Dock "ligands" using template.
 
-    "dock_file" is a format string that will be used with all of the ligands to 
-    create a docking file. The default looks like:
-     "-l {lig} -o {out} --exhaustiveness {exh} --num_modes 200 > {log} \n"
     """
-    oc.dock_ligands(ligands, template=template, dock_file=dock_file, root=root, screen=screen, slurm=slurm, now=now)
+    oc.dock_ligands(ligands, template=template, dock_file=dock_file, root=root, screen=screen, slurm=slurm, now=now, processes=processes)
 
 ################################################################################
 
 
 @cli.command()
-@click.argument('root')
-@click.argument('poseviewers', nargs=-1)
-@click.option('--native', default='structures/ligands/*_lig.sdf')
-@click.option('--ifp-version', default=IFP_VERSION)
-@click.option('--mcss-param', default=MCSS_PARAM)
-@click.option('--screen', is_flag=True)
-@click.option('--max-poses', default=100)
-@click.option('--no-mcss', is_flag=True)
-@click.option('--no-cnn', is_flag=True)
-@click.option('--use-shape', is_flag=True)
-@click.option('--processes', default=1)
-@click.option('--template', default='structures/template/*.template')
-@click.option('--check-center-ligs', is_flag=True)
-@click.option('--newscore', default=None)
-@click.option('--no-reverse', is_flag=False)
+@click.argument('root', required=True)
+@click.argument('poseviewers', nargs=-1, required=True)
+@click.option('--native', default='structures/ligands/*_lig.sdf', help='Native ligand files')
+@click.option('--ifp-version', default=IFP_VERSION, help='IFP version to use')
+@click.option('--mcss-param', default=MCSS_PARAM, help='MCSS parameter setup to use')
+@click.option('--max-poses', default=100,
+        help='Maximum number of poses to featurize per ligand', show_default=True)
+@click.option('--no-mcss', is_flag=True, help='Do not use RMSD of MCSS as a feature')
+@click.option('--no-cnn', is_flag=True, help='Do not extract CNN scores from poses')
+@click.option('--use-shape', is_flag=True, help='Use shape comparison as a feature')
+@click.option('--processes', default=1, help='Number of processes to use', show_default=True)
+@click.option('--template', default='structures/template/*.template',
+        help='Template files used for docking')
+@click.option('--check-center-ligs', is_flag=True,
+        help='Check that the center of the ligands poses are near the center of the native ligand')
+@click.option('--newscore', default=None,
+        help='If using a score other than Vina score, specify the name of the score here')
+@click.option('--no-reverse', is_flag=False,
+        help='Sort the poses from lowest (first) to highest score')
 def featurize(root, poseviewers, native, ifp_version,
-            mcss_param, screen, no_mcss, use_shape, processes, max_poses,
+            mcss_param, no_mcss, use_shape, processes, max_poses,
             no_cnn, template, check_center_ligs, newscore, no_reverse):
+    """
+    Featurize docking poses.
+
+    """
 
     oc.featurize(root, poseviewers, native=native, no_mcss=no_mcss, use_shape=use_shape,
-                max_poses=max_poses, no_cnn=no_cnn, screen=screen, ifp_version=ifp_version,
+                max_poses=max_poses, no_cnn=no_cnn, ifp_version=ifp_version,
                 mcss_param=mcss_param, processes=processes, template=template,
                 check_center_ligs=check_center_ligs, newscore=newscore, reverse=not no_reverse)
 ################################################################################
 
 
 @cli.command()
-@click.argument('root')
-@click.argument('out', default="poses.csv")
+@click.argument('root', required=True)
 @click.argument('ligands', nargs=-1)
-@click.option('--features', default='mcss,hbond,saltbridge,contact')
-@click.option('--alpha', default=1.0)
-@click.option('--stats-root', default=None)
-@click.option('--restart', default=500)
-@click.option('--max-iterations', default=1000)
-@click.option('--newscore', default=None)
+@click.option('--out', default="poses.csv", help='Output file', show_default=True)
+@click.option('--features', default='mcss,hbond,saltbridge,contact',
+        help='Features to use for pose prediction', show_default=True)
+@click.option('--alpha', default=-1.0, help='Weight of the docking score', show_default=True)
+@click.option('--stats-root', default=None, help='Root directory of the statistics files, defaults to Open-ComBind default statistics')
+@click.option('--restart', default=500, help='Number of likelihood minimization runs to run from random initialization', show_default=True)
+@click.option('--max-iterations', default=1000, help='Maximum number of iterations for the likelihood minimization for each run', show_default=True)
+@click.option('--newscore', default=None, help='If using a score other than Vina score, specify the name of the score here')
 def pose_prediction(root, out, ligands, alpha, stats_root, features, restart,
                     max_iterations, newscore):
     """
-    Run ComBind pose prediction.
+    Run ComBind pose prediction. Using the features and ligands in `root`, unless `ligands` are specified.
     """
     features = features.split(',')
 

diff --git a/open_combind/dock/dock.py b/open_combind/dock/dock.py
@@ -2,7 +2,7 @@
 import subprocess
 from glob import glob
 
-GNINA = ' -l {lig} -o {out} --exhaustiveness {exh} --num_modes 200 --min_rmsd_filter 0 > {log}'  #: default command for GNINA docking
+GNINA = ' -l {lig} -o {out} --exhaustiveness {exh} --num_modes 30 --min_rmsd_filter 0.01 > {log}'  #: default command for GNINA docking
 
 def docking_failed(gnina_log):
     if not os.path.exists(gnina_log):
@@ -38,8 +38,8 @@ def check_dock_line(infile):
     assert '{log}' in infile, "need to have {log} in your docking line to specify logfile"
     if '{exh}' not in infile:
         print('Warning: your docking line does not contain {exh}\n\
-                Docking will use either your specified exhaustiveness \
-                (if specified) or the default GNINA exchaustiveness of 8')
+                \t\tDocking will use either your specified exhaustiveness\n \
+                \t\t(if specified) or the default GNINA exhaustiveness of 8')
 
     return infile
 
@@ -79,7 +79,7 @@ def dock(template, ligands, root, name, enhanced, infile=None, slurm=False, now=
         infile = check_dock_line(infile)
     exh = 8
     if enhanced:
-        exh = 16
+        exh *= 2
     dock_template = open(template).readlines()[0].strip('\n')
     recname = os.path.splitext(os.path.split(dock_template.split('-r')[-1].strip().split(' ')[0])[1])[0]
     # aboxname = os.path.splitext(os.path.split(dock_template.split('--autobox_ligand')[-1].strip().split(' ')[0])[1])[0]

diff --git a/open_combind/dock/ligand_handling.py b/open_combind/dock/ligand_handling.py
@@ -1,9 +1,9 @@
+import fileinput
 import requests
 import urllib.parse
 from rdkit import Chem
 from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate
 from prody import writePDBStream
-import fileinput
 
 
 class RDKitParseException(Exception):
@@ -17,7 +17,7 @@ def __init__(self, message):
 def get_ligands_from_RCSB(pdbid, lig_code=None, specific_chain=None,
                         save_file="{pdbid}_{chem_name}_{chain_id}_{seq_id}.sdf", first_only=False):
     """
-    Given a PDBFile (or a PDB Header file), download the ligands present in the PDB file directly
+    Given a PDB ID, download the ligands present in the PDB file directly
     from the RCSB as a SDF file
 
     Parameters
@@ -110,7 +110,7 @@ def ligand_selection_to_mol(ligand_selection, query_ligand, outfile=None):
 
 def get_ligand_from_SMILES(lig_id):
     """
-    Using the RCSB API, pulls the SMILES string containin stereochemical features for the given ligand ID.
+    Using the RCSB API, pulls the SMILES string containing stereochemical features for the given ligand ID.
 
     If a ligand has a dedicated page at ``https://www.rcsb.org/ligand/{ligand_id}`` then it likely has a SMILES string
     (does not need to be a 2 or 3 letter identifier)
@@ -141,38 +141,6 @@ def get_ligand_from_SMILES(lig_id):
 
     return mol
 
-
-
-# def scrape_rcsb_webpage(pdb_id):
-#     rec_page_url = "https://www.rcsb.org/structure/{receptor}"
-#     receptor_page = requests.get(rec_page_url.format(receptor=pdb_id))
-#     soup = BeautifulSoup(receptor_page.text, 'html.parser')
-#     ligand_rows = soup.find_all('tr', id=re.compile('ligand_row_\w{1,3}'))
-#     ligand_chain_info = dict()
-#     for lig_row in ligand_rows:
-#         lig_cols = lig_row.find_all('td')
-#         lig_name = lig_cols[0].a.text
-#         chain_info = dict()
-#         for link_tag in lig_cols[0].find_all('ul', class_='dropdown-menu')[0].find_all('a'):
-#             url = link_tag.get('href')
-#             if 'encoding=sdf' not in url:
-#                 continue
-#             text = link_tag.text.split(',')[-1]
-#             chain = text.split()[1]  # first word is always "chain"
-#             auth_chain = re.findall('\[auth ([A-Z])\]', text)
-#             if len(auth_chain) == 1:
-#                 auth_chain = auth_chain[0]
-#             elif len(auth_chain) > 1:
-#                 raise IndexError(f"more than one auth chain for {text} in {pdb_id}")
-#             else:
-#                 auth_chain = chain
-#             chain_info[chain] = {"auth_chain": auth_chain, "url": url}
-
-#         ligand_chain_info[lig_name] = chain_info
-
-#     return ligand_chain_info
-
-
 def get_ligand_info_RCSB(pdb_id):
     """
     Downloads the information for all of the nonpolymer entities in the given `pdb_id` from RCSB.