diff --git a/rdtools/compare.py b/rdtools/compare.py index bfae4ea9..da17b886 100644 --- a/rdtools/compare.py +++ b/rdtools/compare.py @@ -2,6 +2,7 @@ import traceback import numpy as np +from collections import defaultdict from rdkit import Chem from rdkit.Chem.rdMolDescriptors import CalcMolFormula @@ -282,3 +283,70 @@ def is_same_connectivity_conf( else: return is_same_connectivity_mol(mol, new_mol) + + +def is_symmetric_to_substructure(mol : Chem.Mol, substructure: Chem.Mol) -> bool: + ''' + Check whether a mol is symmetric to a provided substructure. + + Args: + mol1 (RWMol): The molecule to check. + substructure (RWMol): A molecule representing the SMARTS substructure to check. + + Returns: + bool: Whether the molecule is symmetric w.r.t. the substructure. + ''' + matches = mol.GetSubstructMatches(substructure) + + classes = find_symmetry_classes(mol) + + if len(matches) == 0: # Substructure isn't in molecule. + return False + elif len(matches) == 1: # Molecule has only one match and is therefore symmetric w.r.t. substructure + return True + + # Assumes that 'matches' contains sets of equal size. + length_matches = len(matches[0]) + num_matches = len(matches) + for match in matches: + assert len(match) == length_matches + + # There is a match if all of the nth elements of each list in the matches is in the classes set. + for cla in classes: # Example: classes = {(2, 4), (1,3), (0,5)}; cla = (2, 4) + + # Loop through the matches. + for j in range(length_matches): # Example: 0, 1 (length_matches = 2, the substructure is 2 atoms long) + + match_index = 0 + for i in range(num_matches): # Example: 0, 1 (num_matches = 2, we have 2 substructure matches) + # Logic here is that matches[i][j] should be in the cla set for all i. + if matches[i][j] in cla: + match_index += 1 + + # 2 possibilities: + if match_index == num_matches: # symmetric: all symmetry classes match all substructure matches at the same ID + pass + elif match_index == 0: # nothing matches, but other iterations of i, j, and cla might match + pass + else: # asymmetric, matches are out of order + return False + + return True + +def find_symmetry_classes(mol : Chem.Mol) -> set: + ''' + Find set of symmetry classes for a given mol. + Adapted from code by Greg Landrum, 2011: + https://sourceforge.net/p/rdkit/mailman/rdkit-discuss/thread/CAD4fdRSofifYy_GFeZfxoHd_z4Y=4tVNR3UuBTea3sr81e8UMQ@mail.gmail.com/ + + Args: + mol: Molecule to examine symmetry classes. + ''' + + equivs = defaultdict(set) + matches = mol.GetSubstructMatches(mol,uniquify=False) + for match in matches: + for idx1,idx2 in enumerate(match): equivs[idx1].add(idx2) + classes = set() + for s in equivs.values(): classes.add(tuple(s)) + return classes diff --git a/rdtools/mol.py b/rdtools/mol.py index 797c411e..8902f07a 100644 --- a/rdtools/mol.py +++ b/rdtools/mol.py @@ -4,9 +4,11 @@ from typing import List, Optional, Union, Sequence import numpy as np +import warnings from rdkit import Chem from rdkit.Chem import Descriptors +from rdkit.Chem.MolStandardize.rdMolStandardize import ChargeParent from rdkit.Geometry.rdGeometry import Point3D from rdtools.atommap import has_atom_map_numbers @@ -18,7 +20,7 @@ set_conformer_coordinates, ) from rdtools.conversion.xyz import xyz_to_coords - +from rdtools.conversion.smiles import mol_from_smiles, mol_to_smiles def get_spin_multiplicity(mol: Chem.Mol) -> int: """ @@ -224,6 +226,127 @@ def fast_sanitize(mol: Chem.RWMol): ) +def is_implicit(mol : Chem.RWMol): + """ + Infer whether a molecule has implicit hydrogens. + """ + + for atom in mol.GetAtoms(): + if atom.GetNumImplicitHs() > 0: + return True + + return False + + +def uncharge_mol(mol : Chem.RWMol, + method : str = "all"): + """ + Uncharges a molecule, adding or removing hydrogens wherever necessary. + + Args: + mol (Chem.Mol): The molecule to uncharge. + method (str, optional): Algorithm for uncharging: "rdkit" for RDKit's + built-in ChargeParent method, "nocharge" for + Neil O'Boyle's nocharge algorithm (as adapted + by Vincent Scalfani) + Default is to use "all", starting with uncharger. + """ + if isinstance(mol, Chem.RWMol): + mol = copy.copy(mol) + else: + mol = Chem.RWMol(mol) + + METHOD_LIST = ["all", "rdkit", "nocharge"] + + if method not in METHOD_LIST: + raise KeyError(f"Method must be in {METHOD_LIST}; got: {method}") + + if method in ("rdkit", "all"): + mol = ChargeParent(mol) + if get_formal_charge(mol) == 0: + return mol + + if method in ("nocharge", "all"): + # Algorithm adapted from Noel O’Boyle (Vincent Scalfani adapted code for RDKit) + # See https://www.rdkit.org/docs/Cookbook.html#neutralizing-molecules) + + implicit_h = is_implicit(mol) + if implicit_h: + pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]") + else: + pattern = Chem.MolFromSmarts("[+1!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]") + + at_matches = mol.GetSubstructMatches(pattern) + at_matches_list = [y[0] for y in at_matches] + if len(at_matches_list) > 0: + for at_idx in at_matches_list: + atom = mol.GetAtomWithIdx(at_idx) + chg = atom.GetFormalCharge() + + if chg > 0: + mol = deprotonate_at_site(mol, at_idx) + elif chg < 0: + mol = protonate_at_site(mol, at_idx) + + if get_formal_charge(mol) == 0: + return mol + + warnings.warn(f"Unable to uncharge: got {mol_to_smiles(mol)}") + return mol + + +def protonate_at_site(mol : Chem.RWMol, site : int): + ''' + Add a proton of a mol object at the provided index. + + Args: + mol: Mol object + site: RDKit atom index of the site to be de/protonated. + ''' + + length = mol.GetNumAtoms() + atom = mol.GetAtomWithIdx(site) + atom.SetFormalCharge(atom.GetFormalCharge() + 1) + + if is_implicit(mol): + hcount = atom.GetTotalNumHs(includeNeighbors=True) + newcharge = hcount + 1 + atom.SetNumExplicitHs(newcharge) + else: + h_atom = Chem.MolFromSmiles('[H]') + mol = combine_mols(mol, h_atom) + mol = Chem.RWMol(mol) # as it appears to get un-RWmol from combining + mol.AddBond(site, length, order=Chem.rdchem.BondType.SINGLE) + + return mol + + +def deprotonate_at_site(mol : Chem.RWMol, site : int): + ''' + Remove a proton of a mol object at the provided index. + + Args: + mol: Mol object + site: RDKit atom index of the site to be de/protonated. + + ''' + + atom = mol.GetAtomWithIdx(site) + atom.SetFormalCharge(atom.GetFormalCharge() - 1) + + if is_implicit(mol): + hcount = atom.GetTotalNumHs(includeNeighbors=True) + newcharge = hcount - 1 + atom.SetNumExplicitHs(newcharge) + else: + for neighbor in atom.GetNeighbors(): + if neighbor.GetAtomicNum() == 1: + mol.RemoveAtom(neighbor.GetIdx()) + break + + return mol + + def get_closed_shell_mol( mol: Chem.RWMol, sanitize: bool = True, diff --git a/test/rdtools/test_compare.py b/test/rdtools/test_compare.py index 0f6e085c..1c3574ee 100644 --- a/test/rdtools/test_compare.py +++ b/test/rdtools/test_compare.py @@ -1,8 +1,9 @@ import pytest -from rdtools.compare import get_match_and_recover_recipe, get_unique_mols, has_matched_mol, is_same_connectivity_mol +from rdtools.compare import get_match_and_recover_recipe, get_unique_mols, has_matched_mol, is_same_connectivity_mol, is_symmetric_to_substructure from rdtools.conversion import mol_from_smiles, mol_to_smiles +from rdkit.Chem import MolFromSmarts @pytest.mark.parametrize( "smi1, smi2, expected", @@ -147,3 +148,18 @@ def test_has_same_connectivity(smi1, smi2, expect_match): mol1 = mol_from_smiles(smi1) mol2 = mol_from_smiles(smi2) assert is_same_connectivity_mol(mol1, mol2) == expect_match + + +@pytest.mark.parametrize( + 'smi, sma, expect_match', + [ + ('CC(=O)C', '[CX3]=[OX1]', True), + ('CC(=O)C(=O)C', '[CX3]=[OX1]', True), + ('CC(=O)CC(=O)', '[CX3]=[OX1]', False), + ('C', '[CX3]=[OX1]', False), + ('OCC(CO)(CO)CO', '[CX4]-[OX2]', True), + ]) +def test_is_symmetric_to_substructure(smi, sma, expect_match): + mol = mol_from_smiles(smi) + substructure = MolFromSmarts(sma) + assert is_symmetric_to_substructure(mol, substructure) == expect_match diff --git a/test/rdtools/test_mol.py b/test/rdtools/test_mol.py index 443110ae..9ff79002 100644 --- a/test/rdtools/test_mol.py +++ b/test/rdtools/test_mol.py @@ -16,6 +16,8 @@ get_atomic_nums, get_atom_masses, get_closed_shell_mol, + is_implicit, + uncharge_mol ) from rdtools.conf import ( embed_multiple_null_confs, @@ -175,3 +177,23 @@ def test_get_closed_shell_mol(rad_smi, expect_smi, cheap, atommap): rad_mol = mol_from_smiles(rad_smi, assign_atom_map=atommap) cs_mol = get_closed_shell_mol(rad_mol, cheap=cheap) assert mol_to_smiles(cs_mol) == expect_smi + + +@pytest.mark.parametrize( + "ion_smi, expected_smi", + [ + ("CCCCC(=O)[O-]", "CCCCC(=O)O"), + ("c1ccccc1[O-]", "Oc1ccccc1"), + ("CCC[NH3+]", "CCCN"), + ("[NH3+]CC(=O)[O-]", "NCC(=O)O"), + ("S(=O)(=O)([O-])[O-]", "O=S(=O)(O)O"), + ("C", "C"), + ], +) +@pytest.mark.parametrize("method", ["all", "rdkit", "nocharge"]) +def test_uncharge_mol(ion_smi, expected_smi, method): + + ion_mol = mol_from_smiles(ion_smi) + neut_mol = uncharge_mol(ion_mol, method=method) + assert mol_to_smiles(neut_mol) == expected_smi +