diff --git a/proteinshake/__init__.py b/proteinshake/__init__.py
index 6a49b246..d180631a 100644
--- a/proteinshake/__init__.py
+++ b/proteinshake/__init__.py
@@ -1 +1 @@
-__version__ = '0.3.9'
+__version__ = '0.3.11'
\ No newline at end of file
diff --git a/proteinshake/datasets/alphafold.py b/proteinshake/datasets/alphafold.py
index ecf77117..83d6bf35 100644
--- a/proteinshake/datasets/alphafold.py
+++ b/proteinshake/datasets/alphafold.py
@@ -1,40 +1,10 @@
-import os
-import re
-import tarfile
 import glob

 from proteinshake.datasets import Dataset
-from proteinshake.utils import download_url, extract_tar, load, save, unzip_file, progressbar
-
-# A map of organism names to their download file names. See https://alphafold.ebi.ac.uk/download
-AF_DATASET_NAMES = {
-    'arabidopsis_thaliana': 'UP000006548_3702_ARATH',
-    'caenorhabditis_elegans': 'UP000001940_6239_CAEEL',
-    'candida_albicans': 'UP000000559_237561_CANAL',
-    'danio_rerio': 'UP000000437_7955_DANRE',
-    'dictyostelium_discoideum': 'UP000002195_44689_DICDI',
-    'drosophila_melanogaster': 'UP000000803_7227_DROME',
-    'escherichia_coli': 'UP000000625_83333_ECOLI',
-    'glycine_max': 'UP000008827_3847_SOYBN',
-    'homo_sapiens': 'UP000005640_9606_HUMAN',
-    'methanocaldococcus_jannaschii': 'UP000000805_243232_METJA',
-    'mus_musculus': 'UP000000589_10090_MOUSE',
-    'oryza_sativa': 'UP000059680_39947_ORYSJ',
-    'rattus_norvegicus': 'UP000002494_10116_RAT',
-    'saccharomyces_cerevisiae': 'UP000002311_559292_YEAST',
-    'schizosaccharomyces_pombe': 'UP000002485_284812_SCHPO',
-    'zea_mays': 'UP000007305_4577_MAIZE',
-    'swissprot': 'swissprot_pdb',
-}
-
-description = 'Predicted structures'
+from proteinshake.utils import download_url, extract_tar, unzip_file, progressbar

 class AlphaFoldDataset(Dataset):
-    """ 3D structures predicted by AlphaFold.
-    Requires the `organism` name to be specified.
-    See https://alphafold.ebi.ac.uk/download for a full list of available organsims.
-    Pass the full latin organism name separated by a space or underscore.
-    `organism` can also be 'swissprot', in which case the full SwissProt structure predictions will be downloaded (ca. 500.000).
+    """ SwissProt 3D structures predicted by AlphaFold.

     .. admonition:: Please cite
@@ -46,74 +16,22 @@ class AlphaFoldDataset(Dataset):
     Raw data was obtained and modified from `AlphaFoldDB <https://alphafold.ebi.ac.uk/>`_, originally licensed under `CC-BY-4.0 <https://creativecommons.org/licenses/by/4.0/>`_.
-
-    .. list-table :: Data Properties
-       :widths: 50 50
-       :header-rows: 1
-
-       * - organism
-         - # proteins
-       * - ``'arabidopsis_thaliana'``
-         - 27,386
-       * - ``'caenorhabditis_elegans'``
-         - 19,613
-       * - ``'candida_albicans'``
-         - 5,951
-       * - ``'danio_rerio'``
-         - 24,430
-       * - ``'dictyostelium_discoideum'``
-         - 12,485
-       * - ``'drosophila_melanogaster'``
-         - 13,318
-       * - ``'escherichia_coli'``
-         - 4,362
-       * - ``'glycine_max'``
-         - 55,696
-       * - ``'homo_sapiens'``
-         - 23,172
-       * - ``'methanocaldococcus_jannaschii'``
-         - 1,773
-       * - ``'mus_musculus'``
-         - 21,398
-       * - ``'oryza_sativa'``
-         - 43,631
-       * - ``'rattus_norvegicus'``
-         - 21,069
-       * - ``'saccharomyces_cerevisiae'``
-         - 6,016
-       * - ``'schizosaccharomyces_pombe'``
-         - 5,104
-       * - ``'zea_mays'``
-         - 39,203
-       * - ``'swissprot'``
-         - 541,143
-
     Parameters
     ----------
-    organism: str
-        The organism name or 'swissprot'.
+    version: int, default 4
+        The AlphaFoldDB version.
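+
+    A minimal usage sketch with the new signature (default ``root`` assumed; ``proteins()`` is provided by the base class):
+
+    .. code-block:: python
+
+        >>> from proteinshake.datasets import AlphaFoldDataset
+        >>> dataset = AlphaFoldDataset(root='data', version=4)
+        >>> proteins = dataset.proteins()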
""" - exlude_args_from_signature = ['organism'] - - def __init__(self, organism='swissprot', version='v4', only_single_chain=True, **kwargs): - self.organism = organism.lower().replace(' ','_') - self.base_url = 'https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/' - self.version = version + def __init__(self, version=4, only_single_chain=True, **kwargs): + self.base_url = f'https://ftp.ebi.ac.uk/pub/databases/alphafold/v{version}' + #self.file_name = f'swissprot_pdb_v{version}' + self.file_name = f'UP000000805_243232_METJA_v{version}' super().__init__(only_single_chain=only_single_chain, **kwargs) - @property - def name(self): - return f'{self.__class__.__name__}_{self.organism}' - - def get_raw_files(self): - return glob.glob(f'{self.root}/raw/*/*.pdb')[:self.limit] - def get_id_from_filename(self, filename): - return re.search('(?<=AF-)(.*)(?=-F.+-model)', filename).group() + return filename.split('-')[1] def download(self): - os.makedirs(f'{self.root}/raw/{self.organism}', exist_ok=True) - download_url(self.base_url+AF_DATASET_NAMES[self.organism]+f'_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity) - extract_tar(f'{self.root}/raw/{self.organism}/{AF_DATASET_NAMES[self.organism]}_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity) - [unzip_file(f) for f in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity)] + download_url(f'{self.base_url}/{self.file_name}.tar', f'{self.root}/raw', verbosity=self.verbosity) + extract_tar(f'{self.root}/raw/{self.file_name}.tar', f'{self.root}/raw/files', verbosity=self.verbosity) + for path in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity): unzip_file(path) diff --git a/proteinshake/datasets/dataset.py b/proteinshake/datasets/dataset.py index 9af77047..e3f541c0 100644 --- a/proteinshake/datasets/dataset.py +++ b/proteinshake/datasets/dataset.py @@ -2,22 +2,17 @@ """ Base dataset class for protein 3D structures. """ -import os, gzip, inspect, time, itertools, tarfile, io, requests -import copy -from collections import defaultdict, Counter +import os, inspect, requests, glob from functools import cached_property -import multiprocessing as mp -import pandas as pd import numpy as np import freesasa from biopandas.pdb import PandasPdb from joblib import Parallel, delayed -from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph from fastavro import reader as avro_reader from proteinshake.transforms import IdentityTransform, RandomRotateTransform, CenterTransform -from proteinshake.utils import download_url, save, load, unzip_file, write_avro, Generator, progressbar, warning, error +from proteinshake.utils import download_url, unzip_file, write_avro, Generator, progressbar, warning, error AA_THREE_TO_ONE = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'} AA_ONE_TO_THREE = {v:k for k, v in AA_THREE_TO_ONE.items()} @@ -241,7 +236,7 @@ def get_raw_files(self): list The list of raw PDB files used in this dataset. """ - raise NotImplementedError + return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit] def get_id_from_filename(self, filename): """ Implement me in a subclass! @@ -258,7 +253,7 @@ def get_id_from_filename(self, filename): str A PDB identifier or other ID. 
""" - raise NotImplementedError + return filename.rstrip('.pdb') def download(self): """ Implement me in a subclass! @@ -306,6 +301,10 @@ def download_precomputed(self, resolution='residue'): download_url(f'{self.repository_url}/{self.name}.{resolution}.avro.gz', f'{self.root}', verbosity=self.verbosity) if self.verbosity > 0: print('Unzipping...') unzip_file(f'{self.root}/{self.name}.{resolution}.avro.gz') + for filename in self.additional_files: + if not os.path.exists(f'{self.root}/{filename}'): + download_url(f'{self.repository_url}/{filename}.gz', f'{self.root}', verbosity=0) + unzip_file(f'{self.root}/{filename}.gz') def parse(self): """ Parses all PDB files returned from :meth:`proteinshake.datasets.Dataset.get_raw_files()` and saves them to disk. Can run in parallel. @@ -320,9 +319,6 @@ def parse(self): if self.verbosity > 0: print(f'Filtered {before-len(proteins)} proteins.') - # if self.center: - # if True: - # if self.random_rotate: if self.name == 'ProteinProteinInteractionDataset': print("Centering") proteins = [CenterTransform()(p) for p in proteins] @@ -334,6 +330,7 @@ def parse(self): atom_proteins = [{'protein':p['protein'], 'atom':p['atom']} for p in proteins] write_avro(residue_proteins, f'{self.root}/{self.name}.residue.avro') write_avro(atom_proteins, f'{self.root}/{self.name}.atom.avro') + return proteins def parse_pdb(self, path): """ Parses a single PDB file first into a DataFrame, then into a protein object (a dictionary). Also validates the PDB file and provides the hook for `add_protein_attributes`. Returns `None` if the protein was found to be invalid. @@ -489,21 +486,6 @@ def validate(self, df): if not sum(df['residue_type'].map(lambda x: not x is None)) > 0: return False return True - - def describe(self): - """ Produces dataset statistics. - - Returns - ------- - dict - A dictionary of summary statistics of this dataset. - """ - n_resi = len(self.data.residue_index) / len(self.data.ID) - data = {'name': type(self).__name__, - 'num_proteins': len(self), - 'avg size (# residues)': n_resi - } - return data def to_graph(self, resolution='residue', transform=IdentityTransform(), **kwargs): """ Converts the raw dataset to a graph dataset. See :meth:`proteinshake.representations.GraphDataset` for arguments. diff --git a/proteinshake/datasets/enzyme_commission.py b/proteinshake/datasets/enzyme_commission.py index 26d6caba..b203bae6 100644 --- a/proteinshake/datasets/enzyme_commission.py +++ b/proteinshake/datasets/enzyme_commission.py @@ -13,15 +13,6 @@ class EnzymeCommissionDataset(RCSBDataset): Raw data was obtained and modified from `RCSB Protein Data Bank `_, originally licensed under `CC0 1.0 `_. - - .. list-table:: Dataset stats - :widths: 100 - :header-rows: 1 - - * - # proteins - * - 15603 - - .. list-table:: Annotations :widths: 25 35 45 :header-rows: 1 @@ -35,8 +26,6 @@ class EnzymeCommissionDataset(RCSBDataset): """ - description = 'Enzymes' - def __init__(self, query=[['rcsb_polymer_entity.rcsb_ec_lineage.name','exists']], **kwargs): """ diff --git a/proteinshake/datasets/gene_ontology.py b/proteinshake/datasets/gene_ontology.py index ad39e0dd..2576f8e5 100644 --- a/proteinshake/datasets/gene_ontology.py +++ b/proteinshake/datasets/gene_ontology.py @@ -22,16 +22,6 @@ class GeneOntologyDataset(RCSBDataset): Raw data was obtained and modified from `RCSB Protein Data Bank `_, originally licensed under `CC0 1.0 `_. - - - .. list-table:: Dataset stats - :widths: 100 - :header-rows: 1 - - * - # proteins - * - 32633 - - .. 
        :widths: 25 25 50
        :header-rows: 1
@@ -52,8 +42,6 @@

     """

-    description = 'Gene Ontology'
-
     additional_files = ['GeneOntologyDataset.godag.obo']

     def __init__(self, query=[['rcsb_polymer_entity_annotation.type','exact_match','GO']], **kwargs):
@@ -85,10 +73,3 @@ def add_protein_attributes(self, protein):
         protein['protein']['cellular_component'] = [term for term in go_terms if godag[term].namespace == 'cellular_component']
         protein['protein']['biological_process'] = [term for term in go_terms if godag[term].namespace == 'biological_process']
         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Gene Ontology (GO)"
-        desc['values'] = f"{len(set((p['GO'][0] for p in self.proteins)))} (root)"
-        desc['type'] = 'Categorical, Hierarchical'
-        return desc
diff --git a/proteinshake/datasets/protein_family.py b/proteinshake/datasets/protein_family.py
index 1e084e49..5f65c790 100644
--- a/proteinshake/datasets/protein_family.py
+++ b/proteinshake/datasets/protein_family.py
@@ -14,16 +14,6 @@ class ProteinFamilyDataset(RCSBDataset):

     Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.
-
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 31109
-
-
     .. list-table:: Annotations
        :widths: 25 35 45
        :header-rows: 1
@@ -37,8 +27,6 @@

     """

-    description = 'Protein Families'
-
     def __init__(self, pfam_version='34.0', query=[['rcsb_polymer_entity_annotation.type','exact_match','Pfam']], **kwargs):
         self.pfam_version = pfam_version
         super().__init__(query=query, **kwargs)
@@ -53,10 +41,3 @@ def add_protein_attributes(self, protein):
                 pfams.append(a['annotation_id'])
         protein['protein']['Pfam'] = pfams
         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Protein Family (Pfam)"
-        desc['values'] = f"{len(set((p['Pfam'][0] for p in self.proteins)))} (root)"
-        desc['type'] = 'Categorical, Hierarchical'
-        return desc
diff --git a/proteinshake/datasets/protein_ligand_decoys.py b/proteinshake/datasets/protein_ligand_decoys.py
index 56fa1222..40d0265b 100644
--- a/proteinshake/datasets/protein_ligand_decoys.py
+++ b/proteinshake/datasets/protein_ligand_decoys.py
@@ -33,17 +33,6 @@ class ProteinLigandDecoysDataset(Dataset):

     Raw data was obtained and modified from `DUDE-Z <https://dudez.docking.org/>`_.
-
-
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 38
-
-
     .. list-table:: Annotations
        :widths: 25 35 45
        :header-rows: 1
@@ -70,18 +59,10 @@

     """

-    description = 'Proteins with ligands and decoys'
-
     @patch('proteinshake.datasets.dataset.AA_THREE_TO_ONE', EXTENDED_AA_THREE_TO_ONE)
     def pdb2df(self, path):
         return super().pdb2df(path)

-    def get_raw_files(self):
-        return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit]
-
-    def get_id_from_filename(self, filename):
-        return filename.split(".")[0]
-
     def download(self):
         targets = ['AA2AR', 'ABL1', 'ACES', 'ADA', 'ADRB2', 'AMPC', 'ANDR', 'CSF1R', 'CXCR4', 'DEF', 'DRD4', 'EGFR', 'FA7', 'FA10', 'FABP4', 'FGFR1', 'FKB1A', 'GLCM', 'HDAC8', 'HIVPR', 'HMDH', 'HS90A', 'ITAL', 'KITH', 'KIT', 'LCK', 'MAPK2', 'MK01', 'MT1', 'NRAM', 'PARP1', 'PLK1', 'PPARA', 'PTN1', 'PUR2', 'RENI', 'ROCK1', 'SRC', 'THRB', 'TRY1', 'TRYB1', 'UROK', 'XIAP']
diff --git a/proteinshake/datasets/protein_ligand_interface.py b/proteinshake/datasets/protein_ligand_interface.py
index 40a4b586..0ef49e1d 100644
--- a/proteinshake/datasets/protein_ligand_interface.py
+++ b/proteinshake/datasets/protein_ligand_interface.py
@@ -37,13 +37,6 @@ class ProteinLigandInterfaceDataset(Dataset):
     version: str
         PDBBind version to use.

-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 4642
-
     .. list-table:: Annotations
        :widths: 20 55 25
        :header-rows: 1
@@ -80,8 +73,6 @@
       - :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
     """

-    description = ''
-
     def __init__(self, version='2020', **kwargs):
         self.version = version
         super().__init__(**kwargs)
@@ -203,10 +194,3 @@ def add_protein_attributes(self, protein):
         protein['protein']['fp_morgan_r2'] = fp_morgan

         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Small Mol. Binding Site (residue-level)"
-        desc['values'] = 2
-        desc['type'] = 'Binary'
-        return desc
diff --git a/proteinshake/datasets/protein_protein_interface.py b/proteinshake/datasets/protein_protein_interface.py
index 5abcffca..edd1c953 100644
--- a/proteinshake/datasets/protein_protein_interface.py
+++ b/proteinshake/datasets/protein_protein_interface.py
@@ -1,14 +1,10 @@
 # -*- coding: utf-8 -*-
-import os
-import glob
-from joblib import Parallel, delayed
-from tqdm import tqdm
-from collections import defaultdict, Counter
-
+import os, glob
 import pandas as pd
+import numpy as np
+from joblib import Parallel, delayed
 from biopandas.pdb import PandasPdb
 from sklearn.neighbors import KDTree
-import numpy as np

 from proteinshake.datasets import Dataset
 from proteinshake.utils import extract_tar, download_url, progressbar, load, save, unzip_file
@@ -40,14 +36,6 @@ class ProteinProteinInterfaceDataset(Dataset):
        Distance in angstroms within which a pair of residues is considered to belong to the interface.

-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 2839
-
     .. list-table:: Annotations
        :widths: 25 55 20
        :header-rows: 1
@@ -72,120 +60,51 @@ def __init__(self, cutoff=6, version='2020', **kwargs):
         self.version = version
         self.cutoff = cutoff
         super().__init__(**kwargs)
-
-        def download_file(filename):
-            if not os.path.exists(f'{self.root}/{filename}'):
-                if not self.use_precomputed:
-                    self.parse_interfaces()
-                else:
-                    download_url(f'{self.repository_url}/{filename}.gz', f'{self.root}', verbosity=0)
-                    unzip_file(f'{self.root}/{filename}.gz')
-            return load(f'{self.root}/{filename}')
-
-        self._interfaces = download_file(f'{self.name}.interfaces.json')
+        self.interfaces = load(f'{self.root}/{self.name}.interfaces.json')

     def get_raw_files(self):
-        return glob.glob(f'{self.root}/raw/files/chains/*.pdb')
-
+        return sorted(glob.glob(f'{self.root}/raw/files/chains/*.pdb'))
+
     def get_id_from_filename(self, filename):
-        return filename[:4]
-
-    def get_contacts(self, protein, cutoff=6):
-        """Obtain interfacing residues within a single structure of polymers. Uses
-        KDTree data structure for vector search.
-
-        Parameters
-        ----------
-        protein: dict
-            Parsed protein dictionary.
+        return os.path.splitext(filename)[0]

-        Returns
-        --------
-        `dict`: 2-level dictionary mapping a pair of chains to the list of interfacing residue positions (e.g `interfaces['A']['B'] = {(1, 3), (2, 4)}`
-        says that residues 1 and 2 of chain A are in contact with 3 and 4 in chain B. The positions are _indices_ in the residue/atom list.
+    def get_contacts(self, path):
+        """ Obtain interfacing residues within a single structure of polymers.
+        Uses KDTree data structure for vector search.
         """
-
-        def get_coords(p):
-            return np.array([p['residue']['x'],
-                             p['residue']['y'],
-                             p['residue']['z']]).T
-
-
-        def defaultdict_to_dict(default_dict):
-            if isinstance(default_dict, defaultdict):
-                default_dict = {k: defaultdict_to_dict(v) for k, v in default_dict.items()}
-            elif isinstance(default_dict, dict):
-                default_dict = {k: defaultdict_to_dict(v) for k, v in default_dict.items()}
-            return default_dict
-
-        interfaces = defaultdict(lambda: defaultdict(list))
-        coords = get_coords(protein)
-        kdt = KDTree(coords, leaf_size=1)
-
-        seq_inds = [0]
-        current_chain = protein['residue']['chain_id'][0]
-
-        # get a vector of sequence positions
-        ind = 0
-        for chain_id in protein['residue']['chain_id'][1:]:
-            if chain_id != current_chain:
-                ind = -1
-                current_chain = chain_id
-            ind += 1
-            seq_inds.append(ind)
-
-        query = kdt.query_radius(coords, cutoff)
-        interface = []
-        for i,result in enumerate(query):
-            this_chain = protein['residue']['chain_id'][i]
-            this_pos = seq_inds[i]
-            for r in result:
-                that_chain = protein['residue']['chain_id'][r]
-                that_pos = seq_inds[r]
-                if this_chain != that_chain:
-                    # ugly , I know
-                    interfaces[this_chain][that_chain].append((this_pos, that_pos))
-                    interfaces[that_chain][this_chain].append((that_pos, this_pos))
-
-        return defaultdict_to_dict(interfaces)
-
-    def get_complexes_files(self):
-        return glob.glob(f"{self.root}/raw/files/PP/*.pdb")
-
-    def parse_interfaces(self):
-        """ Get all interfaces and store in dict"""
-        protein_dfs = Parallel(n_jobs=self.n_jobs)(delayed(self.parse_pdb)(path) for path in progressbar(self.get_complexes_files(), desc='Loading complexes'))
-        print("Computing interfaces")
-        interfaces = {p['protein']['ID']: self.get_contacts(p, cutoff=self.cutoff) for p in tqdm(protein_dfs, total=len(protein_dfs)) if not p is None}
-        save(interfaces, f'{self.root}/{self.name}.interfaces.json')
+        df = PandasPdb().read_pdb(path).df['ATOM']
+
+        # split chains
+        pdbid = os.path.splitext(os.path.basename(path))[0]
+        for chain, chain_df in df.groupby('chain_id'):
+            new_df = PandasPdb()
+            new_df._df = {'ATOM': chain_df}
+            new_df.to_pdb(f'{self.root}/raw/files/chains/{pdbid}_{chain}.pdb')
+
+        # compute contacts
+        df = df[df['atom_name'] == 'CA']
+        coords = df[['x_coord','y_coord','z_coord']].to_numpy()
+        chain_ids = np.array(df['chain_id'])
+        chain_index = np.hstack([np.arange(np.sum(chain_ids==chain)) for chain in list(dict.fromkeys(chain_ids))]) # get residue position relative to chain
+        contacts = KDTree(coords, leaf_size=1).query_radius(coords, self.cutoff) # for each residue, get all neighbouring residues within the cutoff radius
+        query_chains = np.repeat(chain_ids, [len(c) for c in contacts]) # construct long index of query chain ids
+        query_chain_index = np.repeat(chain_index, [len(c) for c in contacts]) # construct long index of query chain index
+        result_chains = np.hstack([chain_ids[c] for c in contacts]) # construct long index of result chain ids
+        result_chain_index = np.hstack([chain_index[c] for c in contacts]) # construct long index of result chain index
+        interfaces = pd.DataFrame({'query': query_chains, 'result': result_chains, 'index': tuple(zip(query_chain_index.tolist(),result_chain_index.tolist()))})
+        interfaces = interfaces[query_chains != result_chains] # remove contacts within the same chain
+        if len(interfaces) == 0: return pdbid, None
+        interfaces = dict(interfaces.groupby('query').apply(lambda x: dict(x.groupby('result')['index'].apply(list)))) # format to dict of dicts: {chain_A: {chain_B: [(1, 3), (2, 4), ...]}}
+        return pdbid, interfaces

     def download(self):
         download_url(f'https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v{self.version}_PP.tar.gz', f'{self.root}/raw')
         extract_tar(f'{self.root}/raw/PDBbind_v{self.version}_PP.tar.gz', f'{self.root}/raw/files', extract_members=True)
         os.makedirs(f'{self.root}/raw/files/chains', exist_ok=True)
-        print("Chain splitting")
-        self.chain_split(f'{self.root}/raw/files/chains')
-
-    def chain_split(self, dest):
-        """ Split all the raw PDBs in path to individual ones by chain.
-        Replaces original PDB file in place.
- """ - for p in tqdm(self.get_complexes_files()): - ppdb = PandasPdb().read_pdb(p).df['ATOM'] - pdbid = os.path.basename(p).split(".")[0] - for chain, chain_df in ppdb.groupby('chain_id'): - new_df = PandasPdb() - new_df._df = {'ATOM': chain_df} - new_df.to_pdb(os.path.join(dest, f"{pdbid}_{chain}.pdb")) - # assert not chain_df.isnull().values.any(), f"NULL in {p}" - pass - - def get_id_from_filename(self, filename): - return filename.split(".")[0] - - def describe(self): - desc = super().describe() - desc['property'] = "Protein-protein interface (residue-level)" - desc['values'] = 2 - desc['type'] = "Binary" - return desc + complexed_files = sorted(glob.glob(f'{self.root}/raw/files/PP/*.pdb')) + contacts = Parallel(n_jobs=self.n_jobs)(delayed(self.get_contacts)(path) for path in progressbar(complexed_files, desc='Computing interfaces', verbosity=self.verbosity)) + interfaces = {pdbid: interfaces for pdbid, interfaces in contacts if not interfaces is None} + save(interfaces, f'{self.root}/{self.name}.interfaces.json') + + + diff --git a/proteinshake/datasets/rcsb.py b/proteinshake/datasets/rcsb.py index a7abb91f..a1073dfa 100644 --- a/proteinshake/datasets/rcsb.py +++ b/proteinshake/datasets/rcsb.py @@ -39,12 +39,6 @@ def __init__(self, query=[], from_list=None, only_single_chain=True, max_request self.max_requests = max_requests super().__init__(only_single_chain=only_single_chain, **kwargs) - def get_raw_files(self): - return glob.glob(f'{self.root}/raw/files/*.pdb') - - def get_id_from_filename(self, filename): - return filename[:4] - def download(self): """ Fetches PDBs from RCSB with an API call. The default query selects protein-only structures with a single chain. diff --git a/proteinshake/datasets/scop.py b/proteinshake/datasets/scop.py index d528ed89..5e818807 100644 --- a/proteinshake/datasets/scop.py +++ b/proteinshake/datasets/scop.py @@ -19,15 +19,6 @@ class SCOPDataset(RCSBDataset): Raw data was obtained and modified from `RCSB Protein Data Bank `_, originally licensed under `CC0 1.0 `_. - - .. list-table:: Dataset stats - :widths: 100 - :header-rows: 1 - - * - # proteins - * - 10066 - - .. list-table:: Annotations :widths: 25 45 35 :header-rows: 1 @@ -51,7 +42,7 @@ class SCOPDataset(RCSBDataset): """ - def _parse_scop(self, path): + def parse_scop(self, path): names = ['FA-DOMID', 'FA-PDBID', 'FA-PDBREG', 'FA-UNIID', 'FA-UNIREG', 'SF-DOMID', 'SF-PDBID', 'SF-PDBREG', 'SF-UNIID', 'SF-UNIREG', 'SCOPCLA'] df = pd.read_csv(path, sep=' ', comment='#', names=names, dtype=str) return {k: dict([cla.split("=") for cla in v.split(",")]) for k,v in zip(df['FA-PDBID'], df['SCOPCLA'])} @@ -59,7 +50,7 @@ def _parse_scop(self, path): def download(self): # get the annots download_url(f'http://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt', f'{self.root}/raw/scop.txt') - self.scop = self._parse_scop(f'{self.root}/raw/scop.txt') + self.scop = self.parse_scop(f'{self.root}/raw/scop.txt') ids = list(self.scop['FA-PDBID'].unique()) # get the proteins diff --git a/proteinshake/datasets/tm_align.py b/proteinshake/datasets/tm_align.py index 139ed772..f3a038fa 100644 --- a/proteinshake/datasets/tm_align.py +++ b/proteinshake/datasets/tm_align.py @@ -2,8 +2,6 @@ ''' TMalign needs to be in your $PATH. 
 Follow the instructions at https://zhanggroup.org/TM-align/readme.c++.txt
 '''
-import glob
-import requests
 import os
 import itertools
 import re
@@ -12,20 +10,10 @@
 import shutil
 import numpy as np
 from biopandas.pdb import PandasPdb
-from collections import defaultdict
 from joblib import Parallel, delayed
-from functools import cached_property

 from proteinshake.datasets import RCSBDataset
-from proteinshake.utils import (extract_tar,
-                                download_url,
-                                save,
-                                load,
-                                unzip_file,
-                                global_distance_test,
-                                local_distance_difference_test,
-                                progressbar
-                                )
+from proteinshake.utils import download_url, load, unzip_file, global_distance_test, local_distance_difference_test, progressbar


 class TMAlignDataset(RCSBDataset):
@@ -42,15 +30,6 @@ class TMAlignDataset(RCSBDataset):

     Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 994
-
     .. code-block:: python

         from proteinshake.datasets import TMAlignDataset
diff --git a/proteinshake/tasks/__init__.py b/proteinshake/tasks/__init__.py
index 1cd088b7..9a983702 100644
--- a/proteinshake/tasks/__init__.py
+++ b/proteinshake/tasks/__init__.py
@@ -1,6 +1,6 @@
 from .task import Task
 from .enzyme_class import EnzymeClassTask
-from .pfam_task import ProteinFamilyTask
+from .protein_family import ProteinFamilyTask
 from .ligand_affinity import LigandAffinityTask
 from .binding_site_detection import BindingSiteDetectionTask
 from .structure_similarity import StructureSimilarityTask
diff --git a/proteinshake/tasks/binding_site_detection.py b/proteinshake/tasks/binding_site_detection.py
index b4cb14c9..84bd1133 100644
--- a/proteinshake/tasks/binding_site_detection.py
+++ b/proteinshake/tasks/binding_site_detection.py
@@ -1,4 +1,5 @@
 from sklearn import metrics
+import numpy as np

 from proteinshake.datasets import ProteinLigandInterfaceDataset
 from proteinshake.tasks import Task
@@ -11,44 +12,29 @@
     DatasetClass = ProteinLigandInterfaceDataset

     type = 'Binary Classification'
-    input = 'Residue'
+    input = 'Protein'
     output = 'Small Molecule Binding Residues'
+    default_metric = 'MCC'
+    level = 'Residue'

-    @property
-    def task_in(self):
-        return ('residue')
-
-    @property
-    def task_type(self):
-        return ('residue', 'binary')
-
-    @property
-    def task_out(self):
-        return ('binary')
+    def target(self, protein_dict):
+        binding_sites = protein_dict['residue']['binding_site']
+        return np.arange(len(binding_sites))[binding_sites].tolist(), len(binding_sites)

-    @property
-    def target_dim(self):
-        return (1)
-
-    def dummy_output(self):
-        import random
-        return [random.randint(0, 1) for p in self.test_targets]
-
-    def target(self, protein):
-        return protein['residue']['binding_site']
-
-    def compute_targets(self):
-        # compute targets (e.g. for scaling)
-        self.train_targets = [p for i in self.train_index for p in self.target(self.proteins[i])]
-        self.val_targets = [p for i in self.val_index for p in self.target(self.proteins[i])]
-        self.test_targets = [p for i in self.test_index for p in self.target(self.proteins[i])]
-
-    @property
-    def default_metric(self):
-        return 'mcc'
+    def target_transform(self, target):
+        residue_indices, size = target
+        transformed_target = np.zeros(size, dtype=bool)
+        transformed_target[residue_indices] = True
+        return transformed_target

     def evaluate(self, y_true, y_pred):
         return {
-            'accuracy': metrics.accuracy_score(y_true, y_pred),
-            'mcc': metrics.matthews_corrcoef(y_true, y_pred),
+            'Accuracy': metrics.accuracy_score(np.hstack(y_true), np.hstack(y_pred)),
+            'MCC': metrics.matthews_corrcoef(np.hstack(y_true), np.hstack(y_pred)),
         }
+
+    @property
+    def y_dummy(self):
+        return np.array([np.random.choice([0,1], size=len(x)).astype(bool) for x in self.y_test], dtype=object)
diff --git a/proteinshake/tasks/enzyme_class.py b/proteinshake/tasks/enzyme_class.py
index 4294613f..b90a5a44 100644
--- a/proteinshake/tasks/enzyme_class.py
+++ b/proteinshake/tasks/enzyme_class.py
@@ -1,13 +1,11 @@
 from sklearn import metrics
-from functools import cached_property
 import numpy as np

 from proteinshake.datasets import EnzymeCommissionDataset
 from proteinshake.tasks import Task

 class EnzymeClassTask(Task):
-    """ Predict the enzyme commission classification of a protein structure. This is a protein-level multi-class prediction.
-
+    """ Predict the enzyme commission classification of a protein structure.
     """

     DatasetClass = EnzymeCommissionDataset
@@ -15,59 +13,23 @@
     type = 'Multiclass Classification'
     input = 'Protein'
     output = 'Enzyme Commission Level 1'
-
-    def __init__(self, ec_level=0, *args, **kwargs):
-        self.ec_level = ec_level
-        super().__init__(*args, **kwargs)
-
-    @cached_property
-    def token_map(self):
-        labels = {p['protein']['EC'].split(".")[self.ec_level] for p in self.proteins}
-        return {label: i for i, label in enumerate(sorted(list(labels)))}
-
-    def dummy_output(self):
-        import random
-        tokens = list(self.token_map.values())
-        return [random.choice(tokens) for _ in range(len(self.test_targets))]
-
-    @property
-    def num_classes(self):
-        return len(self.token_map)
-
-    @property
-    def task_type(self):
-        return ('protein', 'multi_class')
+    default_metric = 'Precision'

-    @property
-    def task_in(self):
-        return ('protein')
+    def compute_token_map(self):
+        unique_labels = {p['protein']['EC'].split(".")[0] for p in self.dataset.proteins()}
+        return {k:v for v,k in enumerate(sorted(unique_labels))}

-    @property
-    def task_out(self):
-        return ('multi_class')
-
-    @property
-    def target_dim(self):
-        return (len(self.token_map.values()))
-
-    @property
-    def num_features(self):
-        return 20
-
-    def target(self, protein):
-        return self.token_map[protein['protein']['EC'].split(".")[0]]
-
-    @property
-    def default_metric(self):
-        return 'precision'
+    def target(self, protein_dict):
+        return self.token_map[protein_dict['protein']['EC'].split(".")[0]]

     def evaluate(self, y_true, y_pred):
-        """ Using metrics from https://doi.org/10.1073/pnas.1821905116 """
         y_true = np.array(y_true, dtype=int)
         y_pred = np.array(y_pred, dtype=int)
         return {
-            'precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
-            'recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
-            'accuracy': metrics.accuracy_score(y_true, y_pred),
-            #'AUROC': metrics.roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo'),
+            'Precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
+            'Recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
+            'Accuracy': metrics.accuracy_score(y_true, y_pred),
         }
+
+    @property
+    def y_dummy(self):
+        return np.random.choice(list(self.token_map.values()), len(self.y_test))
diff --git a/proteinshake/tasks/gene_ontology.py b/proteinshake/tasks/gene_ontology.py
index a9531c58..997c1f51 100644
--- a/proteinshake/tasks/gene_ontology.py
+++ b/proteinshake/tasks/gene_ontology.py
@@ -1,7 +1,6 @@
 import itertools
 import numpy as np
-from sklearn import metrics
-from functools import cached_property

 from proteinshake.datasets import GeneOntologyDataset
 from proteinshake.tasks import Task
@@ -18,49 +17,19 @@
     type = 'Multilabel Classification'
     input = 'Protein'
     output = 'Gene Ontology Terms'
-
-    def __init__(self, branch='molecular_function', *args, **kwargs):
-        self.branch = branch
-        super().__init__(*args, **kwargs)
-
-    @cached_property
-    def token_map(self):
-        labels = set(itertools.chain(*[p['protein'][self.branch] for p in self.proteins]))
-        return {label: i for i, label in enumerate(sorted(list(labels)))}
-
-    @property
-    def num_classes(self):
-        return len(self.token_map)
-
-    @property
-    def classes(self):
-        return list(self.token_map.keys())
-
-    @property
-    def task_in(self):
-        return ('protein')
-
-    @property
-    def task_type(self):
-        return ('protein', 'multi_label')
-
-    @property
-    def task_out(self):
-        return ('multi_label')
+    default_metric = 'Fmax'

-    @property
-    def target_dim(self):
-        return (len(self.token_map.values()))
-
-    @property
-    def num_features(self):
-        return 20
+    def compute_token_map(self):
+        labels = set(itertools.chain(*[p['protein']['molecular_function'] for p in self.dataset.proteins()]))
+        return {k:v for v,k in enumerate(sorted(labels))}

     def target(self, protein):
-        tokens = [self.token_map[i] for i in protein['protein'][self.branch]]
-        target = np.zeros_like(self.classes, dtype=bool)
-        target[tokens] = True
-        return target
+        return [self.token_map[i] for i in protein['protein']['molecular_function']]
+
+    def target_transform(self, target):
+        transformed = np.zeros(len(self.token_map), dtype=bool)
+        transformed[target] = True
+        return transformed

     def precision(self, y_true, y_pred, threshold):
         mt = (y_pred.max(axis=1) >= threshold).sum()
@@ -77,41 +46,22 @@ def recall(self, y_true, y_pred, threshold):
         nom = np.logical_and(y_true, y_pred).sum(axis=1).astype(np.float32)
         denom = y_true.sum(axis=1).astype(np.float32)
         return 1/ne * np.divide(nom, denom, out=np.zeros_like(nom), where=denom!=0).sum()
-
-    def remaining_uncertainty(self, y_true, y_pred, threshold):
-        pass
-
-    def missing_information(self, y_true, y_pred, threshold):
-        pass

     def fmax(self, y_true, y_pred):
         fmax = 0
-        for t in np.linspace(0,1,21):
+        for t in np.linspace(0,1,11):
             prec, rec = self.precision(y_true, y_pred, t), self.recall(y_true, y_pred, t)
             if prec+rec == 0: continue
             f1 = (2 * prec * rec) / (prec + rec)
             fmax = max(fmax, f1)
         return fmax
-
-    def smin(self, y_pred):
-        return min([
-            np.sqrt(
-                self.remaining_uncertainty(y_pred, t) ** 2
-                + self.missing_information(y_pred, t) ** 2
-            )
-            for t in np.linspace(0,1,21)
-        ])
-
-    def dummy_output(self):
-        return np.random.rand(len(self.test_index), len(self.token_map.keys()))
-
-    @property
-    def default_metric(self):
-        return 'Fmax'

     def evaluate(self, y_true, y_pred):
-        y_true, y_pred = np.array(y_true), np.array(y_pred)
+        y_true, y_pred = np.stack(y_true), np.array(y_pred)
         return {
             'Fmax': self.fmax(y_true, y_pred),
-            #'Smin': self.smin(y_true, y_pred),
         }
+
+    @property
+    def y_dummy(self):
+        return np.random.rand(len(self.y_test), len(self.token_map))
diff --git a/proteinshake/tasks/ligand_affinity.py b/proteinshake/tasks/ligand_affinity.py
index 2ec127a7..321e6a07 100644
--- a/proteinshake/tasks/ligand_affinity.py
+++ b/proteinshake/tasks/ligand_affinity.py
@@ -1,4 +1,5 @@
 from sklearn import metrics
+import numpy as np

 from proteinshake.datasets import ProteinLigandInterfaceDataset
 from proteinshake.tasks import Task
@@ -14,36 +15,16 @@
     type = 'Regression'
     input = 'Protein and Molecule'
     output = 'Dissociation Constant Kd'
+    default_metric = 'R2'

-    @property
-    def task_in(self):
-        return ('protein', 'molecule')
-
-    @property
-    def task_type(self):
-        return ('protein', 'regression')
-
-    @property
-    def task_out(self):
-        return ('regression')
-
-    @property
-    def out_dim(self):
-        return (1)
-
-    def dummy_output(self):
-        import random
-        return [random.random() for _ in range(len(self.test_index))]
-
-    def target(self, protein):
-        return protein['protein']['neglog_aff']
-
-    @property
-    def default_metric(self):
-        return 'r2'
+    def target(self, protein_dict):
+        return protein_dict['protein']['neglog_aff']

     def evaluate(self, y_true, y_pred):
         return {
-            'mse': metrics.mean_squared_error(y_true, y_pred),
-            'r2': metrics.r2_score(y_true, y_pred)
+            'Mean Squared Error': metrics.mean_squared_error(y_true, y_pred),
+            'R2': metrics.r2_score(y_true, y_pred)
         }
+
+    @property
+    def y_dummy(self):
+        return np.random.uniform(size=len(self.y_test))
diff --git a/proteinshake/tasks/pfam_task.py b/proteinshake/tasks/pfam_task.py
deleted file mode 100644
index 634c2005..00000000
--- a/proteinshake/tasks/pfam_task.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from sklearn import metrics
-from functools import cached_property
-import numpy as np
-
-from proteinshake.datasets import ProteinFamilyDataset
-from proteinshake.tasks import Task
-
-class ProteinFamilyTask(Task):
-    """ Predict the protein family classification of a protein structure. This is a protein-level multi-class prediction.
- - """ - - DatasetClass = ProteinFamilyDataset - - type = 'Multiclass Classification' - input = 'Protein' - output = 'Protein Family (Pfam)' - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - @cached_property - def token_map(self): - # Pfam': ['Fis1 N-terminal tetratricopeptide repeat (Fis1_TPR_N)', 'Fis1 C-terminal tetratricopeptide repeat (Fis1_TPR_C)'], - labels = {p['protein']['Pfam'][0] for p in self.proteins} - return {label: i for i, label in enumerate(sorted(list(labels)))} - - def dummy_output(self): - import random - tokens = list(self.token_map.values()) - return [random.choice(tokens) for _ in range(len(self.test_targets))] - - @property - def task_in(self): - return ('protein') - - @property - def task_type(self): - return ('protein', 'multi_class') - - @property - def task_out(self): - return ('multi_class') - - @property - def out_dim(self): - return len(self.token_map) - - @property - def num_features(self): - return 20 - - def target(self, protein): - return self.token_map[protein['protein']['Pfam'][0]] - - @property - def default_metric(self): - return 'precision' - - def evaluate(self, y_true, y_pred): - """ Using metrics from https://doi.org/10.1073/pnas.1821905116 """ - y_true = np.array(y_true, dtype=int) - y_pred = np.array(y_pred, dtype=int) - return { - 'precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), - 'recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), - 'accuracy': metrics.accuracy_score(y_true, y_pred), - #'AUROC': metrics.roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo'), - } diff --git a/proteinshake/tasks/protein_family.py b/proteinshake/tasks/protein_family.py new file mode 100644 index 00000000..c60be744 --- /dev/null +++ b/proteinshake/tasks/protein_family.py @@ -0,0 +1,36 @@ +from sklearn import metrics +import numpy as np + +from proteinshake.datasets import ProteinFamilyDataset +from proteinshake.tasks import Task + +class ProteinFamilyTask(Task): + """ Predict the protein family classification of a protein structure. This is a protein-level multi-class prediction. + """ + + DatasetClass = ProteinFamilyDataset + + type = 'Multiclass Classification' + input = 'Protein' + output = 'Protein Family (Pfam)' + default_metric = 'Precision' + + def compute_token_map(self): + labels = {p['protein']['Pfam'][0] for p in self.dataset.proteins()} + return {k: v for v,k in enumerate(sorted(labels))} + + def target(self, protein): + return self.token_map[protein['protein']['Pfam'][0]] + + def evaluate(self, y_true, y_pred): + """ Using metrics from https://doi.org/10.1073/pnas.1821905116 """ + y_true = np.array(y_true, dtype=int) + y_pred = np.array(y_pred, dtype=int) + return { + 'Precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), + 'Recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), + 'Accuracy': metrics.accuracy_score(y_true, y_pred), + } + + def dummy(self): + return np.random.choice(list(self.token_map.values()), len(self.test_targets)) diff --git a/proteinshake/tasks/protein_protein_interface.py b/proteinshake/tasks/protein_protein_interface.py index 6b6dd2b5..6e660dbd 100644 --- a/proteinshake/tasks/protein_protein_interface.py +++ b/proteinshake/tasks/protein_protein_interface.py @@ -5,12 +5,8 @@ from proteinshake.tasks import Task class ProteinProteinInterfaceTask(Task): - """ Identify the binding residues of a protein-protein complex. 
-
-    .. code-block:: python
-
-        >>> from proteinshake.tasks import ProteinProteinInterfaceTask
-        >>> ta = ProteinProteinInterfaceTask()
+    """ Identify the binding residues of a protein-protein complex.
+    This is a residue-level binary classification.
     """

     DatasetClass = ProteinProteinInterfaceDataset
@@ -18,86 +14,37 @@
     type = 'Binary Classification'
     input = 'Protein and Protein'
     output = 'Protein Binding Interface Residues'
+    default_metric = 'AUROC (median)'
+    pairwise = True
+    level = 'Residues'

-    @property
-    def task_in(self):
-        return ('residue', 'residue')
-
-    @property
-    def task_type(self):
-        return ('residue_pair', 'binary')
-
-    @property
-    def task_out(self):
-        return ('binary')
-
-    @property
-    def out_dim(self):
-        return (1)
-
-    def dummy_output(self):
-        import random
-        return [np.where(np.random.randint(0, 2, p.shape) == 0, 0, 1) for p in self.test_targets]
-
-    def update_index(self):
-        """ Transform to pairwise indexing """
-        self.train_index = self.compute_pairs(self.train_index)
-        self.val_index = self.compute_pairs(self.val_index)
-        self.test_index = self.compute_pairs(self.test_index)
-
-    def compute_targets(self):
-        self.train_targets = [self.target(self.proteins[i], self.proteins[j]) for i,j in self.train_index]
-        self.val_targets = [self.target(self.proteins[i], self.proteins[j]) for i,j in self.val_index]
-        self.test_targets = [self.target(self.proteins[i], self.proteins[j]) for i,j in self.test_index]
-
-    def compute_pairs(self, index):
-        """ Grab all pairs of chains that share an interface"""
-        protein_to_index = {p['protein']['ID']: i for i, p in enumerate(self.dataset.proteins())}
-        def find_index(pdbid, chain):
-            return protein_to_index[f'{pdbid}_{chain}']
-
-        proteins = self.dataset.proteins()
-        chain_pairs = []
-        for i, protein in enumerate(proteins):
-            if i not in index:
-                continue
-            #chain = protein['residue']['chain_id'][0]
-            pdbid, chain = protein['protein']['ID'].split('_')
-            try:
-                chain_pairs.extend([(i, find_index(pdbid, partner)) for partner in self.dataset._interfaces[pdbid][chain]])
-            # if chain is not in any interface, we skip
-            except (KeyError, IndexError):
-                continue
-        chain_pairs = [(i,j) for i,j in chain_pairs if i in index and j in index] # @carlos please check
-        return np.array(chain_pairs, dtype=int)
-
+    def compute_paired_index(self, index):
+        """ Return all pairs of interactions that are in self.interfaces.
+ """ + pdb_to_idx = {p['protein']['ID']:i for i,p in enumerate(self.dataset.proteins())} + i,j = [],[] + for k def target(self, protein_1, protein_2): - chain_1 = protein_1['residue']['chain_id'][0] - chain_2 = protein_2['residue']['chain_id'][0] - chain_1_length = len(protein_1['residue']['chain_id']) - chain_2_length = len(protein_2['residue']['chain_id']) - pdbid = protein_1['protein']['ID'].split('_')[0] - - contacts = np.zeros((chain_1_length, chain_2_length)) - try: - inds = np.array(self.dataset._interfaces[pdbid][chain_1][chain_2]) - contacts[inds[:,0], inds[:,1]] = 1.0 - except KeyError: # raised if there are no interactions between query chains - pass - return np.array(contacts) - - @property - def default_metric(self): - return 'average_precision' + pdbid_1, chain_1 = protein_1['protein']['ID'].split('_') + pdbid_2, chain_2 = protein_2['protein']['ID'].split('_') + shape = (len(protein_1['residue']['chain_id']), len(protein_2['residue']['chain_id'])) + return self.dataset.interfaces[pdbid_1][chain_1][chain_2], shape + + def target_transform(self, target): + residue_indices, shape = target + transformed_target = np.zeros(shape, dtype=bool) + transformed_target[tuple(zip(*residue_indices))] = True + return transformed_target def evaluate(self, y_true, y_pred): """ Evaluate performance of an interface classifier. """ - raw_values = {'auroc': np.zeros(len(y_true)), - 'auprc': np.zeros(len(y_true)), - 'sizes':np.zeros(len(y_true)) - } + raw_values = { + 'auroc': np.zeros(len(y_true)), + 'auprc': np.zeros(len(y_true)), + 'sizes': np.zeros(len(y_true)) + } for i, (y, y_pred) in enumerate(zip(y_true, y_pred)): y = y.flatten() @@ -106,26 +53,17 @@ def evaluate(self, y_true, y_pred): raw_values['auprc'][i] = metrics.average_precision_score(y, y_pred) raw_values['sizes'][i] = len(y) - result = {} tot = np.sum(raw_values['sizes']) norm = raw_values['sizes'] / tot - result['auroc_weighted'] = np.mean(raw_values['auroc'] * norm) - result['auprc_weighted'] = np.mean(raw_values['auprc'] * norm) - result['auroc_median'] = np.median(raw_values['auroc']) - result['auprc_median'] = np.median(raw_values['auprc']) - result['auroc_mean'] = np.mean(raw_values['auroc']) - result['auprc_mean'] = np.mean(raw_values['auprc']) - return result - - def to_graph(self, *args, **kwargs): - self.dataset = self.dataset.to_graph(*args, **kwargs, transform=Compose([CenterTransform(), RandomRotateTransform()])) - return self - - def to_point(self, *args, **kwargs): - self.dataset = self.dataset.to_point(*args, **kwargs, transform=Compose([CenterTransform(), RandomRotateTransform()])) - return self - - def to_voxel(self, *args, **kwargs): - self.dataset = self.dataset.to_voxel(*args, **kwargs, transform=Compose([CenterTransform(), RandomRotateTransform()])) - return self + return { + 'AUROC (weighted)': np.mean(raw_values['auroc'] * norm), + 'AUPRC (weighted)': np.mean(raw_values['auprc'] * norm), + 'AUROC (median)': np.median(raw_values['auroc']), + 'AUPRC (median)': np.median(raw_values['auprc']), + 'AUROC (mean)': np.mean(raw_values['auroc']), + 'AUPRC (mean)': np.mean(raw_values['auprc']), + } + + def y_dummy(self): + return [np.where(np.random.randint(0, 2, p.shape) == 0, 0, 1) for p in self.test_targets] \ No newline at end of file diff --git a/proteinshake/tasks/structural_class.py b/proteinshake/tasks/structural_class.py index f13ab72f..ff0866d3 100644 --- a/proteinshake/tasks/structural_class.py +++ b/proteinshake/tasks/structural_class.py @@ -1,5 +1,4 @@ from sklearn import metrics -from functools import 
 import numpy as np

 from proteinshake.datasets import SCOPDataset
@@ -7,7 +6,6 @@
 class StructuralClassTask(Task):
     """ Predict the SCOP class of a protein structure. This is a protein-level multi-class prediction.
-
     """

     DatasetClass = SCOPDataset
@@ -15,55 +13,27 @@
     type = 'Multiclass Classification'
     input = 'Protein'
     output = 'SCOP Class'
+    default_metric = 'Accuracy'

     def __init__(self, scop_level='SCOP-FA', *args, **kwargs):
         self.scop_level = scop_level
         super().__init__(*args, **kwargs)

-    @cached_property
-    def token_map(self):
-        labels = {p['protein'][self.scop_level] for p in self.proteins}
-        return {label: i for i, label in enumerate(sorted(list(labels)))}
-
-    @property
-    def target_dim(self):
-        return len(self.token_map)
-
-    def dummy_output(self):
-        import random
-        tokens = list(self.token_map.values())
-        return [random.choice(tokens) for _ in range(len(self.test_targets))]
-
-    @property
-    def task_in(self):
-        return ('protein')
-
-    @property
-    def task_type(self):
-        return ('protein', 'multi-class')
-
-    @property
-    def task_out(self):
-        return ('multi_class')
-
-    @property
-    def num_features(self):
-        return 20
+    def compute_token_map(self):
+        labels = {p['protein'][self.scop_level] for p in self.dataset.proteins()}
+        return {k:v for v,k in enumerate(sorted(list(labels)))}

     def target(self, protein):
         return self.token_map[protein['protein'][self.scop_level]]

-    @property
-    def default_metric(self):
-        return 'precision'
-
     def evaluate(self, y_true, y_pred):
-        """ Using metrics from https://doi.org/10.1073/pnas.1821905116 """
         y_true = np.array(y_true, dtype=int)
         y_pred = np.array(y_pred, dtype=int)
         return {
-            'precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
-            'recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
-            'accuracy': metrics.accuracy_score(y_true, y_pred),
-            #'AUROC': metrics.roc_auc_score(self.test_targets, y_pred, average='macro', multi_class='ovo'),
+            'Precision': metrics.precision_score(y_true, y_pred, average='macro', zero_division=0),
+            'Recall': metrics.recall_score(y_true, y_pred, average='macro', zero_division=0),
+            'Accuracy': metrics.accuracy_score(y_true, y_pred),
         }
+
+    @property
+    def y_dummy(self):
+        return np.random.choice(list(self.token_map.values()), size=len(self.y_test))
diff --git a/proteinshake/tasks/structure_search.py b/proteinshake/tasks/structure_search.py
index 84ca0212..0505f900 100644
--- a/proteinshake/tasks/structure_search.py
+++ b/proteinshake/tasks/structure_search.py
@@ -24,24 +24,12 @@
     type = 'Retrieval'
     input = 'Protein'
     output = 'Similar Proteins'
+    default_metric = 'Precision@k'

     def __init__(self, min_sim=0.8, *args, **kwargs):
         self.min_sim = min_sim
         super().__init__(*args, **kwargs)

-    @property
-    def task_type(self):
-        return ('protein', 'retrieval')
-
-
-    @property
-    def task_in(self):
-        return ('protein')
-
-    @property
-    def task_out(self):
-        return ('retrieval')
-
     @cached_property
     def targets(self):
         """ Precompute the set of similar proteins for each query """
@@ -51,30 +39,16 @@
         return targets

     def target(self, protein):
-        """ The target for a protein is a list of proteins deemed 'relevant'
-        according to `self.min_sim`.
+        """ The target for a protein is a list of proteins deemed 'relevant' according to `self.min_sim`.
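+
+        A sketch of the returned value (the IDs shown are hypothetical):
+
+        .. code-block:: python
+
+            >>> task.target(protein)
+            ['1ABC', '4XYZ']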
""" return self.targets[protein['protein']['ID']] - def _precision_at_k(self, y_true, y_pred, k): + def precision_at_k(self, y_true, y_pred, k): return len(set(y_pred[:k]).intersection(set(y_true))) / len(y_pred) - def _recall_at_k(self, y_true, y_pred, k): + def recall_at_k(self, y_true, y_pred, k): return len(set(y_pred[:k]).intersection(set(y_true))) / len(y_true) - def dummy_output(self): - import random - pred = [] - ids = [p['protein']['ID'] for p in self.proteins] - for query in self.proteins[self.test_index]: - targets = self.target(query) - pred.append(random.sample(ids, len(targets))) - return pred - - @property - def default_metric(self): - return 'precision_at_k' - def evaluate(self, y_true, y_pred, k=5): """ Retrieval metrics. @@ -83,9 +57,16 @@ def evaluate(self, y_true, y_pred, k=5): y_pred: List of indices of items (hits) in the dataset for a query. """ - results = defaultdict(list) - for yt, yp in zip(y_true, y_pred): - results['precision_at_k'].append(self._precision_at_k(yt, yp, k)) - results['recall_at_k'].append(self._recall_at_k(yt, yp, k)) + return { + 'Precision@k': np.mean([self.precision_at_k(yt, yp, k) for yt, yp in zip(y_true, y_pred)]), + 'Recall@k': np.mean([self.recall_at_k(yt, yp, k) for yt, yp in zip(y_true, y_pred)]), + } - return {k: np.mean(v) for k, v in results.items()} + def dummy(self): + import random + pred = [] + ids = [p['protein']['ID'] for p in self.proteins] + for query in self.proteins[self.test_index]: + targets = self.target(query) + pred.append(random.sample(ids, len(targets))) + return pred diff --git a/proteinshake/tasks/structure_similarity.py b/proteinshake/tasks/structure_similarity.py index fdf17254..4094ec6f 100644 --- a/proteinshake/tasks/structure_similarity.py +++ b/proteinshake/tasks/structure_similarity.py @@ -1,4 +1,4 @@ -import itertools +from collections import defaultdict from scipy.stats import spearmanr import numpy as np @@ -19,57 +19,19 @@ class StructureSimilarityTask(Task): type = 'Regression' input = 'Protein and Protein' output = 'Local Distance Difference Test' + default_metric = 'Spearman R' + pairwise = True - def __init__(self, *args, **kwargs): - - super().__init__(*args, **kwargs) - - def update_index(self): - """ Transform to pairwise indexing """ - self.train_index = self.compute_pairs(self.train_index) - self.val_index = self.compute_pairs(self.val_index) - self.test_index = self.compute_pairs(self.test_index) - - def compute_targets(self): - self.train_targets = np.array([self.target(*self.proteins[i]) for i in self.train_index]) - self.val_targets = np.array([self.target(*self.proteins[i]) for i in self.val_index]) - self.test_targets = np.array([self.target(*self.proteins[i]) for i in self.test_index]) - - @property - def task_in(self): - return ('protein', 'protein') - - @property - def task_type(self): - return ('protein_pair', 'regression') - - @property - def task_out(self): - return ('regression') - - @property - def target_dim(self): - return (1) - - def compute_pairs(self, index): - combinations = np.array(list(itertools.combinations(range(len(index)), 2)), dtype=int) - return index[combinations] - - def target(self, protein1, protein2): - pdbid_1 = protein1['protein']['ID'] - pdbid_2 = protein2['protein']['ID'] - return self.dataset.lddt(pdbid_1,pdbid_2) - - def dummy_output(self): - import random - return [random.random() for _ in range(len(self.test_targets))] - - @property - def default_metric(self): - return 'spearman' + def target(self, protein_1, protein_2): + pdbid_1 = 
+        pdbid_2 = protein_2['protein']['ID']
+        return float(self.dataset.lddt(pdbid_1,pdbid_2))

     def evaluate(self, y_true, y_pred):
         return {
-            'mse': metrics.mean_squared_error(y_true, y_pred),
-            'spearman': spearmanr(y_true, y_pred)[0]
+            'Mean Squared Error': metrics.mean_squared_error(y_true, y_pred),
+            'Spearman R': spearmanr(y_true, y_pred)[0]
         }
+
+    @property
+    def y_dummy(self):
+        return np.random.uniform(size=len(self.y_test))
diff --git a/proteinshake/tasks/task.py b/proteinshake/tasks/task.py
index 27b6579e..ccd1b965 100644
--- a/proteinshake/tasks/task.py
+++ b/proteinshake/tasks/task.py
@@ -1,37 +1,23 @@
 import os
-import json
+import requests
 import itertools
 import numpy as np
 from sklearn.model_selection import train_test_split

-from proteinshake.utils import download_url, save, load
+from proteinshake.utils import save, load, progressbar

 class Task:
-    """ Base class for task-related utilities.
-    This class wraps a proteinshake dataset and exposes split indices,
-    integer-coded labels for classification tasks, and an evaluator function.
-
-    Sample usage (assuming you have a model in the namespace):
-
-    .. code-block:: python
-
-        >>> from proteinshake.tasks import EnzymeClassTask
-        >>> task = EnzymeClassTask()
-        >>> data = task.dataset.to_graph(eps=8).pyg()
-        >>> y_pred = model(data[task.train])
-        >>> task.evaluate(y_pred)
-        ... {'roc_auc_score': 0.7}
-
+    """ Base class for a task.

     Arguments
     ----------
-    dataset: pytorch.datasets.Dataset
-        Dataset to use for this task.
-    split: str, default='random'
-        How to split the data. Can be 'random', 'sequence', or 'structure'.
+    root: str, default 'data'
+        The root directory to put the dataset and task data.
+    split: str, default 'random'
+        How to split the data. Can be 'random', 'sequence', 'structure' or 'custom'.
     split_similarity_threshold: float
-        Maximum similarity to allow between train and test samples.
+        Maximum similarity between train and test set.
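+
+    Sample usage, a minimal sketch against the refactored interface (assumes a trained ``model`` in the namespace):
+
+    .. code-block:: python
+
+        >>> from proteinshake.tasks import EnzymeClassTask
+        >>> task = EnzymeClassTask(root='data').to_graph(eps=8).pyg()
+        >>> y_pred = model(task.X_test)
+        >>> task.evaluate(task.y_test, y_pred)
+        {'Precision': ..., 'Recall': ..., 'Accuracy': ...}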
""" DatasetClass = None @@ -39,119 +25,171 @@ class Task: type = None input = None output = None + default_metric = None + pairwise = False + level = 'Protein' def __init__(self, root = 'data', split = 'random', split_similarity_threshold = 0.7, + use_precomputed_task = True, **kwargs ): self.root = root self.dataset = self.DatasetClass(root=root, **kwargs) - proteins = self.dataset.proteins() - self.size = len(proteins) - self.split_similarity_threshold = split_similarity_threshold self.split = split - - class Proteins(): # dummy class to implement __getitem__, could be implemented directly on the task - def __init__(self, proteins): - self.proteins = list(proteins) - - def __len__(self): - return len(self.proteins) - - def __getitem__(self, idx): - try: - idx = int(idx) - except: - return [self.__getitem__(i) for i in idx] - if idx >= len(self.proteins): - raise StopIteration - return self.proteins[idx] - - self.proteins = Proteins(proteins) - self.name = self.__class__.__name__ - - # load split indices - if not self.split == 'none': - self.compute_index() - self.compute_targets() - - def compute_index(self): - split_name = f'{self.split}_split_{self.split_similarity_threshold}' if self.split in ['sequence','structure'] else f'{self.split}_split' - if split_name in self.proteins[0]['protein']: - self.train_index = np.array([i for i,p in enumerate(self.proteins) if p['protein'][split_name] == 'train']) - self.val_index = np.array([i for i,p in enumerate(self.proteins) if p['protein'][split_name] == 'val']) - self.test_index = np.array([i for i,p in enumerate(self.proteins) if p['protein'][split_name] == 'test']) - else: - self.train_index, self.val_index, self.test_index = self.compute_custom_split(self.split) - - self.update_index() - - def update_index(self): - pass + self.split_similarity_threshold = split_similarity_threshold - def compute_targets(self): - # compute targets (e.g. for scaling) - self.train_targets = [self.target(self.proteins[i]) for i in self.train_index] - self.val_targets = [self.target(self.proteins[i]) for i in self.val_index] - self.test_targets = [self.target(self.proteins[i]) for i in self.test_index] - - def compute_custom_split(self, split): - """ Implements random splitting. Only necessary when not using the precomputed splits, e.g. when implementing a custom task. - Note that the random, sequence and structure splits will be automatically computed for your custom task if it is merged into ProteinShake main. - Override this method to implement your own splitting logic. - Compare also the proteinshake_release repository. + if not self.files_exist: + if use_precomputed_task and self.files_hosted: self.download_precomputed() + else: self.compute() + + task_data = load(f'{self.root}/{self.__class__.__name__}.json.gz') + self.load_splits(task_data) + self.load_targets(task_data) - Arguments - ------------ - split: str - Name of the custom split as passed to the task. ('random', 'sequence', 'structure', 'none') + def __getattr__(self, key): + """ Captures method calls and forwards them to the dataset if they are a representation or framework conversion. 
+ """ + if key in ['to_graph','to_point','to_voxel','np','dgl','pyg','torch','nx','tf']: + def proxy(*args, **kwargs): + self.dataset = getattr(self.dataset, key)(*args, **kwargs) + return self + return proxy + else: return object.__getattribute__(self, key) + + @property + def files_exist(self): + return os.path.exists(f'{self.root}/{self.__class__.__name__}.json.gz') - Returns: - -------- - train_index - Numpy array with the index of proteins in the train split. - val_index - Numpy array with the index of proteins in the validation split. - test_index - Numpy array with the index of proteins in the test split. + @property + def files_hosted(self): + return requests.head(f'{self.dataset.repository_url}/{self.__class__.__name__}.npz', timeout=5).status_code == 200 + + def download_precomputed(self): + raise NotImplementedError + + def compute(self): + """ Computes the task file with splits, token maps, and targets. """ - inds = list(range(len(self.dataset.proteins()))) - train, test = train_test_split(inds, test_size=0.2) - val, test = train_test_split(test, test_size=0.5) + self.token_map = self.compute_token_map() + self.targets = self.compute_paired_targets() if self.pairwise else self.compute_targets() + save({ + 'random_split': self.compute_random_split(), + 'sequence_split': self.compute_sequence_split(), + 'structure_split': self.compute_structure_split(), + 'custom_split': self.compute_custom_split(), + 'token_map': self.token_map, + 'targets': self.targets, + }, f'{self.root}/{self.__class__.__name__}.json.gz') + + def compute_random_split(self): + indices = np.arange(len(self.dataset.proteins())) + train_indices, test_val_indices = train_test_split(indices, test_size=0.2, random_state=0) + test_indices, val_indices = train_test_split(test_val_indices, test_size=0.5, random_state=0) + return { + 'train': train_indices.tolist(), + 'test': test_indices.tolist(), + 'val': val_indices.tolist(), + } + + def compute_sequence_split(self): + return None + + def compute_structure_split(self): + return None + + def compute_custom_split(self): + return None + + def compute_token_map(self): + return None - return train, val, test + def compute_targets(self): + return [ + self.target(protein_dict) + for protein_dict in progressbar(self.dataset.proteins(), verbosity=self.dataset.verbosity, desc='Computing targets') + ] + + def compute_paired_targets(self): + print(len(self.dataset.proteins())) + index = self.compute_paired_index(np.arange(len(self.dataset.proteins()))) + print(len(index)) + exit() + return [ + self.target(A,B) + for A in progressbar(self.dataset.proteins(), verbosity=self.dataset.verbosity, desc='Computing targets') + for B in self.dataset.proteins() + ] + + def load_splits(self, task_data): + split_data = task_data[f'{self.split}_split'] + if self.split != 'random': split_data = split_data[f'similarity_{self.split_similarity_threshold}'] + self.train_index, self.test_index, self.val_index = split_data['train'], split_data['test'], split_data['val'] + if self.pairwise: + self.train_index = self.compute_paired_index(self.train_index) + self.val_index = self.compute_paired_index(self.val_index) + self.test_index = self.compute_paired_index(self.test_index) + + def compute_paired_index(self, index): + """ Computes all pairs between each element in the index. 
+ """ + combinations = np.array(list(itertools.combinations(range(len(index)), 2)), dtype=int) + return np.array(index)[combinations] + + def load_targets(self, task_data): + self.token_map = task_data['token_map'] + self.targets = np.array([self.target_transform(target) for target in task_data['targets']], dtype=object) + if self.pairwise: self.targets = self.targets.reshape(int(np.square(len(self.targets))), int(np.square(len(self.targets)))) + + def target_transform(self, target): + return target @property - def task_type(self): - """ Returns a string describing the type of task.""" - raise NotImplementedError + def X_train(self): + return self.dataset[self.train_index] @property - def num_features(self): - """ Number of input features to use for this task """ - raise NotImplementedError + def X_test(self): + return self.dataset[self.test_index] + + @property + def X_val(self): + return self.dataset[self.val_index] + + @property + def y_train(self): + return np.take(self.targets, self.train_index) @property - def num_classes(self): - """ Size of the output dimension for this task """ + def y_test(self): + return np.take(self.targets, self.test_index) + + @property + def y_val(self): + return np.take(self.targets, self.val_index) + + @property + def y_dummy(self): + """ Generates a random data object of the right type and shape to compare against the test set. + """ raise NotImplementedError - @property - def target(self, protein): + def target(self, protein_dict): """ Return the prediction target for one protein in the dataset. + Can be a scalar or a fixed size array. Arguments ------------ - protein: dict - proteinshake protein dictionary - + protein_dict: dict + A protein dictionary. - .. code-block: python + Returns + ------- + target: float + The prediction target. - >>> from proteinshake.tasks import EnzymeCommissionTask - >>> ta = EnzymeCommissionTask() """ raise NotImplementedError @@ -171,51 +209,5 @@ def evaluate(self, y_true, y_pred): Dictionary with evaluation results. Key-value pairs correspond to metric-score pairs. E.g. 
diff --git a/proteinshake/tasks/virtual_screen.py b/proteinshake/tasks/virtual_screen.py
index 6fc1f48f..97fd2a22 100644
--- a/proteinshake/tasks/virtual_screen.py
+++ b/proteinshake/tasks/virtual_screen.py
@@ -6,33 +6,18 @@ class VirtualScreenTask(Task):
     """ Test an affinity scoring model on a virtual screen.
 
-    The goal in a virtual screen is: for a given protein and a library of potential
-    binders, bring the binders to the top of the list.
+    The goal in a virtual screen is: for a given protein and a library of potential binders, bring the binders to the top of the list.
     In this task, the model is given a protein and a list of ligands to score.
-    The model scores each ligand in a library with a score proportional to the likelihood
-    that the protein and ligand will bind. This can be a docking score, energy calculation,
-    or just a probability.
-    Each protein's ligand library contains a certain number of active molecules (ligands)
-    and a certai (larger) number of decoys (non-binders).
-    We use the predicted scores to sort the whole library and calculate the position of each
-    active ligand in the sorted library.
-    Ligands in the topi percentiles which are known to be active contribute a 1 to the score
-    and those below the cutoff contribute a 0.
+    The model scores each ligand in a library with a score proportional to the likelihood that the protein and ligand will bind.
+    This can be a docking score, energy calculation, or just a probability.
+    Each protein's ligand library contains a certain number of active molecules (ligands) and a certain (larger) number of decoys (non-binders).
+    We use the predicted scores to sort the whole library and calculate the position of each active ligand in the sorted library.
+    Ligands in the top percentiles which are known to be active contribute a 1 to the score and those below the cutoff contribute a 0.
 
     .. warning::
-        This is a zero-shot task so we use the whole dataset in
-        evaluation. No train/test split.
-
-    .. code-block:: python
-
-        >>> from proteinshake.tasks import VirtualScreenTask
-        >>> import numpy as np
-        >>> task = VirtualScreenTask()
-        # predict a (random) binding score for each molecule
-        >>> preds = [np.random.rand(len(task.target(p))) for p in task.dataset.proteins()]
-        >>> task.evaluate(preds, cutoff_fraction=.2)
-        {'enrichment_factor-@.2': 0.6}
+        This is a zero-shot task so we use the whole dataset in evaluation.
+        There are no train/test/val splits.
 
     """
@@ -41,28 +26,13 @@ class VirtualScreenTask(Task):
     type = 'Ranking'
     input = 'Protein and Molecule'
     output = 'Affinity Score Ranking'
+    default_metric = 'Enrichment Factor'
 
     def __init__(self, *args, **kwargs):
         kwargs['split'] = 'none'
         super().__init__(*args, **kwargs)
         self.test_targets = [self.target(p) for p in self.proteins]
 
-    @property
-    def task_in(self):
-        return ('protein', 'molecule')
-
-    @property
-    def task_type(self):
-        return ('protein', 'virtual_screen')
-
-    @property
-    def task_out(self):
-        return ('virtual_screen')
-
-    @property
-    def target_dim(self):
-        return (1)
-
     def target(self, protein):
         """ The target here is a sorted list of smiles where the true ligands
         occupy the top of the list and the decoys occupy the rest.
@@ -71,19 +41,6 @@ def target(self, protein):
         """
         return protein['protein']['ligands_smiles'] + protein['protein']['decoys_smiles']
 
-    @property
-    def num_features(self):
-        return 20
-
-    def dummy_output(self):
-        import random
-        return [[random.random() for _ in range(len(self.target(p)))] for p in self.proteins]
-
-    @property
-    def default_metric(self):
-        return 'enrichment_factor'
-        pass
-
     def evaluate(self, y_true, y_pred, cutoff_fraction=.2):
         """ Computing enrichment factor on the whole dataset.
         To compute the EF, for each active ligand we check if it is in the top
@@ -119,5 +76,9 @@ def evaluate(self, y_true, y_pred, cutoff_fraction=.2):
             mean_active_rank = np.mean([ranks_dict[lig_id] for lig_id in active_ids])
             efs.append(mean_active_rank)
 
-        return {f'enrichment_factor': np.mean(efs)}
+        return {'Enrichment Factor': np.mean(efs)}
+
+    def dummy(self):
+        import random
+        return [[random.random() for _ in range(len(self.target(p)))] for p in self.proteins]
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 81e1a46b..70ed730b 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -4,7 +4,6 @@
 import unittest, tempfile
 
 from proteinshake.datasets import *
-from proteinshake.datasets.alphafold import AF_DATASET_NAMES
 
 class TestDatasets(unittest.TestCase):
diff --git a/tests/test_tasks.py b/tests/test_tasks.py
index e8ed05fd..59b69e8d 100644
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -2,7 +2,6 @@
 Tests prediction task classes.
 '''
 
-import random
 import unittest, tempfile
 
 from proteinshake.tasks import *
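
Adapting the doctest that was removed from the VirtualScreenTask docstring to the new signature and metric key, usage would look roughly like this (a sketch; the random baseline comes from the new dummy() method):

    from proteinshake.tasks import VirtualScreenTask

    task = VirtualScreenTask()
    y_pred = task.dummy()  # one random score per ligand in each protein's library
    task.evaluate(task.test_targets, y_pred, cutoff_fraction=0.2)
    # -> {'Enrichment Factor': ...}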