BorgwardtLab · timkucera · Jun 18, 2023 · Jun 18, 2023 · Jun 18, 2023 · Jun 18, 2023
diff --git a/proteinshake/__init__.py b/proteinshake/__init__.py
@@ -1 +1 @@
-__version__ = '0.3.9'
+__version__ = '0.3.11'
diff --git a/proteinshake/datasets/alphafold.py b/proteinshake/datasets/alphafold.py
@@ -1,40 +1,10 @@
-import os
-import re
-import tarfile
 import glob
 
 from proteinshake.datasets import Dataset
-from proteinshake.utils import download_url, extract_tar, load, save, unzip_file, progressbar
-
-# A map of organism names to their download file names. See https://alphafold.ebi.ac.uk/download
-AF_DATASET_NAMES = {
-    'arabidopsis_thaliana': 'UP000006548_3702_ARATH',
-    'caenorhabditis_elegans': 'UP000001940_6239_CAEEL',
-    'candida_albicans': 'UP000000559_237561_CANAL',
-    'danio_rerio': 'UP000000437_7955_DANRE',
-    'dictyostelium_discoideum': 'UP000002195_44689_DICDI',
-    'drosophila_melanogaster': 'UP000000803_7227_DROME',
-    'escherichia_coli': 'UP000000625_83333_ECOLI',
-    'glycine_max': 'UP000008827_3847_SOYBN',
-    'homo_sapiens': 'UP000005640_9606_HUMAN',
-    'methanocaldococcus_jannaschii': 'UP000000805_243232_METJA',
-    'mus_musculus': 'UP000000589_10090_MOUSE',
-    'oryza_sativa': 'UP000059680_39947_ORYSJ',
-    'rattus_norvegicus': 'UP000002494_10116_RAT',
-    'saccharomyces_cerevisiae': 'UP000002311_559292_YEAST',
-    'schizosaccharomyces_pombe': 'UP000002485_284812_SCHPO',
-    'zea_mays': 'UP000007305_4577_MAIZE',
-    'swissprot': 'swissprot_pdb',
-}
-
-description = 'Predicted structures'
+from proteinshake.utils import download_url, extract_tar, unzip_file, progressbar
 
 class AlphaFoldDataset(Dataset):
-    """ 3D structures predicted by AlphaFold.
-    Requires the `organism` name to be specified.
-    See https://alphafold.ebi.ac.uk/download for a full list of available organsims.
-    Pass the full latin organism name separated by a space or underscore.
-    `organism` can also be 'swissprot', in which case the full SwissProt structure predictions will be downloaded (ca. 500.000).
+    """ SwissProt 3D structures predicted by AlphaFold.
 
     .. admonition:: Please cite
 
@@ -46,74 +16,22 @@ class AlphaFoldDataset(Dataset):
 
       Raw data was obtained and modified from `AlphaFoldDB <https://alphafold.ebi.ac.uk>`_, originally licensed under `CC-BY-4.0 <https://creativecommons.org/licenses/by/4.0/>`_.
 
-
-    .. list-table :: Data Properties
-       :widths: 50 50
-       :header-rows: 1
-
-       * - organism
-         - # proteins
-       * - ``'arabidopsis_thaliana'``
-         - 27,386
-       * - ``'caenorhabditis_elegans'``
-         - 19,613
-       * - ``'candida_albicans'``
-         - 5,951
-       * - ``'danio_rerio'``
-         - 24,430
-       * - ``'dictyostelium_discoideum'``
-         - 12,485
-       * - ``'drosophila_melanogaster'``
-         - 13,318
-       * - ``'escherichia_coli'``
-         - 4,362
-       * - ``'glycine_max'``
-         - 55,696
-       * - ``'homo_sapiens'``
-         - 23,172
-       * - ``'methanocaldococcus_jannaschii'``
-         - 1,773
-       * - ``'mus_musculus'``
-         - 21,398
-       * - ``'oryza_sativa'``
-         - 43,631
-       * - ``'rattus_norvegicus'``
-         - 21,069
-       * - ``'saccharomyces_cerevisiae'``
-         - 6,016
-       * - ``'schizosaccharomyces_pombe'``
-         - 5,104
-       * - ``'zea_mays'``
-         - 39,203
-       * - ``'swissprot'``
-         - 541,143
-
     Parameters
     ----------
-    organism: str
-        The organism name or 'swissprot'.
+    version: int, default 4
+        The AlphaFoldDB version.
     """
 
-    exlude_args_from_signature = ['organism']
-
-    def __init__(self, organism='swissprot', version='v4', only_single_chain=True, **kwargs):
-        self.organism = organism.lower().replace(' ','_')
-        self.base_url = 'https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/'
-        self.version = version
+    def __init__(self, version=4, only_single_chain=True, **kwargs):  # version: AlphaFoldDB release number (int)
+        self.base_url = f'https://ftp.ebi.ac.uk/pub/databases/alphafold/v{version}'  # versioned AlphaFoldDB directory
+        # NOTE(review): the small 'UP000000805_243232_METJA' proteome used here before looked like a test leftover; this class documents SwissProt.
+        self.file_name = f'swissprot_pdb_v{version}'  # archive stem; download() appends '.tar'
         super().__init__(only_single_chain=only_single_chain, **kwargs)
 
-    @property
-    def name(self):
-        return f'{self.__class__.__name__}_{self.organism}'
-
-    def get_raw_files(self):
-        return glob.glob(f'{self.root}/raw/*/*.pdb')[:self.limit]
-
     def get_id_from_filename(self, filename):
-        return re.search('(?<=AF-)(.*)(?=-F.+-model)', filename).group()
+        return filename.split('-')[1]  # second '-'-separated field, e.g. 'AF-P12345-F1-model_v4.pdb' -> 'P12345'; NOTE(review): a '-' earlier in `filename` (e.g. a directory part) would shift this — confirm callers pass a bare file name
 
     def download(self):
-        os.makedirs(f'{self.root}/raw/{self.organism}', exist_ok=True)
-        download_url(self.base_url+AF_DATASET_NAMES[self.organism]+f'_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity)
-        extract_tar(f'{self.root}/raw/{self.organism}/{AF_DATASET_NAMES[self.organism]}_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity)
-        [unzip_file(f) for f in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity)]
+        download_url(f'{self.base_url}/{self.file_name}.tar', f'{self.root}/raw', verbosity=self.verbosity)  # archive URL -> <root>/raw
+        extract_tar(f'{self.root}/raw/{self.file_name}.tar', f'{self.root}/raw/files', verbosity=self.verbosity)  # unpack into <root>/raw/files, where the *.pdb.gz glob below looks
+        for path in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity): unzip_file(path)  # only the first self.limit archives (glob order) are unzipped
diff --git a/proteinshake/datasets/dataset.py b/proteinshake/datasets/dataset.py
@@ -2,22 +2,17 @@
 """
 Base dataset class for protein 3D structures.
 """
-import os, gzip, inspect, time, itertools, tarfile, io, requests
-import copy
-from collections import defaultdict, Counter
+import os, inspect, requests, glob
 from functools import cached_property
-import multiprocessing as mp
 
-import pandas as pd
 import numpy as np
 import freesasa
 from biopandas.pdb import PandasPdb
 from joblib import Parallel, delayed
-from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph
 from fastavro import reader as avro_reader
 
 from proteinshake.transforms import IdentityTransform, RandomRotateTransform, CenterTransform
-from proteinshake.utils import download_url, save, load, unzip_file, write_avro, Generator, progressbar, warning, error
+from proteinshake.utils import download_url, unzip_file, write_avro, Generator, progressbar, warning, error
 
 AA_THREE_TO_ONE = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'}
 AA_ONE_TO_THREE = {v:k for k, v in AA_THREE_TO_ONE.items()}
@@ -241,7 +236,7 @@ def get_raw_files(self):
         list
             The list of raw PDB files used in this dataset.
         """
-        raise NotImplementedError
+        return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit]  # default layout: *.pdb directly under <root>/raw/files, cut to self.limit; glob order is arbitrary, so the cut is not a stable subset
 
     def get_id_from_filename(self, filename):
         """ Implement me in a subclass!
@@ -258,7 +253,7 @@ def get_id_from_filename(self, filename):
         str
             A PDB identifier or other ID.
         """
-        raise NotImplementedError
+        return filename[:-4] if filename.endswith('.pdb') else filename  # drop only a literal '.pdb' suffix; rstrip('.pdb') strips any trailing '.', 'p', 'd', 'b' chars and mangles IDs like '1abd'
 
     def download(self):
         """ Implement me in a subclass!
@@ -306,6 +301,10 @@ def download_precomputed(self, resolution='residue'):
             download_url(f'{self.repository_url}/{self.name}.{resolution}.avro.gz', f'{self.root}', verbosity=self.verbosity)
             if self.verbosity > 0: print('Unzipping...')
             unzip_file(f'{self.root}/{self.name}.{resolution}.avro.gz')
+            for filename in self.additional_files:
+                if not os.path.exists(f'{self.root}/{filename}'):
+                    download_url(f'{self.repository_url}/{filename}.gz', f'{self.root}', verbosity=0)
+                    unzip_file(f'{self.root}/{filename}.gz')
 
     def parse(self):
         """ Parses all PDB files returned from :meth:`proteinshake.datasets.Dataset.get_raw_files()` and saves them to disk. Can run in parallel.
@@ -320,9 +319,6 @@ def parse(self):
 
         if self.verbosity > 0: print(f'Filtered {before-len(proteins)} proteins.')
 
-        # if self.center:
-        # if True:
-        # if self.random_rotate:
         if self.name == 'ProteinProteinInteractionDataset':
             print("Centering")
             proteins = [CenterTransform()(p) for p in proteins]
@@ -334,6 +330,7 @@ def parse(self):
         atom_proteins = [{'protein':p['protein'], 'atom':p['atom']} for p in proteins]
         write_avro(residue_proteins, f'{self.root}/{self.name}.residue.avro')
         write_avro(atom_proteins, f'{self.root}/{self.name}.atom.avro')
+        return proteins
 
     def parse_pdb(self, path):
         """ Parses a single PDB file first into a DataFrame, then into a protein object (a dictionary). Also validates the PDB file and provides the hook for `add_protein_attributes`. Returns `None` if the protein was found to be invalid.
@@ -489,21 +486,6 @@ def validate(self, df):
         if not sum(df['residue_type'].map(lambda x: not x is None)) > 0:
             return False
         return True
-
-    def describe(self):
-        """ Produces dataset statistics.
-
-        Returns
-        -------
-        dict
-            A dictionary of summary statistics of this dataset.
-        """
-        n_resi = len(self.data.residue_index) / len(self.data.ID)
-        data = {'name': type(self).__name__,
-                'num_proteins': len(self),
-                'avg size (# residues)': n_resi
-               }
-        return data
 
     def to_graph(self, resolution='residue', transform=IdentityTransform(), **kwargs):
         """ Converts the raw dataset to a graph dataset. See :meth:`proteinshake.representations.GraphDataset` for arguments.

diff --git a/proteinshake/datasets/enzyme_commission.py b/proteinshake/datasets/enzyme_commission.py
@@ -13,15 +13,6 @@ class EnzymeCommissionDataset(RCSBDataset):
 
         Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.
 
-
-    .. list-table:: Dataset stats
-        :widths: 100
-        :header-rows: 1
-
-        * - # proteins
-        * - 15603 
-
-
     .. list-table:: Annotations
         :widths: 25 35 45
         :header-rows: 1
@@ -35,8 +26,6 @@ class EnzymeCommissionDataset(RCSBDataset):
 
     """
 
-    description = 'Enzymes'
-
     def __init__(self, query=[['rcsb_polymer_entity.rcsb_ec_lineage.name','exists']], **kwargs):
         """
 

diff --git a/proteinshake/datasets/gene_ontology.py b/proteinshake/datasets/gene_ontology.py
@@ -22,16 +22,6 @@ class GeneOntologyDataset(RCSBDataset):
 
       Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.
 
-
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 32633
-
-
    .. list-table:: Annotations
       :widths: 25 25 50
       :header-rows: 1
@@ -52,8 +42,6 @@ class GeneOntologyDataset(RCSBDataset):
 
     """
 
-    description = 'Gene Ontology'
-
     additional_files = ['GeneOntologyDataset.godag.obo']
 
     def __init__(self, query=[['rcsb_polymer_entity_annotation.type','exact_match','GO']], **kwargs):
@@ -85,10 +73,3 @@ def add_protein_attributes(self, protein):
         protein['protein']['cellular_component'] = [term for term in go_terms if godag[term].namespace == 'cellular_component']
         protein['protein']['biological_process'] = [term for term in go_terms if godag[term].namespace == 'biological_process']
         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Gene Ontology (GO)"
-        desc['values'] = f"{len(set((p['GO'][0] for p in self.proteins)))} (root)"
-        desc['type'] = 'Categorical, Hierarchical'
-        return desc
diff --git a/proteinshake/datasets/protein_family.py b/proteinshake/datasets/protein_family.py
@@ -14,16 +14,6 @@ class ProteinFamilyDataset(RCSBDataset):
 
       Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.
 
-
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 31109
-
-
    .. list-table:: Annotations
       :widths: 25 35 45
       :header-rows: 1
@@ -37,8 +27,6 @@ class ProteinFamilyDataset(RCSBDataset):
 
     """
 
-    description = 'Protein Families'
-
     def __init__(self, pfam_version='34.0', query=[['rcsb_polymer_entity_annotation.type','exact_match','Pfam']], **kwargs):
         self.pfam_version = pfam_version
         super().__init__(query=query, **kwargs)
@@ -53,10 +41,3 @@ def add_protein_attributes(self, protein):
                 pfams.append(a['annotation_id'])
         protein['protein']['Pfam'] = pfams
         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Protein Family (Pfam)"
-        desc['values'] = f"{len(set((p['Pfam'][0] for p in self.proteins)))} (root)"
-        desc['type'] = 'Categorical, Hierarchical'
-        return desc
diff --git a/proteinshake/datasets/protein_ligand_decoys.py b/proteinshake/datasets/protein_ligand_decoys.py
@@ -33,17 +33,6 @@ class ProteinLigandDecoysDataset(Dataset):
 
       Raw data was obtained and modified from `DUDE-Z <https://dudez.docking.org/>`_.
 
-
-
-
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 38
-
-
    .. list-table:: Annotations
       :widths: 25 35 45
       :header-rows: 1
@@ -70,18 +59,10 @@ class ProteinLigandDecoysDataset(Dataset):
 
     """
 
-    description = 'Proteins with ligands and decoys'
-
     @patch('proteinshake.datasets.dataset.AA_THREE_TO_ONE', EXTENDED_AA_THREE_TO_ONE)
     def pdb2df(self, path):
         return super().pdb2df(path)
 
-    def get_raw_files(self):
-        return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit]
-
-    def get_id_from_filename(self, filename):
-        return filename.split(".")[0]
-
     def download(self):
         targets  = ['AA2AR', 'ABL1', 'ACES', 'ADA', 'ADRB2', 'AMPC', 'ANDR', 'CSF1R', 'CXCR4', 'DEF', 'DRD4', 'EGFR', 'FA7', 'FA10', 'FABP4', 'FGFR1', 'FKB1A', 'GLCM', 'HDAC8', 'HIVPR', 'HMDH', 'HS90A', 'ITAL', 'KITH', 'KIT', 'LCK', 'MAPK2', 'MK01', 'MT1', 'NRAM', 'PARP1', 'PLK1', 'PPARA', 'PTN1', 'PUR2', 'RENI', 'ROCK1', 'SRC', 'THRB', 'TRY1', 'TRYB1', 'UROK', 'XIAP']
 

diff --git a/proteinshake/datasets/protein_ligand_interface.py b/proteinshake/datasets/protein_ligand_interface.py
@@ -37,13 +37,6 @@ class ProteinLigandInterfaceDataset(Dataset):
     version: str
         PDBBind version to use.
 
-    .. list-table:: Dataset stats
-       :widths: 100
-       :header-rows: 1
-
-       * - # proteins
-       * - 4642
-
     .. list-table:: Annotations
         :widths: 20 55 25
         :header-rows: 1
@@ -80,8 +73,6 @@ class ProteinLigandInterfaceDataset(Dataset):
         - :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
     """
 
-    description = ''
-
     def __init__(self, version='2020', **kwargs):
         self.version = version
         super().__init__(**kwargs)
@@ -203,10 +194,3 @@ def add_protein_attributes(self, protein):
         protein['protein']['fp_morgan_r2'] = fp_morgan
 
         return protein
-
-    def describe(self):
-        desc = super().describe()
-        desc['property'] = "Small Mol. Binding Site (residue-level)"
-        desc['values'] = 2
-        desc['type'] = 'Binary'
-        return desc