Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task re-design #235

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion proteinshake/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.9'
__version__ = '0.3.11'
106 changes: 12 additions & 94 deletions proteinshake/datasets/alphafold.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,10 @@
import os
import re
import tarfile
import glob

from proteinshake.datasets import Dataset
from proteinshake.utils import download_url, extract_tar, load, save, unzip_file, progressbar

# Maps organism names to the base names of their AlphaFoldDB download archives.
# See https://alphafold.ebi.ac.uk/download
# Keys are lowercase, underscore-separated organism names, plus the special
# key 'swissprot'. Values are archive base names without version suffix or
# file extension.
# NOTE(review): the organism values look like
# <UniProt proteome ID>_<taxonomy ID>_<organism mnemonic> -- confirm against
# the AlphaFoldDB download page before relying on that structure.
AF_DATASET_NAMES = {
'arabidopsis_thaliana': 'UP000006548_3702_ARATH',
'caenorhabditis_elegans': 'UP000001940_6239_CAEEL',
'candida_albicans': 'UP000000559_237561_CANAL',
'danio_rerio': 'UP000000437_7955_DANRE',
'dictyostelium_discoideum': 'UP000002195_44689_DICDI',
'drosophila_melanogaster': 'UP000000803_7227_DROME',
'escherichia_coli': 'UP000000625_83333_ECOLI',
'glycine_max': 'UP000008827_3847_SOYBN',
'homo_sapiens': 'UP000005640_9606_HUMAN',
'methanocaldococcus_jannaschii': 'UP000000805_243232_METJA',
'mus_musculus': 'UP000000589_10090_MOUSE',
'oryza_sativa': 'UP000059680_39947_ORYSJ',
'rattus_norvegicus': 'UP000002494_10116_RAT',
'saccharomyces_cerevisiae': 'UP000002311_559292_YEAST',
'schizosaccharomyces_pombe': 'UP000002485_284812_SCHPO',
'zea_mays': 'UP000007305_4577_MAIZE',
# Not an organism: selects the SwissProt structure predictions.
'swissprot': 'swissprot_pdb',
}

description = 'Predicted structures'
from proteinshake.utils import download_url, extract_tar, unzip_file, progressbar

class AlphaFoldDataset(Dataset):
""" 3D structures predicted by AlphaFold.
Requires the `organism` name to be specified.
See https://alphafold.ebi.ac.uk/download for a full list of available organisms.
Pass the full Latin organism name separated by a space or underscore.
`organism` can also be 'swissprot', in which case the full SwissProt structure predictions will be downloaded (ca. 500,000).
""" SwissProt 3D structures predicted by AlphaFold.

.. admonition:: Please cite

Expand All @@ -46,74 +16,22 @@ class AlphaFoldDataset(Dataset):

Raw data was obtained and modified from `AlphaFoldDB <https://alphafold.ebi.ac.uk>`_, originally licensed under `CC-BY-4.0 <https://creativecommons.org/licenses/by/4.0/>`_.


.. list-table :: Data Properties
:widths: 50 50
:header-rows: 1

* - organism
- # proteins
* - ``'arabidopsis_thaliana'``
- 27,386
* - ``'caenorhabditis_elegans'``
- 19,613
* - ``'candida_albicans'``
- 5,951
* - ``'danio_rerio'``
- 24,430
* - ``'dictyostelium_discoideum'``
- 12,485
* - ``'drosophila_melanogaster'``
- 13,318
* - ``'escherichia_coli'``
- 4,362
* - ``'glycine_max'``
- 55,696
* - ``'homo_sapiens'``
- 23,172
* - ``'methanocaldococcus_jannaschii'``
- 1,773
* - ``'mus_musculus'``
- 21,398
* - ``'oryza_sativa'``
- 43,631
* - ``'rattus_norvegicus'``
- 21,069
* - ``'saccharomyces_cerevisiae'``
- 6,016
* - ``'schizosaccharomyces_pombe'``
- 5,104
* - ``'zea_mays'``
- 39,203
* - ``'swissprot'``
- 541,143

Parameters
----------
organism: str
The organism name or 'swissprot'.
version: int, default 4
The AlphaFoldDB version.
"""

exlude_args_from_signature = ['organism']

def __init__(self, organism='swissprot', version='v4', only_single_chain=True, **kwargs):
self.organism = organism.lower().replace(' ','_')
self.base_url = 'https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/'
self.version = version
def __init__(self, version=4, only_single_chain=True, **kwargs):
self.base_url = f'https://ftp.ebi.ac.uk/pub/databases/alphafold/v{version}'
#self.file_name = f'swissprot_pdb_v{version}'
self.file_name = f'UP000000805_243232_METJA_v{version}'
super().__init__(only_single_chain=only_single_chain, **kwargs)

@property
def name(self):
return f'{self.__class__.__name__}_{self.organism}'

def get_raw_files(self):
return glob.glob(f'{self.root}/raw/*/*.pdb')[:self.limit]

def get_id_from_filename(self, filename):
return re.search('(?<=AF-)(.*)(?=-F.+-model)', filename).group()
return filename.split('-')[1]

def download(self):
os.makedirs(f'{self.root}/raw/{self.organism}', exist_ok=True)
download_url(self.base_url+AF_DATASET_NAMES[self.organism]+f'_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity)
extract_tar(f'{self.root}/raw/{self.organism}/{AF_DATASET_NAMES[self.organism]}_{self.version}.tar', f'{self.root}/raw/{self.organism}', verbosity=self.verbosity)
[unzip_file(f) for f in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity)]
download_url(f'{self.base_url}/{self.file_name}.tar', f'{self.root}/raw', verbosity=self.verbosity)
extract_tar(f'{self.root}/raw/{self.file_name}.tar', f'{self.root}/raw/files', verbosity=self.verbosity)
for path in progressbar(glob.glob(f'{self.root}/raw/*/*.pdb.gz')[:self.limit], desc='Unzipping', verbosity=self.verbosity): unzip_file(path)
36 changes: 9 additions & 27 deletions proteinshake/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,17 @@
"""
Base dataset class for protein 3D structures.
"""
import os, gzip, inspect, time, itertools, tarfile, io, requests
import copy
from collections import defaultdict, Counter
import os, inspect, requests, glob
from functools import cached_property
import multiprocessing as mp

import pandas as pd
import numpy as np
import freesasa
from biopandas.pdb import PandasPdb
from joblib import Parallel, delayed
from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph
from fastavro import reader as avro_reader

from proteinshake.transforms import IdentityTransform, RandomRotateTransform, CenterTransform
from proteinshake.utils import download_url, save, load, unzip_file, write_avro, Generator, progressbar, warning, error
from proteinshake.utils import download_url, unzip_file, write_avro, Generator, progressbar, warning, error

AA_THREE_TO_ONE = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'}
AA_ONE_TO_THREE = {v:k for k, v in AA_THREE_TO_ONE.items()}
Expand Down Expand Up @@ -241,7 +236,7 @@ def get_raw_files(self):
list
The list of raw PDB files used in this dataset.
"""
raise NotImplementedError
return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit]

def get_id_from_filename(self, filename):
""" Implement me in a subclass!
Expand All @@ -258,7 +253,7 @@ def get_id_from_filename(self, filename):
str
A PDB identifier or other ID.
"""
raise NotImplementedError
return filename.rstrip('.pdb')

def download(self):
""" Implement me in a subclass!
Expand Down Expand Up @@ -306,6 +301,10 @@ def download_precomputed(self, resolution='residue'):
download_url(f'{self.repository_url}/{self.name}.{resolution}.avro.gz', f'{self.root}', verbosity=self.verbosity)
if self.verbosity > 0: print('Unzipping...')
unzip_file(f'{self.root}/{self.name}.{resolution}.avro.gz')
for filename in self.additional_files:
if not os.path.exists(f'{self.root}/{filename}'):
download_url(f'{self.repository_url}/{filename}.gz', f'{self.root}', verbosity=0)
unzip_file(f'{self.root}/{filename}.gz')

def parse(self):
""" Parses all PDB files returned from :meth:`proteinshake.datasets.Dataset.get_raw_files()` and saves them to disk. Can run in parallel.
Expand All @@ -320,9 +319,6 @@ def parse(self):

if self.verbosity > 0: print(f'Filtered {before-len(proteins)} proteins.')

# if self.center:
# if True:
# if self.random_rotate:
if self.name == 'ProteinProteinInteractionDataset':
print("Centering")
proteins = [CenterTransform()(p) for p in proteins]
Expand All @@ -334,6 +330,7 @@ def parse(self):
atom_proteins = [{'protein':p['protein'], 'atom':p['atom']} for p in proteins]
write_avro(residue_proteins, f'{self.root}/{self.name}.residue.avro')
write_avro(atom_proteins, f'{self.root}/{self.name}.atom.avro')
return proteins

def parse_pdb(self, path):
""" Parses a single PDB file first into a DataFrame, then into a protein object (a dictionary). Also validates the PDB file and provides the hook for `add_protein_attributes`. Returns `None` if the protein was found to be invalid.
Expand Down Expand Up @@ -489,21 +486,6 @@ def validate(self, df):
if not sum(df['residue_type'].map(lambda x: not x is None)) > 0:
return False
return True

def describe(self):
""" Produces dataset statistics.

Returns
-------
dict
A dictionary of summary statistics of this dataset.
"""
n_resi = len(self.data.residue_index) / len(self.data.ID)
data = {'name': type(self).__name__,
'num_proteins': len(self),
'avg size (# residues)': n_resi
}
return data

def to_graph(self, resolution='residue', transform=IdentityTransform(), **kwargs):
""" Converts the raw dataset to a graph dataset. See :meth:`proteinshake.representations.GraphDataset` for arguments.
Expand Down
11 changes: 0 additions & 11 deletions proteinshake/datasets/enzyme_commission.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ class EnzymeCommissionDataset(RCSBDataset):

Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.


.. list-table:: Dataset stats
:widths: 100
:header-rows: 1

* - # proteins
* - 15603


.. list-table:: Annotations
:widths: 25 35 45
:header-rows: 1
Expand All @@ -35,8 +26,6 @@ class EnzymeCommissionDataset(RCSBDataset):

"""

description = 'Enzymes'

def __init__(self, query=[['rcsb_polymer_entity.rcsb_ec_lineage.name','exists']], **kwargs):
"""

Expand Down
19 changes: 0 additions & 19 deletions proteinshake/datasets/gene_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,6 @@ class GeneOntologyDataset(RCSBDataset):

Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.



.. list-table:: Dataset stats
:widths: 100
:header-rows: 1

* - # proteins
* - 32633


.. list-table:: Annotations
:widths: 25 25 50
:header-rows: 1
Expand All @@ -52,8 +42,6 @@ class GeneOntologyDataset(RCSBDataset):

"""

description = 'Gene Ontology'

additional_files = ['GeneOntologyDataset.godag.obo']

def __init__(self, query=[['rcsb_polymer_entity_annotation.type','exact_match','GO']], **kwargs):
Expand Down Expand Up @@ -85,10 +73,3 @@ def add_protein_attributes(self, protein):
protein['protein']['cellular_component'] = [term for term in go_terms if godag[term].namespace == 'cellular_component']
protein['protein']['biological_process'] = [term for term in go_terms if godag[term].namespace == 'biological_process']
return protein

def describe(self):
desc = super().describe()
desc['property'] = "Gene Ontology (GO)"
desc['values'] = f"{len(set((p['GO'][0] for p in self.proteins)))} (root)"
desc['type'] = 'Categorical, Hierarchical'
return desc
19 changes: 0 additions & 19 deletions proteinshake/datasets/protein_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@ class ProteinFamilyDataset(RCSBDataset):

Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.



.. list-table:: Dataset stats
:widths: 100
:header-rows: 1

* - # proteins
* - 31109


.. list-table:: Annotations
:widths: 25 35 45
:header-rows: 1
Expand All @@ -37,8 +27,6 @@ class ProteinFamilyDataset(RCSBDataset):

"""

description = 'Protein Families'

def __init__(self, pfam_version='34.0', query=[['rcsb_polymer_entity_annotation.type','exact_match','Pfam']], **kwargs):
self.pfam_version = pfam_version
super().__init__(query=query, **kwargs)
Expand All @@ -53,10 +41,3 @@ def add_protein_attributes(self, protein):
pfams.append(a['annotation_id'])
protein['protein']['Pfam'] = pfams
return protein

def describe(self):
desc = super().describe()
desc['property'] = "Protein Family (Pfam)"
desc['values'] = f"{len(set((p['Pfam'][0] for p in self.proteins)))} (root)"
desc['type'] = 'Categorical, Hierarchical'
return desc
19 changes: 0 additions & 19 deletions proteinshake/datasets/protein_ligand_decoys.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,6 @@ class ProteinLigandDecoysDataset(Dataset):

Raw data was obtained and modified from `DUDE-Z <https://dudez.docking.org/>`_.




.. list-table:: Dataset stats
:widths: 100
:header-rows: 1

* - # proteins
* - 38


.. list-table:: Annotations
:widths: 25 35 45
:header-rows: 1
Expand All @@ -70,18 +59,10 @@ class ProteinLigandDecoysDataset(Dataset):

"""

description = 'Proteins with ligands and decoys'

@patch('proteinshake.datasets.dataset.AA_THREE_TO_ONE', EXTENDED_AA_THREE_TO_ONE)
def pdb2df(self, path):
return super().pdb2df(path)

def get_raw_files(self):
return glob.glob(f'{self.root}/raw/files/*.pdb')[:self.limit]

def get_id_from_filename(self, filename):
return filename.split(".")[0]

def download(self):
targets = ['AA2AR', 'ABL1', 'ACES', 'ADA', 'ADRB2', 'AMPC', 'ANDR', 'CSF1R', 'CXCR4', 'DEF', 'DRD4', 'EGFR', 'FA7', 'FA10', 'FABP4', 'FGFR1', 'FKB1A', 'GLCM', 'HDAC8', 'HIVPR', 'HMDH', 'HS90A', 'ITAL', 'KITH', 'KIT', 'LCK', 'MAPK2', 'MK01', 'MT1', 'NRAM', 'PARP1', 'PLK1', 'PPARA', 'PTN1', 'PUR2', 'RENI', 'ROCK1', 'SRC', 'THRB', 'TRY1', 'TRYB1', 'UROK', 'XIAP']

Expand Down
16 changes: 0 additions & 16 deletions proteinshake/datasets/protein_ligand_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,6 @@ class ProteinLigandInterfaceDataset(Dataset):
version: str
PDBBind version to use.

.. list-table:: Dataset stats
:widths: 100
:header-rows: 1

* - # proteins
* - 4642

.. list-table:: Annotations
:widths: 20 55 25
:header-rows: 1
Expand Down Expand Up @@ -80,8 +73,6 @@ class ProteinLigandInterfaceDataset(Dataset):
- :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
"""

description = ''

def __init__(self, version='2020', **kwargs):
self.version = version
super().__init__(**kwargs)
Expand Down Expand Up @@ -203,10 +194,3 @@ def add_protein_attributes(self, protein):
protein['protein']['fp_morgan_r2'] = fp_morgan

return protein

def describe(self):
desc = super().describe()
desc['property'] = "Small Mol. Binding Site (residue-level)"
desc['values'] = 2
desc['type'] = 'Binary'
return desc
Loading