diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 5ea4f233..e674be59 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -40,6 +40,8 @@ jobs:
         channels: "conda-forge, salilab, pytorch, pyg"
         python-version: ${{ matrix.python-version }}
         use-mamba: true
+      - name: Install setuptools
+        run: pip install setuptools==69.5.1
       - name: Install Boost 1.7.3 (for DSSP)
        run: conda install -c anaconda libboost=1.73.0
       - name: Install DSSP
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e172426f..d0baa19e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 * Fix cluster file loading bug in `pdb_data.py` [#396](https://github.com/a-r-j/graphein/pull/396)
 
 #### Misc
+* added metadata options for UniProt ID, EC number and CATH code to PDB manager [#398](https://github.com/a-r-j/graphein/pull/398)
 * bumped logging level down from `INFO` to `DEBUG` at several places to reduced output length [#391](https://github.com/a-r-j/graphein/pull/391)
 * exposed `fill_value` and `bfactor` option to `protein_to_pyg` function. [#385](https://github.com/a-r-j/graphein/pull/385) and [#388](https://github.com/a-r-j/graphein/pull/388)
 * Updated Foldcomp datasets with improved setup function and updated database choices such as ESMAtlas. [#382](https://github.com/a-r-j/graphein/pull/382)
diff --git a/graphein/ml/datasets/pdb_data.py b/graphein/ml/datasets/pdb_data.py
index 4ba8fbe7..6dbb4fc4 100644
--- a/graphein/ml/datasets/pdb_data.py
+++ b/graphein/ml/datasets/pdb_data.py
@@ -6,7 +6,7 @@
 from datetime import datetime
 from io import StringIO
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -36,13 +36,16 @@ def __init__(
         split_ratios: Optional[List[float]] = None,
         split_time_frames: Optional[List[np.datetime64]] = None,
         assign_leftover_rows_to_split_n: int = 0,
+        labels: Optional[
+            List[Literal["uniprot_id", "cath_code", "ec_number"]]
+        ] = None,
     ):
         """Instantiate a selection of experimental PDB structures.
 
         :param root_dir: The directory in which to store all PDB entries,
             defaults to ``"."``.
         :type root_dir: str, optional
-        :param structure_format: Whether to use ``.pdb`` or ``.mmtf`` file.
+        :param structure_format: Whether to use ``.pdb``, ``.mmtf`` or ``.mmcif`` file.
             Defaults to ``"pdb"``.
         :type structure_format: str, optional
         :param splits: A list of names corresponding to each dataset split,
@@ -58,6 +61,9 @@ def __init__(
             to assign any rows remaining after creation of new dataset splits,
             defaults to ``0``.
         :type assign_leftover_rows_to_split_n: int, optional
+        :param labels: A list of metadata labels to include in the PDB manager
+            dataframe, defaults to ``None``.
+        :type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
         """
         # Arguments
         self.root_dir = Path(root_dir)
@@ -83,6 +89,12 @@ def __init__(
         )
         self.pdb_availability_url = "https://files.wwpdb.org/pub/pdb/compatible/pdb_bundle/pdb_bundle_index.txt"
 
+        self.pdb_chain_cath_uniprot_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz"
+
+        self.cath_id_cath_code_url = "http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz"
+
+        self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz"
+
         self.pdb_dir = self.root_dir / "pdb"
         if not os.path.exists(self.pdb_dir):
             os.makedirs(self.pdb_dir)
@@ -99,12 +111,19 @@ def __init__(
             self.pdb_deposition_date_url
         ).name
         self.pdb_availability_filename = Path(self.pdb_availability_url).name
+        self.pdb_chain_cath_uniprot_filename = Path(
+            self.pdb_chain_cath_uniprot_url
+        ).name
+        self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name
+        self.pdb_chain_ec_number_filename = Path(
+            self.pdb_chain_ec_number_url
+        ).name
 
         self.list_columns = ["ligands"]
 
         # Data
         self.download_metadata()
-        self.df = self.parse()
+        self.df = self.parse(labels)
         self.source = self.df.copy()
 
         # Splits
@@ -146,6 +165,9 @@ def download_metadata(self):
         self._download_entry_metadata()
         self._download_exp_type()
         self._download_pdb_availability()
+        self._download_pdb_chain_cath_uniprot_map()
+        self._download_cath_id_cath_code_map()
+        self._download_pdb_chain_ec_number_map()
 
     def get_unavailable_pdb_files(
         self, splits: Optional[List[str]] = None
@@ -411,6 +433,39 @@ def _download_pdb_availability(self):
             wget.download(self.pdb_availability_url, out=str(self.root_dir))
         log.debug("Downloaded PDB availability map")
 
+    def _download_pdb_chain_cath_uniprot_map(self):
+        """Download mapping from PDB chain to UniProt accession and CATH ID from
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz
+        """
+        if not os.path.exists(
+            self.root_dir / self.pdb_chain_cath_uniprot_filename
+        ):
+            log.info("Downloading UniProt CATH map...")
+            wget.download(
+                self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)
+            )
+        log.debug("Downloaded UniProt CATH map")
+
+    def _download_cath_id_cath_code_map(self):
+        """Download mapping from CATH IDs to CATH codes from
+        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
+        """
+        if not os.path.exists(self.root_dir / self.cath_id_cath_code_filename):
+            log.info("Downloading CATH ID to CATH code map...")
+            wget.download(self.cath_id_cath_code_url, out=str(self.root_dir))
+        log.debug("Downloaded CATH ID to CATH code map")
+
+    def _download_pdb_chain_ec_number_map(self):
+        """Download mapping from PDB chains to EC numbers from
+        https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz
+        """
+        if not os.path.exists(
+            self.root_dir / self.pdb_chain_ec_number_filename
+        ):
+            log.info("Downloading EC number map...")
+            wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
+        log.debug("Downloaded EC number map")
+
     def _parse_ligand_map(self) -> Dict[str, List[str]]:
         """Parse the ligand maps for all PDB records.
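The three SIFTS/CATH mappings downloaded above feed the new `labels` option. A minimal usage sketch, assuming the class is the `PDBManager` exported from `graphein.ml.datasets.pdb_data` and the column names added later in this PR:

```python
from graphein.ml.datasets.pdb_data import PDBManager

# Requesting labels triggers the three metadata downloads above and adds
# uniprot_id / cath_code / ec_number columns to the manager's dataframe.
manager = PDBManager(
    root_dir=".",
    labels=["uniprot_id", "cath_code", "ec_number"],
)
print(manager.df[["id", "uniprot_id", "cath_code", "ec_number"]].head())
```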
@@ -508,7 +563,7 @@ def _parse_entries(self) -> Dict[str, datetime]:
         df.dropna(subset=["id"], inplace=True)
         df.id = df.id.str.lower()
 
-        df.date = pd.to_datetime(df.date)
+        df.date = pd.to_datetime(df.date, format="%m/%d/%y")
         return pd.Series(df["date"].values, index=df["id"]).to_dict()
 
     def _parse_experiment_type(self) -> Dict[str, str]:
@@ -536,9 +591,107 @@ def _parse_pdb_availability(self) -> Dict[str, bool]:
         ids = {id: False for id in ids}
         return ids
 
-    def parse(self) -> pd.DataFrame:
+    def _parse_uniprot_id(self) -> Dict[str, str]:
+        """Parse the UniProt ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding UniProt ID.
+        :rtype: Dict[str, str]
+        """
+        uniprot_mapping = {}
+        with gzip.open(
+            self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
+                    key = f"{pdb}_{chain}"
+                    uniprot_mapping[key] = uniprot_id
+                except ValueError:
+                    continue
+        return uniprot_mapping
+
+    def _parse_cath_id(self) -> Dict[str, str]:
+        """Parse the CATH ID for all PDB chains.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding CATH ID.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(
+            self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
+                    key = f"{pdb}_{chain}"
+                    cath_mapping[key] = cath_id
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_cath_code(self) -> Dict[str, str]:
+        """Parse the CATH code for all CATH IDs.
+
+        :return: Dictionary of CATH ID with their
+            corresponding CATH code.
+        :rtype: Dict[str, str]
+        """
+        cath_mapping = {}
+        with gzip.open(
+            self.root_dir / self.cath_id_cath_code_filename, "rt"
+        ) as f:
+            for line in f:
+                try:
+                    cath_id, cath_version, cath_code, cath_segment = (
+                        line.strip().split()
+                    )
+                    cath_mapping[cath_id] = cath_code
+                except ValueError:
+                    continue
+        return cath_mapping
+
+    def _parse_ec_number(self) -> Dict[str, Optional[str]]:
+        """Parse the EC number for all PDB chains, storing ``None`` when
+        no EC number is present.
+
+        :return: Dictionary of PDB chain ID with their
+            corresponding EC number.
+        :rtype: Dict[str, Optional[str]]
+        """
+        ec_mapping = {}
+        with gzip.open(
+            self.root_dir / self.pdb_chain_ec_number_filename, "rt"
+        ) as f:
+            next(f)  # Skip header line
+            for line in f:
+                try:
+                    pdb, chain, uniprot_id, ec_number = line.strip().split(
+                        "\t"
+                    )
+                    key = f"{pdb}_{chain}"
+                    ec_number = None if ec_number == "?" else ec_number
+                    ec_mapping[key] = ec_number
+                except ValueError:
+                    continue
+        return ec_mapping
+
+    def parse(
+        self,
+        labels: Optional[
+            List[Literal["uniprot_id", "cath_code", "ec_number"]]
+        ] = None,
+    ) -> pd.DataFrame:
         """Parse all PDB sequence records.
 
+        :param labels: A list of metadata labels to include in the PDB manager
+            dataframe, defaults to ``None``.
+        :type labels: Optional[List[Literal["uniprot_id", "cath_code", "ec_number"]]], optional
+
         :return: DataFrame containing PDB sequence entries with
             their corresponding metadata.
         :rtype: pd.DataFrame
@@ -578,7 +731,15 @@ def parse(self) -> pd.DataFrame:
         df["deposition_date"] = df.pdb.map(self._parse_entries())
         df["experiment_type"] = df.pdb.map(self._parse_experiment_type())
         df["pdb_file_available"] = df.pdb.map(self._parse_pdb_availability())
-        df.pdb_file_available.fillna(True, inplace=True)
+        df["pdb_file_available"] = df["pdb_file_available"].fillna(True)
+        if labels:
+            if "uniprot_id" in labels:
+                df["uniprot_id"] = df.id.map(self._parse_uniprot_id())
+            if "cath_code" in labels:
+                df["cath_id"] = df.id.map(self._parse_cath_id())
+                df["cath_code"] = df.cath_id.map(self._parse_cath_code())
+            if "ec_number" in labels:
+                df["ec_number"] = df.id.map(self._parse_ec_number())
 
         return df
 
@@ -1150,6 +1311,105 @@ def select_complexes_with_grouped_molecule_types(
         if update:
             self.df = df
 
+    def has_uniprot_id(
+        self,
+        select_ids: Optional[List[str]] = None,
+        splits: Optional[List[str]] = None,
+        update: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Select entries that have a UniProt ID.
+
+        :param select_ids: If present, filter for only these IDs. If not
+            present, filter for entries that have any UniProt ID,
+            defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to modify the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame of selected molecules.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.dropna(subset=["uniprot_id"])
+
+        if select_ids:
+            df = df[df["uniprot_id"].isin(select_ids)]
+
+        if update:
+            self.df = df
+        return df
+
+    def has_cath_code(
+        self,
+        select_ids: Optional[List[str]] = None,
+        splits: Optional[List[str]] = None,
+        update: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Select entries that have a CATH code.
+
+        :param select_ids: If present, filter for only these CATH codes. If not
+            present, filter for entries that have any CATH code,
+            defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to modify the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame of selected molecules.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.dropna(subset=["cath_code"])
+
+        if select_ids:
+            df = df[df["cath_code"].isin(select_ids)]
+
+        if update:
+            self.df = df
+        return df
+
+    def has_ec_number(
+        self,
+        select_ids: Optional[List[str]] = None,
+        splits: Optional[List[str]] = None,
+        update: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Select entries that have an EC number.
+
+        :param select_ids: If present, filter for only these EC numbers. If not
+            present, filter for entries that have any EC number,
+            defaults to ``None``.
+        :type select_ids: Optional[List[str]], optional
+        :param splits: Names of splits for which to perform the operation,
+            defaults to ``None``.
+        :type splits: Optional[List[str]], optional
+        :param update: Whether to modify the DataFrame in place, defaults to
+            ``False``.
+        :type update: bool, optional
+
+        :return: DataFrame of selected molecules.
+        :rtype: pd.DataFrame
+        """
+        splits_df = self.get_splits(splits)
+        df = splits_df.dropna(subset=["ec_number"])
+
+        if select_ids:
+            df = df[df["ec_number"].isin(select_ids)]
+
+        if update:
+            self.df = df
+        return df
+
     def split_df_proportionally(
         self,
         df: pd.DataFrame,
@@ -1561,8 +1821,8 @@ def reset(self) -> pd.DataFrame:
 
     def download_pdbs(
         self,
-        out_dir=".",
-        format="pdb",
+        out_dir: str = ".",
+        format: str = "pdb",
         splits: Optional[List[str]] = None,
         overwrite: bool = False,
         max_workers: int = 8,
@@ -1572,7 +1832,7 @@
 
         :param out_dir: Output directory, defaults to ``"."``
         :type out_dir: str, optional
-        :param format: Filetype to download. ``pdb`` or ``mmtf``.
+        :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
         :type format: str
         :param splits: Names of splits for which to perform the operation,
             defaults to ``None``.
diff --git a/graphein/protein/tensor/io.py b/graphein/protein/tensor/io.py
index dc7698bd..58089158 100644
--- a/graphein/protein/tensor/io.py
+++ b/graphein/protein/tensor/io.py
@@ -349,7 +349,9 @@ def protein_df_to_tensor(
     """
     num_residues = get_protein_length(df, insertions=insertions)
     df = df.loc[df["atom_name"].isin(atoms_to_keep)]
-    residue_indices = pd.factorize(get_residue_id(df, unique=False))[0]
+    residue_indices = pd.factorize(
+        pd.Series(get_residue_id(df, unique=False))
+    )[0]
     atom_indices = df["atom_name"].map(lambda x: atoms_to_keep.index(x)).values
     positions: AtomTensor = (
diff --git a/graphein/protein/utils.py b/graphein/protein/utils.py
index 6c9b76e8..c16669f1 100644
--- a/graphein/protein/utils.py
+++ b/graphein/protein/utils.py
@@ -108,7 +108,7 @@ def download_pdb_multiprocessing(
     :type pdb_codes: List[str]
     :param out_dir: Path to directory to download PDB structures to.
     :type out_dir: Union[str, Path]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param overwrite: Whether to overwrite existing files,
         defaults to ``False``.
@@ -162,7 +162,7 @@ def download_pdb(
     :param out_dir: Path to directory to download PDB structure to. If ``None``,
         will download to a temporary directory.
     :type out_dir: Optional[Union[str, Path]]
-    :param format: Filetype to download. ``pdb`` or ``mmtf``.
+    :param format: Filetype to download. ``pdb``, ``mmtf``, ``mmcif`` or ``bcif``.
     :type format: str
     :param check_obsolete: Whether to check for obsolete PDB codes,
         defaults to ``False``. If an obsolete PDB code is found, the updated PDB
@@ -183,8 +183,16 @@ def download_pdb(
     elif format == "mmtf":
         BASE_URL = "https://mmtf.rcsb.org/v1.0/full/"
         extension = ".mmtf.gz"
+    elif format == "mmcif":
+        BASE_URL = "https://files.rcsb.org/download/"
+        extension = ".cif.gz"
+    elif format == "bcif":
+        BASE_URL = "https://models.rcsb.org/"
+        extension = ".bcif.gz"
     else:
-        raise ValueError(f"Invalid format: {format}. Must be 'pdb' or 'mmtf'.")
+        raise ValueError(
+            f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'."
+        )
 
     # Make output directory if it doesn't exist or set it to tempdir if None
     if out_dir is not None:
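Taken together, the new `labels` metadata, the `has_*` filters, and the extra download formats are meant to compose. A sketch of the intended end-to-end flow, assuming `PDBManager` from `graphein.ml.datasets.pdb_data` and the `download_pdb` helper shown above; `4hhb` is just an illustrative PDB code, and EC `3.2.1.17` is lysozyme:

```python
from graphein.ml.datasets.pdb_data import PDBManager
from graphein.protein.utils import download_pdb

manager = PDBManager(root_dir=".", labels=["ec_number"])

# Keep only chains annotated with lysozyme's EC number (3.2.1.17);
# update=True narrows manager.df in place so later calls inherit the filter.
lysozymes = manager.has_ec_number(select_ids=["3.2.1.17"], update=True)

# The new structure formats plug into the existing download paths.
manager.download_pdbs(out_dir=".", format="mmcif")
path = download_pdb("4hhb", format="bcif")
```

Note the design choice in `download_pdb`: each format simply maps to a base URL and file extension, so `mmcif` reuses the RCSB download endpoint while `bcif` points at models.rcsb.org, and unknown formats fail fast with a `ValueError`.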