Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed May 29, 2024
1 parent 0ca75eb commit e577c82
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 29 deletions.
75 changes: 47 additions & 28 deletions graphein/ml/datasets/pdb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ def __init__(

self.pdb_chain_ec_number_url = "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz"



self.pdb_dir = self.root_dir / "pdb"
if not os.path.exists(self.pdb_dir):
os.makedirs(self.pdb_dir)
Expand All @@ -111,9 +109,13 @@ def __init__(
self.pdb_deposition_date_url
).name
self.pdb_availability_filename = Path(self.pdb_availability_url).name
self.pdb_chain_cath_uniprot_filename = Path(self.pdb_chain_cath_uniprot_url).name
self.pdb_chain_cath_uniprot_filename = Path(
self.pdb_chain_cath_uniprot_url
).name
self.cath_id_cath_code_filename = Path(self.cath_id_cath_code_url).name
self.pdb_chain_ec_number_filename = Path(self.pdb_chain_ec_number_url).name
self.pdb_chain_ec_number_filename = Path(
self.pdb_chain_ec_number_url
).name

self.list_columns = ["ligands"]

Expand Down Expand Up @@ -428,16 +430,20 @@ def _download_pdb_availability(self):
log.info("Downloading PDB availability map...")
wget.download(self.pdb_availability_url, out=str(self.root_dir))
log.debug("Downloaded PDB availability map")

def _download_pdb_chain_cath_uniprot_map(self):
    """Download the PDB chain -> (UniProt accession, CATH ID) mapping.

    Source:
    https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_cath_uniprot.tsv.gz

    The file is saved into ``self.root_dir``; the download is skipped if
    the file is already present there.
    """
    # NOTE: the scraped diff contained both the pre- and post-format lines;
    # this is the merged, post-commit version.
    if not os.path.exists(
        self.root_dir / self.pdb_chain_cath_uniprot_filename
    ):
        log.info("Downloading Uniprot CATH map...")
        wget.download(
            self.pdb_chain_cath_uniprot_url, out=str(self.root_dir)
        )
        log.debug("Downloaded Uniprot CATH map")

def _download_cath_id_cath_code_map(self):
"""Download mapping from CATH IDs to CATH code from
http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
def _download_pdb_chain_ec_number_map(self):
    """Download the PDB chain -> EC number mapping.

    Source:
    https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_enzyme.tsv.gz

    The file is saved into ``self.root_dir``; the download is skipped if
    the file is already present there.
    """
    if not os.path.exists(
        self.root_dir / self.pdb_chain_ec_number_filename
    ):
        log.info("Downloading EC number map...")
        wget.download(self.pdb_chain_ec_number_url, out=str(self.root_dir))
        log.debug("Downloaded EC number map")
Expand Down Expand Up @@ -553,7 +561,7 @@ def _parse_entries(self) -> Dict[str, datetime]:
df.dropna(subset=["id"], inplace=True)

df.id = df.id.str.lower()
df.date = pd.to_datetime(df.date, format = "%m/%d/%y")
df.date = pd.to_datetime(df.date, format="%m/%d/%y")
return pd.Series(df["date"].values, index=df["id"]).to_dict()

def _parse_experiment_type(self) -> Dict[str, str]:
Expand Down Expand Up @@ -589,16 +597,18 @@ def _parse_uniprot_id(self) -> Dict[str, str]:
:rtype: Dict[str, str]
"""
uniprot_mapping = {}
with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
uniprot_mapping[key] = uniprot_id
except ValueError:
continue
return uniprot_mapping

def _parse_cath_id(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains.
Expand All @@ -607,17 +617,19 @@ def _parse_cath_id(self) -> Dict[str, str]:
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(self.root_dir / self.pdb_chain_cath_uniprot_filename, 'rt') as f:
next(f) # Skip header line
with gzip.open(
self.root_dir / self.pdb_chain_cath_uniprot_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, cath_id = line.strip().split('\t')
pdb, chain, uniprot_id, cath_id = line.strip().split("\t")
key = f"{pdb}_{chain}"
cath_mapping[key] = cath_id
except ValueError:
continue
return cath_mapping

def _parse_cath_code(self) -> Dict[str, str]:
"""Parse the CATH code for all CATH IDs.
Expand All @@ -626,18 +638,22 @@ def _parse_cath_code(self) -> Dict[str, str]:
:rtype: Dict[str, str]
"""
cath_mapping = {}
with gzip.open(self.root_dir / self.cath_id_cath_code_filename, 'rt') as f:
with gzip.open(
self.root_dir / self.cath_id_cath_code_filename, "rt"
) as f:
print(f)
for line in f:
print(line)
try:
cath_id, cath_version, cath_code, cath_segment = line.strip().split()
cath_id, cath_version, cath_code, cath_segment = (
line.strip().split()
)
cath_mapping[cath_id] = cath_code
print(cath_id, cath_code)
except ValueError:
continue
return cath_mapping

def _parse_ec_number(self) -> Dict[str, str]:
"""Parse the CATH ID for all PDB chains and adds None when no EC number is present.
Expand All @@ -646,13 +662,17 @@ def _parse_ec_number(self) -> Dict[str, str]:
:rtype: Dict[str, str]
"""
ec_mapping = {}
with gzip.open(self.root_dir / self.pdb_chain_ec_number_filename, 'rt') as f:
next(f) # Skip header line
with gzip.open(
self.root_dir / self.pdb_chain_ec_number_filename, "rt"
) as f:
next(f) # Skip header line
for line in f:
try:
pdb, chain, uniprot_id, ec_number = line.strip().split('\t')
pdb, chain, uniprot_id, ec_number = line.strip().split(
"\t"
)
key = f"{pdb}_{chain}"
ec_number = None if ec_number == '?' else ec_number
ec_number = None if ec_number == "?" else ec_number
ec_mapping[key] = ec_number
except ValueError:
continue
def has_uniprot_id(
    self,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """Select only entries that have a UniProt accession.

    Signature reconstructed from the sibling ``has_cath_code`` /
    ``has_ec_number`` methods — TODO confirm against the full file.

    :param splits: Names of splits to restrict selection to,
        defaults to ``None`` (all splits).
    :type splits: Optional[List[str]]
    :param update: Whether to overwrite ``self.df`` with the filtered
        frame, defaults to ``False``.
    :type update: bool
    :return: DataFrame of entries with a non-null ``uniprot_id``.
    :rtype: pd.DataFrame
    """
    splits_df = self.get_splits(splits)
    df = splits_df.dropna(subset=["uniprot_id"])

    if update:
        self.df = df
    return df


def has_cath_code(
    self,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """Select only entries that have a CATH code.

    The ``update`` parameter was folded out of the visible diff and is
    reconstructed from the visible ``if update:`` body — TODO confirm.

    :param splits: Names of splits to restrict selection to,
        defaults to ``None`` (all splits).
    :type splits: Optional[List[str]]
    :param update: Whether to overwrite ``self.df`` with the filtered
        frame, defaults to ``False``.
    :type update: bool
    :return: DataFrame of entries with a non-null ``cath_code``.
    :rtype: pd.DataFrame
    """
    splits_df = self.get_splits(splits)
    df = splits_df.dropna(subset=["cath_code"])

    if update:
        self.df = df
    return df
def has_ec_number(
    self,
    splits: Optional[List[str]] = None,
    update: bool = False,
) -> pd.DataFrame:
    """Select only entries that have an EC number.

    Signature reconstructed from the sibling ``has_cath_code`` method —
    TODO confirm against the full file.

    :param splits: Names of splits to restrict selection to,
        defaults to ``None`` (all splits).
    :type splits: Optional[List[str]]
    :param update: Whether to overwrite ``self.df`` with the filtered
        frame, defaults to ``False``.
    :type update: bool
    :return: DataFrame of entries with a non-null ``ec_number``.
    :rtype: pd.DataFrame
    """
    splits_df = self.get_splits(splits)
    df = splits_df.dropna(subset=["ec_number"])

    if update:
        self.df = df
    return df
Expand Down
4 changes: 3 additions & 1 deletion graphein/protein/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,9 @@ def download_pdb(
BASE_URL = "https://models.rcsb.org/"
extension = ".bcif.gz"
else:
raise ValueError(f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'.")
raise ValueError(
f"Invalid format: {format}. Must be 'pdb', 'mmtf', 'mmcif' or 'bcif'."
)

# Make output directory if it doesn't exist or set it to tempdir if None
if out_dir is not None:
Expand Down

0 comments on commit e577c82

Please sign in to comment.