Skip to content

Commit

Permalink
added mass calculation to delta mass mode
Browse files Browse the repository at this point in the history
  • Loading branch information
jspaezp committed Dec 5, 2023
1 parent 32c798c commit fa49998
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 36 deletions.
20 changes: 14 additions & 6 deletions ms2ml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,21 +308,26 @@ def encoding_aa_order_mapping(self) -> dict[str | None, int]:
return {aa: i for i, aa in enumerate(self.encoding_aa_order)}

@lazy
def fixed_mods_dict(self) -> dict[str, list[str, float]]:
def fixed_mods_dict(self) -> dict[str, set[str, float]]:
"""Returns a dictionary of fixed mods.
The keys are the aminoacids and the values are the modifications.
Examples:
>>> config = Config()
>>> config.fixed_mods_dict
{'C': ['[UNIMOD:4]']}
{'C': {'[UNIMOD:4]'}}
>>> config = Config(mod_fixed_mods=("[+22.2222]@C",))
>>> config.fixed_mods_dict
{'C': [22.2222]}
>>> tmp = config.fixed_mods_dict
# {'C': {'[+22.2222]', 22.2222}}
>>> config = Config(mod_fixed_mods=("[+22.0]@C",))
>>> config.fixed_mods_dict
{'C': [22.0]}
>>> tmp = config.fixed_mods_dict
# {'C': {'[+22.0]', 22.0}}
>>> {k: sorted([str(w) for w in v]) for k, v in tmp.items()}
{'C': ['22.0', '[+22.0]']}
"""
out = {}
for mod in self.mod_fixed_mods:
Expand All @@ -335,12 +340,15 @@ def fixed_mods_dict(self) -> dict[str, list[str, float]]:
if mod_key not in out:
out[mod_key] = []

out[mod_key].append(mod_value)
try:
mod_value = float(mod_value.replace("[", "").replace("]", ""))
except ValueError:
pass

out[mod_key].append(mod_value)

out = {k: set(v) for k, v in out.items()}
return out

def _resolve_mod_list(self, x):
Expand Down
70 changes: 49 additions & 21 deletions ms2ml/peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,36 @@ def to_proforma(self) -> str:
props["fixed_modifications"] = keep
else:
props = self.properties
return to_proforma(self.sequence, **props)

fixed_mods = {}
for x in props["fixed_modifications"]:
for y in x.targets:
fixed_mods[y] = x.modification_tag

seqs = self.sequence.copy()
seqs_out = [None] * len(seqs)

for i, s in enumerate(seqs):
if s[1] is None:
seqs_out[i] = s
continue

m = s[1]
mk = []

for mod in m:
if s[0] in fixed_mods and mod == fixed_mods[s[0]]:
continue
else:
mk.append(mod)

if len(mk) == 0:
mk = None

seqs_out[i] = (s[0], mk)

out = to_proforma(seqs_out, **props)
return out

def to_massdiff_seq(self) -> str:
"""Converts the peptide to a string following the massdiff specifications.
Expand Down Expand Up @@ -664,14 +693,17 @@ def mod_seq(self):
for x in mods:
if hasattr(x, "__iter__"):
x = list(set(x))
if len(x) > 1:
if all(isinstance(y, float) for y in x):
x = sum(x)
elif len(x) > 1:
error_msg = "Multiple modifications on the"
error_msg += " same aminoacid are not supported"
error_msg += f" got:({x})"

# TODO consider if more informative messages are required
raise ValueError(error_msg)
x = x[0]
else:
x = x[0]

vector.append(x)
return vector
Expand Down Expand Up @@ -769,13 +801,7 @@ def from_iter(it, config: Config, charge=None, drop_fixed=False): # noqa: PLR09
... config=Config(),
... )
>>> foo.to_proforma()
'<[UNIMOD:4]@C>AMC[UNIMOD:4]'
>>> foo.mass
380.11881216611
TODO: Decide if this is the desired behaviour ...
I would argue that it should add the mass of the carbamidomethyl twice ...
'<[UNIMOD:4]@C>AMC'
>>> foo = Peptide.from_iter(
... [
... ("n_term", None),
Expand Down Expand Up @@ -808,21 +834,19 @@ def from_iter(it, config: Config, charge=None, drop_fixed=False): # noqa: PLR09
for aa, mod in it:
if mod is None:
mod = ""

if drop_fixed:
# if aa == "C" and "C" in fmd:
# print(f"aa={aa}, mod={mod}, fmd={fmd}")
if aa in fmd:
tfmd = fmd[aa]

if isinstance(mod, str) and mod in tfmd:
mod = ""
elif isinstance(mod, float) and any(
abs(mod - fmdaa) < 0.0001 # noqa: PLR2004
for fmdaa in tfmd
if isinstance(fmdaa, float)
):
mod = ""
if mod:
if isinstance(mod, str) and mod in tfmd:
mod = ""
elif isinstance(mod, float) and any(
abs(mod - fmdaa) < 0.001 # noqa: PLR2004
for fmdaa in tfmd
if isinstance(fmdaa, float)
):
mod = ""

if isinstance(mod, str):
pass
Expand Down Expand Up @@ -851,6 +875,10 @@ def from_iter(it, config: Config, charge=None, drop_fixed=False): # noqa: PLR09
else:
seq = "".join(seqs) + f"/{charge}"

for i in range(4):
pep = Peptide.from_proforma_seq(seq, config=config)
seq = pep.to_proforma()

return Peptide.from_proforma_seq(seq, config=config)

@lazy
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ms2ml"
version = "0.0.44"
version = "0.0.45"
description = "Provides an intermediate layer between mass spec data and ML applications, such as encoding."
authors = ["J. Sebastian Paez <[email protected]>"]
license = "Apache 2.0"
Expand Down Expand Up @@ -99,7 +99,7 @@ target-version = ["py39"]
line-length = 88

[tool.ruff]
extend-select = ["I", "PL", "RUF", "NPY"]
extend-select = ["I", "PL", "RUF", "NPY", "T20", "T100"]
# TODO add ANN and N
# PLR0913 = too many arguments
# PLW2901 = for loop variable over-written
Expand All @@ -108,7 +108,7 @@ ignore = ["F811", "PLR0913", "PLW2901", "NPY002"]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402"]
"notebooks/**" = ["PLR2004"]
"notebooks/**" = ["PLR2004", "T20"]
"**/{tests}/*" = ["PLR2004", "RUF012"]

[tool.isort]
Expand Down
17 changes: 11 additions & 6 deletions tests/test_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def test_peptide_proforma(delta_mass: bool):

for seq in sequences:
pep = Peptide.from_sequence(seq["input_sequence"], config=config)
print(pep.to_proforma())
assert pep.to_proforma() == seq["expected_out"]
assert int(1_000 * pep.mass) == int(1_000 * seq["expected_mass"])

Expand Down Expand Up @@ -162,11 +161,17 @@ def test_variable_possible_mods(input):
)
pep = Peptide.from_sequence(input["input_sequence"], config=config)
out = pep.get_variable_possible_mods()
out = [x.to_proforma() for x in out]
out.sort()

unexpected = set(out) - set(input["expected_out"])
assert len(unexpected) == 0, f"Unexpected -> {unexpected}"
# Check that masses can be calculated ...
_ = [x.mass for x in out]

missing = set(input["expected_out"]) - set(out)
proformas = [x.to_proforma() for x in out]
proformas.sort()

unexpected = set(proformas) - set(input["expected_out"])
assert (
len(unexpected) == 0
), f"From {input['input_sequence']} Unexpected -> {unexpected}"

missing = set(input["expected_out"]) - set(proformas)
assert len(missing) == 0, f"Missing -> {missing}"

0 comments on commit fa49998

Please sign in to comment.