Skip to content

Commit

Permalink
Update documentation and expose the default ScoringMatrix as a constant
Browse files Browse the repository at this point in the history
  • Loading branch information
althonos committed May 6, 2024
1 parent a1a8209 commit 6e92f9f
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 27 deletions.
11 changes: 6 additions & 5 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"biopython": ("https://biopython.org/docs/latest/api/", None),
"scoring_matrices": ("https://scoring-matrices.readthedocs.io/en/stable", None),
}

# -- Options for recommonmark extension --------------------------------------
Expand All @@ -207,9 +208,9 @@
# -- Options for extlinks extension ------------------------------------------

extlinks = {
'doi': ('https://doi.org/%s', 'doi:'),
'pmid': ('https://pubmed.ncbi.nlm.nih.gov/%s', 'PMID:'),
'pmc': ('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC%s', 'PMC'),
'isbn': ('https://www.worldcat.org/isbn/%s', 'ISBN:'),
'wiki': ('https://en.wikipedia.org/wiki/%s', 'Wikipedia:')
'doi': ('https://doi.org/%s', 'doi:%s'),
'pmid': ('https://pubmed.ncbi.nlm.nih.gov/%s', 'PMID:%s'),
'pmc': ('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC%s', 'PMC:%s'),
'isbn': ('https://www.worldcat.org/isbn/%s', 'ISBN:%s'),
'wiki': ('https://en.wikipedia.org/wiki/%s', 'Wikipedia:%s')
}
2 changes: 2 additions & 0 deletions pyfamsa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
GuideTree,
Sequence,
famsa_info,
FAMSA_ALPHABET,
MIQS,
)

__doc__ = _famsa.__doc__
Expand Down
6 changes: 6 additions & 0 deletions pyfamsa/_famsa.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import datetime
import os
import typing

from scoring_matrices import ScoringMatrix

try:
from typing import Literal
except ImportError:
Expand All @@ -11,6 +13,9 @@ GuideTreeMethod = Literal["sl", "slink", "upgma", "nj"]
TreeHeuristicMethod = Literal["medoid", "part"]
Node = typing.Tuple[int, int]

FAMSA_ALPHABET: str
MIQS: ScoringMatrix

class _VersionInfo(typing.NamedTuple):
major: int
minor: int
Expand Down Expand Up @@ -56,6 +61,7 @@ class Aligner:
n_refinements: int = 100,
keep_duplicates: bool = False,
refine: typing.Optional[bool] = None,
scoring_matrix: ScoringMatrix = MIQS,
) -> None: ...
def align(self, sequences: typing.Iterable[Sequence]) -> Alignment: ...
def build_tree(self, sequences: typing.Iterable[Sequence]) -> GuideTree: ...
Expand Down
57 changes: 37 additions & 20 deletions pyfamsa/_famsa.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
# cython: language_level=3, linetrace=True
"""Bindings to FAMSA, an algorithm for fast multiple sequence alignments.
Attributes:
FAMSA_ALPHABET (`str`): The alphabet used by default by FAMSA to encode
sequences with ordinal encoding.
MIQS (`~scoring_matrices.ScoringMatrix`): The MIQS scoring matrix proposed
by Yamada & Tomii (2014), used by default in FAMSA for scoring
alignments.
References:
- Deorowicz, S., Debudaj-Grabysz, A., Gudyś, A. (2016).
*FAMSA: Fast and accurate multiple sequence alignment of huge protein
families*. Scientific Reports, 6, 33964. :doi:`10.1038/srep33964`.
- Yamada, K., Tomii, K. (2014).
*Revisiting amino acid substitution matrices for identifying distantly
related proteins*. Bioinformatics (Oxford, England), 30(3), 317-325.
:doi:`10.1093/bioinformatics/btt694`. :pmid:`24281694`.
"""

Expand Down Expand Up @@ -49,14 +60,30 @@ include "_version.py"

# --- Constants --------------------------------------------------------------

_FAMSA_ALPHABET = "ARNDCQEGHILKMFPSTWYVBZX*"
FAMSA_ALPHABET = "ARNDCQEGHILKMFPSTWYVBZX*"

cdef memory_monotonic_safe* MMA = new memory_monotonic_safe()

cdef char SYMBOLS[NO_AMINOACIDS]
for i, x in enumerate(_FAMSA_ALPHABET):
for i, x in enumerate(FAMSA_ALPHABET):
SYMBOLS[i] = ord(x)

cdef ScoringMatrix _make_miqs():
    # Build the `ScoringMatrix` exposed as the module-level `MIQS` constant.
    #
    # The NO_AMINOACIDS x NO_AMINOACIDS coefficients are read from the
    # `SM_MIQS` table and each one is rounded to 4 decimal places before
    # being stored as a Python number. Rows and columns are labelled with
    # `FAMSA_ALPHABET`, and the resulting matrix is named "MIQS".
    cdef list weights = [
        [round(SM_MIQS[i][j], 4) for j in range(NO_AMINOACIDS)]
        for i in range(NO_AMINOACIDS)
    ]
    return ScoringMatrix(weights, alphabet=FAMSA_ALPHABET, name="MIQS")

MIQS = _make_miqs()

# Log.getInstance(LEVEL_NORMAL).enable()
# Log.getInstance(LEVEL_VERBOSE).enable()
# Log.getInstance(LEVEL_DEBUG).enable()
Expand Down Expand Up @@ -295,7 +322,7 @@ cdef class Aligner:
int n_refinements=100,
bool keep_duplicates=False,
object refine=None,
ScoringMatrix scoring_matrix=None,
ScoringMatrix scoring_matrix not None=MIQS,
):
"""__init__(self, *, threads=0, guide_tree="sl", tree_heuristic=None, medoid_threshold=0, n_refinements=100, keep_duplicates=False, refine=None, scoring_matrix=MIQS)\n--
Expand Down Expand Up @@ -323,6 +350,9 @@ cdef class Aligner:
refine (`bool` or `None`): Set to `True` to force refinement,
`False` to disable refinement, or leave as `None` to disable
refinement automatically for sets of more than 1000 sequences.
scoring_matrix (`~scoring_matrices.ScoringMatrix`): The scoring
matrix to use for scoring alignments. By default, the *MIQS*
matrix by Yamada & Tomii (2014) is used.
"""
self._params.keepDuplicates = keep_duplicates
Expand Down Expand Up @@ -373,29 +403,16 @@ cdef class Aligner:
else:
raise ValueError("`n_refinements` argument must be positive")

if scoring_matrix is not None:
if scoring_matrix.alphabet != _FAMSA_ALPHABET:
raise ValueError(f"invalid scoring matrix alphabet: expected {_FAMSA_ALPHABET!r}, got {scoring_matrix.alphabet!r}")
self.scoring_matrix = scoring_matrix
else:
weights = []
for i in range(NO_AMINOACIDS):
row = []
for j in range(NO_AMINOACIDS):
row.append(round(SM_MIQS[i][j], 4))
weights.append(row)
self.scoring_matrix = ScoringMatrix(
weights,
alphabet=_FAMSA_ALPHABET,
name="MIQS",
)
if scoring_matrix.alphabet != FAMSA_ALPHABET:
raise ValueError(f"invalid scoring matrix alphabet: expected {FAMSA_ALPHABET!r}, got {scoring_matrix.alphabet!r}")
self.scoring_matrix = scoring_matrix

# --- Methods ------------------------------------------------------------

cdef int _copy_matrix(self, CFAMSA* famsa) except 1 nogil:
cdef size_t i
cdef size_t j
cdef const float** matrix = self.scoring_matrix.matrix()
cdef const float** matrix = self.scoring_matrix.matrix_ptr()
for i in range(NO_AMINOACIDS):
famsa.score_vector[i] = <score_t> roundf(cost_cast_factor * matrix[i][i])
for j in range(NO_AMINOACIDS):
Expand Down
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ python_requires = >=3.6
setup_requires =
setuptools >=46.4
cython ~=3.0
scoring-matrices ~=0.1
scoring-matrices ~=0.2
semantic-version ~=2.10
install_requires =
scoring-matrices ~=0.1
scoring-matrices ~=0.2
tests_require =
importlib-resources ; python_version < '3.9'

Expand Down

0 comments on commit 6e92f9f

Please sign in to comment.