Clarify util.string.formula_double_format return type, fix using `i…

…nt` to round to nearest int (#4163) * add types * tweak formula_double_format implementation * test edge case * tweak docstring * fix round bug * use structure for now * tweak type and docstring * I believe the new formula is correct * avoid overwriting func name * clean up test * annotate matrix * use math.isclose * explicit rel_tol 0
materialsproject · Nov 12, 2024 · 3ff3d53 · 3ff3d53
1 parent d302920
commit 3ff3d53
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 77 deletions.
diff --git a/src/pymatgen/util/string.py b/src/pymatgen/util/string.py
@@ -1,11 +1,27 @@
-"""This module provides utility classes for string operations."""
+"""This module provides utility classes for string operations.
+
+TODO: make standalone functions in this module use the same implementation as Stringify
+Note: previous deprecations of standalone functions in this module were removed due to
+a community need.
+"""
 
 from __future__ import annotations
 
+import math
 import re
 from fractions import Fraction
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from typing import Any, Literal, TextIO
+
+    from numpy.typing import ArrayLike
 
-SUBSCRIPT_UNICODE = {
+    from pymatgen.core import Structure
+    from pymatgen.util.typing import Vector3D
+
+SUBSCRIPT_UNICODE: dict[str, str] = {
     "0": "₀",
     "1": "₁",
     "2": "₂",
@@ -18,7 +34,7 @@
     "9": "₉",
 }
 
-SUPERSCRIPT_UNICODE = {
+SUPERSCRIPT_UNICODE: dict[str, str] = {
     "0": "⁰",
     "1": "¹",
     "2": "²",
@@ -33,10 +49,6 @@
     "-": "⁻",
 }
 
-# TODO: make standalone functions in this module use the same implementation as Stringify
-# Note: previous deprecations of standalone functions in this module were removed due to
-# a community need.
-
 
 class Stringify:
     """Mix-in class for string formatting, e.g. superscripting numbers and symbols or superscripting."""
@@ -81,7 +93,7 @@ def to_html_string(self) -> str:
             str_,
         )
 
-    def to_unicode_string(self):
+    def to_unicode_string(self) -> str:
         """Unicode string with proper sub and superscripts. Note that this works only
         with systems where the sub and superscripts are pure integers.
         """
@@ -97,57 +109,66 @@ def to_unicode_string(self):
         return str_
 
 
-def str_delimited(results, header=None, delimiter="\t"):
+def str_delimited(
+    results: Sequence[Sequence[Any]],
+    header: Sequence[str] | None = None,
+    delimiter: str = "\t",
+) -> str:
     r"""Given a tuple of tuples, generate a delimited string form.
-    >>> results = [["a", "b", "c"], ["d", "e", "f"], [1, 2, 3]]
+    >>> results = (("a", "b", "c"), ("d", "e", "f"), (1, 2, 3))
     >>> print(str_delimited(results, delimiter=","))
-    a,b,c
-    d,e,f
-    1,2,3.
+        a,b,c
+        d,e,f
+        1,2,3.
 
     Args:
-        results: 2d sequence of arbitrary types.
-        header: optional header
-        delimiter: Defaults to "\t" for tab-delimited output.
+        results (Sequence[Sequence[Any]]): 2D sequence of arbitrary types.
+        header (Sequence[str]): optional headers.
+        delimiter (str): Defaults to "\t" for tab-delimited output.
 
     Returns:
-        Aligned string output in a table-like format.
+        str: Aligned string output in a table-like format.
     """
     out = ""
     if header is not None:
         out += f"{delimiter.join(header)}\n"
     return out + "\n".join(delimiter.join([str(m) for m in result]) for result in results)
 
 
-def formula_double_format(afloat, ignore_ones=True, tol: float = 1e-8):
-    """This function is used to make pretty formulas by formatting the amounts.
-    Instead of Li1.0 Fe1.0 P1.0 O4.0, you get LiFePO4.
+def formula_double_format(
+    afloat: float,
+    ignore_ones: bool = True,
+    tol: float = 1e-8,
+) -> float | Literal[""]:
+    """Format a float for pretty formulas.
+    E.g. "Li1.0 Fe1.0 P1.0 O4.0" -> "LiFePO4".
 
     Args:
-        afloat (float): a float
-        ignore_ones (bool): if true, floats of 1 are ignored.
-        tol (float): Tolerance to round to nearest int. i.e. 2.0000000001 -> 2
+        afloat (float): The float to be formatted.
+        ignore_ones (bool): if true, floats of 1.0 are ignored.
+        tol (float): Absolute tolerance to round to nearest int. i.e. (2 + 1E-9) -> 2.
 
     Returns:
-        A string representation of the float for formulas.
+        float | "": Formatted float for formulas.
     """
-    if ignore_ones and afloat == 1:
+    if ignore_ones and math.isclose(afloat, 1, abs_tol=tol):
         return ""
-    if abs(afloat - int(afloat)) < tol:
-        return int(afloat)
+
+    if math.isclose(afloat, round(afloat), abs_tol=tol, rel_tol=0):
+        return round(afloat)
     return round(afloat, 8)
 
 
-def charge_string(charge, brackets=True, explicit_one=True):
+def charge_string(charge: float, brackets: bool = True, explicit_one: bool = True) -> str:
     """Get a string representing the charge of an Ion. By default, the
     charge is placed in brackets with the sign preceding the magnitude, e.g.
     '[+2]'. For uncharged species, the string returned is '(aq)'.
 
     Args:
-        charge: the charge of the Ion
-        brackets: whether to enclose the charge in brackets, e.g. [+2]. Default: True
-        explicit_one: whether to include the number one for monovalent ions, e.g.
-            +1 rather than +. Default: True
+        charge (float): The charge of the ion.
+        brackets (bool): Whether to enclose the charge in brackets, e.g. [+2]. Default is True.
+        explicit_one (bool): whether to include the number one for monovalent ions,
+            e.g. "+1" rather than "+". Default is True.
     """
     chg_str = "(aq)" if charge == 0 else f"{formula_double_format(charge, ignore_ones= False):+}"
 
@@ -160,19 +181,19 @@ def charge_string(charge, brackets=True, explicit_one=True):
     return chg_str
 
 
-def latexify(formula: str, bold: bool = False):
+def latexify(formula: str, bold: bool = False) -> str:
     """Generate a LaTeX formatted formula. e.g. Fe2O3 is transformed to
     Fe$_{2}$O$_{3}$.
 
-    Note that Composition now has a to_latex_string() method that may
+    Note that Composition now has `to_latex_string` method that may
     be used instead.
 
     Args:
         formula (str): Input formula.
         bold (bool): Whether to make the subscripts bold. Defaults to False.
 
     Returns:
-        Formula suitable for display as in LaTeX with proper subscripts.
+        str: Formula suitable for display as in LaTeX with proper subscripts.
     """
     return re.sub(
         r"([A-Za-z\(\)])([\d\.]+)",
@@ -189,7 +210,7 @@ def htmlify(formula: str) -> str:
     be used instead.
 
     Args:
-        formula: The string to format.
+        formula (str): The string to format.
     """
     return re.sub(r"([A-Za-z\(\)])([\d\.]+)", r"\1<sub>\2</sub>", formula)
 
@@ -202,7 +223,7 @@ def unicodeify(formula: str) -> str:
     be used instead.
 
     Args:
-        formula: The string to format.
+        formula (str): The string to format.
     """
     if "." in formula:
         raise ValueError("No unicode character exists for subscript period.")
@@ -213,7 +234,7 @@ def unicodeify(formula: str) -> str:
     return formula
 
 
-def latexify_spacegroup(spacegroup_symbol):
+def latexify_spacegroup(spacegroup_symbol: str) -> str:
     r"""Generate a latex formatted spacegroup. e.g. P2_1/c is converted to
     P2$_{1}$/c and P-1 is converted to P$\\overline{1}$.
 
@@ -224,24 +245,24 @@ def latexify_spacegroup(spacegroup_symbol):
         spacegroup_symbol (str): A spacegroup symbol
 
     Returns:
-        A latex formatted spacegroup with proper subscripts and overlines.
+        str: A latex formatted spacegroup with proper subscripts and overlines.
     """
     sym = re.sub(r"_(\d+)", r"$_{\1}$", spacegroup_symbol)
     return re.sub(r"-(\d)", r"$\\overline{\1}$", sym)
 
 
-def unicodeify_spacegroup(spacegroup_symbol):
+def unicodeify_spacegroup(spacegroup_symbol: str) -> str:
     r"""Generate a unicode formatted spacegroup. e.g. P2$_{1}$/c is converted to
     P2₁/c and P$\\overline{1}$ is converted to P̅1.
 
     Note that SymmetryGroup now has a to_unicode_string() method that
     may be called instead.
 
     Args:
-        spacegroup_symbol (str): A spacegroup symbol as LaTeX
+        spacegroup_symbol (str): A spacegroup symbol as LaTeX.
 
     Returns:
-        A unicode spacegroup with proper subscripts and overlines.
+        str: A unicode spacegroup with proper subscripts and overlines.
     """
     if not spacegroup_symbol:
         return ""
@@ -261,18 +282,18 @@ def unicodeify_spacegroup(spacegroup_symbol):
     return symbol.replace("}", overline)
 
 
-def unicodeify_species(specie_string):
+def unicodeify_species(specie_string: str) -> str:
     """Generate a unicode formatted species string, with appropriate
     superscripts for oxidation states.
 
-    Note that Species now has a to_unicode_string() method that
+    Note that Species now has `to_unicode_string` method that
     may be used instead.
 
     Args:
-        specie_string (str): Species string, e.g. O2-
+        specie_string (str): Species string, e.g. "O2-"
 
     Returns:
-        Species string, e.g. O²⁻
+        str: Species string, e.g. "O²⁻"
     """
     if not specie_string:
         return ""
@@ -283,7 +304,7 @@ def unicodeify_species(specie_string):
     return specie_string
 
 
-def stream_has_colors(stream):
+def stream_has_colors(stream: TextIO) -> bool:
     """True if stream supports colors. Python cookbook, #475186."""
     if not hasattr(stream, "isatty"):
         return False
@@ -300,19 +321,25 @@ def stream_has_colors(stream):
         return curses.tigetnum("colors") > 2
 
 
-def transformation_to_string(matrix, translation_vec=(0, 0, 0), components=("x", "y", "z"), c="", delim=","):
+def transformation_to_string(
+    matrix: ArrayLike,
+    translation_vec: Vector3D = (0, 0, 0),
+    components: tuple[str, str, str] = ("x", "y", "z"),
+    c: str = "",
+    delim: str = ",",
+) -> str:
     """Convenience method. Given matrix returns string, e.g. x+2y+1/4.
 
     Args:
-        matrix: A 3x3 matrix.
-        translation_vec: A 3-element tuple representing the translation vector. Defaults to (0, 0, 0).
-        components: A tuple of 3 strings representing the components. Either ('x', 'y', 'z') or ('a', 'b', 'c').
+        matrix (ArrayLike): A 3x3 matrix.
+        translation_vec (Vector3D): The translation vector. Defaults to (0, 0, 0).
+        components(tuple[str, str, str]): The components. Either ('x', 'y', 'z') or ('a', 'b', 'c').
             Defaults to ('x', 'y', 'z').
-        c: An optional additional character to print (used for magmoms). Defaults to "".
-        delim: A delimiter. Defaults to ",".
+        c (str): An optional additional character to print (used for magmoms). Defaults to "".
+        delim (str): A delimiter. Defaults to ",".
 
     Returns:
-        xyz string.
+        str: xyz string.
     """
     parts = []
     for idx in range(3):
@@ -339,19 +366,22 @@ def transformation_to_string(matrix, translation_vec=(0, 0, 0), components=("x",
     return delim.join(parts)
 
 
-def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
+def disordered_formula(
+    disordered_struct: Structure,
+    symbols: Sequence[str] = ("x", "y", "z"),
+    fmt: Literal["plain", "HTML", "LaTex"] = "plain",
+) -> str:
     """Get a formula of a form like AxB1-x (x=0.5)
     for disordered structures. Will only return a
     formula for disordered structures with one
     kind of disordered site at present.
 
     Args:
-        disordered_struct: a disordered structure
-        symbols: a tuple of characters to use for
-        subscripts, by default this is ('x', 'y', 'z')
-        but if you have more than three disordered
-        species more symbols will need to be added
-        fmt (str): 'plain', 'HTML' or 'LaTeX'
+        disordered_struct (Structure): a disordered structure.
+        symbols (Sequence[str]): Characters to use for subscripts,
+            by default this is ('x', 'y', 'z') but if you have more than three
+            disordered species more symbols will need to be added.
+        fmt (str): 'plain', 'HTML' or 'LaTeX',
 
     Returns:
         str: a disordered formula string
@@ -373,18 +403,18 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
         )
     disordered_site_composition = disordered_site_compositions.pop()
 
-    disordered_species = {str(sp) for sp, occu in disordered_site_composition.items()}
+    disordered_species = {str(sp) for sp, _occu in disordered_site_composition.items()}
 
     if len(disordered_species) > len(symbols):
         # this probably won't happen too often either
         raise ValueError(f"Not enough symbols to describe disordered composition: {symbols}")
-    symbols = list(symbols)[0 : len(disordered_species) - 1]
+    symbols = list(symbols)[: len(disordered_species) - 1]
 
     comp = disordered_struct.composition.get_el_amt_dict().items()
     # sort by electronegativity, as per composition
     comp = sorted(comp, key=lambda x: get_el_sp(x[0]).X)
 
-    disordered_comp = []
+    disordered_comp: list[tuple[str, str]] = []
     variable_map = {}
 
     total_disordered_occu = sum(occu for sp, occu in comp if str(sp) in disordered_species)
@@ -403,7 +433,7 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
     for sp, occu in comp:
         species = str(sp)
         if species not in disordered_species:
-            disordered_comp.append((species, formula_double_format(occu / factor)))
+            disordered_comp.append((species, str(formula_double_format(occu / factor))))
         elif len(symbols) > 0:
             symbol = symbols.pop(0)
             disordered_comp.append((species, symbol))
@@ -423,16 +453,16 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
     else:
         raise ValueError("Unsupported output format, choose from: LaTeX, HTML, plain")
 
-    disordered_formula = []
+    disordered_formulas = []
     for sp, occu in disordered_comp:
-        disordered_formula.append(sp)
+        disordered_formulas.append(sp)
         if occu:  # can be empty string if 1
             if fmt != "plain":
-                disordered_formula.append(sub_start)
-            disordered_formula.append(occu)
+                disordered_formulas.append(sub_start)
+            disordered_formulas.append(occu)
             if fmt != "plain":
-                disordered_formula.append(sub_end)
-    disordered_formula.append(" ")
-    disordered_formula += [f"{key}={formula_double_format(val)} " for key, val in variable_map.items()]
+                disordered_formulas.append(sub_end)
+    disordered_formulas.append(" ")
+    disordered_formulas += [f"{key}={formula_double_format(val)} " for key, val in variable_map.items()]
 
-    return "".join(map(str, disordered_formula))[:-1]
+    return "".join(map(str, disordered_formulas))[:-1]
diff --git a/tests/io/test_cif.py b/tests/io/test_cif.py
@@ -364,7 +364,7 @@ def test_cif_parser_cod(self):
         # Symbol in capital letters
         parser = CifParser(f"{TEST_FILES_DIR}/cif/Cod_2100513.cif")
         for struct in parser.parse_structures():
-            assert struct.formula == "Ca4 Nb2.0 Al2 O12"
+            assert struct.formula == "Ca4 Nb2 Al2 O12"
 
         # Label in capital letters
         parser = CifParser(f"{TEST_FILES_DIR}/cif/Cod_4115344.cif")