Skip to content

Commit

Permalink
Clarify util.string.formula_double_format return type, fix using `int` to round to nearest int (#4163)
Browse files Browse the repository at this point in the history

* add types

* tweak formula_double_format implementation

* test edge case

* tweak docstring

* fix round bug

* use structure for now

* tweak type and docstring

* I believe the new formula is correct

* avoid overwriting func name

* clean up test

* annotate matrix

* use math.isclose

* explicit rel_tol 0
  • Loading branch information
DanielYang59 authored Nov 12, 2024
1 parent d302920 commit 3ff3d53
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 77 deletions.
176 changes: 103 additions & 73 deletions src/pymatgen/util/string.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
"""This module provides utility classes for string operations."""
"""This module provides utility classes for string operations.
TODO: make standalone functions in this module use the same implementation as Stringify
Note: previous deprecations of standalone functions in this module were removed due to
a community need.
"""

from __future__ import annotations

import math
import re
from fractions import Fraction
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Sequence
from typing import Any, Literal, TextIO

from numpy.typing import ArrayLike

SUBSCRIPT_UNICODE = {
from pymatgen.core import Structure
from pymatgen.util.typing import Vector3D

SUBSCRIPT_UNICODE: dict[str, str] = {
"0": "₀",
"1": "₁",
"2": "₂",
Expand All @@ -18,7 +34,7 @@
"9": "₉",
}

SUPERSCRIPT_UNICODE = {
SUPERSCRIPT_UNICODE: dict[str, str] = {
"0": "⁰",
"1": "¹",
"2": "²",
Expand All @@ -33,10 +49,6 @@
"-": "⁻",
}

# TODO: make standalone functions in this module use the same implementation as Stringify
# Note: previous deprecations of standalone functions in this module were removed due to
# a community need.


class Stringify:
"""Mix-in class for string formatting, e.g. superscripting numbers and symbols or superscripting."""
Expand Down Expand Up @@ -81,7 +93,7 @@ def to_html_string(self) -> str:
str_,
)

def to_unicode_string(self):
def to_unicode_string(self) -> str:
"""Unicode string with proper sub and superscripts. Note that this works only
with systems where the sub and superscripts are pure integers.
"""
Expand All @@ -97,57 +109,66 @@ def to_unicode_string(self):
return str_


def str_delimited(
    results: Sequence[Sequence[Any]],
    header: Sequence[str] | None = None,
    delimiter: str = "\t",
) -> str:
    r"""Render a 2D sequence as delimiter-separated text, one row per line.

    >>> results = (("a", "b", "c"), ("d", "e", "f"), (1, 2, 3))
    >>> print(str_delimited(results, delimiter=","))
    a,b,c
    d,e,f
    1,2,3

    Args:
        results (Sequence[Sequence[Any]]): 2D sequence of arbitrary types. Every
            entry is converted with str() before joining.
        header (Sequence[str]): Optional column headers, emitted as the first
            line. Entries are joined as-is, so they must already be strings.
        delimiter (str): Separator placed between entries of a row.
            Defaults to "\t" for tab-delimited output.

    Returns:
        str: The header line (if given, terminated by a newline) followed by
            the rows joined with newlines. There is no trailing newline after
            the last row.
    """
    # Build the header first so a bad header fails before any row is formatted.
    prefix = "" if header is None else f"{delimiter.join(header)}\n"
    rows = [delimiter.join(map(str, row)) for row in results]
    return prefix + "\n".join(rows)


def formula_double_format(
    afloat: float,
    ignore_ones: bool = True,
    tol: float = 1e-8,
) -> float | Literal[""]:
    """Format an amount for pretty formulas.

    E.g. "Li1.0 Fe1.0 P1.0 O4.0" -> "LiFePO4".

    Args:
        afloat (float): The amount to be formatted.
        ignore_ones (bool): If True, amounts within tol of 1 are returned as
            an empty string so they can be omitted from the formula.
        tol (float): Absolute tolerance for snapping to the nearest integer,
            i.e. (2 + 1E-9) -> 2 and (2 - 1E-9) -> 2.

    Returns:
        float | "": An empty string if the amount is (close to) 1 and
            ignore_ones is set; the nearest int if the amount is within tol
            of it; otherwise the amount rounded to 8 decimal places.
    """
    # rel_tol=0 keeps `tol` a purely absolute tolerance in both comparisons
    # (math.isclose otherwise also applies its default rel_tol of 1e-9).
    if ignore_ones and math.isclose(afloat, 1, abs_tol=tol, rel_tol=0):
        return ""

    # Compare against round(), not int(): int() truncates toward zero, so a
    # value just below an integer (e.g. 1.9999999999) would never be snapped.
    nearest = round(afloat)
    if math.isclose(afloat, nearest, abs_tol=tol, rel_tol=0):
        return nearest

    return round(afloat, 8)


def charge_string(charge, brackets=True, explicit_one=True):
def charge_string(charge: float, brackets: bool = True, explicit_one: bool = True) -> str:
"""Get a string representing the charge of an Ion. By default, the
charge is placed in brackets with the sign preceding the magnitude, e.g.
'[+2]'. For uncharged species, the string returned is '(aq)'.
Args:
charge: the charge of the Ion
brackets: whether to enclose the charge in brackets, e.g. [+2]. Default: True
explicit_one: whether to include the number one for monovalent ions, e.g.
+1 rather than +. Default: True
charge (float): The charge of the ion.
brackets (bool): Whether to enclose the charge in brackets, e.g. [+2]. Default is True.
explicit_one (bool): whether to include the number one for monovalent ions,
e.g. "+1" rather than "+". Default is True.
"""
chg_str = "(aq)" if charge == 0 else f"{formula_double_format(charge, ignore_ones= False):+}"

Expand All @@ -160,19 +181,19 @@ def charge_string(charge, brackets=True, explicit_one=True):
return chg_str


def latexify(formula: str, bold: bool = False):
def latexify(formula: str, bold: bool = False) -> str:
"""Generate a LaTeX formatted formula. e.g. Fe2O3 is transformed to
Fe$_{2}$O$_{3}$.
Note that Composition now has a to_latex_string() method that may
Note that Composition now has `to_latex_string` method that may
be used instead.
Args:
formula (str): Input formula.
bold (bool): Whether to make the subscripts bold. Defaults to False.
Returns:
Formula suitable for display as in LaTeX with proper subscripts.
str: Formula suitable for display as in LaTeX with proper subscripts.
"""
return re.sub(
r"([A-Za-z\(\)])([\d\.]+)",
Expand All @@ -189,7 +210,7 @@ def htmlify(formula: str) -> str:
be used instead.
Args:
formula: The string to format.
formula (str): The string to format.
"""
return re.sub(r"([A-Za-z\(\)])([\d\.]+)", r"\1<sub>\2</sub>", formula)

Expand All @@ -202,7 +223,7 @@ def unicodeify(formula: str) -> str:
be used instead.
Args:
formula: The string to format.
formula (str): The string to format.
"""
if "." in formula:
raise ValueError("No unicode character exists for subscript period.")
Expand All @@ -213,7 +234,7 @@ def unicodeify(formula: str) -> str:
return formula


def latexify_spacegroup(spacegroup_symbol):
def latexify_spacegroup(spacegroup_symbol: str) -> str:
r"""Generate a latex formatted spacegroup. e.g. P2_1/c is converted to
P2$_{1}$/c and P-1 is converted to P$\\overline{1}$.
Expand All @@ -224,24 +245,24 @@ def latexify_spacegroup(spacegroup_symbol):
spacegroup_symbol (str): A spacegroup symbol
Returns:
A latex formatted spacegroup with proper subscripts and overlines.
str: A latex formatted spacegroup with proper subscripts and overlines.
"""
sym = re.sub(r"_(\d+)", r"$_{\1}$", spacegroup_symbol)
return re.sub(r"-(\d)", r"$\\overline{\1}$", sym)


def unicodeify_spacegroup(spacegroup_symbol):
def unicodeify_spacegroup(spacegroup_symbol: str) -> str:
r"""Generate a unicode formatted spacegroup. e.g. P2$_{1}$/c is converted to
P2₁/c and P$\\overline{1}$ is converted to P̅1.
Note that SymmetryGroup now has a to_unicode_string() method that
may be called instead.
Args:
spacegroup_symbol (str): A spacegroup symbol as LaTeX
spacegroup_symbol (str): A spacegroup symbol as LaTeX.
Returns:
A unicode spacegroup with proper subscripts and overlines.
str: A unicode spacegroup with proper subscripts and overlines.
"""
if not spacegroup_symbol:
return ""
Expand All @@ -261,18 +282,18 @@ def unicodeify_spacegroup(spacegroup_symbol):
return symbol.replace("}", overline)


def unicodeify_species(specie_string):
def unicodeify_species(specie_string: str) -> str:
"""Generate a unicode formatted species string, with appropriate
superscripts for oxidation states.
Note that Species now has a to_unicode_string() method that
Note that Species now has `to_unicode_string` method that
may be used instead.
Args:
specie_string (str): Species string, e.g. O2-
specie_string (str): Species string, e.g. "O2-"
Returns:
Species string, e.g. O²⁻
str: Species string, e.g. "O²⁻"
"""
if not specie_string:
return ""
Expand All @@ -283,7 +304,7 @@ def unicodeify_species(specie_string):
return specie_string


def stream_has_colors(stream):
def stream_has_colors(stream: TextIO) -> bool:
"""True if stream supports colors. Python cookbook, #475186."""
if not hasattr(stream, "isatty"):
return False
Expand All @@ -300,19 +321,25 @@ def stream_has_colors(stream):
return curses.tigetnum("colors") > 2


def transformation_to_string(matrix, translation_vec=(0, 0, 0), components=("x", "y", "z"), c="", delim=","):
def transformation_to_string(
matrix: ArrayLike,
translation_vec: Vector3D = (0, 0, 0),
components: tuple[str, str, str] = ("x", "y", "z"),
c: str = "",
delim: str = ",",
) -> str:
"""Convenience method. Given matrix returns string, e.g. x+2y+1/4.
Args:
matrix: A 3x3 matrix.
translation_vec: A 3-element tuple representing the translation vector. Defaults to (0, 0, 0).
components: A tuple of 3 strings representing the components. Either ('x', 'y', 'z') or ('a', 'b', 'c').
matrix (ArrayLike): A 3x3 matrix.
translation_vec (Vector3D): The translation vector. Defaults to (0, 0, 0).
components(tuple[str, str, str]): The components. Either ('x', 'y', 'z') or ('a', 'b', 'c').
Defaults to ('x', 'y', 'z').
c: An optional additional character to print (used for magmoms). Defaults to "".
delim: A delimiter. Defaults to ",".
c (str): An optional additional character to print (used for magmoms). Defaults to "".
delim (str): A delimiter. Defaults to ",".
Returns:
xyz string.
str: xyz string.
"""
parts = []
for idx in range(3):
Expand All @@ -339,19 +366,22 @@ def transformation_to_string(matrix, translation_vec=(0, 0, 0), components=("x",
return delim.join(parts)


def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
def disordered_formula(
disordered_struct: Structure,
symbols: Sequence[str] = ("x", "y", "z"),
fmt: Literal["plain", "HTML", "LaTex"] = "plain",
) -> str:
"""Get a formula of a form like AxB1-x (x=0.5)
for disordered structures. Will only return a
formula for disordered structures with one
kind of disordered site at present.
Args:
disordered_struct: a disordered structure
symbols: a tuple of characters to use for
subscripts, by default this is ('x', 'y', 'z')
but if you have more than three disordered
species more symbols will need to be added
fmt (str): 'plain', 'HTML' or 'LaTeX'
disordered_struct (Structure): a disordered structure.
symbols (Sequence[str]): Characters to use for subscripts,
by default this is ('x', 'y', 'z') but if you have more than three
disordered species more symbols will need to be added.
fmt (str): 'plain', 'HTML' or 'LaTeX',
Returns:
str: a disordered formula string
Expand All @@ -373,18 +403,18 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
)
disordered_site_composition = disordered_site_compositions.pop()

disordered_species = {str(sp) for sp, occu in disordered_site_composition.items()}
disordered_species = {str(sp) for sp, _occu in disordered_site_composition.items()}

if len(disordered_species) > len(symbols):
# this probably won't happen too often either
raise ValueError(f"Not enough symbols to describe disordered composition: {symbols}")
symbols = list(symbols)[0 : len(disordered_species) - 1]
symbols = list(symbols)[: len(disordered_species) - 1]

comp = disordered_struct.composition.get_el_amt_dict().items()
# sort by electronegativity, as per composition
comp = sorted(comp, key=lambda x: get_el_sp(x[0]).X)

disordered_comp = []
disordered_comp: list[tuple[str, str]] = []
variable_map = {}

total_disordered_occu = sum(occu for sp, occu in comp if str(sp) in disordered_species)
Expand All @@ -403,7 +433,7 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
for sp, occu in comp:
species = str(sp)
if species not in disordered_species:
disordered_comp.append((species, formula_double_format(occu / factor)))
disordered_comp.append((species, str(formula_double_format(occu / factor))))
elif len(symbols) > 0:
symbol = symbols.pop(0)
disordered_comp.append((species, symbol))
Expand All @@ -423,16 +453,16 @@ def disordered_formula(disordered_struct, symbols=("x", "y", "z"), fmt="plain"):
else:
raise ValueError("Unsupported output format, choose from: LaTeX, HTML, plain")

disordered_formula = []
disordered_formulas = []
for sp, occu in disordered_comp:
disordered_formula.append(sp)
disordered_formulas.append(sp)
if occu: # can be empty string if 1
if fmt != "plain":
disordered_formula.append(sub_start)
disordered_formula.append(occu)
disordered_formulas.append(sub_start)
disordered_formulas.append(occu)
if fmt != "plain":
disordered_formula.append(sub_end)
disordered_formula.append(" ")
disordered_formula += [f"{key}={formula_double_format(val)} " for key, val in variable_map.items()]
disordered_formulas.append(sub_end)
disordered_formulas.append(" ")
disordered_formulas += [f"{key}={formula_double_format(val)} " for key, val in variable_map.items()]

return "".join(map(str, disordered_formula))[:-1]
return "".join(map(str, disordered_formulas))[:-1]
2 changes: 1 addition & 1 deletion tests/io/test_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def test_cif_parser_cod(self):
# Symbol in capital letters
parser = CifParser(f"{TEST_FILES_DIR}/cif/Cod_2100513.cif")
for struct in parser.parse_structures():
assert struct.formula == "Ca4 Nb2.0 Al2 O12"
assert struct.formula == "Ca4 Nb2 Al2 O12"

# Label in capital letters
parser = CifParser(f"{TEST_FILES_DIR}/cif/Cod_4115344.cif")
Expand Down
Loading

0 comments on commit 3ff3d53

Please sign in to comment.