Skip to content

Commit

Permalink
Builder updates for legacy data parity (#249)
Browse files Browse the repository at this point in the history
* Rename reasons to deprecation_reasons

* Structure not included in StructureMetadata

* Materials updated with structure field changes

* Builder and test changes for new PropertyDoc

* Remove debug print

* Provenance bug fixes

* Linting
  • Loading branch information
Jason Munro authored Aug 26, 2021
1 parent df8192e commit 99ada6e
Show file tree
Hide file tree
Showing 24 changed files with 261 additions and 562 deletions.
191 changes: 45 additions & 146 deletions emmet-builders/emmet/builders/materials/electronic_structure.py

Large diffs are not rendered by default.

71 changes: 23 additions & 48 deletions emmet-builders/emmet/builders/materials/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@ def __init__(
for s in source_snls:
s.key = "snl_id"

super().__init__(
sources=[materials, *source_snls], targets=[provenance], **kwargs
)
super().__init__(sources=[materials, *source_snls], targets=[provenance], **kwargs)

def ensure_indicies(self):

Expand All @@ -63,22 +61,14 @@ def prechunk(self, number_splits: int) -> Iterable[Dict]:

# Find all formulas for materials that have been updated since this
# builder was last run
q = {**self.query, "property_name": ProvenanceDoc.property_name}
updated_materials = self.provenance.newer_in(
self.materials,
criteria=q,
exhaustive=True,
)
forms_to_update = set(
self.materials.distinct(
"formula_pretty", {"material_id": {"$in": updated_materials}}
)
)
q = self.query
updated_materials = self.provenance.newer_in(self.materials, criteria=q, exhaustive=True,)
forms_to_update = set(self.materials.distinct("formula_pretty", {"material_id": {"$in": updated_materials}}))

# Find all new SNL formulas since the builder was last run
for source in self.source_snls:
new_snls = self.provenance.newer_in(source)
forms_to_update |= set(source.distinct("formula_pretty", new_snls))
forms_to_update |= set(source.distinct("formula_pretty", {source.key: {"$in": new_snls}}))

# Now reduce to the set of formulas we actually have
forms_avail = set(self.materials.distinct("formula_pretty", self.query))
Expand All @@ -92,7 +82,7 @@ def prechunk(self, number_splits: int) -> Iterable[Dict]:
for chunk in grouper(forms_to_update, number_splits):
yield {"formula_pretty": {"$in": chunk}}

def get_items(self) -> Tuple[List[Dict], List[Dict]]:
def get_items(self) -> Tuple[List[Dict], List[Dict]]: # type: ignore
"""
Gets all materials to associate with SNLs
Returns:
Expand All @@ -105,37 +95,27 @@ def get_items(self) -> Tuple[List[Dict], List[Dict]]:

# Find all formulas for materials that have been updated since this
# builder was last run
q = {**self.query, "property_name": ProvenanceDoc.property_name}
updated_materials = self.provenance.newer_in(
self.materials,
criteria=q,
exhaustive=True,
)
forms_to_update = set(
self.materials.distinct(
"formula_pretty", {"material_id": {"$in": updated_materials}}
)
)
q = self.query
updated_materials = self.provenance.newer_in(self.materials, criteria=q, exhaustive=True,)
forms_to_update = set(self.materials.distinct("formula_pretty", {"material_id": {"$in": updated_materials}}))

# Find all new SNL formulas since the builder was last run
for source in self.source_snls:
new_snls = self.provenance.newer_in(source)
forms_to_update |= set(source.distinct("formula_pretty", new_snls))
forms_to_update |= set(source.distinct("formula_pretty", {source.key: {"$in": new_snls}}))

# Now reduce to the set of formulas we actually have
forms_avail = set(self.materials.distinct("formula_pretty", self.query))
forms_to_update = forms_to_update & forms_avail

self.logger.info(f"Found {len(forms_to_update)} new/updated systems to proces")
self.logger.info(f"Found {len(forms_to_update)} new/updated systems to process")

self.total = len(forms_to_update)

for formulas in grouper(forms_to_update, self.chunk_size):
snls = []
snls = [] # type: list
for source in self.source_snls:
snls.extend(
source.query(criteria={"formula_pretty": {"$in": formulas}})
)
snls.extend(source.query(criteria={"formula_pretty": {"$in": formulas}}))

mats = list(
self.materials.query(
Expand All @@ -145,6 +125,7 @@ def get_items(self) -> Tuple[List[Dict], List[Dict]]:
"structure",
"initial_structures",
"formula_pretty",
"deprecated",
],
criteria={"formula_pretty": {"$in": formulas}},
)
Expand All @@ -162,9 +143,7 @@ def get_items(self) -> Tuple[List[Dict], List[Dict]]:

mat_group = mat_groups[formula]

self.logger.debug(
f"Found {len(snl_group)} snls and {len(mat_group)} mats"
)
self.logger.debug(f"Found {len(snl_group)} snls and {len(mat_group)} mats")
yield mat_group, snl_group

def process_item(self, item) -> List[Dict]:
Expand All @@ -183,9 +162,13 @@ def process_item(self, item) -> List[Dict]:
# Match up SNLS with materials
for mat in mats:
matched_snls = list(self.match(source_snls, mat))

if len(matched_snls) > 0:
doc = ProvenanceDoc.from_SNLs(
material_id=mat["material_id"], snls=matched_snls
material_id=mat["material_id"],
structure=Structure.from_dict(mat["structure"]),
snls=matched_snls,
deprecated=mat["deprecated"],
)

doc.authors.append(self.settings.DEFAULT_AUTHOR)
Expand Down Expand Up @@ -222,17 +205,9 @@ def match(self, snls, mat):
angle_tol=self.settings.ANGLE_TOL,
# comparator=OrderDisorderElementComparator(),
)
matched_groups = [
group
for group in groups
if any(not hasattr(struc, "snl") for struc in group)
]
snls = [
struc.snl
for group in matched_groups
for struc in group
if hasattr(struc, "snl")
]

matched_groups = [group for group in groups if any(not hasattr(struc, "snl") for struc in group)]
snls = [struc.snl for group in matched_groups for struc in group if hasattr(struc, "snl")]

self.logger.debug(f"Found {len(snls)} SNLs for {mat['material_id']}")
return snls
Expand Down
34 changes: 10 additions & 24 deletions emmet-core/emmet/core/electronic_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@

class ElectronicStructureBaseData(BaseModel):
task_id: MPID = Field(
...,
description="The source calculation (task) ID for the electronic structure data.",
..., description="The source calculation (task) ID for the electronic structure data.",
)

band_gap: float = Field(..., description="Band gap energy in eV.")
Expand Down Expand Up @@ -63,18 +62,15 @@ class DosSummaryData(ElectronicStructureBaseData):

class BandstructureData(BaseModel):
setyawan_curtarolo: BandStructureSummaryData = Field(
None,
description="Band structure summary data using the Setyawan-Curtarolo path convention.",
None, description="Band structure summary data using the Setyawan-Curtarolo path convention.",
)

hinuma: BandStructureSummaryData = Field(
None,
description="Band structure summary data using the Hinuma et al. path convention.",
None, description="Band structure summary data using the Hinuma et al. path convention.",
)

latimer_munro: BandStructureSummaryData = Field(
None,
description="Band structure summary data using the Latimer-Munro path convention.",
None, description="Band structure summary data using the Latimer-Munro path convention.",
)


Expand All @@ -88,16 +84,13 @@ class DosData(BaseModel):
Dict[Union[Literal["1", "-1"], Spin], DosSummaryData],
],
] = Field(
None,
description="Band structure summary data using the Hinuma et al. path convention.",
None, description="Band structure summary data using the Hinuma et al. path convention.",
)

orbital: Dict[
Union[Literal["total", "s", "p", "d", "f"], OrbitalType],
Dict[Union[Literal["1", "-1"], Spin], DosSummaryData],
Union[Literal["total", "s", "p", "d", "f"], OrbitalType], Dict[Union[Literal["1", "-1"], Spin], DosSummaryData],
] = Field(
None,
description="Band structure summary data using the Latimer-Munro path convention.",
None, description="Band structure summary data using the Latimer-Munro path convention.",
)

magnetic_ordering: Union[str, Ordering] = Field(None, description="Magnetic ordering of the calculation.")
Expand All @@ -118,8 +111,7 @@ class ElectronicStructureDoc(PropertyDoc, ElectronicStructureSummary):
dos: DosData = Field(None, description="Density of states data for the material.")

last_updated: datetime = Field(
description="Timestamp for when this document was last updated",
default_factory=datetime.utcnow,
description="Timestamp for when this document was last updated", default_factory=datetime.utcnow,
)

@classmethod
Expand Down Expand Up @@ -288,13 +280,7 @@ def from_bsdos( # type: ignore[override]
nbands = bs.nb_bands

# - Get equivalent labels between different conventions
hskp = HighSymmKpath(
bs.structure,
path_type="all",
symprec=0.1,
angle_tolerance=5,
atol=1e-5,
)
hskp = HighSymmKpath(bs.structure, path_type="all", symprec=0.1, angle_tolerance=5, atol=1e-5,)
equivalent_labels = hskp.equiv_labels

if bs_type == "latimer_munro":
Expand Down Expand Up @@ -367,7 +353,7 @@ def from_bsdos( # type: ignore[override]
return cls.from_structure(
material_id=MPID(material_id),
task_id=summary_task,
structure=structure,
meta_structure=structure,
band_gap=summary_band_gap,
cbm=summary_cbm,
vbm=summary_vbm,
Expand Down
50 changes: 18 additions & 32 deletions emmet-core/emmet/core/material.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
from __future__ import annotations

from datetime import datetime
from typing import List, Mapping, Type, TypeVar
from typing import List, Mapping, Type, TypeVar, Union

from pydantic import BaseModel, Field
from pymatgen.core import Structure

from emmet.core.mpid import MPID
from emmet.core.structure import StructureMetadata
from emmet.core.vasp.validation import DeprecationMessage


class PropertyOrigin(BaseModel):
Expand All @@ -17,12 +18,9 @@ class PropertyOrigin(BaseModel):
"""

name: str = Field(..., description="The property name")
task_id: MPID = Field(
..., description="The calculation ID this property comes from"
)
task_id: MPID = Field(..., description="The calculation ID this property comes from")
last_updated: datetime = Field(
description="The timestamp when this calculation was last updated",
default_factory=datetime.utcnow,
description="The timestamp when this calculation was last updated", default_factory=datetime.utcnow,
)


Expand All @@ -41,60 +39,48 @@ class MaterialsDoc(StructureMetadata):
"This comes in the form and MPID or int",
)

structure: Structure = Field(
..., description="The best structure for this material"
)
structure: Structure = Field(..., description="The best structure for this material")

deprecated: bool = Field(
True,
description="Whether this materials document is deprecated.",
True, description="Whether this materials document is deprecated.",
)

deprecation_reasons: List[Union[DeprecationMessage, str]] = Field(
None, description="List of deprecation tags detailing why this materials document isn't valid",
)

initial_structures: List[Structure] = Field(
[],
description="Initial structures used in the DFT optimizations corresponding to this material",
[], description="Initial structures used in the DFT optimizations corresponding to this material",
)

task_ids: List[MPID] = Field(
[],
title="Calculation IDs",
description="List of Calculations IDs used to make this Materials Document",
[], title="Calculation IDs", description="List of Calculations IDs used to make this Materials Document",
)

deprecated_tasks: List[str] = Field([], title="Deprecated Tasks")

calc_types: Mapping[str, str] = Field(
None,
description="Calculation types for all the calculations that make up this material",
None, description="Calculation types for all the calculations that make up this material",
)

last_updated: datetime = Field(
description="Timestamp for when this document was last updated",
default_factory=datetime.utcnow,
description="Timestamp for when this document was last updated", default_factory=datetime.utcnow,
)

created_at: datetime = Field(
description="Timestamp for when this material document was first created",
default_factory=datetime.utcnow,
description="Timestamp for when this material document was first created", default_factory=datetime.utcnow,
)

origins: List[PropertyOrigin] = Field(
None, description="Dictionary for tracking the provenance of properties"
)
origins: List[PropertyOrigin] = Field(None, description="Dictionary for tracking the provenance of properties")

warnings: List[str] = Field([], description="Any warnings related to this material")

@classmethod
def from_structure( # type: ignore[override]
cls: Type[T], structure: Structure, material_id: MPID, **kwargs
) -> T:
def from_structure(cls: Type[T], structure: Structure, material_id: MPID, **kwargs) -> T: # type: ignore[override]
"""
Builds a materials document using the minimal amount of information
"""

return super().from_structure( # type: ignore
structure=structure,
material_id=material_id,
include_structure=True,
**kwargs
meta_structure=structure, material_id=material_id, structure=structure, **kwargs
)
13 changes: 7 additions & 6 deletions emmet-core/emmet/core/material_property.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ class PropertyDoc(StructureMetadata):
)

deprecated: bool = Field(
...,
description="Whether this property document is deprecated.",
..., description="Whether this property document is deprecated.",
)

reasons: List[Union[DeprecationMessage, str]] = Field(
None, description="List of deprecation tags detailing why this document isn't valid"
deprecation_reasons: List[Union[DeprecationMessage, str]] = Field(
None, description="List of deprecation tags detailing why this document isn't valid",
)

last_updated: datetime = Field(
Expand All @@ -48,9 +47,11 @@ class PropertyDoc(StructureMetadata):
warnings: Sequence[str] = Field([], description="Any warnings related to this property")

@classmethod
def from_structure(cls: Type[S], structure: Structure, material_id: MPID, **kwargs) -> S: # type: ignore[override]
def from_structure( # type: ignore[override]
cls: Type[S], meta_structure: Structure, material_id: MPID, **kwargs
) -> S:
"""
Builds a materials document using the minimal amount of information
"""

return super().from_structure(structure=structure, material_id=material_id, **kwargs) # type: ignore
return super().from_structure(meta_structure=meta_structure, material_id=material_id, **kwargs) # type: ignore
3 changes: 2 additions & 1 deletion emmet-core/emmet/core/oxidation_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class OxidationStateDoc(PropertyDoc):

property_name = "oxidation"

structure: Structure = Field(..., description="The structure used in the generation of the oxidation state data")
possible_species: List[str] = Field(description="Possible charged species in this material")
possible_valences: List[float] = Field(description="List of valences for each site in this material")
average_oxidation_states: Dict[str, float] = Field(description="Average oxidation states for each unique species")
Expand Down Expand Up @@ -77,5 +78,5 @@ def from_structure(cls, structure: Structure, material_id: MPID, **kwargs): # t
raise e

return super().from_structure(
structure=structure, material_id=material_id, include_structure=True, **d, **kwargs
meta_structure=structure, material_id=material_id, structure=structure, **d, **kwargs
)
Loading

0 comments on commit 99ada6e

Please sign in to comment.