diff --git a/CHANGELOG.md b/CHANGELOG.md index 605f4243b..4e60db699 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,10 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### Changed +- Indent for the json string of Tidy3D models has been changed to `None` when used internally; kept as `indent=4` for writing to `json` and `yaml` files. ### Fixed - Fixed the duplication of log messages in Jupyter when `set_logging_file` is used. - If input to circular filters in adjoint have size smaller than the diameter, instead of erroring, warn user and truncate the filter kernel accordingly. +- When writing the json string of a model to an `hdf5` file, the string is split into chunks if it has more than a set (very large) number of characters. This fixes potential error if the string size is more than 4GB. + ## [2.5.0rc2] - 2023-10-30 diff --git a/tests/sims/simulation_2_5_0rc2.h5 b/tests/sims/simulation_2_5_0rc2.h5 index 6897bbaa2..1e92efab4 100644 Binary files a/tests/sims/simulation_2_5_0rc2.h5 and b/tests/sims/simulation_2_5_0rc2.h5 differ diff --git a/tests/test_components/test_IO.py b/tests/test_components/test_IO.py index 2bd770081..c79c5be42 100644 --- a/tests/test_components/test_IO.py +++ b/tests/test_components/test_IO.py @@ -21,6 +21,14 @@ SIM_DIR = "tests/sims" +@pytest.fixture +def split_string(monkeypatch): + """Lower the max string length in hdf5 read/write, in order to test the string splitting.""" + from tidy3d.components import base + + monkeypatch.setattr(base, "MAX_STRING_LENGTH", 100) + + def set_datasets_to_none(sim): sim_dict = sim.dict() for src in sim_dict["sources"]: @@ -47,7 +55,7 @@ def set_datasets_to_none(sim): return td.Simulation.parse_obj(sim_dict) -def test_simulation_load_export(): +def test_simulation_load_export(split_string): major, minor, patch = __version__.split(".") path = os.path.join(SIM_DIR, f"simulation_{major}_{minor}_{patch}.json") # saving as .h5 since *.hdf5 is git ignored @@ 
-81,28 +89,28 @@ def test_component_load_export_yaml(tmp_path): assert td.Medium() == M2, "original and loaded medium are not the same" -def test_simulation_load_export_hdf5(tmp_path): +def test_simulation_load_export_hdf5(split_string, tmp_path): path = str(tmp_path / "simulation.hdf5") SIM.to_file(path) SIM2 = td.Simulation.from_file(path) assert SIM == SIM2, "original and loaded simulations are not the same" -def test_simulation_load_export_hdf5_gz(tmp_path): +def test_simulation_load_export_hdf5_gz(split_string, tmp_path): path = str(tmp_path / "simulation.hdf5.gz") SIM.to_file(path) SIM2 = td.Simulation.from_file(path) assert SIM == SIM2, "original and loaded simulations are not the same" -def test_simulation_load_export_hdf5_explicit(tmp_path): +def test_simulation_load_export_hdf5_explicit(split_string, tmp_path): path = str(tmp_path / "simulation.hdf5") SIM.to_hdf5(path) SIM2 = td.Simulation.from_hdf5(path) assert SIM == SIM2, "original and loaded simulations are not the same" -def test_simulation_load_export_hdf5_gz_explicit(tmp_path): +def test_simulation_load_export_hdf5_gz_explicit(split_string, tmp_path): path = str(tmp_path / "simulation.hdf5.gz") SIM.to_hdf5_gz(path) SIM2 = td.Simulation.from_hdf5_gz(path) diff --git a/tests/test_web/test_webapi.py b/tests/test_web/test_webapi.py index a2edc26ab..c8fb17396 100644 --- a/tests/test_web/test_webapi.py +++ b/tests/test_web/test_webapi.py @@ -357,7 +357,7 @@ def get_str(*args, **kwargs): return sim.json().encode("utf-8") monkeypatch.setattr(f"{task_core_path}.download_file", mock_download) - monkeypatch.setattr(f"{task_core_path}._read_simulation_from_hdf5", get_str) + monkeypatch.setattr(f"{task_core_path}.read_simulation_from_hdf5", get_str) fname_tmp = str(tmp_path / "web_test_tmp.json") download_json(TASK_ID, fname_tmp) diff --git a/tests/test_web/test_webapi_heat.py b/tests/test_web/test_webapi_heat.py index 22ee21e0b..b1dff1f7a 100644 --- a/tests/test_web/test_webapi_heat.py +++ 
b/tests/test_web/test_webapi_heat.py @@ -257,7 +257,7 @@ def get_str(*args, **kwargs): return sim.json().encode("utf-8") monkeypatch.setattr(f"{task_core_path}.download_file", mock_download) - monkeypatch.setattr(f"{task_core_path}._read_simulation_from_hdf5", get_str) + monkeypatch.setattr(f"{task_core_path}.read_simulation_from_hdf5", get_str) fname_tmp = str(tmp_path / "web_test_tmp.json") download_json(TASK_ID, fname_tmp) diff --git a/tidy3d/components/base.py b/tidy3d/components/base.py index 1aa85a1bf..06539e97f 100644 --- a/tidy3d/components/base.py +++ b/tidy3d/components/base.py @@ -7,6 +7,7 @@ import tempfile from functools import wraps from typing import List, Callable, Dict, Union, Tuple, Any +from math import ceil import rich import pydantic.v1 as pydantic @@ -22,9 +23,12 @@ from ..exceptions import FileError from ..log import log -# default indentation (# spaces) in files -INDENT = 4 + +INDENT_JSON_FILE = 4 # default indentation of json string in json files +INDENT = None # default indentation of json string used internally JSON_TAG = "JSON_STRING" +# If json string is larger than ``MAX_STRING_LENGTH``, split the string when storing in hdf5 +MAX_STRING_LENGTH = 1e9 def cache(prop): @@ -309,7 +313,7 @@ def to_json(self, fname: str) -> None: ------- >>> simulation.to_json(fname='folder/sim.json') # doctest: +SKIP """ - json_string = self._json_string + json_string = self._json(indent=INDENT_JSON_FILE) self._warn_if_contains_data(json_string) with open(fname, "w", encoding="utf-8") as file_handle: file_handle.write(json_string) @@ -375,7 +379,7 @@ def to_yaml(self, fname: str) -> None: self._warn_if_contains_data(json_string) model_dict = json.loads(json_string) with open(fname, "w+", encoding="utf-8") as file_handle: - yaml.dump(model_dict, file_handle, indent=INDENT) + yaml.dump(model_dict, file_handle, indent=INDENT_JSON_FILE) @staticmethod def _warn_if_contains_data(json_str: str) -> None: @@ -430,6 +434,23 @@ def get_sub_model(cls, group_path: str, 
model_dict: dict | list) -> dict: model_dict = model_dict[key] return model_dict + @staticmethod + def _json_string_key(index: int) -> str: + """Get json string key for string chunk number ``index``.""" + if index: + return f"{JSON_TAG}_{index}" + return JSON_TAG + + @classmethod + def _json_string_from_hdf5(cls, fname: str) -> str: + """Load the model json string from an hdf5 file.""" + with h5py.File(fname, "r") as f_handle: + num_string_parts = len([key for key in f_handle.keys() if JSON_TAG in key]) + json_string = b"" + for ind in range(num_string_parts): + json_string += f_handle[cls._json_string_key(ind)][()] + return json_string + @classmethod def dict_from_hdf5( cls, fname: str, group_path: str = "", custom_decoders: List[Callable] = None @@ -501,10 +522,7 @@ def load_data_from_file(model_dict: dict, group_path: str = "") -> None: elif isinstance(value, dict): load_data_from_file(model_dict=value, group_path=subpath) - with h5py.File(fname, "r") as f_handle: - json_string = f_handle[JSON_TAG][()] - model_dict = json.loads(json_string) - + model_dict = json.loads(cls._json_string_from_hdf5(fname=fname)) group_path = cls._construct_group_path(group_path) model_dict = cls.get_sub_model(group_path=group_path, model_dict=model_dict) load_data_from_file(model_dict=model_dict, group_path=group_path) @@ -563,7 +581,11 @@ def to_hdf5(self, fname: str, custom_encoders: List[Callable] = None) -> None: with h5py.File(fname, "w") as f_handle: - f_handle[JSON_TAG] = self._json_string + json_str = self._json_string + for ind in range(ceil(len(json_str) / MAX_STRING_LENGTH)): + ind_start = int(ind * MAX_STRING_LENGTH) + ind_stop = min(int((ind + 1) * MAX_STRING_LENGTH), len(json_str)) + f_handle[self._json_string_key(ind)] = json_str[ind_start:ind_stop] def add_data_to_file(data_dict: dict, group_path: str = "") -> None: """For every DataArray item in dictionary, write path of hdf5 group as value.""" diff --git a/tidy3d/plugins/adjoint/components/base.py 
b/tidy3d/plugins/adjoint/components/base.py index 459b49e7a..f45d6bc98 100644 --- a/tidy3d/plugins/adjoint/components/base.py +++ b/tidy3d/plugins/adjoint/components/base.py @@ -9,7 +9,7 @@ from jax.tree_util import tree_flatten as jax_tree_flatten from jax.tree_util import tree_unflatten as jax_tree_unflatten -from ....components.base import Tidy3dBaseModel, cached_property +from ....components.base import Tidy3dBaseModel from .data.data_array import JaxDataArray, JAX_DATA_ARRAY_TAG @@ -93,11 +93,10 @@ def from_tidy3d(cls, tidy3d_obj: Tidy3dBaseModel) -> JaxObject: """ IO """ - @cached_property - def _json_string(self) -> str: + def _json(self, *args, **kwargs) -> str: """Overwritten method to get the json string to store in the files.""" - json_string_og = super()._json_string + json_string_og = super()._json(*args, **kwargs) json_dict = json.loads(json_string_og) def strip_data_array(sub_dict: dict) -> None: diff --git a/tidy3d/web/core/file_util.py b/tidy3d/web/core/file_util.py index bd3bbdc9b..9305eaab1 100644 --- a/tidy3d/web/core/file_util.py +++ b/tidy3d/web/core/file_util.py @@ -49,12 +49,28 @@ def read_simulation_from_hdf5_gz(file_name: str) -> str: return json_str +"""TODO: _json_string_key and read_simulation_from_hdf5 are duplicated functions that also exist +as methods in Tidy3dBaseModel. 
For consistency it would be best if this duplication is avoided.""" + + +def _json_string_key(index: int) -> str: + """Get json string key for string chunk number ``index``.""" + if index: + return f"{JSON_TAG}_{index}" + return JSON_TAG + + def read_simulation_from_hdf5(file_name: str) -> str: """read simulation str from hdf5""" - with h5py.File(file_name, "r") as f_handle: - json_string = f_handle[JSON_TAG][()] - return json_string + with h5py.File(file_name, "r") as f_handle: + num_string_parts = len([key for key in f_handle.keys() if JSON_TAG in key]) + json_string = b"" + for ind in range(num_string_parts): + json_string += f_handle[_json_string_key(ind)][()] + return json_string + + +"""End TODO""" def read_simulation_from_json(file_name: str) -> str: diff --git a/tidy3d/web/core/task_core.py b/tidy3d/web/core/task_core.py index a37533987..f44cff153 100644 --- a/tidy3d/web/core/task_core.py +++ b/tidy3d/web/core/task_core.py @@ -8,7 +8,6 @@ from typing import List, Optional, Callable, Tuple import pydantic.v1 as pd from pydantic.v1 import Extra, Field, parse_obj_as -import h5py from . 
import http_util from .core_config import get_logger_console @@ -22,16 +21,8 @@ from .types import Tidy3DResource -from .constants import SIM_FILE_HDF5_GZ, SIMULATION_DATA_HDF5, SIM_LOG_FILE, JSON_TAG -from .file_util import extract_gzip_file - - -def _read_simulation_from_hdf5(file_name: str): - """read simulation str from hdf5""" - - with h5py.File(file_name, "r") as f_handle: - json_string = f_handle[JSON_TAG][()] - return json_string +from .constants import SIM_FILE_HDF5_GZ, SIMULATION_DATA_HDF5, SIM_LOG_FILE +from .file_util import extract_gzip_file, read_simulation_from_hdf5 class Folder(Tidy3DResource, Queryable, extra=Extra.allow): @@ -313,7 +304,7 @@ def get_simulation_json(self, to_file: str, verbose: bool = True) -> pathlib.Pat try: self.get_simulation_hdf5(hdf5_file_path) if os.path.exists(hdf5_file_path): - json_string = _read_simulation_from_hdf5(hdf5_file_path) + json_string = read_simulation_from_hdf5(hdf5_file_path) with open(to_file, "w") as file: # Write the string to the file file.write(json_string.decode("utf-8"))