Adding incremental files.
GeigerJ2 committed Jan 23, 2025
1 parent d2c7287 commit 748336d
Showing 2 changed files with 166 additions and 144 deletions.
179 changes: 35 additions & 144 deletions src/aiida/tools/dumping/incremental.py
@@ -9,6 +9,14 @@
# TODO: flat list using UUIDs, or sorted by type?
# TODO: individual lists, ids, uuids, etc., or sorted: by data, or more fine-grained
# TODO: Dumping to json files with timestamp in filename, inside file, or none at all
# TODO: When dumping incrementally, just keep a list of all the UUIDs, and, before creating/updating the dictionary,
# TODO: drop everything apart from the set intersection
# TODO: How to take care of deleting previously dumped node data that doesn't exist in the profile anymore
# TODO: One could persist the list of all profile UUIDs throughout multiple instantiations of the class, and method
# TODO: calls, by writing to file, but not sure if this is something we want to do by default?
# TODO: Possibly, persist the history of up to N dumps -> Specifiable by the user
# TODO: Pass the `DumpNodeCollector` around, and `pop` UUIDs, as the dumping has actually been done?
# TODO: Make sure the dumping UUID list update functions are atomic
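
# Not part of this commit: a minimal sketch of one way the atomicity TODO above could be handled.
# The JSON is written to a temporary file in the same directory and then atomically moved over the
# target, so a crash mid-write never leaves a truncated UUID list behind. The helper name
# `atomic_json_dump` is hypothetical.
import json
import os
import tempfile
from pathlib import Path


def atomic_json_dump(data: list, target: Path) -> None:
    fd, tmp_name = tempfile.mkstemp(dir=target.parent, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as handle:
            json.dump(data, handle, indent=4)
            handle.flush()
            os.fsync(handle.fileno())
        os.replace(tmp_name, target)  # atomic when source and target are on the same filesystem
    except BaseException:
        os.unlink(tmp_name)
        raise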


_node_dict_entries = [
@@ -20,188 +28,71 @@
'symlinked', # Should this be here?
]

_node_dict_template: dict[str,None] = {k: None for k in _node_dict_entries}

class DumpNodeCollector:

# TODO: When dumping incrementally, just keep a list of all the UUIDs, and, before creating/updating the dictionary,
# TODO: drop everything apart from the set intersection
# TODO: How to take care of deleting previously dumped node data that doesn't exist in the profile anymore
# TODO: One could persist the list of all profile UUIDs throughout multiple instantiations of the class, and method
# TODO: calls, by writing to file, but not sure if this is something we want to do by default?
# TODO: Possibly, persist the history of up to N dumps -> Specifyable by the user
# TODO: Pass the `DumpNodeCollector` around, and `pop` UUIDs, as the dumping has actually been done?
_node_dict_template: dict[str, None] = {k: None for k in _node_dict_entries}


class IncrementalDumper:
def __init__(self, dump_parent_path: Path):
self.dump_parent_path: Path = dump_parent_path

# self.profile_uuids: set[str] = set()
profile_uuids = orm.QueryBuilder().append(orm.Node, project=['uuid']).all(flat=True)
self.profile_uuids = set(profile_uuids)

self.prevdump_uuids: set[str] = set()
self.prevdel_uuids: set[str] = set()
self.todump_uuids: set[str] = set()
self.todel_uuids: set[str] = set()

self.profile_uuid_file: Path= self.dump_parent_path / 'profile-uuids.json'
self.todump_uuid_file: Path = self.dump_parent_path / 'profile-uuids.json'
self.profile_uuid_file: Path = self.dump_parent_path / 'profile-uuids.json'
self.todump_uuid_file: Path = self.dump_parent_path / 'todump-uuids.json'
self.prevdump_uuid_file: Path = self.dump_parent_path / 'prevdump-uuids.json'
self.todel_uuid_file: Path = self.dump_parent_path / 'todel-uuids.json'
self.prevdel_uuid_file: Path = self.dump_parent_path / 'prevdel-uuids.json'

self.profile_uuids: set[str] = set(orm.QueryBuilder().append(orm.Node, project=['uuid']).all(flat=True))


# TODO: Make sure the dumping UUID list update functions are atomic



def update_uuids_before_dump(self):
"""
abc
"""

# Could implement smarter logic here to check if anything actually changed in terms of profile UUIDs

if self.prevdump_uuid_file.exists():
with open(self.prevdump_uuid_file, 'r') as handle:
self.prevdump_uuids = json.load(handle)
self.prevdump_uuids = set(json.load(handle) or [])
else:
self.prevdump_uuids = set()

# ? Differentiate different types here somewhat?
if self.prevdel_uuid_file.exists():
with open(self.prevdel_uuid_file, 'r') as handle:
self.prevdel_uuids = set(json.load(handle) or [])
else:
self.prevdel_uuids = set()

# New UUIDs from the profile that weren't dumped in the past
self.todump_uuids = set(self.profile_uuids).difference(self.prevdump_uuids)
self.todump_uuids = {str(uuid) for uuid in self.profile_uuids.difference(self.prevdump_uuids)}
self.profile_uuids.update(self.todump_uuids)  # `update` merges elements; `add` would try to insert the (unhashable) set itself

# UUIDs of nodes that were deleted in the profile, but had already been dumped in the past
# Depending on `--delete-files` this should be deleted, or not
# Depending on `--delete-files`, these should either be deleted or kept
self.todel_uuids = set(self.prevdump_uuids).difference(self.profile_uuids)

with open(self.profile_uuid_file, 'w') as handle:
json.dump(list(self.profile_uuids), handle, indent=4)
json.dump(sorted(self.profile_uuids), handle, indent=4)  # sets are not JSON serializable, so write a sorted list

with open(self.todump_uuid_file, 'w') as handle:
json.dump(list(self.todump_uuids), handle, indent=4)
json.dump(sorted(self.todump_uuids), handle, indent=4)

with open(self.todel_uuid_file, 'w') as handle:
json.dump(list(self.todel_uuids), handle, indent=4)

# ? Maybe better to keep it as separate files, so that one doesn't have to re-write everything...
# full_uuid_dict = {
# 'profile': self.profile_uuids,
# 'prev-dump': self.prevdump_uuids,
# 'this-dump': self.todump_uuids,
# 'to-delete:': self.todel_uuids
# }

# self.full_uuid_dict = full_uuid_dict

# with open(self.dump_parent_path / 'full-uuid-dict.json', 'w') as f:
# json.dump(list(self.full_uuid_dict), f, indent=4)
json.dump(sorted(self.todel_uuids), handle, indent=4)
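
# Not part of this commit: a tiny worked example of the set bookkeeping above.
prevdump = {'a', 'b'}                  # UUIDs recorded by a previous dump
profile = {'b', 'c'}                   # UUIDs currently present in the profile
assert profile - prevdump == {'c'}     # todump: new since the last dump
assert prevdump - profile == {'a'}     # todel: dumped before, but since deleted from the profile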

def update_uuids_after_dump(self):

# Move `todump` to `prevdump` (`Path.replace` overwrites an existing target on all platforms,
# whereas `Path.rename` raises on Windows if the target already exists)
self.todump_uuid_file.replace(self.prevdump_uuid_file)

# Move `todel` to `prevdel`
self.todel_uuid_file.replace(self.prevdel_uuid_file)

# Re-create empty todump-file -> Don't really need that
# (self.dump_parent_path / 'todump-uuids.json').touch()
# with open(self.dump_parent_path / 'todump-uuids.json', 'w') as handle:
# json.dump([], handle, indent=4)
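
# Not part of this commit: after the two file moves above, the on-disk state for the next run is
#   prevdump-uuids.json  <- UUIDs scheduled (and presumably dumped) in this run
#   prevdel-uuids.json   <- UUIDs scheduled for deletion in this run
# so the next `update_uuids_before_dump` call only picks up nodes added or deleted since this point.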

# def create_organized_uuid_dicts(self):

# # ! This requires actually loading the `Node`s rather than just working with UUID strings

# for uuid in self.profile_uuids:
# node = orm.load_node(uuid)
# entry_point_name = node.entry_point.name
# if entry_point_name not in self.profile_organized_uuids:
# self.profile_organized_uuids[entry_point_name] = [uuid]
# else:
# self.profile_organized_uuids[entry_point_name].append(uuid)

# for uuid in self.todump_uuids:
# node = orm.load_node(uuid)
# entry_point_name = node.entry_point.name
# if entry_point_name not in self.todump_organized_uuids:
# self.todump_organized_uuids[entry_point_name] = [uuid]
# else:
# self.todump_organized_uuids[entry_point_name].append(uuid)

# for uuid in self.todel_uuids:
# node = orm.load_node(uuid)
# entry_point_name = node.entry_point.name
# if entry_point_name not in self.todel_organized_uuids:
# self.todel_organized_uuids[entry_point_name] = [uuid]
# else:
# self.todel_organized_uuids[entry_point_name].append(uuid)

# pprint(self.profile_organized_uuids)
# pprint(self.todump_organized_uuids)
# pprint(self.todel_organized_uuids)

# import json
# import os


# # Dump profile_organized_uuids to a JSON file
# with open(self.dump_parent_path / 'profile-organized-uuids.json', 'w') as f:
# json.dump(self.profile_organized_uuids, f, indent=4)

# # Dump todump_organized_uuids to a JSON file
# with open(self.dump_parent_path / 'todump-organized-uuids.json', 'w') as f:
# json.dump(self.todump_organized_uuids, f, indent=4)

# # Dump todel_organized_uuids to a JSON file
# with open(self.dump_parent_path / 'todel-organized-uuids.json', 'w') as f:
# json.dump(self.todel_organized_uuids, f, indent=4)
# # breakpoint()

# # List comprehension rather than dict.from_keys, otherwise always same reference is passed
# # self.todump_dict = {uuid: _node_dict_template.copy() for uuid in self.todump_uuids}

# def populate_uuid_dict(self):
# # Mainly copied from `def get_collection_nodes`
# # TODO: Possibly add 'dumped' True/False to dictionary and then json file
# # TODO: What if node in multiple groups -> Maybe just dump the last one?
# # Add group information
# groups = orm.QueryBuilder().append(orm.Group).all(flat=True)

# for group in groups:

# nodes_in_groups = [node.uuid for node in group.nodes]

# # Only resolve the sub-workflows for nodes in groups if nodes will be dumped
# nodes_in_groups = [node for node in nodes_in_groups if node in self.todump_uuids]

# # Need to expand here also with the called_descendants of `WorkflowNodes`, otherwise the called
# # `CalculationNode`s for `WorkflowNode`s that are part of a group are dumped twice
# sub_nodes_in_groups = list(it.chain(
# *[
# orm.load_node(node).called_descendants
# for node in nodes_in_groups
# if isinstance(orm.load_node(node), orm.WorkflowNode)
# ]
# ))
# sub_nodes_in_groups = [node.uuid for node in sub_nodes_in_groups]
# nodes_in_groups = nodes_in_groups + sub_nodes_in_groups

# dict_list = []
# for node_in_group in nodes_in_groups:
# self.todump_dict[node_in_group]['group'] = group.label


# pprint(self.todump_dict)
# print(self.todump_dict['df12d734-a99b-4bef-9868-4f96aef34405'] == self.todump_dict['f9eabbd5-97bd-456a-bd20-895b84942d38'])

# # for node_uuid in self.todump_uuids:

# # node = orm.load_node(node_uuid)
# # raise SystemExit

# pass


def obtain_node_difference():
pass


def resolve_path():
pass

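Aside (not part of the diff): the intended call sequence for the new class, inferred from the method names, would look roughly like the sketch below. It assumes an AiiDA profile is already loaded; the dumping step is only a placeholder, since this commit does not yet wire the bookkeeping into the actual writing of node data.

from pathlib import Path
from aiida.tools.dumping.incremental import IncrementalDumper

dump_parent = Path('/tmp/aiida-dump')
dump_parent.mkdir(parents=True, exist_ok=True)  # the class does not create the directory itself

dumper = IncrementalDumper(dump_parent_path=dump_parent)
dumper.update_uuids_before_dump()         # compute todump/todel sets and write the JSON bookkeeping files
for uuid in sorted(dumper.todump_uuids):  # placeholder: write each new node's data to disk here
    ...
dumper.update_uuids_after_dump()          # rotate todump/todel to prevdump/prevdel for the next run
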
131 changes: 131 additions & 0 deletions tests/tools/dumping/test_incremental.py
@@ -0,0 +1,131 @@
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Tests for the dumping of ProcessNode data to disk."""

from __future__ import annotations

import io
import shutil
from pathlib import Path

import pytest

from aiida.tools.dumping.incremental import IncrementalDumper
from aiida import orm
from aiida.manage import get_manager

# @pytest.fixture(scope='class', autouse=True)
# def setup_test_dumper(test_dumper):


@pytest.mark.usefixtures("init_profile")
class TestIncrementalDumper:

@classmethod
def setup_class(cls):

cls.int: orm.Node = orm.Int(1).store()
cls.float: orm.Node = orm.Float(1.0).store()
cls.str: orm.Node = orm.Str('a').store()

cls.groups = {
'add': orm.Group(label='add').store(),
'add_multiply': orm.Group(label='add_multiply').store(),
}

kpoints = orm.KpointsData()
kpoints.set_kpoints_mesh(mesh=[1]*3)
cls.kpoints = kpoints
cls.structuredata = orm.StructureData(cell=((1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0))).store()


# other organizational entities
cls.manager = get_manager()
cls.profile = cls.manager.get_profile()
cls.storage = cls.manager.get_profile_storage()

# for dev
cls.storage_info: dict = cls.storage.get_info(detailed=True)

# Add one ArithmeticAddNode not in any group
# cls.add_node = generate_calculation_node_add()
# cls.multiply_add_node = generate_workchain_multiply_add()

@pytest.fixture(scope="session", autouse=True)
def init_profile(
self,
tmpdir_factory
):
"""Initialize the profile."""
# Cannot put this into `setup_class`, as then `tmpdir_factory` is expected as an actual argument
self.incremental_dumper: IncrementalDumper = IncrementalDumper(dump_parent_path=tmpdir_factory.mktemp('incr') / 'dump-parent')

def test_update_uuids_before_dump(self):
# print(wc_node)
from aiida.cmdline.utils import echo
print(self.float)
pass
# print(self.storage_info)
# self.incremental_dumper.update_uuids_before_dump()
# # echo.echo_dictionary(self.storage_info)
# assert False
# pass

def test_update_uuids_after_dump(self, tmp_path):
pass
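
# Not part of this commit: a sketch of assertions the placeholder tests above could grow into,
# based on the bookkeeping in `update_uuids_before_dump`. On a first run there is no previous
# dump, so every profile UUID should be scheduled for dumping and nothing for deletion:
#
#     Path(self.incremental_dumper.dump_parent_path).mkdir(parents=True, exist_ok=True)
#     self.incremental_dumper.update_uuids_before_dump()
#     assert self.incremental_dumper.todump_uuids == self.incremental_dumper.profile_uuids
#     assert self.incremental_dumper.todel_uuids == set()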


# cj_nodes = [
# generate_calculation_node_io(attach_outputs=False),
# generate_calculation_node_io(attach_outputs=False),
# ]
# wc_node = generate_workchain_node_io(cj_nodes=cj_nodes)



# @pytest.fixture(scope="session", autouse=True)
# def init_profile(
# self,
# # aiida_localhost,
# # tmp_path, # -> Function-scoped
# tmpdir_factory
# ):
# """Initialize the profile."""
# # Add one of each orm entity of interest to the profile
# # self.computer: orm.Computer = aiida_localhost
# # self.authinfo: orm.AuthInfo = self.computer.get_authinfo(user=orm.User.collection.get_default())
# # self.code: orm.Code = orm.InstalledCode(computer=self.computer, filepath_executable='/bin/true').store()
# self.int: orm.Node = orm.Int(1).store()
# # self.float: orm.Node = orm.Float(1.0).store()
# # self.str: orm.Node = orm.Str('a').store()

# # self.groups = {
# # 'add': orm.Group(label='add').store(),
# # 'add_multiply': orm.Group(label='add_multiply').store(),
# # }

# # kpoints = orm.KpointsData()
# # kpoints.set_kpoints_mesh(mesh=[1]*3)
# # self.kpoints = kpoints
# # self.structuredata = orm.StructureData(cell=((1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0))).store()

# # self.incremental_dumper: IncrementalDumper = IncrementalDumper(dump_parent_path=tmpdir_factory.mktemp('incr') / 'dump-parent')

# # # other organizational entities
# # self.manager = get_manager()
# # self.profile = self.manager.get_profile()
# # self.storage = self.manager.get_profile_storage()

# # # for dev
# # self.storage_info: dict = self.storage.get_info(detailed=True)

# # Add one ArithmeticAddNode not in any group
# # self.add_node = generate_calculation_node_add()
# # self.multiply_add_node = generate_workchain_multiply_add()
