atomind-ai · chiang-yuan · Oct 29, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/mlip_arena/tasks/defects/READMD.md b/mlip_arena/tasks/defects/READMD.md
@@ -0,0 +1,12 @@
+## Note on reproducibility
+
+1. Download and extract `bulk_primitive_folders` and `defect_relaxations` from `https://zenodo.org/records/10579527`.
+2. Run `TODO` to generate the ASE atom database `HSE06.db`.
+
+## Citing
+
+The benchmark data was taken from the following work. Please also cite their work if you find this benchmark useful.
+
+```
+Mosquera-Lois, I., Kavanagh, S.R., Ganose, A.M. et al. Machine-learning structural reconstructions for accelerated point defect calculations. npj Comput Mater 10, 121 (2024). https://doi.org/10.1038/s41524-024-01303-9
+```
diff --git a/mlip_arena/tasks/defects/download.py b/mlip_arena/tasks/defects/download.py
@@ -0,0 +1,35 @@
+"""Download the defect benchmark archives from Zenodo into the current directory."""
+
+from tqdm.auto import tqdm
+import requests
+
+record_id = "10579527"
+base_url = f"https://zenodo.org/record/{record_id}/files/"
+
+files = ["bulk_primitive_folders.tar.gz", "defect_relaxations.tar.gz"]
+
+# (connect, read) timeouts in seconds. requests applies no timeout by
+# default, so without this a stalled connection would block forever.
+request_timeout = (10, 60)
+
+
+for file in files:
+    # Use the response as a context manager so the streamed connection is
+    # released even when the download fails part-way through.
+    with requests.get(
+        base_url + file + "?download=1", stream=True, timeout=request_timeout
+    ) as response:
+        response.raise_for_status()  # Check if the request was successful
+
+        # Falls back to 0 when the server sends no Content-Length header.
+        total = int(response.headers.get("content-length", 0))
+
+        # Save the file locally
+        with open(file, "wb") as f, tqdm(
+            desc=file, total=total, unit="iB", unit_scale=True, unit_divisor=1024
+        ) as bar:
+            for chunk in response.iter_content(chunk_size=8192):
+                size = f.write(chunk)
+                bar.update(size)
+
+    print(f"{file} downloaded successfully.")
diff --git a/mlip_arena/tasks/defects/extract.py b/mlip_arena/tasks/defects/extract.py
@@ -0,0 +1,31 @@
+"""Extract every ``*.tar.gz`` archive found in the current directory.
+
+Each archive is unpacked into a directory named after the archive with the
+``.tar.gz`` suffix removed.
+"""
+
+import tarfile
+import glob
+from tqdm.auto import tqdm
+
+suffix = ".tar.gz"
+files = sorted(glob.glob("*" + suffix))
+
+# The archives are downloaded from the network: where the interpreter
+# supports extraction filters, use the "data" filter so members cannot be
+# written outside the destination directory.
+extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
+
+for file in files:
+    # Strip only the trailing suffix so dots elsewhere in the name survive
+    # (splitting on the first '.' would turn "v1.2.tar.gz" into "v1").
+    destination = file[: -len(suffix)]
+
+    with tarfile.open(file, "r:gz") as tar_ref:
+        members = tar_ref.getmembers()
+
+        # Initialize progress bar
+        with tqdm(total=len(members), desc=file, unit="file") as pbar:
+            for member in members:
+                tar_ref.extract(member, destination, **extract_kwargs)
+                pbar.update()