Rebalance tool: Fix record count. Enable using all files.dat entries (#213)

## Scope

This change makes it possible to use all data from a files.dat file, even when its header specifies a different entry count.

This is enabled with `--max-entries=0`.

We also update the header of the output file accordingly: regardless of what the original header said, the output header specifies the number of records actually written to the file.
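
For illustration, here is a minimal standalone sketch of the new selection rule (not the tool's actual code: the function name `select_entries` is made up, but the files.dat layout of a version line, a count line, and one entry per line follows the diff below):

```python
def select_entries(path, max_entries=None):
    """Sketch of the --max-entries rule: None -> trust the header count,
    0 -> use every entry, N > 0 -> use at most the first N entries."""
    with open(path, "r") as f:
        version = f.readline().strip()        # header line 1: format version
        n_files = int(f.readline().strip())   # header line 2: record count
        entries = f.readlines()               # remaining lines: one entry each
    if max_entries is None:
        max_entries = n_files
    if 0 < max_entries < len(entries):        # 0 never truncates
        entries = entries[:max_entries]
    return version, entries
```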

## Testing

Tested with `proj134/home/king/BBPP134-917/p_MultiCycle_Support`.

---------

Co-authored-by: Antonio Bellotta <[email protected]>
ferdonline and st4rl3ss authored Dec 2, 2024
1 parent 073956e commit ea9572f
Showing 1 changed file with 19 additions and 8 deletions.
tools/rebalance-corenrn-data.py
```diff
@@ -12,6 +12,7 @@
 import heapq
 import itertools
 import logging
+import math
 import os
 import sys
```
```diff
@@ -39,6 +40,9 @@ def distribute_dat_to_bucket(dat_entry, size, buckets, bucket_sizes):
 def redistribute_files_dat(files_dat_file, n_buckets, max_entries=None, show_stats=False):
     """
     Read and process each entry from the dat file and distribute them into buckets.
+    If max_entries is not set (None), respect the number of entries from the header.
+    If the user sets it to 0, use all entries from the file (disregarding the header).
     """
     base_dir = os.path.dirname(files_dat_file)
     metadata = {}
```
```diff
@@ -47,16 +51,17 @@ def redistribute_files_dat(files_dat_file, n_buckets, max_entries=None, show_stats=False):
     with open(files_dat_file, "r") as file:
         # read header
         metadata["version"] = file.readline().strip()
-        n_entries = file.readline().strip()
+        metadata["n_files"] = file.readline().strip()
 
-        metadata["n_files"] = max_entries or n_entries
+        if max_entries is None:
+            max_entries = int(metadata["n_files"])
 
         # read all dat entries
         dat_entries = file.readlines()
 
-        if (n_files := int(metadata["n_files"])) < len(dat_entries):
-            logging.warning("files.dat: processing reduced number of entries: %d", n_files)
-            dat_entries = dat_entries[:n_files]
+        if 0 < max_entries < len(dat_entries):
+            logging.warning("files.dat: processing reduced number of entries: %d", max_entries)
+            dat_entries = dat_entries[:max_entries]
 
         logging.info("Distributing files into %d buckets...", n_buckets)
```
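The chained comparison is what makes `0` mean "use everything": `0 < max_entries` is already false when `max_entries` is 0, so the slice is skipped. A quick check:

```python
>>> max_entries = 0
>>> 0 < max_entries < 1000   # 0 < 0 is False: no truncation
False
>>> max_entries = 5
>>> 0 < max_entries < 1000   # positive and below the file size: truncate to 5
True
```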
```diff
@@ -110,12 +115,17 @@ def batch(iterable, first=0):
                 yield group + [CORENRN_SKIP_MARK] * (ranks_per_machine - len(group))
                 break
             yield group
-            first, last = last, last + 40
+            first, last = last, last + ranks_per_machine
             group = iterable[first:last]
 
+    # compute max number of cell groups per rank so we know the n_files in the header
+    max_len = max(len(m) for m in buckets)
+    max_groups_rank = math.ceil(max_len / ranks_per_machine)
+    total_entries = max_groups_rank * ranks_per_machine * len(buckets)
+
     with open(output_file, "w") as out:
         print(infos["version"], file=out)
-        print(infos["n_files"], file=out)
+        print(total_entries, file=out)
 
         for buckets in itertools.zip_longest(*[batch(m) for m in buckets]):
             for entries in buckets:
```
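
Since every machine now writes full groups of `ranks_per_machine` entries (short groups are padded with `CORENRN_SKIP_MARK`), the record count written to the header must be the padded total rather than the original one. A worked example with made-up numbers (the bucket sizes and rank count below are hypothetical):

```python
import math

bucket_lens = [20, 14, 9]   # hypothetical cell-group counts per bucket
ranks_per_machine = 8       # hypothetical ranks per machine

max_len = max(bucket_lens)                                # 20
max_groups_rank = math.ceil(max_len / ranks_per_machine)  # ceil(20 / 8) = 3
total_entries = max_groups_rank * ranks_per_machine * len(bucket_lens)
print(total_entries)                                      # 3 * 8 * 3 = 72
```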
```diff
@@ -149,7 +159,8 @@ def main():
         "--max-entries",
         type=int,
         default=None,
-        help="Consider only the first N entries of the input file",
+        help="Consider only the first N entries of the input file. "
+        "To force using all data and disregard the header, set to 0",
     )
     parser.add_argument(
         "--output-file",
```
