Rebalance tool: Fix record count. Enable using all files.dat entries (#213)

## Scope

This change makes it possible to use all data from a files.dat file, even when its header specifies a different entry count.

This is enabled with `--max-entries=0`.

We also update the header of the output file accordingly: regardless of what the original header said, the output header specifies the number of records actually written to the file.
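
For illustration, here is a minimal standalone sketch of the new selection rule (not the tool's actual code: the function name `select_entries` is made up, but the files.dat layout of a version line, a count line, and one entry per line follows the diff below):

```python
def select_entries(path, max_entries=None):
    """Sketch of the --max-entries rule: None -> trust the header count,
    0 -> use every entry, N > 0 -> use at most the first N entries."""
    with open(path, "r") as f:
        version = f.readline().strip()        # header line 1: format version
        n_files = int(f.readline().strip())   # header line 2: record count
        entries = f.readlines()               # remaining lines: one entry each
    if max_entries is None:
        max_entries = n_files
    if 0 < max_entries < len(entries):        # 0 never truncates
        entries = entries[:max_entries]
    return version, entries
```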

## Testing

Tested with `proj134/home/king/BBPP134-917/p_MultiCycle_Support`.

---------

Co-authored-by: Antonio Bellotta <[email protected]>
ferdonline and st4rl3ss authored Dec 2, 2024
1 parent 073956e commit ea9572f
Showing 1 changed file with 19 additions and 8 deletions.
tools/rebalance-corenrn-data.py
```diff
@@ -12,6 +12,7 @@
 import heapq
 import itertools
 import logging
+import math
 import os
 import sys
```
```diff
@@ -39,6 +40,9 @@ def distribute_dat_to_bucket(dat_entry, size, buckets, bucket_sizes):
 def redistribute_files_dat(files_dat_file, n_buckets, max_entries=None, show_stats=False):
     """
     Read and process each entry from the dat file and distribute them into buckets.
+    If max_entries is not set (None), respect the number of entries from the header.
+    If the user sets it to 0, use all entries from the file (disregarding the header).
     """
     base_dir = os.path.dirname(files_dat_file)
     metadata = {}
```
```diff
@@ -47,16 +51,17 @@ def redistribute_files_dat(files_dat_file, n_buckets, max_entries=None, show_stats=False):
     with open(files_dat_file, "r") as file:
         # read header
         metadata["version"] = file.readline().strip()
-        n_entries = file.readline().strip()
+        metadata["n_files"] = file.readline().strip()
 
-        metadata["n_files"] = max_entries or n_entries
+        if max_entries is None:
+            max_entries = int(metadata["n_files"])
 
         # read all dat entries
         dat_entries = file.readlines()
 
-        if (n_files := int(metadata["n_files"])) < len(dat_entries):
-            logging.warning("files.dat: processing reduced number of entries: %d", n_files)
-            dat_entries = dat_entries[:n_files]
+        if 0 < max_entries < len(dat_entries):
+            logging.warning("files.dat: processing reduced number of entries: %d", max_entries)
+            dat_entries = dat_entries[:max_entries]
 
         logging.info("Distributing files into %d buckets...", n_buckets)
```
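The chained comparison is what makes `0` mean "use everything": `0 < max_entries` is already false when `max_entries` is 0, so the slice is skipped. A quick check:

```python
>>> max_entries = 0
>>> 0 < max_entries < 1000   # 0 < 0 is False: no truncation
False
>>> max_entries = 5
>>> 0 < max_entries < 1000   # positive and below the file size: truncate to 5
True
```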
```diff
@@ -110,12 +115,17 @@ def batch(iterable, first=0):
                 yield group + [CORENRN_SKIP_MARK] * (ranks_per_machine - len(group))
                 break
             yield group
-            first, last = last, last + 40
+            first, last = last, last + ranks_per_machine
             group = iterable[first:last]
 
+    # compute max number of cell groups per rank so we know the n_files in the header
+    max_len = max(len(m) for m in buckets)
+    max_groups_rank = math.ceil(max_len / ranks_per_machine)
+    total_entries = max_groups_rank * ranks_per_machine * len(buckets)
+
     with open(output_file, "w") as out:
         print(infos["version"], file=out)
-        print(infos["n_files"], file=out)
+        print(total_entries, file=out)
 
         for buckets in itertools.zip_longest(*[batch(m) for m in buckets]):
             for entries in buckets:
```
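
Since every machine now writes full groups of `ranks_per_machine` entries (short groups are padded with `CORENRN_SKIP_MARK`), the record count written to the header must be the padded total rather than the original one. A worked example with made-up numbers (the bucket sizes and rank count below are hypothetical):

```python
import math

bucket_lens = [20, 14, 9]   # hypothetical cell-group counts per bucket
ranks_per_machine = 8       # hypothetical ranks per machine

max_len = max(bucket_lens)                                # 20
max_groups_rank = math.ceil(max_len / ranks_per_machine)  # ceil(20 / 8) = 3
total_entries = max_groups_rank * ranks_per_machine * len(bucket_lens)
print(total_entries)                                      # 3 * 8 * 3 = 72
```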
```diff
@@ -149,7 +159,8 @@ def main():
         "--max-entries",
         type=int,
         default=None,
-        help="Consider only the first N entries of the input file",
+        help="Consider only the first N entries of the input file. "
+        "To force using all data and disregard the header, set to 0",
     )
     parser.add_argument(
         "--output-file",
```
