Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A collection of fixes to improve performance and stability #34

Merged
merged 10 commits into from
Oct 28, 2024
2 changes: 1 addition & 1 deletion graph/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def init_matplotlib(args):
fatal(f"Cannot load matplotlib backend engine {args.engine}")


GRAPH_TYPES = ["perf", "perf_watt", "watts"]
GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"]


class Graph:
Expand Down
25 changes: 25 additions & 0 deletions graph/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt_err = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]]
workers = {} # type: dict[str, list]
logical_core_per_worker = []
perf_list, unit = benches[emp]["metrics"]
Expand All @@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf] = {}
aggregated_watt[perf] = {}
aggregated_watt_err[perf] = {}
aggregated_cpu_clock[perf] = {}
aggregated_cpu_clock_err[perf] = {}
# For every trace file given at the command line
for trace in args.traces:
workers[trace.get_name()] = []
Expand All @@ -63,13 +67,17 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf][trace.get_name()] = []
aggregated_watt[perf][trace.get_name()] = []
aggregated_watt_err[perf][trace.get_name()] = []
aggregated_cpu_clock[perf][trace.get_name()] = []
aggregated_cpu_clock_err[perf][trace.get_name()] = []

bench.add_perf(
perf,
traces_perf=aggregated_perfs[perf][trace.get_name()],
perf_watt=aggregated_perfs_watt[perf][trace.get_name()],
watt=aggregated_watt[perf][trace.get_name()],
watt_err=aggregated_watt_err[perf][trace.get_name()],
cpu_clock=aggregated_cpu_clock[perf][trace.get_name()],
cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()],
)

# Let's render all graph types
Expand All @@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Watts"
y_source = aggregated_watt
elif "cpu_clock" in graph_type:
graph_type_title = (
f"Scaling {graph_type}: {args.traces[0].get_metric_name()}"
)
outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Mhz"
y_source = aggregated_cpu_clock
else:
graph_type_title = (
f"Scaling {graph_type}: {bench.get_title_engine_name()}"
Expand Down Expand Up @@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
capsize=4,
label=trace_name,
)
elif y_source == aggregated_cpu_clock:
graph.get_ax().errorbar(
x_serie,
y_serie,
yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T,
ecolor=e_color,
color=color_name,
capsize=4,
label=trace_name,
)
else:
graph.get_ax().plot(
x_serie,
Expand Down
37 changes: 37 additions & 0 deletions graph/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ def add_perf(
perf_watt=None,
watt=None,
watt_err=None,
cpu_clock=None,
cpu_clock_err=None,
index=None,
) -> None:
"""Extract performance and power efficiency"""
Expand Down Expand Up @@ -316,6 +318,41 @@ def add_perf(
watt_err.append(metric)
else:
watt_err[index] = metric

if cpu_clock is not None:
mm = self.get_monitoring_metric(Metrics.FREQ)
mean_values = []
min_values = []
max_values = []

for freq_metric in mm:
if freq_metric != "CPU":
continue
# We have to compute the metrics across all of the system's cores
for core in mm[freq_metric]:
# Open question: is aggregating per-core values as
# min of mins, mean of means and max of maxes
# the right way to summarize the whole system?
min_values.append(min(mm[freq_metric][core].get_min()))
mean_values.append(mean(mm[freq_metric][core].get_mean()))
max_values.append(max(mm[freq_metric][core].get_max()))
min_value = min(min_values)
mean_value = mean(mean_values)
max_value = max(max_values)

if index is None:
cpu_clock.append(mean_value)
else:
cpu_clock[index] = mean_value

# If we want to keep the error distribution to plot error bars
if cpu_clock_err is not None:
metric = (mean_value - min_value, max_value - mean_value)
if index is None:
cpu_clock_err.append(metric)
else:
cpu_clock_err[index] = metric

except ValueError:
fatal(f"No {perf} found in {self.get_bench_name()}")

Expand Down
Loading