A collection of fixes to improve performance and stability #34

Merged on Oct 28, 2024 (10 commits)
35 changes: 29 additions & 6 deletions graph/graph.py
@@ -17,7 +17,7 @@ def init_matplotlib(args):
fatal(f"Cannot load matplotlib backend engine {args.engine}")


GRAPH_TYPES = ["perf", "perf_watt", "watts"]
GRAPH_TYPES = ["perf", "perf_watt", "watts", "cpu_clock"]


class Graph:
@@ -260,9 +260,32 @@ def generic_graph(
time_serie.append(time)
# Collect all components mean value
for component in components:
if component.get_name() not in data_serie:
data_serie[component.get_name()] = []
data_serie[component.get_name()].append(component.get_mean()[sample])
if component.get_full_name() not in data_serie:
data_serie[component.get_full_name()] = []
# If we are missing some datapoints...
if len(component.get_mean()) <= sample:
# If the user didn't explicitly agree to missing datapoints being replaced, let's be fatal
if not args.ignore_missing_datapoint:
fatal(
f"{trace.get_name()}/{bench.get_bench_name()}: {component.get_full_name()} is missing the {sample+1}th data point.\
Use --ignore-missing-datapoint to ignore this case. Generated graphs will be partially incorrect."
)
else:
# The user is fine with missing data being replaced.
# Let's do that so we can render things properly.

# Let's pick the last known value
if args.ignore_missing_datapoint == "last":
data_serie[component.get_full_name()].append(
component.get_mean()[-1]
)
else:
# Replace it with a zero
data_serie[component.get_full_name()].append(0)
else:
data_serie[component.get_full_name()].append(
component.get_mean()[sample]
)

if second_axis:
for _, entry in bench.get_monitoring_metric(second_axis).items():
@@ -291,8 +314,8 @@ def generic_graph(
graph.get_ax2().plot(x_serie, y2_serie, "", label=data2_item, marker="o")

for component in components:
y_serie = np.array(data_serie[component.get_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_name())
y_serie = np.array(data_serie[component.get_full_name()])[order]
graph.get_ax().plot(x_serie, y_serie, "", label=component.get_full_name())

graph.prepare_axes(
30,
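Note on graph/graph.py: the handling above reduces to a small rule: use the requested sample when it exists, otherwise either stop or substitute a value according to --ignore-missing-datapoint. A minimal standalone sketch of that rule (the helper name and error handling are illustrative, not the project's API):

def pick_sample(values: list[float], sample: int, mode: str = "") -> float:
    """Return values[sample], padding a short series per --ignore-missing-datapoint."""
    if sample < len(values):
        return values[sample]
    if not mode:
        raise RuntimeError(
            f"missing data point #{sample + 1}; use --ignore-missing-datapoint to pad it"
        )
    # mode == "last" repeats the last known value, "zero" pads with 0
    return values[-1] if mode == "last" else 0.0

print(pick_sample([2.0, 3.0], 1))           # -> 3.0
print(pick_sample([2.0, 3.0], 5, "last"))   # -> 3.0 (last known value)
print(pick_sample([2.0, 3.0], 5, "zero"))   # -> 0.0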
29 changes: 21 additions & 8 deletions graph/hwgraph.py
@@ -94,14 +94,20 @@ def graph_monitoring_metrics(args, trace: Trace, bench_name: str, output_dir) ->
metrics = bench.get_component(Metrics.MONITOR, metric_name)
if metrics:
for metric in metrics:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)
# If a metric has no measure, let's ignore it
if len(metrics[metric].get_samples()) == 0:
print(
f"{bench_name}: No samples found in {metric_name}.{metric}, ignoring metric."
)
else:
rendered_graphs += yerr_graph(
args,
output_dir,
bench,
Metrics.MONITOR,
metrics[metric],
prefix=f"{metric_name}.",
)

return rendered_graphs

@@ -344,6 +350,13 @@ def main():
action="store_true",
help="Enable verbose mode",
)

parser_graph.add_argument(
"--ignore-missing-datapoint",
choices=["zero", "last"],
default="",
help="Replace a missing datapoint instead of stopping the rendering. Could be by a zero or the last known value.",
)
parser_graph.set_defaults(func=render_traces)

parser_list = subparsers.add_parser(
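Note on graph/hwgraph.py: the new option doubles as an on/off switch and a mode selector; the empty-string default keeps the strict, fatal behaviour, while "zero" or "last" both enables the padding and picks how it is done. A self-contained sketch of that argparse pattern (the parser wiring here is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--ignore-missing-datapoint",
    choices=["zero", "last"],
    default="",  # falsy default keeps the strict behaviour: missing data stays fatal
    help="Replace a missing datapoint instead of stopping the rendering.",
)

args = parser.parse_args([])
assert not args.ignore_missing_datapoint            # strict mode
args = parser.parse_args(["--ignore-missing-datapoint", "last"])
assert args.ignore_missing_datapoint == "last"      # pad with the last known value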
25 changes: 25 additions & 0 deletions graph/scaling.py
@@ -25,6 +25,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt = {} # type: dict[str, dict[str, Any]]
aggregated_watt_err = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock = {} # type: dict[str, dict[str, Any]]
aggregated_cpu_clock_err = {} # type: dict[str, dict[str, Any]]
workers = {} # type: dict[str, list]
logical_core_per_worker = []
perf_list, unit = benches[emp]["metrics"]
@@ -41,6 +43,8 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf] = {}
aggregated_watt[perf] = {}
aggregated_watt_err[perf] = {}
aggregated_cpu_clock[perf] = {}
aggregated_cpu_clock_err[perf] = {}
# For every trace file given at the command line
for trace in args.traces:
workers[trace.get_name()] = []
@@ -63,13 +67,17 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
aggregated_perfs_watt[perf][trace.get_name()] = []
aggregated_watt[perf][trace.get_name()] = []
aggregated_watt_err[perf][trace.get_name()] = []
aggregated_cpu_clock[perf][trace.get_name()] = []
aggregated_cpu_clock_err[perf][trace.get_name()] = []

bench.add_perf(
perf,
traces_perf=aggregated_perfs[perf][trace.get_name()],
perf_watt=aggregated_perfs_watt[perf][trace.get_name()],
watt=aggregated_watt[perf][trace.get_name()],
watt_err=aggregated_watt_err[perf][trace.get_name()],
cpu_clock=aggregated_cpu_clock[perf][trace.get_name()],
cpu_clock_err=aggregated_cpu_clock_err[perf][trace.get_name()],
)

# Let's render all graphs types
@@ -94,6 +102,13 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
outfile = f"scaling_watt_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Watts"
y_source = aggregated_watt
elif "cpu_clock" in graph_type:
graph_type_title = (
f"Scaling {graph_type}: {args.traces[0].get_metric_name()}"
)
outfile = f"scaling_cpu_clock_{clean_perf}_{bench.get_title_engine_name().replace(' ','_')}"
y_label = "Mhz"
y_source = aggregated_cpu_clock
else:
graph_type_title = (
f"Scaling {graph_type}: {bench.get_title_engine_name()}"
@@ -164,6 +179,16 @@ def scaling_graph(args, output_dir, job: str, traces_name: list) -> int:
capsize=4,
label=trace_name,
)
elif y_source == aggregated_cpu_clock:
graph.get_ax().errorbar(
x_serie,
y_serie,
yerr=np.array(aggregated_cpu_clock_err[perf][trace_name]).T,
ecolor=e_color,
color=color_name,
capsize=4,
label=trace_name,
)
else:
graph.get_ax().plot(
x_serie,
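Note on graph/scaling.py: the cpu_clock error bars are asymmetric, one (mean - min, max - mean) tuple per point, and the transpose turns that list into the (2, N) array matplotlib expects (row 0 = lower error, row 1 = upper error). A minimal sketch with made-up values:

import matplotlib
matplotlib.use("Agg")  # headless backend for this sketch
import matplotlib.pyplot as plt
import numpy as np

x = [1, 2, 4, 8]                               # e.g. number of workers
clock = [3450.0, 3400.0, 3300.0, 3100.0]       # mean CPU clock in MHz
err = [(120.0, 80.0), (150.0, 60.0), (200.0, 90.0), (300.0, 50.0)]  # (mean-min, max-mean)

fig, ax = plt.subplots()
ax.errorbar(x, clock, yerr=np.array(err).T, capsize=4, label="trace")
ax.set_ylabel("MHz")
ax.legend()
fig.savefig("scaling_cpu_clock_sketch.png")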
39 changes: 38 additions & 1 deletion graph/trace.py
@@ -82,7 +82,7 @@ def load_monitoring(self):
mm = Temperature(measure, original_measure["unit"])
else:
mm = MonitorMetric(measure, original_measure["unit"])
mm.load_from_dict(original_measure)
mm.load_from_dict(original_measure, measure)
self.metrics[metric][component_family][measure] = mm
else:
fatal(f"Unexpected {metric} in monitoring")
@@ -232,6 +232,8 @@ def add_perf(
perf_watt=None,
watt=None,
watt_err=None,
cpu_clock=None,
cpu_clock_err=None,
index=None,
) -> None:
"""Extract performance and power efficiency"""
@@ -316,6 +318,41 @@ def add_perf(
watt_err.append(metric)
else:
watt_err[index] = metric

if cpu_clock is not None:
mm = self.get_monitoring_metric(Metrics.FREQ)
mean_values = []
min_values = []
max_values = []

for freq_metric in mm:
if freq_metric != "CPU":
continue
# We have to compute metrics across all of the system's cores
for core in mm[freq_metric]:
# Aggregate as min of mins, mean of means, max of maxes
min_values.append(min(mm[freq_metric][core].get_min()))
mean_values.append(mean(mm[freq_metric][core].get_mean()))
max_values.append(max(mm[freq_metric][core].get_max()))
min_value = min(min_values)
mean_value = mean(mean_values)
max_value = max(max_values)

if index is None:
cpu_clock.append(mean_value)
else:
cpu_clock[index] = mean_value

# If we want to keep the error distribution to plot error bars
if cpu_clock_err is not None:
metric = (mean_value - min_value, max_value - mean_value)
if index is None:
cpu_clock_err.append(metric)
else:
cpu_clock_err[index] = metric

except ValueError:
fatal(f"No {perf} found in {self.get_bench_name()}")

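Note on graph/trace.py: the CPU clock aggregation takes the min of per-core minima, the mean of per-core means and the max of per-core maxima, then derives the asymmetric error tuple from them. A small worked sketch with made-up per-core statistics:

from statistics import mean

# Per-core frequency series, as returned by get_min()/get_mean()/get_max()
cores = {
    "core0": {"min": [3000, 3100], "mean": [3400, 3450], "max": [3600, 3700]},
    "core1": {"min": [2900, 3000], "mean": [3350, 3400], "max": [3550, 3650]},
}

min_value = min(min(c["min"]) for c in cores.values())      # min of mins   -> 2900
mean_value = mean(mean(c["mean"]) for c in cores.values())  # mean of means -> 3400.0
max_value = max(max(c["max"]) for c in cores.values())      # max of maxes  -> 3700

cpu_clock_point = mean_value
cpu_clock_err = (mean_value - min_value, max_value - mean_value)  # (500.0, 300.0)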
4 changes: 2 additions & 2 deletions hwbench/bench/monitoring.py
@@ -192,8 +192,8 @@ def next_iter():
start_time = self.get_monotonic_clock()
if self.turbostat:
# Turbostat will run for the whole duration of this loop
# We just retract a 2/10th of second to ensure it will not overdue
self.turbostat.run(interval=(precision - 0.2))
# We just subtract half a second to ensure it will not overrun
self.turbostat.run(interval=(precision - 0.5))
# Let's monitor the time spent at monitoring the CPU
self.get_metric(Metrics.MONITOR)["CPU"]["Polling"].add(
(self.get_monotonic_clock() - start_time) * 1e-6
9 changes: 8 additions & 1 deletion hwbench/bench/monitoring_structs.py
@@ -26,7 +26,8 @@ def __post_init__(self):
# If a value is given, let's add it
self.add(self.value)

def load_from_dict(self, input: dict):
def load_from_dict(self, input: dict, full_name: str):
self.full_name = full_name
self.name = str(input.get("name"))
self.unit = str(input.get("unit"))
self.mean = input.get("mean") # type: ignore[assignment]
@@ -35,6 +36,12 @@ def load_from_dict(self, input: dict):
self.stdev = input.get("stdev") # type: ignore[assignment]
self.samples = input.get("samples") # type: ignore[assignment]

def get_full_name(self):
"""Return the metric full name"""
if self.full_name:
return self.full_name
return self.name

def get_name(self):
"""Return the metric name"""
return self.name
20 changes: 16 additions & 4 deletions hwbench/environment/vendors/dell/dell.py
@@ -11,6 +11,8 @@


class IDRAC(BMC):
oem_endpoint = ""

def get_thermal(self):
return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Thermal")

@@ -42,17 +44,27 @@ def get_power(self):
return self.get_redfish_url("/redfish/v1/Chassis/System.Embedded.1/Power")

def get_oem_system(self):
# If we already found the proper endpoint, let's reuse it.
if self.oem_endpoint:
return self.get_redfish_url(
self.oem_endpoint,
log_failure=False,
)

new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1"
oem = self.get_redfish_url(
"/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1",
new_oem_endpoint,
log_failure=False,
)
# If not System.Embedded, let's use the default attributes
if "Attributes" not in oem:
oem = self.get_redfish_url(
"/redfish/v1/Managers/iDRAC.Embedded.1/Attributes"
)
new_oem_endpoint = "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes"
oem = self.get_redfish_url(new_oem_endpoint)
if "Attributes" not in oem:
h.fatal("Cannot find Dell OEM metrics, please fill a bug.")

# Let's save the endpoint to avoid trying all of them at every run
self.oem_endpoint = new_oem_endpoint
return oem

def read_power_consumption(
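Note on hwbench/environment/vendors/dell/dell.py: the change memoizes whichever OEM endpoint answered first, so later calls skip the probing. A reduced, self-contained sketch of the pattern (the get() stub stands in for get_redfish_url()):

class OemProbe:
    oem_endpoint = ""  # class default, overwritten per instance once an endpoint works

    def __init__(self, responses: dict):
        self.responses = responses  # canned answers, standing in for Redfish calls

    def get(self, url: str) -> dict:
        return self.responses.get(url, {})

    def get_oem_system(self) -> dict:
        if self.oem_endpoint:  # fast path: reuse the endpoint found earlier
            return self.get(self.oem_endpoint)
        for candidate in (
            "/redfish/v1/Managers/iDRAC.Embedded.1/Oem/Dell/DellAttributes/System.Embedded.1",
            "/redfish/v1/Managers/iDRAC.Embedded.1/Attributes",
        ):
            oem = self.get(candidate)
            if "Attributes" in oem:
                self.oem_endpoint = candidate  # probe once, remember the winner
                return oem
        raise RuntimeError("Cannot find Dell OEM metrics")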
61 changes: 48 additions & 13 deletions hwbench/environment/vendors/hpe/hpe.py
@@ -1,5 +1,7 @@
import logging
import pathlib
import re
from functools import cache
from typing import cast

from ....bench.monitoring_structs import (
@@ -84,24 +86,45 @@ def add(self, name):
def get_power(self):
return self.get_redfish_url("/redfish/v1/Chassis/1/Power/")

@cache
def __warn_psu(self, psu_number, message):
logging.error(f"PSU {psu_number}: {message}")

def read_power_supplies(
self, power_supplies: dict[str, dict[str, Power]] = {}
) -> dict[str, dict[str, Power]]:
"""Return power supplies power from server"""
if str(PowerContext.BMC) not in power_supplies:
power_supplies[str(PowerContext.BMC)] = {} # type: ignore[no-redef]
for psu in self.get_power().get("PowerSupplies"):
# Both PSUs are named the same (HpeServerPowerSupply)
psu_position = str(psu["Oem"]["Hpe"]["BayNumber"])
# All PSUs are named the same (HpeServerPowerSupply)
# Let's update it to have a unique name
name = psu["Name"] + str(psu["Oem"]["Hpe"]["BayNumber"])
psu_name = "PS" + str(psu["Oem"]["Hpe"]["BayNumber"])
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_supplies),
PowerContext.BMC,
Power(psu_name),
name,
psu["Oem"]["Hpe"]["AveragePowerOutputWatts"],
)
psu_status = psu.get("Status")
if psu_status:
psu_state = psu_status.get("State")
if psu_state:
# We only consider healthy PSUs
if str(psu_state).lower() == "enabled":
name = psu["Name"] + psu_position
psu_name = "PS" + psu_position
super().add_monitoring_value(
cast(dict[str, dict[str, MonitorMetric]], power_supplies),
PowerContext.BMC,
Power(psu_name),
name,
psu["Oem"]["Hpe"]["AveragePowerOutputWatts"],
)
else:
# Let's inform the user that the PSU is reported as not healthy
self.__warn_psu(
psu_position,
f'marked as {psu_state} in {psu_status.get("Health")} state',
)
continue

# Let's inform the user that no status was found, maybe a parsing or fw issue?
self.__warn_psu(psu_position, "no status or state found!")

return power_supplies

@@ -145,11 +168,23 @@ def read_power_consumption(
)
return power_consumption

def get_oem_chassis(self):
return self.get_redfish_url(
"/redfish/v1/Chassis/enclosurechassis/", log_failure=False
@cache
def is_multinode_chassis(self) -> bool:
return (
True
if self.get_redfish_url(
"/redfish/v1/Chassis/enclosurechassis/", log_failure=False
)
else False
)

def get_oem_chassis(self):
if self.is_multinode_chassis():
return self.get_redfish_url(
"/redfish/v1/Chassis/enclosurechassis/", log_failure=False
)
return {}


class Hpe(Vendor):
def __init__(self, out_dir, dmi, monitoring_config_filename):
Expand Down
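Note on hwbench/environment/vendors/hpe/hpe.py: decorating the warning helper with functools.cache deduplicates identical messages, so a PSU that keeps reporting the same problem is only logged once per run. A standalone sketch of the idea (the real code decorates a method, so self is part of the cache key):

import logging
from functools import cache

@cache
def warn_psu(psu_number: str, message: str) -> None:
    # cache keys on the arguments: an identical warning for the same PSU
    # is only emitted the first time it occurs
    logging.error(f"PSU {psu_number}: {message}")

warn_psu("1", "no status or state found!")
warn_psu("1", "no status or state found!")             # cached, not logged again
warn_psu("2", "marked as Absent in Critical state")    # different args, logged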