From 0a653758715757ce42fe58c99728cdb7c98dab23 Mon Sep 17 00:00:00 2001
From: pineapple-cat
Date: Fri, 8 Dec 2023 15:15:28 +0000
Subject: [PATCH] Style fixes (trimming long lines) + restored accidentally removed units check.

---
 post-processing/post_processing.py | 119 +++++++++++++++++++----------
 1 file changed, 80 insertions(+), 39 deletions(-)

diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py
index 908f81a6..85fd2cc1 100644
--- a/post-processing/post_processing.py
+++ b/post-processing/post_processing.py
@@ -28,7 +28,8 @@ def __init__(self, debug=False, verbose=False):
 
     def run_post_processing(self, log_path, config):
         """
-        Return a dataframe containing the information passed to a plotting script and produce relevant graphs.
+        Return a dataframe containing the information passed to a plotting script
+        and produce relevant graphs.
 
         Args:
             log_path: str, path to a log file or a directory containing log files.
@@ -42,12 +43,14 @@ def run_post_processing(self, log_path, config):
                 raise RuntimeError("Perflog file name provided should have a .log extension.")
             log_files = [log_path]
         elif os.path.isdir(log_path):
-            log_files_temp = [os.path.join(root, file) for root, _, files in os.walk(log_path) for file in files]
+            log_files_temp = [os.path.join(root, file) for root, _, files in os.walk(log_path)
+                              for file in files]
             for file in log_files_temp:
                 if os.path.splitext(file)[1] == ".log":
                     log_files.append(file)
             if len(log_files) == 0:
-                raise RuntimeError("No perflogs found in this path. Perflogs should have a .log extension.")
+                raise RuntimeError(
+                    "No perflogs found in this path. Perflogs should have a .log extension.")
         else:
             raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), log_path)
 
@@ -65,7 +68,8 @@ def run_post_processing(self, log_path, config):
                 df = pd.concat([df, temp], ignore_index=True)
             except KeyError as e:
                 if self.debug:
-                    print("Discarding %s:" %os.path.basename(file), type(e).__name__ + ":", e.args[0], e.args[1])
+                    print("Discarding %s:" %os.path.basename(file),
+                          type(e).__name__ + ":", e.args[0], e.args[1])
         print("")
         if df.empty:
             raise FileNotFoundError(errno.ENOENT, "Could not find a valid perflog in path", log_path)
@@ -83,7 +87,8 @@ def run_post_processing(self, log_path, config):
         series_filters = [[s[0], "==", s[1]] for s in series]
         # check acceptable number of series
         if len(set(series_columns)) > 1:
-            raise RuntimeError("Currently supporting grouping of series by only one column. Please use a single column name in your series configuration.")
+            raise RuntimeError("Currently supporting grouping of series by only one column. \
+                               Please use a single column name in your series configuration.")
         # add series columns to dataframe column list
         for c in series_columns:
             if c not in columns:
@@ -130,13 +135,14 @@ def run_post_processing(self, log_path, config):
                 conversion_type = "float64"
             elif pd.api.types.is_integer_dtype(conversion_type):
                 # all integers treated as Int64 (nullable)
-                # note: default pandas integer type is int64 (not nullable)
+                # NOTE: default pandas integer type is int64 (not nullable)
                 conversion_type = "Int64"
             elif pd.api.types.is_datetime64_any_dtype(conversion_type):
                 # all datetimes treated as datetime64[ns] (nullable)
                 conversion_type = "datetime64[ns]"
             else:
-                raise RuntimeError("Unsupported user-specified type '{0}' for column '{1}'.".format(conversion_type, col))
+                raise RuntimeError("Unsupported user-specified type '{0}' for column '{1}'." \
+                                   .format(conversion_type, col))
 
             # skip type conversion if column is already the desired type
             if conversion_type == df[col].dtype:
@@ -177,7 +183,9 @@ def run_post_processing(self, log_path, config):
         num_x_data_points = series_combinations * len(set(df[config["x_axis"]["value"]][mask]))
         # check expected number of rows
         if num_filtered_rows > num_x_data_points:
-            raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask])
+            raise RuntimeError("Unexpected number of rows ({0}) does not match \
+                               number of unique x-axis values per series ({1})" \
+                               .format(num_filtered_rows, num_x_data_points), df[columns][mask])
 
         scaling_column = None
         scaling_series_mask = None
@@ -191,29 +199,37 @@ def run_post_processing(self, log_path, config):
                 scaling_column = df[config["y_axis"]["scaling"]["column"]["name"]].copy()
                 # get mask of scaling series
                 if config["y_axis"]["scaling"]["column"].get("series") is not None:
-                    scaling_series_mask = self.row_filter(series_filters[config["y_axis"]["scaling"]["column"]["series"]], df)
+                    scaling_series_mask = self.row_filter(
+                        series_filters[config["y_axis"]["scaling"]["column"]["series"]], df)
                 # get mask of scaling x-value
                 if config["y_axis"]["scaling"]["column"].get("x_value"):
-                    scaling_x_value_mask = df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"]
+                    scaling_x_value_mask = (
+                        df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"])
             # check custom value is not zero
             elif not config["y_axis"]["scaling"].get("custom"):
-                raise RuntimeError("Invalid custom scaling value (cannot divide by {0}).".format(config["y_axis"]["scaling"].get("custom")))
+                raise RuntimeError("Invalid custom scaling value (cannot divide by {0})." \
+                                   .format(config["y_axis"]["scaling"].get("custom")))
 
         # apply data transformation per series
         if series_filters:
             for f in series_filters:
                 m = self.row_filter(f, df)
-                df[mask & m] = self.transform_axis(df[mask & m], mask & m, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask)
+                df[mask & m] = self.transform_axis(
+                    df[mask & m], mask & m, config["y_axis"], scaling_column,
+                    scaling_series_mask, scaling_x_value_mask)
         # apply data transformation to all data
         else:
-            df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask)
+            df[mask] = self.transform_axis(
+                df[mask], mask, config["y_axis"], scaling_column,
+                scaling_series_mask, scaling_x_value_mask)
 
         print("Selected dataframe:")
         print(df[columns][mask])
 
         # call a plotting script
-        self.plot_generic(config["title"], df[columns][mask], config["x_axis"], config["y_axis"], series_filters)
+        self.plot_generic(
+            config["title"], df[columns][mask], config["x_axis"], config["y_axis"], series_filters)
 
         if self.debug & self.verbose:
             print("")
 
@@ -252,7 +268,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
         # combine group names for later plotting with groupby
         index_group_col = "_".join(groups)
         # group by group names (or just x-axis if no other groups are present)
-        grouped_df = df.groupby(x_column, sort=False) if len(groups) == 1 else df.groupby(groups, sort=False)
+        grouped_df = df.groupby(x_column, sort=False) if len(groups) == 1 \
+            else df.groupby(groups, sort=False)
 
         if self.debug:
             print("")
@@ -267,13 +284,17 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
             else math.ceil(np.nanmax(df[y_column])*1.2)
 
         # create html file to store plot in
-        output_file(filename=os.path.join(Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title)
+        output_file(filename=os.path.join(
+            Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title)
 
         # create plot
-        plot = figure(x_range=grouped_df, y_range=(min_y, max_y), title=title, width=800, toolbar_location="above")
+        plot = figure(x_range=grouped_df, y_range=(min_y, max_y), title=title,
+                      width=800, toolbar_location="above")
 
         # configure tooltip
-        plot.add_tools(HoverTool(tooltips=[(y_label, "@{0}_mean".format(y_column)
-                                            + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype) else ""))],
+        plot.add_tools(HoverTool(tooltips=
+                                 [(y_label, "@{0}_mean".format(y_column)
+                                   + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype)
+                                      else ""))],
                                  formatters={"@{0}_mean".format(y_column) : "printf"}))
 
         # sort x-axis values in descending order (otherwise default sort is ascending)
@@ -289,7 +310,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
         colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), dtype=last_group_dtype))]
         # divide and assign colours
-        index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), factors=colour_factors, start=len(groups)-1, end=len(groups))
+        index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)),
+                                 factors=colour_factors, start=len(groups)-1, end=len(groups))
         # add legend labels to data source
         data_source = ColumnDataSource(grouped_df).data
         legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "),
@@ -300,7 +322,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
         # create legend outside plot
         plot.add_layout(Legend(), "right")
         # add bars
-        plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source, line_color="white", fill_color=index_cmap, legend_group="legend_labels", hover_alpha=0.9)
+        plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source,
+                  line_color="white", fill_color=index_cmap, legend_group="legend_labels", hover_alpha=0.9)
         # add labels
         plot.xaxis.axis_label = x_label
         plot.yaxis.axis_label = y_label
@@ -333,7 +356,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
 
     def row_filter(self, filter, df: pd.DataFrame):
         """
-        Return a dataframe mask based on a filter condition. The filter is a list that contains a column name, an operator, and a value (e.g. ["flops_value", ">=", 1.0]).
+        Return a dataframe mask based on a filter condition. The filter is a list that
+        contains a column name, an operator, and a value (e.g. ["flops_value", ">=", 1.0]).
 
         Args:
             filter: list, a condition based on which a dataframe is filtered.
@@ -368,7 +392,8 @@ def row_filter(self, filter, df: pd.DataFrame):
 
         return mask
 
-    def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, scaling_series_mask, scaling_x_value_mask):
+    def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column,
+                       scaling_series_mask, scaling_x_value_mask):
         """
         Divide axis values by specified values and reflect this change in the dataframe.
 
@@ -427,18 +452,25 @@ def read_args():
         Return parsed command line arguments.
     """
 
-    parser = argparse.ArgumentParser(description="Plot benchmark data. At least one perflog must be supplied.")
+    parser = argparse.ArgumentParser(description="Plot benchmark data. \
+                                     At least one perflog must be supplied.")
 
     # required positional arguments (log path, config path)
-    parser.add_argument("log_path", type=str, help="path to a perflog file or a directory containing perflog files")
-    parser.add_argument("config_path", type=str, help="path to a configuration file specifying what to plot")
+    parser.add_argument("log_path", type=str,
+                        help="path to a perflog file or a directory containing perflog files")
+    parser.add_argument("config_path", type=str,
+                        help="path to a configuration file specifying what to plot")
 
    # optional argument (plot type)
-    parser.add_argument("-p", "--plot_type", type=str, default="generic", help="type of plot to be generated (default: 'generic')")
+    parser.add_argument("-p", "--plot_type", type=str, default="generic",
+                        help="type of plot to be generated (default: 'generic')")
 
     # info dump flags
-    parser.add_argument("-d", "--debug", action="store_true", help="debug flag for printing additional information")
-    parser.add_argument("-v", "--verbose", action="store_true", help="verbose flag for printing more debug information (must be used in conjunction with the debug flag)")
+    parser.add_argument("-d", "--debug", action="store_true",
+                        help="debug flag for printing additional information")
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="verbose flag for printing more debug information \
+                              (must be used in conjunction with the debug flag)")
 
     return parser.parse_args()
 
@@ -483,9 +515,11 @@ def read_config(path):
 
     # check series length
     if config.get("series") is None:
-        raise KeyError("Missing series information (specify an empty list [] if there is only one series).")
+        raise KeyError(
+            "Missing series information (specify an empty list [] if there is only one series).")
     if len(config["series"]) == 1:
-        raise KeyError("Number of series must be >= 2 (specify an empty list [] if there is only one series).")
+        raise KeyError(
+            "Number of series must be >= 2 (specify an empty list [] if there is only one series).")
 
     # check filters are present
     if not config.get("filters"):
@@ -509,7 +543,8 @@ def read_perflog(path):
     Args:
         path: str, path to log file.
 
-    NB: This currently depends on having a non-default handlers_perflog.filelog.format in reframe's configuration. See code.
+    NB: This currently depends on having a non-default handlers_perflog.filelog.format
+    in reframe's configuration. See code.
 
     The returned dataframe will have columns for all fields in a performance log record
     except display name, extra resources, and env vars. Display name will be broken up
@@ -522,7 +557,8 @@ def read_perflog(path):
     REQUIRED_LOG_FIELDS = ["job_completion_time", r"\w+_value$", r"\w+_unit$", "display_name"]
 
     # look for required column matches
-    required_field_matches = [len(list(filter(re.compile(rexpr).match, df.columns))) > 0 for rexpr in REQUIRED_LOG_FIELDS]
+    required_field_matches = [len(list(filter(re.compile(rexpr).match, df.columns))) > 0
+                              for rexpr in REQUIRED_LOG_FIELDS]
     # check all required columns are present
     if False in required_field_matches:
         raise KeyError("Perflog missing one or more required fields", REQUIRED_LOG_FIELDS)
@@ -549,10 +585,13 @@ def read_perflog(path):
 
 def get_display_name_info(display_name):
     """
-    Return a tuple containing the test name and a dictionary of parameter names and their values from the given input string. The parameter dictionary may be empty if no parameters are present.
+    Return a tuple containing the test name and a dictionary of parameter names
+    and their values from the given input string. The parameter dictionary may be empty
+    if no parameters are present.
 
     Args:
-        display_name: str, expecting a format of <test_name> followed by zero or more %<param>=<value> pairs.
+        display_name: str, expecting a format of <test_name> followed by zero or more
+        %<param>=<value> pairs.
     """
 
     split_display_name = display_name.split(" %")
@@ -578,7 +617,8 @@ def insert_key_cols(df: pd.DataFrame, index, results):
 
 def get_axis_info(df: pd.DataFrame, axis, series_filters):
     """
-    Return the column name and label for a given axis. If a column name is supplied as units information, the actual units will be extracted from a dataframe.
+    Return the column name and label for a given axis. If a column name is supplied as
+    units information, the actual units will be extracted from a dataframe.
 
     Args:
         df: dataframe, data to plot.
@@ -592,8 +632,8 @@ def get_axis_info(df: pd.DataFrame, axis, series_filters):
     if axis.get("units").get("column"):
         unit_set = set(df[axis["units"]["column"]].dropna())
         # check all rows have the same units
-        #if len(unit_set) != 1:
-        #    raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set))
+        if len(unit_set) != 1:
+            raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set))
         units = next(iter(unit_set))
 
     # get scaling information
@@ -612,7 +652,8 @@ def get_axis_info(df: pd.DataFrame, axis, series_filters):
 
     # determine axis label
     label = "{0}{1}{2}".format(titlecase(col_name.replace("_", " ")),
-                               titlecase(" Scaled by {0}".format(scaling.replace("_", " "))) if scaling else "",
+                               titlecase(" Scaled by {0}".format(scaling.replace("_", " ")))
+                               if scaling else "",
                                " ({0})".format(units) if units else "")
 
     return col_name, label
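--
Usage sketch (a note for reviewers, not part of the patch): the reflowed
get_display_name_info() docstring above says the function returns a
(test name, parameter dict) tuple, splitting the display name on " %". A
minimal illustration of that behaviour; the benchmark name and parameters
here are made up, and string-typed parameter values are an assumption, since
the diff shows no type conversion:

    # run from the post-processing directory so the module is importable
    from post_processing import get_display_name_info

    name, params = get_display_name_info("ExampleBenchmark %tasks=2 %cpus_per_task=4")
    print(name)    # expected: ExampleBenchmark
    print(params)  # expected: {'tasks': '2', 'cpus_per_task': '4'}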