From 706816f58b32a247e6b89af3c4ebcc49fde35f22 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 23 Oct 2023 17:07:33 +0100 Subject: [PATCH 01/35] Separated row filters into OR and AND categories. --- post-processing/post_processing.py | 18 ++++++++++-------- post-processing/post_processing_config.yaml | 4 +++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 10a5613e..0338dce7 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -82,16 +82,17 @@ def run_post_processing(self, log_path, config): # check acceptable number of series if len(set(series_columns)) > 1: raise RuntimeError("Currently supporting grouping of series by only one column. Please use a single column name in your series configuration.") - # add series columns to column list + # add series columns to dataframe column list for c in series_columns: if c not in columns: columns.append(c) - filters = config["filters"] + and_filters = config["filters"]["and"] + or_filters = config["filters"]["or"] # extract filter columns - filter_columns = [f[0] for f in filters] + filter_columns = [f[0] for f in and_filters] + [f[0] for f in or_filters] # gather all relevant columns - all_columns = columns + filter_columns + all_columns = set(columns + filter_columns) invalid_columns = [] # check for invalid columns @@ -138,12 +139,13 @@ def run_post_processing(self, log_path, config): mask = pd.Series(df.index.notnull()) # filter rows - if filters: - mask = reduce(op.and_, (self.row_filter(f, df) for f in filters)) + if and_filters: + mask = reduce(op.and_, (self.row_filter(f, df) for f in and_filters)) + if or_filters: + mask = mask & reduce(op.or_, (self.row_filter(f, df) for f in or_filters)) # apply series filters if series_filters: - series_mask = reduce(op.or_, (self.row_filter(f, df) for f in series_filters)) - mask = mask & series_mask + mask = mask & reduce(op.or_, (self.row_filter(f, df) for f in series_filters)) # ensure not all rows are filtered away if df[mask].empty: raise pd.errors.EmptyDataError("Filtered dataframe is empty", df[mask].index) diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml index 8ca87e69..f24e08a1 100644 --- a/post-processing/post_processing_config.yaml +++ b/post-processing/post_processing_config.yaml @@ -19,7 +19,9 @@ y_axis: # Optional row filters (specify an empty list [] if none are required). # Filter format: [column_name, operator, value] -filters: [["system", "==", "default"]] +filters: + and: [["system", "==", "default"]] + or: [] # Optional setting to display several plots in the same graph. # Number of series, if used, must be >=2 (specify an empty list [] if there is only one series). From 9a777d312ef98f8a4d4be5f294ebc37f78c5a89f Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 23 Oct 2023 17:11:41 +0100 Subject: [PATCH 02/35] Updated existing tests to account for filtering change. 
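The tests below exercise the new nested filter shape, `{"and": [...], "or": [...]}`, in place of the old flat list. As a minimal sketch of how the two lists combine into one row mask (column names and data here are invented for illustration, not taken from the sombrero logs):

```python
import operator as op
from functools import reduce

import pandas as pd

df = pd.DataFrame({"system": ["default", "default", "other"],
                   "tasks": [1, 2, 2]})

# AND conditions are intersected with each other...
and_mask = reduce(op.and_, [df["system"] == "default"])
# ...and the OR conditions form a union, intersected with the rest
or_mask = reduce(op.or_, [df["tasks"] == 1, df["tasks"] == 2])
mask = and_mask & or_mask
assert list(df[mask].index) == [0, 1]
```
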
--- post-processing/test_post_processing.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index a4104530..1b660674 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -182,7 +182,7 @@ def test_high_level_script(run_sombrero): # check expected failure from invalid (filter) column try: - post_.run_post_processing(sombrero_log_path, {"filters": [["fake_column", "==", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}}) + post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["fake_column", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}}) except KeyError as e: assert e.args[1] == ["fake_column"] else: @@ -190,7 +190,7 @@ def test_high_level_script(run_sombrero): # check expected failure from invalid filter operator try: - post_.run_post_processing(sombrero_log_path, {"filters": [["tasks", "!!", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) + post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["tasks", "!!", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) except KeyError as e: assert e.args[1] == "!!" 
else: @@ -198,7 +198,7 @@ def test_high_level_script(run_sombrero): # check expected failure from invalid filter value type try: - post_.run_post_processing(sombrero_log_path, {"filters": [["flops_value", ">", "v"]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) + post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["flops_value", ">", "v"]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) except ValueError: assert True else: @@ -206,7 +206,7 @@ def test_high_level_script(run_sombrero): # check expected failure from filtering out every row try: - post_.run_post_processing(sombrero_log_path, {"filters": [["tasks", ">", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) + post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["tasks", ">", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) except pd.errors.EmptyDataError: assert True else: @@ -214,7 +214,7 @@ def test_high_level_script(run_sombrero): # check expected failure from row number vs unique x-axis value number mismatch try: - df = post_.run_post_processing(sombrero_log_path, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) + df = post_.run_post_processing(sombrero_log_path, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) except RuntimeError: assert True else: @@ -222,7 +222,7 @@ def test_high_level_script(run_sombrero): # check correct display name parsing try: - df = post_.run_post_processing(sombrero_changed_log_path, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "cpus_per_task", "units": {"column": "extra_param"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "extra_param": "int"}}) + df = post_.run_post_processing(sombrero_changed_log_path, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "cpus_per_task", "units": {"column": "extra_param"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "extra_param": "int"}}) except RuntimeError as e: # three param columns found in changed log EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"] @@ -232,14 +232,14 @@ def test_high_level_script(run_sombrero): assert False # check correct date filtering - df = post_.run_post_processing(sombrero_changed_log_path, {"title": "Title", "filters": [["job_completion_time", ">", "2000-06-01T12:30:15"]], "series": [], "x_axis": {"value": "job_completion_time", 
"units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"job_completion_time": "datetime", "flops_value": "float", "flops_unit": "str"}}) + df = post_.run_post_processing(sombrero_changed_log_path, {"title": "Title", "filters": {"and": [["job_completion_time", ">", "2000-06-01T12:30:15"]], "or": []}, "series": [], "x_axis": {"value": "job_completion_time", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"job_completion_time": "datetime", "flops_value": "float", "flops_unit": "str"}}) # check returned subset is as expected assert len(df) == 2 # check correct concatenation of two dataframes with different columns try: # get collated dataframe subset - df = post_.run_post_processing(Path(sombrero_log_path).parent, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) + df = post_.run_post_processing(Path(sombrero_log_path).parent, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}}) except RuntimeError as e: # dataframe has records from both files assert len(e.args[1]) == 8 @@ -247,7 +247,7 @@ def test_high_level_script(run_sombrero): assert False # get filtered dataframe subset - df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) + df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"] # check returned subset is as expected From a60bad251a2b78ad77e9b344fd31c3cc96708eb0 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 24 Oct 2023 15:20:06 +0100 Subject: [PATCH 03/35] Updated filter documentation. --- post-processing/README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/post-processing/README.md b/post-processing/README.md index 1100cf6c..5b87dfb6 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -41,6 +41,8 @@ Before running post-processing, create a config file including all necessary inf - `value` - Axis data points. Specified with a column name. - `units` - Axis units. Specified either with a column name or a custom label (may be null). - `filters` - (Optional.) Filter data rows based on specified conditions. (Specify an empty list if no filters are required.) + - `and` - Filter mask is determined from a logical AND of conditions in list. + - `or` - Filter mask is determined from a logical OR of conditions in list. 
- `Format: [column_name, operator, value]` - `Accepted operators: "==", "!=", "<", ">", "<=", ">="` - `series` - (Optional.) Display several plots in the same graph and group x-axis data by specified column values. (Specify an empty list if there is only one series.) @@ -63,7 +65,9 @@ y_axis: units: column: "unit_col" -filters: [["filter_col_1", "<=", filter_val_1], ["filter_col_2", "!=", filter_val_2]] +filters: + and: [["filter_col_1", "<=", filter_val_1], ["filter_col_2", "!=", filter_val_2]] + or: [] series: [["series_col", "series_val_1"], ["series_col", "series_val_2"]] @@ -85,6 +89,16 @@ The settings above will produce a graph that will have its x-axis data grouped b - (`x_val_2`, `series_val_1`) - (`x_val_2`, `series_val_2`) +#### A Note on Filters + +AND filters, OR filters, and series (treated as special OR filters) are all combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: + +- `and_filters` = `cond1`, `cond2` +- `or_filters`= `cond3`, `cond4` +- `series` = `ser1`, `ser2` + +The filters above would produce the final filter `mask` = (`cond1` AND `cond2`) AND (`cond3` OR `cond4`) AND (`ser1` OR `ser2`). + #### A Note on Column Types All user-specified types are internally converted to their nullable incarnations. As such: From 44c0b8251a6f31f377e161034655a8a1d8983828 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 24 Oct 2023 16:52:16 +0100 Subject: [PATCH 04/35] Slight filter mask code adjustment. --- post-processing/post_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 0338dce7..b38cdc9c 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -142,10 +142,10 @@ def run_post_processing(self, log_path, config): if and_filters: mask = reduce(op.and_, (self.row_filter(f, df) for f in and_filters)) if or_filters: - mask = mask & reduce(op.or_, (self.row_filter(f, df) for f in or_filters)) + mask &= reduce(op.or_, (self.row_filter(f, df) for f in or_filters)) # apply series filters if series_filters: - mask = mask & reduce(op.or_, (self.row_filter(f, df) for f in series_filters)) + mask &= reduce(op.or_, (self.row_filter(f, df) for f in series_filters)) # ensure not all rows are filtered away if df[mask].empty: raise pd.errors.EmptyDataError("Filtered dataframe is empty", df[mask].index) From 8fe70c2c9527cc999f10faf6901ea43b85818e41 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Wed, 25 Oct 2023 15:45:01 +0100 Subject: [PATCH 05/35] Added OR filter functionality unit test. 
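The union semantics under test, as a small self-contained sketch (made-up data; the real code builds each condition via `row_filter`):

```python
import operator as op
from functools import reduce

import pandas as pd

df = pd.DataFrame({"tasks": [1, 2, 2]})
# A row survives if it satisfies EITHER condition
mask = reduce(op.or_, (op.gt(df["tasks"], 1), op.lt(df["tasks"], 2)))
assert mask.all()  # every row matches at least one of the two
```
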
--- post-processing/test_post_processing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index 1b660674..f1c60194 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -236,6 +236,11 @@ def test_high_level_script(run_sombrero): # check returned subset is as expected assert len(df) == 2 + # check correct or filtering + df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": [["tasks", ">", "1"], ["tasks", "<", "2"]]}, "series": [["cpus_per_task", "1"], ["cpus_per_task", "2"]], "x_axis": {"value": "cpus_per_task", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "flops_value": "float", "flops_unit": "str"}}) + # check returned subset is as expected + assert len(df) == 4 + # check correct concatenation of two dataframes with different columns try: # get collated dataframe subset From c7c92e42c46aaec74b083e2017b85d6fb7bb3955 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Thu, 2 Nov 2023 17:34:34 +0000 Subject: [PATCH 06/35] Removed series implementation information from filter documentation. --- post-processing/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index 5b87dfb6..83b0f66c 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -91,13 +91,12 @@ The settings above will produce a graph that will have its x-axis data grouped b #### A Note on Filters -AND filters, OR filters, and series (treated as special OR filters) are all combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: +AND filters and OR filters are combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: - `and_filters` = `cond1`, `cond2` - `or_filters`= `cond3`, `cond4` -- `series` = `ser1`, `ser2` -The filters above would produce the final filter `mask` = (`cond1` AND `cond2`) AND (`cond3` OR `cond4`) AND (`ser1` OR `ser2`). +The filters above would produce the final filter `mask` = (`cond1` AND `cond2`) AND (`cond3` OR `cond4`). #### A Note on Column Types From 380d2bca3cd0d354855342eb4a5fce49dbddce53 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 6 Nov 2023 16:53:25 +0000 Subject: [PATCH 07/35] Added ability to scale axis values by a column. 
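A minimal sketch of the column-scaling transform this patch introduces (numbers are invented; the real code performs the same dtype check before dividing):

```python
import pandas as pd

df = pd.DataFrame({"flops_value": [3.2, 5.4, 2.4],
                   "OMP_NUM_THREADS": [2, 2, 2]})

# both columns must be numeric before dividing element-wise
assert pd.api.types.is_numeric_dtype(df["flops_value"].dtype)
assert pd.api.types.is_numeric_dtype(df["OMP_NUM_THREADS"].dtype)
df["flops_value"] /= df["OMP_NUM_THREADS"]
print(df["flops_value"].tolist())  # [1.6, 2.7, 1.2]
```
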
--- post-processing/post_processing.py | 65 ++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index b38cdc9c..c34dcdef 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -10,9 +10,10 @@ from itertools import chain from pathlib import Path +import numpy as np import pandas as pd import yaml -from bokeh.models import Legend, HoverTool +from bokeh.models import HoverTool, Legend from bokeh.models.sources import ColumnDataSource from bokeh.palettes import viridis from bokeh.plotting import figure, output_file, save @@ -91,8 +92,16 @@ def run_post_processing(self, log_path, config): or_filters = config["filters"]["or"] # extract filter columns filter_columns = [f[0] for f in and_filters] + [f[0] for f in or_filters] + + # FIXME: add scaling for x-axis + scaling_columns = [] + # extract scaling columns + if config["y_axis"].get("scaling"): + if config["y_axis"]["scaling"].get("column"): + scaling_columns.append(config["y_axis"]["scaling"]["column"]["name"]) + # gather all relevant columns - all_columns = set(columns + filter_columns) + all_columns = set(columns + filter_columns + scaling_columns) invalid_columns = [] # check for invalid columns @@ -161,6 +170,15 @@ def run_post_processing(self, log_path, config): if num_filtered_rows > num_x_data_points: raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask]) + # apply data transformation per series + if series_filters: + for f in series_filters: + m = self.row_filter(f, df) + df[mask & m] = self.transform_axis(df[mask & m], config["y_axis"]) + # apply data transformation to all data + else: + df[mask] = self.transform_axis(df[mask], config["y_axis"]) + print("Selected dataframe:") print(df[columns][mask]) @@ -211,9 +229,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # adjust y-axis range min_y = 0 if min(df[y_column]) >= 0 \ - else math.floor(min(df[y_column])*1.2) + else math.floor(np.nanmin(df[y_column])*1.2) max_y = 0 if max(df[y_column]) <= 0 \ - else math.ceil(max(df[y_column])*1.2) + else math.ceil(np.nanmax(df[y_column])*1.2) # create html file to store plot in output_file(filename=os.path.join(Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title) @@ -294,6 +312,31 @@ def row_filter(self, filter, df: pd.DataFrame): return mask + def transform_axis(self, df: pd.DataFrame, axis): + """ + Divide axis values by specified values and reflect this change in the dataframe. + + Args: + df: dataframe, data to plot. + axis: dict, axis column, units, and values to scale by. + """ + + # FIXME: try to make this an in-place process + if axis.get("scaling"): + # scale by column + if axis["scaling"].get("column"): + # check types + if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \ + not pd.api.types.is_numeric_dtype(df[axis["scaling"]["column"]["name"]].dtype): + # both columns must be numeric + raise TypeError("Cannot scale column '{0}' of type {1} by column '{2}' of type {3}." + .format(axis["value"], df[axis["value"]].dtype, + axis["scaling"]["column"]["name"], + df[axis["scaling"]["column"]["name"]].dtype)) + df[axis["value"]] /= df[axis["scaling"]["column"]["name"]] + + return df + def read_args(): """ Return parsed command line arguments. 
@@ -437,7 +480,7 @@ def get_axis_info(df: pd.DataFrame, axis): Args: df: dataframe, data to plot. - axis: dict, axis column and units. + axis: dict, axis column, units, and values to scale by. """ # get column name of axis @@ -450,9 +493,17 @@ def get_axis_info(df: pd.DataFrame, axis): if len(unit_set) != 1: raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) units = next(iter(unit_set)) + + # get values to scale by + scaling = None + if axis.get("scaling"): + if axis.get("scaling").get("column"): + scaling = axis.get("scaling").get("column").get("name") + # determine axis label - label = "{0}{1}".format(col_name.replace("_", " ").title(), - " ({0})".format(units) if units else "") + label = "{0}{1}{2}".format(col_name.replace("_", " ").title(), + " Scaled by {0}".format(scaling.replace("_", " ").title()) if scaling else "", + " ({0})".format(units) if units else "") return col_name, label From 35c211e142b8418050bbadd01f872476c669b21d Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 6 Nov 2023 16:54:17 +0000 Subject: [PATCH 08/35] Added column scaling unit tests. --- post-processing/test_post_processing.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index f1c60194..8d5efb93 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -241,6 +241,17 @@ def test_high_level_script(run_sombrero): # check returned subset is as expected assert len(df) == 4 + # check correct scaling + dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "int"}}) + # check flops values are halved compared to previous df + assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all() + + # check expected failure from scaling by incorrect type + try: + df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": []}, "series": [["cpus_per_task", 1], ["cpus_per_task", 2]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "str"}}) + except TypeError: + assert True + # check correct concatenation of two dataframes with different columns try: # get collated dataframe subset From a78b81352a9f02baa90301eb81cfe2ab0d24e26f Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 6 Nov 2023 17:30:11 +0000 Subject: [PATCH 09/35] Added preliminary functionality to scale by specific value in a given column. 
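A minimal sketch of the new `x_value` mode (hypothetical data): instead of dividing by the whole scaling column, a single value is picked out of it on the row whose x-axis entry matches, and everything is divided by that.

```python
import pandas as pd

df = pd.DataFrame({"tasks": [1, 2, 4],
                   "flops_value": [1.0, 2.0, 3.2]})

# take the scaling column's entry where the x-axis column == x_value,
# then divide the whole y column by that one number
scale = df[df["tasks"] == 2]["flops_value"].iloc[0]  # 2.0
df["flops_value"] /= scale
print(df["flops_value"].tolist())  # [0.5, 1.0, 1.6]
```
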
--- post-processing/post_processing.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index c34dcdef..47265839 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -174,10 +174,10 @@ def run_post_processing(self, log_path, config): if series_filters: for f in series_filters: m = self.row_filter(f, df) - df[mask & m] = self.transform_axis(df[mask & m], config["y_axis"]) + df[mask & m] = self.transform_axis(df[mask & m], config["y_axis"], config["x_axis"]["value"]) # apply data transformation to all data else: - df[mask] = self.transform_axis(df[mask], config["y_axis"]) + df[mask] = self.transform_axis(df[mask], config["y_axis"], config["x_axis"]["value"]) print("Selected dataframe:") print(df[columns][mask]) @@ -259,6 +259,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # add labels plot.xaxis.axis_label = x_label plot.yaxis.axis_label = y_label + # remove x-axis group ticks + plot.xaxis.major_tick_line_color = None + plot.xaxis.major_label_text_font_size = "0pt" # adjust font size plot.title.text_font_size = "15pt" @@ -312,19 +315,21 @@ def row_filter(self, filter, df: pd.DataFrame): return mask - def transform_axis(self, df: pd.DataFrame, axis): + def transform_axis(self, df: pd.DataFrame, axis, x_column): """ Divide axis values by specified values and reflect this change in the dataframe. Args: df: dataframe, data to plot. axis: dict, axis column, units, and values to scale by. + x_column: string, name of column containing x-axis values. """ # FIXME: try to make this an in-place process if axis.get("scaling"): # scale by column if axis["scaling"].get("column"): + # check types if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \ not pd.api.types.is_numeric_dtype(df[axis["scaling"]["column"]["name"]].dtype): @@ -333,7 +338,14 @@ def transform_axis(self, df: pd.DataFrame, axis): .format(axis["value"], df[axis["value"]].dtype, axis["scaling"]["column"]["name"], df[axis["scaling"]["column"]["name"]].dtype)) - df[axis["value"]] /= df[axis["scaling"]["column"]["name"]] + + # scale by specific value in column + if axis["scaling"]["column"].get("x_value"): + x_value = axis["scaling"]["column"]["x_value"] + df[axis["value"]] /= df[df[x_column] == x_value][axis["scaling"]["column"]["name"]].iloc[0] + # scale by entire column + else: + df[axis["value"]] /= df[axis["scaling"]["column"]["name"]] return df @@ -498,7 +510,10 @@ def get_axis_info(df: pd.DataFrame, axis): scaling = None if axis.get("scaling"): if axis.get("scaling").get("column"): - scaling = axis.get("scaling").get("column").get("name") + if axis.get("scaling").get("column").get("x_value"): + scaling = "{0} {1}".format(axis.get("scaling").get("column").get("x_value"), axis.get("scaling").get("column").get("name")) + else: + scaling = axis.get("scaling").get("column").get("name") # determine axis label label = "{0}{1}{2}".format(col_name.replace("_", " ").title(), From ca2deebab7322a7dd6e731d991fc2c11af47baf6 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Wed, 8 Nov 2023 17:19:14 +0000 Subject: [PATCH 10/35] Added ability to scale axis values by one custom value. 
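A minimal sketch of the custom-value path, including the dtype round-trip that surfaces bad input as a ValueError (column and values invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({"flops_value": [3.2, 5.4]})
custom = 2  # user-supplied scaling value from the config

# interpret the value as the column's dtype; a non-numeric string
# such as "s" would raise ValueError at this point
scaling_value = pd.Series(custom, dtype=df["flops_value"].dtype).iloc[0]
df["flops_value"] /= scaling_value
print(df["flops_value"].tolist())  # [1.6, 2.7]
```
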
--- post-processing/post_processing.py | 42 ++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 47265839..c035f6e9 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -305,7 +305,7 @@ def row_filter(self, filter, df: pd.DataFrame): value = pd.Series(value, dtype=df[column].dtype).iloc[0] mask = operator(df[column], value) except TypeError or ValueError as e: - e.args = (e.args[0] + " for column: \'{0}\' and value: \'{1}\'".format(column, value),) + e.args = (e.args[0] + " for column '{0}' and value '{1}'".format(column, value),) raise if self.debug & self.verbose: @@ -327,26 +327,39 @@ def transform_axis(self, df: pd.DataFrame, axis, x_column): # FIXME: try to make this an in-place process if axis.get("scaling"): + # scale by column if axis["scaling"].get("column"): + scaling_column = axis["scaling"]["column"]["name"] + x_value = axis["scaling"]["column"].get("x_value") + # check types if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \ - not pd.api.types.is_numeric_dtype(df[axis["scaling"]["column"]["name"]].dtype): + not pd.api.types.is_numeric_dtype(df[scaling_column].dtype): # both columns must be numeric raise TypeError("Cannot scale column '{0}' of type {1} by column '{2}' of type {3}." .format(axis["value"], df[axis["value"]].dtype, - axis["scaling"]["column"]["name"], - df[axis["scaling"]["column"]["name"]].dtype)) + scaling_column, df[scaling_column].dtype)) # scale by specific value in column - if axis["scaling"]["column"].get("x_value"): - x_value = axis["scaling"]["column"]["x_value"] - df[axis["value"]] /= df[df[x_column] == x_value][axis["scaling"]["column"]["name"]].iloc[0] + if x_value: + df[axis["value"]] /= df[df[x_column] == x_value][scaling_column].iloc[0] # scale by entire column else: df[axis["value"]] /= df[axis["scaling"]["column"]["name"]] + # scale by custom value + elif axis["scaling"].get("custom"): + scaling_value = axis["scaling"]["custom"] + try: + # interpret scaling value as column dtype + scaling_value = pd.Series(scaling_value, dtype=df[axis["value"]].dtype).iloc[0] + except ValueError as e: + e.args = (e.args[0] + " as a scaling value for column '{0}'".format(axis["value"]),) + raise + df[axis["value"]] /= scaling_value + return df def read_args(): @@ -361,7 +374,7 @@ def read_args(): parser.add_argument("config_path", type=str, help="path to a configuration file specifying what to plot") # optional argument (plot type) - parser.add_argument("-p", "--plot_type", type=str, default="generic", help="type of plot to be generated (default: \'generic\')") + parser.add_argument("-p", "--plot_type", type=str, default="generic", help="type of plot to be generated (default: 'generic')") # info dump flags parser.add_argument("-d", "--debug", action="store_true", help="debug flag for printing additional information") @@ -506,14 +519,15 @@ def get_axis_info(df: pd.DataFrame, axis): raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) units = next(iter(unit_set)) - # get values to scale by + # get scaling information scaling = None if axis.get("scaling"): - if axis.get("scaling").get("column"): - if axis.get("scaling").get("column").get("x_value"): - scaling = "{0} {1}".format(axis.get("scaling").get("column").get("x_value"), axis.get("scaling").get("column").get("name")) - else: - scaling = axis.get("scaling").get("column").get("name") + if 
axis["scaling"].get("column"): + scaling_column = axis["scaling"]["column"]["name"] + x_value = axis["scaling"]["column"].get("x_value") + scaling = "{0} {1}".format(x_value, scaling_column) if x_value else scaling_column + else: + scaling = str(axis["scaling"].get("custom")) # determine axis label label = "{0}{1}{2}".format(col_name.replace("_", " ").title(), From 1c47a1ab9134f09cb42a747ab3ad3e77e77a1e51 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Wed, 8 Nov 2023 17:23:55 +0000 Subject: [PATCH 11/35] Added custom value scaling unit tests. --- post-processing/test_post_processing.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index 8d5efb93..d05a5101 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -241,17 +241,28 @@ def test_high_level_script(run_sombrero): # check returned subset is as expected assert len(df) == 4 - # check correct scaling + # check correct column scaling dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "int"}}) # check flops values are halved compared to previous df assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all() - # check expected failure from scaling by incorrect type + # check correct custom scaling + dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"custom": 2}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) + # check flops values are halved compared to previous df + assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all() + + # check expected failure from scaling by incorrect column type try: df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": []}, "series": [["cpus_per_task", 1], ["cpus_per_task", 2]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "str"}}) except TypeError: assert True + # check expected failure from scaling by incompatible custom type + try: + df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"custom": "s"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) + except ValueError: + assert True + # check correct concatenation of two dataframes with different columns try: # get collated dataframe subset From 4daef87f07f1c97488582f3d769fee444e5f8925 Mon Sep 17 
00:00:00 2001 From: pineapple-cat Date: Thu, 9 Nov 2023 18:54:29 +0000 Subject: [PATCH 12/35] Added preliminary functionality to scale by a series. --- post-processing/post_processing.py | 109 +++++++++++++++++------------ 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index c035f6e9..dd122201 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -170,14 +170,30 @@ def run_post_processing(self, log_path, config): if num_filtered_rows > num_x_data_points: raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask]) - # apply data transformation per series - if series_filters: - for f in series_filters: - m = self.row_filter(f, df) - df[mask & m] = self.transform_axis(df[mask & m], config["y_axis"], config["x_axis"]["value"]) - # apply data transformation to all data - else: - df[mask] = self.transform_axis(df[mask], config["y_axis"], config["x_axis"]["value"]) + scaling_column = None + scaling_series_mask = None + scaling_x_value_mask = None + # extract scaling information + if config["y_axis"].get("scaling"): + + if config["y_axis"]["scaling"].get("column"): + # copy scaling column (prevents issues when scaling by itself) + scaling_column = df[config["y_axis"]["scaling"]["column"]["name"]].copy() + # get mask of scaling series + if config["y_axis"]["scaling"]["column"].get("series") is not None: + scaling_series_mask = self.row_filter(series_filters[config["y_axis"]["scaling"]["column"]["series"]], df) + # get mask of scaling x-value + if config["y_axis"]["scaling"]["column"].get("x_value"): + scaling_x_value_mask = df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"] + + # apply data transformation per series + if series_filters: + for f in series_filters: + m = self.row_filter(f, df) + df[mask & m] = self.transform_axis(df[mask & m], mask & m, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) + # apply data transformation to all data + else: + df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) print("Selected dataframe:") print(df[columns][mask]) @@ -315,7 +331,7 @@ def row_filter(self, filter, df: pd.DataFrame): return mask - def transform_axis(self, df: pd.DataFrame, axis, x_column): + def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, scaling_series_mask, scaling_x_value_mask): """ Divide axis values by specified values and reflect this change in the dataframe. @@ -326,39 +342,43 @@ def transform_axis(self, df: pd.DataFrame, axis, x_column): """ # FIXME: try to make this an in-place process - if axis.get("scaling"): - - # scale by column - if axis["scaling"].get("column"): - - scaling_column = axis["scaling"]["column"]["name"] - x_value = axis["scaling"]["column"].get("x_value") - - # check types - if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \ - not pd.api.types.is_numeric_dtype(df[scaling_column].dtype): - # both columns must be numeric - raise TypeError("Cannot scale column '{0}' of type {1} by column '{2}' of type {3}." 
- .format(axis["value"], df[axis["value"]].dtype, - scaling_column, df[scaling_column].dtype)) - - # scale by specific value in column - if x_value: - df[axis["value"]] /= df[df[x_column] == x_value][scaling_column].iloc[0] - # scale by entire column - else: - df[axis["value"]] /= df[axis["scaling"]["column"]["name"]] - - # scale by custom value - elif axis["scaling"].get("custom"): - scaling_value = axis["scaling"]["custom"] - try: - # interpret scaling value as column dtype - scaling_value = pd.Series(scaling_value, dtype=df[axis["value"]].dtype).iloc[0] - except ValueError as e: - e.args = (e.args[0] + " as a scaling value for column '{0}'".format(axis["value"]),) - raise - df[axis["value"]] /= scaling_value + + # scale by column + if scaling_column is not None: + + # check types + if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \ + not pd.api.types.is_numeric_dtype(scaling_column.dtype): + # both columns must be numeric + raise TypeError("Cannot scale column '{0}' of type {1} by column '{2}' of type {3}." + .format(axis["value"], df[axis["value"]].dtype, + axis["scaling"]["column"]["name"], scaling_column.dtype)) + + # get mask of scaling value(s) + scaling_mask = df_mask.copy() + if scaling_series_mask is not None: + scaling_mask = scaling_series_mask + if scaling_x_value_mask is not None: + scaling_mask &= scaling_x_value_mask + + scaling_val = scaling_column[scaling_mask].iloc[0] if len(scaling_column[scaling_mask]) == 1 \ + else scaling_column[scaling_mask].values + + # FIXME: add a check that the masked scaling column has the same number of values + # as the masked df (unless there is only one scaling value) + + df[axis["value"]] = df[axis["value"]].values / scaling_val + + # scale by custom value + elif axis["scaling"].get("custom"): + scaling_value = axis["scaling"]["custom"] + try: + # interpret scaling value as column dtype + scaling_value = pd.Series(scaling_value, dtype=df[axis["value"]].dtype).iloc[0] + except ValueError as e: + e.args = (e.args[0] + " as a scaling value for column '{0}'".format(axis["value"]),) + raise + df[axis["value"]] /= scaling_value return df @@ -524,8 +544,11 @@ def get_axis_info(df: pd.DataFrame, axis): if axis.get("scaling"): if axis["scaling"].get("column"): scaling_column = axis["scaling"]["column"]["name"] + series_index = axis["scaling"]["column"].get("series") x_value = axis["scaling"]["column"].get("x_value") - scaling = "{0} {1}".format(x_value, scaling_column) if x_value else scaling_column + series_col = "series {0} of {1}".format(series_index, scaling_column) \ + if series_index is not None else scaling_column + scaling = "{0} {1}".format(x_value, series_col) if x_value else series_col else: scaling = str(axis["scaling"].get("custom")) From 2d6db1e0976dc1c7460c8d4f1188c9f841daee7d Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 10 Nov 2023 15:03:30 +0000 Subject: [PATCH 13/35] Minor fixes + making axis label clearer. 
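For reference, a rough sketch of the label that `get_axis_info` now assembles (inputs here are hypothetical, including the units string):

```python
col_name = "flops_value"
scaling = "1 flops_value"  # e.g. scaled by series value 1 of flops_value
units = "GFLOPs/sec"
label = "{0}{1}{2}".format(
    col_name.replace("_", " ").title(),
    " Scaled by {0}".format(scaling.replace("_", " ").title()) if scaling else "",
    " ({0})".format(units) if units else "")
print(label)  # 'Flops Value Scaled by 1 Flops Value (GFLOPs/sec)'
```
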
--- post-processing/post_processing.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index dd122201..e609dd9f 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -176,6 +176,8 @@ def run_post_processing(self, log_path, config): # extract scaling information if config["y_axis"].get("scaling"): + # FIXME: if there is a scaling field, check that there is at least one of column or custom + if config["y_axis"]["scaling"].get("column"): # copy scaling column (prevents issues when scaling by itself) scaling_column = df[config["y_axis"]["scaling"]["column"]["name"]].copy() @@ -221,8 +223,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): """ # get column names and labels for axes - x_column, x_label = get_axis_info(df, x_axis) - y_column, y_label = get_axis_info(df, y_axis) + x_column, x_label = get_axis_info(df, x_axis, series_filters) + y_column, y_label = get_axis_info(df, y_axis, series_filters) # find x-axis groups (series columns) groups = [x_column] @@ -336,9 +338,12 @@ def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, scalin Divide axis values by specified values and reflect this change in the dataframe. Args: - df: dataframe, data to plot. + df: dataframe, data to plot (pre-masked by series, if present). + df_mask: bool list, the mask (pre-)applied to the df argument. axis: dict, axis column, units, and values to scale by. - x_column: string, name of column containing x-axis values. + scaling_column: dataframe column, copy of column containing values to scale by. + scaling_series_mask: bool list, a series mask to be applied to the scaling column. + scaling_x_value_mask: bool list, an x-axis value mask to be applied to the scaling column. """ # FIXME: try to make this an in-place process @@ -519,7 +524,7 @@ def insert_key_cols(df: pd.DataFrame, index, results): # insert keys as new columns df.insert(index, k, [r[k] if k in r.keys() else None for r in results]) -def get_axis_info(df: pd.DataFrame, axis): +def get_axis_info(df: pd.DataFrame, axis, series_filters): """ Return the column name and label for a given axis. If a column name is supplied as units information, the actual units will be extracted from a dataframe. @@ -546,7 +551,7 @@ def get_axis_info(df: pd.DataFrame, axis): scaling_column = axis["scaling"]["column"]["name"] series_index = axis["scaling"]["column"].get("series") x_value = axis["scaling"]["column"].get("x_value") - series_col = "series {0} of {1}".format(series_index, scaling_column) \ + series_col = "{0} {1}".format(series_filters[series_index][2], scaling_column) \ if series_index is not None else scaling_column scaling = "{0} {1}".format(x_value, series_col) if x_value else series_col else: From b5f24d4c27ba845eeb652b5f33c0762f92dc0834 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 10 Nov 2023 15:07:35 +0000 Subject: [PATCH 14/35] Bug fix for legend labels of plots without series. 
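Background for the fix: with a single grouping column, pandas produces scalar group keys rather than tuples, so `group[-1]` takes the last character of a string key. A minimal illustration (invented values):

```python
import pandas as pd

df = pd.DataFrame({"tasks": ["16", "32"], "flops_value": [1.0, 2.0]})
keys = [key for key, _ in df.groupby("tasks")]
print(keys)         # ['16', '32'] -- scalars, not tuples
print(keys[0][-1])  # '6' -- why group[-1] needs the len(groups) > 1 guard
```
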
--- post-processing/post_processing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index e609dd9f..e3c03718 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -269,7 +269,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), factors=colour_factors, start=len(groups)-1, end=len(groups)) # add legend labels to data source data_source = ColumnDataSource(grouped_df).data - legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "), group[-1]) for group in data_source[index_group_col]] + legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "), + group[-1] if len(groups) > 1 else group) + for group in data_source[index_group_col]] data_source["legend_labels"] = legend_labels # add bars From 257647507b43e4c05ec3b317ec0dddc489a81bd5 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 17 Nov 2023 15:30:16 +0000 Subject: [PATCH 15/35] Updated read_config errors. --- post-processing/post_processing.py | 39 +++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index e3c03718..a344fbb2 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -422,32 +422,49 @@ def read_config(path): # check x-axis information if not config.get("x_axis"): - raise KeyError("Missing x-axis information") + raise KeyError("Missing x-axis information.") if not config.get("x_axis").get("value"): - raise KeyError("Missing x-axis value information") + raise KeyError("Missing x-axis value information.") if not config.get("x_axis").get("units"): - raise KeyError("Missing x-axis units information") + raise KeyError("Missing x-axis units information.") + if config.get("x_axis").get("units").get("custom") is not None and \ + config.get("x_axis").get("units").get("column") is not None: + raise KeyError("Specify x-axis units information as only one of 'custom' or 'column'.") + # check y-axis information if not config.get("y_axis"): - raise KeyError("Missing y-axis information") + raise KeyError("Missing y-axis information.") if not config.get("y_axis").get("value"): - raise KeyError("Missing y-axis value information") + raise KeyError("Missing y-axis value information.") if not config.get("y_axis").get("units"): - raise KeyError("Missing y-axis units information") + raise KeyError("Missing y-axis units information.") + if config.get("y_axis").get("units").get("custom") is not None and \ + config.get("y_axis").get("units").get("column") is not None: + raise KeyError("Specify y-axis units information as only one of 'custom' or 'column'.") + + # check optional scaling information + if config.get("y_axis").get("scaling"): + if config.get("y_axis").get("scaling").get("custom") is not None and \ + config.get("y_axis").get("scaling").get("column") is not None: + raise KeyError("Specify y-axis scaling information as only one of 'custom' or 'column'.") # check series length if config.get("series") is None: - raise KeyError("Missing series information (specify an empty list [] if there is only one series)") + raise KeyError("Missing series information (specify an empty list [] if there is only one series).") if len(config["series"]) == 1: - raise KeyError("Number of series must be >= 2 (specify an empty list [] if there is only one 
series)") + raise KeyError("Number of series must be >= 2 (specify an empty list [] if there is only one series).") # check filters are present - if config.get("filters") is None: - raise KeyError("Missing filters information (specify an empty list [] if none are required)") + if not config.get("filters"): + raise KeyError("Missing filter information (specify 'and' and 'or' filters).") + if config.get("filters").get("and") is None: + raise KeyError("Missing 'and' filters (specify an empty list [] if none are required).") + if config.get("filters").get("or") is None: + raise KeyError("Missing 'or' filters (specify an empty list [] if none are required).") # check plot title information if not config.get("title"): - raise KeyError("Missing plot title information") + raise KeyError("Missing plot title information.") return config From d9fde628bc42062f235570668fdcb6fa9473b459 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 17 Nov 2023 15:45:53 +0000 Subject: [PATCH 16/35] Added check to ensure custom scaling value cannot be zero. --- post-processing/post_processing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index a344fbb2..48dfb05b 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -176,8 +176,7 @@ def run_post_processing(self, log_path, config): # extract scaling information if config["y_axis"].get("scaling"): - # FIXME: if there is a scaling field, check that there is at least one of column or custom - + # check column information if config["y_axis"]["scaling"].get("column"): # copy scaling column (prevents issues when scaling by itself) scaling_column = df[config["y_axis"]["scaling"]["column"]["name"]].copy() @@ -188,6 +187,10 @@ def run_post_processing(self, log_path, config): if config["y_axis"]["scaling"]["column"].get("x_value"): scaling_x_value_mask = df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"] + # check custom value is not zero + elif not config["y_axis"]["scaling"].get("custom"): + raise RuntimeError("Invalid custom scaling value (cannot divide by {0}).".format(config["y_axis"]["scaling"].get("custom"))) + # apply data transformation per series if series_filters: for f in series_filters: From a1a4b9620149bed397a10f5f4310c5c5ccf5702c Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 20 Nov 2023 15:28:53 +0000 Subject: [PATCH 17/35] Added initial attempt at sorting categorical x-axis. FIXME: dataframe sorting not reflected in bokeh graph. 
--- post-processing/post_processing.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 48dfb05b..8bc37d10 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -200,6 +200,21 @@ def run_post_processing(self, log_path, config): else: df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) + # FIXME: sorted dataframe doesn't translate to sorted bokeh graph + if config["x_axis"].get("sorting"): + ascending = None + if config["x_axis"]["sorting"] == "ascending": + ascending = True + elif config["x_axis"]["sorting"] == "descending": + ascending = False + if ascending is not None: + # sort x values + df.sort_values([config["x_axis"]["value"]], ascending=ascending, inplace=True, ignore_index=True) + # NOTE: currently assuming there can only be one series column + if series_columns: + # sort series column + df.sort_values(series_columns[0], ascending=ascending, inplace=True, ignore_index=True) + print("Selected dataframe:") print(df[columns][mask]) @@ -240,7 +255,7 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # combine group names for later plotting with groupby index_group_col = "_".join(groups) # group by group names (or just x-axis if no other groups are present) - grouped_df = df.groupby(x_column) if len(groups) == 1 else df.groupby(groups) + grouped_df = df.groupby(x_column, sort=False) if len(groups) == 1 else df.groupby(groups, sort=False) if self.debug: print("") From 556dca2fd312fe34f401f9b32953a742717d6e68 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Thu, 30 Nov 2023 21:38:14 +0000 Subject: [PATCH 18/35] Added more data transform unit tests. 
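The series-scaling assertions below reduce to element-wise division of each series by the scaling series. A compact sketch with invented numbers (the scaling series is copied first, as the implementation does, so scaling a series by itself is safe):

```python
import pandas as pd

df = pd.DataFrame({"tasks": [1, 2, 1, 2],
                   "cpus_per_task": [1, 1, 2, 2],
                   "flops_value": [1.0, 2.0, 1.5, 4.0]})

# scale both series by the cpus_per_task == 1 series (series index 0);
# boolean indexing already yields a copy of the scaling values
base = df[df["cpus_per_task"] == 1]["flops_value"].values
for cpus in (1, 2):
    m = df["cpus_per_task"] == cpus
    df.loc[m, "flops_value"] = df.loc[m, "flops_value"].values / base

print(df["flops_value"].tolist())  # [1.0, 1.0, 1.5, 2.0]
```
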
--- post-processing/test_post_processing.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index d05a5101..a4a8c7b3 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -237,14 +237,28 @@ def test_high_level_script(run_sombrero): assert len(df) == 2 # check correct or filtering - df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": [["tasks", ">", "1"], ["tasks", "<", "2"]]}, "series": [["cpus_per_task", "1"], ["cpus_per_task", "2"]], "x_axis": {"value": "cpus_per_task", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "flops_value": "float", "flops_unit": "str"}}) + df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": [["tasks", ">", "1"], ["tasks", "<", "2"]]}, "series": [["cpus_per_task", "1"], ["cpus_per_task", "2"]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "flops_value": "float", "flops_unit": "str"}}) # check returned subset is as expected assert len(df) == 4 # check correct column scaling dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "int"}}) # check flops values are halved compared to previous df - assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all() + assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all() + + # check correct column + series scaling + dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": []}, "series": [["cpus_per_task", 1], ["cpus_per_task", 2]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "flops_value", "series": 0}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) + assert (dfs[dfs["cpus_per_task"] == 1]["flops_value"].values == + df[df["cpus_per_task"] == 1]["flops_value"].values / + df[df["cpus_per_task"] == 1]["flops_value"].values).all() + assert (dfs[dfs["cpus_per_task"] == 2]["flops_value"].values == + df[df["cpus_per_task"] == 2]["flops_value"].values / + df[df["cpus_per_task"] == 1]["flops_value"].values).all() + + # check correct column + series + x value scaling + dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": []}, "series": [["cpus_per_task", 1], ["cpus_per_task", 2]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "flops_value", "series": 0, "x_value": 2}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) + assert (dfs["flops_value"].values == 
df["flops_value"].values / + df[(df["cpus_per_task"] == 1) & (df["tasks"] == 2)]["flops_value"].iloc[0]).all() # check correct custom scaling dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"custom": 2}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) From 608fd0726f6f46d2c46a421a17d85cb561edbf74 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 1 Dec 2023 10:35:13 +0000 Subject: [PATCH 19/35] Updated documentation to explain scaling and possible data transformations. --- post-processing/README.md | 117 +++++++++++++++++++- post-processing/post_processing_config.yaml | 7 ++ 2 files changed, 121 insertions(+), 3 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index 83b0f66c..f20b5ad5 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -4,14 +4,16 @@ The post-processing scripts provided with the ExCALIBUR tests package are intended to grant users a quick starting point for visualising benchmark results with basic graphs and tables. Their components can also be used inside custom users' scripts. -There are three main post-processing components: +There are four main post-processing components: - **`Perflog parsing`:** - Data from benchmark performance logs are stored in a pandas DataFrame. - **`Data filtering`:** - If more than one perflog is used for plotting, DataFrames from individual perflogs are concatenated together into one DataFrame. - The DataFrame is then filtered, keeping only relevant rows and columns. +- **`Data transformation`:** + - Axis value columns in the DataFrame are scaled according to user specifications. - **`Plotting`:** - - A filtered DataFrame is passed to a plotting script, which produces a graph and embeds it in a simple HTML file. + - A filtered and transformed DataFrame is passed to a plotting script, which produces a graph and embeds it in a simple HTML file. - Users may run the plotting script to generate a generic bar chart. Graph settings should be specified in a configuration YAML file. ### Installation @@ -34,12 +36,13 @@ Run `post_processing.py -h` for more information (including debugging flags). ### Configuration Structure -Before running post-processing, create a config file including all necessary information for graph generation (you must specify at least plot title, x-axis, y-axis, and column types). See below for an example. +Before running post-processing, create a config file including all necessary information for graph generation (you must specify at least plot title, x-axis, y-axis, and column types). See below for an example and some clarifying notes. - `title` - Plot title. - `x_axis`, `y_axis` - Axis information. - `value` - Axis data points. Specified with a column name. - `units` - Axis units. Specified either with a column name or a custom label (may be null). + - `scaling` - (Optional.) Scale axis values by either a column or a custom value. - `filters` - (Optional.) Filter data rows based on specified conditions. (Specify an empty list if no filters are required.) - `and` - Filter mask is determined from a logical AND of conditions in list. - `or` - Filter mask is determined from a logical OR of conditions in list. 
@@ -64,6 +67,11 @@ y_axis: value: "y_axis_col" units: column: "unit_col" + scaling: + column: + name: "scaling_col" + series: 0 + x_value: "x_val_s" filters: and: [["filter_col_1", "<=", filter_val_1], ["filter_col_2", "!=", filter_val_2]] @@ -75,6 +83,7 @@ column_types: x_axis_col: "str" y_axis_col: "float" unit_col: "str" + scaling_col: "float" filter_col_1: "datetime" filter_col_2: "int" series_col: "str" @@ -89,6 +98,108 @@ The settings above will produce a graph that will have its x-axis data grouped b - (`x_val_2`, `series_val_1`) - (`x_val_2`, `series_val_2`) +#### A Note on Scaling + +When axis values are scaled, they are all divided by a number or a list of numbers. If using more than one number for scaling, the length of the list must match the length of the axis column being scaled. (`Note: scaling is currently only supported for y-axis data, as graphs with a non-categorical x-axis are still a work in progress.`) + +**Custom Scaling** + +Manually specify one value to scale axis values by. + +```yaml +y_axis: + value: "y_axis_col" + units: + column: "unit_col" + scaling: + custom: 2 +``` + +In the snippet above, all y-axis values are to be divided by 2. + +|y_axis_col|scaled_y_axis_col| +|-|-| +|3.2|3.2 / 2.0 = 1.6| +|5.4|5.4 / 2.0 = 2.7| +|2.4|2.4 / 2.0 = 1.2| +|5.0|5.0 / 2.0 = 2.5| + +**Column Scaling** + +Specify one column to scale axis values by. + +```yaml +y_axis: + value: "y_axis_col" + units: + column: "unit_col" + scaling: + column: + name: "scaling_col" +``` + +In the snippet above, all y-axis values are to be divided by the corresponding values in the scaling column. + +|y_axis_col|scaling_col|scaled_y_axis_col| +|-|-|-| +|3.2|**`1.6`**|3.2 / 1.6 = 2.0| +|5.4|**`2.0`**|5.4 / 2.0 = 2.7| +|2.4|**`0.6`**|2.4 / 0.6 = 4.0| +|5.0|**`2.5`**|5.0 / 2.5 = 2.0| + +**Series Scaling** + +Specify one series to scale axis values by. This is done with an index, which is used to find the correct series from a list. + +In the case of the list of series from the example config above, index 0 would select a scaling series of `["series_col", "series_val_1"]`, while index 1 would scale by `["series_col", "series_val_2"]`. + +```yaml +y_axis: + value: "y_axis_col" + units: + column: "unit_col" + scaling: + column: + name: "scaling_col" + series: 0 +``` + +In the snippet above, all y-axis values are to be split by series and divided by the corresponding values in the scaling series. + +|y_axis_col|scaling_col|series_col|scaled_y_axis_col| +|-|-|-|-| +|3.2|**`1.6`**|`series_val_1`|3.2 / 1.6 = 2.0| +|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 = 2.7| +|2.4|0.6|series_val_2|2.4 / 1.6 = 1.5| +|5.0|2.5|series_val_2|5.0 / 2.0 = 2.5| + +**Selected Value Scaling** + +Specify one value from a column to scale axis values by. + +```yaml +y_axis: + value: "y_axis_col" + units: + column: "unit_col" + scaling: + column: + name: "scaling_col" + series: 0 + x_value: "x_val_s" +``` + +In the snippet above, all y-axis values are to be divided by the scaling value found by filtering the scaling column by both series and x-axis value. 
+ +|x_axis_col|y_axis_col|scaling_col|series_col|scaled_y_axis_col| +|-|-|-|-|-| +|x_val_1|3.2|1.6|series_val_1|3.2 / 2.0 = 1.6| +|`x_val_s`|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 = 2.7| +|x_val_2|2.4|0.7|series_val_2|2.4 / 2.0 = 1.2| +|x_val_s|5.0|2.5|series_val_2|5.0 / 2.0 = 2.5| + +(`Note: if series are not present and x-axis values are all unique, it is enough to specify just the column name and x-value.`) + #### A Note on Filters AND filters and OR filters are combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml index f24e08a1..2e4b3521 100644 --- a/post-processing/post_processing_config.yaml +++ b/post-processing/post_processing_config.yaml @@ -6,6 +6,8 @@ title: Example Plot # and "display_name" fields, if it exists). # Units can either be specified with a perflog column name # or a custom label (including null (i.e. None) if units are N/A). +# Optional axis scaling can either be specified with a perflog column name +# (+ a series index and/or an x-axis value for filtering), or a custom value x_axis: value: "tasks" @@ -16,6 +18,11 @@ y_axis: value: "flops_value" units: column: "flops_unit" + scaling: + column: + name: "flops_value" + series: 0 # The series at index 0 is ["cpus_per_task", 1] + x_value: 2 # Find the row where "tasks" is 2 # Optional row filters (specify an empty list [] if none are required). # Filter format: [column_name, operator, value] From b0ec2515e06a76866baf00b0421adf7f9f0adb5f Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 1 Dec 2023 11:22:58 +0000 Subject: [PATCH 20/35] Making use of titlecase library in graph labels to preserve acronyms. --- post-processing/post_processing.py | 8 +++++--- pyproject.toml | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 8bc37d10..bff9cde7 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -18,6 +18,7 @@ from bokeh.palettes import viridis from bokeh.plotting import figure, output_file, save from bokeh.transform import factor_cmap +from titlecase import titlecase class PostProcessing: @@ -588,15 +589,16 @@ def get_axis_info(df: pd.DataFrame, axis, series_filters): scaling_column = axis["scaling"]["column"]["name"] series_index = axis["scaling"]["column"].get("series") x_value = axis["scaling"]["column"].get("x_value") - series_col = "{0} {1}".format(series_filters[series_index][2], scaling_column) \ + # FIXME: make scaling label more clear + series_col = "{0} in {1}".format(series_filters[series_index][2], scaling_column) \ if series_index is not None else scaling_column scaling = "{0} {1}".format(x_value, series_col) if x_value else series_col else: scaling = str(axis["scaling"].get("custom")) # determine axis label - label = "{0}{1}{2}".format(col_name.replace("_", " ").title(), - " Scaled by {0}".format(scaling.replace("_", " ").title()) if scaling else "", + label = "{0}{1}{2}".format(titlecase(col_name.replace("_", " ")), + titlecase(" Scaled by {0}".format(scaling.replace("_", " "))) if scaling else "", " ({0})".format(units) if units else "") return col_name, label diff --git a/pyproject.toml b/pyproject.toml index d82eb9ec..7360d45b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ test = [ post-processing = [ "pandas >= 2.0.1", "bokeh >= 3.2.0", + "titlecase >= 2.4.1", ] [tool.setuptools_scm] 
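To see why the patch above swaps `str.title()` for the `titlecase` library when building axis labels (a sketch assuming a label assembled from the example config's column names; not code from the patch):

```python
# Sketch: str.title() capitalises every word and mangles acronyms,
# while titlecase leaves all-caps words intact.
from titlecase import titlecase

label = "flops_value scaled by OMP_NUM_THREADS".replace("_", " ")
print(label.title())     # Flops Value Scaled By Omp Num Threads  (acronym lost)
print(titlecase(label))  # Flops Value Scaled by OMP NUM THREADS  (acronym kept)
```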
From c9ae3f29acf4ba4a2367f7fb0873a0fd8cf8d59c Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 1 Dec 2023 13:10:10 +0000 Subject: [PATCH 21/35] Fixed simple categorical x-axis sorting. --- post-processing/post_processing.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index bff9cde7..a8553d40 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -201,20 +201,13 @@ def run_post_processing(self, log_path, config): else: df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) - # FIXME: sorted dataframe doesn't translate to sorted bokeh graph - if config["x_axis"].get("sorting"): - ascending = None - if config["x_axis"]["sorting"] == "ascending": - ascending = True - elif config["x_axis"]["sorting"] == "descending": - ascending = False - if ascending is not None: - # sort x values - df.sort_values([config["x_axis"]["value"]], ascending=ascending, inplace=True, ignore_index=True) - # NOTE: currently assuming there can only be one series column - if series_columns: - # sort series column - df.sort_values(series_columns[0], ascending=ascending, inplace=True, ignore_index=True) + # sort series in ascending order + # NOTE: currently assuming there can only be one series column + if series_columns: + # NOTE: don't use ignore_index=True, this results in unexpected behaviour + df.sort_values(series_columns[0], inplace=True) + # reset index after sorting + df.index = range(len(df.index)) print("Selected dataframe:") print(df[columns][mask]) @@ -280,6 +273,11 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype) else ""))], formatters={"@{0}_mean".format(y_column) : "printf"})) + # sort x-axis values in ascending order (otherwise default sort is descending) + if x_axis.get("sort"): + if x_axis["sort"] == "ascending": + plot.x_range.factors = sorted(plot.x_range.factors, key=lambda x: x[0], reverse=True) + # create legend outside plot plot.add_layout(Legend(), "right") # automatically base bar colouring on last group column From b2d7ad9e574a2790e30ea80243060a33f677ac72 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 1 Dec 2023 13:16:30 +0000 Subject: [PATCH 22/35] Added note on sorting categorical x-axis. --- post-processing/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/post-processing/README.md b/post-processing/README.md index f20b5ad5..d288cb93 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -43,6 +43,7 @@ Before running post-processing, create a config file including all necessary inf - `value` - Axis data points. Specified with a column name. - `units` - Axis units. Specified either with a column name or a custom label (may be null). - `scaling` - (Optional.) Scale axis values by either a column or a custom value. + - `sort` - (Optional.) Sort categorical x-axis in ascending order (otherwise values are sorted in descending order by default). - `filters` - (Optional.) Filter data rows based on specified conditions. (Specify an empty list if no filters are required.) - `and` - Filter mask is determined from a logical AND of conditions in list. - `or` - Filter mask is determined from a logical OR of conditions in list. 
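The sorting fix in the patch above hinges on one detail worth spelling out (a sketch, not code from the patch): bokeh treats categorical factors as strings, so sorting them directly orders numeric values lexicographically.

```python
# Sketch: lexicographic vs numeric ordering of stringified factors.
factors = ["16", "2", "4"]       # x-axis values after casting to str
print(sorted(factors))           # ['16', '2', '4'] -- wrong for numbers
print(sorted(factors, key=int))  # ['2', '4', '16'] -- numeric order
```

The patches that follow generalise the `key=int` idea by round-tripping each factor through `pd.Series(x, dtype=x_col_dtype)`, which works for any stored column dtype.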
@@ -62,6 +63,7 @@ x_axis: value: "x_axis_col" units: custom: "unit_label" + sort: "ascending" y_axis: value: "y_axis_col" From 562f4aac2e47207ce30ddff9dd1f9f5cc29525ad Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 1 Dec 2023 13:30:48 +0000 Subject: [PATCH 23/35] Fixed stray missing detail in unit test. --- post-processing/test_post_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index a4a8c7b3..7638388c 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -263,7 +263,7 @@ def test_high_level_script(run_sombrero): # check correct custom scaling dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"custom": 2}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}}) # check flops values are halved compared to previous df - assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all() + assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all() # check expected failure from scaling by incorrect column type try: From 4ecbb82fd13d0f64425bd1902422355cc6bab064 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 4 Dec 2023 15:05:27 +0000 Subject: [PATCH 24/35] Updated dataframe sorting and fixed scaling mismatch by sorting before scaling. --- post-processing/post_processing.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index a8553d40..e45df0ea 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -171,6 +171,14 @@ def run_post_processing(self, log_path, config): if num_filtered_rows > num_x_data_points: raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask]) + sorting_columns = [config["x_axis"]["value"]] + # sort x-axis values and series in ascending order + if series_columns: + # NOTE: currently assuming there can only be one series column + sorting_columns.append(series_columns[0]) + # sorting here is necessary to ensure correct scaling alignment + df.sort_values(sorting_columns, inplace=True, ignore_index=True) + scaling_column = None scaling_series_mask = None scaling_x_value_mask = None @@ -201,14 +209,6 @@ def run_post_processing(self, log_path, config): else: df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) - # sort series in ascending order - # NOTE: currently assuming there can only be one series column - if series_columns: - # NOTE: don't use ignore_index=True, this results in unexpected behaviour - df.sort_values(series_columns[0], inplace=True) - # reset index after sorting - df.index = range(len(df.index)) - print("Selected dataframe:") print(df[columns][mask]) @@ -386,7 +386,7 @@ def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, scalin scaling_mask &= scaling_x_value_mask scaling_val = scaling_column[scaling_mask].iloc[0] if len(scaling_column[scaling_mask]) == 1 \ - else scaling_column[scaling_mask].values + 
else scaling_column[scaling_mask].values # FIXME: add a check that the masked scaling column has the same number of values # as the masked df (unless there is only one scaling value) From e10e5be425df19119958e33a3cff498a2aa7efba Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 4 Dec 2023 15:09:50 +0000 Subject: [PATCH 25/35] Updated x-axis sorting to work as expected for non-string values. --- post-processing/post_processing.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index e45df0ea..770e50f9 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -243,6 +243,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): for f in series_filters: if f[0] not in groups: groups.append(f[0]) + # keep original x-axis dtype for sorting + x_col_dtype = df[x_column].dtype # all x-axis data treated as categorical for g in groups: df[g] = df[g].astype(str) @@ -274,9 +276,13 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): formatters={"@{0}_mean".format(y_column) : "printf"})) # sort x-axis values in ascending order (otherwise default sort is descending) + reverse = False if x_axis.get("sort"): if x_axis["sort"] == "ascending": - plot.x_range.factors = sorted(plot.x_range.factors, key=lambda x: x[0], reverse=True) + reverse = True + plot.x_range.factors = sorted(plot.x_range.factors, + key=lambda x: pd.Series(x[0], dtype=x_col_dtype).iloc[0], + reverse=reverse) # create legend outside plot plot.add_layout(Legend(), "right") From 5073413f2979304fb4139853751afd9969eedfb0 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 4 Dec 2023 15:36:04 +0000 Subject: [PATCH 26/35] Fixed sorting for graphs without series. --- post-processing/post_processing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 770e50f9..ceee66b9 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -280,9 +280,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): if x_axis.get("sort"): if x_axis["sort"] == "ascending": reverse = True - plot.x_range.factors = sorted(plot.x_range.factors, - key=lambda x: pd.Series(x[0], dtype=x_col_dtype).iloc[0], - reverse=reverse) + plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, + key=lambda x: pd.Series(x[0] if len(groups) > 1 else x, + dtype=x_col_dtype).iloc[0]) # create legend outside plot plot.add_layout(Legend(), "right") From 385e1553dba68286b1a40a649af7a92174180f5a Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 5 Dec 2023 16:49:15 +0000 Subject: [PATCH 27/35] Moved sorting to not interfere with filter mask. 
--- post-processing/post_processing.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index ceee66b9..4b960428 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -147,6 +147,14 @@ def run_post_processing(self, log_path, config): else: raise KeyError("Could not find user-specified type for column", col) + sorting_columns = [config["x_axis"]["value"]] + # sort x-axis values and series in ascending order + if series_columns: + # NOTE: currently assuming there can only be one series column + sorting_columns.append(series_columns[0]) + # sorting here is necessary to ensure correct filtering + scaling alignment + df.sort_values(sorting_columns, inplace=True, ignore_index=True) + mask = pd.Series(df.index.notnull()) # filter rows if and_filters: @@ -171,14 +179,6 @@ def run_post_processing(self, log_path, config): if num_filtered_rows > num_x_data_points: raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask]) - sorting_columns = [config["x_axis"]["value"]] - # sort x-axis values and series in ascending order - if series_columns: - # NOTE: currently assuming there can only be one series column - sorting_columns.append(series_columns[0]) - # sorting here is necessary to ensure correct scaling alignment - df.sort_values(sorting_columns, inplace=True, ignore_index=True) - scaling_column = None scaling_series_mask = None scaling_x_value_mask = None @@ -582,8 +582,8 @@ def get_axis_info(df: pd.DataFrame, axis, series_filters): if axis.get("units").get("column"): unit_set = set(df[axis["units"]["column"]].dropna()) # check all rows have the same units - if len(unit_set) != 1: - raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) + #if len(unit_set) != 1: + # raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) units = next(iter(unit_set)) # get scaling information From e9521084c13905dfd133c8a0eb502029a7f7521f Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 5 Dec 2023 16:55:23 +0000 Subject: [PATCH 28/35] Changed default categorical x-axis sort from descending to ascending. --- post-processing/README.md | 4 ++-- post-processing/post_processing.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index d288cb93..3a9cee50 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -43,7 +43,7 @@ Before running post-processing, create a config file including all necessary inf - `value` - Axis data points. Specified with a column name. - `units` - Axis units. Specified either with a column name or a custom label (may be null). - `scaling` - (Optional.) Scale axis values by either a column or a custom value. - - `sort` - (Optional.) Sort categorical x-axis in ascending order (otherwise values are sorted in descending order by default). + - `sort` - (Optional.) Sort categorical x-axis in descending order (otherwise values are sorted in ascending order by default). - `filters` - (Optional.) Filter data rows based on specified conditions. (Specify an empty list if no filters are required.) - `and` - Filter mask is determined from a logical AND of conditions in list. - `or` - Filter mask is determined from a logical OR of conditions in list. 
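The reason PATCH 27 above moves `sort_values` ahead of mask construction can be shown in a few lines (a sketch, not code from the patch): a boolean mask built before an index-resetting sort silently re-targets different rows.

```python
# Sketch: sorting with ignore_index=True invalidates a pre-built mask.
import pandas as pd

df = pd.DataFrame({"tasks": [2, 1], "flops_value": [20.0, 10.0]})
mask = df["tasks"] == 2                          # intended: the tasks == 2 row
df = df.sort_values("tasks", ignore_index=True)  # rows swap, index is reset
print(df[mask])                                  # now selects the tasks == 1 row
```

Sorting first means every mask that follows is built against the final row order.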
@@ -63,7 +63,7 @@ x_axis: value: "x_axis_col" units: custom: "unit_label" - sort: "ascending" + sort: "descending" y_axis: value: "y_axis_col" diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 4b960428..1cc13f6e 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -275,11 +275,11 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype) else ""))], formatters={"@{0}_mean".format(y_column) : "printf"})) - # sort x-axis values in ascending order (otherwise default sort is descending) - reverse = False + # sort x-axis values in descending order (otherwise default sort is ascending) + reverse = True if x_axis.get("sort"): - if x_axis["sort"] == "ascending": - reverse = True + if x_axis["sort"] == "descending": + reverse = False plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, key=lambda x: pd.Series(x[0] if len(groups) > 1 else x, dtype=x_col_dtype).iloc[0]) From 697d4766afba9c11e9129459fc1a44f92bac29e2 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 5 Dec 2023 17:04:51 +0000 Subject: [PATCH 29/35] Adjusted graph colour sorting. --- post-processing/post_processing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 1cc13f6e..dbe37990 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -245,6 +245,7 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): groups.append(f[0]) # keep original x-axis dtype for sorting x_col_dtype = df[x_column].dtype + last_group_dtype = df[groups[-1]].dtype # all x-axis data treated as categorical for g in groups: df[g] = df[g].astype(str) @@ -287,11 +288,13 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # create legend outside plot plot.add_layout(Legend(), "right") # automatically base bar colouring on last group column - colour_factors = sorted(df[groups[-1]].unique()) + colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), + dtype=last_group_dtype))] # divide and assign colours index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), factors=colour_factors, start=len(groups)-1, end=len(groups)) # add legend labels to data source data_source = ColumnDataSource(grouped_df).data + # FIXME: attempt to adjust legend label sorting to match new colouring legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "), group[-1] if len(groups) > 1 else group) for group in data_source[index_group_col]] From 32d4334be392812d0b97b7c8181daf8cdff968c0 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 8 Dec 2023 14:39:55 +0000 Subject: [PATCH 30/35] Adjusted legend label sorting + fixed default data sorting order. 
--- post-processing/post_processing.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index dbe37990..908f81a6 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -277,16 +277,14 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): formatters={"@{0}_mean".format(y_column) : "printf"})) # sort x-axis values in descending order (otherwise default sort is ascending) - reverse = True + reverse = False if x_axis.get("sort"): if x_axis["sort"] == "descending": - reverse = False + reverse = True plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, key=lambda x: pd.Series(x[0] if len(groups) > 1 else x, dtype=x_col_dtype).iloc[0]) - # create legend outside plot - plot.add_layout(Legend(), "right") # automatically base bar colouring on last group column colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), dtype=last_group_dtype))] @@ -294,14 +292,15 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), factors=colour_factors, start=len(groups)-1, end=len(groups)) # add legend labels to data source data_source = ColumnDataSource(grouped_df).data - # FIXME: attempt to adjust legend label sorting to match new colouring legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "), group[-1] if len(groups) > 1 else group) for group in data_source[index_group_col]] data_source["legend_labels"] = legend_labels + # create legend outside plot + plot.add_layout(Legend(), "right") # add bars - plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source, line_color="white", fill_color=index_cmap, legend_field="legend_labels", hover_alpha=0.9) + plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source, line_color="white", fill_color=index_cmap, legend_group="legend_labels", hover_alpha=0.9) # add labels plot.xaxis.axis_label = x_label plot.yaxis.axis_label = y_label @@ -311,6 +310,14 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # adjust font size plot.title.text_font_size = "15pt" + # get label values with their original dtype + label_values = [pd.Series(x.label.value.split("=")[1].strip(), dtype=last_group_dtype).iloc[0] + for x in plot.legend[0].items] + # sort legend items (order determined by x-axis sort) + sorted_legend_items = [x[1] for x in sorted(zip(label_values, plot.legend[0].items), + reverse=reverse, key=lambda x: x[0])] + plot.legend[0].items = sorted_legend_items + # save to file save(plot) From 0a653758715757ce42fe58c99728cdb7c98dab23 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 8 Dec 2023 15:15:28 +0000 Subject: [PATCH 31/35] Style fixes (trimming long lines) + restored accidentally removed units check. --- post-processing/post_processing.py | 119 +++++++++++++++++++---------- 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 908f81a6..85fd2cc1 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -28,7 +28,8 @@ def __init__(self, debug=False, verbose=False): def run_post_processing(self, log_path, config): """ - Return a dataframe containing the information passed to a plotting script and produce relevant graphs. 
+ Return a dataframe containing the information passed to a plotting script + and produce relevant graphs. Args: log_path: str, path to a log file or a directory containing log files. @@ -42,12 +43,14 @@ def run_post_processing(self, log_path, config): raise RuntimeError("Perflog file name provided should have a .log extension.") log_files = [log_path] elif os.path.isdir(log_path): - log_files_temp = [os.path.join(root, file) for root, _, files in os.walk(log_path) for file in files] + log_files_temp = [os.path.join(root, file) for root, _, files in os.walk(log_path) + for file in files] for file in log_files_temp: if os.path.splitext(file)[1] == ".log": log_files.append(file) if len(log_files) == 0: - raise RuntimeError("No perflogs found in this path. Perflogs should have a .log extension.") + raise RuntimeError( + "No perflogs found in this path. Perflogs should have a .log extension.") else: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), log_path) @@ -65,7 +68,8 @@ def run_post_processing(self, log_path, config): df = pd.concat([df, temp], ignore_index=True) except KeyError as e: if self.debug: - print("Discarding %s:" %os.path.basename(file), type(e).__name__ + ":", e.args[0], e.args[1]) + print("Discarding %s:" %os.path.basename(file), + type(e).__name__ + ":", e.args[0], e.args[1]) print("") if df.empty: raise FileNotFoundError(errno.ENOENT, "Could not find a valid perflog in path", log_path) @@ -83,7 +87,8 @@ def run_post_processing(self, log_path, config): series_filters = [[s[0], "==", s[1]] for s in series] # check acceptable number of series if len(set(series_columns)) > 1: - raise RuntimeError("Currently supporting grouping of series by only one column. Please use a single column name in your series configuration.") + raise RuntimeError("Currently supporting grouping of series by only one column. \ + Please use a single column name in your series configuration.") # add series columns to dataframe column list for c in series_columns: if c not in columns: @@ -130,13 +135,14 @@ def run_post_processing(self, log_path, config): conversion_type = "float64" elif pd.api.types.is_integer_dtype(conversion_type): # all integers treated as Int64 (nullable) - # note: default pandas integer type is int64 (not nullable) + # NOTE: default pandas integer type is int64 (not nullable) conversion_type = "Int64" elif pd.api.types.is_datetime64_any_dtype(conversion_type): # all datetimes treated as datetime64[ns] (nullable) conversion_type = "datetime64[ns]" else: - raise RuntimeError("Unsupported user-specified type '{0}' for column '{1}'.".format(conversion_type, col)) + raise RuntimeError("Unsupported user-specified type '{0}' for column '{1}'." 
\ + .format(conversion_type, col)) # skip type conversion if column is already the desired type if conversion_type == df[col].dtype: @@ -177,7 +183,9 @@ def run_post_processing(self, log_path, config): num_x_data_points = series_combinations * len(set(df[config["x_axis"]["value"]][mask])) # check expected number of rows if num_filtered_rows > num_x_data_points: - raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask]) + raise RuntimeError("Unexpected number of rows ({0}) does not match \ + number of unique x-axis values per series ({1})" \ + .format(num_filtered_rows, num_x_data_points), df[columns][mask]) scaling_column = None scaling_series_mask = None @@ -191,29 +199,37 @@ def run_post_processing(self, log_path, config): scaling_column = df[config["y_axis"]["scaling"]["column"]["name"]].copy() # get mask of scaling series if config["y_axis"]["scaling"]["column"].get("series") is not None: - scaling_series_mask = self.row_filter(series_filters[config["y_axis"]["scaling"]["column"]["series"]], df) + scaling_series_mask = self.row_filter( + series_filters[config["y_axis"]["scaling"]["column"]["series"]], df) # get mask of scaling x-value if config["y_axis"]["scaling"]["column"].get("x_value"): - scaling_x_value_mask = df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"] + scaling_x_value_mask = ( + df[config["x_axis"]["value"]] == config["y_axis"]["scaling"]["column"]["x_value"]) # check custom value is not zero elif not config["y_axis"]["scaling"].get("custom"): - raise RuntimeError("Invalid custom scaling value (cannot divide by {0}).".format(config["y_axis"]["scaling"].get("custom"))) + raise RuntimeError("Invalid custom scaling value (cannot divide by {0})." 
\ + .format(config["y_axis"]["scaling"].get("custom"))) # apply data transformation per series if series_filters: for f in series_filters: m = self.row_filter(f, df) - df[mask & m] = self.transform_axis(df[mask & m], mask & m, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) + df[mask & m] = self.transform_axis( + df[mask & m], mask & m, config["y_axis"], scaling_column, + scaling_series_mask, scaling_x_value_mask) # apply data transformation to all data else: - df[mask] = self.transform_axis(df[mask], mask, config["y_axis"], scaling_column, scaling_series_mask, scaling_x_value_mask) + df[mask] = self.transform_axis( + df[mask], mask, config["y_axis"], scaling_column, + scaling_series_mask, scaling_x_value_mask) print("Selected dataframe:") print(df[columns][mask]) # call a plotting script - self.plot_generic(config["title"], df[columns][mask], config["x_axis"], config["y_axis"], series_filters) + self.plot_generic( + config["title"], df[columns][mask], config["x_axis"], config["y_axis"], series_filters) if self.debug & self.verbose: print("") @@ -252,7 +268,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # combine group names for later plotting with groupby index_group_col = "_".join(groups) # group by group names (or just x-axis if no other groups are present) - grouped_df = df.groupby(x_column, sort=False) if len(groups) == 1 else df.groupby(groups, sort=False) + grouped_df = df.groupby(x_column, sort=False) if len(groups) == 1 \ + else df.groupby(groups, sort=False) if self.debug: print("") @@ -267,13 +284,17 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): else math.ceil(np.nanmax(df[y_column])*1.2) # create html file to store plot in - output_file(filename=os.path.join(Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title) + output_file(filename=os.path.join( + Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title) # create plot - plot = figure(x_range=grouped_df, y_range=(min_y, max_y), title=title, width=800, toolbar_location="above") + plot = figure(x_range=grouped_df, y_range=(min_y, max_y), title=title, + width=800, toolbar_location="above") # configure tooltip - plot.add_tools(HoverTool(tooltips=[(y_label, "@{0}_mean".format(y_column) - + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype) else ""))], + plot.add_tools(HoverTool(tooltips= + [(y_label, "@{0}_mean".format(y_column) + + ("{%0.2f}" if pd.api.types.is_float_dtype(df[y_column].dtype) + else ""))], formatters={"@{0}_mean".format(y_column) : "printf"})) # sort x-axis values in descending order (otherwise default sort is ascending) @@ -289,7 +310,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), dtype=last_group_dtype))] # divide and assign colours - index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), factors=colour_factors, start=len(groups)-1, end=len(groups)) + index_cmap = factor_cmap(index_group_col, palette=viridis(len(colour_factors)), + factors=colour_factors, start=len(groups)-1, end=len(groups)) # add legend labels to data source data_source = ColumnDataSource(grouped_df).data legend_labels = ["{0} = {1}".format(groups[-1].replace("_", " "), @@ -300,7 +322,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): # create legend outside plot plot.add_layout(Legend(), "right") # add bars 
- plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source, line_color="white", fill_color=index_cmap, legend_group="legend_labels", hover_alpha=0.9) + plot.vbar(x=index_group_col, top="{0}_mean".format(y_column), width=0.9, source=data_source, + line_color="white", fill_color=index_cmap, legend_group="legend_labels", hover_alpha=0.9) # add labels plot.xaxis.axis_label = x_label plot.yaxis.axis_label = y_label @@ -333,7 +356,8 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): def row_filter(self, filter, df: pd.DataFrame): """ - Return a dataframe mask based on a filter condition. The filter is a list that contains a column name, an operator, and a value (e.g. ["flops_value", ">=", 1.0]). + Return a dataframe mask based on a filter condition. The filter is a list that + contains a column name, an operator, and a value (e.g. ["flops_value", ">=", 1.0]). Args: filter: list, a condition based on which a dataframe is filtered. @@ -368,7 +392,8 @@ def row_filter(self, filter, df: pd.DataFrame): return mask - def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, scaling_series_mask, scaling_x_value_mask): + def transform_axis(self, df: pd.DataFrame, df_mask, axis, scaling_column, + scaling_series_mask, scaling_x_value_mask): """ Divide axis values by specified values and reflect this change in the dataframe. @@ -427,18 +452,25 @@ def read_args(): Return parsed command line arguments. """ - parser = argparse.ArgumentParser(description="Plot benchmark data. At least one perflog must be supplied.") + parser = argparse.ArgumentParser(description="Plot benchmark data. \ + At least one perflog must be supplied.") # required positional arguments (log path, config path) - parser.add_argument("log_path", type=str, help="path to a perflog file or a directory containing perflog files") - parser.add_argument("config_path", type=str, help="path to a configuration file specifying what to plot") + parser.add_argument("log_path", type=str, + help="path to a perflog file or a directory containing perflog files") + parser.add_argument("config_path", type=str, + help="path to a configuration file specifying what to plot") # optional argument (plot type) - parser.add_argument("-p", "--plot_type", type=str, default="generic", help="type of plot to be generated (default: 'generic')") + parser.add_argument("-p", "--plot_type", type=str, default="generic", + help="type of plot to be generated (default: 'generic')") # info dump flags - parser.add_argument("-d", "--debug", action="store_true", help="debug flag for printing additional information") - parser.add_argument("-v", "--verbose", action="store_true", help="verbose flag for printing more debug information (must be used in conjunction with the debug flag)") + parser.add_argument("-d", "--debug", action="store_true", + help="debug flag for printing additional information") + parser.add_argument("-v", "--verbose", action="store_true", + help="verbose flag for printing more debug information \ + (must be used in conjunction with the debug flag)") return parser.parse_args() @@ -483,9 +515,11 @@ def read_config(path): # check series length if config.get("series") is None: - raise KeyError("Missing series information (specify an empty list [] if there is only one series).") + raise KeyError( + "Missing series information (specify an empty list [] if there is only one series).") if len(config["series"]) == 1: - raise KeyError("Number of series must be >= 2 (specify an empty list [] if 
there is only one series).") + raise KeyError( + "Number of series must be >= 2 (specify an empty list [] if there is only one series).") # check filters are present if not config.get("filters"): @@ -509,7 +543,8 @@ def read_perflog(path): Args: path: str, path to log file. - NB: This currently depends on having a non-default handlers_perflog.filelog.format in reframe's configuration. See code. + NB: This currently depends on having a non-default handlers_perflog.filelog.format + in reframe's configuration. See code. The returned dataframe will have columns for all fields in a performance log record except display name, extra resources, and env vars. Display name will be broken up @@ -522,7 +557,8 @@ def read_perflog(path): REQUIRED_LOG_FIELDS = ["job_completion_time", r"\w+_value$", r"\w+_unit$", "display_name"] # look for required column matches - required_field_matches = [len(list(filter(re.compile(rexpr).match, df.columns))) > 0 for rexpr in REQUIRED_LOG_FIELDS] + required_field_matches = [len(list(filter(re.compile(rexpr).match, df.columns))) > 0 + for rexpr in REQUIRED_LOG_FIELDS] # check all required columns are present if False in required_field_matches: raise KeyError("Perflog missing one or more required fields", REQUIRED_LOG_FIELDS) @@ -549,10 +585,13 @@ def get_display_name_info(display_name): """ - Return a tuple containing the test name and a dictionary of parameter names and their values from the given input string. The parameter dictionary may be empty if no parameters are present. + Return a tuple containing the test name and a dictionary of parameter names + and their values from the given input string. The parameter dictionary may be empty + if no parameters are present. Args: - display_name: str, expecting a format of <test_name> followed by zero or more %<param>=<value> pairs. + display_name: str, expecting a format of <test_name> followed by zero or more + %<param>=<value> pairs. """ split_display_name = display_name.split(" %") @@ -578,7 +617,8 @@ def insert_key_cols(df: pd.DataFrame, index, results): def get_axis_info(df: pd.DataFrame, axis, series_filters): """ - Return the column name and label for a given axis. If a column name is supplied as units information, the actual units will be extracted from a dataframe. + Return the column name and label for a given axis. If a column name is supplied as + units information, the actual units will be extracted from a dataframe. Args: df: dataframe, data to plot. @@ -592,8 +632,8 @@ def get_axis_info(df: pd.DataFrame, axis, series_filters): if axis.get("units").get("column"): unit_set = set(df[axis["units"]["column"]].dropna()) # check all rows have the same units -#if len(unit_set) != 1: -# raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) + if len(unit_set) != 1: + raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set)) units = next(iter(unit_set)) # get scaling information @@ -612,7 +652,8 @@ # determine axis label label = "{0}{1}{2}".format(titlecase(col_name.replace("_", " ")), - titlecase(" Scaled by {0}".format(scaling.replace("_", " "))) if scaling else "", + titlecase(" Scaled by {0}".format(scaling.replace("_", " "))) + if scaling else "", " ({0})".format(units) if units else "") return col_name, label From 28deebde93815c69a0b61fc198a21e1e86c7ed0d Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 15 Dec 2023 17:36:13 +0000 Subject: [PATCH 32/35] Fixed grouped (x, series) sorting for non-string data.
--- post-processing/post_processing.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 85fd2cc1..739d50e6 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -302,9 +302,23 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): if x_axis.get("sort"): if x_axis["sort"] == "descending": reverse = True - plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, - key=lambda x: pd.Series(x[0] if len(groups) > 1 else x, - dtype=x_col_dtype).iloc[0]) + + if len(groups) > 1: + # sort by x-axis values first + plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, + key=lambda x: pd.Series(x[0], dtype=x_col_dtype).iloc[0]) + # get series values with their original dtype + # NOTE: currently not accounting for more than one series column + series_values = [pd.Series(x[-1], dtype=last_group_dtype).iloc[0] + for x in plot.x_range.factors] + # sort x-axis groups by series + sorted_x_items = [x[1] for x in sorted(zip(series_values, plot.x_range.factors), + reverse=reverse, key=lambda x: x[0])] + plot.x_range.factors = sorted_x_items + else: + # sort only by x-axis values + plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, + key=lambda x: pd.Series(x, dtype=x_col_dtype).iloc[0]) # automatically base bar colouring on last group column colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), @@ -452,8 +466,8 @@ def read_args(): Return parsed command line arguments. """ - parser = argparse.ArgumentParser(description="Plot benchmark data. \ - At least one perflog must be supplied.") + parser = argparse.ArgumentParser( + description="Plot benchmark data. At least one perflog must be supplied.") # required positional arguments (log path, config path) parser.add_argument("log_path", type=str, From 7e24217a1b47aa0ecc47cb8fd223443b662c503e Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Fri, 15 Dec 2023 18:22:33 +0000 Subject: [PATCH 33/35] Adjusted grouped (x, series) sorting to ensure series sorting is secondary to x-value sorting. 
--- post-processing/post_processing.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 739d50e6..2379b4e4 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -303,22 +303,20 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters): if x_axis["sort"] == "descending": reverse = True + # sort x-axis groups by series first if len(groups) > 1: - # sort by x-axis values first - plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, - key=lambda x: pd.Series(x[0], dtype=x_col_dtype).iloc[0]) # get series values with their original dtype # NOTE: currently not accounting for more than one series column series_values = [pd.Series(x[-1], dtype=last_group_dtype).iloc[0] for x in plot.x_range.factors] - # sort x-axis groups by series sorted_x_items = [x[1] for x in sorted(zip(series_values, plot.x_range.factors), reverse=reverse, key=lambda x: x[0])] plot.x_range.factors = sorted_x_items - else: - # sort only by x-axis values - plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, - key=lambda x: pd.Series(x, dtype=x_col_dtype).iloc[0]) + + # sort by x-axis values + plot.x_range.factors = sorted(plot.x_range.factors, reverse=reverse, + key=lambda x: pd.Series(x[0] if len(groups) > 1 else x, + dtype=x_col_dtype).iloc[0]) # automatically base bar colouring on last group column colour_factors = [str(x) for x in sorted(pd.Series(df[groups[-1]].unique(), From 7119ce7a5168013207cf14a7936417de3079ab68 Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Mon, 18 Dec 2023 18:41:25 +0000 Subject: [PATCH 34/35] Added some README clarifications + a config template. --- post-processing/README.md | 112 ++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 30 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index 3a9cee50..b72fb0e5 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -36,7 +36,7 @@ Run `post_processing.py -h` for more information (including debugging flags). ### Configuration Structure -Before running post-processing, create a config file including all necessary information for graph generation (you must specify at least plot title, x-axis, y-axis, and column types). See below for an example and some clarifying notes. +Before running post-processing, create a config file including all necessary information for graph generation (you must specify at least plot title, x-axis, y-axis, and column types). See below for a template, an example, and some clarifying notes. - `title` - Plot title. - `x_axis`, `y_axis` - Axis information. @@ -54,8 +54,58 @@ Before running post-processing, create a config file including all necessary inf - `column_types` - Pandas dtype for each relevant column (axes, units, filters, series). Specified with a dictionary. - `Accepted types: "str"/"string"/"object", "int"/"int64", "float"/"float64", "datetime"/"datetime64"` +### Complete Config Template + +This template includes all possible config fields, some of which are optional or mutually exclusive (e.g. `column` and `custom`). 
+ +```yaml +title: + +x_axis: + value: + # use one of 'column' or 'custom' + units: + column: + custom: + # optional (default: ascending) + sort: "descending" + +y_axis: + value: + # use one of 'column' or 'custom' + units: + column: + custom: + # optional (default: no data transformation) + # use one of 'column' or 'custom' + scaling: + column: + name: + series: + x_value: + custom: + +# optional (default: include all data) +# entry format: [<column_name>, <operator>, <value>] +# accepted operators: ==, !=, <, >, <=, >= +filters: + and: + or: + +# optional (default: no x-axis grouping, one plot per graph) +# entry format: [<column_name>, <value>] +series: + +# include types for each column that is used in the config +# accepted types: string/object, int, float, datetime +column_types: + <column_name>: <column_type> +``` + ### Example Config +This example more accurately illustrates what an actual config file may look like. + ```yaml title: "Plot Title" @@ -91,7 +141,7 @@ column_types: series_col: "str" ``` -#### A Note on X-axis Grouping +#### X-axis Grouping The settings above will produce a graph that will have its x-axis data grouped based on the values in `x_axis_col` and `series_col`. (`Note: only groupings with one series column are currently supported.`) If we imagine that `x_axis_col` has two unique values, `"x_val_1"` and `"x_val_2"`, there will be four groups (and four bars) along the x-axis: @@ -100,7 +150,7 @@ The settings above will produce a graph that will have its x-axis data grouped b - (`x_val_2`, `series_val_1`) - (`x_val_2`, `series_val_2`) -#### A Note on Scaling +#### Scaling When axis values are scaled, they are all divided by a number or a list of numbers. If using more than one number for scaling, the length of the list must match the length of the axis column being scaled. (`Note: scaling is currently only supported for y-axis data, as graphs with a non-categorical x-axis are still a work in progress.`) @@ -119,12 +169,12 @@ y_axis: In the snippet above, all y-axis values are to be divided by 2. -|y_axis_col|scaled_y_axis_col| -|-|-| -|3.2|3.2 / 2.0 = 1.6| -|5.4|5.4 / 2.0 = 2.7| -|2.4|2.4 / 2.0 = 1.2| -|5.0|5.0 / 2.0 = 2.5| +|y_axis_col||scaled_y_axis_col| +|-|-|-| +|3.2|3.2 / 2.0 =|1.6| +|5.4|5.4 / 2.0 =|2.7| +|2.4|2.4 / 2.0 =|1.2| +|5.0|5.0 / 2.0 =|2.5| **Column Scaling** @@ -142,12 +192,12 @@ y_axis: In the snippet above, all y-axis values are to be divided by the corresponding values in the scaling column. -|y_axis_col|scaling_col|scaled_y_axis_col| -|-|-|-| -|3.2|**`1.6`**|3.2 / 1.6 = 2.0| -|5.4|**`2.0`**|5.4 / 2.0 = 2.7| -|2.4|**`0.6`**|2.4 / 0.6 = 4.0| -|5.0|**`2.5`**|5.0 / 2.5 = 2.0| +|y_axis_col|scaling_col||scaled_y_axis_col| +|-|-|-|-| +|3.2|**`1.6`**|3.2 / 1.6 =|2.0| +|5.4|**`2.0`**|5.4 / 2.0 =|2.7| +|2.4|**`0.6`**|2.4 / 0.6 =|4.0| +|5.0|**`2.5`**|5.0 / 2.5 =|2.0| **Series Scaling** @@ -168,12 +218,12 @@ y_axis: In the snippet above, all y-axis values are to be split by series and divided by the corresponding values in the scaling series.
-|y_axis_col|scaling_col|series_col|scaled_y_axis_col| -|-|-|-|-| -|3.2|**`1.6`**|`series_val_1`|3.2 / 1.6 = 2.0| -|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 = 2.7| -|2.4|0.6|series_val_2|2.4 / 1.6 = 1.5| -|5.0|2.5|series_val_2|5.0 / 2.0 = 2.5| +|y_axis_col|scaling_col|series_col||scaled_y_axis_col| +|-|-|-|-|-| +|3.2|**`1.6`**|`series_val_1`|3.2 / 1.6 =|2.0| +|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 =|2.7| +|2.4|0.6|series_val_2|2.4 / 1.6 =|1.5| +|5.0|2.5|series_val_2|5.0 / 2.0 =|2.5| **Selected Value Scaling** @@ -193,25 +243,27 @@ y_axis: In the snippet above, all y-axis values are to be divided by the scaling value found by filtering the scaling column by both series and x-axis value. -|x_axis_col|y_axis_col|scaling_col|series_col|scaled_y_axis_col| -|-|-|-|-|-| -|x_val_1|3.2|1.6|series_val_1|3.2 / 2.0 = 1.6| -|`x_val_s`|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 = 2.7| -|x_val_2|2.4|0.7|series_val_2|2.4 / 2.0 = 1.2| -|x_val_s|5.0|2.5|series_val_2|5.0 / 2.0 = 2.5| +|x_axis_col|y_axis_col|scaling_col|series_col||scaled_y_axis_col| +|-|-|-|-|-|-| +|x_val_1|3.2|1.6|series_val_1|3.2 / 2.0 =|1.6| +|`x_val_s`|5.4|**`2.0`**|`series_val_1`|5.4 / 2.0 =|2.7| +|x_val_2|2.4|0.7|series_val_2|2.4 / 2.0 =|1.2| +|x_val_s|5.0|2.5|series_val_2|5.0 / 2.0 =|2.5| (`Note: if series are not present and x-axis values are all unique, it is enough to specify just the column name and x-value.`) -#### A Note on Filters +#### Filters -AND filters and OR filters are combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: +A condition list for filtering has entries in the format `[<column_name>, <operator>, <value>]`. AND filters and OR filters are combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example: - `and_filters` = `cond1`, `cond2` - `or_filters` = `cond3`, `cond4` The filters above would produce the final filter `mask` = (`cond1` AND `cond2`) AND (`cond3` OR `cond4`). -#### A Note on Column Types +#### Column Types + +Types must be specified for all columns included in the config in the format `<column_name>: <column_type>`. Accepted types include `string/object`, `int`, `float`, and `datetime`. All user-specified types are internally converted to their nullable incarnations. As such: From 49884d83a6e19cb52374c4ff0dd0bb0616ceb0bc Mon Sep 17 00:00:00 2001 From: pineapple-cat Date: Tue, 19 Dec 2023 16:26:15 +0000 Subject: [PATCH 35/35] Rehomed note on replaced reframe columns. --- post-processing/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index b72fb0e5..4d3ed4a4 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -54,6 +54,12 @@ Before running post-processing, create a config file including all necessary inf - `column_types` - Pandas dtype for each relevant column (axes, units, filters, series). Specified with a dictionary. - `Accepted types: "str"/"string"/"object", "int"/"int64", "float"/"float64", "datetime"/"datetime64"` +#### A Note on Replaced ReFrame Columns + +A perflog contains certain columns that will not be present in the DataFrame available to the graphing script. Currently, these columns are `display_name`, `extra_resources`, and `env_vars`. Removed columns should not be referenced in a plot config file. + +When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters.
This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources` and `env_vars` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents). + ### Complete Config Template This template includes all possible config fields, some of which are optional or mutually exclusive (e.g. `column` and `custom`). @@ -272,12 +278,6 @@ All user-specified types are internally converted to their nullable incarnations - Integers are treated as `Int64`. - Datetimes are treated as `datetime64[ns]`. -#### A Note on Replaced ReFrame Columns - -A perflog contains certain columns that will not be present in the DataFrame available to the graphing script. Currently, these columns are `display_name`, `extra_resources`, and `env_vars`. Removed columns should not be referenced in a plot config file. - -When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources` and `env_vars` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents). - ### Future Development The post-processing capabilities are still a work in progress. Some upcoming developments:
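As a closing illustration of the `display_name` parsing that the rehomed note describes (a sketch inferred from the docstring and the `split(" %")` call shown in PATCH 31, not the verbatim `get_display_name_info` implementation):

```python
# Sketch: split "<test_name> %<param>=<value> ..." into a test name
# and a parameter dictionary, as described in the README note.
def parse_display_name(display_name: str):
    parts = display_name.split(" %")
    test_name = parts[0]
    params = dict(p.split("=", 1) for p in parts[1:])
    return test_name, params

print(parse_display_name("SombreroBenchmark %tasks=2 %cpus_per_task=1"))
# ('SombreroBenchmark', {'tasks': '2', 'cpus_per_task': '1'})
```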