Add data transformations for post-processing plot data #226

Merged: 35 commits, merged Dec 19, 2023. (Diff below shows changes from 9 commits.)

Commits
706816f
Separated row filters into OR and AND categories.
pineapple-cat Oct 23, 2023
9a777d3
Updated existing tests to account for filtering change.
pineapple-cat Oct 23, 2023
a60bad2
Updated filter documentation.
pineapple-cat Oct 24, 2023
44c0b82
Slight filter mask code adjustment.
pineapple-cat Oct 24, 2023
8fe70c2
Added OR filter functionality unit test.
pineapple-cat Oct 25, 2023
c7c92e4
Removed series implementation information from filter documentation.
pineapple-cat Nov 2, 2023
380d2bc
Added ability to scale axis values by a column.
pineapple-cat Nov 6, 2023
35c211e
Added column scaling unit tests.
pineapple-cat Nov 6, 2023
a78b813
Added preliminary functionality to scale by specific value in a given…
pineapple-cat Nov 6, 2023
ca2deeb
Added ability to scale axis values by one custom value.
pineapple-cat Nov 8, 2023
1c47a1a
Added custom value scaling unit tests.
pineapple-cat Nov 8, 2023
4daef87
Added preliminary functionality to scale by a series.
pineapple-cat Nov 9, 2023
2d6db1e
Minor fixes + making axis label clearer.
pineapple-cat Nov 10, 2023
b5f24d4
Bug fix for legend labels of plots without series.
pineapple-cat Nov 10, 2023
2576475
Updated read_config errors.
pineapple-cat Nov 17, 2023
d9fde62
Added check to ensure custom scaling value cannot be zero.
pineapple-cat Nov 17, 2023
a1a4b96
Added initial attempt at sorting categorical x-axis. FIXME: dataframe…
pineapple-cat Nov 20, 2023
556dca2
Added more data transform unit tests.
pineapple-cat Nov 30, 2023
608fd07
Updated documentation to explain scaling and possible data transforma…
pineapple-cat Dec 1, 2023
b0ec251
Making use of titlecase library in graph labels to preserve acronyms.
pineapple-cat Dec 1, 2023
c9ae3f2
Fixed simple categorical x-axis sorting.
pineapple-cat Dec 1, 2023
b2d7ad9
Added note on sorting categorical x-axis.
pineapple-cat Dec 1, 2023
562f4aa
Fixed stray missing detail in unit test.
pineapple-cat Dec 1, 2023
4ecbb82
Updated dataframe sorting and fixed scaling mismatch by sorting befor…
pineapple-cat Dec 4, 2023
e10e5be
Updated x-axis sorting to work as expected for non-string values.
pineapple-cat Dec 4, 2023
5073413
Fixed sorting for graphs without series.
pineapple-cat Dec 4, 2023
385e155
Moved sorting to not interfere with filter mask.
pineapple-cat Dec 5, 2023
e952108
Changed default categorical x-axis sort from descending to ascending.
pineapple-cat Dec 5, 2023
697d476
Adjusted graph colour sorting.
pineapple-cat Dec 5, 2023
32d4334
Adjusted legend label sorting + fixed default data sorting order.
pineapple-cat Dec 8, 2023
0a65375
Style fixes (trimming long lines) + restored accidentally removed uni…
pineapple-cat Dec 8, 2023
28deebd
Fixed grouped (x, series) sorting for non-string data.
pineapple-cat Dec 15, 2023
7e24217
Adjusted grouped (x, series) sorting to ensure series sorting is seco…
pineapple-cat Dec 15, 2023
7119ce7
Added some README clarifications + a config template.
pineapple-cat Dec 18, 2023
49884d8
Rehomed note on replaced reframe columns.
pineapple-cat Dec 19, 2023
15 changes: 14 additions & 1 deletion post-processing/README.md
@@ -41,6 +41,8 @@ Before running post-processing, create a config file including all necessary inf
- `value` - Axis data points. Specified with a column name.
- `units` - Axis units. Specified either with a column name or a custom label (may be null).
- `filters` - (Optional.) Filter data rows based on specified conditions. (Specify empty lists if no filters are required.)
  - `and` - Filter mask is determined from a logical AND of the conditions in the list.
  - `or` - Filter mask is determined from a logical OR of the conditions in the list.
- `Format: [column_name, operator, value]`
- `Accepted operators: "==", "!=", "<", ">", "<=", ">="`
- `series` - (Optional.) Display several plots in the same graph and group x-axis data by specified column values. (Specify an empty list if there is only one series.)
@@ -63,7 +65,9 @@ y_axis:
units:
column: "unit_col"

filters: [["filter_col_1", "<=", filter_val_1], ["filter_col_2", "!=", filter_val_2]]
filters:
and: [["filter_col_1", "<=", filter_val_1], ["filter_col_2", "!=", filter_val_2]]
or: []

series: [["series_col", "series_val_1"], ["series_col", "series_val_2"]]

@@ -85,6 +89,15 @@ The settings above will produce a graph that will have its x-axis data grouped b
- (`x_val_2`, `series_val_1`)
- (`x_val_2`, `series_val_2`)

#### A Note on Filters

AND filters and OR filters are combined with a logical AND to produce the final filter mask applied to the DataFrame prior to graphing. For example:

- `and_filters` = `cond1`, `cond2`
- `or_filters` = `cond3`, `cond4`

The filters above would produce the final filter `mask` = (`cond1` AND `cond2`) AND (`cond3` OR `cond4`).
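For illustration, here is a minimal sketch of how this combined mask can be built with pandas, assuming two hypothetical AND conditions and two hypothetical OR conditions (the column names and data are invented):

```python
import operator as op
from functools import reduce

import pandas as pd

# Hypothetical data; cond1/cond2 form the AND group, cond3/cond4 the OR group.
df = pd.DataFrame({"tasks": [1, 2, 4], "cpus_per_task": [2, 2, 1]})
and_masks = [df["tasks"] >= 1, df["cpus_per_task"] == 2]  # cond1, cond2
or_masks = [df["tasks"] < 2, df["tasks"] > 1]             # cond3, cond4
# final mask = (cond1 AND cond2) AND (cond3 OR cond4)
mask = reduce(op.and_, and_masks) & reduce(op.or_, or_masks)
print(df[mask])  # only rows passing both groups remain
```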

#### A Note on Column Types

All user-specified types are internally converted to their nullable incarnations. As such:
96 changes: 82 additions & 14 deletions post-processing/post_processing.py
@@ -10,9 +10,10 @@
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from bokeh.models import Legend, HoverTool
from bokeh.models import HoverTool, Legend
from bokeh.models.sources import ColumnDataSource
from bokeh.palettes import viridis
from bokeh.plotting import figure, output_file, save
@@ -82,16 +83,25 @@ def run_post_processing(self, log_path, config):
# check acceptable number of series
if len(set(series_columns)) > 1:
raise RuntimeError("Currently supporting grouping of series by only one column. Please use a single column name in your series configuration.")
# add series columns to column list
# add series columns to dataframe column list
for c in series_columns:
if c not in columns:
columns.append(c)

filters = config["filters"]
and_filters = config["filters"]["and"]
or_filters = config["filters"]["or"]
# extract filter columns
filter_columns = [f[0] for f in filters]
filter_columns = [f[0] for f in and_filters] + [f[0] for f in or_filters]

# FIXME: add scaling for x-axis
scaling_columns = []
# extract scaling columns
if config["y_axis"].get("scaling"):
if config["y_axis"]["scaling"].get("column"):
scaling_columns.append(config["y_axis"]["scaling"]["column"]["name"])

# gather all relevant columns
all_columns = columns + filter_columns
all_columns = set(columns + filter_columns + scaling_columns)

invalid_columns = []
# check for invalid columns
@@ -138,12 +148,13 @@ def run_post_processing(self, log_path, config):

mask = pd.Series(df.index.notnull())
# filter rows
if filters:
mask = reduce(op.and_, (self.row_filter(f, df) for f in filters))
if and_filters:
mask = reduce(op.and_, (self.row_filter(f, df) for f in and_filters))
if or_filters:
mask &= reduce(op.or_, (self.row_filter(f, df) for f in or_filters))
# apply series filters
if series_filters:
series_mask = reduce(op.or_, (self.row_filter(f, df) for f in series_filters))
mask = mask & series_mask
mask &= reduce(op.or_, (self.row_filter(f, df) for f in series_filters))
# ensure not all rows are filtered away
if df[mask].empty:
raise pd.errors.EmptyDataError("Filtered dataframe is empty", df[mask].index)
@@ -159,6 +170,15 @@ def run_post_processing(self, log_path, config):
if num_filtered_rows > num_x_data_points:
raise RuntimeError("Unexpected number of rows ({0}) does not match number of unique x-axis values per series ({1})".format(num_filtered_rows, num_x_data_points), df[columns][mask])

# apply data transformation per series
if series_filters:
for f in series_filters:
m = self.row_filter(f, df)
df[mask & m] = self.transform_axis(df[mask & m], config["y_axis"], config["x_axis"]["value"])
# apply data transformation to all data
else:
df[mask] = self.transform_axis(df[mask], config["y_axis"], config["x_axis"]["value"])

print("Selected dataframe:")
print(df[columns][mask])

@@ -209,9 +229,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):

# adjust y-axis range
min_y = 0 if min(df[y_column]) >= 0 \
else math.floor(min(df[y_column])*1.2)
else math.floor(np.nanmin(df[y_column])*1.2)
max_y = 0 if max(df[y_column]) <= 0 \
else math.ceil(max(df[y_column])*1.2)
else math.ceil(np.nanmax(df[y_column])*1.2)

# create html file to store plot in
output_file(filename=os.path.join(Path(__file__).parent, "{0}.html".format(title.replace(" ", "_"))), title=title)
@@ -239,6 +259,9 @@ def plot_generic(self, title, df: pd.DataFrame, x_axis, y_axis, series_filters):
# add labels
plot.xaxis.axis_label = x_label
plot.yaxis.axis_label = y_label
# remove x-axis group ticks
plot.xaxis.major_tick_line_color = None
plot.xaxis.major_label_text_font_size = "0pt"
# adjust font size
plot.title.text_font_size = "15pt"

@@ -292,6 +315,40 @@ def row_filter(self, filter, df: pd.DataFrame):

return mask

def transform_axis(self, df: pd.DataFrame, axis, x_column):
"""
Divide axis values by specified values and reflect this change in the dataframe.

Args:
df: dataframe, data to plot.
axis: dict, axis column, units, and values to scale by.
x_column: string, name of column containing x-axis values.
"""

# FIXME: try to make this an in-place process
if axis.get("scaling"):
# scale by column
if axis["scaling"].get("column"):

# check types
if not pd.api.types.is_numeric_dtype(df[axis["value"]].dtype) or \
not pd.api.types.is_numeric_dtype(df[axis["scaling"]["column"]["name"]].dtype):
# both columns must be numeric
raise TypeError("Cannot scale column '{0}' of type {1} by column '{2}' of type {3}."
.format(axis["value"], df[axis["value"]].dtype,
axis["scaling"]["column"]["name"],
df[axis["scaling"]["column"]["name"]].dtype))

# scale by specific value in column
if axis["scaling"]["column"].get("x_value"):
x_value = axis["scaling"]["column"]["x_value"]
df[axis["value"]] /= df[df[x_column] == x_value][axis["scaling"]["column"]["name"]].iloc[0]
# scale by entire column
else:
df[axis["value"]] /= df[axis["scaling"]["column"]["name"]]

return df
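
# A minimal usage sketch of the scaling config this method expects (column and
# data names below are hypothetical):
#
#   y_axis = {"value": "flops_value",
#             "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}
#
# With this config, each flops_value is divided by the OMP_NUM_THREADS value in
# the same row. Adding, e.g., "x_value": 2 under "column" instead divides every
# row by the single OMP_NUM_THREADS value found where the x-axis column == 2.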

def read_args():
"""
Return parsed command line arguments.
@@ -435,7 +492,7 @@ def get_axis_info(df: pd.DataFrame, axis):

Args:
df: dataframe, data to plot.
axis: dict, axis column and units.
axis: dict, axis column, units, and values to scale by.
"""

# get column name of axis
@@ -448,9 +505,20 @@ def get_axis_info(df: pd.DataFrame, axis):
if len(unit_set) != 1:
raise RuntimeError("Unexpected number of axis unit entries {0}".format(unit_set))
units = next(iter(unit_set))

# get values to scale by
scaling = None
if axis.get("scaling"):
if axis.get("scaling").get("column"):
if axis.get("scaling").get("column").get("x_value"):
scaling = "{0} {1}".format(axis.get("scaling").get("column").get("x_value"), axis.get("scaling").get("column").get("name"))
else:
scaling = axis.get("scaling").get("column").get("name")

# determine axis label
label = "{0}{1}".format(col_name.replace("_", " ").title(),
" ({0})".format(units) if units else "")
label = "{0}{1}{2}".format(col_name.replace("_", " ").title(),
" Scaled by {0}".format(scaling.replace("_", " ").title()) if scaling else "",
" ({0})".format(units) if units else "")

return col_name, label

4 changes: 3 additions & 1 deletion post-processing/post_processing_config.yaml
@@ -19,7 +19,9 @@ y_axis:

# Optional row filters (specify an empty list [] if none are required).
# Filter format: [column_name, operator, value]
filters: [["system", "==", "default"]]
filters:
and: [["system", "==", "default"]]
or: []

# Optional setting to display several plots in the same graph.
# Number of series, if used, must be >=2 (specify an empty list [] if there is only one series).
34 changes: 25 additions & 9 deletions post-processing/test_post_processing.py
@@ -182,47 +182,47 @@ def test_high_level_script(run_sombrero):

# check expected failure from invalid (filter) column
try:
post_.run_post_processing(sombrero_log_path, {"filters": [["fake_column", "==", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}})
post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["fake_column", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}})
except KeyError as e:
assert e.args[1] == ["fake_column"]
else:
assert False

# check expected failure from invalid filter operator
try:
post_.run_post_processing(sombrero_log_path, {"filters": [["tasks", "!!", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["tasks", "!!", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
except KeyError as e:
assert e.args[1] == "!!"
else:
assert False

# check expected failure from invalid filter value type
try:
post_.run_post_processing(sombrero_log_path, {"filters": [["flops_value", ">", "v"]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["flops_value", ">", "v"]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
except ValueError:
assert True
else:
assert False

# check expected failure from filtering out every row
try:
post_.run_post_processing(sombrero_log_path, {"filters": [["tasks", ">", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
post_.run_post_processing(sombrero_log_path, {"filters": {"and": [["tasks", ">", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
except pd.errors.EmptyDataError:
assert True
else:
assert False

# check expected failure from row number vs unique x-axis value number mismatch
try:
df = post_.run_post_processing(sombrero_log_path, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
df = post_.run_post_processing(sombrero_log_path, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
except RuntimeError:
assert True
else:
assert False

# check correct display name parsing
try:
df = post_.run_post_processing(sombrero_changed_log_path, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "cpus_per_task", "units": {"column": "extra_param"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "extra_param": "int"}})
df = post_.run_post_processing(sombrero_changed_log_path, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "cpus_per_task", "units": {"column": "extra_param"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "extra_param": "int"}})
except RuntimeError as e:
# three param columns found in changed log
EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"]
@@ -232,22 +232,38 @@ def test_high_level_script(run_sombrero):
assert False

# check correct date filtering
df = post_.run_post_processing(sombrero_changed_log_path, {"title": "Title", "filters": [["job_completion_time", ">", "2000-06-01T12:30:15"]], "series": [], "x_axis": {"value": "job_completion_time", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"job_completion_time": "datetime", "flops_value": "float", "flops_unit": "str"}})
df = post_.run_post_processing(sombrero_changed_log_path, {"title": "Title", "filters": {"and": [["job_completion_time", ">", "2000-06-01T12:30:15"]], "or": []}, "series": [], "x_axis": {"value": "job_completion_time", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"job_completion_time": "datetime", "flops_value": "float", "flops_unit": "str"}})
# check returned subset is as expected
assert len(df) == 2

# check correct OR filtering
df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": [["tasks", ">", "1"], ["tasks", "<", "2"]]}, "series": [["cpus_per_task", "1"], ["cpus_per_task", "2"]], "x_axis": {"value": "cpus_per_task", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "cpus_per_task": "int", "flops_value": "float", "flops_unit": "str"}})
# check returned subset is as expected
assert len(df) == 4

# check correct scaling
dfs = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "int"}})
# check flops values are halved compared to previous df
assert (dfs["flops_value"] == df[df["cpus_per_task"] == 2]["flops_value"]/2).all()

# check expected failure from scaling by incorrect type
try:
df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [], "or": []}, "series": [["cpus_per_task", 1], ["cpus_per_task", 2]], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}, "scaling": {"column": {"name": "OMP_NUM_THREADS"}}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", "OMP_NUM_THREADS": "str"}})
except TypeError:
assert True

# check correct concatenation of two dataframes with different columns
try:
# get collated dataframe subset
df = post_.run_post_processing(Path(sombrero_log_path).parent, {"filters": [], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
df = post_.run_post_processing(Path(sombrero_log_path).parent, {"filters": {"and": [], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str"}})
except RuntimeError as e:
# dataframe has records from both files
assert len(e.args[1]) == 8
else:
assert False

# get filtered dataframe subset
df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}})
df = post_.run_post_processing(sombrero_log_path, {"title": "Title", "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], "or": []}, "series": [], "x_axis": {"value": "tasks", "units": {"custom": None}}, "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}}, "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int"}})

EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
# check returned subset is as expected