diff --git a/duck/data_stats.py b/duck/data_stats.py index a259c22..fd24d8b 100644 --- a/duck/data_stats.py +++ b/duck/data_stats.py @@ -3,6 +3,11 @@ import numpy as np import yaml import pathlib +import io +import base64 + +# import matplotlib +# matplotlib.use('agg') def get_stats(data): return { @@ -20,6 +25,7 @@ def __init__(self, output_dir): else: self.output_dir = pathlib.Path(output_dir) self.info = None + self.histogram = None def gen_data_stats(self, filename, var, nbins=100): @@ -41,14 +47,28 @@ def gen_data_stats(self, filename, var, nbins=100): mratio = 1 - mratio / (nlon * nlat) # TODO: It would be great to store the distribution graph in a database - if False: + if True: + plt.close() plt.imshow(hist, aspect="auto", origin='lower', extent=[vstats["min"], vstats["max"], 0, ntime], cmap="gist_ncar") ax = plt.gca() ax.grid(color='gray', linestyle='-.', linewidth=1) plt.xlabel(var) plt.ylabel("Timesteps") - outfile = self.output_dir / "histime.png" - plt.savefig(outfile.as_posix(), dpi=50) + # outfile = self.output_dir / "histogram.png" + # print(f"histogram: {outfile}") + # plt.savefig(outfile.as_posix(), dpi=50) + # store as base64 + # Save the plot to a BytesIO object + buffer = io.BytesIO() + plt.savefig(buffer, format='png') + buffer.seek(0) + + # Encode the BytesIO object as base64 + base64_encoded_plot = base64.b64encode(buffer.read()).decode('utf-8') + print(f"{base64_encoded_plot}") + self.histogram = base64_encoded_plot + # close plot + plt.close() # The following information should be stored in a database attrs = {} @@ -64,18 +84,11 @@ def gen_data_stats(self, filename, var, nbins=100): self.info["Vars"] = list(dict(ds.variables).keys()) self.info["Vstats"] = vstats self.info["Mstats"] = get_stats(mratio) - # self.info["Hist"] = hist - print(self.info) + # print(self.info) def write_json(self): outfile = self.output_dir / "info.txt" with open(outfile.as_posix(), "w") as f: yaml.dump(self.info, f) return outfile - - def write_png(self): - outfile = self.output_dir / "histime.png" - return outfile - - - + \ No newline at end of file diff --git a/duck/processes/wps_clintai.py b/duck/processes/wps_clintai.py index 8b2c03a..8cc2da1 100644 --- a/duck/processes/wps_clintai.py +++ b/duck/processes/wps_clintai.py @@ -182,6 +182,7 @@ def _handler(self, request, response): "mean": stats["mean"], "stddev": stats["std"], "info": json.dumps(datastats.info, separators=(',', ':')), + "histogram": datastats.histogram, }, [datasets[0].as_posix()], [f"{datasets[0].as_posix()}_infilled.nc"], diff --git a/duck/processes/wps_dashboard.py b/duck/processes/wps_dashboard.py index 784fa1d..98d1275 100644 --- a/duck/processes/wps_dashboard.py +++ b/duck/processes/wps_dashboard.py @@ -53,7 +53,7 @@ def _handler(self, request, response): def write_html(self, df, workdir): # Convert the DataFrame to an HTML table - html_table = df.to_html(index=False) + html_table = df.to_html(escape=False, index=False) # Define the HTML template html_template = f""" diff --git a/duck/provenance.py b/duck/provenance.py index 5200fac..63243ac 100644 --- a/duck/provenance.py +++ b/duck/provenance.py @@ -95,6 +95,7 @@ def add_operator(self, operator, parameters, collection, output, start, end): "stddev", "mean", "info", + "histogram", "dataset_name", "variable_name", ]: diff --git a/duck/query.py b/duck/query.py index f0f953d..e9e3d8f 100644 --- a/duck/query.py +++ b/duck/query.py @@ -1,10 +1,22 @@ from duck.db import GraphDB import pandas as pd +import base64 +from PIL import Image +import io +import json +def display_image(base64_image): + img_data = base64.b64decode(base64_image) + img = Image.open(io.BytesIO(img_data)) + return ''.format(base64_image) + +def display_json(data): + content = json.loads(data) + return f"
{content}
" def query(): query_str = """ - SELECT ?process ?dataset ?variable ?startTime ?endTime ?input ?output ?min ?max ?mean ?stddev ?info + SELECT ?process ?dataset ?variable ?startTime ?endTime ?input ?output ?min ?max ?mean ?stddev ?info ?histogram WHERE { ?exec rdf:type provone:Execution ; rdfs:label ?process ; @@ -16,7 +28,8 @@ def query(): clint:max ?max ; clint:mean ?mean ; clint:stddev ?stddev ; - clint:info ?info . + clint:info ?info ; + clint:histogram ?histogram . ?input rdf:type prov:Entity . @@ -43,8 +56,9 @@ def query(): max = row.max.value mean = row.mean.value stddev = row.stddev.value - info = row.info.value - data.append({ + info = json.loads(row.info.value) + histogram = row.histogram.value + entry = { "Process": process, "Dataset": dataset, "Variable": variable, @@ -52,11 +66,14 @@ def query(): "End Time": end_time, "Input": input, "Output": output, - "Min": min, - "Max": max, - "Mean": mean, - "StdDev": stddev, - "Info": info, - }) + # "Min": min, + # "Max": max, + # "Mean": mean, + # "StdDev": stddev, + "Histogram": display_image(histogram), + } + for key in info: + entry[key] = info[key] + data.append(entry) df = pd.DataFrame(data) return df