From 699676f9c8f83c924a1946888f405916ef60efa0 Mon Sep 17 00:00:00 2001
From: Eli Jones <biblicabeebli@gmail.com>
Date: Tue, 12 Dec 2023 01:51:46 -0500
Subject: [PATCH] [staged-updates] The forest task progress works again. It
 still serves no purpose.

---
 database/forest_models.py |  2 +-
 database/study_models.py  |  1 +
 pages/forest_pages.py     | 51 +++++++++++++++++++++++++++------------
 urls.py                   |  2 +-
 4 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/database/forest_models.py b/database/forest_models.py
index 13293e34c..9a515ae36 100644
--- a/database/forest_models.py
+++ b/database/forest_models.py
@@ -49,7 +49,7 @@ class ForestTask(TimestampedModel):
     process_end_time = models.DateTimeField(null=True, blank=True)
     status = models.TextField(choices=ForestTaskStatus.choices())
     stacktrace = models.TextField(null=True, blank=True, default=None)
-    # Whether or not there was any data output by Forest (None indicates unknown)
+    # Whether or not there was any data output by Forest (None means construct_summary_statistics errored)
     forest_output_exists = models.BooleanField(null=True, blank=True)
     
     # S3 file paths
diff --git a/database/study_models.py b/database/study_models.py
index a670227a2..42336a0d3 100644
--- a/database/study_models.py
+++ b/database/study_models.py
@@ -147,6 +147,7 @@ def _get_data_time_bin(
             only_after_epoch: if True, will filter results only for datetimes after the Unix epoch
                               (1970-01-01T00:00:00Z)
             only_before_now: if True, will filter results only for datetimes before now """
+        
         time_bins: QuerySet[datetime] = self.chunk_registries.values_list("time_bin", flat=True)
         comparator = operator.lt if earliest else operator.gt
         now = timezone.now()
diff --git a/pages/forest_pages.py b/pages/forest_pages.py
index 3c6bd0abc..0f6ecbb4a 100644
--- a/pages/forest_pages.py
+++ b/pages/forest_pages.py
@@ -57,42 +57,62 @@
 @authenticate_researcher_study_access
 @forest_enabled
 def forest_tasks_progress(request: ResearcherRequest, study_id=None):
-    study: Study = Study.objects.get(pk=study_id)
+    # generates a chart of study analysis progress logs
+    study = Study.objects.get(pk=study_id)
     participants: ParticipantQuerySet = Participant.objects.filter(study=study_id)
     
-    # generate chart of study analysis progress logs
+    # later tasks will overwrite earlier tasks - this is intentional.
+    # number of forest tasks shouldn't be the bottleneck here.
     tasks = ForestTask.objects.filter(participant__in=participants).order_by("created_on")
     
+    # these are quite optimized buuuuut it is still slow.
     start_date = (study.get_earliest_data_time_bin() or study.created_on).date()
     end_date = (study.get_latest_data_time_bin() or timezone.now()).date()
     
     params = {}
-    results = defaultdict(lambda: "--")
-    # this code simultaneously builds up the chart of most recent forest results for date ranges
-    # by participant and tree, and tracks the metadata
+    results = defaultdict(lambda: "-")
+    chart_elements_lookup = {False: "N", None: "?"}
+    # this loop builds the chart of whether there are forest results for date ranges
+    # per-participant- -and-tree. The tasks query is ordered by creation date, so later tasks will
+    # overwrite earlier tasks. a "-" means no task has been run, "N" means a task ran but there was
+    # no output, "?" means there the code ran successfully but there was an error reading in the
+    # data so we MIGHT have data, and "Y" means there was definitely data.
     for task in tasks:
         for a_date in daterange(task.data_date_start, task.data_date_end, inclusive=True):
-            results[(task.participant_id, task.forest_tree, a_date)] = task
-            params[(task.participant_id, task.forest_tree, a_date)] = task.safe_unpickle_parameters_as_string()
+            key = (task.participant_id, task.forest_tree, a_date)
+            in_table = results[key]  # will populates with a "-" on first access
+            output_exists = task.forest_output_exists
+            if in_table == "Y" or output_exists:  # always force "Y"
+                results[key] = "Y"
+            elif in_table != "?":
+                # We have some nice constraints here:
+                # 1. output_exists is False or None, so chart_elements_lookup[false or None]
+                #    can only return "N" or "?".
+                # 2. The chart's current field is "-", "N", or "?"
+                # 3. If in_table is a ? we can just skip it because we cannot upgrade from ? to Y here.
+                #  So, we just skip if we are already at ? in the chart element, and otherwise we do the lookup.
+                results[key] = chart_elements_lookup[output_exists]
+            params[key] = task.safe_unpickle_parameters_as_string()
     
-    # generate the date range for charting
+    # generate the date range for the chart, we need it many times.
     dates = list(daterange(start_date, end_date, inclusive=True))
-    
     chart = []
     for participant in participants:
-        for tree in ForestTree.values():
-            row = [participant.patient_id, tree] + \
-                [results[(participant.id, tree, date)] for date in dates]
+        for tree_name in ForestTree.values():
+            # we need to make a list of lists with the participant and tree name
+            row = [participant.patient_id, tree_name] + \
+                [results[(participant.id, tree_name, date)] for date in dates]
             chart.append(row)
     
     # ensure that within each tree, only a single set of param values are used (only the most recent runs
     # are considered, and unsuccessful runs are assumed to invalidate old runs, clearing params)
     params_conflict = False
-    for tree in {k[1] for k in params.keys()}:
-        if len({m for k, m in params.items() if m is not None and k[1] == tree}) > 1:
+    for tree_name in {k[1] for k in params.keys()}:
+        if len({m for k, m in params.items() if m is not None and k[1] == tree_name}) > 1:
             params_conflict = True
             break
     
+    chart_json = orjson.dumps(chart).decode()  # may be huge, but orjson is very fast.
     return render(
         request,
         'forest/forest_tasks_progress.html',  # has been renamed internally because this is imprecise.
@@ -103,7 +123,7 @@ def forest_tasks_progress(request: ResearcherRequest, study_id=None):
             params_conflict=params_conflict,
             start_date=start_date,
             end_date=end_date,
-            chart=chart  # this uses the jinja safe filter and should never involve user input
+            chart=chart_json  # this uses the jinja safe filter and should never involve user input
         )
     )
 
@@ -157,6 +177,7 @@ def task_log(request: ResearcherRequest, study_id=None):
         task_dict["forest_tree_display"] = task_dict.pop("forest_tree").title()
         task_dict["created_on_display"] = task_dict.pop("created_on").strftime(DEV_TIME_FORMAT)
         task_dict["forest_output_exists_display"] = display_true(task_dict["forest_output_exists"])
+        
         # dates/times that require safety
         task_dict["process_end_time"] = task_dict["process_end_time"].strftime(DEV_TIME_FORMAT) \
              if task_dict["process_end_time"] else None
diff --git a/urls.py b/urls.py
index 80fb1450d..f10d8d5ac 100644
--- a/urls.py
+++ b/urls.py
@@ -219,8 +219,8 @@ def path(
 path('graph', mobile_pages.fetch_graph)
 
 # forest pages
-path('studies/<str:study_id>/forest/progress', forest_pages.forest_tasks_progress, login_redirect=SAFE)
 path('studies/<str:study_id>/forest/tasks/create', forest_pages.create_tasks)
+path('studies/<str:study_id>/forest/progress', forest_pages.forest_tasks_progress, login_redirect=SAFE)
 path("studies/<str:study_id>/forest/tasks/<str:forest_task_external_id>/cancel", forest_pages.cancel_task)
 path('studies/<str:study_id>/forest/tasks', forest_pages.task_log, login_redirect=SAFE)
 path('forest/tasks/download', forest_pages.download_task_log)