Fix wrong display in statistics because the DataFrame was not sorted (#574)

* Fix wrong display in statistics because the DataFrame was not sorted
* Keep fixing ordering of the data
* Refactor query to /statistics
* Do not drop indexing field when necessary
* Fill NaN when generating histograms
* Fill NaN when generating histograms -- II
* Actually apply arguments...
* Make the field on which to sort and index more generic
astrolabsoftware · Jan 22, 2024 · 7264225 · 7264225
1 parent c79c0dd
commit 7264225
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 113 deletions.
diff --git a/apps/plotting.py b/apps/plotting.py
@@ -40,6 +40,7 @@
 from apps.utils import sine_fit
 from apps.utils import class_colors
 from apps.utils import request_api
+from apps.utils import query_and_order_statistics
 from apps.statistics import dic_names
 
 from fink_utils.sso.spins import func_hg, func_hg12, func_hg1g2, func_hg1g2_with_spin
@@ -3610,18 +3611,7 @@ def plot_stat_evolution(param_name, switch):
     else:
         param_name_ = param_name
 
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': param_name_
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
+    pdf = query_and_order_statistics(columns=param_name_)
     pdf = pdf.fillna(0)
 
     pdf['date'] = [
@@ -3963,21 +3953,7 @@ def make_daily_card(pdf, color, linecolor, title, description, height='12pc', sc
 def hist_sci_raw(dropdown_days):
     """ Make an histogram
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': 'basic:raw,basic:sci'
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
-    # Remove hbase specific fields
-    if 'key:time' in pdf.columns:
-        pdf = pdf.drop(columns=['key:time'])
+    pdf = query_and_order_statistics(columns='basic:raw,basic:sci')
 
     if dropdown_days is None or dropdown_days == '':
         dropdown_days = pdf.index[-1]
@@ -3992,7 +3968,7 @@ def hist_sci_raw(dropdown_days):
     """
 
     card = make_daily_card(
-        pdf, color='rgb(158,202,225)', linecolor='rgb(8,48,107)', title='Quality cuts', description=description, norm=norm
+        pdf[['Received', 'Processed']], color='rgb(158,202,225)', linecolor='rgb(8,48,107)', title='Quality cuts', description=description, norm=norm
     )
 
     return card
@@ -4004,21 +3980,8 @@ def hist_sci_raw(dropdown_days):
 def hist_catalogued(dropdown_days):
     """ Make an histogram
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': 'class:Solar System MPC,class:simbad_tot,basic:sci'
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
-    # Remove hbase specific fields
-    if 'key:time' in pdf.columns:
-        pdf = pdf.drop(columns=['key:time'])
+    pdf = query_and_order_statistics(columns='class:Solar System MPC,class:simbad_tot,basic:sci')
+    pdf = pdf.fillna(0)
 
     pdf = pdf.rename(columns={'class:Solar System MPC': 'MPC', 'class:simbad_tot': 'SIMBAD'})
 
@@ -4047,24 +4010,9 @@ def hist_catalogued(dropdown_days):
 def hist_classified(dropdown_days):
     """ Make an histogram
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': 'basic:sci,class:Unknown'
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
-    # Remove hbase specific fields
-    if 'key:time' in pdf.columns:
-        pdf = pdf.drop(columns=['key:time'])
+    pdf = query_and_order_statistics(columns='basic:sci,class:Unknown')
+    pdf = pdf.fillna(0)
 
-    # In case class:unknown contains NaN (see https://github.com/astrolabsoftware/fink-utils/issues/25)
-    pdf['class:Unknown'] = pdf['class:Unknown'].replace(np.nan, 0)
     pdf['Classified'] = pdf['basic:sci'].astype(int) - pdf['class:Unknown'].astype(int)
     pdf = pdf.rename(columns={'class:Unknown': 'Unclassified'})
 
@@ -4093,21 +4041,7 @@ def hist_classified(dropdown_days):
 def hist_candidates(dropdown_days):
     """ Make an histogram
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': 'class:Solar System candidate,class:SN candidate,class:Early SN Ia candidate,class:Kilonova candidate'
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
-    # Remove hbase specific fields
-    if 'key:time' in pdf.columns:
-        pdf = pdf.drop(columns=['key:time'])
+    pdf = query_and_order_statistics(columns='class:Solar System candidate,class:SN candidate,class:Early SN Ia candidate,class:Kilonova candidate')
 
     pdf = pdf.rename(
         columns={
@@ -4139,21 +4073,7 @@ def hist_candidates(dropdown_days):
 def fields_exposures(dropdown_days):
     """ Make an histogram
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': '*'
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key')
-    # Remove hbase specific fields
-    if 'key:time' in pdf.columns:
-        pdf = pdf.drop(columns=['key:time'])
+    pdf = query_and_order_statistics(columns='*')
 
     to_drop = [i for i in pdf.columns if i.startswith('basic:')]
     pdf = pdf.drop(columns=to_drop)

diff --git a/apps/statistics.py b/apps/statistics.py
@@ -19,6 +19,7 @@
 from app import app
 from apps.utils import loading
 from apps.utils import request_api
+from apps.utils import query_and_order_statistics
 
 import numpy as np
 import pandas as pd
@@ -119,20 +120,11 @@ def store_stat_query(name):
 
     https://dash.plotly.com/sharing-data-between-callbacks
     """
-    cols = 'basic:raw,basic:sci,basic:fields,basic:exposures,class:Unknown'
-
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': cols
-        }
+    pdf = query_and_order_statistics(
+        columns='basic:raw,basic:sci,basic:fields,basic:exposures,class:Unknown',
+        drop=False
     )
 
-    pdf = pd.read_json(r)
-    pdf = pdf.set_index('key:key', drop=False)
-
     return pdf.to_json()
 
 @app.callback(
@@ -327,17 +319,7 @@ def daily_stats():
 def generate_night_list():
     """ Generate the list of available nights (last night first)
     """
-    r = request_api(
-        '/api/v1/statistics',
-        json={
-            'date': '',
-            'output-format': 'json',
-            'columns': ''
-        }
-    )
-
-    # Format output in a DataFrame
-    pdf = pd.read_json(r)
+    pdf = query_and_order_statistics(columns='', drop=False)
 
     labels = list(pdf['key:key'].apply(lambda x: x[4:8] + '-' + x[8:10] + '-' + x[10:12]))
 

diff --git a/apps/utils.py b/apps/utils.py
@@ -177,6 +177,47 @@ def format_hbase_output(
 
     return pdfs
 
+def query_and_order_statistics(date='', columns='*', index_by='key:key', drop=True):
+    """ Query /api/v1/statistics, then sort and index the resulting DataFrame
+
+    Parameters
+    ----------
+    date: str, optional
+        Value sent as the `date` field of the request (default is '')
+    columns: str
+        Column names, as a single comma-separated string (default is '*')
+    index_by: str, optional
+        Column name used to sort and then index the DataFrame (default is key:key)
+    drop: bool, optional
+        If True, drop original column used to index the dataframe.
+        Default is True.
+
+    Returns
+    ----------
+    pdf: Pandas DataFrame
+        DataFrame with statistics data, without the `key:time` column, sorted by
+        ascending `index_by` (default key: oldest on top, most recent at bottom)
+    """
+    r = request_api(
+        '/api/v1/statistics',
+        json={
+            'date': date,
+            'columns': columns,
+            'output-format': 'json'
+        }
+    )
+
+    # Format output in a DataFrame; sort before indexing so rows follow `index_by` order
+    pdf = pd.read_json(r)
+    pdf = pdf.sort_values(index_by)
+    pdf = pdf.set_index(index_by, drop=drop)
+
+    # Remove hbase specific fields
+    if 'key:time' in pdf.columns:
+        pdf = pdf.drop(columns=['key:time'])
+
+    return pdf
+
 def isoify_time(t):
     try:
         tt = Time(t)