Decoding/inconsistencies (#582)

* Proper conversion of columns
* Typo
* Add support for boolean columns
* More conversion
* Fix decoding for statistics
* Update type conversion with python 3.11 capabilities
* Fix typo
* Fix typo
astrolabsoftware · Feb 6, 2024 · 1b10f84 · 1b10f84
1 parent 81d3c3d
commit 1b10f84
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 9 deletions.
diff --git a/apps/api/utils.py b/apps/api/utils.py
@@ -39,6 +39,7 @@
 from apps.utils import format_hbase_output
 from apps.utils import extract_cutouts
 from apps.utils import hbase_type_converter
+from apps.utils import convert_datatype
 from apps.utils import isoify_time
 from apps.utils import hbase_to_dict
 
@@ -613,9 +614,11 @@ def return_ssocand_pdf(payload: dict) -> pd.DataFrame:
         pdf = pdf.drop(columns=['key:time'])
 
     # Type conversion
-    pdf = pdf.astype(
-        {i: hbase_type_converter[schema_client.type(i)] for i in pdf.columns}
-    )
+    for col in pdf.columns:
+        pdf[col] = convert_datatype(
+            pdf[col],
+            hbase_type_converter[schema_client.type(col)]
+        )
 
     return pdf
 
@@ -1082,6 +1085,9 @@ def return_statistics_pdf(payload: dict) -> pd.DataFrame:
         )
         pdf = pd.DataFrame.from_dict(hbase_to_dict(results), orient='index')
 
+        # See https://github.com/astrolabsoftware/fink-science-portal/issues/579
+        pdf = pdf.replace(regex={r'^\x00.*$': 0})
+
     client.close()
 
     return pdf
@@ -1615,8 +1621,11 @@ def download_euclid_data(payload: dict) -> pd.DataFrame:
 
     # Type conversion
     schema = client.schema()
-    pdf = pdf.astype(
-        {i: hbase_type_converter[schema.type(i)] for i in pdf.columns})
+    for col in pdf.columns:
+        pdf[col] = convert_datatype(
+            pdf[col],
+            hbase_type_converter[schema.type(col)]
+        )
 
     client.close()
 

diff --git a/apps/utils.py b/apps/utils.py
@@ -50,13 +50,16 @@
 simbad_types = get_simbad_labels('old_and_new')
 simbad_types = sorted(simbad_types, key=lambda s: s.lower())
 
+# For int we use `Int64` due to the presence of NaN
+# See https://pandas.pydata.org/pandas-docs/version/1.3/user_guide/integer_na.html
 hbase_type_converter = {
-    'integer': int,
+    'integer': 'Int64',
     'long': int,
     'float': float,
     'double': float,
     'string': str,
-    'fits/image': str
+    'fits/image': str,
+    'boolean': bool
 }
 
 class_colors = {
@@ -89,6 +92,19 @@ def hbase_to_dict(hbase_output):
 
     return optimized
 
+def convert_datatype(series: pd.Series, type_: type) -> pd.Series:
+    """ Cast a Series built from HBase data to the given type
+
+    Parameters
+    ----------
+    series: pd.Series
+        a column of the DataFrame
+    type_: type or str
+        Python type (int, str, float, bool) or pandas dtype alias ('Int64')
+    """
+
+    return series.astype(type_)
+
 def format_hbase_output(
         hbase_output, schema_client,
         group_alerts: bool, truncated: bool = False,
@@ -118,8 +134,11 @@ def format_hbase_output(
             pdfs = pdfs.drop(columns=colname)
 
     # Type conversion
-    pdfs = pdfs.astype(
-        {i: hbase_type_converter[schema_client.type(i)] for i in pdfs.columns})
+    for col in pdfs.columns:
+        pdfs[col] = convert_datatype(
+            pdfs[col],
+            hbase_type_converter[schema_client.type(col)]
+        )
 
     # cast 'nan' into `[]` for easier json decoding
     for col in ['d:lc_features_g', 'd:lc_features_r']: