Skip to content

Commit

Permalink
Decoding/inconsistencies (#582)
Browse files Browse the repository at this point in the history
* Proper conversion of columns

* Typo

* Add support for boolean columns

* More conversion

* Fix decoding for statistics

* Update type conversion with Python 3.11 capabilities

* Fix typo

* Fix typo
  • Loading branch information
JulienPeloton authored Feb 6, 2024
1 parent 81d3c3d commit 1b10f84
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 9 deletions.
19 changes: 14 additions & 5 deletions apps/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from apps.utils import format_hbase_output
from apps.utils import extract_cutouts
from apps.utils import hbase_type_converter
from apps.utils import convert_datatype
from apps.utils import isoify_time
from apps.utils import hbase_to_dict

Expand Down Expand Up @@ -613,9 +614,11 @@ def return_ssocand_pdf(payload: dict) -> pd.DataFrame:
pdf = pdf.drop(columns=['key:time'])

# Type conversion
pdf = pdf.astype(
{i: hbase_type_converter[schema_client.type(i)] for i in pdf.columns}
)
for col in pdf.columns:
pdf[col] = convert_datatype(
pdf[col],
hbase_type_converter[schema_client.type(col)]
)

return pdf

Expand Down Expand Up @@ -1082,6 +1085,9 @@ def return_statistics_pdf(payload: dict) -> pd.DataFrame:
)
pdf = pd.DataFrame.from_dict(hbase_to_dict(results), orient='index')

# See https://github.com/astrolabsoftware/fink-science-portal/issues/579
pdf = pdf.replace(regex={r'^\x00.*$': 0})

client.close()

return pdf
Expand Down Expand Up @@ -1615,8 +1621,11 @@ def download_euclid_data(payload: dict) -> pd.DataFrame:

# Type conversion
schema = client.schema()
pdf = pdf.astype(
{i: hbase_type_converter[schema.type(i)] for i in pdf.columns})
for col in pdf.columns:
pdf[col] = convert_datatype(
pdf[col],
hbase_type_converter[schema.type(col)]
)

client.close()

Expand Down
27 changes: 23 additions & 4 deletions apps/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,16 @@
simbad_types = get_simbad_labels('old_and_new')
simbad_types = sorted(simbad_types, key=lambda s: s.lower())

# For integers we use the nullable `Int64` dtype, because a column containing NaN cannot be cast to plain `int`
# See https://pandas.pydata.org/pandas-docs/version/1.3/user_guide/integer_na.html
hbase_type_converter = {
'integer': int,
'integer': 'Int64',
'long': int,
'float': float,
'double': float,
'string': str,
'fits/image': str
'fits/image': str,
'boolean': bool
}

class_colors = {
Expand Down Expand Up @@ -89,6 +92,19 @@ def hbase_to_dict(hbase_output):

return optimized

def convert_datatype(series: pd.Series, type_: type) -> pd.Series:
    """ Convert Series from HBase data with proper type

    Text columns converted to `bool` are parsed rather than tested for
    truthiness: a plain `astype(bool)` turns every non-empty string into
    True, so the string 'false' would be decoded as True.

    Parameters
    ----------
    series: pd.Series
        a column of the DataFrame
    type_: type
        Target type: a Python built-in type (int, str, float, bool)
        or a pandas dtype alias given as a string (e.g. 'Int64')

    Returns
    ----------
    out: pd.Series
        The column converted to the requested type
    """
    holds_text = (
        pd.api.types.is_object_dtype(series.dtype)
        or pd.api.types.is_string_dtype(series.dtype)
    )

    if type_ is bool and holds_text:
        def _to_bool(value):
            # Only the literal 'false' (any case, surrounding blanks
            # ignored) is special-cased; everything else keeps the
            # truthiness that `astype(bool)` would have applied.
            if isinstance(value, str) and value.strip().lower() == 'false':
                return False
            return bool(value)

        return series.map(_to_bool).astype(bool)

    return series.astype(type_)

def format_hbase_output(
hbase_output, schema_client,
group_alerts: bool, truncated: bool = False,
Expand Down Expand Up @@ -118,8 +134,11 @@ def format_hbase_output(
pdfs = pdfs.drop(columns=colname)

# Type conversion
pdfs = pdfs.astype(
{i: hbase_type_converter[schema_client.type(i)] for i in pdfs.columns})
for col in pdfs.columns:
pdfs[col] = convert_datatype(
pdfs[col],
hbase_type_converter[schema_client.type(col)]
)

# cast 'nan' into `[]` for easier json decoding
for col in ['d:lc_features_g', 'd:lc_features_r']:
Expand Down

0 comments on commit 1b10f84

Please sign in to comment.