From 5b4e5ff29356421492d353010407292fb7acde77 Mon Sep 17 00:00:00 2001 From: FusRoman <46221629+FusRoman@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:56:16 +0200 Subject: [PATCH] Issue/94/fix memory lack (#95) * add the code to fix the memory issue, add a thread timer to print logs during the streaming process * add the code to fix the memory issue, add a thread timer to print logs during the streaming process * gcn stream listen now only the gw significant * move a function * broadcast variables in global context * fix globals * fix bug * fix bug * fix function issue * ci run locally * fix kwargs call * fix type problem * swap the broadcast join * Issue/96/offline refactoring (#97) * large refactoring of fink_mm * pep8 * refactor in progress * online test ok * add offline test, fix pep8 * offline test ok * move ztf_join_gcn from online to fink_mm directory, remove the old offline * fix pep8 * fix init test * fix fun_utils test * fix pep8 * fix join test * fix join test * fix distrib test * fix distrib test * fix distrib test * fix distrib test * fix distrib test * hope fix test * improve the skymap loading from hdfs, fix offline test * fix gcn_stream * fix conf file for test * fix conf file for test --- .flake8 | 2 +- fink_mm/__init__.py | 2 +- fink_mm/conf/distribute_for_test.conf | 4 +- fink_mm/conf/fink_mm.conf | 6 +- fink_mm/conf/integration.conf | 6 +- fink_mm/conftest.py | 14 +- fink_mm/distribution/distribution.py | 19 +- fink_mm/fink_mm_cli.py | 10 +- fink_mm/gcn_stream/gcn_stream.py | 2 +- fink_mm/init.py | 179 +++-- fink_mm/observatory/IceCube/IceCube.py | 2 +- fink_mm/observatory/LVK/LVK.py | 80 +- fink_mm/observatory/observatory.py | 2 +- fink_mm/offline/__init__.py | 0 fink_mm/offline/spark_offline.py | 430 ----------- fink_mm/online/__init__.py | 0 fink_mm/online/ztf_join_gcn.py | 470 ------------ ...4394-abe7-7e6b8e5195db.c000.snappy.parquet | Bin 0 -> 34248 bytes ...4394-abe7-7e6b8e5195db.c000.snappy.parquet | Bin 0 -> 34248 bytes ...4672-9953-f9522699a560.c000.snappy.parquet | Bin 0 -> 33254 bytes ...42bb-ba81-298de3eb33d3.c000.snappy.parquet | Bin 33127 -> 0 bytes ...470d-be88-97067acad74b.c000.snappy.parquet | Bin 0 -> 33254 bytes fink_mm/utils/application.py | 121 +-- fink_mm/utils/fun_utils.py | 221 +++--- fink_mm/ztf_join_gcn.py | 722 ++++++++++++++++++ scheduler/gen_avro_schema.py | 1 + setup.py | 1 + 27 files changed, 1069 insertions(+), 1225 deletions(-) delete mode 100644 fink_mm/offline/__init__.py delete mode 100644 fink_mm/offline/spark_offline.py delete mode 100644 fink_mm/online/__init__.py delete mode 100644 fink_mm/online/ztf_join_gcn.py create mode 100644 fink_mm/test/test_data/offline/year=2019/month=9/day=2/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet create mode 100644 fink_mm/test/test_data/offline/year=2019/month=9/day=3/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet create mode 100644 fink_mm/test/test_data/online/year=2019/month=09/day=02/part-00172-ec9814eb-4888-4672-9953-f9522699a560.c000.snappy.parquet delete mode 100644 fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00000-e0bdebee-12e7-42bb-ba81-298de3eb33d3.c000.snappy.parquet create mode 100644 fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00172-0f505f6a-2acf-470d-be88-97067acad74b.c000.snappy.parquet create mode 100644 fink_mm/ztf_join_gcn.py diff --git a/.flake8 b/.flake8 index 9c38ee058..2c8f24f4c 100644 --- a/.flake8 +++ b/.flake8 @@ -10,7 +10,7 @@ exclude = build, dist per-file-ignores = - 
../fink-mm/fink_mm/online/ztf_join_gcn.py:W503,E402 + ../fink-mm/fink_mm/ztf_join_gcn.py:W503,E402 ../fink-mm/fink_mm/offline/spark_offline.py:W503,W605 ../fink-mm/fink_mm/utils/fun_utils.py:F811 ../fink-mm/fink_mm/distribution/distribution.py:W503 diff --git a/fink_mm/__init__.py b/fink_mm/__init__.py index cca034f2c..9d3fe606f 100644 --- a/fink_mm/__init__.py +++ b/fink_mm/__init__.py @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.14.3" +__version__ = "0.15" __distribution_schema_version__ = "1.3" __observatory_schema_version__ = "1.1" diff --git a/fink_mm/conf/distribute_for_test.conf b/fink_mm/conf/distribute_for_test.conf index e9fd14a23..a92030230 100644 --- a/fink_mm/conf/distribute_for_test.conf +++ b/fink_mm/conf/distribute_for_test.conf @@ -26,7 +26,7 @@ hbase_catalog=/home/roman.le-montagner/fink-broker/catalogs_hbase/ztf.jd.json # port are the port where the hdfs driver listen # user are the name of the hdfs user [HDFS] -host= +host=127.0.0.1 port= user= @@ -71,7 +71,7 @@ username_writer=toto password_writer=tata [ADMIN] -verbose=False +debug=False # Healpix map resolution, better if a power of 2 NSIDE=4 diff --git a/fink_mm/conf/fink_mm.conf b/fink_mm/conf/fink_mm.conf index 9c1acbc22..ef00b0dc7 100644 --- a/fink_mm/conf/fink_mm.conf +++ b/fink_mm/conf/fink_mm.conf @@ -13,7 +13,7 @@ hdfs_gcn_storage=/user/roman.le-montagner/gcn_storage/raw # They can be in local FS (/path/ or files:///path/) or # in distributed FS (e.g. hdfs:///path/). # Be careful though to have enough disk space! -online_ztf_data_prefix=fink_mm/test/test_data/ztf_test/online +online_ztf_data_prefix=fink_mm/test/test_data/ztf_test # Prefix path on disk to save GRB join ZTF data (work for both online and offline). online_grb_data_prefix=fink_mm/test/test_output @@ -26,7 +26,7 @@ hbase_catalog=/home/roman.le-montagner/fink-broker/catalogs_hbase/ztf.jd.json # port are the port where the hdfs driver listen # user are the name of the hdfs user [HDFS] -host= +host=127.0.0.1 port= user= @@ -71,7 +71,7 @@ username_writer=toto password_writer=tata [ADMIN] -verbose=False +debug=True # Healpix map resolution, better if a power of 2 NSIDE=4 diff --git a/fink_mm/conf/integration.conf b/fink_mm/conf/integration.conf index 56755f829..67f2e3c51 100644 --- a/fink_mm/conf/integration.conf +++ b/fink_mm/conf/integration.conf @@ -13,7 +13,7 @@ hdfs_gcn_storage=fink_mm/ci_gcn_test # They can be in local FS (/path/ or files:///path/) or # in distributed FS (e.g. hdfs:///path/). # Be careful though to have enough disk space! -online_ztf_data_prefix=fink_mm/test/test_data/ztf_test/online +online_ztf_data_prefix=fink_mm/test/test_data/ztf_test # Prefix path on disk to save GRB join ZTF data (work for both online and offline). 
online_grb_data_prefix=fink_mm/ci_join_test @@ -26,7 +26,7 @@ hbase_catalog=$FINK_HOME/catalogs_hbase/ztf.jd.json # port are the port where the hdfs driver listen # user are the name of the hdfs user [HDFS] -host= +host=127.0.0.1 port= user= @@ -71,7 +71,7 @@ username_writer=toto password_writer=tata [ADMIN] -verbose=True +debug=True # Healpix map resolution, better if a power of 2 NSIDE=4 diff --git a/fink_mm/conftest.py b/fink_mm/conftest.py index 59d0e9306..af65e8176 100644 --- a/fink_mm/conftest.py +++ b/fink_mm/conftest.py @@ -166,11 +166,15 @@ def init_LVK(doctest_namespace): @pytest.fixture(autouse=True, scope="session") def init_spark(doctest_namespace): from astropy.time import Time + from fink_mm.utils.application import DataMode + import pyspark.sql.functions as sql_func online_output_tempdir = tempfile.TemporaryDirectory() doctest_namespace["online_output_tempdir"] = online_output_tempdir doctest_namespace["Time"] = Time + doctest_namespace["DataMode"] = DataMode + doctest_namespace["sql_func"] = sql_func grb_data = "fink_mm/test/test_data/gcn_test/raw/year=2019/month=09/day=03" gw_data = "fink_mm/test/test_data/S230518h_0_test" @@ -184,15 +188,15 @@ def init_spark(doctest_namespace): doctest_namespace["join_data"] = join_data doctest_namespace["alert_data"] = alert_data - ztf_datatest = "fink_mm/test/test_data/ztf_test/online" + ztf_datatest = "fink_mm/test/test_data/ztf_test" gcn_datatest = "fink_mm/test/test_data/gcn_test/raw" - join_data_test = "fink_mm/test/test_data/online" - offline_join_data_test = "fink_mm/test/test_data/offline_datatest.parquet" + online_data_test = "fink_mm/test/test_data/online" + offline_data_test = "fink_mm/test/test_data/offline" doctest_namespace["ztf_datatest"] = ztf_datatest doctest_namespace["gcn_datatest"] = gcn_datatest - doctest_namespace["join_data_test"] = join_data_test - doctest_namespace["offline_data_test"] = offline_join_data_test + doctest_namespace["online_data_test"] = online_data_test + doctest_namespace["offline_data_test"] = offline_data_test fink_home = os.environ["FINK_HOME"] hbase_catalog = fink_home + "/catalogs_hbase/ztf.jd.json" diff --git a/fink_mm/distribution/distribution.py b/fink_mm/distribution/distribution.py index 1fb93ceca..c2945ca43 100644 --- a/fink_mm/distribution/distribution.py +++ b/fink_mm/distribution/distribution.py @@ -231,7 +231,8 @@ def launch_distribution(arguments): >>> launch_distribution({ ... "--config" : "fink_mm/conf/distribute_for_test.conf", ... "--night" : "20190903", - ... "--exit_after" : 30 + ... "--exit_after" : 30, + ... "--verbose" : False ... 
}) >>> consumer = AlertConsumer(topics, myconfig) @@ -251,7 +252,7 @@ def launch_distribution(arguments): config = get_config(arguments) logger = init_logging() - verbose = return_verbose_level(config, logger) + verbose = return_verbose_level(arguments, config, logger) spark_submit = read_and_build_spark_submit(config, logger) @@ -272,6 +273,7 @@ def launch_distribution(arguments): _, _, _, + _, kafka_broker, username_writer, password_writer, @@ -297,20 +299,13 @@ def launch_distribution(arguments): external_files, ) - process = subprocess.Popen( - spark_submit, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=True, - ) + completed_process = subprocess.run(spark_submit, shell=True, capture_output=True) - stdout, stderr = process.communicate() - if process.returncode != 0: # pragma: no cover + if completed_process.returncode != 0: # pragma: no cover logger.error( "fink-mm distribution stream spark application has ended with a non-zero returncode.\ \n\t cause:\n\t\t{}\n\t\t{}\n\n\n{}\n\n".format( - stdout, stderr, spark_submit + completed_process.stdout, completed_process.stderr, spark_submit ) ) exit(1) diff --git a/fink_mm/fink_mm_cli.py b/fink_mm/fink_mm_cli.py index 284b3f7b5..bc9b55dc9 100644 --- a/fink_mm/fink_mm_cli.py +++ b/fink_mm/fink_mm_cli.py @@ -45,14 +45,16 @@ def main(): elif arguments["join_stream"]: if arguments["online"]: - from fink_mm.online.ztf_join_gcn import launch_joining_stream + from fink_mm.ztf_join_gcn import launch_join + from fink_mm.utils.application import DataMode - launch_joining_stream(arguments) + launch_join(arguments, DataMode.STREAMING) elif arguments["offline"]: - from fink_mm.offline.spark_offline import launch_offline_mode + from fink_mm.ztf_join_gcn import launch_join + from fink_mm.utils.application import DataMode - launch_offline_mode(arguments) + launch_join(arguments, DataMode.OFFLINE) elif arguments["distribute"]: from fink_mm.distribution.distribution import launch_distribution diff --git a/fink_mm/gcn_stream/gcn_stream.py b/fink_mm/gcn_stream/gcn_stream.py index e2897d7ea..39ed984bc 100644 --- a/fink_mm/gcn_stream/gcn_stream.py +++ b/fink_mm/gcn_stream/gcn_stream.py @@ -215,7 +215,7 @@ def start_gcn_stream(arguments): config = get_config(arguments) logger = init_logging() - logs = return_verbose_level(config, logger) + logs = return_verbose_level(arguments, config, logger) # keep track of the gcn update gcn_tracking = pd.DataFrame(columns=["triggerId", "triggerTimejd", "nb_received"]) diff --git a/fink_mm/init.py b/fink_mm/init.py index 5f7e52e98..7877f1c7e 100644 --- a/fink_mm/init.py +++ b/fink_mm/init.py @@ -6,49 +6,12 @@ # from importlib.resources import files from importlib_resources import files import logging -import types import pathlib +from typing import Tuple import fink_mm -def return_verbose_level(config, logger): - """ - Get the verbose level from the config file and return it. 
- - Parameters - ---------- - config : dictionnary - dictionnary containing the key values pair from the config file - logger : logging object - the logger used to print logs - - Returns - ------- - logs : boolean - if True, print the logs - - Examples - -------- - >>> c = get_config({"--config" : "fink_mm/conf/fink_mm.conf"}) - >>> logger = init_logging() - - >>> return_verbose_level(c, logger) - False - """ - try: - logs = config["ADMIN"]["verbose"] == "True" - except Exception as e: - logger.error( - "Config entry not found \n\t {}\n\tsetting verbose to True by default".format( - e - ) - ) - logs = True - - return logs - - def init_fink_mm(arguments): """ Initialise the fink_mm environment. Get the config specify by the user with the @@ -75,7 +38,7 @@ def init_fink_mm(arguments): config = get_config(arguments) logger = init_logging() - logs = return_verbose_level(config, logger) + logs = return_verbose_level(arguments, config, logger) gcn_path = config["PATH"]["online_gcn_data_prefix"] + "/raw" grb_path = config["PATH"]["online_grb_data_prefix"] + "/grb" @@ -139,6 +102,14 @@ def get_config(arguments): return config +class EnvInterpolation(configparser.BasicInterpolation): + """Interpolation which expands environment variables in values.""" + + def before_get(self, parser, section, option, value, defaults): + value = super().before_get(parser, section, option, value, defaults) + return os.path.expandvars(value) + + class CustomTZFormatter(logging.Formatter): # pragma: no cover """override logging.Formatter to use an aware datetime object""" @@ -159,15 +130,63 @@ def formatTime(self, record, datefmt=None): return s -class EnvInterpolation(configparser.BasicInterpolation): - """Interpolation which expands environment variables in values.""" +class LoggerNewLine(logging.Logger): + """ + A custom logger class adding only a method to print a newline. 
-
-    def before_get(self, parser, section, option, value, defaults):
-        value = super().before_get(parser, section, option, value, defaults)
-        return os.path.expandvars(value)
+
+    Examples
+    --------
+    logger.newline()
+    """
+
+    def __init__(self, name: str, level: int = 0) -> None:
+        super().__init__(name, level)
-def init_logging(logger_name=fink_mm.__name__) -> logging.Logger:
+        self.setLevel(logging.DEBUG)
+
+        # create console handler and set level to debug
+        ch = logging.StreamHandler()
+        ch.setLevel(logging.DEBUG)
+
+        # create formatter
+        formatter = CustomTZFormatter(
+            "%(asctime)s - %(name)s - %(levelname)s \n\t message: %(message)s"
+        )
+
+        # add formatter to ch
+        ch.setFormatter(formatter)
+
+        # add ch to logger
+        self.addHandler(ch)
+
+        blank_handler = logging.StreamHandler()
+        blank_handler.setLevel(logging.DEBUG)
+        blank_handler.setFormatter(logging.Formatter(fmt=""))
+        self.console_handler = ch
+        self.blank_handler = blank_handler
+
+    def newline(self, how_many_lines=1):
+        """
+        Print blank lines using the logger class
+
+        Parameters
+        ----------
+        how_many_lines : int, optional
+            how many blank lines to print, by default 1
+        """
+        # Switch handler, output a blank line
+        self.removeHandler(self.console_handler)
+        self.addHandler(self.blank_handler)
+        for _ in range(how_many_lines):
+            self.info("\n")
+
+        # Switch back
+        self.removeHandler(self.blank_handler)
+        self.addHandler(self.console_handler)
+
+
+def init_logging(logger_name=fink_mm.__name__) -> LoggerNewLine:
     """
     Initialise a logger for the gcn stream
@@ -184,45 +203,53 @@ def init_logging(logger_name=fink_mm.__name__) -> logging.Logger:
     --------
     >>> l = init_logging()
     >>> type(l)
-    <class 'logging.Logger'>
+    <class 'fink_mm.init.LoggerNewLine'>
     """
     # create logger
-    def log_newline(self, how_many_lines=1):
-        # Switch handler, output a blank line
-        self.removeHandler(self.console_handler)
-        self.addHandler(self.blank_handler)
-        for i in range(how_many_lines):
-            self.info("\n")
-
-        # Switch back
-        self.removeHandler(self.blank_handler)
-        self.addHandler(self.console_handler)
-
+    logging.setLoggerClass(LoggerNewLine)
     logger = logging.getLogger(logger_name)
-    logger.setLevel(logging.DEBUG)
+    return logger
+
-    # create console handler and set level to debug
-    ch = logging.StreamHandler()
+
+def return_verbose_level(
+    argument: dict, config: dict, logger: LoggerNewLine
+) -> Tuple[bool, bool]:
+    """
+    Get the verbose and debug levels from the cli arguments and the config file and return them.
- # create formatter - formatter = CustomTZFormatter( - "%(asctime)s - %(name)s - %(levelname)s \n\t message: %(message)s" - ) + Parameters + ---------- + config : dictionnary + dictionnary containing the key values pair from the config file + logger : logging object + the logger used to print logs - # add formatter to ch - ch.setFormatter(formatter) + Returns + ------- + logs : boolean + if True, print the logs - # add ch to logger - logger.addHandler(ch) + Examples + -------- + >>> c = get_config({"--config" : "fink_mm/conf/fink_mm.conf"}) + >>> logger = init_logging() - blank_handler = logging.StreamHandler() - blank_handler.setLevel(logging.DEBUG) - blank_handler.setFormatter(logging.Formatter(fmt="")) + >>> return_verbose_level({}, c, logger) + (True, True) - logger.console_handler = ch - logger.blank_handler = blank_handler - logger.newline = types.MethodType(log_newline, logger) + >>> return_verbose_level({"--verbose": True}, c, logger) + (True, True) + """ + try: + debug = config["ADMIN"]["debug"] == "True" + logs = argument["--verbose"] + except Exception: + logger.error( + f"error when reading config file or cli argument \n\t config = {config}\n\tcli argument = {argument}\n\tsetting verbose and debug to True by default", + exc_info=1, + ) + logs = True + debug = True - return logger + return logs, debug diff --git a/fink_mm/observatory/IceCube/IceCube.py b/fink_mm/observatory/IceCube/IceCube.py index e619a4411..c46d4bd96 100644 --- a/fink_mm/observatory/IceCube/IceCube.py +++ b/fink_mm/observatory/IceCube/IceCube.py @@ -163,7 +163,7 @@ def voevent_to_df(self): return df def association_proba( - self, ztf_ra: float, ztf_dec: float, jdstarthist: float + self, ztf_ra: float, ztf_dec: float, jdstarthist: float, **kwargs ) -> float: """ Compute the association probability between the IceCube event and a ztf alerts diff --git a/fink_mm/observatory/LVK/LVK.py b/fink_mm/observatory/LVK/LVK.py index dc27cb8bb..a51ac893d 100644 --- a/fink_mm/observatory/LVK/LVK.py +++ b/fink_mm/observatory/LVK/LVK.py @@ -1,7 +1,9 @@ import os.path as path import io +from hdfs import InsecureClient import numpy as np import pandas as pd +import os import astropy.units as u from astropy.time import Time from astropy.table import QTable @@ -17,6 +19,39 @@ from fink_mm.test.hypothesis.observatory_schema import voevent_df_schema +def gcn_from_hdfs(client, triggerId, triggerTime, gcn_status): + root = "/user/julien.peloton/fink_mm/gcn_storage/raw" + path_date = os.path.join( + root, + f"year={triggerTime.year:04d}/month={triggerTime.month:02d}/day={triggerTime.day:02d}", + ) + all_gcn = [] + for p, _, files in client.walk(path_date): + for f in np.sort(files): + if triggerId in f: + path_to_load = os.path.join(p, f) + with client.read(path_to_load) as reader: + content = reader.read() + pdf = pd.read_parquet(io.BytesIO(content)) + all_gcn.append(pdf) + + if len(all_gcn) == 0: + raise FileNotFoundError( + "File not found at these locations {} with triggerId = {}".format( + path_date, triggerId + ) + ) + else: + pdf_concat = pd.concat(all_gcn) + res = pdf_concat[pdf_concat["gcn_status"] == gcn_status] + if len(res) == 0: + raise FileNotFoundError( + "File not found with this gcn_status = {}".format(gcn_status) + ) + else: + return res + + class LVK(Observatory): """ LVK network @@ -40,7 +75,7 @@ def __init__(self, notice: str): """ super().__init__(path.join(OBSERVATORY_PATH, "LVK", "lvk.json"), notice) - def get_skymap(self) -> QTable: + def get_skymap(self, **kwargs) -> QTable: """ Decode and return the skymap 
@@ -54,7 +89,21 @@
         >>> np.array(lvk_initial.get_skymap()["UNIQ"])
         array([  1285,   1287,   1296, ..., 162369, 162370, 162371])
         """
-        skymap_str = self.voevent["event"]["skymap"]
+        if "skymap" in self.voevent["event"]:
+            skymap_str = self.voevent["event"]["skymap"]
+        else:
+            hdfs_adress = kwargs["hdfs_adress"]
+            hdfs_client = InsecureClient(
+                f"http://{hdfs_adress}:50070", user="hdfs", root="/user/julien.peloton"
+            )
+            triggerId = self.get_trigger_id()
+            gcn_status = kwargs["gcn_status"]
+            t_obs = Time(self.get_trigger_time()[1], format="jd").to_datetime()
+            gcn_pdf = gcn_from_hdfs(
+                hdfs_client, triggerId, t_obs, gcn_status
+            )
+            skymap_str = json.loads(gcn_pdf["raw_event"].iloc[0])["event"]["skymap"]
+
         # Decode and parse skymap
         skymap_bytes = b64decode(skymap_str)
         skymap = QTable.read(io.BytesIO(skymap_bytes))
@@ -94,7 +143,10 @@ def is_observation(self, is_test: bool) -> bool:
                 self.voevent["superevent_id"][0] == "S"
                 or self.voevent["superevent_id"][0] == "M"
             )
-        return self.voevent["superevent_id"][0] == "S"
+        return (
+            self.voevent["superevent_id"][0] == "S"
+            and self.voevent["event"]["significant"]
+        )
 
     def is_listened_packets_types(self) -> bool:
         """
@@ -349,7 +401,7 @@ def get_pixels(self, NSIDE: int) -> list:
         return np.unique(ang2pix(NSIDE, theta, phi)).tolist()
 
     def association_proba(
-        self, ztf_ra: float, ztf_dec: float, jdstarthist: float
+        self, ztf_ra: float, ztf_dec: float, jdstarthist: float, **kwargs
    ) -> float:
         """
         return the probability density at a known sky position for this gw event.
@@ -362,6 +414,17 @@
             ztf declination
         jdstarthist: float
             first time the alert varied
+        kwargs: dict
+            used to get the GW skymap from HDFS; needs the following keys:
+            - hdfs_adress: HDFS address used to instantiate the hdfs client from the hdfs package
+            - gcn_status: used to distinguish GCNs sharing the same triggerId (accounts for GCN updates)
+            - last_day: the first day of the search window on HDFS
+            - end_day: the last day of the search window on HDFS (the GCN is searched for between last_day and end_day)
+            If not provided, the skymap is taken from the GCN stored in the current object.
+
+            Can raise a KeyError if the skymap has been removed from the current object.
+ Can raise FileNotFound if the search on hdfs doesn't found the gcn with the current triggerId + Returns ------- @@ -375,7 +438,14 @@ def association_proba( >>> lvk_initial.association_proba(95.712890625, -10.958863307027668, 0) 0.0054008620296433045 """ - skymap = self.get_skymap() + if ( + "hdfs_adress" in kwargs + and "gcn_status" in kwargs + ): + skymap = self.get_skymap(**kwargs) + else: + skymap = self.get_skymap() + max_level = 29 max_nside = ah.level_to_nside(max_level) level, ipix = ah.uniq_to_level_ipix(skymap["UNIQ"]) diff --git a/fink_mm/observatory/observatory.py b/fink_mm/observatory/observatory.py index ff3d4c956..7db52ceb4 100644 --- a/fink_mm/observatory/observatory.py +++ b/fink_mm/observatory/observatory.py @@ -295,7 +295,7 @@ def get_pixels(self, NSIDE: int) -> list: return ipix_disc def association_proba( - self, ztf_ra: float, ztf_dec: float, jdstarthist: float + self, ztf_ra: float, ztf_dec: float, jdstarthist: float, **kwargs ) -> float: """ Compute the association probability between a gcn event and a ztf alerts diff --git a/fink_mm/offline/__init__.py b/fink_mm/offline/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/fink_mm/offline/spark_offline.py b/fink_mm/offline/spark_offline.py deleted file mode 100644 index db5d51703..000000000 --- a/fink_mm/offline/spark_offline.py +++ /dev/null @@ -1,430 +0,0 @@ -import json -from astropy.time import TimeDelta, Time - -from fink_utils.science.utils import ang2pix -from fink_utils.broker.sparkUtils import init_sparksession - -from pyspark.sql import functions as F -from pyspark.sql.functions import explode, col -import sys -import subprocess - -from fink_utils.spark.partitioning import convert_to_datetime - -from fink_mm.utils.fun_utils import ( - build_spark_submit, - join_post_process, - read_and_build_spark_submit, - read_prior_params, - read_grb_admin_options, - read_additional_spark_options, -) -import fink_mm.utils.application as apps -from fink_mm.init import get_config, init_logging, return_verbose_level -from fink_mm.utils.fun_utils import get_pixels - -from fink_filters.filter_mm_module.filter import ( - f_grb_bronze_events, - f_grb_silver_events, - f_gw_bronze_events, -) - - -def ztf_grb_filter(spark_ztf, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist): - """ - filter the ztf alerts by taking cross-match values from ztf. - - Parameters - ---------- - spark_ztf : spark dataframe - a spark dataframe containing alerts, this following columns are mandatory and have to be at the candidate level. - - ssdistnr, distpsnr1, neargaia - - Returns - ------- - spark_filter : spark dataframe - filtered alerts - ast_dist: float - distance to nearest known solar system object; set to -999.0 if none [arcsec] - ssdistnr field - pansstar_dist: float - Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec] - distpsnr1 field - pansstar_star_score: float - Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star - sgscore1 field - gaia_dist: float - Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec] - neargaia field - - Examples - -------- - >>> sparkDF = spark.read.format('parquet').load(alert_data) - - >>> sparkDF = sparkDF.select( - ... "objectId", - ... "candid", - ... "candidate.ra", - ... "candidate.dec", - ... "candidate.jd", - ... "candidate.jdstarthist", - ... "candidate.jdendhist", - ... "candidate.ssdistnr", - ... 
"candidate.distpsnr1", - ... "candidate.sgscore1", - ... "candidate.neargaia", - ... ) - - >>> spark_filter = ztf_grb_filter(sparkDF, 5, 2, 0, 5) - - >>> spark_filter.count() - 32 - """ - spark_filter = ( - spark_ztf.filter( - (spark_ztf.ssdistnr > ast_dist) - | ( - spark_ztf.ssdistnr == -999.0 - ) # distance to nearest known SSO above 30 arcsecond - ) - .filter( - (spark_ztf.distpsnr1 > pansstar_dist) - | ( - spark_ztf.distpsnr1 == -999.0 - ) # distance of closest source from Pan-Starrs 1 catalog above 30 arcsecond - | (spark_ztf.sgscore1 < pansstar_star_score) - ) - .filter( - (spark_ztf.neargaia > gaia_dist) - | ( - spark_ztf.neargaia == -999.0 - ) # distance of closest source from Gaia DR1 catalog above 60 arcsecond - ) - ) - - return spark_filter - - -def spark_offline( - hbase_catalog: str, - gcn_read_path: str, - grbxztf_write_path: str, - night: str, - NSIDE: int, - start_window: float, - time_window: int, - ast_dist: float, - pansstar_dist: float, - pansstar_star_score: float, - gaia_dist: float, - with_columns_filter: bool = True, -): - """ - Cross-match Fink and the GNC in order to find the optical alerts falling in the error box of a GCN. - - Parameters - ---------- - hbase_catalog : string - path to the hbase catalog (json format) - Key index must be jd_objectId - gcn_read_path : string - path to the gcn database - grbxztf_write_path : string - path to store the cross match ZTF/GCN results - night : string - launching night of the script - NSIDE: int - Healpix map resolution, better if a power of 2 - start_window : float - start date of the time window (in jd / julian date) - time_window : int - Number of day between start_window and (start_window - time_window) to join ztf alerts and gcn. - time_window are in days. - ast_dist: float - distance to nearest known solar system object; set to -999.0 if none [arcsec] - ssdistnr field - pansstar_dist: float - Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec] - distpsnr1 field - pansstar_star_score: float - Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star - sgscore1 field - gaia_dist: float - Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec] - neargaia field - with_columns_filter : boolean - Hbase options to optimize loading, work only in distributed mode - Set this option at False if in local mode. default = True - - Returns - ------- - None - - Examples - -------- - >>> grb_dataoutput_dir = tempfile.TemporaryDirectory() - >>> grb_dataoutput = grb_dataoutput_dir.name - - >>> spark_offline( - ... hbase_catalog, - ... gcn_datatest, - ... grb_dataoutput, - ... "20190903", - ... 4, - ... Time("2019-09-04").jd, - ... 7, 5, 2, 0, 5, - ... False - ... ) - - >>> datajoin = pd.read_parquet(grb_dataoutput + "/offline").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True) - >>> datatest = pd.read_parquet(offline_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True) - - >>> cols = ['t2_AGN', 't2_EB', - ... 't2_KN', 't2_M-dwarf', 't2_Mira', 't2_RRL', 't2_SLSN-I', 't2_SNII', - ... 't2_SNIa', 't2_SNIa-91bg', 't2_SNIax', 't2_SNIbc', 't2_TDE', - ... 
't2_mu-Lens-Single', "year", "month", "day"] - >>> datatest = datatest.drop(cols, axis=1) - >>> datajoin = datajoin.drop(cols, axis=1) - - >>> assert_frame_equal(datatest, datajoin, check_dtype=False, check_column_type=False, check_categorical=False) - """ - spark = init_sparksession( - "science2mm_offline_{}{}{}".format(night[0:4], night[4:6], night[6:8]) - ) - logger = init_logging() - low_bound = start_window - TimeDelta(time_window * 24 * 3600, format="sec").jd - - if low_bound < 0 or low_bound > start_window: - raise ValueError( - "The time window is higher than the start_window : \nstart_window = {}\ntime_window = {}\nlow_bound={}".format( - start_window, time_window, low_bound - ) - ) - - grb_alert = spark.read.format("parquet").option('mergeSchema', True).load(gcn_read_path) - - grb_alert = grb_alert.filter(grb_alert.triggerTimejd >= low_bound).filter( - grb_alert.triggerTimejd <= start_window - ) - - nb_gcn_alert = grb_alert.cache().count() - if nb_gcn_alert == 0: - logger.info( - "No gcn between {} and {}, exit the offline mode.".format( - Time(low_bound, format="jd").iso, Time(start_window, format="jd").iso - ) - ) - return - - with open(hbase_catalog) as f: - catalog = json.load(f) - - ztf_alert = ( - spark.read.option("catalog", catalog) - .format("org.apache.hadoop.hbase.spark") - .option("hbase.spark.use.hbasecontext", False) - .option("hbase.spark.pushdown.columnfilter", with_columns_filter) - .load() - .filter(~col("jd_objectId").startswith("schema_")) # remove key column - ) - - ztf_alert = ztf_alert.filter( - ztf_alert["jd_objectId"] >= "{}".format(low_bound) - ).filter(ztf_alert["jd_objectId"] < "{}".format(start_window)) - - ztf_alert = ztf_grb_filter( - ztf_alert, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist - ) - - ztf_alert.cache().count() - - ztf_alert = ztf_alert.withColumn( - "hpix", - ang2pix(ztf_alert.ra, ztf_alert.dec, F.lit(NSIDE)), - ) - - grb_alert = grb_alert.withColumn( - "hpix_circle", - get_pixels(grb_alert.observatory, grb_alert.raw_event, F.lit(NSIDE)), - ) - grb_alert = grb_alert.withColumn("hpix", explode("hpix_circle")) - - ztf_alert = ztf_alert.withColumnRenamed("ra", "ztf_ra").withColumnRenamed( - "dec", "ztf_dec" - ) - - grb_alert = grb_alert.withColumnRenamed("ra", "gcn_ra").withColumnRenamed( - "dec", "gcn_dec" - ) - - join_condition = [ - ztf_alert.hpix == grb_alert.hpix, - ztf_alert.jdstarthist > grb_alert.triggerTimejd, - ztf_alert.jdendhist - grb_alert.triggerTimejd <= 10, - ] - join_ztf_grb = ztf_alert.join(grb_alert, join_condition, "inner") - - df_grb = join_post_process(join_ztf_grb, with_rate=False, from_hbase=True) - - df_grb = df_grb.withColumn( - "is_grb_bronze", - f_grb_bronze_events(df_grb["fink_class"], df_grb["observatory"], df_grb["rb"]), - ) - - df_grb = df_grb.withColumn( - "is_grb_silver", - f_grb_silver_events( - df_grb["fink_class"], - df_grb["observatory"], - df_grb["rb"], - df_grb["p_assoc"], - ), - ) - - df_grb = df_grb.withColumn( - "is_gw_bronze", - f_gw_bronze_events(df_grb["fink_class"], df_grb["observatory"], df_grb["rb"]), - ) - - timecol = "jd" - converter = lambda x: convert_to_datetime(x) # noqa: E731 - if "timestamp" not in df_grb.columns: - df_grb = df_grb.withColumn("timestamp", converter(df_grb[timecol])) - - if "year" not in df_grb.columns: - df_grb = df_grb.withColumn("year", F.date_format("timestamp", "yyyy")) - - if "month" not in df_grb.columns: - df_grb = df_grb.withColumn("month", F.date_format("timestamp", "MM")) - - if "day" not in df_grb.columns: - df_grb = df_grb.withColumn("day", 
F.date_format("timestamp", "dd")) - - grbxztf_write_path = grbxztf_write_path + "/offline" - - df_grb.write.mode("append").partitionBy("year", "month", "day").parquet( - grbxztf_write_path - ) - - -def launch_offline_mode(arguments): - """ - Launch the offline grb module, used by the command line interface. - - Parameters - ---------- - arguments : dictionnary - arguments parse from the command line. - - Returns - ------- - None - - Examples - -------- - >>> launch_offline_mode({ - ... "--config" : None, - ... "--night" : "20190903", - ... "--exit_after" : 100, - ... "--test" : True, - ... "--verbose" : False - ... } - ... ) - - >>> datajoin = pd.read_parquet("fink_mm/test/test_output/offline").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True) - >>> datatest = pd.read_parquet(offline_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True) - - >>> cols = ['t2_AGN', 't2_EB', - ... 't2_KN', 't2_M-dwarf', 't2_Mira', 't2_RRL', 't2_SLSN-I', 't2_SNII', - ... 't2_SNIa', 't2_SNIa-91bg', 't2_SNIax', 't2_SNIbc', 't2_TDE', - ... 't2_mu-Lens-Single', "year", "month", "day"] - >>> datatest = datatest.drop(cols, axis=1) - >>> datajoin = datajoin.drop(cols, axis=1) - - >>> assert_frame_equal(datatest, datajoin, check_dtype=False, check_column_type=False, check_categorical=False) - """ - config = get_config(arguments) - logger = init_logging() - - verbose = return_verbose_level(config, logger) - - spark_submit = read_and_build_spark_submit(config, logger) - - ast_dist, pansstar_dist, pansstar_star_score, gaia_dist = read_prior_params( - config, logger - ) - - ( - external_python_libs, - spark_jars, - packages, - external_files, - ) = read_additional_spark_options( - arguments, config, logger, verbose, arguments["--test"] - ) - - ( - night, - _, - _, - gcn_datapath_prefix, - grb_datapath_prefix, - _, - NSIDE, - hbase_catalog, - time_window, - _, - _, - _, - ) = read_grb_admin_options(arguments, config, logger, is_test=arguments["--test"]) - - application = apps.Application.OFFLINE.build_application( - logger, - hbase_catalog=hbase_catalog, - gcn_datapath_prefix=gcn_datapath_prefix, - grb_datapath_prefix=grb_datapath_prefix, - night=night, - NSIDE=NSIDE, - time_window=time_window, - ast_dist=ast_dist, - pansstar_dist=pansstar_dist, - pansstar_star_score=pansstar_star_score, - gaia_dist=gaia_dist, - is_test=arguments["--test"], - ) - - spark_submit = build_spark_submit( - spark_submit, - application, - external_python_libs, - spark_jars, - packages, - external_files, - ) - - process = subprocess.Popen( - spark_submit, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=True, - ) - - stdout, stderr = process.communicate() - if process.returncode != 0: # pragma: no cover - logger.error( - f"fink-mm offline crossmatch application has ended with a non-zero returncode.\ - \n\tstdout: \n\n{stdout} \n\tstderr:\n\n{stderr}" - ) - exit(1) - - if arguments["--verbose"]: - logger.info("fink-mm joining stream spark application ended normally") - print() - logger.info(f"job logs:\n\n{stdout}") - return - - -if __name__ == "__main__": - if sys.argv[1] == "prod": # pragma: no cover - apps.Application.OFFLINE.run_application() diff --git a/fink_mm/online/__init__.py b/fink_mm/online/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/fink_mm/online/ztf_join_gcn.py b/fink_mm/online/ztf_join_gcn.py deleted file mode 100644 index c60c28051..000000000 --- a/fink_mm/online/ztf_join_gcn.py +++ /dev/null @@ -1,470 +0,0 @@ 
-import warnings - -warnings.filterwarnings("ignore") - -import time -import subprocess -import sys - -from pyspark.sql import functions as F -from pyspark.sql.functions import explode, col - -from astropy.time import Time -from datetime import timedelta - -from fink_utils.science.utils import ang2pix -from fink_utils.spark.partitioning import convert_to_datetime -from fink_utils.broker.sparkUtils import init_sparksession, connect_to_raw_database - -from fink_mm.utils.fun_utils import ( - build_spark_submit, - join_post_process, - read_and_build_spark_submit, - read_prior_params, - read_additional_spark_options, - read_grb_admin_options, -) -import fink_mm.utils.application as apps -from fink_mm.init import get_config, init_logging, return_verbose_level -from fink_mm.utils.fun_utils import get_pixels - - -def ztf_grb_filter(spark_ztf, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist): - """ - filter the ztf alerts by taking cross-match values from ztf. - - Parameters - ---------- - spark_ztf : spark dataframe - a spark dataframe containing alerts, this following columns are mandatory and have to be at the candidate level. - - ssdistnr, distpsnr1, sgscore1, neargaia - ast_dist: float - distance to nearest known solar system object; set to -999.0 if none [arcsec] - ssdistnr field - pansstar_dist: float - Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec] - distpsnr1 field - pansstar_star_score: float - Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star - sgscore1 field - gaia_dist: float - Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec] - neargaia field - - Returns - ------- - spark_filter : spark dataframe - filtered alerts - - Examples - -------- - >>> sparkDF = spark.read.format('parquet').load(alert_data) - - >>> spark_filter = ztf_grb_filter(sparkDF, 5, 2, 0, 5) - - >>> spark_filter.count() - 32 - """ - spark_filter = ( - spark_ztf.filter( - (spark_ztf.candidate.ssdistnr > ast_dist) - | ( - spark_ztf.candidate.ssdistnr == -999.0 - ) # distance to nearest known SSO above 30 arcsecond - ) - .filter( - (spark_ztf.candidate.distpsnr1 > pansstar_dist) - | ( - spark_ztf.candidate.distpsnr1 == -999.0 - ) # distance of closest source from Pan-Starrs 1 catalog above 30 arcsecond - | (spark_ztf.candidate.sgscore1 < pansstar_star_score) - ) - .filter( - (spark_ztf.candidate.neargaia > gaia_dist) - | ( - spark_ztf.candidate.neargaia == -999.0 - ) # distance of closest source from Gaia DR1 catalog above 60 arcsecond - ) - ) - - return spark_filter - - -def check_path_exist(spark, path): - """Check we have data for the given night on HDFS - - Parameters - ---------- - path: str - Path on HDFS (file or folder) - - Returns - ---------- - out: bool - """ - # check on hdfs - jvm = spark._jvm - jsc = spark._jsc - fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration()) - if fs.exists(jvm.org.apache.hadoop.fs.Path(path)): - return True - else: - return False - - -def ztf_join_gcn_stream( - ztf_datapath_prefix, - gcn_datapath_prefix, - grb_datapath_prefix, - night, - NSIDE, - exit_after, - tinterval, - ast_dist, - pansstar_dist, - pansstar_star_score, - gaia_dist, - logs=False, -): - """ - Join the ztf alerts stream and the gcn stream to find the counterparts of the gcn alerts - in the ztf stream. - - Parameters - ---------- - ztf_datapath_prefix : string - the prefix path where are stored the ztf alerts. 
- gcn_datapath_prefix : string - the prefix path where are stored the gcn alerts. - grb_datapath_prefix : string - the prefix path to save GRB join ZTF outputs. - night : string - the processing night - NSIDE: String - Healpix map resolution, better if a power of 2 - exit_after : int - the maximum active time in second of the streaming process - tinterval : int - the processing interval time in second between the data batch - ast_dist: float - distance to nearest known solar system object; set to -999.0 if none [arcsec] - ssdistnr field - pansstar_dist: float - Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec] - distpsnr1 field - pansstar_star_score: float - Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star - sgscore1 field - gaia_dist: float - Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec] - neargaia field - - Returns - ------- - None - - Examples - -------- - >>> grb_dataoutput_dir = tempfile.TemporaryDirectory() - >>> grb_dataoutput = grb_dataoutput_dir.name - >>> ztf_join_gcn_stream( - ... ztf_datatest, - ... gcn_datatest, - ... grb_dataoutput, - ... "20190903", - ... 4, 100, 5, 5, 2, 0, 5 - ... ) - - >>> datatest = pd.read_parquet(join_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1) - >>> datajoin = pd.read_parquet(grb_dataoutput + "/online").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1) - - >>> datatest = datatest.drop("t2", axis=1) - >>> datajoin = datajoin.drop("t2", axis=1) - - >>> datatest["gcn_status"] = "initial" - >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1) - >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1) - >>> assert_frame_equal(datatest, datajoin, check_dtype=False, check_column_type=False, check_categorical=False) - """ - logger = init_logging() - spark = init_sparksession( - "science2mm_online_{}{}{}".format(night[0:4], night[4:6], night[6:8]) - ) - - scidatapath = ztf_datapath_prefix + "/science" - - # connection to the ztf science stream - df_ztf_stream = connect_to_raw_database( - scidatapath - + "/year={}/month={}/day={}".format(night[0:4], night[4:6], night[6:8]), - scidatapath - + "/year={}/month={}/day={}".format(night[0:4], night[4:6], night[6:8]), - latestfirst=False, - ) - df_ztf_stream = df_ztf_stream.select( - "objectId", - "candid", - "candidate", - "prv_candidates", - "cdsxmatch", - "DR3Name", - "Plx", - "e_Plx", - "gcvs", - "vsx", - "x3hsp", - "x4lac", - "mangrove", - "roid", - "rf_snia_vs_nonia", - "snn_snia_vs_nonia", - "snn_sn_vs_all", - "mulens", - "nalerthist", - "rf_kn_vs_nonkn", - "t2", - "anomaly_score", - "lc_features_g", - "lc_features_r", - ) - - df_ztf_stream = ztf_grb_filter( - df_ztf_stream, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist - ) - - gcn_rawdatapath = gcn_datapath_prefix - - # df_grb_stream = connect_to_raw_database( - # gcn_rawdatapath, - # gcn_rawdatapath + "/year={}/month={}/day=*?*".format(night[0:4], night[4:6]), - # latestfirst=True, - # ) - - # Create a DF from the database - userschema = spark.read.option('mergeSchema', True).parquet(gcn_rawdatapath).schema - - df_grb_stream = ( - spark.readStream.format("parquet") - .schema(userschema) - .option("basePath", gcn_rawdatapath) - .option("path", gcn_rawdatapath + "/year={}/month={}/day=*?*".format(night[0:4], night[4:6])) - .option("latestFirst", True) - 
.option('mergeSchema', True) - .load() - ) - - # keep gcn emitted during the day time until the end of the stream (17:00 Paris Time) - cur_time = Time(f"{night[0:4]}-{night[4:6]}-{night[6:8]}") - last_time = cur_time - timedelta(hours=7) # 17:00 Paris time yesterday - end_time = cur_time + timedelta(hours=17) # 17:00 Paris time today - df_grb_stream = df_grb_stream.filter( - f"triggerTimejd >= {last_time.jd} and triggerTimejd < {end_time.jd}" - ) - - if logs: # pragma: no cover - logger.info("connection to the database successfull") - - # compute healpix column for each streaming df - - # compute pixels for ztf alerts - df_ztf_stream = df_ztf_stream.withColumn( - "hpix", - ang2pix(df_ztf_stream.candidate.ra, df_ztf_stream.candidate.dec, F.lit(NSIDE)), - ) - - # compute pixels for gcn alerts - df_grb_stream = df_grb_stream.withColumn( - "hpix_circle", - get_pixels(df_grb_stream.observatory, df_grb_stream.raw_event, F.lit(NSIDE)), - ) - df_grb_stream = df_grb_stream.withColumn("hpix", explode("hpix_circle")) - - if logs: # pragma: no cover - logger.info("Healpix columns computing successfull") - - df_ztf_stream = df_ztf_stream.withColumn("ztf_ra", col("candidate.ra")).withColumn( - "ztf_dec", col("candidate.dec") - ) - - df_grb_stream = df_grb_stream.withColumnRenamed("ra", "gcn_ra").withColumnRenamed( - "dec", "gcn_dec" - ) - - # join the two streams according to the healpix columns. - # A pixel id will be assign to each alerts / gcn according to their position in the sky. - # Each alerts / gcn with the same pixel id are in the same area of the sky. - join_condition = [ - df_ztf_stream.hpix == df_grb_stream.hpix, - df_ztf_stream.candidate.jdstarthist > df_grb_stream.triggerTimejd, - ] - df_grb = df_ztf_stream.join(F.broadcast(df_grb_stream), join_condition, "inner") - - df_grb = join_post_process(df_grb) - - # re-create partitioning columns if needed. - timecol = "jd" - converter = lambda x: convert_to_datetime(x) # noqa: E731 - if "timestamp" not in df_grb.columns: - df_grb = df_grb.withColumn("timestamp", converter(df_grb[timecol])) - - if "year" not in df_grb.columns: - df_grb = df_grb.withColumn("year", F.date_format("timestamp", "yyyy")) - - if "month" not in df_grb.columns: - df_grb = df_grb.withColumn("month", F.date_format("timestamp", "MM")) - - if "day" not in df_grb.columns: - df_grb = df_grb.withColumn("day", F.date_format("timestamp", "dd")) - - grbdatapath = grb_datapath_prefix + "/online" - checkpointpath_grb_tmp = grb_datapath_prefix + "/online_checkpoint" - - query_grb = ( - df_grb.writeStream.outputMode("append") - .format("parquet") - .option("checkpointLocation", checkpointpath_grb_tmp) - .option("path", grbdatapath) - .partitionBy("year", "month", "day") - .trigger(processingTime="{} seconds".format(tinterval)) - .start() - ) - - if logs: # pragma: no cover - logger.info("Stream launching successfull") - print("-----------------") - logger.info(f"last progress : {query_grb.lastProgress}") - print() - print() - logger.info(f"recent progress : {query_grb.recentProgress}") - print() - print() - logger.info(f"query status : {query_grb.status}") - print("-----------------") - - # Keep the Streaming running until something or someone ends it! - if exit_after is not None: - time.sleep(int(exit_after)) - query_grb.stop() - logger.info("Exiting the science2grb streaming subprocess normally...") - else: # pragma: no cover - # Wait for the end of queries - spark.streams.awaitAnyTermination() - - -def launch_joining_stream(arguments): - """ - Launch the joining stream job. 
- - Parameters - ---------- - arguments : dictionnary - arguments parse by docopt from the command line - - Returns - ------- - None - - Examples - -------- - >>> launch_joining_stream({ - ... "--config" : None, - ... "--night" : "20190903", - ... "--exit_after" : 100, - ... "--verbose" : False - ... }) - - >>> datatest = pd.read_parquet(join_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1) - >>> datajoin = pd.read_parquet("fink_mm/test/test_output/online").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1) - - >>> datatest = datatest.drop("t2", axis=1) - >>> datajoin = datajoin.drop("t2", axis=1) - - >>> datatest["gcn_status"] = "initial" - >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1) - >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1) - >>> assert_frame_equal(datatest, datajoin, check_dtype=False, check_column_type=False, check_categorical=False) - """ - config = get_config(arguments) - logger = init_logging() - - verbose = return_verbose_level(config, logger) - - spark_submit = read_and_build_spark_submit(config, logger) - - ast_dist, pansstar_dist, pansstar_star_score, gaia_dist = read_prior_params( - config, logger - ) - - ( - external_python_libs, - spark_jars, - packages, - external_files, - ) = read_additional_spark_options(arguments, config, logger, verbose, False) - - ( - night, - exit_after, - ztf_datapath_prefix, - gcn_datapath_prefix, - grb_datapath_prefix, - tinterval, - NSIDE, - _, - _, - _, - _, - _, - ) = read_grb_admin_options(arguments, config, logger) - - application = apps.Application.ONLINE.build_application( - logger, - ztf_datapath_prefix=ztf_datapath_prefix, - gcn_datapath_prefix=gcn_datapath_prefix, - grb_datapath_prefix=grb_datapath_prefix, - night=night, - NSIDE=NSIDE, - exit_after=exit_after, - tinterval=tinterval, - ast_dist=ast_dist, - pansstar_dist=pansstar_dist, - pansstar_star_score=pansstar_star_score, - gaia_dist=gaia_dist, - logs=verbose, - ) - - spark_submit = build_spark_submit( - spark_submit, - application, - external_python_libs, - spark_jars, - packages, - external_files, - ) - - process = subprocess.Popen( - spark_submit, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - shell=True, - ) - - stdout, stderr = process.communicate() - if process.returncode != 0: # pragma: no cover - logger.error( - f"fink-mm joining stream spark application has ended with a non-zero returncode.\ - \n\tstdout:\n\n{stdout} \n\tstderr:\n\n{stderr}" - ) - exit(1) - - if arguments["--verbose"]: - logger.info("fink-mm joining stream spark application ended normally") - print() - logger.info(f"job logs:\n\n{stdout}") - return - - -if __name__ == "__main__": - if sys.argv[1] == "prod": # pragma: no cover - apps.Application.ONLINE.run_application() diff --git a/fink_mm/test/test_data/offline/year=2019/month=9/day=2/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet b/fink_mm/test/test_data/offline/year=2019/month=9/day=2/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e64b9d2c5341aa1a7c93f1ea839e2c222c279663 GIT binary patch literal 34248 zcmeHw4R{;XneG``ibsy*$i@;lpj&?m$EC${L1Uzt?*@l|Jj{HPuw-@}q7ZivxJYyZLx=BwWY6N{o39ueT}ANs?`)^3dEvuCZMK0 z8x!FWOZ5agqb#=_pgw`TjcC)R(O8siS!Hi<*z66o$K7CeJ7_(vUgmWmU1x*|h1CJl 
[remainder of binary patch data omitted]

literal 0
HcmV?d00001

diff --git a/fink_mm/test/test_data/offline/year=2019/month=9/day=3/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet b/fink_mm/test/test_data/offline/year=2019/month=9/day=3/part-00000-73370ab7-afe3-4394-abe7-7e6b8e5195db.c000.snappy.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..e64b9d2c5341aa1a7c93f1ea839e2c222c279663
GIT binary patch
literal 34248
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/fink_mm/test/test_data/online/year=2019/month=09/day=02/part-00172-ec9814eb-4888-4672-9953-f9522699a560.c000.snappy.parquet b/fink_mm/test/test_data/online/year=2019/month=09/day=02/part-00172-ec9814eb-4888-4672-9953-f9522699a560.c000.snappy.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..466b4f346f2f6e4c21834e094ba14cca039c8fec
GIT binary patch
literal 33254
[binary data omitted]
zy^bBI7L*=jM_I01Q$bR3&ivQsk+4GWd_q>8B&#jsv2e(wIilWRGCfLyB4u}0LqpF~ z!Sa78{cUB%29wp|?CU4JJ(0@s`YD)@nUNsS%!Aq1ezYYD55TEdE6DvY3tMIwE_h3; z+A>l;Sy6up8@7xw;V=X3aM8M-1GfGeqH^uUXkMcwHw9ywjg3D=-m2N^s-E+_lpj>z zU>c2vLebgyl&Ry2^VDO0%eLz3_7Dp*LixXI)s5DPzG|bb(d~AE0mq125#@S_K*Qy` zswh}FFcB&~6)68hshXM!-=d>pQL5acrQm@x28Ik#tF9^SFIUmOL9|*$!Xm2UT>^P4 zhpfISn5}|ytus;9+tjK@4W-_uNA4wPI#2J`=~8ra2CzYCeE(W~L@UyLx?txYKi?dR-bA(v7J*jLQRSb#G#K zr!GO2CVS`h(b?NT3fsKw>Ti!sMxwKk-0CM2j3vnnT>WfWcJ=R6>t9zTtN#_%{#r=y zW?b2SGa??%#-r9z&4!i+# z+kvsA*|n8I_04V|w>g`w&CPJ@h62`Rn-y|3(^i*5#==FQn-Ly{k_u!M0{s;Yy+z-f zv_#z<*H=H$iqPV2?&FoM<;SRvM=Y%yk3BHc)_JON<5!>kM(fWy>o;aAV1zETxZ9}4 zWx)25yK`k-dqK>LF{*A}^w#jd_P+bd#ER2IR(y9R z*3Y$5ErSaq6zrcdk4@>Pdi5lpXx!uFkhle&)?9B_XS)w?KTAPE_q7iVY09t0vuh|G z_61;liLYT%DO^L_59+NA`n?3q3&r~4T9{SHR7PW|P+S0mGV7}`;&~@3i8?%L_Yx}A zd2e?Rct(N4uH=MD)`B*uANK43ZPi~Sfi|kKIPov;SK=L%FlK`n0RbQz^|igA7=Y>; z2*`*3@U#Z#wR%dN4UK9n#RCT6NndgYf^tV*I z=>*q+EhB}5jE0fgBZ07to;b^tKD2Rr=Ga_~rAFaj+>YYif31yA6Z#L}DPD_v-tPC? z@G7CNF)+(;qqr+w+K)#-dv|{^d<^vt^*8ol!Qfyo?tpzkj=`(4q5d9b4$Fr-y9gsu zQD!7Z{93|uAIKY9Y$IcM%!cEQy=;VU912FpLM#UF>cAjSxCe~$p-Mm4pg&n84$tz; z62>A>1M(jws7CnEoL(>z_iKF@{{@8~vFMt*%y;(A($y5m=naYcYU#=9C zs%%RP@uHG8Qy8_Wz!tKj{N(PO9+w=?CwHet`N`elD3?lsCOOJg!nu6zC|^cjwhRnA zNl#ktGs~kX`3`V-o`J!DN5}FEjNj@mKRtx*0Q~_zm!GTYeZ^nu(*w41H>A$Rwz#3q zsh9tBaT|QL)ji)pkh;Z}c?&_3=%;;fV*JJ@A807@jc0zRA<4yyt2Ctbx(nY*zG8y(lJ+hCLKTDS>wNd8|BttW)CT|Ll;-{u#-urgQfq1XP#xtEbm5^cOx*yGj?R+ zOz@Zbr2Vg+BR}7H=JQ2g-i`dvpCe(_vM;@Ed%ydizh>+&I-hHMa;yEq7WjPbA6C@= zJ-EzXIQQ`#2Uc=zYu^1NdL780{ioml{B;A@#{W?DuRn*+Yj4;Udk;R}^L%{I113&p zk(o*{k||6)ndC~v1sox_$doGEe)Q$11}`@+mh|rH`Ek?Q#b*wG@5<|ZHH$y4e0cDW zvAV@=J$DQruQ4wk`{wUo)O|=THvHniJFBlUFFtYLcenl3O7miW|3kaot1ev3T@j~{ zjASYgRyJ397G*Qn>}_voF*@n4Fulq>xYX6>C=8ANuz9E=Ap^7UCyWRTa}G z7X~l?4|I$BKNgN}@J~N?*~bfi{&Vd&|8dKyg~)&?VmWF%8%u(G+*Gbx)pF~7f2=`iH|jj3ho`x}#H0cfcd zZWUiTJ?hoJ?0B}LJr_NdVkA>n7(I0@O0&^(+&Ulw{#!ik!*zf+w9=FNR7$E2;)^q! zUz9>V(X9h~CE#3StG++!?Y?d=e(jR>J%D#2(GbH?2|q5%56*c(@EpW18Z+2k5-$aV zd=V*a(PDM>s{(%fkJmo%hsR{vCY53&Q&-q}scTUR1Q~4u-E=PL3b3Vj--_3M!|lDT ztY!Z4^*nqAb$1?w4e-B>pZW=5L_5nUWP$(pmsNOYlQ|zcRDKMXfU0J7OI#LX7|B!~ ztWd5L6jB^R(r$_0qTs*Wj+06;lBp}ql)4tBP&lLN(vFkrX7#`O<*$?7Go$Im?q?)Z zc`!?=oG1mB?2ZFAzA60}zy}Nmr#8H`5kAqYnzzBeHxS=_lORdzmOsD@b%S61??eT; z`1N0E>d7{{T~lE_c+Wj=;_Pkz`fqRIrq8l;5-8Dmg}hj^Tq#J-$|hSAnYe_2 z!7;Nw76)0RBr8&(4dC#Zl9l41iiFo;imD8wgIlSO#h#D}Fii$fDb9gi(c^pyhZLNG zV+n-?dVheQ3uB+^RF)EOVZy--_@0C+6%B;u5?315CtTxl;6FH%MlG7*=TgNrbM|q5 zO1yJ`gDu=SRYo!l?igHjhJ{!n6>_SL+DUfaCxi>Yn^Z6cG8Ta(T~tWHcPh-sMT3Ea zSu40vA0G)az8T&ZiNZxfT**hEwz=}MgdFE)LZM_ch2tT38=ZhI1^-3xf`s^pSR%C? z2qq&*_{j*?T@|Cxt~IunM$kHeQ^rK1VFoNO~S-JW`vK1;xRV6 z3DzdlN8+R41e~;C(x_OKlsPMtGGA6JmEmTFF~C7P`jhrJA6gsYnW8XvFdS5W40<^S z{U}ixwOD&KKgrG}q+0O#j^QGg=5#Y|xtj@Hc)6Q>rrj*@xZ4>U6(4u_MtI!e8ABUj zLovn&LoeBHVN@nNcHcqp1hRiiva5!Gvo?S zAwykFZOtbmX0leVt8o^S>Ke&|UavCL8J;H$#|bm(l?1s;E@5pUAMaYz0PB7=Jo{iR zgg+26|B9&9mnKP)YX^bhMZ)kB0jXZ2Oh_wUCgp~|A{qEci$wU;YD9wnXc6G&MzUi3 zT&W!XDYa;$to4mR^ODN&s>*yy?Ny9IRZIXSmLAe7V}f6lh$?E+Yb`;$Lzfv2U1a`6 ziC4C2HnE&^MJANIpD-LE%%^PLvjiW==yT96;!0L^0_^RC;V!~_(C!tY4{}2gWZ;fC z#1_t65ekTjesshE?fhUeI3c+!g>VJj&bd-KW9)H>N+;Hs0(!_0Gw*SEjba3XjD{yA zWhE<80pJK>I7*mL0z^){0VtOvlT+=8TU0H*x)s1*C^9^!G9Ph!U9i|cLkNM4&L4S1 zP;GjJEK&I*52&0SBY;)TmdUC7q*p{wt+oa1=QM`>YV(s`tnoRxAl;$ai-afP8$56d7-jB*@3N6o7o&R*bx^SY6{z@3zEU z7T#R}(_X^xl*(|I%6!{a+|lIy;AL9yK<}Br6C0ld{L{cdm=E+q7ZJVRVRT6DDlB(fAI_Q?|XBSQtkpBNS! 
zS+esMDDEH(cN69l!_d^CLp<&`;+*JVc4$t&?JD-78T)bw})+z*BEbn;4J%9RcQ+-Z{z>oU-fUoF}t z^DIIz%RZIiewF$0tHCU}zUH|9=Sty*>@q^(4iC>SL8?kWiDItD6QXuhxDJ9~Z zb5#lLfvEy%e>JT{t8kzuA^+8Mf#i4bO5{rC!V>no_yXCF$CTL5cJ3_E;&`k;EpDGt z(&DV=+7dNxpD9p{=Vz7F$UHZfa6Ugf)mBE9sH@UO~cp>iPnCpWda!oB7^C!uj;B0y#hUvJ$7#dl3oy2VX9b{fQgI1KI4= zEPnAK;exCMlD&7MG8ulgBO!yV1(F@QS&>Y9c_d+k`~@<;wMU7O6L$tbQ{h88$eJL_ z3S&ra4YH(iTZ2PiQ6!Vu8vyGIDl?P;*4)4Z%$h5eV?FWLimYee9YDbXt$X-y6se{A zgM@t{bAk3f@wbX}vO5IGpRS-Z6hYkVNu_eOJ^WQgMx{Lh6iJgx_mU#1R4PaM&R#{* zoLvIsEs*WOyA;{ZX`cY)(;7(4Db`mim8qw0x?hoh3Ht^>DNv7N4=C!fl$`^>6sX9lgNlkM?j4{^fvitIsK|Ow zy9X$r)<8OqigTS*DmT}C@Q|VgOWHpGPJx;{@ra@(XWT(RxdNHrJEX|0xQBo;X;$g; zM6|qADrfmak1DdB!!80!7s&tCV~YHO|CVH(4Ot5$yZ_tDWLet?$XX!TtB)&^$!;Sc ze}Rnqo)8(0#R-Qa!G5r`1?Lh-27=t+6x>;IrE;C+{U;>^B}peH2?L}9LvBO~X2_Mw zF}(Oa34`bumEwR65adXhDJv!<+2ymOavXO)E#XjOXR8!E7zq*qJ~xU3!{}WOv&nG#&V>plPQ&>Ir0q9q+G}(&T9h6JA~l`VTLUDWntQFL@AI%=I!3W z1xY1ResB^qK|1K48?%A+&y~vQ|Joujs8il)3Ew(_;JC_gRAq)_Ajpluzy!HcIf8vZ zmJncnJ&F2|2I#qQ7Z^QPDu@2ka}ondeEtUPAB76M zMi|~E%#g#I85xmz2V7^0qR6d~;f;?8Lmen%G{O%@SePjW;wdabXoV&D8xBSde7m&8 zj*RPba`4k3{3`(P%K$M87oD}>-wVLsscw9x`Fa!n3TKnYgrg+eIyKt2$?^GBuXWJ5HYMJe~gSc7!hWvZ6k!vh{cy`Q8Csge~eUc zPW}jKU&Z_(lHou31AyD1Sa>uPW$YG7dY!p0f3^H!azg|2$4Z2< z+Lk|rG-fe>6d}wp|Kf^~k@-WWf+_PyNQOZQ@t2!sEioj{9zQr zg7XK&4lVhY#z)HqlupGfFBRgNb6<=~pY3=Nn%ML&KilCfWU0%fy*5qrw#xR%qEqBUhu~{uP zs(vIM48d;^yITUI?$MSOJ3ZoJM_uqci2S|$~R)5P#b4#;zgdGXk*26cqz>hI5 S|94gkpjY7cv!%H literal 0 HcmV?d00001 diff --git a/fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00000-e0bdebee-12e7-42bb-ba81-298de3eb33d3.c000.snappy.parquet b/fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00000-e0bdebee-12e7-42bb-ba81-298de3eb33d3.c000.snappy.parquet deleted file mode 100644 index 0e2c3b582da93fcf1df3eac0e7cda76e29b88b51..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33127 zcmeHw4RjmTnf4t;ibsy@$i}0{sgq39L>REg(#Vo!S8;5|PV8Vi`D`aZD2TB%ww36| znUVYhO%qz6d|AFUWob498qxq+U@4_(m!&MlF5P8Ymd#-)r9fG>P)aH7vN^QNh8#%u zy)%+#^fQSq(LG;#?BiH7_ult;-g)o;o%dC?whuTdJ#|b+9VU9IN=lE=LBeN32<;f! 
z=xpYOC#H4_)2tbhiAiyV*|ake78o-!O-zXa52WrECRaGj6H|$a1TM5&5Fk}A2Jy|r z_)ZE@UdmTTbpyv@imdUVT2mSr@~x>mj4J|bO;`tiy9xagGdnavuHL|ZDSd$upd2Kq zsidOay@!^(_A}w&t!2&AS6(8*XUKHm7(tr;eeCRy2rIh2j7ALnJ6vY=O(sj;d!qa) zERn64mBg|b!%C*gSWs=A)Hy17C}qy>)0_V3E7tx+%Y?Q^H@hxwhR=WayCt>XJhQv) z;;9dBJF--0TlvmMvFjlH>3{s?k6*V4ZQ_%DL`aL~c_u`SaS6pqMedNfmZvEU+`)q&zy?edOFP_a+I)!848AS}al69_k1np)EfAAoN5J>a8WZhX3< zz4li>`6>PcmgUq+HA||5l}wevEV-$`Qs$7GIkCjvd)Pt9?fMJpCgX)7$Hv#axgI{z ztA@9rP46SV^#(zb^v%C9z)e8^3&Kp!{^d^$wPc&iWiUIA-FfF5xOm&Y{`(v2b8Vgk zT5l!cN-B$W%T0xp3wA-Z%~Ma8Yx7i&HEn@RYozjlF~jENRAsV8ORS3^b5loO4)`KS zwBkPinZC{jA}Lc6NRP*}+-8BS2;|{!wqzw`NlF@N17vRGg-9qz34@eul|KL|<31Y_ zno1I~%wdE~kJr`cWxZ}^4y-YqWmPT#Ub#`FXVa(~?5-Qv*dYDw1Cr^c2+8P?%!u6Mw% zi*_`&I2`cHI2?|gx?84E(5jUIT4`lz%78u@9Y)<9#j)RBd(Uqk!~;Oq5SYrblBp|f zk<_)4nvK4h>Q4@VYY&z{=CIm=oy1X)(@6=l~ZDhJsw22kMM*(SvGB zl0sHXei6V@21L?Kx8aHYh3(X-OOmzl;AN+ObU%fVZ~K<62|gO9ry}8~*fJ%C*4f75 z@$r_1hRMlEX41vPgwY1p;czxw+1EQ57~>;ceJCo%xoCj5(V>Z$5M5`34-klQBFBu1 zBLQYK5N(k9@~2@$@ox;r{aa(ApLMWoy~A72HV-*HEv%!Z(ZjeK{psH6_cqmgn^_wz z#KQbKTWnb5g$XVmibZYo1TTnivChUa4jT<>wuq80y|Dn4AgWh9<^Ux)Cd$Vf;87aN z-mo@IKv|mpyd9M>sGeXP%#;|kwU*Mf?~1V)L3-h$JrN&^2|e4l4D`3Qet>9rtOiit z*TAer`ck^O$ynA1#Y89+@8=?X>&De~FI_tk;^^&L>NjoZzr@#2z0$CbhC+c@G|mN# ztao!F9OCJ|Kv$F(rUB|xsHFjI-8vSF@~xLS>s=0KJ?r(1b5AtRN8Kf4quW!_p%26>zC@3Y#o|mLmWXbD9CHK{((0tsAKi--uPL>t*+NIf??IQcj--| zP^DkHhjdIi%7_6Lt?3JifD-0L#p-c#%HcG+DAv45h$Y6qaRv9cWx)p%Et3XECqLrv z-!kOq{F??ksBNYZE-Zfa_7)cP4spULw4zYd-x1#B-vCAq$F90R+JdZu1y^mbs*cZ6Vf%(Z;o5=tLxe zwtxRVhv?sui2GwB{sAsJ`rRrvfLgo=R0?y`e9%86a8a@KZbZg~#I;idjYpcm(=2=M;c@J>GP-O{ew$adeBVG)cp4i<|;I~a*wWn=!E^g+X;2NO{mrV_d@ z7K>VbN}5HQ6L?}tV<<>-aXJnqaE~$=S56p77~}h|HzKCv3L`>p@%KU(E< zLre6!7qc^q&HQdg;J)H+xnD&vCT}CtWFw6y zRuG16W+j1~!_OlT&iYamz}=a2MGV4qsZmgmg3R zZG?WUE!4O9$l@00Gn=?D7l?-fo-dN7MwW58$2@lcQ#OdqI7J8=IHqMFEWK-UfuZXMdfW}7H`=a7+HCNhd!EOU4_l8&81KLS6}?|HxRIZjX56TnBVi;j-)35FDMKQ=9F^7R zd!jdPSnesWCQQDb!7apRu7ha>FUCCZJLGh?xEosI79bI9pk$UboBT@-=L5dfbShYuVOjrpf7Pf|Qn4+`0RQP`9s35Mly5zy(7Iv6Xc* zOEQ5>T^*Y0+Hr}P{Sh}ccpuG;5dmN@vl#Bnzr}eRs zUshGcVdiR~**i+0ZIdP3sy!ByuRAtr{5l&!{8Sx12(x539E$UGH21~!(kdYlf!gcX zj%q;Z0e*xR$_-|cP6+n@dl3mMgqDxU$}?mYGZv48ZHANjbvD}rBq&mLpb9E_nhur! zOX)8w%$AC>;LsYE33{4x1Mf@sgZ1R5&eQ%S?h zfs4|K@nHFHO7--3uc4Jfb)~+B(Y#swuM@rT#zoQRjq}y#jNS|Zt4P-+NsNQ zW;XZ%J@vXSd9w8--QgO@-^9Vnt#iOdH#(b}n_P{o%fY(X#wM4?;lNwtC>Qlyj5=9| zyScHc+08b=um>d_jjc5DIUFz`xSU>(%jIcoZe~3+GK|(WxjjvduBIlB*XwGbv8>bM z^uYR~$>H&`ZUElgh8&G}3GH1H=w^lCO!WX+LZJTzPa3+Z-lRp=+;&6NBdrL{?iD^<+FJe;z5XQAy8fwqCfYjB z*02Ba(_d|UzOy#>+yhQ-X7^h4SO)4qbvIx3)vMw@j8*x?SzopIQ{OwUJnXyP^EbZJ zH~+{te(F}=?>`>$Wt+j8`yaKjEr3hASg41L4n9te_fjN2@AHn8LuNOO z!$NOYXS*M7EJ`7T?Q0(#G?af9ADx905q}U?4)_|D3(_^Tfm;cRt-~kk_{e)PKJq40 zC9S2Byo3fqomCgX-?i^`5NHkV&&PB+-P*VI6L|Zp-qnLXs2}!Y0Nz2>sj#hjoGkyv zEljxsiPbsii+};3tyE1fs0OH}IsyvfAAGt4@N1}A)d7(nr{Y$O@IjgC4iYLm$c-xLJo0aynp3Cy@9K@YB~yDr98TkJ>0Hj0JRjQmsDh#C4iu z6+KS1N`G-(DtG_8Ha74Q_pVPq*P&w68um$q6I4IbGh5yFvS={>AVy*gM!? 
z--8nd26}NL>5}iPd1dB?D>fV)N3YDpdmj4Sk)toqJp9F`Z(T{fGIPi5=GMa< zugt7D{CXh%`skd=fG9_UpQ)O{S%uNNomR%`#_(e-@U2N`Z+*f;?=&UHC zOY`^oGkbGdV6uOo*_-O$XZ9++UY!bL$zHFM-ooVe`UUg_%OJGSW-@KC(3`M}fik)% zR|}R|uneA5U}k}zh1p|a_Q)JLp;7$VtT@0yYF~yU3H|c#E^C7)Y+Lh89XOKAzAQ2X zNuuZa;E3?&AH8p&$)A7kR|b-ty{yteIC{=S z9rElo^X%l68CbpR2h0E2^t%PiKyAX$nZ5Am&$w0>Ru2oShxx7^{gJ1PuTa`wm{Af@fKUxODjTKD}=5wv5rYxQzm z8dg^s`kr2@i*VT9Bn!~lQ^TRH19q$GjQd9 zp_{$`HFIiRVD~><@!`xL{!sI^zwbUf^OLu~@w*xJ?99oJzxG%COFx=019lQbd^m=87 zxU11qF4?*0shs&bbF2N4fjRv%`;Z?$Lx!WMT){X;zQTo5&ybl48H&GX%7I>bt0hGO znCcEjfA-c7;Vr<SfmvNt&3$!ZJ>8d{(>o$qt2Dr;Fdv{Vh5M^4c zvyzghQ@kg*7Di@0BrKMlV-Zz~Z?2Dz_ytbBDZw}3gCv(*G!hER=}UG7!DkQr>)`dG zoT3wklNURo45lf?j;*ri2PVQv1|ECIrtX%cd{V(vCzWOdm#k6nl#`W{^a1}6MIU@4 z>4US%@Ev#x`FZTzB2$zn0kB^pc=TXgT(o4|&xv9T9z;@YBoy7{4}>8pO)PMe{tWeu zfuJ}Q!R~XZG9}>afc;kSJqe(TwuYzVD+BMBt}!^cicF+o7fpy$DRIM;YfKzh?i#_D z3hvxcBN2E%E5s&v2o=#Gx87>p#ZUVs&ro=i0xBcxB1lP#N;&vWhxvq9s#?s#h+m9` zIRAv`kH+9ae)tM88qJqpCa2@lTsSP3T^vb-;k}d$T?(!$;P9f zID28Ov9%16)-p>O7mYU?ey3lIC)=|jkPs6Q zIlEf&UG>A8^#C;1P?(3-7aisOPKTUN?n(%8@Y(_!;g)3zhoazW>W>S2G>KNJPV!4m zu5#JTjA$$*CRH*gB}PJVzZi~og&0FOTktz3Y1BtMK^d3wFL zi=UKJjo|(t$0C>KG&5nLnMqA}p_zT6%}lbXG02DGoFDpEvT;J!OSa)3Z#PBR5VBO^-EVi;+RI2VdW#*wAFx~BRg&Q8`)rfTXgpX zZFzZ-N!@+*FhY{vcQ+1)o~HsPGSxfS()5iSct& z^WmQ@Rjh-TLlX9P2+Na%{cI^= ztacl|`_G`V3(?McALEe5$}o)g_oQf9z!* zmcz@GI$A6*tR;#U2+OO4{qQoVr`+<u-^F`ReI?s{&t=T&k<^-?svOZbT)<&rQvT z|J-r~KXdV~VtH;kupCc@UM9C{5R8dga#QoMoTyQ%B5OM#)95U(!cbM?(+q*s%m5|k zA5WR59`~(M43XN}NV>^G(Y{q+g520iY=Yd>d?t93R!mT;Y?dU7cL~e;g#Aex&n;=M zajnS7Ji0$(Q?QNcU6%xX;tI=&OYM)?eED>Gu~#LR-kV8sCke|Fg#E={U)p;;eF4v$ zRx)*w5B)&Df}YyvN!TwMEDz}I2m0~j&cSOu&jUX;P(zW)DGzTMCD`8+mXBaOAMjzv zIt?H2JXj}(6fAkusX{wBlt#;@@qTP$arh6LVj&W5y*!=i;>qZ)>nJe`+Q}8kN5S!bWCUYtl3 zJ^fHIy);}T)9pcpDf2u<#d3SFNS0H)!je3pQNf?$i-g}ls#LP-%txZQjj-H7*!Pb@ zRVz-}xZNms`D)^pnHH5oWULWc_ zz3Ny5kFrvi%|K1}C%}DPhZp_FBGQnn?gCPq#je~apIu&yDt5^{1(VEjTxWR*c3Dv{ zORkqQZvVNd`B-k*oo1=cJXw=Y)KoQa%kFdysQp!yuBJ}SNAKq_=$uT@nZS7OGWlXqDZn2CbYk$+bdg*S-Jr##ed4R)f9QrH`QQGW7JvFgSw;8u=Zd5|{&_9Bj1y}W z|3HTAgQ%N3O(08^wQ zU)ZmyNapJs6)VUT$@l}|+b5rxNK6|$&>-l#LAXubzZ#k+-t=>1t_JzVl+V|CCnsj-04p2V* zfQ6m696GMasI_+hku<4#k2&|9=L`30lIH9lplp$B2kzHoo74UQ*05g=S7{mE}?($8TV0n$bCzxSvn|2(!4fKsF$51rK1V?NsmfGJXux4*5a zh~_o|WQt@x{6%UjGo?I6x%#)j%kLU3psdzG6 z3MCrZsSFAwA15=6A@w+!srhIQ{4>xjPQE2p+!Q5_*9gm7gdIxwGNTnT?||!kkqx=; z=q!(%BP`vZjMWO?sm*ZX9DIW{BZWIK$*;Gz8sLjJ85go%l2bz59mZdf3}2cYXN1@! 
zgTG1{f8DJ0L(dI1{Oy*u78?#;2*ho5wviAY4vMyxt8cL3uUUhvQ~(8JDuuxv8SbNr zaF`noLn#pF>!dGf#>lpo8}{s}OC!h;7m7T5P4PUWWxq*9Iv7h}$~m(qqdE!)0MAE7 zcz(L8B1aU6`96R!T)6B`mFn~5RYF=6erPB2s(LBNq;PRn&`jYV=_s4RA!P4CMSi}f zOu7da2apc_DI6vLUEvVPFsZ@;z)4OK z9tp=dJPglqWR)VZ3YNyPTZ-^}%@3Y#g$qm3VMT~zpS8lpQxRxsg!DWZ;l?2m+|;E1 zuRxj?Fjn@%WGGLBd*)n!IGQc3gbl}HK{y~J%(42jBj6OM`S!=0 zpt)k9l5uv0!^oj|h080^dP2Iv186@vq)+H6v1aNp-NF?-Ps}ck?ERwv zOToa!Agkhsi-6GzIW8JfJ4(55=$ydj!l5!^p$mu62%jz-5Jy}WE{zX7i;zymb# znz=7V)pI*ugvzHq3v)Z1fz5ZBv@o}89tJMV?dLx%RejNg^aT{@jyUf|n2oeBx5L+g zEX?imAI%o#_6xk=`2T-yPk%v2;ll}j`B=EL@--lZV`RKKEF4DtWg~^7q`#j;flw!% z_XO{nJ;=H_`AV)nBBcI{TH47t9gLH%9ZrP8@Lw!?n}Z|Xk>+L>JM7^{Jn)}~1sxuT prcK9!3wzdyA{|As4+zkK# diff --git a/fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00172-0f505f6a-2acf-470d-be88-97067acad74b.c000.snappy.parquet b/fink_mm/test/test_data/online/year=2019/month=09/day=03/part-00172-0f505f6a-2acf-470d-be88-97067acad74b.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..466b4f346f2f6e4c21834e094ba14cca039c8fec GIT binary patch literal 33254 zcmeHw3wRsVnf4i3vPX{W*v1la>V%2v2nTFg8eJ?4#jzbbj)U#EwiE7%u{5@2>tfGH zzJyJ~B~X@SX-aAS5GbTvvXD}iru=N1EwM{kmhyD-bmTDf%N~* z=rSW|CW$53XSa|2Ks586bKdtm-?_{=XTDDwXdkrc)%sQXHNAS3UX4(hRf`aEx$O=& z?RMJqNME6^ZqV03reZx=Yt*KbB$ozMFDnNmvQ`EE))5FJQ2M%F_yC#;@8pde%rBjs zZrk?vceA&>Fx>`B2K_1t6Pe0kBvW~SM@9l?t-EZ7Ne!Z_|agAZ3 z?@y60$`GYejASY=Cd!q{5V`c8q~X3Wem75=kRR)@!Gh5sy8t2M`r=PP71 zDkXqMN=b@kK%WfrzqCVte0@^g$1Z;VCy(e6^6uEyHN!??)Lb|e;XQMFaD!<)7Mt=k zHOT4wE*C^yzbTdlUH%lmqV{NrqxX$(gA7!&ccCMq}+ynw7MJVmSLOALt8x#m&NJurF)dm-Q4JIp-mJQ z4Y3jWDjfBN&nd1Yd)>4Y{UN#;jNH<)x$7ADBu4l)#!T#3Pj}QgbY69iG zO_p^?T}st7Ys(yvjfYIJekRPeZeDA4Q}r`JhT5^Mv3pbhdT&$B8qEd@GWnyC7~|K{ z?yd1qkfr+kT@jX>2dFO~PZJs#7>`ES){AY8cB`$CcDovFE<3HKwQD?fRM8b?f+4M+ zv~*d@SgI{y$>@yUtG!U;tvpdyZtP@YY%5ss_e3@Av9)-cG^@1swoaD!bHOQWqt@Z^ zAWt>UOi^K$=b15Lz2p zA(r#?$HV#t9qn+L`WsO7Fc%!_>C|7i{WlI1754`=n8Iv~u>=Eym!R4qb~?^NAEL*R z+THB(pbEy;9ghTK8%zOao-eyp7Yl~jiA_j7w5?{7 z&gphLn1?GjlRWF+*M%y2A~81NEFm3ES7nFVAFKQdH8mcMS^Uv>WXI3Y+O3pt7|Qd0 zCe-ps1q@C*UD@aQDxxewC<)yyH28mz4vfv59zY!fsCpb5XkAOgL`H~40y{$S&KH(rgHwT)^Er@7$Td)2xz zXwq*!Kw9UlWyBzj>iU8_poEw)zGjM?v)Z(FJzdewMdMT7xs3VWWr4@yo>`5xlO6T- zZyWY8zV5*e{dV0b6XL&dmxo5Z!wfeDy(k#*b%Z8;o50AS=oODdJjggS1*4b=t-Z!L z=j&r){_%a&sDceoMY-c{)BrUcijRbYJWK)id_q5j)ZV6bmi4u?hgcUvo7aV45RnAh z@uP>Wyl-1P=8KN{2ARm%52|TD^0*PG6k_JtfNz*%B7Ez8h>UUZYv%|G&rIDc4AVFF zI9n!ci0qC8w%QPVFLIqm7JrGOH#jyP+Z5+!Sl{-+UVU`cceKXOIw`#3T0jN^b*hsi3#s%AVtNEt#8_E4mBQVCSZo0+gwlz)1gZ{}-G&(~{Jt<7;8@=7@U1qhEo0yMDckqk$ErDY)Dq`*Y%8wb6peBL-qTL*Jg6>n=!O9J znP3ASjdOn17af^k{XZz}st@n*ZX4mjNK;_381#eD=oKc*f0AD4c+^llLNSp5)fbIM z48JBTc#7dzVud3ZpqLmHgB*B7X|!u*v?R>&{X4bD(s7xVAn5NC)lk?s*zSC?+UA6w z=yn#<3&j=ew1s27=JY(GBDzs$`Bmhq+qJqDi>+m3GlKr_k8~q4$j1!#>dJa6Y_#3t zy0RaU&>(s%Y8sp^A`Vn<;e73pv5WSQ6^(9-!&!MyJ9jbv z6Y6!QVBgjQ#U2&7{sjTkOtp*L?_$mDwz}kR!wBqQ50c+R|eBm(pto zk$Era8HCB$9|}%&96}`RZXSJ$ETbp>4%KX5{dYP$ls9*g^gg0ysOC}E6`EGPy2@W( z+XzoNDirmXUqIGIsOTugP_ZAKrwhV!b@T6w%G=5e?y@?VSy_&Se{6`(Q5s|Ut`)jil$)=tt*3%8Q+9W4>)qbJs}5~5|9WNR`pT~I>sIRc7#DqHef7{h zA7jIs&id`cn;JJ?>THC}H9d4AB4NJ&*4NZNUgJbQV&39JH4F(edHHtTDnl9K=~bw# zR^1c1anmYSc@3fS_6%(!F1ZdKD|pr7g8znXPLI>!ao8<3cq~*>RUPG(1dKMqv2_S( zYb=eha)U>Cu)KPun(i!nVC^Z32?-R=ifUWyV7s?TTj~lUqPl`^?PUGh3lQzO09AF7 z9zE&=RW{jLX=q-j-EQ|bZL+zXh@k4}))q^%&D9JNPb(hW{ljRBw~OPV96iVcf^oi; zc3PU<#8tEbt^NUIs5w<*gpVt9h_5^!E=D%0G+)?!uc~8ZQT1SZ12Fekb-@S|`@dgQ zM@N5KT^)lbmxrS7E`h#HmM|CWGU&WpqO;m>(P6~SHBdwFlnjM}F}8tXZf-BF=Hg*! 
zy^bBI7L*=jM_I01Q$bR3&ivQsk+4GWd_q>8B&#jsv2e(wIilWRGCfLyB4u}0LqpF~ z!Sa78{cUB%29wp|?CU4JJ(0@s`YD)@nUNsS%!Aq1ezYYD55TEdE6DvY3tMIwE_h3; z+A>l;Sy6up8@7xw;V=X3aM8M-1GfGeqH^uUXkMcwHw9ywjg3D=-m2N^s-E+_lpj>z zU>c2vLebgyl&Ry2^VDO0%eLz3_7Dp*LixXI)s5DPzG|bb(d~AE0mq125#@S_K*Qy` zswh}FFcB&~6)68hshXM!-=d>pQL5acrQm@x28Ik#tF9^SFIUmOL9|*$!Xm2UT>^P4 zhpfISn5}|ytus;9+tjK@4W-_uNA4wPI#2J`=~8ra2CzYCeE(W~L@UyLx?txYKi?dR-bA(v7J*jLQRSb#G#K zr!GO2CVS`h(b?NT3fsKw>Ti!sMxwKk-0CM2j3vnnT>WfWcJ=R6>t9zTtN#_%{#r=y zW?b2SGa??%#-r9z&4!i+# z+kvsA*|n8I_04V|w>g`w&CPJ@h62`Rn-y|3(^i*5#==FQn-Ly{k_u!M0{s;Yy+z-f zv_#z<*H=H$iqPV2?&FoM<;SRvM=Y%yk3BHc)_JON<5!>kM(fWy>o;aAV1zETxZ9}4 zWx)25yK`k-dqK>LF{*A}^w#jd_P+bd#ER2IR(y9R z*3Y$5ErSaq6zrcdk4@>Pdi5lpXx!uFkhle&)?9B_XS)w?KTAPE_q7iVY09t0vuh|G z_61;liLYT%DO^L_59+NA`n?3q3&r~4T9{SHR7PW|P+S0mGV7}`;&~@3i8?%L_Yx}A zd2e?Rct(N4uH=MD)`B*uANK43ZPi~Sfi|kKIPov;SK=L%FlK`n0RbQz^|igA7=Y>; z2*`*3@U#Z#wR%dN4UK9n#RCT6NndgYf^tV*I z=>*q+EhB}5jE0fgBZ07to;b^tKD2Rr=Ga_~rAFaj+>YYif31yA6Z#L}DPD_v-tPC? z@G7CNF)+(;qqr+w+K)#-dv|{^d<^vt^*8ol!Qfyo?tpzkj=`(4q5d9b4$Fr-y9gsu zQD!7Z{93|uAIKY9Y$IcM%!cEQy=;VU912FpLM#UF>cAjSxCe~$p-Mm4pg&n84$tz; z62>A>1M(jws7CnEoL(>z_iKF@{{@8~vFMt*%y;(A($y5m=naYcYU#=9C zs%%RP@uHG8Qy8_Wz!tKj{N(PO9+w=?CwHet`N`elD3?lsCOOJg!nu6zC|^cjwhRnA zNl#ktGs~kX`3`V-o`J!DN5}FEjNj@mKRtx*0Q~_zm!GTYeZ^nu(*w41H>A$Rwz#3q zsh9tBaT|QL)ji)pkh;Z}c?&_3=%;;fV*JJ@A807@jc0zRA<4yyt2Ctbx(nY*zG8y(lJ+hCLKTDS>wNd8|BttW)CT|Ll;-{u#-urgQfq1XP#xtEbm5^cOx*yGj?R+ zOz@Zbr2Vg+BR}7H=JQ2g-i`dvpCe(_vM;@Ed%ydizh>+&I-hHMa;yEq7WjPbA6C@= zJ-EzXIQQ`#2Uc=zYu^1NdL780{ioml{B;A@#{W?DuRn*+Yj4;Udk;R}^L%{I113&p zk(o*{k||6)ndC~v1sox_$doGEe)Q$11}`@+mh|rH`Ek?Q#b*wG@5<|ZHH$y4e0cDW zvAV@=J$DQruQ4wk`{wUo)O|=THvHniJFBlUFFtYLcenl3O7miW|3kaot1ev3T@j~{ zjASYgRyJ397G*Qn>}_voF*@n4Fulq>xYX6>C=8ANuz9E=Ap^7UCyWRTa}G z7X~l?4|I$BKNgN}@J~N?*~bfi{&Vd&|8dKyg~)&?VmWF%8%u(G+*Gbx)pF~7f2=`iH|jj3ho`x}#H0cfcd zZWUiTJ?hoJ?0B}LJr_NdVkA>n7(I0@O0&^(+&Ulw{#!ik!*zf+w9=FNR7$E2;)^q! zUz9>V(X9h~CE#3StG++!?Y?d=e(jR>J%D#2(GbH?2|q5%56*c(@EpW18Z+2k5-$aV zd=V*a(PDM>s{(%fkJmo%hsR{vCY53&Q&-q}scTUR1Q~4u-E=PL3b3Vj--_3M!|lDT ztY!Z4^*nqAb$1?w4e-B>pZW=5L_5nUWP$(pmsNOYlQ|zcRDKMXfU0J7OI#LX7|B!~ ztWd5L6jB^R(r$_0qTs*Wj+06;lBp}ql)4tBP&lLN(vFkrX7#`O<*$?7Go$Im?q?)Z zc`!?=oG1mB?2ZFAzA60}zy}Nmr#8H`5kAqYnzzBeHxS=_lORdzmOsD@b%S61??eT; z`1N0E>d7{{T~lE_c+Wj=;_Pkz`fqRIrq8l;5-8Dmg}hj^Tq#J-$|hSAnYe_2 z!7;Nw76)0RBr8&(4dC#Zl9l41iiFo;imD8wgIlSO#h#D}Fii$fDb9gi(c^pyhZLNG zV+n-?dVheQ3uB+^RF)EOVZy--_@0C+6%B;u5?315CtTxl;6FH%MlG7*=TgNrbM|q5 zO1yJ`gDu=SRYo!l?igHjhJ{!n6>_SL+DUfaCxi>Yn^Z6cG8Ta(T~tWHcPh-sMT3Ea zSu40vA0G)az8T&ZiNZxfT**hEwz=}MgdFE)LZM_ch2tT38=ZhI1^-3xf`s^pSR%C? z2qq&*_{j*?T@|Cxt~IunM$kHeQ^rK1VFoNO~S-JW`vK1;xRV6 z3DzdlN8+R41e~;C(x_OKlsPMtGGA6JmEmTFF~C7P`jhrJA6gsYnW8XvFdS5W40<^S z{U}ixwOD&KKgrG}q+0O#j^QGg=5#Y|xtj@Hc)6Q>rrj*@xZ4>U6(4u_MtI!e8ABUj zLovn&LoeBHVN@nNcHcqp1hRiiva5!Gvo?S zAwykFZOtbmX0leVt8o^S>Ke&|UavCL8J;H$#|bm(l?1s;E@5pUAMaYz0PB7=Jo{iR zgg+26|B9&9mnKP)YX^bhMZ)kB0jXZ2Oh_wUCgp~|A{qEci$wU;YD9wnXc6G&MzUi3 zT&W!XDYa;$to4mR^ODN&s>*yy?Ny9IRZIXSmLAe7V}f6lh$?E+Yb`;$Lzfv2U1a`6 ziC4C2HnE&^MJANIpD-LE%%^PLvjiW==yT96;!0L^0_^RC;V!~_(C!tY4{}2gWZ;fC z#1_t65ekTjesshE?fhUeI3c+!g>VJj&bd-KW9)H>N+;Hs0(!_0Gw*SEjba3XjD{yA zWhE<80pJK>I7*mL0z^){0VtOvlT+=8TU0H*x)s1*C^9^!G9Ph!U9i|cLkNM4&L4S1 zP;GjJEK&I*52&0SBY;)TmdUC7q*p{wt+oa1=QM`>YV(s`tnoRxAl;$ai-afP8$56d7-jB*@3N6o7o&R*bx^SY6{z@3zEU z7T#R}(_X^xl*(|I%6!{a+|lIy;AL9yK<}Br6C0ld{L{cdm=E+q7ZJVRVRT6DDlB(fAI_Q?|XBSQtkpBNS! 
[base85-encoded binary payload omitted]
literal 0
HcmV?d00001
diff --git a/fink_mm/utils/application.py b/fink_mm/utils/application.py
index 08f7cc068..dde6d2608 100644
--- a/fink_mm/utils/application.py
+++ b/fink_mm/utils/application.py
@@ -1,22 +1,19 @@
 import sys
 import os
 from enum import Flag, auto
-from dateutil import parser
-
-from astropy.time import Time
 
 import fink_mm
-import fink_mm.offline.spark_offline as offline
-import fink_mm.online.ztf_join_gcn as online
+import fink_mm.ztf_join_gcn as online
 import fink_mm.distribution.distribution as distrib
+from fink_mm.utils.fun_utils import DataMode
+from fink_mm.init import LoggerNewLine
 
 
 class Application(Flag):
-    OFFLINE = auto()
-    ONLINE = auto()
+    JOIN = auto()
     DISTRIBUTION = auto()
 
-    def build_application(self, logger, **kwargs):
+    def build_application(self, logger: LoggerNewLine, data_mode: DataMode = None, **kwargs) -> str:
         """
         Return the command line application
 
@@ -41,45 +38,18 @@ def build_application(self, logger, **kwargs):
         application : String
             command line application to append to a spark-submit
         """
-        if self == Application.OFFLINE:
+        if self == Application.JOIN:
             application = os.path.join(
                 os.path.dirname(fink_mm.__file__),
-                "offline",
-                "spark_offline.py prod",
+                "ztf_join_gcn.py",
             )
 
-            try:
-                start_window_in_jd = (
-                    Time(parser.parse(kwargs["night"]), format="datetime").jd + 0.49
-                )  # + 0.49 to start the time window in the night
-
-                application += " " + kwargs["hbase_catalog"]
-                application += " " + kwargs["gcn_datapath_prefix"]
-                application += " " + kwargs["grb_datapath_prefix"]
-                application += " " + kwargs["night"]
-                application += " " + kwargs["NSIDE"]
-                application += " " + str(start_window_in_jd)
-                application += " " + str(kwargs["time_window"])
-                application += " " + kwargs["ast_dist"]
-                application += " " + kwargs["pansstar_dist"]
-                application += " " + kwargs["pansstar_star_score"]
-                application += " " + kwargs["gaia_dist"]
-
-                if kwargs["is_test"]:
-                    application += " " + str(False)
-                else:
-                    application += " " + str(True)
-            except Exception as e:
-                logger.error("Parameter not found \n\t {}\n\t{}".format(e, kwargs))
-                exit(1)
-
-            return application
-        elif self == Application.ONLINE:
-            application = os.path.join(
-                os.path.dirname(fink_mm.__file__),
-                "online",
-                "ztf_join_gcn.py prod",
-            )
+            if data_mode == DataMode.OFFLINE:
+                application += " offline"
+            elif data_mode == DataMode.STREAMING:
+                application += " streaming"
+            else:
+                raise Exception(f"data_mode does not exist: {data_mode}")
 
             try:
                 application += " " + kwargs["ztf_datapath_prefix"]
@@ -89,11 +59,19 @@ def build_application(self, logger, **kwargs):
                 application += " " + kwargs["NSIDE"]
                 application += " " + str(kwargs["exit_after"])
                 application += " " + kwargs["tinterval"]
+                application += " " + str(kwargs["time_window"])
                 application += " " + kwargs["ast_dist"]
                 application += " " + kwargs["pansstar_dist"]
                 application += " " + kwargs["pansstar_star_score"]
                 application += " " + kwargs["gaia_dist"]
                 application += " " + str(True) if kwargs["logs"] else " " + str(False)
+                application += " " + kwargs["hdfs_adress"]
+
+                if kwargs["is_test"]:
+                    application += " " + str(True)
+                else:
+                    application += " " + str(False)
+
             except Exception as e:
                 logger.error("Parameter not found \n\t {}\n\t{}".format(e, kwargs))
                 exit(1)
@@ -121,42 +99,11 @@ def build_application(self, logger, **kwargs):
 
         return application
 
-    def run_application(self):
+    def run_application(self, data_mode: DataMode = None):
         """
         Run the application
         """
-        if self == Application.OFFLINE:
-            hbase_catalog = sys.argv[2]
-            gcn_datapath_prefix = sys.argv[3]
-            grb_datapath_prefix = sys.argv[4]
-            night = sys.argv[5]
-            NSIDE = int(sys.argv[6])
-            start_window = float(sys.argv[7])
-            time_window = int(sys.argv[8])
-
-            ast_dist = float(sys.argv[9])
-            pansstar_dist = float(sys.argv[10])
-            pansstar_star_score = float(sys.argv[11])
-            gaia_dist = float(sys.argv[12])
-
-            column_filter = True if sys.argv[13] == "True" else False
-
-            offline.spark_offline(
-                hbase_catalog,
-                gcn_datapath_prefix,
-                grb_datapath_prefix,
-                night,
-                NSIDE,
-                start_window,
-                time_window,
-                ast_dist,
-                pansstar_dist,
-                pansstar_star_score,
-                gaia_dist,
-                with_columns_filter=column_filter,
-            )
-
-        elif self == Application.ONLINE:
+        if self == Application.JOIN:
             ztf_datapath_prefix = sys.argv[2]
             gcn_datapath_prefix = sys.argv[3]
             grb_datapath_prefix = sys.argv[4]
@@ -164,14 +111,17 @@ def run_application(self):
             NSIDE = int(sys.argv[6])
             exit_after = sys.argv[7]
             tinterval = sys.argv[8]
-
-            ast_dist = float(sys.argv[9])
-            pansstar_dist = float(sys.argv[10])
-            pansstar_star_score = float(sys.argv[11])
-            gaia_dist = float(sys.argv[12])
-            logs = True if sys.argv[13] == "True" else False
-
-            online.ztf_join_gcn_stream(
+            time_window = sys.argv[9]
+            ast_dist = float(sys.argv[10])
+            pansstar_dist = float(sys.argv[11])
+            pansstar_star_score = float(sys.argv[12])
+            gaia_dist = float(sys.argv[13])
+            logs = True if sys.argv[14] == "True" else False
+            hdfs_adress = sys.argv[15]
+            is_test = True if sys.argv[16] == "True" else False
+
+            online.ztf_join_gcn(
+                data_mode,
                 ztf_datapath_prefix,
                 gcn_datapath_prefix,
                 grb_datapath_prefix,
@@ -179,11 +129,14 @@ def run_application(self):
                 NSIDE,
                 exit_after,
                 tinterval,
+                time_window,
+                hdfs_adress,
                 ast_dist,
                 pansstar_dist,
                 pansstar_star_score,
                 gaia_dist,
                 logs,
+                is_test,
             )
 
         elif self == Application.DISTRIBUTION:
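# --- editor's note (not part of the patch) -----------------------------------
# A minimal sketch of the positional-argument contract that build_application and
# run_application must keep in sync: the order of the `application += " " + ...`
# lines above is exactly the order run_application reads them back from sys.argv
# (argv[1] being the "streaming"/"offline" mode). ARG_ORDER and build_cli are
# hypothetical helpers for illustration, not part of fink_mm.
ARG_ORDER = [
    "ztf_datapath_prefix",  # sys.argv[2]
    "gcn_datapath_prefix",  # sys.argv[3]
    "grb_datapath_prefix",  # sys.argv[4]
    "night",                # sys.argv[5]
    "NSIDE",                # sys.argv[6]
    "exit_after",           # sys.argv[7]
    "tinterval",            # sys.argv[8]
    "time_window",          # sys.argv[9]
    "ast_dist",             # sys.argv[10]
    "pansstar_dist",        # sys.argv[11]
    "pansstar_star_score",  # sys.argv[12]
    "gaia_dist",            # sys.argv[13]
    "logs",                 # sys.argv[14]
    "hdfs_adress",          # sys.argv[15]
    "is_test",              # sys.argv[16]
]

def build_cli(script: str, mode: str, kwargs: dict) -> str:
    """Assemble 'script mode arg1 arg2 ...' the same way build_application does."""
    return " ".join([script, mode] + [str(kwargs[k]) for k in ARG_ORDER])
# -----------------------------------------------------------------------------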
diff --git a/fink_mm/utils/fun_utils.py b/fink_mm/utils/fun_utils.py
index e78c299e0..3fab4ca4d 100644
--- a/fink_mm/utils/fun_utils.py
+++ b/fink_mm/utils/fun_utils.py
@@ -5,8 +5,9 @@
 from pyarrow import fs
 
 import pyspark.sql.functions as F
+from pyspark.sql import DataFrame
 
-from pyspark.sql.functions import pandas_udf, col
+from pyspark.sql.functions import pandas_udf
 from pyspark.sql.types import DoubleType, ArrayType, IntegerType
 
 from fink_filters.classification import extract_fink_classification
@@ -16,10 +17,16 @@
 from fink_mm.observatory.observatory import Observatory
 from fink_mm.gcn_stream.gcn_reader import load_voevent_from_file, load_json_from_file
 from fink_mm.init import init_logging
+from enum import Enum
 
 # from fink_broker.tracklet_identification import add_tracklet_information
 
 
+class DataMode(Enum):
+    STREAMING = "streaming"
+    OFFLINE = "offline"
+
+
 def get_hdfs_connector(host: str, port: int, user: str):
     """
     Initialise a connector to HDFS.
@@ -336,6 +343,8 @@ def get_association_proba(
     ztf_ra: pd.Series,
     ztf_dec: pd.Series,
     jdstarthist: pd.Series,
+    hdfs_adress: pd.Series,
+    gcn_status: pd.Series,
 ) -> pd.Series:
     """
     Compute the association probability between the ztf alerts and the gcn events.
@@ -357,6 +366,10 @@ def get_association_proba(
         only detections that fell on the same field and readout-channel ID
         where the input candidate was observed are counted.
         All raw detections down to a photometric S/N of ~ 3 are included.
+    hdfs_adress : HDFS address used to instantiate the hdfs client,
+        used to fetch the GW skymap of a GCN stored in HDFS
+    gcn_status : used to distinguish GCNs sharing the same triggerId
+        (to account for GCN updates)
 
     Return
     ------
@@ -376,6 +389,8 @@ def get_association_proba(
    ...         sparkDF["ra"],
    ...         sparkDF["dec"],
    ...         sparkDF["candidate.jdstarthist"],
+    ...         sql_func.lit(""),
+    ...         sql_func.lit("")
    ...     ),
    ... )
@@ -409,9 +424,15 @@ def get_association_proba(
     """
     return pd.Series(
         [
-            get_observatory(obs, event).association_proba(z_ra, z_dec, z_trigger_time)
-            for obs, event, z_ra, z_dec, z_trigger_time in zip(
-                obsname, rawEvent, ztf_ra, ztf_dec, jdstarthist
+            get_observatory(obs, event).association_proba(
+                z_ra,
+                z_dec,
+                z_trigger_time,
+                hdfs_adress=hdfs_adress.values[0],
+                gcn_status=status,
+            )
+            for obs, event, z_ra, z_dec, z_trigger_time, status in zip(
+                obsname, rawEvent, ztf_ra, ztf_dec, jdstarthist, gcn_status
             )
         ]
     )
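# --- editor's note (not part of the patch) -----------------------------------
# Why `hdfs_adress.values[0]` is safe above: join_post_process passes the address
# with F.lit(...), so inside the pandas UDF the column arrives as a constant
# Series, one identical scalar per row of the batch. Toy pandas-only
# illustration with a made-up address:
import pandas as pd

batch = pd.DataFrame(
    {
        "ztf_ra": [210.1, 210.2],
        "hdfs_adress": ["127.0.0.1", "127.0.0.1"],  # what F.lit("127.0.0.1") yields
    }
)
assert batch["hdfs_adress"].values[0] == "127.0.0.1"
# -----------------------------------------------------------------------------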
@@ -703,7 +724,10 @@ def format_rate_results(spark_df, rate_column):
     )
 
 
-def join_post_process(df_grb, with_rate=True, from_hbase=False):
+def join_post_process(
+    df_grb: DataFrame,
+    hdfs_adress: str,
+) -> DataFrame:
     """
     Post processing after the join, used by both the offline and online modes
 
     Parameters
     ----------
     df_grb: PySpark DataFrame
         the dataframe returned by the gcn x ztf join.
+    hdfs_adress: str
+        used to instantiate the hdfs client
-    with_rate: boolean
-        if True, compute the rate. should be True only when historical data
-        are available in the alert packets.
@@ -725,27 +754,26 @@ def join_post_process(
     Examples
     --------
     """
-    if with_rate:
-        df_grb = concat_col(df_grb, "magpsf")
-        df_grb = concat_col(df_grb, "diffmaglim")
-        df_grb = concat_col(df_grb, "jd")
-        df_grb = concat_col(df_grb, "fid")
-
-        df_grb = df_grb.withColumn(
-            "c_rate",
-            compute_rate(
-                df_grb["{}magpsf".format("" if from_hbase else "candidate.")],
-                df_grb["{}jdstarthist".format("" if from_hbase else "candidate.")],
-                df_grb["{}jd".format("" if from_hbase else "candidate.")],
-                df_grb["{}fid".format("" if from_hbase else "candidate.")],
-                df_grb["cmagpsf"],
-                df_grb["cdiffmaglim"],
-                df_grb["cjd"],
-                df_grb["cfid"],
-            ),
-        )
+    df_grb = concat_col(df_grb, "magpsf")
+    df_grb = concat_col(df_grb, "diffmaglim")
+    df_grb = concat_col(df_grb, "jd")
+    df_grb = concat_col(df_grb, "fid")
+
+    df_grb = df_grb.withColumn(
+        "c_rate",
+        compute_rate(
+            df_grb["candidate.magpsf"],
+            df_grb["candidate.jdstarthist"],
+            df_grb["candidate.jd"],
+            df_grb["candidate.fid"],
+            df_grb["cmagpsf"],
+            df_grb["cdiffmaglim"],
+            df_grb["cjd"],
+            df_grb["cfid"],
+        ),
+    )
 
-        df_grb = format_rate_results(df_grb, "c_rate")
+    df_grb = format_rate_results(df_grb, "c_rate")
 
     # TODO : do something better with satellites
     # df_grb = add_tracklet_information(df_grb)
@@ -761,11 +789,11 @@ def join_post_process(
             df_grb["snn_snia_vs_nonia"],
             df_grb["snn_sn_vs_all"],
             df_grb["rf_snia_vs_nonia"],
-            df_grb["{}ndethist".format("" if from_hbase else "candidate.")],
-            df_grb["{}drb".format("" if from_hbase else "candidate.")],
-            df_grb["{}classtar".format("" if from_hbase else "candidate.")],
-            df_grb["{}jd".format("" if from_hbase else "candidate.")],
-            df_grb["{}jdstarthist".format("" if from_hbase else "candidate.")],
+            df_grb["candidate.ndethist"],
+            df_grb["candidate.drb"],
+            df_grb["candidate.classtar"],
+            df_grb["candidate.jd"],
+            df_grb["candidate.jdstarthist"],
             df_grb["rf_kn_vs_nonkn"],
             df_grb["tracklet"],
         ),
@@ -779,104 +807,41 @@ def join_post_process(
             df_grb["raw_event"],
             df_grb["ztf_ra"],
             df_grb["ztf_dec"],
-            df_grb["{}".format("start_vartime" if with_rate else "jdstarthist")],
+            df_grb["start_vartime"],
+            F.lit(hdfs_adress),
+            df_grb["gcn_status"],
         ),
     )
 
-    fink_added_value = [
-        "cdsxmatch",
-        "DR3Name",
-        "Plx",
-        "e_Plx",
-        "gcvs",
-        "vsx",
-        "x3hsp",
-        "x4lac",
-        "mangrove",
-        "roid",
-        "rf_snia_vs_nonia",
-        "snn_snia_vs_nonia",
-        "snn_sn_vs_all",
-        "mulens",
-        "nalerthist",
-        "rf_kn_vs_nonkn",
-        "t2",
-        "anomaly_score",
-        "lc_features_g",
-        "lc_features_r",
+    # select only relevant columns
+    cols_to_remove = [
+        "candidate",
+        "prv_candidates",
+        "timestamp",
+        "hpix",
+        "hpix_circle",
+        "index",
+        "fink_broker_version",
+        "fink_science_version",
+        "cmagpsf",
+        "cdiffmaglim",
+        "cjd",
+        "cfid",
+        "tracklet",
+        "ivorn",
+        "triggerTimejd",
     ]
-
-    if from_hbase:
-        fink_added_value = [
-            "DR3Name",
-            "Plx",
-            "anomaly_score",
-            "cdsxmatch",
-            "e_Plx",
-            "gcvs",
-            "mangrove_2MASS_name",
-            "mangrove_HyperLEDA_name",
-            "mangrove_ang_dist",
-            "mangrove_lum_dist",
-            "mulens",
-            "rf_kn_vs_nonkn",
-            "rf_snia_vs_nonia",
-            "roid",
-            "snn_sn_vs_all",
-            "snn_snia_vs_nonia",
-            "t2_AGN",
-            "t2_EB",
-            "t2_KN",
-            "t2_M-dwarf",
-            "t2_Mira",
-            "t2_RRL",
-            "t2_SLSN-I",
-            "t2_SNII",
-            "t2_SNIa",
-            "t2_SNIa-91bg",
-            "t2_SNIax",
-            "t2_SNIbc",
-            "t2_TDE",
-            "t2_mu-Lens-Single",
-            "tracklet",
-            "vsx",
-            "x3hsp",
-            "x4lac",
-        ]
-
-    column_to_return = [
-        "objectId",
-        "candid",
-        "ztf_ra",
-        "ztf_dec",
-        "{}fid".format("" if from_hbase else "candidate."),
-        "{}jdstarthist".format("" if from_hbase else "candidate."),
-        "{}rb".format("" if from_hbase else "candidate."),
-        "{}jd".format("" if from_hbase else "candidate."),
-        "instrument",
-        "event",
-        "observatory",
-        "triggerId",
-        "gcn_status",
-        "gcn_ra",
-        "gcn_dec",
-        col("err_arcmin").alias("gcn_loc_error"),
-        "triggerTimeUTC",
-        "p_assoc",
-        "fink_class",
-        "raw_event",
-    ] + fink_added_value
-
-    if with_rate:
-        column_to_return += [
-            "delta_mag",
-            "rate",
-            "from_upper",
-            "start_vartime",
-            "diff_vartime",
-        ]
-
-    # select a subset of columns before the writing
-    df_grb = df_grb.select(column_to_return).filter("p_assoc != -1.0")
+    cols_fink = [i for i in df_grb.columns if i not in cols_to_remove]
+    cols_extra = [
+        "candidate.candid",
+        "candidate.fid",
+        "candidate.jdstarthist",
+        "candidate.rb",
+        "candidate.jd",
+    ]
+    df_grb = df_grb.select(cols_fink + cols_extra).filter("p_assoc != -1.0")
+    df_grb = df_grb.withColumnRenamed("err_arcmin", "gcn_loc_error")
 
     return df_grb
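# --- editor's note (not part of the patch) -----------------------------------
# Toy illustration of the column pruning at the end of join_post_process: keep
# every top-level column not blacklisted, then pull a few fields back out of the
# dropped `candidate` struct. Column names below are a reduced, hypothetical
# subset of the real schema.
cols_to_remove = ["candidate", "prv_candidates", "timestamp", "hpix"]
df_columns = ["objectId", "candidate", "prv_candidates", "timestamp", "hpix", "p_assoc"]
cols_fink = [c for c in df_columns if c not in cols_to_remove]
cols_extra = ["candidate.candid", "candidate.fid"]
assert cols_fink + cols_extra == ["objectId", "p_assoc", "candidate.candid", "candidate.fid"]
# -----------------------------------------------------------------------------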
"ztf_dec", - "{}fid".format("" if from_hbase else "candidate."), - "{}jdstarthist".format("" if from_hbase else "candidate."), - "{}rb".format("" if from_hbase else "candidate."), - "{}jd".format("" if from_hbase else "candidate."), - "instrument", - "event", - "observatory", - "triggerId", - "gcn_status", - "gcn_ra", - "gcn_dec", - col("err_arcmin").alias("gcn_loc_error"), - "triggerTimeUTC", - "p_assoc", - "fink_class", - "raw_event", - ] + fink_added_value - - if with_rate: - column_to_return += [ - "delta_mag", - "rate", - "from_upper", - "start_vartime", - "diff_vartime", - ] - - # select a subset of columns before the writing - df_grb = df_grb.select(column_to_return).filter("p_assoc != -1.0") + cols_fink = [i for i in df_grb.columns if i not in cols_to_remove] + cols_extra = [ + "candidate.candid", + "candidate.fid", + "candidate.jdstarthist", + "candidate.rb", + "candidate.jd", + ] + df_grb = df_grb.select(cols_fink + cols_extra).filter("p_assoc != -1.0") + df_grb = df_grb.withColumnRenamed("err_arcmin", "gcn_loc_error") return df_grb @@ -1111,6 +1076,8 @@ def read_grb_admin_options(arguments, config, logger, is_test=False): Path where to store the output of fink-grb. tinterval: String Time interval between batch processing for online mode. + hdfs_adress : String + HDFS adress used to instanciate the hdfs client from the hdfs package NSIDE: String Healpix map resolution, better if a power of 2 hbase_catalog: String @@ -1135,13 +1102,13 @@ def read_grb_admin_options(arguments, config, logger, is_test=False): >>> logger = init_logging() >>> read_grb_admin_options(arguments, config, logger, False) - ('20221014', '120', 'fink_mm/test/test_data/ztf_test/online', 'fink_mm/test/test_data/gcn_test/raw', 'fink_mm/test/test_output', '30', '4', '/home/roman.le-montagner/fink-broker/catalogs_hbase/ztf.jd.json', 7, 'localhost:9092', 'toto', 'tata') + ('20221014', '120', 'fink_mm/test/test_data/ztf_test', 'fink_mm/test/test_data/gcn_test/raw', 'fink_mm/test/test_output', '30', '127.0.0.1', '4', '/home/roman.le-montagner/fink-broker/catalogs_hbase/ztf.jd.json', 7, 'localhost:9092', 'toto', 'tata') >>> res = read_grb_admin_options(arguments, config, logger, True) >>> fink_home = os.environ["FINK_HOME"] >>> expected_res = f'{fink_home}/catalogs_hbase/ztf.jd.json' - >>> res[7] == expected_res + >>> res[8] == expected_res True """ try: @@ -1163,6 +1130,7 @@ def read_grb_admin_options(arguments, config, logger, is_test=False): gcn_datapath_prefix = config["PATH"]["online_gcn_data_prefix"] grb_datapath_prefix = config["PATH"]["online_grb_data_prefix"] tinterval = config["STREAM"]["tinterval"] + hdfs_adress = config["HDFS"]["host"] NSIDE = config["ADMIN"]["NSIDE"] hbase_catalog = config["PATH"]["hbase_catalog"] @@ -1191,6 +1159,7 @@ def read_grb_admin_options(arguments, config, logger, is_test=False): gcn_datapath_prefix, grb_datapath_prefix, tinterval, + hdfs_adress, NSIDE, hbase_catalog, time_window, diff --git a/fink_mm/ztf_join_gcn.py b/fink_mm/ztf_join_gcn.py new file mode 100644 index 000000000..5ab1eb7f4 --- /dev/null +++ b/fink_mm/ztf_join_gcn.py @@ -0,0 +1,722 @@ +import warnings + +warnings.filterwarnings("ignore") + +import time +import subprocess +from typing import Tuple +import sys +import json +import pandas as pd +from threading import Timer + +from pyspark.sql import functions as F +from pyspark.sql.functions import explode, col, pandas_udf +from pyspark.sql.types import StringType +from pyspark.sql import SparkSession, DataFrame + + +from astropy.time import Time +from datetime import 
diff --git a/fink_mm/ztf_join_gcn.py b/fink_mm/ztf_join_gcn.py
new file mode 100644
index 000000000..5ab1eb7f4
--- /dev/null
+++ b/fink_mm/ztf_join_gcn.py
@@ -0,0 +1,722 @@
+import warnings
+
+warnings.filterwarnings("ignore")
+
+import time
+import subprocess
+from typing import Tuple
+import sys
+import json
+import pandas as pd
+from threading import Timer
+
+from pyspark.sql import functions as F
+from pyspark.sql.functions import explode, col, pandas_udf
+from pyspark.sql.types import StringType
+from pyspark.sql import SparkSession, DataFrame
+
+
+from astropy.time import Time
+from datetime import timedelta
+
+from fink_utils.science.utils import ang2pix
+from fink_utils.spark.partitioning import convert_to_datetime
+from fink_utils.broker.sparkUtils import init_sparksession, connect_to_raw_database
+
+from fink_mm.utils.fun_utils import (
+    build_spark_submit,
+    join_post_process,
+    read_and_build_spark_submit,
+    read_prior_params,
+    read_additional_spark_options,
+    read_grb_admin_options,
+)
+from fink_mm.init import LoggerNewLine
+import fink_mm.utils.application as apps
+from fink_mm.utils.fun_utils import DataMode
+from fink_mm.init import get_config, init_logging, return_verbose_level
+from fink_mm.utils.fun_utils import get_pixels
+
+from fink_filters.filter_mm_module.filter import (
+    f_grb_bronze_events,
+    f_grb_silver_events,
+    f_grb_gold_events,
+    f_gw_bronze_events,
+)
+
+
+def ztf_grb_filter(spark_ztf, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist):
+    """
+    Filter the ztf alerts using the cross-match values contained in the alert packets.
+
+    Parameters
+    ----------
+    spark_ztf : spark dataframe
+        a spark dataframe containing alerts. The following columns are mandatory
+        and have to be at the candidate level:
+        - ssdistnr, distpsnr1, sgscore1, neargaia
+    ast_dist: float
+        distance to nearest known solar system object; set to -999.0 if none [arcsec]
+        ssdistnr field
+    pansstar_dist: float
+        Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec]
+        distpsnr1 field
+    pansstar_star_score: float
+        Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star
+        sgscore1 field
+    gaia_dist: float
+        Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec]
+        neargaia field
+
+    Returns
+    -------
+    spark_filter : spark dataframe
+        filtered alerts
+
+    Examples
+    --------
+    >>> sparkDF = spark.read.format('parquet').load(alert_data)
+
+    >>> spark_filter = ztf_grb_filter(sparkDF, 5, 2, 0, 5)
+
+    >>> spark_filter.count()
+    32
+    """
+    spark_filter = (
+        spark_ztf.filter(
+            (spark_ztf.candidate.ssdistnr > ast_dist)
+            | (
+                spark_ztf.candidate.ssdistnr == -999.0
+            )  # keep alerts with no known SSO counterpart within ast_dist arcsec
+        )
+        .filter(
+            (spark_ztf.candidate.distpsnr1 > pansstar_dist)
+            | (
+                spark_ztf.candidate.distpsnr1 == -999.0
+            )  # keep alerts with no Pan-STARRS1 source within pansstar_dist arcsec
+            | (spark_ztf.candidate.sgscore1 < pansstar_star_score)
+        )
+        .filter(
+            (spark_ztf.candidate.neargaia > gaia_dist)
+            | (
+                spark_ztf.candidate.neargaia == -999.0
+            )  # keep alerts with no Gaia DR1 source within gaia_dist arcsec
+        )
+    )
+
+    return spark_filter
+
+
+def check_path_exist(spark, path):
+    """Check we have data for the given night on HDFS
+
+    Parameters
+    ----------
+    path: str
+        Path on HDFS (file or folder)
+
+    Returns
+    ----------
+    out: bool
+    """
+    # check on hdfs
+    jvm = spark._jvm
+    jsc = spark._jsc
+    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration())
+    if fs.exists(jvm.org.apache.hadoop.fs.Path(path)):
+        return True
+    else:
+        return False
+
+
+def aux_remove_skymap(d: dict) -> dict:
+    """
+    Remove the skymap key from the gw event given in input.
+
+    Parameters
+    ----------
+    d : dict
+        gw event dictionary
+
+    Returns
+    -------
+    dict
+        same as input but without the skymap key.
+    """
+    return {
+        k: v if k != "event" else {k2: v2 for k2, v2 in v.items() if k2 != "skymap"}
+        for k, v in d.items()
+    }
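# --- editor's illustration (not part of the patch) ----------------------------
# Doctest-style sketch of aux_remove_skymap on a toy LVK-like payload: only the
# nested "skymap" entry under "event" is dropped, everything else is untouched.
# The payload shape below is made up for illustration.
toy = {"superevent_id": "S190903a", "event": {"skymap": "<base64 bytes>", "time": "2019-09-03"}}
assert aux_remove_skymap(toy) == {
    "superevent_id": "S190903a",
    "event": {"time": "2019-09-03"},
}
# ------------------------------------------------------------------------------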
+ """ + return { + k: v if k != "event" else {k2: v2 for k2, v2 in v.items() if k2 != "skymap"} + for k, v in d.items() + } + + +@pandas_udf(StringType()) +def remove_skymap(obsname: pd.Series, rawEvent: pd.Series) -> pd.Series: + """ + Remove the skymap key for the LVK alert + + Parameters + ---------- + obsname : pd.Series + observatory name + rawEvent : pd.Series + raw_event + + Returns + ------- + pd.Series + raw_event columns but for the LVK alerts, the skymap key has been removed. + """ + return pd.Series( + [ + json.dumps(aux_remove_skymap(json.loads(raw))) if obs == "LVK" else raw + for obs, raw in zip(obsname, rawEvent) + ] + ) + + +def load_dataframe( + spark: SparkSession, + ztf_path: str, + gcn_path: str, + night: str, + time_window: int, + load_mode: DataMode, +) -> Tuple[DataFrame, DataFrame]: + if load_mode == DataMode.STREAMING: + # connection to the ztf science stream + ztf_alert = connect_to_raw_database( + ztf_path + + "/online/science/year={}/month={}/day={}".format( + night[0:4], night[4:6], night[6:8] + ), + ztf_path + + "/online/science/year={}/month={}/day={}".format( + night[0:4], night[4:6], night[6:8] + ), + latestfirst=False, + ) + + userschema = spark.read.option("mergeSchema", True).parquet(gcn_path).schema + gcn_alert = ( + spark.readStream.format("parquet") + .schema(userschema) + .option("basePath", gcn_path) + .option( + "path", + gcn_path + "/year={}/month={}/day=*?*".format(night[0:4], night[4:6]), + ) + .option("latestFirst", True) + .option("mergeSchema", True) + .load() + ) + # keep gcn emitted during the day time until the end of the stream (17:00 Paris Time) + cur_time = Time(f"{night[0:4]}-{night[4:6]}-{night[6:8]}") + last_time = cur_time - timedelta(hours=7) # 17:00 Paris time yesterday + end_time = cur_time + timedelta(hours=17) # 17:00 Paris time today + + elif load_mode == DataMode.OFFLINE: + ztf_alert = ( + spark.read.format("parquet") + .option("mergeSchema", True) + .load( + ztf_path + + "/archive/science/year={}/month={}/day={}".format( + night[0:4], night[4:6], night[6:8] + ) + ) + ) + + gcn_alert = ( + spark.read.format("parquet").option("mergeSchema", True).load(gcn_path) + ) + cur_time = Time(f"{night[0:4]}-{night[4:6]}-{night[6:8]}") + last_time = cur_time - timedelta( + days=time_window, hours=7 + ) # 17:00 Paris time yesterday + end_time = cur_time + timedelta(hours=18) # 18:00 Paris time today + + gcn_alert = gcn_alert.filter( + f"triggerTimejd >= {last_time.jd} and triggerTimejd < {end_time.jd}" + ) + return ztf_alert, gcn_alert + + +def write_dataframe( + spark: SparkSession, + df_join: DataFrame, + write_path: str, + logger: LoggerNewLine, + tinterval: int, + exit_after: int, + logs: bool, + test: bool, + write_mode: DataMode, +): + if write_mode == DataMode.STREAMING: + grbdatapath = write_path + "/online" + checkpointpath_grb_tmp = write_path + "/online_checkpoint" + + query_grb = ( + df_join.writeStream.outputMode("append") + .format("parquet") + .option("checkpointLocation", checkpointpath_grb_tmp) + .option("path", grbdatapath) + .partitionBy("year", "month", "day") + .trigger(processingTime="{} seconds".format(tinterval)) + .start() + ) + logger.info("Stream launching successfull") + + class RepeatTimer(Timer): + def run(self): + while not self.finished.wait(self.interval): + self.function(*self.args, **self.kwargs) + + def print_logs(): + if logs and not test: # pragma: no cover + logger.newline() + logger.info(f"last progress : {query_grb.lastProgress}") + logger.newline(2) + logger.info(f"recent progress : 
+
+
+def write_dataframe(
+    spark: SparkSession,
+    df_join: DataFrame,
+    write_path: str,
+    logger: LoggerNewLine,
+    tinterval: int,
+    exit_after: int,
+    logs: bool,
+    test: bool,
+    write_mode: DataMode,
+):
+    """Write the joined data, as a parquet stream (STREAMING) or a static parquet append (OFFLINE)."""
+    if write_mode == DataMode.STREAMING:
+        grbdatapath = write_path + "/online"
+        checkpointpath_grb_tmp = write_path + "/online_checkpoint"
+
+        query_grb = (
+            df_join.writeStream.outputMode("append")
+            .format("parquet")
+            .option("checkpointLocation", checkpointpath_grb_tmp)
+            .option("path", grbdatapath)
+            .partitionBy("year", "month", "day")
+            .trigger(processingTime="{} seconds".format(tinterval))
+            .start()
+        )
+        logger.info("Stream launched successfully")
+
+        class RepeatTimer(Timer):
+            def run(self):
+                while not self.finished.wait(self.interval):
+                    self.function(*self.args, **self.kwargs)
+
+        def print_logs():
+            if logs and not test:  # pragma: no cover
+                logger.newline()
+                logger.info(f"last progress : {query_grb.lastProgress}")
+                logger.newline(2)
+                logger.info(f"recent progress : {query_grb.recentProgress}")
+                logger.newline(2)
+                logger.info(f"query status : {query_grb.status}")
+                logger.newline()
+
+        # print the streaming query progress every tinterval/2 seconds
+        logs_thread = RepeatTimer(int(tinterval) / 2, print_logs)
+        logs_thread.start()
+        # Keep the Streaming running until something or someone ends it!
+        if exit_after is not None:
+            time.sleep(int(exit_after))
+            query_grb.stop()
+            logs_thread.cancel()
+            logger.info("Exiting the science2grb streaming subprocess normally...")
+            return
+        else:  # pragma: no cover
+            # Wait for the end of queries
+            spark.streams.awaitAnyTermination()
+            return
+
+    elif write_mode == DataMode.OFFLINE:
+        df_join = df_join.withColumn(
+            "is_grb_bronze",
+            f_grb_bronze_events(
+                df_join["fink_class"], df_join["observatory"], df_join["rb"]
+            ),
+        )
+
+        df_join = df_join.withColumn(
+            "is_grb_silver",
+            f_grb_silver_events(
+                df_join["fink_class"],
+                df_join["observatory"],
+                df_join["rb"],
+                df_join["p_assoc"],
+            ),
+        )
+
+        df_join = df_join.withColumn(
+            "is_grb_gold",
+            f_grb_gold_events(
+                df_join["fink_class"],
+                df_join["observatory"],
+                df_join["rb"],
+                df_join["gcn_loc_error"],
+                df_join["p_assoc"],
+                df_join["rate"],
+            ),
+        )
+
+        df_join = df_join.withColumn(
+            "is_gw_bronze",
+            f_gw_bronze_events(
+                df_join["fink_class"], df_join["observatory"], df_join["rb"]
+            ),
+        )
+
+        grbxztf_write_path = write_path + "/offline"
+
+        df_join.write.mode("append").partitionBy("year", "month", "day").parquet(
+            grbxztf_write_path
+        )
+        return
+
+
+def ztf_pre_join(
+    ztf_dataframe: DataFrame,
+    ast_dist: float,
+    pansstar_dist: float,
+    pansstar_star_score: float,
+    gaia_dist: float,
+    NSIDE: int,
+) -> DataFrame:
+    """Drop unused ZTF columns, apply ztf_grb_filter and add the healpix / coordinate columns."""
+    ztf_dataframe = ztf_dataframe.drop(
+        "candid",
+        "schemavsn",
+        "publisher",
+        "cutoutScience",
+        "cutoutTemplate",
+        "cutoutDifference",
+    )
+
+    ztf_dataframe = ztf_grb_filter(
+        ztf_dataframe, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist
+    )
+
+    # compute pixels for ztf alerts
+    ztf_dataframe = ztf_dataframe.withColumn(
+        "hpix",
+        ang2pix(ztf_dataframe.candidate.ra, ztf_dataframe.candidate.dec, F.lit(NSIDE)),
+    )
+
+    ztf_dataframe = ztf_dataframe.withColumn("ztf_ra", col("candidate.ra")).withColumn(
+        "ztf_dec", col("candidate.dec")
+    )
+    return ztf_dataframe
+
+
+def gcn_pre_join(
+    gcn_dataframe: DataFrame,
+    NSIDE: int,
+    test: bool,
+) -> DataFrame:
+    """Compute the healpix pixels covered by each GCN error region and explode them, one row per pixel."""
+    # compute pixels for gcn alerts
+    gcn_dataframe = gcn_dataframe.withColumn(
+        "hpix_circle",
+        get_pixels(gcn_dataframe.observatory, gcn_dataframe.raw_event, F.lit(NSIDE)),
+    )
+
+    if not test:
+        # remove the gw skymap to save memory before the join
+        gcn_dataframe = gcn_dataframe.withColumn(
+            "raw_event",
+            remove_skymap(gcn_dataframe.observatory, gcn_dataframe.raw_event),
+        )
+
+    gcn_dataframe = gcn_dataframe.withColumn("hpix", explode("hpix_circle"))
+
+    gcn_dataframe = gcn_dataframe.withColumnRenamed("ra", "gcn_ra").withColumnRenamed(
+        "dec", "gcn_dec"
+    )
+    return gcn_dataframe
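# --- editor's illustration (not part of the patch) ----------------------------
# What explode("hpix_circle") buys in gcn_pre_join: one GCN row per healpix pixel
# covered by its localization region, so the plain equi-join on `hpix` in
# ztf_join_gcn below can match any ZTF alert whose own pixel falls inside that
# region. Pixel ids are made up; plain Python stands in for the Spark explode.
rows_in = [{"triggerId": 1, "hpix_circle": [41, 42, 43]}]
rows_out = [
    {"triggerId": r["triggerId"], "hpix": p} for r in rows_in for p in r["hpix_circle"]
]
assert rows_out == [
    {"triggerId": 1, "hpix": 41},
    {"triggerId": 1, "hpix": 42},
    {"triggerId": 1, "hpix": 43},
]
# ------------------------------------------------------------------------------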
+
+
+def ztf_join_gcn(
+    mm_mode: DataMode,
+    ztf_datapath_prefix: str,
+    gcn_datapath_prefix: str,
+    join_datapath_prefix: str,
+    night: str,
+    NSIDE: int,
+    exit_after: int,
+    tinterval: int,
+    time_window: int,
+    hdfs_adress: str,
+    ast_dist: float,
+    pansstar_dist: float,
+    pansstar_star_score: float,
+    gaia_dist: float,
+    logs: bool = False,
+    test: bool = False,
+):
+    """
+    Join the ztf alert stream and the gcn stream to find the ztf counterparts
+    of the gcn alerts.
+
+    Parameters
+    ----------
+    mm_mode : DataMode
+        streaming or offline processing mode
+    ztf_datapath_prefix : string
+        the prefix path where the ztf alerts are stored.
+    gcn_datapath_prefix : string
+        the prefix path where the gcn alerts are stored.
+    join_datapath_prefix : string
+        the prefix path to save GRB join ZTF outputs.
+    night : string
+        the processing night
+    NSIDE: String
+        Healpix map resolution, better if a power of 2
+    exit_after : int
+        the maximum active time of the streaming process, in seconds
+    tinterval : int
+        the processing interval between two data batches, in seconds
+    time_window : int
+        the number of past days of gcn to consider in offline mode
+    hdfs_adress: string
+        HDFS address used to instantiate the hdfs client from the hdfs package
+    ast_dist: float
+        distance to nearest known solar system object; set to -999.0 if none [arcsec]
+        ssdistnr field
+    pansstar_dist: float
+        Distance of closest source from PS1 catalog; if exists within 30 arcsec [arcsec]
+        distpsnr1 field
+    pansstar_star_score: float
+        Star/Galaxy score of closest source from PS1 catalog 0 <= sgscore <= 1 where closer to 1 implies higher likelihood of being a star
+        sgscore1 field
+    gaia_dist: float
+        Distance to closest source from Gaia DR1 catalog irrespective of magnitude; if exists within 90 arcsec [arcsec]
+        neargaia field
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> grb_dataoutput_dir = tempfile.TemporaryDirectory()
+    >>> grb_dataoutput = grb_dataoutput_dir.name
+    >>> ztf_join_gcn(
+    ...     DataMode.STREAMING,
+    ...     ztf_datatest,
+    ...     gcn_datatest,
+    ...     grb_dataoutput,
+    ...     "20190903",
+    ...     4, 100, 5, 7, "127.0.0.1", 5, 2, 0, 5, False, True
+    ... )
+
+    >>> datatest = pd.read_parquet(online_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+    >>> datajoin = pd.read_parquet(grb_dataoutput + "/online").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+
+    >>> datatest = datatest.drop("t2", axis=1)
+    >>> datajoin = datajoin.drop("t2", axis=1)
+
+    >>> datatest["gcn_status"] = "initial"
+    >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1)
+    >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1)
+
+    >>> list(datatest.columns) == list(datajoin.columns)
+    True
+    >>> len(datatest) == len(datajoin)
+    True
+    >>> ztf_join_gcn(
+    ...     DataMode.OFFLINE,
+    ...     ztf_datatest,
+    ...     gcn_datatest,
+    ...     grb_dataoutput,
+    ...     "20190903",
+    ...     4, 100, 5, 7, "127.0.0.1", 5, 2, 0, 5, False, True
+    ... )
+
+    >>> datatest = pd.read_parquet(offline_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+    >>> datajoin = pd.read_parquet(grb_dataoutput + "/offline").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+
+    >>> datatest = datatest.drop("t2", axis=1)
+    >>> datajoin = datajoin.drop("t2", axis=1)
+
+    >>> datatest["gcn_status"] = "initial"
+    >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1)
+    >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1)
+
+    >>> list(datatest.columns) == list(datajoin.columns)
+    True
+    >>> len(datatest) == len(datajoin)
+    True
+    """
+    logger = init_logging()
+
+    if mm_mode == DataMode.OFFLINE:
+        job_name = "offline"
+    elif mm_mode == DataMode.STREAMING:
+        job_name = "online"
+
+    spark = init_sparksession(
+        "science2mm_{}_{}{}{}".format(job_name, night[0:4], night[4:6], night[6:8])
+    )
+
+    ztf_dataframe, gcn_dataframe = load_dataframe(
+        spark,
+        ztf_datapath_prefix,
+        gcn_datapath_prefix,
+        night,
+        int(time_window),
+        mm_mode,
+    )
+    ztf_dataframe = ztf_pre_join(
+        ztf_dataframe, ast_dist, pansstar_dist, pansstar_star_score, gaia_dist, NSIDE
+    )
+    gcn_dataframe = gcn_pre_join(gcn_dataframe, NSIDE, test)
+
+    # join the two streams according to the healpix columns.
+    # A pixel id is assigned to each alert / gcn according to its position in the
+    # sky; alerts and gcn sharing the same pixel id lie in the same area of the sky.
+    join_condition = [
+        ztf_dataframe.hpix == gcn_dataframe.hpix,
+        ztf_dataframe.candidate.jdstarthist > gcn_dataframe.triggerTimejd,
+    ]
+    df_join_mm = gcn_dataframe.join(F.broadcast(ztf_dataframe), join_condition, "inner")
+
+    df_join_mm = join_post_process(df_join_mm, hdfs_adress)
+
+    # re-create partitioning columns if needed.
+    timecol = "jd"
+    converter = lambda x: convert_to_datetime(x)  # noqa: E731
+    if "timestamp" not in df_join_mm.columns:
+        df_join_mm = df_join_mm.withColumn("timestamp", converter(df_join_mm[timecol]))
+
+    if "year" not in df_join_mm.columns:
+        df_join_mm = df_join_mm.withColumn("year", F.date_format("timestamp", "yyyy"))
+
+    if "month" not in df_join_mm.columns:
+        df_join_mm = df_join_mm.withColumn("month", F.date_format("timestamp", "MM"))
+
+    if "day" not in df_join_mm.columns:
+        df_join_mm = df_join_mm.withColumn("day", F.date_format("timestamp", "dd"))
+
+    write_dataframe(
+        spark,
+        df_join_mm,
+        join_datapath_prefix,
+        logger,
+        tinterval,
+        exit_after,
+        logs,
+        test,
+        mm_mode,
+    )
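# --- editor's note (not part of the patch) ------------------------------------
# Why F.broadcast(ztf_dataframe) above: after ztf_grb_filter, the ZTF side is the
# small one, so broadcasting it lets every executor join the (potentially large,
# exploded) GCN side locally without a shuffle; this is the "swap the broadcast
# join" change mentioned in the commit log. A minimal sketch of the same pattern
# on two hypothetical DataFrames:
from pyspark.sql import functions as F

def healpix_broadcast_join(df_gcn, df_ztf):
    # one row per (GCN pixel, ZTF alert) pair sharing a pixel, keeping only
    # alerts whose variability starts after the GCN trigger time
    cond = [
        df_ztf.hpix == df_gcn.hpix,
        df_ztf.candidate.jdstarthist > df_gcn.triggerTimejd,
    ]
    return df_gcn.join(F.broadcast(df_ztf), cond, "inner")
# ------------------------------------------------------------------------------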
+
+
+def launch_join(arguments: dict, data_mode: DataMode, test: bool = False):
+    """
+    Launch the joining stream job.
+
+    Parameters
+    ----------
+    arguments : dictionary
+        arguments parsed from the command line by docopt
+    data_mode : DataMode
+        streaming or offline processing mode
+    test : bool
+        if True, run in test mode
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> launch_join({
+    ...     "--config" : None,
+    ...     "--night" : "20190903",
+    ...     "--exit_after" : 100,
+    ...     "--verbose" : False
+    ... }, DataMode.STREAMING, True)
+
+    >>> datatest = pd.read_parquet(online_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+    >>> datajoin = pd.read_parquet("fink_mm/test/test_output/online").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+
+    >>> datatest = datatest.drop("t2", axis=1)
+    >>> datajoin = datajoin.drop("t2", axis=1)
+
+    >>> datatest["gcn_status"] = "initial"
+    >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1)
+    >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1)
+
+    >>> list(datatest.columns) == list(datajoin.columns)
+    True
+    >>> len(datatest) == len(datajoin)
+    True
+
+    >>> launch_join({
+    ...     "--config" : None,
+    ...     "--night" : "20190903",
+    ...     "--exit_after" : 100,
+    ...     "--verbose" : False
+    ... }, DataMode.OFFLINE, True)
+
+    >>> datatest = pd.read_parquet(offline_data_test).sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+    >>> datajoin = pd.read_parquet("fink_mm/test/test_output/offline").sort_values(["objectId", "triggerId", "gcn_ra"]).reset_index(drop=True).sort_index(axis=1)
+
+    >>> datatest = datatest.drop("t2", axis=1)
+    >>> datajoin = datajoin.drop("t2", axis=1)
+
+    >>> datatest["gcn_status"] = "initial"
+    >>> datatest = datatest.reindex(sorted(datatest.columns), axis=1)
+    >>> datajoin = datajoin.reindex(sorted(datajoin.columns), axis=1)
+
+    >>> list(datatest.columns) == list(datajoin.columns)
+    True
+    >>> len(datatest) == len(datajoin)
+    True
+    """
+    config = get_config(arguments)
+    logger = init_logging()
+
+    verbose, debug = return_verbose_level(arguments, config, logger)
+
+    spark_submit = read_and_build_spark_submit(config, logger)
+
+    ast_dist, pansstar_dist, pansstar_star_score, gaia_dist = read_prior_params(
+        config, logger
+    )
+
+    (
+        external_python_libs,
+        spark_jars,
+        packages,
+        external_files,
+    ) = read_additional_spark_options(arguments, config, logger, verbose, False)
+
+    (
+        night,
+        exit_after,
+        ztf_datapath_prefix,
+        gcn_datapath_prefix,
+        grb_datapath_prefix,
+        tinterval,
+        hdfs_adress,
+        NSIDE,
+        _,
+        time_window,
+        _,
+        _,
+        _,
+    ) = read_grb_admin_options(arguments, config, logger)
+
+    application = apps.Application.JOIN.build_application(
+        logger,
+        data_mode=data_mode,
+        ztf_datapath_prefix=ztf_datapath_prefix,
+        gcn_datapath_prefix=gcn_datapath_prefix,
+        grb_datapath_prefix=grb_datapath_prefix,
+        night=night,
+        NSIDE=NSIDE,
+        exit_after=exit_after,
+        tinterval=tinterval,
+        time_window=time_window,
+        ast_dist=ast_dist,
+        pansstar_dist=pansstar_dist,
+        pansstar_star_score=pansstar_star_score,
+        gaia_dist=gaia_dist,
+        logs=verbose,
+        hdfs_adress=hdfs_adress,
+        is_test=test,
+    )
+
+    if debug:
+        logger.debug(f"application command = {application}")
+
+    spark_submit = build_spark_submit(
+        spark_submit,
+        application,
+        external_python_libs,
+        spark_jars,
+        packages,
+        external_files,
+    )
+
+    if debug:
+        logger.debug(f"spark-submit command = {spark_submit}")
+
+    completed_process = subprocess.run(spark_submit, shell=True, capture_output=True)
+
+    if completed_process.returncode != 0:  # pragma: no cover
+        logger.error(
+            f"fink-mm joining stream spark application has ended with a non-zero returncode.\
+            \n\tstdout:\n\n{completed_process.stdout} \n\tstderr:\n\n{completed_process.stderr}"
+        )
+        exit(1)
+
+    if arguments["--verbose"]:
+        logger.info("fink-mm joining stream spark application ended normally")
+        print()
+        logger.info(f"job logs:\n\n{completed_process.stdout}")
+    return
+
+
+if __name__ == "__main__":
+    if sys.argv[1] == "streaming":  # pragma: no cover
+        apps.Application.JOIN.run_application(DataMode.STREAMING)
+    elif sys.argv[1] == "offline":
+        apps.Application.JOIN.run_application(DataMode.OFFLINE)
diff --git a/scheduler/gen_avro_schema.py b/scheduler/gen_avro_schema.py
index 46e425852..6ce283b23 100755
--- a/scheduler/gen_avro_schema.py
+++ b/scheduler/gen_avro_schema.py
@@ -14,6 +14,7 @@
 
 SCHEMA_VERSION = __distribution_schema_version__
 
+
 def readschemadata(bytes_io: io._io.BytesIO) -> fastavro._read.reader:
     """Read data that already has an Avro schema.
     Parameters
diff --git a/setup.py b/setup.py
index 4c4a2ab4a..8c990b201 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@
         "pytest==7.2.2",
         "pandera==0.14.5",
         "astropy_healpix==0.7",
+        "hdfs==2.7.2",
     ],
     entry_points={"console_scripts": ["fink_mm=fink_mm.fink_mm_cli:main"]},
     license="Apache-2.0 License",