From 9d7d9773c82389876ccfb46cbf564bfdc3039891 Mon Sep 17 00:00:00 2001
From: Guy Nir <37179063+guynir42@users.noreply.github.com>
Date: Thu, 13 Jun 2024 22:35:58 -0700
Subject: [PATCH] Preprocessing steps (#305)

Separate preprocessing steps to those already done on the exposure (in the instrument class) and those that are required in the config.
---
 .github/workflows/run-pipeline-tests-1.yml |   1 -
 default_config.yaml                        |   2 +-
 models/decam.py                            |   3 +-
 models/enums_and_bitflags.py               |  25 ++---
 models/image.py                            |  13 +++
 models/instrument.py                       |  32 +++---
 models/ptf.py                              |   4 +-
 pipeline/parameters.py                     |   2 +-
 pipeline/preprocessing.py                  | 119 ++++++++-------------
 tests/improc/test_tools.py                 |   1 +
 tests/pipeline/test_pipeline.py            |   2 -
 tests/pipeline/test_preprocessing.py       |  25 +----
 12 files changed, 102 insertions(+), 127 deletions(-)

diff --git a/.github/workflows/run-pipeline-tests-1.yml b/.github/workflows/run-pipeline-tests-1.yml
index af0b121f..38132c7e 100644
--- a/.github/workflows/run-pipeline-tests-1.yml
+++ b/.github/workflows/run-pipeline-tests-1.yml
@@ -59,6 +59,5 @@ jobs:
 
       - name: run test
         run: |
-          df -h
           shopt -s nullglob
           TEST_SUBFOLDER=$(ls tests/pipeline/test_{a..o}*.py) docker compose run runtests
diff --git a/default_config.yaml b/default_config.yaml
index dcbe863a..4f676164 100644
--- a/default_config.yaml
+++ b/default_config.yaml
@@ -79,7 +79,7 @@ catalog_gaiadr3:
 pipeline: {}
 
 preprocessing:
-  use_sky_subtraction: False
+  steps_required: [ 'overscan', 'linearity', 'flat', 'fringe' ]
 
 extraction:
   sources:
diff --git a/models/decam.py b/models/decam.py
index 31d18ca2..53430041 100644
--- a/models/decam.py
+++ b/models/decam.py
@@ -124,7 +124,8 @@ def __init__(self, **kwargs):
         # will apply kwargs to attributes, and register instrument in the INSTRUMENT_INSTANCE_CACHE
         Instrument.__init__(self, **kwargs)
 
-        self.preprocessing_steps = [ 'overscan', 'linearity', 'flat', 'fringe' ]
+        self.preprocessing_steps_available = [ 'overscan', 'linearity', 'flat', 'fringe' ]
+        self.preprocessing_steps_done = []
 
     @classmethod
     def get_section_ids(cls):
diff --git a/models/enums_and_bitflags.py b/models/enums_and_bitflags.py
index a979b657..85f62131 100644
--- a/models/enums_and_bitflags.py
+++ b/models/enums_and_bitflags.py
@@ -325,6 +325,19 @@ def string_to_bitflag(value, dictionary):
         return output
 
 
+# bitflag for image preprocessing steps that have been done
+image_preprocessing_dict = {
+    0: 'overscan',
+    1: 'zero',
+    2: 'dark',
+    3: 'linearity',
+    4: 'flat',
+    5: 'fringe',
+    6: 'illumination'
+}
+image_preprocessing_inverse = {EnumConverter.c(v):k for k, v in image_preprocessing_dict.items()}
+
+
 # these are the ways an Image or Exposure are allowed to be bad
 image_badness_dict = {
     1: 'banding',
@@ -389,18 +402,6 @@ class BadnessConverter( EnumConverter ):
     _dict_filtered = None
     _dict_inverse = None
 
-# bitflag for image preprocessing steps that have been done
-image_preprocessing_dict = {
-    0: 'overscan',
-    1: 'zero',
-    2: 'dark',
-    3: 'linearity',
-    4: 'flat',
-    5: 'fringe',
-    6: 'illumination'
-}
-image_preprocessing_inverse = {EnumConverter.c(v):k for k, v in image_preprocessing_dict.items()}
-
 # bitflag used in flag images
 flag_image_bits = {
     0: 'bad pixel',        # Bad pixel flagged by the instrument
diff --git a/models/image.py b/models/image.py
index c2b09e8c..d1efd3f1 100644
--- a/models/image.py
+++ b/models/image.py
@@ -34,6 +34,10 @@
 from models.enums_and_bitflags import (
     ImageFormatConverter,
     ImageTypeConverter,
+    string_to_bitflag,
+    bitflag_to_string,
+    image_preprocessing_dict,
+    image_preprocessing_inverse,
     image_badness_inverse,
 )
 
@@ -361,6 +365,15 @@ def mid_mjd(self):
         doc='Bitflag specifying which preprocessing steps have been completed for the image.'
     )
 
+    @property
+    def preprocessing_done(self):
+        """Return a list of the names of preprocessing steps that have been completed for this image."""
+        return bitflag_to_string(self.preproc_bitflag, image_preprocessing_dict)
+
+    @preprocessing_done.setter
+    def preprocessing_done(self, value):
+        self.preproc_bitflag = string_to_bitflag(value, image_preprocessing_inverse)
+
     astro_cal_done = sa.Column(
         sa.BOOLEAN,
         nullable=False,
diff --git a/models/instrument.py b/models/instrument.py
index 0193d1d4..dea62d94 100644
--- a/models/instrument.py
+++ b/models/instrument.py
@@ -382,15 +382,22 @@ def __init__(self, **kwargs):
         self._dateobs_for_sections = getattr(self, '_dateobs_for_sections', None)  # dateobs when sections were loaded
         self._dateobs_range_days = getattr(self, '_dateobs_range_days', 1.0)  # how many days from dateobs to reload
 
-        # List of the preprocessing steps to apply to images from this
-        # instrument, in order overscan must always be first.  The
-        # values here (in the Instrument class) are all possible values.
-        # Subclasses should redefine this with the subset that they
-        # actually need.  If a subclass has to add a new preprocessing
-        # step, then it should add that step to this list, and (if it's
-        # a step that includes a calibraiton image or datafile) to the
-        # CalibratorTypeConverter dict in enums_and_bitflags.py
-        self.preprocessing_steps = [ 'overscan', 'zero','dark', 'linearity', 'flat', 'fringe', 'illumination' ]
+        # List of the preprocessing steps that can be applied to exposures from this
+        # instrument, in order.  'overscan' must always be first.
+        # All preprocessing steps that are available for an instrument are listed under preprocessing_steps_available.
+        # Use image_preprocessing_dict, defined in the enums_and_bitflags file to see all possible values.
+        # Subclasses of Instrument should redefine this with the subset that they
+        # actually need to apply. So, if an instrument has exposures
+        # that already have overscan removed, that instrument should remove 'overscan' from this list.
+        # If a subclass has to add a new preprocessing step,
+        # then it should add that step to enum_and_bitflags.image_preprocessing_dict,
+        # and (if it's a step that includes a calibraiton image or datafile)
+        # to the CalibratorTypeConverter dict in enums_and_bitflags.
+        self.preprocessing_steps_available = ['overscan', 'zero', 'dark', 'linearity', 'flat', 'fringe', 'illumination']
+        # a list of preprocessing steps that are pre-applied to the exposure data
+        self.preprocessing_steps_done = []
+        self.preprocessing_step_skip_by_filter = {}  # e.g., {'g': ['fringe', 'illumination']} will skip those for g
+
         # nofile_steps are ones that don't have an associated file
         self.preprocessing_nofile_steps = [ 'overscan' ]
 
@@ -1457,7 +1464,7 @@ def preprocessing_calibrator_files( self, calibset, flattype, section, filter, m
         construction).
 
         """
-
+        section = str(section)
         SCLogger.debug( f'Looking for calibrators for {calibset} {section}' )
 
         if ( calibset == 'externally_supplied' ) != ( flattype == 'externally_supplied' ):
@@ -1475,7 +1482,7 @@ def preprocessing_calibrator_files( self, calibset, flattype, section, filter, m
         expdatetime = pytz.utc.localize( astropy.time.Time( mjd, format='mjd' ).datetime )
 
         with SmartSession(session) as session:
-            for calibtype in self.preprocessing_steps:
+            for calibtype in self.preprocessing_steps_available:
                 if calibtype in self.preprocessing_nofile_steps:
                     continue
 
@@ -1895,7 +1902,8 @@ def __init__(self, **kwargs):
         Instrument.__init__(self, **kwargs)
 
         # DemoInstrument doesn't know how to preprocess
-        self.preprocessing_steps = []
+        self.preprocessing_steps_available = []
+        self.preprocessing_steps_done = ['overscan', 'linearity', 'flat', 'fringe']
 
     @classmethod
     def get_section_ids(cls):
diff --git a/models/ptf.py b/models/ptf.py
index 55a8729c..00914ca0 100644
--- a/models/ptf.py
+++ b/models/ptf.py
@@ -26,7 +26,9 @@ def __init__(self, **kwargs):
         # will apply kwargs to attributes, and register instrument in the INSTRUMENT_INSTANCE_CACHE
         Instrument.__init__(self, **kwargs)
 
-        self.preprocessing_steps = []
+        # we are using preprocessed data as the exposures, so everything is already done
+        self.preprocessing_steps_available = []
+        self.preprocessing_steps_done = ['overscan', 'linearity', 'flat', 'fringe']
 
     @classmethod
     def get_section_ids(cls):
diff --git a/pipeline/parameters.py b/pipeline/parameters.py
index ef5212fd..a58c3977 100644
--- a/pipeline/parameters.py
+++ b/pipeline/parameters.py
@@ -333,7 +333,7 @@ def __setattr__(self, key, value):
             and not isinstance(value, self.__typecheck__[real_key])
         ):
             raise TypeError(
-                f'Parameter "{key}" must be of type {self.__typecheck__[real_key]}'
+                f'Parameter "{key}" must be of type {self.__typecheck__[real_key]}, got {type(value)} instead. '
             )
         super().__setattr__(real_key, value)
 
diff --git a/pipeline/preprocessing.py b/pipeline/preprocessing.py
index 1613319f..1b70999d 100644
--- a/pipeline/preprocessing.py
+++ b/pipeline/preprocessing.py
@@ -23,17 +23,16 @@ class ParsPreprocessor(Parameters):
     def __init__(self, **kwargs):
         super().__init__()
 
-        self.use_sky_subtraction = self.add_par('use_sky_subtraction', False, bool, 'Apply sky subtraction. ',
-                                                critical=True)
-        self.add_par( 'steps', None, ( list, None ), "Steps to do; don't specify, or pass None, to do all." )
-        self.add_par( 'calibset', None, ( str, None ),
-                      ( "One of the CalibratorSetConverter enum; "
-                        "the calibrator set to use.  Defaults to the instrument default" ),
-                      critical = True )
+        self.add_par( 'steps_required', [], list, "Steps that need to be done to each exposure" )
+
+        self.add_par( 'calibset', 'externally_supplied', str,
+                      "The calibrator set to use.  Choose one of the CalibratorSetConverter enum. ",
+                      critical=True )
         self.add_alias( 'calibrator_set', 'calibset' )
-        self.add_par( 'flattype', None, ( str, None ),
-                      ( "One of the FlatTypeConverter enum; defaults to the instrument default" ),
-                      critical = True )
+
+        self.add_par( 'flattype', 'externally_supplied', str,
+                      "One of the FlatTypeConverter enum. ",
+                      critical=True )
 
         self._enforce_no_new_attrs = True
 
@@ -75,10 +74,6 @@ def __init__(self, **kwargs):
         # the object did any work or just loaded from DB or datastore
         self.has_recalculated = False
 
-        # TODO : remove this if/when we actually put sky subtraction in run()
-        if self.pars.use_sky_subtraction:
-            raise NotImplementedError( "Sky subtraction in preprocessing isn't implemented." )
-
     def run( self, *args, **kwargs ):
         """Run preprocessing for a given exposure and section_identifier.
 
@@ -90,9 +85,6 @@ def run( self, *args, **kwargs ):
           - Exposure, section_identifier
         Passing just an image won't work.
 
-        kwargs can also include things that override the preprocessing
-        behavior.  (TODO: document this)
-
         Returns
         -------
         DataStore
@@ -105,9 +97,6 @@ def run( self, *args, **kwargs ):
         except Exception as e:
             return DataStore.catch_failure_to_parse(e, *args)
 
-        # This is here just for testing purposes
-        self._ds = ds  # TODO: is there a reason not to just use the output datastore?
-
         try:  # catch any exceptions and save them in the datastore
             t_start = time.perf_counter()
             if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')):
@@ -124,41 +113,21 @@ def run( self, *args, **kwargs ):
             if ( self.instrument is None ) or ( self.instrument.name != ds.exposure.instrument ):
                 self.instrument = ds.exposure.instrument_object
 
-            # The only reason these are saved in self, rather than being
-            # local variables, is so that tests can probe them
-            self._calibset = None
-            self._flattype = None
-            self._stepstodo = None
-
-            if 'calibset' in kwargs:
-                self._calibset = kwargs['calibset']
-            elif 'calibratorset' in kwargs:
-                self._calibset = kwargs['calibrator_set']
-            elif self.pars.calibset is not None:
-                self._calibset = self.pars.calibset
-            else:
-                self._calibset = cfg.value( f'{self.instrument.name}.calibratorset',
-                                            default=cfg.value( 'instrument_default.calibratorset' ) )
-
-            if 'flattype' in kwargs:
-                self._flattype = kwargs['flattype']
-            elif self.pars.flattype is not None:
-                self._flattype = self.pars.flattype
-            else:
-                self._flattype = cfg.value( f'{self.instrument.name}.flattype',
-                                            default=cfg.value( 'instrument_default.flattype' ) )
-
-            if 'steps' in kwargs:
-                self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in kwargs['steps'] ]
-            elif self.pars.steps is not None:
-                self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in self.pars.steps ]
-            else:
-                self._stepstodo = self.instrument.preprocessing_steps
-
+            # check that all required steps can be done (or have been done) by the instrument:
+            known_steps = self.instrument.preprocessing_steps_available
+            known_steps += self.instrument.preprocessing_steps_done
+            known_steps = set(known_steps)
+            needed_steps = set(self.pars.steps_required)
+            if not needed_steps.issubset(known_steps):
+                raise ValueError(
+                    f'Missing some preprocessing steps {needed_steps - known_steps} '
+                    f'for instrument {self.instrument.name}'
+                )
+            
             # Get the calibrator files
             SCLogger.debug("preprocessing: getting calibrator files")
-            preprocparam = self.instrument.preprocessing_calibrator_files( self._calibset,
-                                                                           self._flattype,
+            preprocparam = self.instrument.preprocessing_calibrator_files( self.pars.calibset,
+                                                                           self.pars.flattype,
                                                                            ds.section_id,
                                                                            ds.exposure.filter_short,
                                                                            ds.exposure.mjd,
@@ -167,18 +136,6 @@ def run( self, *args, **kwargs ):
             SCLogger.debug("preprocessing: got calibrator files")
 
             # get the provenance for this step, using the current parameters:
-            # Provenance includes not just self.pars.get_critical_pars(),
-            # but also the steps that were performed.  Reason: we may well
-            # load non-flatfielded images in the database for purposes of
-            # collecting images used for later building flats.  We will then
-            # flatfield those images.  The two images in the database must have
-            # different provenances.
-            # We also include any overrides to calibrator files, as that indicates
-            # that something individual happened here that's different from
-            # normal processing of the image.
-            # Fix this as part of issue #147
-            # provdict = dict( self.pars.get_critical_pars() )
-            # provdict['preprocessing_steps' ] = self._stepstodo
             prov = ds.get_provenance('preprocessing', self.pars.get_critical_pars(), session=session)
 
             # check if the image already exists in memory or in the database:
@@ -194,17 +151,27 @@ def run( self, *args, **kwargs ):
             if image.preproc_bitflag is None:
                 image.preproc_bitflag = 0
 
-            required_bitflag = 0
-            for step in self._stepstodo:
-                required_bitflag |= string_to_bitflag( step, image_preprocessing_inverse )
-
-            if image._data is None:  # in case we are skipping all preprocessing steps
-                image.data = image.raw_data
-
-            if image.preproc_bitflag != required_bitflag:
+            needed_steps -= set(self.instrument.preprocessing_steps_done)
+            filter_skips = self.instrument.preprocessing_step_skip_by_filter.get(ds.exposure.filter_short, [])
+            if not isinstance(filter_skips, list):
+                raise ValueError(f'Filter skips parameter for {ds.exposure.filter_short} must be a list')
+            filter_skips = set(filter_skips)
+            needed_steps -= filter_skips
+
+            if image._data is None:  # in case we skip all preprocessing steps
+                image.data = image.raw_data 
+            
+            # the image keeps track of the steps already done to it in image.preproc_bitflag,
+            # which is translated into a list of keywords when calling image.preprocessing_done
+            # this includes the things that already were applied in the exposure
+            # (i.e., the instrument's preprocessing_steps_done) but does not
+            # include the things that were skipped for this filter
+            # (i.e., the instrument's preprocessing_step_skip_by_filter)
+            already_done = set(image.preprocessing_done.split(', ') if image.preprocessing_done else [])
+            if not needed_steps.issubset(already_done):  # still things to do here
                 self.has_recalculated = True
                 # Overscan is always first (as it reshapes the image)
-                if 'overscan' in self._stepstodo:
+                if 'overscan' in needed_steps:
                     SCLogger.debug('preprocessing: overscan and trim')
                     image.data = self.instrument.overscan_and_trim( image )
                     # Update the header ra/dec calculations now that we know the real width/height
@@ -212,7 +179,7 @@ def run( self, *args, **kwargs ):
                     image.preproc_bitflag |= string_to_bitflag( 'overscan', image_preprocessing_inverse )
 
                 # Apply steps in the order expected by the instrument
-                for step in self._stepstodo:
+                for step in needed_steps:
                     if step == 'overscan':
                         continue
                     SCLogger.debug(f"preprocessing: {step}")
@@ -274,7 +241,7 @@ def run( self, *args, **kwargs ):
 
                     image.preproc_bitflag |= string_to_bitflag( step, image_preprocessing_inverse )
 
-                # Get the Instrument standard bad pixel mask for this image
+            # Get the Instrument standard bad pixel mask for this image
             if image._flags is None or image._weight is None:
                 image._flags = self.instrument.get_standard_flags_image( ds.section_id )
 
diff --git a/tests/improc/test_tools.py b/tests/improc/test_tools.py
index 23bf0e10..aefef7d7 100644
--- a/tests/improc/test_tools.py
+++ b/tests/improc/test_tools.py
@@ -1,6 +1,7 @@
 from astropy.io import fits
 from improc.tools import strip_wcs_keywords
 
+
 def test_strip_wcs_keywords():
     hdr = fits.Header()
 
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index 1d94b996..df045e5b 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -151,7 +151,6 @@ def test_parameters( test_config ):
 
     # Verify that we can override from the yaml config file
     pipeline = Pipeline()
-    assert not pipeline.preprocessor.pars['use_sky_subtraction']
     assert pipeline.astrometor.pars['cross_match_catalog'] == 'gaia_dr3'
     assert pipeline.astrometor.pars['catalog'] == 'gaia_dr3'
     assert pipeline.subtractor.pars['method'] == 'zogy'
@@ -537,7 +536,6 @@ def test_provenance_tree(pipeline_for_tests, decam_exposure, decam_datastore, de
 def test_inject_warnings_errors(decam_datastore, decam_reference, pipeline_for_tests):
     from pipeline.top_level import PROCESS_OBJECTS
     p = pipeline_for_tests
-
     obj_to_process_name = {
         'preprocessor': 'preprocessing',
         'extractor': 'detection',
diff --git a/tests/pipeline/test_preprocessing.py b/tests/pipeline/test_preprocessing.py
index 7cf3b928..0095042d 100644
--- a/tests/pipeline/test_preprocessing.py
+++ b/tests/pipeline/test_preprocessing.py
@@ -26,11 +26,11 @@ def test_preprocessing(
     assert not preprocessor.has_recalculated
 
     # Check some Preprocesor internals
-    assert preprocessor._calibset == 'externally_supplied'
-    assert preprocessor._flattype == 'externally_supplied'
-    assert preprocessor._stepstodo == [ 'overscan', 'linearity', 'flat', 'fringe' ]
-    assert preprocessor._ds.exposure.filter[:1] == 'g'
-    assert preprocessor._ds.section_id == 'N1'
+    assert preprocessor.pars.calibset == 'externally_supplied'
+    assert preprocessor.pars.flattype == 'externally_supplied'
+    assert preprocessor.pars.steps_required == [ 'overscan', 'linearity', 'flat', 'fringe' ]
+    ds.exposure.filter[:1] == 'g'
+    ds.section_id == 'N1'
     assert set( preprocessor.stepfiles.keys() ) == { 'flat', 'linearity' }
 
     # Make sure that the BSCALE and BZERO keywords got stripped
@@ -89,21 +89,6 @@ def test_preprocessing(
     # TODO : other checks that preprocessing did what it was supposed to do?
     # (Look at image header once we have HISTORY adding in there.)
 
-    # Test some overriding
-    # clear these caches
-    preprocessor.instrument = None
-    preprocessor.stepfilesids = {}
-    preprocessor.stepfiles = {}
-
-    ds = preprocessor.run( decam_exposure, 'N1', steps=['overscan', 'linearity'] )
-    assert preprocessor.has_recalculated
-    assert preprocessor._calibset == 'externally_supplied'
-    assert preprocessor._flattype == 'externally_supplied'
-    assert preprocessor._stepstodo == [ 'overscan', 'linearity' ]
-    assert preprocessor._ds.exposure.filter[:1] == 'g'
-    assert preprocessor._ds.section_id == 'N1'
-    assert set( preprocessor.stepfiles.keys() ) == { 'linearity' }
-
 
 def test_warnings_and_exceptions(decam_exposure, preprocessor, decam_default_calibrators, archive):
     preprocessor.pars.inject_warnings = 1