diff --git a/.github/workflows/run-improc-tests.yml b/.github/workflows/run-improc-tests.yml index f22ad765..66ab002c 100644 --- a/.github/workflows/run-improc-tests.yml +++ b/.github/workflows/run-improc-tests.yml @@ -59,4 +59,4 @@ jobs: - name: run test run: | - TEST_SUBFOLDER=improc docker compose run runtests + TEST_SUBFOLDER=tests/improc docker compose run runtests diff --git a/.github/workflows/run-model-tests.yml b/.github/workflows/run-model-tests-1.yml similarity index 93% rename from .github/workflows/run-model-tests.yml rename to .github/workflows/run-model-tests-1.yml index b749a6a0..a7487536 100644 --- a/.github/workflows/run-model-tests.yml +++ b/.github/workflows/run-model-tests-1.yml @@ -1,4 +1,4 @@ -name: Run Model Tests +name: Run Model Tests 1 on: push: @@ -59,4 +59,5 @@ jobs: - name: run test run: | - TEST_SUBFOLDER=models docker compose run runtests + shopt -s nullglob + TEST_SUBFOLDER=$(ls tests/models/test_{a..l}*.py) docker compose run runtests diff --git a/.github/workflows/run-model-tests-2.yml b/.github/workflows/run-model-tests-2.yml new file mode 100644 index 00000000..c2d0eace --- /dev/null +++ b/.github/workflows/run-model-tests-2.yml @@ -0,0 +1,63 @@ +name: Run Model Tests 2 + +on: + push: + branches: + - main + pull_request: + workflow_dispatch: + +jobs: + tests: + name: run tests in docker image + runs-on: ubuntu-latest + env: + REGISTRY: ghcr.io + COMPOSE_FILE: tests/docker-compose.yaml + + steps: + - name: Dump docker logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + + - name: checkout code + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: log into github container registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: setup docker buildx + uses: docker/setup-buildx-action@v2 + with: + driver: docker-container + + - name: bake + uses: docker/bake-action@v2.3.0 + with: + workdir: tests + load: true + files: docker-compose.yaml + set: | + seechange_postgres.tags=ghcr.io/${{ github.repository_owner }}/seechange-postgres + seechange_postgres.cache-from=type=gha,scope=cached-seechange-postgres + seechange_postgres.cache-to=type=gha,scope=cached-seechange-postgres,mode=max + setuptables.tags=ghcr.io/${{ github.repository_owner }}/runtests + setuptables.cache-from=type=gha,scope=cached-seechange + setuptables.cache-to=type=gha,scope=cached-seechange,mode=max + runtests.tags=ghcr.io/${{ github.repository_owner }}/runtests + runtests.cache-from=type=gha,scope=cached-seechange + runtests.cache-to=type=gha,scope=cached-seechange,mode=max + shell.tags=ghcr.io/${{ github.repository_owner }}/runtests + shell.cache-from=type=gha,scope=cached-seechange + shell.cache-to=type=gha,scope=cached-seechange,mode=max + + - name: run test + run: | + shopt -s nullglob + TEST_SUBFOLDER=$(ls tests/models/test_{m..z}*.py) docker compose run runtests diff --git a/.github/workflows/run-pipeline-tests.yml b/.github/workflows/run-pipeline-tests.yml index 9d39459a..b1b24cbe 100644 --- a/.github/workflows/run-pipeline-tests.yml +++ b/.github/workflows/run-pipeline-tests.yml @@ -59,4 +59,4 @@ jobs: - name: run test run: | - TEST_SUBFOLDER=pipeline docker compose run runtests + TEST_SUBFOLDER=tests/pipeline docker compose run runtests diff --git a/.github/workflows/run-util-tests.yml b/.github/workflows/run-util-tests.yml index 8c37b922..7c626aeb 100644 --- a/.github/workflows/run-util-tests.yml +++ b/.github/workflows/run-util-tests.yml @@ -59,4 +59,4 @@ 
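The two model-test workflows split tests/models alphabetically with bash brace expansion (test_{a..l}*.py and test_{m..z}*.py). A rough Python rendering of that partition, for illustration only; the workflows themselves run the shell glob, not this snippet:

from pathlib import Path

def split_model_tests(test_dir='tests/models'):
    """Partition test files by the first letter after the 'test_' prefix."""
    files = sorted(Path(test_dir).glob('test_*.py'))
    first = [f for f in files if 'a' <= f.name[len('test_')] <= 'l']   # Run Model Tests 1
    second = [f for f in files if 'm' <= f.name[len('test_')] <= 'z']  # Run Model Tests 2
    return first, second
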
jobs: - name: run test run: | - TEST_SUBFOLDER=util docker compose run runtests + TEST_SUBFOLDER=tests/util docker compose run runtests diff --git a/alembic/versions/2024_05_15_1210-485334f16c23_add_report_model.py b/alembic/versions/2024_05_15_1210-485334f16c23_add_report_model.py new file mode 100644 index 00000000..7210bf05 --- /dev/null +++ b/alembic/versions/2024_05_15_1210-485334f16c23_add_report_model.py @@ -0,0 +1,76 @@ +"""add report model + +Revision ID: 485334f16c23 +Revises: 573289f12368 +Create Date: 2024-05-15 12:10:56.118620 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '485334f16c23' +down_revision = 'ec64a8fd8cf3' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('reports', + sa.Column('exposure_id', sa.BigInteger(), nullable=False), + sa.Column('section_id', sa.Text(), nullable=False), + sa.Column('start_time', sa.DateTime(), nullable=False), + sa.Column('finish_time', sa.DateTime(), nullable=True), + sa.Column('success', sa.Boolean(), nullable=False), + sa.Column('num_prev_reports', sa.Integer(), nullable=False), + sa.Column('worker_id', sa.Text(), nullable=True), + sa.Column('node_id', sa.Text(), nullable=True), + sa.Column('cluster_id', sa.Text(), nullable=True), + sa.Column('error_step', sa.Text(), nullable=True), + sa.Column('error_type', sa.Text(), nullable=True), + sa.Column('error_message', sa.Text(), nullable=True), + sa.Column('warnings', sa.Text(), nullable=True), + sa.Column('process_memory', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('process_runtime', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column('progress_steps_bitflag', sa.BIGINT(), nullable=False), + sa.Column('products_exist_bitflag', sa.BIGINT(), nullable=False), + sa.Column('products_committed_bitflag', sa.BIGINT(), nullable=False), + sa.Column('provenance_id', sa.String(), nullable=False), + sa.Column('created_at', sa.DateTime(), nullable=False), + sa.Column('modified', sa.DateTime(), nullable=False), + sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False), + sa.ForeignKeyConstraint(['exposure_id'], ['exposures.id'], name='reports_exposure_id_fkey', ondelete='CASCADE'), + sa.ForeignKeyConstraint(['provenance_id'], ['provenances.id'], name='images_provenance_id_fkey', ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id') + ) + op.create_index(op.f('ix_reports_created_at'), 'reports', ['created_at'], unique=False) + op.create_index(op.f('ix_reports_exposure_id'), 'reports', ['exposure_id'], unique=False) + op.create_index(op.f('ix_reports_finish_time'), 'reports', ['finish_time'], unique=False) + op.create_index(op.f('ix_reports_id'), 'reports', ['id'], unique=False) + op.create_index(op.f('ix_reports_products_committed_bitflag'), 'reports', ['products_committed_bitflag'], unique=False) + op.create_index(op.f('ix_reports_products_exist_bitflag'), 'reports', ['products_exist_bitflag'], unique=False) + op.create_index(op.f('ix_reports_progress_steps_bitflag'), 'reports', ['progress_steps_bitflag'], unique=False) + op.create_index(op.f('ix_reports_provenance_id'), 'reports', ['provenance_id'], unique=False) + op.create_index(op.f('ix_reports_section_id'), 'reports', ['section_id'], unique=False) + op.create_index(op.f('ix_reports_start_time'), 'reports', ['start_time'], unique=False) + op.create_index(op.f('ix_reports_success'), 
'reports', ['success'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_reports_success'), table_name='reports') + op.drop_index(op.f('ix_reports_start_time'), table_name='reports') + op.drop_index(op.f('ix_reports_section_id'), table_name='reports') + op.drop_index(op.f('ix_reports_provenance_id'), table_name='reports') + op.drop_index(op.f('ix_reports_progress_steps_bitflag'), table_name='reports') + op.drop_index(op.f('ix_reports_products_exist_bitflag'), table_name='reports') + op.drop_index(op.f('ix_reports_products_committed_bitflag'), table_name='reports') + op.drop_index(op.f('ix_reports_id'), table_name='reports') + op.drop_index(op.f('ix_reports_finish_time'), table_name='reports') + op.drop_index(op.f('ix_reports_exposure_id'), table_name='reports') + op.drop_index(op.f('ix_reports_created_at'), table_name='reports') + op.drop_table('reports') + # ### end Alembic commands ### diff --git a/alembic/versions/2024_05_22_1122-9a4097979249_reference_instrument.py b/alembic/versions/2024_05_22_1122-9a4097979249_reference_instrument.py new file mode 100644 index 00000000..b5a6aee0 --- /dev/null +++ b/alembic/versions/2024_05_22_1122-9a4097979249_reference_instrument.py @@ -0,0 +1,30 @@ +"""reference instrument + +Revision ID: 9a4097979249 +Revises: 485334f16c23 +Create Date: 2024-05-22 11:22:20.322800 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '9a4097979249' +down_revision = '485334f16c23' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('refs', sa.Column('instrument', sa.Text(), nullable=False)) + op.create_index(op.f('ix_refs_instrument'), 'refs', ['instrument'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_refs_instrument'), table_name='refs') + op.drop_column('refs', 'instrument') + # ### end Alembic commands ### diff --git a/docker/application/Dockerfile b/docker/application/Dockerfile index 10954067..c8ac099c 100755 --- a/docker/application/Dockerfile +++ b/docker/application/Dockerfile @@ -124,17 +124,17 @@ ENV PYTHONPATH "/seechange" # # Need to install mpich here rather than via package manager to ensure # ABI compatibility. -ARG mpich_version=4.0.2 -ARG mpich_prefix=mpich-$mpich_version -RUN curl -L https://www.mpich.org/static/downloads/$mpich_version/$mpich_prefix.tar.gz -O \ - && tar xf $mpich_prefix.tar.gz \ - && cd $mpich_prefix \ - && ./configure FFLAGS=-fallow-argument-mismatch FCFLAGS=-fallow-argument-mismatch \ - && make -j 16 \ - && make install \ - && make clean \ - && cd .. \ - && rm -rf $mpich_prefix $mpich_prefix.tar.gz +# ARG mpich_version=4.0.2 +# ARG mpich_prefix=mpich-$mpich_version +# RUN curl -L https://www.mpich.org/static/downloads/$mpich_version/$mpich_prefix.tar.gz -O \ +# && tar xf $mpich_prefix.tar.gz \ +# && cd $mpich_prefix \ +# && ./configure FFLAGS=-fallow-argument-mismatch FCFLAGS=-fallow-argument-mismatch \ +# && make -j 16 \ +# && make install \ +# && make clean \ +# && cd .. 
\ +# && rm -rf $mpich_prefix $mpich_prefix.tar.gz # Hotpants Alard/Lupton image subtraction RUN git clone https://github.com/acbecker/hotpants.git \ diff --git a/docs/overview.md b/docs/overview.md index 2aa9ec3b..988d187e 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -274,7 +274,7 @@ Only parameters that affect the product values are included. The upstreams are other `Provenance` objects defined for the data products that are an input to the current processing step. -The flowchart of the different process steps is defined in `pipeline.datastore.UPSTREAM_NAMES`. +The flowchart of the different process steps is defined in `pipeline.datastore.UPSTREAM_STEPS`. E.g., the upstreams for the `photo_cal` object are `['extraction', 'astro_cal']`. When a `Provenance` object has all the required inputs, it will produce a hash identifier diff --git a/models/base.py b/models/base.py index 13fc5b76..fae85231 100644 --- a/models/base.py +++ b/models/base.py @@ -79,6 +79,7 @@ def setup_warning_filters(): "- parent object of type has been garbage collected", ) + setup_warning_filters() # need to call this here and also call it explicitly when setting up tests _engine = None @@ -446,7 +447,7 @@ def to_dict(self): @classmethod def from_dict(cls, dictionary): """Convert a dictionary into a new object. """ - dictionary.pop('modified') # we do not want to recreate the object with an old "modified" time + dictionary.pop('modified', None) # we do not want to recreate the object with an old "modified" time md5sum = dictionary.get('md5sum', None) if md5sum is not None: diff --git a/models/enums_and_bitflags.py b/models/enums_and_bitflags.py index 7460d6bf..ff64148a 100644 --- a/models/enums_and_bitflags.py +++ b/models/enums_and_bitflags.py @@ -320,7 +320,7 @@ def string_to_bitflag(value, dictionary): original_keyword = keyword keyword = EnumConverter.c(keyword) if keyword not in dictionary: - raise ValueError(f'Keyword "{original_keyword}" not recognized in dictionary') + raise ValueError(f'Keyword "{original_keyword.strip()}" not recognized in dictionary') output += 2 ** dictionary[keyword] return output @@ -409,3 +409,36 @@ class BitFlagConverter( EnumConverter ): _allowed_values = flag_image_bits _dict_filtered = None _dict_inverse = None + + +# the list of possible processing steps from a section of an exposure up to measurments, r/b scores, and report +process_steps_dict = { + 1: 'preprocessing', # creates an Image from a section of the Exposure + 2: 'extraction', # creates a SourceList from an Image, and a PSF + 3: 'astro_cal', # creates a WorldCoordinates from a SourceList + 4: 'photo_cal', # creates a ZeroPoint from a WorldCoordinates + 5: 'subtraction', # creates a subtraction Image + 6: 'detection', # creates a SourceList from a subtraction Image + 7: 'cutting', # creates Cutouts from a subtraction Image + 8: 'measuring', # creates Measurements from Cutouts + # TODO: add R/B scores and maybe an extra step for finalizing a report +} +process_steps_inverse = {EnumConverter.c(v): k for k, v in process_steps_dict.items()} + + +# the list of objects that could be loaded to a datastore after running the pipeline +pipeline_products_dict = { + 1: 'image', + 2: 'sources', + 3: 'psf', + # 4: 'background', # not yet implemented + 5: 'wcs', + 6: 'zp', + 7: 'sub_image', + 8: 'detections', + 9: 'cutouts', + 10: 'measurements', + # 11: 'rb_scores', +} + +pipeline_products_inverse = {EnumConverter.c(v): k for k, v in pipeline_products_dict.items()} \ No newline at end of file diff --git a/models/exposure.py 
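The process_steps and pipeline_products dictionaries above feed the existing string_to_bitflag / bitflag_to_string helpers in this module. A minimal, self-contained sketch of the round trip; the helpers are re-implemented locally here, so the names and normalization are illustrative rather than the module's exact code:

# keyword <-> bitflag round trip, mirroring models/enums_and_bitflags.py
steps = {1: 'preprocessing', 2: 'extraction', 3: 'astro_cal', 4: 'photo_cal'}
steps_inverse = {v: k for k, v in steps.items()}

def to_bitflag(value, inverse):
    """Comma-separated keywords -> integer bitflag (one bit per step)."""
    total = 0
    for keyword in value.split(','):
        total |= 2 ** inverse[keyword.strip()]
    return total

def to_string(value, forward):
    """Integer bitflag -> comma-separated keywords."""
    return ', '.join(name for bit, name in forward.items() if value & 2 ** bit)

flag = to_bitflag('preprocessing, extraction', steps_inverse)
assert flag == 2 ** 1 + 2 ** 2                 # the bits are the dictionary keys
assert to_string(flag, steps) == 'preprocessing, extraction'
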
b/models/exposure.py index 61d7ede7..e14e11cb 100644 --- a/models/exposure.py +++ b/models/exposure.py @@ -1,3 +1,4 @@ +import time import pathlib from collections import defaultdict @@ -7,6 +8,7 @@ from sqlalchemy.schema import CheckConstraint from sqlalchemy.orm.session import object_session from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.exc import IntegrityError from astropy.time import Time from astropy.io import fits @@ -522,6 +524,14 @@ def end_mjd(self): return None return self.mjd + self.exp_time / 86400.0 + @property + def observation_time(self): + """Translation of the MJD column to datetime object.""" + if self.mjd is None: + return None + else: + return Time(self.mjd, format='mjd').datetime + def __repr__(self): filter_str = '--' @@ -735,6 +745,47 @@ def get_downstreams(self, session=None): return images + def merge_concurrent(self, session=None): + """Try multiple times to fetch and merge this exposure. + This will hopefully protect us against concurrently adding the exposure from multiple processes. + Should also be safe to use in case that the same exposure (i.e., with the same filepath) + was added by previous runs. + """ + exposure = None + with SmartSession(session) as session: + + for i in range(5): + try: + found_exp = session.scalars( + sa.select(Exposure).where(Exposure.filepath == self.filepath) + ).first() + if found_exp is None: + exposure = session.merge(self) + session.commit() + else: + # update the found exposure with any modifications on the existing exposure + columns = Exposure.__table__.columns.keys() + for col in columns: + if col in ['id', 'created_at', 'modified']: + continue + setattr(found_exp, col, getattr(self, col)) + exposure = found_exp + + break # if we got here without an exception, we can break out of the loop + except IntegrityError as e: + # this could happen if in between the query and the merge(exposure) + # another process added the same exposure to the database + if 'duplicate key value violates unique constraint "ix_exposures_filepath"' in str(e): + SCLogger.debug(str(e)) + session.rollback() + time.sleep(0.1 * 2 ** i) # exponential backoff + else: + raise e + else: # if we didn't break out of the loop, there must have been some integrity error + raise e + + return exposure + if __name__ == '__main__': import os diff --git a/models/image.py b/models/image.py index 55fef2ac..7d7f44c8 100644 --- a/models/image.py +++ b/models/image.py @@ -1498,7 +1498,6 @@ def load(self): if not ( gotim and gotweight and gotflags ): raise FileNotFoundError( "Failed to load at least one of image, weight, flags" ) - def free( self, free_derived_products=True, free_aligned=True, only_free=None ): """Free loaded image memory. Does not delete anything from disk. @@ -1557,8 +1556,6 @@ def free( self, free_derived_products=True, free_aligned=True, only_free=None ): for alim in self._aligned_images: alim.free( free_derived_products=free_derived_products, only_free=only_free ) - - def get_upstream_provenances(self): """Collect the provenances for all upstream objects. 
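Exposure.merge_concurrent above and Provenance.merge_concurrent in the next hunk follow the same optimistic-concurrency recipe: attempt the merge, and on a duplicate-key IntegrityError roll back, back off exponentially, and retry. A generic sketch of that recipe, assuming a plain SQLAlchemy session factory; the constraint-hint strings are the ones those two methods check for:

import time

from sqlalchemy.exc import IntegrityError

def merge_with_retry(session_factory, obj, constraint_hint, attempts=5):
    """Merge obj, retrying when a concurrent writer wins the race.

    constraint_hint is a substring of the expected duplicate-key error,
    e.g. 'ix_exposures_filepath' or 'pk_provenances'.
    """
    with session_factory() as session:
        for i in range(attempts):
            try:
                merged = session.merge(obj)
                session.commit()
                return merged
            except IntegrityError as e:
                if constraint_hint not in str(e):
                    raise                       # not the race we expected
                session.rollback()
                time.sleep(0.1 * 2 ** i)        # exponential backoff, as in the methods above
        raise RuntimeError(f'could not merge after {attempts} attempts')
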
diff --git a/models/provenance.py b/models/provenance.py index 1b5c0da0..f5022c09 100644 --- a/models/provenance.py +++ b/models/provenance.py @@ -1,3 +1,4 @@ +import time import json import base64 import hashlib @@ -5,6 +6,7 @@ from sqlalchemy import event from sqlalchemy.orm import relationship from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.exc import IntegrityError from util.util import get_git_hash @@ -338,6 +340,32 @@ def get_code_version(cls, session=None): code_version = session.scalars(sa.select(CodeVersion).order_by(CodeVersion.id.desc())).first() return code_version + def merge_concurrent(self, session=None, commit=True): + """Merge the provenance but make sure it doesn't exist before adding it to the database. + + If between the time we check if the provenance exists and the time it is merged, + another process has added the same provenance, we will get an integrity error. + This is expected under the assumptions of "optimistic concurrency". + If that happens, we simply begin again, checking for the provenance and merging it. + """ + output = None + with SmartSession(session) as session: + for i in range(5): + try: + output = session.merge(self) + if commit: + session.commit() + break + except IntegrityError as e: + if 'duplicate key value violates unique constraint "pk_provenances"' in str(e): + session.rollback() + time.sleep(0.1 * 2 ** i) # exponential sleep + else: + raise e + else: # if we didn't break out of the loop, there must have been some integrity error + raise e + + return output @event.listens_for(Provenance, "before_insert") def insert_new_dataset(mapper, connection, target): diff --git a/models/reference.py b/models/reference.py index befc5695..20aca8d3 100644 --- a/models/reference.py +++ b/models/reference.py @@ -1,5 +1,5 @@ import sqlalchemy as sa -from sqlalchemy import orm +from sqlalchemy import orm, func from models.base import Base, AutoIDMixin, SmartSession from models.image import Image @@ -52,6 +52,13 @@ class Reference(Base, AutoIDMixin): ) ) + instrument = sa.Column( + sa.Text, + nullable=False, + index=True, + doc="Name of the instrument used to make the images for this reference image. " + ) + filter = sa.Column( sa.Text, nullable=False, @@ -138,6 +145,7 @@ def __init__(self, **kwargs): def __setattr__(self, key, value): if key == 'image' and value is not None: self.target = value.target + self.instrument = value.instrument self.filter = value.filter self.section_id = value.section_id self.sources = value.sources diff --git a/models/report.py b/models/report.py new file mode 100644 index 00000000..6869b114 --- /dev/null +++ b/models/report.py @@ -0,0 +1,354 @@ + +import sqlalchemy as sa +from sqlalchemy import orm +from sqlalchemy.dialects.postgresql import JSONB + +from models.base import Base, SeeChangeBase, AutoIDMixin, SmartSession +from models.enums_and_bitflags import ( + bitflag_to_string, + string_to_bitflag, + process_steps_dict, + process_steps_inverse, + pipeline_products_dict, + pipeline_products_inverse, +) + +from util.logger import SCLogger + +class Report(Base, AutoIDMixin): + """A report on the status of analysis of one section from an Exposure. + + The report's main role is to keep a database record of when we started + and finished processing this section of the Exposure. It also keeps + track of any errors or warnings that came up during processing. 
+ """ + __tablename__ = 'reports' + + exposure_id = sa.Column( + sa.ForeignKey('exposures.id', ondelete='CASCADE', name='reports_exposure_id_fkey'), + nullable=False, + index=True, + doc=( + "ID of the exposure for which the report was made. " + ) + ) + + exposure = orm.relationship( + 'Exposure', + cascade='save-update, merge, refresh-expire, expunge', + doc=( + "Exposure for which the report was made. " + ) + ) + + section_id = sa.Column( + sa.Text, + nullable=False, + index=True, + doc=( + "ID of the section of the exposure for which the report was made. " + ) + ) + + start_time = sa.Column( + sa.DateTime, + nullable=False, + index=True, + doc=( + "Time when processing of the section started. " + ) + ) + + finish_time = sa.Column( + sa.DateTime, + nullable=True, + index=True, + doc=( + "Time when processing of the section finished. " + "If an error occurred, this will show the time of the error. " + "If the processing is still ongoing (or hanging) this will be NULL. " + ) + ) + + success = sa.Column( + sa.Boolean, + nullable=False, + index=True, + default=False, + doc=( + "Whether the processing of this section was successful. " + ) + ) + + num_prev_reports = sa.Column( + sa.Integer, + nullable=False, + default=0, + doc=( + "Number of previous reports for this exposure, section, and provenance. " + ) + ) + + worker_id = sa.Column( + sa.Text, + nullable=True, + doc=( + "ID of the worker/process that ran this section. " + ) + ) + + node_id = sa.Column( + sa.Text, + nullable=True, + doc=( + "ID of the node where the worker/process ran this section. " + ) + ) + + cluster_id = sa.Column( + sa.Text, + nullable=True, + doc=( + "ID of the cluster where the worker/process ran this section. " + ) + ) + + error_step = sa.Column( + sa.Text, + nullable=True, + doc=( + "Name of the processing step where an error occurred. " + ) + ) + + error_type = sa.Column( + sa.Text, + nullable=True, + doc=( + "Type of error that was raised during processing. " + ) + ) + + error_message = sa.Column( + sa.Text, + nullable=True, + doc=( + "Error message that was raised during processing. " + ) + ) + + warnings = sa.Column( + sa.Text, + nullable=True, + doc=( + "Comma-separated string of warnings that were raised during processing. " + "Each warning begins with the processing step name, followed by the warning type and message. " + ) + ) + + process_memory = sa.Column( + JSONB, + nullable=False, + default={}, + doc='Memory usage of the process during processing. ' + 'Each key in the dictionary is for a processing step, ' + 'and the value is the memory usage in megabytes. ' + ) + + process_runtime = sa.Column( + JSONB, + nullable=False, + default={}, + doc='Runtime of the process during processing. ' + 'Each key in the dictionary is for a processing step, ' + 'and the value is the runtime in seconds. ' + ) + + progress_steps_bitflag = sa.Column( + sa.BIGINT, + nullable=False, + default=0, + index=True, + doc='Bitflag recording what processing steps have already been applied to this section. ' + ) + + @property + def progress_steps(self): + """A comma separated string of the processing steps that have already been applied to this section. """ + return bitflag_to_string(self.progress_steps_bitflag, process_steps_dict) + + @progress_steps.setter + def progress_steps(self, value): + """Set the progress steps for this report using a comma separated string. 
""" + self.progress_steps_bitflag = string_to_bitflag(value, process_steps_inverse) + + def append_progress(self, value): + """Add some keywords (in a comma separated string) + describing what is processing steps were done on this section. + The keywords will be added to the list "progress_steps" + and progress_bitflag for this report will be updated accordingly. + """ + self.progress_steps_bitflag |= string_to_bitflag(value, process_steps_inverse) + + products_exist_bitflag = sa.Column( + sa.BIGINT, + nullable=False, + default=0, + index=True, + doc='Bitflag recording which pipeline products were not None when the pipeline finished. ' + ) + + @property + def products_exist(self): + """A comma separated string representing which products + have already been filled on the datastore when the pipeline finished. + """ + return bitflag_to_string(self.products_exist_bitflag, pipeline_products_dict) + + @products_exist.setter + def products_exist(self, value): + """Set the products_exist for this report using a comma separated string. """ + self.products_exist_bitflag = string_to_bitflag(value, pipeline_products_inverse) + + def append_products_exist(self, value): + """Add some keywords (in a comma separated string) + describing which products existed (were not None) on the datastore. + The keywords will be added to the list "products_exist" + and products_exist_bitflag for this report will be updated accordingly. + """ + self.products_exist_bitflag |= string_to_bitflag(value, pipeline_products_inverse) + + products_committed_bitflag = sa.Column( + sa.BIGINT, + nullable=False, + default=0, + index=True, + doc='Bitflag recording which pipeline products were not None when the pipeline finished. ' + ) + + @property + def products_committed(self): + """A comma separated string representing which products + have already been successfully saved using the datastore when the pipeline finished. + """ + return bitflag_to_string(self.products_committed_bitflag, pipeline_products_dict) + + @products_committed.setter + def products_committed(self, value): + """Set the products_committed for this report using a comma separated string. """ + self.products_committed_bitflag = string_to_bitflag(value, pipeline_products_inverse) + + def append_products_committed(self, value): + """Add some keywords (in a comma separated string) + describing which products were successfully saved by the datastore. + The keywords will be added to the list "products_committed" + and products_committed_bitflag for this report will be updated accordingly. + """ + self.products_committed_bitflag |= string_to_bitflag(value, pipeline_products_inverse) + + provenance_id = sa.Column( + sa.ForeignKey('provenances.id', ondelete="CASCADE", name='images_provenance_id_fkey'), + nullable=False, + index=True, + doc=( + "ID of the provenance of this report. " + "The provenance has upstreams that point to the " + "measurements and R/B score objects that themselves " + "point back to all the other provenances that were " + "used to produce this report. " + ) + ) + + provenance = orm.relationship( + 'Provenance', + cascade='save-update, merge, refresh-expire, expunge', + lazy='selectin', + doc=( + "The provenance of this report. " + "The provenance has upstreams that point to the " + "measurements and R/B score objects that themselves " + "point back to all the other provenances that were " + "used to produce this report. 
" + ) + ) + + def __init__(self, **kwargs): + SeeChangeBase.__init__(self) # do not pass kwargs to Base.__init__, as there may be non-column attributes + + # verify these attributes get their default even if the object is not committed to DB + self.success = False + self.num_prev_reports = 0 + self.progress_steps_bitflag = 0 + self.products_exist_bitflag = 0 + self.products_committed_bitflag = 0 + self.process_memory = {} + self.process_runtime = {} + + # manually set all properties (columns or not) + self.set_attributes_from_dict(kwargs) + + @orm.reconstructor + def init_on_load(self): + SeeChangeBase.init_on_load(self) + + def scan_datastore(self, ds, process_step, session=None): + """Go over all the data in a datastore and update the report accordingly. + Will commit the changes to the database. + If there are any exceptions pending on the datastore it will re-raise them. + """ + # parse the error, if it exists, so we can get to other data products without raising + exception = ds.read_exception() + + # append the newest step to the progress bitflag + self.append_progress(process_step) + + # check which objects exist on the datastore, and which have been committed + for prod in pipeline_products_dict.values(): + if getattr(ds, prod) is not None: + self.append_products_exist(prod) + + self.products_committed = ds.products_committed + + # store the runtime and memory usage statistics + self.process_runtime = ds.runtimes # update with new dictionary + self.process_memory = ds.memory_usages # update with new dictionary + + # parse the warnings, if they exist + if isinstance(ds.warnings_list, list): + new_string = self.read_warnings(process_step, ds.warnings_list) + if self.warnings is None or self.warnings == '': + self.warnings = new_string + else: + self.warnings += '\n***|***|***\n' + new_string + + if exception is not None: + self.error_type = exception.__class__.__name__ + self.error_message = str(exception) + self.error_step = process_step + + with SmartSession(session) as session: + new_report = self.commit_to_database(session=session) + + if exception is not None: + raise exception + + return new_report + + def commit_to_database(self, session): + """Commit this report to the database. """ + new_report = session.merge(self) + session.commit() + return new_report + + @staticmethod + def read_warnings(process_step, warnings_list): + """Convert a list of warnings into a comma separated string. """ + formatted_warnings = [] + for w in warnings_list: + text = f'{process_step}: {w.category} {w.message} ({w.filename}:{w.lineno})' + formatted_warnings.append(text) + SCLogger.warning(text) # make sure warnings also get printed to the log/on screen. 
+ + warnings_list.clear() # remove all the warnings but keep the list object + + return ', '.join(formatted_warnings) diff --git a/models/source_list.py b/models/source_list.py index 17a72101..d6962bf5 100644 --- a/models/source_list.py +++ b/models/source_list.py @@ -167,8 +167,8 @@ def __setattr__(self, key, value): @orm.reconstructor def init_on_load(self): - Base.init_on_load(self) FileOnDiskMixin.init_on_load(self) + SeeChangeBase.init_on_load(self) self._data = None self._info = None self._is_star = None diff --git a/models/world_coordinates.py b/models/world_coordinates.py index e64e9aa3..fb7508ec 100644 --- a/models/world_coordinates.py +++ b/models/world_coordinates.py @@ -9,7 +9,7 @@ from astropy.io import fits from astropy.wcs import utils -from models.base import Base, SmartSession, AutoIDMixin, HasBitFlagBadness +from models.base import Base, SmartSession, AutoIDMixin, HasBitFlagBadness, FileOnDiskMixin, SeeChangeBase from models.enums_and_bitflags import catalog_match_badness_inverse from models.source_list import SourceList @@ -101,14 +101,21 @@ def wcs( self, value ): self._wcs = value self.header_excerpt = value.to_header().tostring( sep='\n', padding=False ) + def __init__(self, *args, **kwargs): + SeeChangeBase.__init__(self) # don't pass kwargs as they could contain non-column key-values + self._wcs = None + + # manually set all properties (columns or not) + self.set_attributes_from_dict(kwargs) + + @orm.reconstructor + def init_on_load(self): + SeeChangeBase.init_on_load(self) + def _get_inverse_badness(self): """Get a dict with the allowed values of badness that can be assigned to this object""" return catalog_match_badness_inverse - def __init__( self, *args, **kwargs ): - super().__init__( *args, **kwargs ) - self._wcs = None - @orm.reconstructor def init_on_load( self ): Base.init_on_load( self ) diff --git a/models/zero_point.py b/models/zero_point.py index 41d92545..5daf321e 100644 --- a/models/zero_point.py +++ b/models/zero_point.py @@ -6,11 +6,12 @@ from sqlalchemy.schema import UniqueConstraint from sqlalchemy.dialects.postgresql import ARRAY -from models.base import Base, SmartSession, AutoIDMixin, HasBitFlagBadness +from models.base import Base, SmartSession, AutoIDMixin, HasBitFlagBadness, FileOnDiskMixin, SeeChangeBase from models.enums_and_bitflags import catalog_match_badness_inverse from models.world_coordinates import WorldCoordinates from models.source_list import SourceList + class ZeroPoint(Base, AutoIDMixin, HasBitFlagBadness): __tablename__ = 'zero_points' @@ -91,6 +92,16 @@ class ZeroPoint(Base, AutoIDMixin, HasBitFlagBadness): "pipeline isn't expected to have photometry to better than a couple of percent." 
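A hypothetical usage sketch for the Report bitflag helpers defined above; the constructor arguments are made up and nothing here touches the database:

import datetime

from models.report import Report

report = Report(section_id='N1', start_time=datetime.datetime.utcnow())  # other columns omitted
report.append_progress('preprocessing')            # sets bit 1 of progress_steps_bitflag
report.append_progress('extraction, astro_cal')    # appending is cumulative (bitwise OR)
print(report.progress_steps)                       # comma-separated step names

report.append_products_exist('image, sources, psf')
print(bin(report.products_exist_bitflag))          # one bit per product in pipeline_products_dict
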
) ) + def __init__(self, *args, **kwargs): + SeeChangeBase.__init__(self) # don't pass kwargs as they could contain non-column key-values + + # manually set all properties (columns or not) + self.set_attributes_from_dict(kwargs) + + @orm.reconstructor + def init_on_load(self): + SeeChangeBase.init_on_load(self) + def _get_inverse_badness(self): """Get a dict with the allowed values of badness that can be assigned to this object""" return catalog_match_badness_inverse diff --git a/pipeline/astro_cal.py b/pipeline/astro_cal.py index 267c989e..3bd796a1 100644 --- a/pipeline/astro_cal.py +++ b/pipeline/astro_cal.py @@ -1,9 +1,17 @@ +import os +import time import pathlib + import improc.scamp + from util.exceptions import CatalogNotFoundError, SubprocessFailure, BadMatchException + from util.logger import SCLogger +from util.util import parse_bool + from models.catalog_excerpt import CatalogExcerpt from models.world_coordinates import WorldCoordinates + from pipeline.parameters import Parameters from pipeline.data_store import DataStore from pipeline.catalog_tools import fetch_gaia_dr3_excerpt @@ -264,39 +272,61 @@ def run(self, *args, **kwargs): Returns a DataStore object with the products of the processing. """ self.has_recalculated = False - ds, session = DataStore.from_args(*args, **kwargs) + try: # first make sure we get back a datastore, even an empty one + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) + + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + # get the provenance for this step: + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + + # try to find the world coordinates in memory or in the database: + wcs = ds.get_wcs(prov, session=session) + + if wcs is None: # must create a new WorldCoordinate object + self.has_recalculated = True + image = ds.get_image(session=session) + if image.astro_cal_done: + SCLogger.warning( + f"Failed to find a wcs for image {pathlib.Path( image.filepath ).name}, " + f"but it has astro_cal_done=True" + ) + + if self.pars.solution_method == 'scamp': + self._run_scamp( ds, prov, session=session ) + else: + raise ValueError( f'Unknown solution method {self.pars.solution_method}' ) + + # update the upstream bitflag + sources = ds.get_sources( session=session ) + if sources is None: + raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') + if ds.wcs._upstream_bitflag is None: + ds.wcs._upstream_bitflag = 0 + ds.wcs._upstream_bitflag |= sources.bitflag + + # If an astro cal wasn't previously run on this image, + # update the image's ra/dec and corners attributes based on this new wcs + if not image.astro_cal_done: + image.set_corners_from_header_wcs(wcs=ds.wcs.wcs, setradec=True) + image.astro_cal_done = True + + ds.runtimes['astro_cal'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['astro_cal'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: + # make sure datastore is returned to be used in the next step + return ds - # get the provenance for this step: - prov = ds.get_provenance(self.pars.get_process_name(), 
self.pars.get_critical_pars(), session=session) - - # try to find the world coordinates in memory or in the database: - wcs = ds.get_wcs(prov, session=session) - - if wcs is None: # must create a new WorldCoordinate object - self.has_recalculated = True - image = ds.get_image(session=session) - if image.astro_cal_done: - SCLogger.warning( f"Failed to find a wcs for image {pathlib.Path( image.filepath ).name}, " - f"but it has astro_cal_done=True" ) - - if self.pars.solution_method == 'scamp': - self._run_scamp( ds, prov, session=session ) - else: - raise ValueError( f'Unknown solution method {self.pars.solution_method}' ) - - # update the upstream bitflag - sources = ds.get_sources( session=session ) - if sources is None: - raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') - if ds.wcs._upstream_bitflag is None: - ds.wcs._upstream_bitflag = 0 - ds.wcs._upstream_bitflag |= sources.bitflag - - # If an astro cal wasn't previously run on this image, - # update the image's ra/dec and corners attributes based on this new wcs - if not image.astro_cal_done: - image.set_corners_from_header_wcs( wcs=ds.wcs.wcs, setradec=True ) - image.astro_cal_done = True - - # make sure this is returned to be used in the next step - return ds diff --git a/pipeline/cutting.py b/pipeline/cutting.py index 67724a80..17afc428 100644 --- a/pipeline/cutting.py +++ b/pipeline/cutting.py @@ -1,12 +1,15 @@ +import os +import time -from pipeline.parameters import Parameters -from pipeline.data_store import DataStore -from util.util import parse_session +from improc.tools import make_cutouts from models.source_list import SourceList from models.cutouts import Cutouts -from improc.tools import make_cutouts +from pipeline.parameters import Parameters +from pipeline.data_store import DataStore + +from util.util import parse_session, parse_bool class ParsCutter(Parameters): @@ -46,75 +49,93 @@ def run(self, *args, **kwargs): Returns a DataStore object with the products of the processing. 
""" self.has_recalculated = False - if isinstance(args[0], SourceList) and args[0].is_sub: # most likely to get a SourceList detections object - args, kwargs, session = parse_session(*args, **kwargs) - ds = DataStore() - ds.detections = args[0] - ds.sub_image = args[0].image - ds.image = args[0].image.new_image - else: - ds, session = DataStore.from_args(*args, **kwargs) - - # get the provenance for this step: - prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) - - # try to find some measurements in memory or in the database: - cutout_list = ds.get_cutouts(prov, session=session) - - if cutout_list is None or len(cutout_list) == 0: # must create a new list of Cutouts - self.has_recalculated = True - # use the latest source list in the data store, - # or load using the provenance given in the - # data store's upstream_provs, or just use - # the most recent provenance for "detection" - detections = ds.get_detections(session=session) - - if detections is None: - raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') - - cutout_list = [] - x = detections.x - y = detections.y - sz = self.pars.cutout_size - sub_stamps_data = make_cutouts(ds.sub_image.data, x, y, sz) - sub_stamps_weight = make_cutouts(ds.sub_image.weight, x, y, sz, fillvalue=0) - sub_stamps_flags = make_cutouts(ds.sub_image.flags, x, y, sz, fillvalue=0) - ref_stamps_data = make_cutouts(ds.sub_image.ref_aligned_image.data, x, y, sz) - ref_stamps_weight = make_cutouts(ds.sub_image.ref_aligned_image.weight, x, y, sz, fillvalue=0) - ref_stamps_flags = make_cutouts(ds.sub_image.ref_aligned_image.flags, x, y, sz, fillvalue=0) - new_stamps_data = make_cutouts(ds.sub_image.new_aligned_image.data, x, y, sz) - new_stamps_weight = make_cutouts(ds.sub_image.new_aligned_image.weight, x, y, sz, fillvalue=0) - new_stamps_flags = make_cutouts(ds.sub_image.new_aligned_image.flags, x, y, sz, fillvalue=0) - - for i, source in enumerate(detections.data): - # get the cutouts - cutout = Cutouts.from_detections(detections, i, provenance=prov) - cutout.sub_data = sub_stamps_data[i] - cutout.sub_weight = sub_stamps_weight[i] - cutout.sub_flags = sub_stamps_flags[i] - cutout.ref_data = ref_stamps_data[i] - cutout.ref_weight = ref_stamps_weight[i] - cutout.ref_flags = ref_stamps_flags[i] - cutout.new_data = new_stamps_data[i] - cutout.new_weight = new_stamps_weight[i] - cutout.new_flags = new_stamps_flags[i] - cutout._upstream_bitflag = 0 - cutout._upstream_bitflag |= detections.bitflag - cutout_list.append(cutout) - - # add the resulting list to the data store - for cutout in cutout_list: - if cutout.provenance is None: - cutout.provenance = prov + try: # first make sure we get back a datastore, even an empty one + if isinstance(args[0], SourceList) and args[0].is_sub: # most likely to get a SourceList detections object + args, kwargs, session = parse_session(*args, **kwargs) + ds = DataStore() + ds.detections = args[0] + ds.sub_image = args[0].image + ds.image = args[0].image.new_image else: - if cutout.provenance.id != prov.id: - raise ValueError( - f'Provenance mismatch for cutout {cutout.provenance.id[:6]} ' - f'and preset provenance {prov.id[:6]}!' 
- ) - - ds.cutouts = cutout_list - - # make sure this is returned to be used in the next step - return ds + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) + + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + # get the provenance for this step: + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + + # try to find some measurements in memory or in the database: + cutout_list = ds.get_cutouts(prov, session=session) + + if cutout_list is None or len(cutout_list) == 0: # must create a new list of Cutouts + self.has_recalculated = True + # use the latest source list in the data store, + # or load using the provenance given in the + # data store's upstream_provs, or just use + # the most recent provenance for "detection" + detections = ds.get_detections(session=session) + + if detections is None: + raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') + + cutout_list = [] + x = detections.x + y = detections.y + sz = self.pars.cutout_size + sub_stamps_data = make_cutouts(ds.sub_image.data, x, y, sz) + sub_stamps_weight = make_cutouts(ds.sub_image.weight, x, y, sz, fillvalue=0) + sub_stamps_flags = make_cutouts(ds.sub_image.flags, x, y, sz, fillvalue=0) + ref_stamps_data = make_cutouts(ds.sub_image.ref_aligned_image.data, x, y, sz) + ref_stamps_weight = make_cutouts(ds.sub_image.ref_aligned_image.weight, x, y, sz, fillvalue=0) + ref_stamps_flags = make_cutouts(ds.sub_image.ref_aligned_image.flags, x, y, sz, fillvalue=0) + new_stamps_data = make_cutouts(ds.sub_image.new_aligned_image.data, x, y, sz) + new_stamps_weight = make_cutouts(ds.sub_image.new_aligned_image.weight, x, y, sz, fillvalue=0) + new_stamps_flags = make_cutouts(ds.sub_image.new_aligned_image.flags, x, y, sz, fillvalue=0) + + for i, source in enumerate(detections.data): + # get the cutouts + cutout = Cutouts.from_detections(detections, i, provenance=prov) + cutout.sub_data = sub_stamps_data[i] + cutout.sub_weight = sub_stamps_weight[i] + cutout.sub_flags = sub_stamps_flags[i] + cutout.ref_data = ref_stamps_data[i] + cutout.ref_weight = ref_stamps_weight[i] + cutout.ref_flags = ref_stamps_flags[i] + cutout.new_data = new_stamps_data[i] + cutout.new_weight = new_stamps_weight[i] + cutout.new_flags = new_stamps_flags[i] + cutout._upstream_bitflag = 0 + cutout._upstream_bitflag |= detections.bitflag + cutout_list.append(cutout) + + # add the resulting list to the data store + for cutout in cutout_list: + if cutout.provenance is None: + cutout.provenance = prov + else: + if cutout.provenance.id != prov.id: + raise ValueError( + f'Provenance mismatch for cutout {cutout.provenance.id[:6]} ' + f'and preset provenance {prov.id[:6]}!' 
+ ) + + ds.cutouts = cutout_list + + ds.runtimes['cutting'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['cutting'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: # make sure datastore is returned to be used in the next step + return ds diff --git a/pipeline/data_store.py b/pipeline/data_store.py index def36030..14f589a3 100644 --- a/pipeline/data_store.py +++ b/pipeline/data_store.py @@ -1,8 +1,8 @@ import math +import datetime import sqlalchemy as sa from util.util import get_latest_provenance, parse_session -from util.logger import SCLogger from models.base import SmartSession, FileOnDiskMixin from models.provenance import CodeVersion, Provenance @@ -15,10 +15,11 @@ from models.reference import Reference from models.cutouts import Cutouts from models.measurements import Measurements -from models.objects import Object +from util.logger import SCLogger -UPSTREAM_NAMES = { +# for each process step, list the steps that go into its upstream +UPSTREAM_STEPS = { 'exposure': [], # no upstreams 'preprocessing': ['exposure'], 'extraction': ['preprocessing'], @@ -30,11 +31,13 @@ 'measuring': ['cutting'], } -UPSTREAM_OBJECTS = { +# The products that are made at each processing step. +# Usually it is only one, but sometimes there are multiple products for one step (e.g., extraction) +PROCESS_PRODUCTS = { 'exposure': 'exposure', 'preprocessing': 'image', 'coaddition': 'image', - 'extraction': 'sources', + 'extraction': ['sources', 'psf'], # TODO: add background, maybe move wcs and zp in here too? 'astro_cal': 'wcs', 'photo_cal': 'zp', 'reference': 'reference', @@ -51,7 +54,8 @@ class DataStore: to be fetched from the database, and keep a cached version of the products for use downstream in the pipeline. """ - attributes_to_save = [ + # the products_to_save are also getting cleared along with products_to_clear + products_to_save = [ 'exposure', 'image', 'sources', @@ -64,7 +68,8 @@ class DataStore: 'measurements' ] - attributes_to_clear = [ + # these get cleared but not saved + products_to_clear = [ 'ref_image', 'sub_image', 'reference', @@ -100,76 +105,6 @@ def from_args(*args, **kwargs): session = ds.parse_args(*args, **kwargs) return ds, session - def __init__(self, *args, **kwargs): - """ - See the parse_args method for details on how to initialize this object. - - Please make sure to add any new attributes to the attributes_to_save list. 
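The astro_cal and cutting hunks above share the new per-step error-handling shape: always obtain a DataStore, time the step, optionally track peak memory, trap any exception into the datastore, and return the datastore from finally. A condensed sketch of that shape with a made-up step name ('mystep'); the real steps put their provenance and product logic inside the try block:

import os
import time

from pipeline.data_store import DataStore
from util.util import parse_bool

def run(*args, **kwargs):
    """Skeleton of the wrapper now used by the pipeline steps' run() methods."""
    try:  # first make sure we get back a datastore, even an empty one
        ds, session = DataStore.from_args(*args, **kwargs)
    except Exception as e:
        return DataStore.catch_failure_to_parse(e, *args)

    try:
        t_start = time.perf_counter()
        if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')):
            import tracemalloc
            tracemalloc.reset_peak()  # peak-memory accounting starts here

        # ... the step's actual work (provenance lookup, products, etc.) ...

        ds.runtimes['mystep'] = time.perf_counter() - t_start
        if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')):
            import tracemalloc
            ds.memory_usages['mystep'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2  # MB
    except Exception as e:
        ds.catch_exception(e)
    finally:
        # the datastore (with any stored exception) is always handed to the next step
        return ds
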
- """ - # these are data products that can be cached in the store - self._exposure = None # single image, entire focal plane - self._section = None # SensorSection - - self.upstream_provs = None # provenances to override the upstreams if no upstream objects exist - - # these all need to be added to the attributes_to_save list - self.image = None # single image from one sensor section - self.sources = None # extracted sources (a SourceList object, basically a catalog) - self.psf = None # psf determined from the extracted sources - self.wcs = None # astrometric solution - self.zp = None # photometric calibration - self.reference = None # the Reference object needed to make subtractions - self.sub_image = None # subtracted image - self.detections = None # a SourceList object for sources detected in the subtraction image - self.cutouts = None # cutouts around sources - self.measurements = None # photometry and other measurements for each source - self.objects = None # a list of Object associations of Measurements - - # these need to be added to the attributes_to_clear list - self.ref_image = None # to be used to make subtractions - self.sub_image = None # subtracted image - self.reference = None # the Reference object needed to make subtractions - self.exposure_id = None # use this and section_id to find the raw image - self.section_id = None # corresponds to SensorSection.identifier (*not* .id) - self.image_id = None # use this to specify an image already in the database - - # The database session parsed in parse_args; it could still be None even after parse_args - self.session = None - self.parse_args(*args, **kwargs) - - @property - def exposure( self ): - if self._exposure is None: - if self.exposure_id is not None: - self._exposure = self.get_raw_exposure( session=self.session ) - return self._exposure - - @exposure.setter - def exposure( self, value ): - self._exposure = value - self.exposure_id = value.id if value is not None else None - - @property - def section( self ): - if self._section is None: - if self.section_id is not None: - if self.exposure is not None: - self.exposure.instrument_object.fetch_sections() - self._section = self.exposure.instrument_object.get_section( self.section_id ) - return self._section - - @property - def ref_image( self ): - if self.reference is not None: - return self.reference.image - return None - - @ref_image.setter - def ref_image( self, value ): - if self.reference is None: - self.reference = Reference() - self.reference.image = value - def parse_args(self, *args, **kwargs): """ Parse the arguments to the DataStore constructor. @@ -279,6 +214,131 @@ def parse_args(self, *args, **kwargs): return output_session + @staticmethod + def catch_failure_to_parse(exception, *args): + """Call this when the from_args() function fails. + It is gaurenteed to return a DataStore object, + and will set the error attribute to the exception message. + """ + datastores = [a for a in args if isinstance(a, DataStore)] + if len(datastores) > 0: + ds = datastores[0] + else: + ds = DataStore() # return an empty datastore, as we cannot create it and cannot find one in args + + ds.exception = exception + + return ds + + def catch_exception(self, exception): + """Store the exception into the datastore for later use. """ + self.exception = exception + # This is a trivial function now, but we may want to do more complicated stuff down the road + + def read_exception(self): + """Return the stored exception and clear it from the datastore. 
""" + output = self.exception + self.exception = None + return output + + def reraise(self): + """If an exception is logged to the datastore, raise it. Otherwise pass. """ + if self.exception is not None: + raise self.exception + + def __init__(self, *args, **kwargs): + """ + See the parse_args method for details on how to initialize this object. + + Please make sure to add any new attributes to the products_to_save list. + """ + # these are data products that can be cached in the store + self._exposure = None # single image, entire focal plane + self._section = None # SensorSection + + self.upstream_provs = None # provenances to override the upstreams if no upstream objects exist + + # these all need to be added to the products_to_save list + self.image = None # single image from one sensor section + self.sources = None # extracted sources (a SourceList object, basically a catalog) + self.psf = None # psf determined from the extracted sources + self.wcs = None # astrometric solution + self.zp = None # photometric calibration + self.reference = None # the Reference object needed to make subtractions + self.sub_image = None # subtracted image + self.detections = None # a SourceList object for sources detected in the subtraction image + self.cutouts = None # cutouts around sources + self.measurements = None # photometry and other measurements for each source + self.objects = None # a list of Object associations of Measurements + + # these need to be added to the products_to_clear list + self.ref_image = None # to be used to make subtractions + self.sub_image = None # subtracted image + self.reference = None # the Reference object needed to make subtractions + self.exposure_id = None # use this and section_id to find the raw image + self.section_id = None # corresponds to SensorSection.identifier (*not* .id) + self.image_id = None # use this to specify an image already in the database + + self.warnings_list = None # will be replaced by a list of warning objects in top_level.Pipeline.run() + self.exception = None # the exception object (so we can re-raise it if needed) + self.runtimes = {} # for each process step, the total runtime in seconds + self.memory_usages = {} # for each process step, the peak memory usage in MB + self.products_committed = '' # a comma separated list of object names (e.g., "image, sources") saved to DB + self.report = None # keep a reference to the report object for this run + + # The database session parsed in parse_args; it could still be None even after parse_args + self.session = None + self.parse_args(*args, **kwargs) + + @property + def exposure( self ): + if self._exposure is None: + if self.exposure_id is not None: + self._exposure = self.get_raw_exposure( session=self.session ) + return self._exposure + + @exposure.setter + def exposure( self, value ): + self._exposure = value + self.exposure_id = value.id if value is not None else None + + @property + def section( self ): + if self._section is None: + if self.section_id is not None: + if self.exposure is not None: + self.exposure.instrument_object.fetch_sections() + self._section = self.exposure.instrument_object.get_section( self.section_id ) + return self._section + + @property + def ref_image( self ): + if self.reference is not None: + return self.reference.image + return None + + @ref_image.setter + def ref_image( self, value ): + if self.reference is None: + self.reference = Reference() + self.reference.image = value + + def __getattribute__(self, key): + # if this datastore has a pending error, will 
raise it as soon as any other data is used + if ( + key not in ['exception', 'read_exception', 'update_report', 'reraise', 'report'] and + not key.startswith('__') and hasattr(self, 'exception') and self.exception is not None + ): + SCLogger.warning('DataStore has a pending exception. Call read_exception() to get it, or reraise() to raise it.') + SCLogger.warning(f'Exception was triggered by trying to access attribute {key}.') + raise self.exception + + value = super().__getattribute__(key) + if key == 'image' and value is not None: + self.append_image_products(value) + + return value + def __setattr__(self, key, value): """ Check some of the inputs before saving them. @@ -338,12 +398,18 @@ def __setattr__(self, key, value): super().__setattr__(key, value) - def __getattribute__(self, key): - value = super().__getattribute__(key) - if key == 'image' and value is not None: - self.append_image_products(value) + def update_report(self, process_step, session=None): + """Update the report object with the latest results from a processing step that just finished. """ + self.report = self.report.scan_datastore(self, process_step=process_step, session=session) - return value + def finalize_report(self, session=None): + """Mark the report as successful and set the finish time.""" + self.report.success = True + self.report.finish_time = datetime.datetime.utcnow() + with SmartSession(session) as session: + new_report = session.merge(self.report) + session.commit() + self.report = new_report def get_inputs(self): """Get a string with the relevant inputs. """ @@ -408,9 +474,12 @@ def get_provenance(self, process, pars_dict, upstream_provs=None, session=None): # check if we can find the upstream provenances upstreams = [] - for name in UPSTREAM_NAMES[process]: + for name in UPSTREAM_STEPS[process]: # first try to load an upstream that was given explicitly: - obj = getattr(self, UPSTREAM_OBJECTS[name], None) + obj_names = PROCESS_PRODUCTS[name] + if isinstance(obj_names, str): + obj_names = [obj_names] + obj = getattr(self, obj_names[0], None) # only need one object to get the provenance if isinstance(obj, list): obj = obj[0] # for cutouts or measurements just use the first one if upstream_provs is not None and name in [p.process for p in upstream_provs]: @@ -431,7 +500,7 @@ def get_provenance(self, process, pars_dict, upstream_provs=None, session=None): upstreams.append(prov) - if len(upstreams) != len(UPSTREAM_NAMES[process]): + if len(upstreams) != len(UPSTREAM_STEPS[process]): raise ValueError(f'Could not find all upstream provenances for process {process}.') # we have a code version object and upstreams, we can make a provenance @@ -440,7 +509,7 @@ def get_provenance(self, process, pars_dict, upstream_provs=None, session=None): code_version=code_version, parameters=pars_dict, upstreams=upstreams, - is_testing="test_parameter" in pars_dict, # this is a flag for testing purposes + is_testing="test_parameter" in pars_dict, # this is a flag for testing purposes ) db_prov = session.scalars(sa.select(Provenance).where(Provenance.id == prov.id)).first() if db_prov is not None: # only merge if this provenance already exists @@ -905,7 +974,7 @@ def get_zp(self, provenance=None, session=None): return self.zp @classmethod - def _overlap_frac( cls, image, refim ): + def _overlap_frac(cls, image, refim): """Calculate the overlap fraction between image and refim. Parameters @@ -946,23 +1015,23 @@ def _overlap_frac( cls, image, refim ): """ - dimra = ( ( ( image.ra_corner_10 + image.ra_corner_11 ) / 2. 
- - ( image.ra_corner_00 + image.ra_corner_01 ) / 2. - ) / math.cos( image.dec * math.pi / 180. ) ) - dimdec = ( ( image.dec_corner_01 + image.dec_corner_11 ) / 2. - - ( image.dec_corner_00 + image.dec_corner_10 ) / 2. ) - r0 = max( refim.ra_corner_00, refim.ra_corner_01, - image.ra_corner_00, image.ra_corner_01 ) - r1 = min( refim.ra_corner_10, refim.ra_corner_10, - image.ra_corner_10, image.ra_corner_10 ) - d0 = max( refim.dec_corner_00, refim.dec_corner_10, - image.dec_corner_00, image.dec_corner_10 ) - d1 = min( refim.dec_corner_01, refim.dec_corner_11, - image.dec_corner_01, image.dec_corner_11 ) - dra = ( r1 - r0 ) / math.cos( ( d1 + d0 ) / 2. * math.pi / 180. ) + dimra = (((image.ra_corner_10 + image.ra_corner_11) / 2. - + (image.ra_corner_00 + image.ra_corner_01) / 2. + ) / math.cos(image.dec * math.pi / 180.)) + dimdec = ((image.dec_corner_01 + image.dec_corner_11) / 2. - + (image.dec_corner_00 + image.dec_corner_10) / 2.) + r0 = max(refim.ra_corner_00, refim.ra_corner_01, + image.ra_corner_00, image.ra_corner_01) + r1 = min(refim.ra_corner_10, refim.ra_corner_10, + image.ra_corner_10, image.ra_corner_10) + d0 = max(refim.dec_corner_00, refim.dec_corner_10, + image.dec_corner_00, image.dec_corner_10) + d1 = min(refim.dec_corner_01, refim.dec_corner_11, + image.dec_corner_01, image.dec_corner_11) + dra = (r1 - r0) / math.cos((d1 + d0) / 2. * math.pi / 180.) ddec = d1 - d0 - return ( dra * ddec ) / ( dimra * dimdec ) + return (dra * ddec) / (dimra * dimdec) def get_reference(self, minovfrac=0.85, must_match_instrument=True, must_match_filter=True, must_match_target=False, must_match_section=False, session=None ): @@ -976,25 +1045,20 @@ def get_reference(self, minovfrac=0.85, must_match_instrument=True, must_match_f (Warning: calculation implicitly assumes that images are aligned N/S and E/W.) Make this <= 0 to not consider overlap fraction when finding a reference. - must_match_instrument: bool, default True If True, only find a reference from the same instrument as that of the DataStore's image. - must_match_filter: bool, default True If True, only find a reference whose filter matches the DataStore's images' filter. - must_match_target: bool, default False If True, only find a reference if the "target" field of the reference image matches the "target" field of the image in the DataStore. - must_match_section: bool, default False - If True, only find a ference if the "section_id" field of + If True, only find a reference if the "section_id" field of the reference image matches that of the image in the Datastore. - session: sqlalchemy.orm.session.Session or SmartSession An optional session to use for the database query. If not given, will use the session stored inside the @@ -1006,10 +1070,6 @@ def get_reference(self, minovfrac=0.85, must_match_instrument=True, must_match_f ref: Image object The reference image for this image, or None if no reference is found. - - It will only return references whose validity date range - includes DataStore.image.observation_time. - If minovfrac is given, it will return the reference that has the highest ovfrac. (If, by unlikely chance, more than one have identical overlap fractions, an undeterministically chosen @@ -1019,8 +1079,8 @@ def get_reference(self, minovfrac=0.85, must_match_instrument=True, must_match_f that has an appreciable overlap with any possible image from that instrument. The software does not enforce this, however.) - If minovfrac is not given, it will return first reference found - that match the other criteria. 
Be careful with this. + If minovfrac is not given, it will return the first reference found + that matches the other criteria. Be careful with this. """ @@ -1472,14 +1532,8 @@ def save_and_commit(self, exists_ok=False, overwrite=True, no_archive=False, Note that this method calls session.commit() """ - # To avoid problems with lazy load failures if the things aren't - # all yet attached to a session: - if self.image is not None: - if self.psf is not None: - self.psf.image = self.image - # save to disk whatever is FileOnDiskMixin - for att in self.attributes_to_save: + for att in self.products_to_save: obj = getattr(self, att, None) if obj is None: continue @@ -1490,7 +1544,7 @@ def save_and_commit(self, exists_ok=False, overwrite=True, no_archive=False, continue SCLogger.debug( f'save_and_commit considering a {obj.__class__.__name__} with filepath ' - f'{obj.filepath if isinstance(obj,FileOnDiskMixin) else ""}' ) + f'{obj.filepath if isinstance(obj,FileOnDiskMixin) else ""}' ) if isinstance(obj, FileOnDiskMixin): mustsave = True @@ -1527,7 +1581,7 @@ def save_and_commit(self, exists_ok=False, overwrite=True, no_archive=False, else: SCLogger.debug( f'Not saving the {obj.__class__.__name__} because it already has ' - f'a md5sum in the database' ) + f'a md5sum in the database' ) # carefully merge all the objects including the products with SmartSession(session, self.session) as session: @@ -1545,12 +1599,23 @@ def save_and_commit(self, exists_ok=False, overwrite=True, no_archive=False, if self.image_id is None and self.image is not None: self.image_id = self.image.id + self.psf = self.image.psf + self.sources = self.image.sources + self.wcs = self.image.wcs + self.zp = self.image.zp + + session.commit() + self.products_committed = 'image, sources, psf, wcs, zp' + if self.sub_image is not None: self.sub_image.new_image = self.image # update with the now-merged image self.sub_image = self.sub_image.merge_all(session) # merges the upstream_images and downstream products self.sub_image.ref_image.id = self.sub_image.ref_image_id # just to make sure the ref has an ID for merging self.detections = self.sub_image.sources + session.commit() + self.products_committed += ', sub_image' + if self.detections is not None: if self.cutouts is not None: if self.measurements is not None: # keep track of which cutouts goes to which measurements @@ -1568,12 +1633,8 @@ def save_and_commit(self, exists_ok=False, overwrite=True, no_archive=False, self.measurements[i] = session.merge(self.measurements[i]) self.measurements[i].object.measurements.append(self.measurements[i]) - self.psf = self.image.psf - self.sources = self.image.sources - self.wcs = self.image.wcs - self.zp = self.image.zp - session.commit() + self.products_committed += ', detections, cutouts, measurements' def delete_everything(self, session=None, commit=True): """Delete everything associated with this sub-image. 
@@ -1649,6 +1710,11 @@ def delete_everything(self, session=None, commit=True): # verify that the objects are in fact deleted by deleting the image at the root of the datastore if self.image is not None and self.image.id is not None: session.execute(sa.delete(Image).where(Image.id == self.image.id)) + # also make sure aligned images are deleted from disk and archive + + if self.sub_image is not None and self.sub_image._aligned_images is not None: + for im in self.sub_image._aligned_images: # do not autoload, which happens if using aligned_images + im.remove_data_from_disk() # verify that no objects were accidentally added to the session's "new" set for obj in obj_list: @@ -1664,9 +1730,12 @@ def delete_everything(self, session=None, commit=True): finally: session.autoflush = autoflush_state - # Make sure all data products are None so that they aren't used again now that they're gone - # for att in self.attributes_to_save: - # setattr(self, att, None) - # - # for att in self.attributes_to_clear: - # setattr(self, att, None) + self.products_committed = '' # TODO: maybe not critical, but what happens if we fail to delete some of them? + + def clear_products(self): + """ Make sure all data products are None so that they aren't used again. """ + for att in self.products_to_save: + setattr(self, att, None) + + for att in self.products_to_clear: + setattr(self, att, None) diff --git a/pipeline/detection.py b/pipeline/detection.py index 6e00578e..09238193 100644 --- a/pipeline/detection.py +++ b/pipeline/detection.py @@ -1,7 +1,8 @@ +import os import pathlib import random import subprocess -from functools import partial +import time import numpy as np import numpy.lib.recfunctions as rfn @@ -15,6 +16,7 @@ from util.config import Config from util.logger import SCLogger +from util.util import parse_bool from pipeline.parameters import Parameters from pipeline.data_store import DataStore @@ -205,95 +207,131 @@ def run(self, *args, **kwargs): Returns a DataStore object with the products of the processing. 
""" self.has_recalculated = False - ds, session = DataStore.from_args(*args, **kwargs) - - # get the provenance for this step: - prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + try: # first make sure we get back a datastore, even an empty one + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) # try to find the sources/detections in memory or in the database: if self.pars.subtraction: - if ds.sub_image is None and ds.image is not None and ds.image.is_sub: - ds.sub_image = ds.image - ds.image = ds.sub_image.new_image # back-fill the image from the sub_image + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here - detections = ds.get_detections(prov, session=session) + self.pars.do_warning_exception_hangup_injection_here() - if detections is None: - self.has_recalculated = True + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + if ds.sub_image is None and ds.image is not None and ds.image.is_sub: + ds.sub_image = ds.image + ds.image = ds.sub_image.new_image # back-fill the image from the sub_image - # load the subtraction image from memory - # or load using the provenance given in the - # data store's upstream_provs, or just use - # the most recent provenance for "subtraction" - image = ds.get_subtraction(session=session) + detections = ds.get_detections(prov, session=session) - if image is None: - raise ValueError( - f'Cannot find a subtraction image corresponding to the datastore inputs: {ds.get_inputs()}' - ) + if detections is None: + self.has_recalculated = True - # TODO -- should probably pass **kwargs along to extract_sources - # in any event, need a way of passing parameters - # Question: why is it not enough to just define what you need in the Parameters object? - # Related to issue #50 - detections, _, _, _ = self.extract_sources( image ) + # load the subtraction image from memory + # or load using the provenance given in the + # data store's upstream_provs, or just use + # the most recent provenance for "subtraction" + image = ds.get_subtraction(session=session) - detections.image = image + if image is None: + raise ValueError( + f'Cannot find a subtraction image corresponding to the datastore inputs: {ds.get_inputs()}' + ) - if detections.provenance is None: - detections.provenance = prov - else: - if detections.provenance.id != prov.id: - raise ValueError('Provenance mismatch for detections and provenance!') + # TODO -- should probably pass **kwargs along to extract_sources + # in any event, need a way of passing parameters + # Question: why is it not enough to just define what you need in the Parameters object? 
+ # Related to issue #50 + detections, _, _, _ = self.extract_sources( image ) + + detections.image = image + + if detections.provenance is None: + detections.provenance = prov + else: + if detections.provenance.id != prov.id: + raise ValueError('Provenance mismatch for detections and provenance!') + + detections._upstream_bitflag |= ds.sub_image.bitflag + ds.sub_image.sources = detections + ds.detections = detections - detections._upstream_bitflag |= ds.sub_image.bitflag - ds.sub_image.sources = detections - ds.detections = detections + ds.runtimes['detection'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['detection'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: # make sure datastore is returned to be used in the next step + return ds else: # regular image - sources = ds.get_sources(prov, session=session) - psf = ds.get_psf(prov, session=session) - - if sources is None or psf is None: - # TODO: when only one of these is not found (which is a strange situation) - # we may end up with a new version of the existing object - # (if sources is missing, we will end up with one sources and two psfs). - # This could get us in trouble when saving (the object will have the same provenance) - # Right now this is taken care of using "safe_merge" but I don't know if that's the right thing. - self.has_recalculated = True - # use the latest image in the data store, - # or load using the provenance given in the - # data store's upstream_provs, or just use - # the most recent provenance for "preprocessing" - image = ds.get_image(session=session) - - if image is None: - raise ValueError(f'Cannot find an image corresponding to the datastore inputs: {ds.get_inputs()}') - - sources, psf, bkg, bkgsig = self.extract_sources( image ) - sources.image = image - if sources.provenance is None: - sources.provenance = prov - else: - if sources.provenance.id != prov.id: - raise ValueError('Provenance mismatch for sources and provenance!') + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + sources = ds.get_sources(prov, session=session) + psf = ds.get_psf(prov, session=session) + + if sources is None or psf is None: + # TODO: when only one of these is not found (which is a strange situation) + # we may end up with a new version of the existing object + # (if sources is missing, we will end up with one sources and two psfs). + # This could get us in trouble when saving (the object will have the same provenance) + # Right now this is taken care of using "safe_merge" but I don't know if that's the right thing. 
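+ # Descriptive note: extract_sources() is expected to return the source list, the PSF, and
+ # scalar background mean/RMS estimates, which are unpacked and stored on the image below.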
+ self.has_recalculated = True + # use the latest image in the data store, + # or load using the provenance given in the + # data store's upstream_provs, or just use + # the most recent provenance for "preprocessing" + image = ds.get_image(session=session) + + if image is None: + raise ValueError(f'Cannot find an image corresponding to the datastore inputs: {ds.get_inputs()}') + + sources, psf, bkg, bkgsig = self.extract_sources( image ) + sources.image = image + if sources.provenance is None: + sources.provenance = prov + else: + if sources.provenance.id != prov.id: + raise ValueError('Provenance mismatch for sources and provenance!') + + psf.image_id = image.id + if psf.provenance is None: + psf.provenance = prov + else: + if psf.provenance.id != prov.id: + raise ValueError('Provenance mismatch for pfs and provenance!') + + ds.sources = sources + ds.psf = psf + ds.image.fwhm_estimate = psf.fwhm_pixels # TODO: should we only write if the property is None? + if self.has_recalculated: + ds.image.bkg_mean_estimate = float( bkg ) + ds.image.bkg_rms_estimate = float( bkgsig ) + + ds.runtimes['extraction'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['extraction'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: # make sure datastore is returned to be used in the next step + return ds - psf.image_id = image.id - if psf.provenance is None: - psf.provenance = prov - else: - if psf.provenance.id != prov.id: - raise ValueError('Provenance mismatch for pfs and provenance!') - - ds.sources = sources - ds.psf = psf - ds.image.fwhm_estimate = psf.fwhm_pixels # TODO: should we only write if the property is None? - if self.has_recalculated: - ds.image.bkg_mean_estimate = float( bkg ) - ds.image.bkg_rms_estimate = float( bkgsig ) - - # make sure this is returned to be used in the next step - return ds def extract_sources(self, image): """Calls one of the extraction methods, based on self.pars.method. """ diff --git a/pipeline/measuring.py b/pipeline/measuring.py index 375dd9df..725631cd 100644 --- a/pipeline/measuring.py +++ b/pipeline/measuring.py @@ -1,19 +1,21 @@ +import os +import time import warnings import numpy as np from scipy import signal -from pipeline.parameters import Parameters -from pipeline.data_store import DataStore -from util.util import parse_session +from improc.photometry import iterative_cutouts_photometry +from improc.tools import make_gaussian from models.cutouts import Cutouts from models.measurements import Measurements -from models.objects import Object from models.enums_and_bitflags import BitFlagConverter -from improc.photometry import iterative_cutouts_photometry -from improc.tools import make_gaussian +from pipeline.parameters import Parameters +from pipeline.data_store import DataStore + +from util.util import parse_session, parse_bool class ParsMeasurer(Parameters): @@ -138,175 +140,184 @@ def run(self, *args, **kwargs): Returns a DataStore object with the products of the processing. 
""" - # most likely to get a Cutouts object or list of Cutouts - if isinstance(args[0], Cutouts): - new_args = [args[0]] # make it a list if we got a single Cutouts object for some reason - new_args += list(args[1:]) - args = tuple(new_args) - - if isinstance(args[0], list) and all([isinstance(c, Cutouts) for c in args[0]]): - args, kwargs, session = parse_session(*args, **kwargs) - ds = DataStore() - ds.cutouts = args[0] - ds.detections = ds.cutouts[0].sources - ds.sub_image = ds.detections.image - ds.image = ds.sub_image.new_image - else: - ds, session = DataStore.from_args(*args, **kwargs) self.has_recalculated = False - - # get the provenance for this step: - prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) - - # try to find some measurements in memory or in the database: - measurements_list = ds.get_measurements(prov, session=session) - - # note that if measurements_list is found, there will not be an all_measurements appended to datastore! - if measurements_list is None or len(measurements_list) == 0: # must create a new list of Measurements - self.has_recalculated = True - # use the latest source list in the data store, - # or load using the provenance given in the - # data store's upstream_provs, or just use - # the most recent provenance for "detection" - detections = ds.get_detections(session=session) - - if detections is None: - raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') - - cutouts = ds.get_cutouts(session=session) - - # prepare the filter bank for this batch of cutouts - if self._filter_psf_fwhm is None or self._filter_psf_fwhm != cutouts[0].sources.image.get_psf().fwhm_pixels: - self.make_filter_bank(cutouts[0].sub_data.shape[0], cutouts[0].sources.image.get_psf().fwhm_pixels) - - # go over each cutouts object and produce a measurements object - measurements_list = [] - for i, c in enumerate(cutouts): - m = Measurements(cutouts=c) - # make sure to remember which cutout belongs to this measurement, - # before either of them is in the DB and then use the cutouts_id instead - m._cutouts_list_index = i - - m.aper_radii = c.sources.image.new_image.zp.aper_cor_radii # zero point corrected aperture radii - - ignore_bits = 0 - for badness in self.pars.bad_pixel_exclude: - ignore_bits |= 2 ** BitFlagConverter.convert(badness) - - # remove the bad pixels that we want to ignore - # NOTE : this was throwing a RuntimeWarning, which was causing tests to - # fail; not sure why they didn't fail before. (New version of numpy? Dunno.) - # There were nans present; not sure whether those should be set to - # "bad pixel" or "out of bounds" (could be either), so choosing "bad pixel". - # Put in "casting='unsafe'" to take care of this. 
- # if np.any( np.isnan( c.sub_flags ) ): - # import pdb; pdb.set_trace() - # pass - # c.sub_flags[ np.isnan( c.sub_flags ) ] = BitFlagConverter.convert( "bad pixel" ) - flags = c.sub_flags.astype('uint16') & ~np.array(ignore_bits).astype('uint16') - - annulus_radii_pixels = self.pars.annulus_radii - if self.pars.annulus_units == 'fwhm': - annulus_radii_pixels = [rad * c.source.image.get_psf().fwhm_pixels for rad in annulus_radii_pixels] - - # TODO: consider if there are any additional parameters that photometry needs - output = iterative_cutouts_photometry( - c.sub_data, - c.sub_weight, - flags, - m.psf, - radii=m.aper_radii, - annulus=annulus_radii_pixels, - ) - - m.flux_psf = output['psf_flux'] - m.flux_psf_err = output['psf_err'] - m.area_psf = output['psf_area'] - m.flux_apertures = output['fluxes'] - m.flux_apertures_err = [np.sqrt(output['variance'] * a) for a in output['areas']] # TODO: add source noise?? - m.aper_radii = output['radii'] - m.area_apertures = output['areas'] - m.background = output['background'] - m.background_err = np.sqrt(output['variance']) - m.offset_x = output['offset_x'] - m.offset_y = output['offset_y'] - m.width = (output['major'] + output['minor']) / 2 - m.elongation = output['elongation'] - m.position_angle = output['angle'] - - if self.pars.chosen_aperture == 'auto': - raise NotImplementedError('Automatic aperture selection is not yet implemented.') - if self.pars.chosen_aperture == 'psf': - ap_index = -1 - elif isinstance(self.pars.chosen_aperture, int): - ap_index = self.pars.chosen_aperture - else: - raise ValueError( - f'Invalid value "{self.pars.chosen_aperture}" for chosen_aperture in the measuring parameters.' + try: # first make sure we get back a datastore, even an empty one + # most likely to get a Cutouts object or list of Cutouts + if isinstance(args[0], Cutouts): + new_args = [args[0]] # make it a list if we got a single Cutouts object for some reason + new_args += list(args[1:]) + args = tuple(new_args) + + if isinstance(args[0], list) and all([isinstance(c, Cutouts) for c in args[0]]): + args, kwargs, session = parse_session(*args, **kwargs) + ds = DataStore() + ds.cutouts = args[0] + ds.detections = ds.cutouts[0].sources + ds.sub_image = ds.detections.image + ds.image = ds.sub_image.new_image + else: + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) + + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + # get the provenance for this step: + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + + # try to find some measurements in memory or in the database: + measurements_list = ds.get_measurements(prov, session=session) + + # note that if measurements_list is found, there will not be an all_measurements appended to datastore! 
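+ # Overview of the recalculation branch below: build one Measurements object per cutout, run
+ # iterative cutout photometry on it, compute disqualifier scores, and keep only measurements
+ # that pass every threshold; the full list and the failures are still attached to the
+ # DataStore (all_measurements / failed_measurements) for debugging.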
+ if measurements_list is None or len(measurements_list) == 0: # must create a new list of Measurements + self.has_recalculated = True + # use the latest source list in the data store, + # or load using the provenance given in the + # data store's upstream_provs, or just use + # the most recent provenance for "detection" + detections = ds.get_detections(session=session) + + if detections is None: + raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') + + cutouts = ds.get_cutouts(session=session) + + # prepare the filter bank for this batch of cutouts + if self._filter_psf_fwhm is None or self._filter_psf_fwhm != cutouts[0].sources.image.get_psf().fwhm_pixels: + self.make_filter_bank(cutouts[0].sub_data.shape[0], cutouts[0].sources.image.get_psf().fwhm_pixels) + + # go over each cutouts object and produce a measurements object + measurements_list = [] + for i, c in enumerate(cutouts): + m = Measurements(cutouts=c) + # make sure to remember which cutout belongs to this measurement, + # before either of them is in the DB and then use the cutouts_id instead + m._cutouts_list_index = i + + m.aper_radii = c.sources.image.new_image.zp.aper_cor_radii # zero point corrected aperture radii + + ignore_bits = 0 + for badness in self.pars.bad_pixel_exclude: + ignore_bits |= 2 ** BitFlagConverter.convert(badness) + + # remove the bad pixels that we want to ignore + flags = c.sub_flags.astype('uint16') & ~np.array(ignore_bits).astype('uint16') + + annulus_radii_pixels = self.pars.annulus_radii + if self.pars.annulus_units == 'fwhm': + annulus_radii_pixels = [rad * c.source.image.get_psf().fwhm_pixels for rad in annulus_radii_pixels] + + # TODO: consider if there are any additional parameters that photometry needs + output = iterative_cutouts_photometry( + c.sub_data, + c.sub_weight, + flags, + m.psf, + radii=m.aper_radii, + annulus=annulus_radii_pixels, ) - m.best_aperture = ap_index - - m.provenance = prov - m.provenance_id = prov.id - - # Apply analytic cuts to each stamp image, to rule out artefacts. - m.disqualifier_scores = {} - if m.background != 0 and m.background_err > 0.1: - norm_data = (c.sub_nandata - m.background) / m.background_err # normalize - else: - warnings.warn(f'Background mean= {m.background}, std= {m.background_err}, normalization skipped!') - norm_data = c.sub_nandata # no good background measurement, do not normalize! 
- - positives = np.sum(norm_data > self.pars.outlier_sigma) - negatives = np.sum(norm_data < -self.pars.outlier_sigma) - if negatives == 0: - m.disqualifier_scores['negatives'] = 0.0 - elif positives == 0: - m.disqualifier_scores['negatives'] = 1.0 - else: - m.disqualifier_scores['negatives'] = negatives / positives - - x, y = np.meshgrid(range(c.sub_data.shape[0]), range(c.sub_data.shape[1])) - x = x - c.sub_data.shape[1] // 2 - m.offset_x - y = y - c.sub_data.shape[0] // 2 - m.offset_y - r = np.sqrt(x ** 2 + y ** 2) - bad_pixel_inclusion = r <= self.pars.bad_pixel_radius + 0.5 - m.disqualifier_scores['bad pixels'] = np.sum(flags[bad_pixel_inclusion] > 0) - - norm_data_no_nans = norm_data.copy() - norm_data_no_nans[np.isnan(norm_data)] = 0 - - filter_scores = [] - for template in self._filter_bank: - filter_scores.append(np.max(signal.correlate(abs(norm_data_no_nans), template, mode='same'))) - - m.disqualifier_scores['filter bank'] = np.argmax(filter_scores) - - offset = np.sqrt(m.offset_x ** 2 + m.offset_y ** 2) - m.disqualifier_scores['offsets'] = offset - - # TODO: add additional disqualifiers - - # make sure disqualifier scores don't have any numpy types - for k, v in m.disqualifier_scores.items(): - if isinstance(v, np.number): - m.disqualifier_scores[k] = v.item() - - measurements_list.append(m) - - saved_measurements = [] - for m in measurements_list: - if m.passes(): # all disqualifiers are below threshold - saved_measurements.append(m) - - # add the resulting measurements to the data store - ds.all_measurements = measurements_list # debugging only - ds.failed_measurements = [m for m in measurements_list if m not in saved_measurements] # debugging only - ds.measurements = saved_measurements # only keep measurements that passed the disqualifiers cuts. - ds.sub_image.measurements = saved_measurements - - # make sure this is returned to be used in the next step - return ds + + m.flux_psf = output['psf_flux'] + m.flux_psf_err = output['psf_err'] + m.area_psf = output['psf_area'] + m.flux_apertures = output['fluxes'] + m.flux_apertures_err = [np.sqrt(output['variance'] * a) for a in output['areas']] # TODO: add source noise?? + m.aper_radii = output['radii'] + m.area_apertures = output['areas'] + m.background = output['background'] + m.background_err = np.sqrt(output['variance']) + m.offset_x = output['offset_x'] + m.offset_y = output['offset_y'] + m.width = (output['major'] + output['minor']) / 2 + m.elongation = output['elongation'] + m.position_angle = output['angle'] + + if self.pars.chosen_aperture == 'auto': + raise NotImplementedError('Automatic aperture selection is not yet implemented.') + if self.pars.chosen_aperture == 'psf': + ap_index = -1 + elif isinstance(self.pars.chosen_aperture, int): + ap_index = self.pars.chosen_aperture + else: + raise ValueError( + f'Invalid value "{self.pars.chosen_aperture}" for chosen_aperture in the measuring parameters.' + ) + m.best_aperture = ap_index + + m.provenance = prov + m.provenance_id = prov.id + + # Apply analytic cuts to each stamp image, to rule out artefacts. + m.disqualifier_scores = {} + if m.background != 0 and m.background_err > 0.1: + norm_data = (c.sub_nandata - m.background) / m.background_err # normalize + else: + warnings.warn(f'Background mean= {m.background}, std= {m.background_err}, normalization skipped!') + norm_data = c.sub_nandata # no good background measurement, do not normalize! 
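+ # Illustrative example for the 'negatives' disqualifier computed next (numbers assumed for
+ # illustration only): with outlier_sigma = 3, a dipole-like residual having 15 pixels below
+ # -3 sigma and 20 pixels above +3 sigma scores negatives = 15 / 20 = 0.75, while a clean
+ # point source with no negative outliers scores 0.0.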
+ + positives = np.sum(norm_data > self.pars.outlier_sigma) + negatives = np.sum(norm_data < -self.pars.outlier_sigma) + if negatives == 0: + m.disqualifier_scores['negatives'] = 0.0 + elif positives == 0: + m.disqualifier_scores['negatives'] = 1.0 + else: + m.disqualifier_scores['negatives'] = negatives / positives + + x, y = np.meshgrid(range(c.sub_data.shape[0]), range(c.sub_data.shape[1])) + x = x - c.sub_data.shape[1] // 2 - m.offset_x + y = y - c.sub_data.shape[0] // 2 - m.offset_y + r = np.sqrt(x ** 2 + y ** 2) + bad_pixel_inclusion = r <= self.pars.bad_pixel_radius + 0.5 + m.disqualifier_scores['bad pixels'] = np.sum(flags[bad_pixel_inclusion] > 0) + + norm_data_no_nans = norm_data.copy() + norm_data_no_nans[np.isnan(norm_data)] = 0 + + filter_scores = [] + for template in self._filter_bank: + filter_scores.append(np.max(signal.correlate(abs(norm_data_no_nans), template, mode='same'))) + + m.disqualifier_scores['filter bank'] = np.argmax(filter_scores) + + offset = np.sqrt(m.offset_x ** 2 + m.offset_y ** 2) + m.disqualifier_scores['offsets'] = offset + + # TODO: add additional disqualifiers + + # make sure disqualifier scores don't have any numpy types + for k, v in m.disqualifier_scores.items(): + if isinstance(v, np.number): + m.disqualifier_scores[k] = v.item() + + measurements_list.append(m) + + saved_measurements = [] + for m in measurements_list: + if m.passes(): # all disqualifiers are below threshold + saved_measurements.append(m) + + # add the resulting measurements to the data store + ds.all_measurements = measurements_list # debugging only + ds.failed_measurements = [m for m in measurements_list if m not in saved_measurements] # debugging only + ds.measurements = saved_measurements # only keep measurements that passed the disqualifiers cuts. + ds.sub_image.measurements = saved_measurements + + ds.runtimes['measuring'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['measuring'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: # make sure datastore is returned to be used in the next step + return ds def make_filter_bank(self, imsize, psf_fwhm): """Make a filter bank matching the PSF width. diff --git a/pipeline/parameters.py b/pipeline/parameters.py index 934c08a7..3f03c6eb 100644 --- a/pipeline/parameters.py +++ b/pipeline/parameters.py @@ -1,4 +1,6 @@ import copy +import random +import warnings import sqlalchemy as sa @@ -122,6 +124,31 @@ def __init__(self, **kwargs): "verbose", 0, int, "Level of verbosity (0=quiet).", critical=False ) + self.inject_warnings = self.add_par( + "inject_warnings", + False, + (bool, float), + "Inject warnings into the pipeline. If given as float, use as probability to inject. ", + critical=False, + ) + + self.inject_exceptions = self.add_par( + "inject_exceptions", + False, + (bool, float), + "Inject exceptions into the pipeline. If given as float, use as probability to inject. ", + critical=False, + ) + + self.inject_hangups = self.add_par( + "inject_hangups", + False, + (bool, float), + "Inject hangups into the pipeline. If given as float, use as probability to inject. " + "Note that if an exception occurs from inject exceptions, the hangup would not be reached. 
", + critical=False, + ) + self._enforce_type_checks = self.add_par( "_enforce_type_checks", True, @@ -722,6 +749,27 @@ def get_provenance(self, code_version=None, prov_cache=None, session=None): return prov + def do_warning_exception_hangup_injection_here(self): + """When called, will check if any of the inject_ parameters are set to non-zero value. + If they are, will raise a warning, exception or hangup, depending on the value. + If any of the values is a float between 0 and 1, will compare it to a uniform random number, + and if that random number is lower than the value, will inject the warning, exception or hangup. + Note that if an exception occurs, it will override the hangup (it would not be reached). + + """ + if self.inject_warnings > 0: + if random.uniform(0, 1) <= self.inject_warnings: + warnings.warn(f"Warning injected by pipeline parameters in process '{self.get_process_name()}'.") + + if self.inject_exceptions > 0: + if random.uniform(0, 1) <= self.inject_exceptions: + raise RuntimeError(f"Exception injected by pipeline parameters in process '{self.get_process_name()}'.") + + if self.inject_hangups > 0: + if random.uniform(0, 1) <= self.inject_hangups: + while True: + pass + class ParsDemoSubclass(Parameters): def __init__(self, **kwargs): diff --git a/pipeline/photo_cal.py b/pipeline/photo_cal.py index b00af0a2..b9fa1ce7 100644 --- a/pipeline/photo_cal.py +++ b/pipeline/photo_cal.py @@ -1,15 +1,20 @@ +import os +import time import numpy as np import astropy.units as u -from pipeline.parameters import Parameters -from pipeline.data_store import DataStore import pipeline.catalog_tools from models.zero_point import ZeroPoint +import pipeline.catalog_tools +from pipeline.parameters import Parameters +from pipeline.data_store import DataStore + from util.exceptions import BadMatchException from util.logger import SCLogger +from util.util import parse_bool # TODO: Make max_catalog_mag and mag_range_catalog defaults be supplied # by the instrument, since there are going to be different sane defaults @@ -227,65 +232,85 @@ def run(self, *args, **kwargs): will add a ZeroPoint object to the .zp field of the DataStore. 
""" + self.has_recalculated = False - ds, session = DataStore.from_args(*args, **kwargs) - - # get the provenance for this step: - prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) - - # try to find the world coordinates in memory or in the database: - zp = ds.get_zp(prov, session=session) - - if zp is None: # must create a new ZeroPoint object - self.has_recalculated = True - if self.pars.cross_match_catalog != 'gaia_dr3': - raise NotImplementedError( f"Currently only know how to calibrate to gaia_dr3, not " - f"{self.pars.cross_match_catalog}" ) - - image = ds.get_image(session=session) - - sources = ds.get_sources(session=session) - if sources is None: - raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') - - wcs = ds.get_wcs( session=session ) - if wcs is None: - raise ValueError( f'Cannot find a wcs for image {image.filepath}' ) - - catname = self.pars.cross_match_catalog - fetch_func = getattr(pipeline.catalog_tools, f'fetch_{catname}_excerpt') - catexp = fetch_func( - image=image, - minstars=self.pars.min_catalog_stars, - maxmags=self.pars.max_catalog_mag, - magrange=self.pars.mag_range_catalog, - session=session, - ) - - # Save for testing/evaluation purposes - self.catexp = catexp - - zpval, dzpval = self._solve_zp( image, sources, wcs, catexp ) - - # Add the aperture corrections - apercors = [] - for i, rad in enumerate( sources.aper_rads ): - if i == sources.inf_aper_num: - apercors.append( 0. ) - else: - apercors.append( sources.calc_aper_cor( aper_num=i ) ) - - # Make the ZeroPoint object - ds.zp = ZeroPoint( sources=ds.sources, provenance=prov, zp=zpval, dzp=dzpval, - aper_cor_radii=sources.aper_rads, aper_cors=apercors ) - - if ds.zp._upstream_bitflag is None: - ds.zp._upstream_bitflag = 0 - ds.zp._upstream_bitflag |= sources.bitflag - ds.zp._upstream_bitflag |= wcs.bitflag - - ds.image.zero_point_estimate = ds.zp.zp # TODO: should we only write if the property is None? - # TODO: we should also add a limiting magnitude calculation here. 
- - # make sure the DataStore is returned to be used in the next step - return ds + try: # first make sure we get back a datastore, even an empty one + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) + + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + # get the provenance for this step: + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + + # try to find the world coordinates in memory or in the database: + zp = ds.get_zp(prov, session=session) + + if zp is None: # must create a new ZeroPoint object + self.has_recalculated = True + if self.pars.cross_match_catalog != 'gaia_dr3': + raise NotImplementedError( f"Currently only know how to calibrate to gaia_dr3, not " + f"{self.pars.cross_match_catalog}" ) + + image = ds.get_image(session=session) + + sources = ds.get_sources(session=session) + if sources is None: + raise ValueError(f'Cannot find a source list corresponding to the datastore inputs: {ds.get_inputs()}') + + wcs = ds.get_wcs( session=session ) + if wcs is None: + raise ValueError( f'Cannot find a wcs for image {image.filepath}' ) + + catname = self.pars.cross_match_catalog + fetch_func = getattr(pipeline.catalog_tools, f'fetch_{catname}_excerpt') + catexp = fetch_func( + image=image, + minstars=self.pars.min_catalog_stars, + maxmags=self.pars.max_catalog_mag, + magrange=self.pars.mag_range_catalog, + session=session, + ) + + # Save for testing/evaluation purposes + self.catexp = catexp + + zpval, dzpval = self._solve_zp( image, sources, wcs, catexp ) + + # Add the aperture corrections + apercors = [] + for i, rad in enumerate( sources.aper_rads ): + if i == sources.inf_aper_num: + apercors.append( 0. ) + else: + apercors.append( sources.calc_aper_cor( aper_num=i ) ) + + # Make the ZeroPoint object + ds.zp = ZeroPoint( sources=ds.sources, provenance=prov, zp=zpval, dzp=dzpval, + aper_cor_radii=sources.aper_rads, aper_cors=apercors ) + + if ds.zp._upstream_bitflag is None: + ds.zp._upstream_bitflag = 0 + ds.zp._upstream_bitflag |= sources.bitflag + ds.zp._upstream_bitflag |= wcs.bitflag + + ds.image.zero_point_estimate = ds.zp.zp # TODO: should we only write if the property is None? + # TODO: we should also add a limiting magnitude calculation here. 
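+ # Bookkeeping note: ds.runtimes['photo_cal'] records the wall-clock seconds for this step;
+ # when the SEECHANGE_TRACEMALLOC environment variable is set to a truthy value (running with
+ # SEECHANGE_TRACEMALLOC=1 is assumed to satisfy parse_bool), ds.memory_usages['photo_cal']
+ # records the peak traced memory, converted from bytes to MB.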
+ + ds.runtimes['photo_cal'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['photo_cal'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: + # make sure the DataStore is returned to be used in the next step + return ds diff --git a/pipeline/preprocessing.py b/pipeline/preprocessing.py index 6c165e53..accbddd7 100644 --- a/pipeline/preprocessing.py +++ b/pipeline/preprocessing.py @@ -1,4 +1,6 @@ +import os import pathlib +import time import numpy as np @@ -14,6 +16,8 @@ from util.config import Config from util.logger import SCLogger +from util.util import parse_bool + class ParsPreprocessor(Parameters): def __init__(self, **kwargs): @@ -96,224 +100,242 @@ def run( self, *args, **kwargs ): """ self.has_recalculated = False - ds, session = DataStore.from_args( *args, **kwargs ) + try: # first make sure we get back a datastore, even an empty one + ds, session = DataStore.from_args( *args, **kwargs ) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) # This is here just for testing purposes - self._ds = ds - - if ( ds.exposure is None ) or ( ds.section_id is None ): - raise RuntimeError( "Preprocessing requires an exposure and a sensor section" ) - - cfg = Config.get() - - if ( self.instrument is None ) or ( self.instrument.name != ds.exposure.instrument ): - self.instrument = ds.exposure.instrument_object - - # The only reason these are saved in self, rather than being - # local variables, is so that tests can probe them - self._calibset = None - self._flattype = None - self._stepstodo = None - - if 'calibset' in kwargs: - self._calibset = kwargs['calibset'] - elif 'calibratorset' in kwargs: - self._calibset = kwargs['calibrator_set'] - elif self.pars.calibset is not None: - self._calibset = self.pars.calibset - else: - self._calibset = cfg.value( f'{self.instrument.name}.calibratorset', - default=cfg.value( 'instrument_default.calibratorset' ) ) - - if 'flattype' in kwargs: - self._flattype = kwargs['flattype'] - elif self.pars.flattype is not None: - self._flattype = self.pars.flattype - else: - self._flattype = cfg.value( f'{self.instrument.name}.flattype', - default=cfg.value( 'instrument_default.flattype' ) ) - - if 'steps' in kwargs: - self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in kwargs['steps'] ] - elif self.pars.steps is not None: - self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in self.pars.steps ] - else: - self._stepstodo = self.instrument.preprocessing_steps - - # Get the calibrator files - - SCLogger.debug( "preprocessing: getting calibrator files" ) - preprocparam = self.instrument.preprocessing_calibrator_files( self._calibset, - self._flattype, - ds.section_id, - ds.exposure.filter_short, - ds.exposure.mjd, - session=session ) - SCLogger.debug( "preprocessing: got calibrator files" ) - - # get the provenance for this step, using the current parameters: - # Provenance includes not just self.pars.get_critical_pars(), - # but also the steps that were performed. Reason: we may well - # load non-flatfielded images in the database for purposes of - # collecting images used for later building flats. We will then - # flatfield those images. The two images in the database must have - # different provenances. 
- # We also include any overrides to calibrator files, as that indicates - # that something individual happened here that's different from - # normal processing of the image. - provdict = dict( self.pars.get_critical_pars() ) - provdict['preprocessing_steps' ] = self._stepstodo - prov = ds.get_provenance(self.pars.get_process_name(), provdict, session=session) - - # check if the image already exists in memory or in the database: - image = ds.get_image(prov, session=session) - - if image is None: # need to make new image - # get the single-chip image from the exposure - image = Image.from_exposure( ds.exposure, ds.section_id ) - - if image is None: - raise ValueError('Image cannot be None at this point!') - - if image.preproc_bitflag is None: - image.preproc_bitflag = 0 - - required_bitflag = 0 - for step in self._stepstodo: - required_bitflag |= string_to_bitflag( step, image_preprocessing_inverse ) - - if image._data is None: # in case we are skipping all preprocessing steps - image.data = image.raw_data - - if image.preproc_bitflag != required_bitflag: - self.has_recalculated = True - # Overscan is always first (as it reshapes the image) - if 'overscan' in self._stepstodo: - SCLogger.debug( 'preprocessing: overscan and trim' ) - image.data = self.instrument.overscan_and_trim( image ) - # Update the header ra/dec calculations now that we know the real width/height - image.set_corners_from_header_wcs( setradec=True ) - image.preproc_bitflag |= string_to_bitflag( 'overscan', image_preprocessing_inverse ) - - # Apply steps in the order expected by the instrument + self._ds = ds # TODO: is there a reason not to just use the output datastore? + + try: # catch any exceptions and save them in the datastore + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + if ( ds.exposure is None ) or ( ds.section_id is None ): + raise RuntimeError( "Preprocessing requires an exposure and a sensor section" ) + + self.pars.do_warning_exception_hangup_injection_here() + + cfg = Config.get() + + if ( self.instrument is None ) or ( self.instrument.name != ds.exposure.instrument ): + self.instrument = ds.exposure.instrument_object + + # The only reason these are saved in self, rather than being + # local variables, is so that tests can probe them + self._calibset = None + self._flattype = None + self._stepstodo = None + + if 'calibset' in kwargs: + self._calibset = kwargs['calibset'] + elif 'calibratorset' in kwargs: + self._calibset = kwargs['calibrator_set'] + elif self.pars.calibset is not None: + self._calibset = self.pars.calibset + else: + self._calibset = cfg.value( f'{self.instrument.name}.calibratorset', + default=cfg.value( 'instrument_default.calibratorset' ) ) + + if 'flattype' in kwargs: + self._flattype = kwargs['flattype'] + elif self.pars.flattype is not None: + self._flattype = self.pars.flattype + else: + self._flattype = cfg.value( f'{self.instrument.name}.flattype', + default=cfg.value( 'instrument_default.flattype' ) ) + + if 'steps' in kwargs: + self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in kwargs['steps'] ] + elif self.pars.steps is not None: + self._stepstodo = [ s for s in self.instrument.preprocessing_steps if s in self.pars.steps ] + else: + self._stepstodo = self.instrument.preprocessing_steps + + # Get the calibrator files + SCLogger.debug("preprocessing: getting calibrator files") + preprocparam = 
self.instrument.preprocessing_calibrator_files( self._calibset, + self._flattype, + ds.section_id, + ds.exposure.filter_short, + ds.exposure.mjd, + session=session ) + + SCLogger.debug("preprocessing: got calibrator files") + + # get the provenance for this step, using the current parameters: + # Provenance includes not just self.pars.get_critical_pars(), + # but also the steps that were performed. Reason: we may well + # load non-flatfielded images in the database for purposes of + # collecting images used for later building flats. We will then + # flatfield those images. The two images in the database must have + # different provenances. + # We also include any overrides to calibrator files, as that indicates + # that something individual happened here that's different from + # normal processing of the image. + provdict = dict( self.pars.get_critical_pars() ) + provdict['preprocessing_steps' ] = self._stepstodo + prov = ds.get_provenance(self.pars.get_process_name(), provdict, session=session) + + # check if the image already exists in memory or in the database: + image = ds.get_image(prov, session=session) + + if image is None: # need to make new image + # get the single-chip image from the exposure + image = Image.from_exposure( ds.exposure, ds.section_id ) + + if image is None: + raise ValueError('Image cannot be None at this point!') + + if image.preproc_bitflag is None: + image.preproc_bitflag = 0 + + required_bitflag = 0 for step in self._stepstodo: - if step == 'overscan': - continue - SCLogger.debug( f"preprocessing: {step}" ) - - stepfileid = None - # Acquire the calibration file - if f'{step}_fileid' in kwargs: - stepfileid = kwargs[ f'{step}_fileid' ] - elif f'{step}_fileid' in preprocparam: - stepfileid = preprocparam[ f'{step}_fileid' ] - else: - raise RuntimeError( f"Can't find calibration file for preprocessing step {step}" ) - - if stepfileid is None: - SCLogger.warning( f"Skipping step {step} for filter {ds.exposure.filter_short} " - f"because there is no calibration file (this may be normal)" ) - # should we also mark it as having "done" this step? otherwise it will not know it's done + required_bitflag |= string_to_bitflag( step, image_preprocessing_inverse ) + + if image._data is None: # in case we are skipping all preprocessing steps + image.data = image.raw_data + + if image.preproc_bitflag != required_bitflag: + self.has_recalculated = True + # Overscan is always first (as it reshapes the image) + if 'overscan' in self._stepstodo: + SCLogger.debug('preprocessing: overscan and trim') + image.data = self.instrument.overscan_and_trim( image ) + # Update the header ra/dec calculations now that we know the real width/height + image.set_corners_from_header_wcs(setradec=True) + image.preproc_bitflag |= string_to_bitflag( 'overscan', image_preprocessing_inverse ) + + # Apply steps in the order expected by the instrument + for step in self._stepstodo: + if step == 'overscan': + continue + SCLogger.debug(f"preprocessing: {step}") + stepfileid = None + # Acquire the calibration file + if f'{step}_fileid' in kwargs: + stepfileid = kwargs[ f'{step}_fileid' ] + elif f'{step}_fileid' in preprocparam: + stepfileid = preprocparam[ f'{step}_fileid' ] + else: + raise RuntimeError( f"Can't find calibration file for preprocessing step {step}" ) + + if stepfileid is None: + SCLogger.info(f"Skipping step {step} for filter {ds.exposure.filter_short} " + f"because there is no calibration file (this may be normal)") + # should we also mark it as having "done" this step? 
otherwise it will not know it's done + image.preproc_bitflag |= string_to_bitflag( step, image_preprocessing_inverse ) + continue + + # Use the cached calibrator file for this step if it's the right one; otherwise, grab it + if ( stepfileid in self.stepfilesids ) and ( self.stepfilesids[step] == stepfileid ): + calibfile = self.stepfiles[ calibfile ] + else: + + with SmartSession( session ) as session: + if step in [ 'zero', 'dark', 'flat', 'illumination', 'fringe' ]: + calibfile = session.get( Image, stepfileid ) + if calibfile is None: + raise RuntimeError( f"Unable to load image id {stepfileid} for preproc step {step}" ) + elif step == 'linearity': + calibfile = session.get( DataFile, stepfileid ) + if calibfile is None: + raise RuntimeError( f"Unable to load datafile id {stepfileid} for preproc step {step}" ) + else: + raise ValueError( f"Preprocessing step {step} has an unknown file type (image vs. datafile)" ) + self.stepfilesids[ step ] = stepfileid + self.stepfiles[ step ] = calibfile + if step in [ 'zero', 'dark' ]: + # Subtract zeros and darks + image.data -= calibfile.data + + elif step in [ 'flat', 'illumination' ]: + # Divide flats and illuminations + image.data /= calibfile.data + + elif step == 'fringe': + # TODO FRINGE CORRECTION + SCLogger.info( "Fringe correction not implemented" ) + + elif step == 'linearity': + # Linearity is instrument-specific + self.instrument.linearity_correct( image, linearitydata=calibfile ) + + else: + # TODO: Replace this with a call into an instrument method? + # In that case, the logic above about acquiring step files + # will need to be updated. + raise ValueError( f"Unknown preprocessing step {step}" ) + image.preproc_bitflag |= string_to_bitflag( step, image_preprocessing_inverse ) - continue - - # Use the cached calibrator file for this step if it's the right one; otherwise, grab it - if ( stepfileid in self.stepfilesids ) and ( self.stepfilesids[step] == stepfileid ): - calibfile = self.stepfiles[ calibfile ] - else: - - with SmartSession( session ) as session: - if step in [ 'zero', 'dark', 'flat', 'illumination', 'fringe' ]: - calibfile = session.get( Image, stepfileid ) - if calibfile is None: - raise RuntimeError( f"Unable to load image id {stepfileid} for preproc step {step}" ) - elif step == 'linearity': - calibfile = session.get( DataFile, stepfileid ) - if calibfile is None: - raise RuntimeError( f"Unable to load datafile id {stepfileid} for preproc step {step}" ) - else: - raise ValueError( f"Preprocessing step {step} has an unknown file type (image vs. datafile)" ) - self.stepfilesids[ step ] = stepfileid - self.stepfiles[ step ] = calibfile - if step in [ 'zero', 'dark' ]: - # Subtract zeros and darks - image.data -= calibfile.data - - elif step in [ 'flat', 'illumination' ]: - # Divide flats and illuminations - image.data /= calibfile.data - - elif step == 'fringe': - # TODO FRINGE CORRECTION - SCLogger.warning( "Fringe correction not implemented" ) - - elif step == 'linearity': - # Linearity is instrument-specific - self.instrument.linearity_correct( image, linearitydata=calibfile ) - - else: - # TODO: Replace this with a call into an instrument method? - # In that case, the logic above about acquiring step files - # will need to be updated. 
- raise ValueError( f"Unknown preprocessing step {step}" ) - - image.preproc_bitflag |= string_to_bitflag( step, image_preprocessing_inverse ) - - # Get the Instrument standard bad pixel mask for this image - if image._flags is None or image._weight is None: - image._flags = self.instrument.get_standard_flags_image( ds.section_id ) - - # Estimate the background rms with sep - boxsize = self.instrument.background_box_size - filtsize = self.instrument.background_filt_size - SCLogger.debug( "Subtracting sky and estimating sky RMS" ) - # Dysfunctionality alert: sep requires a *float* image for the mask - # IEEE 32-bit floats have 23 bits in the mantissa, so they should - # be able to precisely represent a 16-bit integer mask image - # In any event, sep.Background uses >0 as "bad" - fmask = np.array( image._flags, dtype=np.float32 ) - backgrounder = sep.Background( image.data, mask=fmask, - bw=boxsize, bh=boxsize, fw=filtsize, fh=filtsize ) - fmask = None - rms = backgrounder.rms() - sky = backgrounder.back() - subim = image.data - sky - SCLogger.debug( "Building weight image and augmenting flags image" ) - - wbad = np.where( rms <= 0 ) - wgood = np.where( rms > 0 ) - rms = rms ** 2 - subim[ subim < 0 ] = 0 - gain = self.instrument.average_gain( image ) - gain = gain if gain is not None else 1. - # Shot noise from image above background - rms += subim / gain - image._weight = np.zeros( image.data.shape, dtype=np.float32 ) - image._weight[ wgood ] = 1. / rms[ wgood ] - image._flags[ wbad ] |= string_to_bitflag( "zero weight", flag_image_bits_inverse ) - # Now make the weight zero on the bad pixels too - image._weight[ image._flags != 0 ] = 0. - # Figure out saturated pixels - satlevel = self.instrument.average_saturation_limit( image ) - if satlevel is not None: - wsat = image.data >= satlevel - image._flags[ wsat ] |= string_to_bitflag( "saturated", flag_image_bits_inverse ) - image._weight[ wsat ] = 0. 
- - if image.provenance is None: - image.provenance = prov - else: - if image.provenance.id != prov.id: - # Logically, this should never happen - raise ValueError('Provenance mismatch for image and provenance!') - - image.filepath = image.invent_filepath() - SCLogger.debug( f"Done with {pathlib.Path(image.filepath).name}" ) - - if image._upstream_bitflag is None: - image._upstream_bitflag = 0 - image._upstream_bitflag |= ds.exposure.bitflag - - ds.image = image - - return ds + + # Get the Instrument standard bad pixel mask for this image + if image._flags is None or image._weight is None: + image._flags = self.instrument.get_standard_flags_image( ds.section_id ) + + # Estimate the background rms with sep + boxsize = self.instrument.background_box_size + filtsize = self.instrument.background_filt_size + SCLogger.debug( "Subtracting sky and estimating sky RMS" ) + # Dysfunctionality alert: sep requires a *float* image for the mask + # IEEE 32-bit floats have 23 bits in the mantissa, so they should + # be able to precisely represent a 16-bit integer mask image + # In any event, sep.Background uses >0 as "bad" + fmask = np.array( image._flags, dtype=np.float32 ) + backgrounder = sep.Background( image.data, mask=fmask, + bw=boxsize, bh=boxsize, fw=filtsize, fh=filtsize ) + fmask = None + rms = backgrounder.rms() + sky = backgrounder.back() + subim = image.data - sky + SCLogger.debug( "Building weight image and augmenting flags image" ) + + wbad = np.where( rms <= 0 ) + wgood = np.where( rms > 0 ) + rms = rms ** 2 + subim[ subim < 0 ] = 0 + gain = self.instrument.average_gain( image ) + gain = gain if gain is not None else 1. + # Shot noise from image above background + rms += subim / gain + image._weight = np.zeros( image.data.shape, dtype=np.float32 ) + image._weight[ wgood ] = 1. / rms[ wgood ] + image._flags[ wbad ] |= string_to_bitflag( "zero weight", flag_image_bits_inverse ) + # Now make the weight zero on the bad pixels too + image._weight[ image._flags != 0 ] = 0. + # Figure out saturated pixels + satlevel = self.instrument.average_saturation_limit( image ) + if satlevel is not None: + wsat = image.data >= satlevel + image._flags[ wsat ] |= string_to_bitflag( "saturated", flag_image_bits_inverse ) + image._weight[ wsat ] = 0. 
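+ # Summary of the weight model built above: good pixels get inverse-variance weights,
+ # 1 / (sky_rms**2 + max(data - sky, 0) / gain), i.e. sky noise plus shot noise from flux above
+ # background; pixels with non-positive sky RMS, any nonzero flag value, or data at or above
+ # the saturation limit are flagged and end up with zero weight.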
+ + if image.provenance is None: + image.provenance = prov + else: + if image.provenance.id != prov.id: + # Logically, this should never happen + raise ValueError('Provenance mismatch for image and provenance!') + + image.filepath = image.invent_filepath() + SCLogger.debug( f"Done with {pathlib.Path(image.filepath).name}" ) + + if image._upstream_bitflag is None: + image._upstream_bitflag = 0 + image._upstream_bitflag |= ds.exposure.bitflag + + ds.image = image + + ds.runtimes['preprocessing'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['preprocessing'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: + return ds diff --git a/pipeline/subtraction.py b/pipeline/subtraction.py index bf10d321..54edeff6 100644 --- a/pipeline/subtraction.py +++ b/pipeline/subtraction.py @@ -1,3 +1,4 @@ +import os import time import numpy as np @@ -5,15 +6,15 @@ from pipeline.data_store import DataStore from models.base import SmartSession -from models.provenance import Provenance from models.image import Image from improc.zogy import zogy_subtract, zogy_add_weights_flags from improc.inpainting import Inpainter - from improc.alignment import ImageAligner from improc.tools import sigma_clipping +from util.util import parse_bool + class ParsSubtractor(Parameters): def __init__(self, **kwargs): @@ -235,96 +236,114 @@ def run(self, *args, **kwargs): Returns a DataStore object with the products of the processing. """ self.has_recalculated = False - ds, session = DataStore.from_args(*args, **kwargs) - - # get the provenance for this step: - with SmartSession(session) as session: - prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) - - # look for a reference that has to do with the current image - ref = ds.get_reference(session=session) - if ref is None: - raise ValueError( - f'Cannot find a reference image corresponding to the datastore inputs: {ds.get_inputs()}' - ) - - # manually replace the "reference" provenances with the reference image and its products - upstreams = prov.upstreams - upstreams = [x for x in upstreams if x.process != 'reference'] # remove reference provenance - upstreams.append(ref.image.provenance) - upstreams.append(ref.sources.provenance) - upstreams.append(ref.psf.provenance) - upstreams.append(ref.wcs.provenance) - upstreams.append(ref.zp.provenance) - prov.upstreams = upstreams # must re-assign to make sure list items are unique - prov.update_id() - prov = session.merge(prov) - sub_image = ds.get_subtraction(prov, session=session) - - if sub_image is None: - self.has_recalculated = True - # use the latest image in the data store, - # or load using the provenance given in the - # data store's upstream_provs, or just use - # the most recent provenance for "preprocessing" - image = ds.get_image(session=session) - if image is None: - raise ValueError(f'Cannot find an image corresponding to the datastore inputs: {ds.get_inputs()}') - - sub_image = Image.from_ref_and_new(ref.image, image) - sub_image.is_sub = True - sub_image.provenance = prov - sub_image.provenance_id = prov.id - sub_image.coordinates_to_alignment_target() # make sure the WCS is aligned to the correct image - - # make sure to grab the correct aligned images - new_image = [im for im in sub_image.aligned_images if im.mjd == sub_image.new_image.mjd] - if len(new_image) != 1: - raise ValueError('Cannot find the new image in the 
aligned images') - new_image = new_image[0] - - ref_image = [im for im in sub_image.aligned_images if im.mjd == sub_image.ref_image.mjd] - if len(ref_image) != 1: - raise ValueError('Cannot find the reference image in the aligned images') - ref_image = ref_image[0] - - if self.pars.method == 'naive': - outdict = self._subtract_naive(new_image, ref_image) - elif self.pars.method == 'hotpants': - outdict = self._subtract_hotpants(new_image, ref_image) - elif self.pars.method == 'zogy': - outdict = self._subtract_zogy(new_image, ref_image) - else: - raise ValueError(f'Unknown subtraction method {self.pars.method}') - - sub_image.data = outdict['outim'] - sub_image.weight = outdict['outwt'] - sub_image.flags = outdict['outfl'] - if 'score' in outdict: - sub_image.score = outdict['score'] - if 'alpha' in outdict: - sub_image.psfflux = outdict['alpha'] - if 'alpha_err' in outdict: - sub_image.psffluxerr = outdict['alpha_err'] - if 'psf' in outdict: - # TODO: clip the array to be a cutout around the PSF, right now it is same shape as image! - sub_image.zogy_psf = outdict['psf'] # not saved but can be useful for testing / source detection - if 'alpha' in outdict and 'alpha_err' in outdict: - sub_image.psfflux = outdict['alpha'] - sub_image.psffluxerr = outdict['alpha_err'] - - sub_image.subtraction_output = outdict # save the full output for debugging - - if sub_image._upstream_bitflag is None: - sub_image._upstream_bitflag = 0 - sub_image._upstream_bitflag |= ds.sources.bitflag - sub_image._upstream_bitflag |= ds.image.bitflag - sub_image._upstream_bitflag |= ds.wcs.bitflag - sub_image._upstream_bitflag |= ds.zp.bitflag - if 'ref_image' in locals(): - sub_image._upstream_bitflag |= ref_image.bitflag - - ds.sub_image = sub_image - - # make sure this is returned to be used in the next step - return ds + try: # first make sure we get back a datastore, even an empty one + ds, session = DataStore.from_args(*args, **kwargs) + except Exception as e: + return DataStore.catch_failure_to_parse(e, *args) + + try: + t_start = time.perf_counter() + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + tracemalloc.reset_peak() # start accounting for the peak memory usage from here + + self.pars.do_warning_exception_hangup_injection_here() + + # get the provenance for this step: + with SmartSession(session) as session: + prov = ds.get_provenance(self.pars.get_process_name(), self.pars.get_critical_pars(), session=session) + + # look for a reference that has to do with the current image + ref = ds.get_reference(session=session) + if ref is None: + raise ValueError( + f'Cannot find a reference image corresponding to the datastore inputs: {ds.get_inputs()}' + ) + + # manually replace the "reference" provenances with the reference image and its products + upstreams = prov.upstreams + upstreams = [x for x in upstreams if x.process != 'reference'] # remove reference provenance + upstreams.append(ref.image.provenance) + upstreams.append(ref.sources.provenance) + upstreams.append(ref.psf.provenance) + upstreams.append(ref.wcs.provenance) + upstreams.append(ref.zp.provenance) + prov.upstreams = upstreams # must re-assign to make sure list items are unique + prov.update_id() + prov = session.merge(prov) + sub_image = ds.get_subtraction(prov, session=session) + + if sub_image is None: + self.has_recalculated = True + # use the latest image in the data store, + # or load using the provenance given in the + # data store's upstream_provs, or just use + # the most recent provenance for "preprocessing" + 
image = ds.get_image(session=session) + if image is None: + raise ValueError(f'Cannot find an image corresponding to the datastore inputs: {ds.get_inputs()}') + + sub_image = Image.from_ref_and_new(ref.image, image) + sub_image.is_sub = True + sub_image.provenance = prov + sub_image.provenance_id = prov.id + sub_image.coordinates_to_alignment_target() # make sure the WCS is aligned to the correct image + + # make sure to grab the correct aligned images + new_image = [im for im in sub_image.aligned_images if im.mjd == sub_image.new_image.mjd] + if len(new_image) != 1: + raise ValueError('Cannot find the new image in the aligned images') + new_image = new_image[0] + + ref_image = [im for im in sub_image.aligned_images if im.mjd == sub_image.ref_image.mjd] + if len(ref_image) != 1: + raise ValueError('Cannot find the reference image in the aligned images') + ref_image = ref_image[0] + + if self.pars.method == 'naive': + outdict = self._subtract_naive(new_image, ref_image) + elif self.pars.method == 'hotpants': + outdict = self._subtract_hotpants(new_image, ref_image) + elif self.pars.method == 'zogy': + outdict = self._subtract_zogy(new_image, ref_image) + else: + raise ValueError(f'Unknown subtraction method {self.pars.method}') + + sub_image.data = outdict['outim'] + sub_image.weight = outdict['outwt'] + sub_image.flags = outdict['outfl'] + if 'score' in outdict: + sub_image.score = outdict['score'] + if 'alpha' in outdict: + sub_image.psfflux = outdict['alpha'] + if 'alpha_err' in outdict: + sub_image.psffluxerr = outdict['alpha_err'] + if 'psf' in outdict: + # TODO: clip the array to be a cutout around the PSF, right now it is same shape as image! + sub_image.zogy_psf = outdict['psf'] # not saved but can be useful for testing / source detection + if 'alpha' in outdict and 'alpha_err' in outdict: + sub_image.psfflux = outdict['alpha'] + sub_image.psffluxerr = outdict['alpha_err'] + + sub_image.subtraction_output = outdict # save the full output for debugging + + if sub_image._upstream_bitflag is None: + sub_image._upstream_bitflag = 0 + sub_image._upstream_bitflag |= ds.sources.bitflag + sub_image._upstream_bitflag |= ds.image.bitflag + sub_image._upstream_bitflag |= ds.wcs.bitflag + sub_image._upstream_bitflag |= ds.zp.bitflag + if 'ref_image' in locals(): + sub_image._upstream_bitflag |= ref_image.bitflag + + ds.sub_image = sub_image + + ds.runtimes['subtraction'] = time.perf_counter() - t_start + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + import tracemalloc + ds.memory_usages['subtraction'] = tracemalloc.get_traced_memory()[1] / 1024 ** 2 # in MB + + except Exception as e: + ds.catch_exception(e) + finally: # make sure datastore is returned to be used in the next step + return ds diff --git a/pipeline/top_level.py b/pipeline/top_level.py index c6155c8c..b23b70c6 100644 --- a/pipeline/top_level.py +++ b/pipeline/top_level.py @@ -1,5 +1,11 @@ +import os +import datetime +import warnings + +import sqlalchemy as sa + from pipeline.parameters import Parameters -from pipeline.data_store import DataStore +from pipeline.data_store import DataStore, UPSTREAM_STEPS from pipeline.preprocessing import Preprocessor from pipeline.astro_cal import AstroCalibrator from pipeline.photo_cal import PhotCalibrator @@ -8,11 +14,36 @@ from pipeline.cutting import Cutter from pipeline.measuring import Measurer +from models.base import SmartSession +from models.provenance import Provenance +from models.exposure import Exposure +from models.report import Report + from util.config import Config from 
util.logger import SCLogger - -# should this come from db.py instead? -from models.base import SmartSession +from util.util import parse_bool + +# describes the pipeline objects that are used to produce each step of the pipeline +# if multiple objects are used in one step, replace the string with a sub-dictionary, +# where the sub-dictionary keys are the keywords inside the expected critical parameters +# that come from all the different objects. +PROCESS_OBJECTS = { + 'preprocessing': 'preprocessor', + 'extraction': 'extractor', # the same object also makes the PSF (and background?) + # TODO: when joining the astro/photo cal into extraction, use this format: + # 'extraction': { + # 'sources': 'extractor', + # 'astro_cal': 'astro_cal', + # 'photo_cal': 'photo_cal', + # } + 'astro_cal': 'astro_cal', + 'photo_cal': 'photo_cal', + 'subtraction': 'subtractor', + 'detection': 'detector', + 'cutting': 'cutter', + 'measuring': 'measurer', + # TODO: add one more for R/B deep learning scores +} # put all the top-level pipeline parameters in the init of this class: @@ -82,61 +113,180 @@ def __init__(self, **kwargs): self.cutter = Cutter(**cutting_config) # measure photometry, analytical cuts, and deep learning models on the Cutouts: - measurement_config = self.config.value('measuring', {}) - measurement_config.update(kwargs.get('measuring', {})) - self.pars.add_defaults_to_dict(measurement_config) - self.measurer = Measurer(**measurement_config) + measuring_config = self.config.value('measuring', {}) + measuring_config.update(kwargs.get('measuring', {})) + self.pars.add_defaults_to_dict(measuring_config) + self.measurer = Measurer(**measuring_config) + + def override_parameters(self, **kwargs): + """Override some of the parameters for this object and its sub-objects, using Parameters.override(). """ + for key, value in kwargs.items(): + if key in PROCESS_OBJECTS: + getattr(self, PROCESS_OBJECTS[key]).pars.override(value) + else: + self.pars.override({key: value}) + + def augment_parameters(self, **kwargs): + """Add some parameters to this object and its sub-objects, using Parameters.augment(). """ + for key, value in kwargs.items(): + if key in PROCESS_OBJECTS: + getattr(self, PROCESS_OBJECTS[key]).pars.augment(value) + else: + self.pars.augment({key: value}) + + def setup_datastore(self, *args, **kwargs): + """Initialize a datastore, including an exposure and a report, to use in the pipeline run. + + Will raise an exception if there is no valid Exposure, + if there's no reference available, or if the report cannot + be posted to the database. + + After these objects are instantiated, the pipeline will proceed + and record any exceptions into the report object before raising them. + + Parameters + ---------- + Inputs should include the exposure and section_id, or a datastore + with these things already loaded. If a session is passed in as + one of the arguments, it will be used as a single session for + running the entire pipeline (instead of opening and closing + sessions where needed). + + Returns + ------- + ds : DataStore + The DataStore object that was created or loaded. + session: sqlalchemy.orm.session.Session + An optional session. 
If not given, this will be None + """ + ds, session = DataStore.from_args(*args, **kwargs) + + if ds.exposure is None: + raise RuntimeError('Not sure if there is a way to run this pipeline method without an exposure!') + + try: # must make sure the exposure is on the DB + ds.exposure = ds.exposure.merge_concurrent(session=session) + except Exception as e: + raise RuntimeError('Failed to merge the exposure into the session!') from e + + try: # create (and commit, if not existing) all provenances for the products + with SmartSession(session) as dbsession: + provs = self.make_provenance_tree(ds.exposure, session=dbsession, commit=True) + except Exception as e: + raise RuntimeError('Failed to create the provenance tree!') from e + + try: # must make sure the report is on the DB + report = Report(exposure=ds.exposure, section_id=ds.section_id) + report.start_time = datetime.datetime.utcnow() + prov = Provenance( + process='report', + code_version=ds.exposure.provenance.code_version, + parameters={}, + upstreams=[provs['measuring']], + is_testing=ds.exposure.provenance.is_testing, + ) + report.provenance = prov + with SmartSession(session) as dbsession: + # check how many times this report was generated before + prev_rep = dbsession.scalars( + sa.select(Report).where( + Report.exposure_id == ds.exposure.id, + Report.section_id == ds.section_id, + Report.provenance_id == prov.id, + ) + ).all() + report.num_prev_reports = len(prev_rep) + report = dbsession.merge(report) + dbsession.commit() + + if report.exposure_id is None: + raise RuntimeError('Report did not get a valid exposure_id!') + except Exception as e: + raise RuntimeError('Failed to create or merge a report for the exposure!') from e + + ds.report = report + + return ds, session def run(self, *args, **kwargs): """ Run the entire pipeline on a specific CCD in a specific exposure. Will open a database session and grab any existing data, and calculate and commit any new data that did not exist. - """ - - ds, session = DataStore.from_args(*args, **kwargs) - if ( ds.image is not None ): - SCLogger.info( f"Pipeline starting for image {ds.image.id} ({ds.image.filepath})" ) - elif ( ds.exposure is not None ): - SCLogger.info( f"Pipeline starting for exposure {ds.exposure.id} ({ds.exposure}) section {ds.section_id}" ) + Parameters + ---------- + Inputs should include the exposure and section_id, or a datastore + with these things already loaded. If a session is passed in as + one of the arguments, it will be used as a single session for + running the entire pipeline (instead of opening and closing + sessions where needed). + + Returns + ------- + ds : DataStore + The DataStore object that includes all the data products. 
+ """ + ds, session = self.setup_datastore(*args, **kwargs) + if ds.image is not None: + SCLogger.info(f"Pipeline starting for image {ds.image.id} ({ds.image.filepath})") + elif ds.exposure is not None: + SCLogger.info(f"Pipeline starting for exposure {ds.exposure.id} ({ds.exposure}) section {ds.section_id}") else: - SCLogger.info( f"Pipeline starting with args {args}, kwargs {kwargs}" ) - - # run dark/flat and sky subtraction tools, save the results as Image objects to DB and disk - SCLogger.info( f"preprocessor" ) - ds = self.preprocessor.run(ds, session) - SCLogger.info( f"preprocessing complete: image id = {ds.image.id}, filepath={ds.image.filepath}" ) - - # extract sources and make a SourceList from the regular image - SCLogger.info( f"extractor for image id {ds.image.id}" ) - ds = self.extractor.run(ds, session) - - # find astrometric solution, save WCS into Image object and FITS headers - SCLogger.info( f"astro_cal for image id {ds.image.id}" ) - ds = self.astro_cal.run(ds, session) - - # cross-match against photometric catalogs and get zero point, save into Image object and FITS headers - SCLogger.info( f"photo_cal for image id {ds.image.id}" ) - ds = self.photo_cal.run(ds, session) - - # fetch reference images and subtract them, save SubtractedImage objects to DB and disk - SCLogger.info( f"subtractor for image id {ds.image.id}" ) - ds = self.subtractor.run(ds, session) - - # find sources, generate a source list for detections - SCLogger.info( f"detector for image id {ds.image.id}" ) - ds = self.detector.run(ds, session) - - # make cutouts of all the sources in the "detections" source list - SCLogger.info( f"cutter for image id {ds.image.id}" ) - ds = self.cutter.run(ds, session) - - # extract photometry, analytical cuts, and deep learning models on the Cutouts: - SCLogger.info( f"measurer for image id {ds.image.id}" ) - ds = self.measurer.run(ds, session) - - return ds + SCLogger.info(f"Pipeline starting with args {args}, kwargs {kwargs}") + + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + # ref: https://docs.python.org/3/library/tracemalloc.html#record-the-current-and-peak-size-of-all-traced-memory-blocks + import tracemalloc + tracemalloc.start() # trace the size of memory that is being used + + with warnings.catch_warnings(record=True) as w: + ds.warnings_list = w # appends warning to this list as it goes along + # run dark/flat preprocessing, cut out a specific section of the sensor + # TODO: save the results as Image objects to DB and disk? Or save at the end? 
+ SCLogger.info(f"preprocessor") + ds = self.preprocessor.run(ds, session) + ds.update_report('preprocessing', session) + SCLogger.info(f"preprocessing complete: image id = {ds.image.id}, filepath={ds.image.filepath}") + + # extract sources and make a SourceList and PSF from the image + SCLogger.info(f"extractor for image id {ds.image.id}") + ds = self.extractor.run(ds, session) + ds.update_report('extraction', session) + + # find astrometric solution, save WCS into Image object and FITS headers + SCLogger.info(f"astro_cal for image id {ds.image.id}") + ds = self.astro_cal.run(ds, session) + ds.update_report('astro_cal', session) + + # cross-match against photometric catalogs and get zero point, save into Image object and FITS headers + SCLogger.info(f"photo_cal for image id {ds.image.id}") + ds = self.photo_cal.run(ds, session) + ds.update_report('photo_cal', session) + + # fetch reference images and subtract them, save SubtractedImage objects to DB and disk + SCLogger.info(f"subtractor for image id {ds.image.id}") + ds = self.subtractor.run(ds, session) + ds.update_report('subtraction', session) + + # find sources, generate a source list for detections + SCLogger.info(f"detector for image id {ds.image.id}") + ds = self.detector.run(ds, session) + ds.update_report('detection', session) + + # make cutouts of all the sources in the "detections" source list + SCLogger.info(f"cutter for image id {ds.image.id}") + ds = self.cutter.run(ds, session) + ds.update_report('cutting', session) + + # extract photometry, analytical cuts, and deep learning models on the Cutouts: + SCLogger.info(f"measurer for image id {ds.image.id}") + ds = self.measurer.run(ds, session) + ds.update_report('measuring', session) + + ds.finalize_report(session) + + return ds def run_with_session(self): """ @@ -147,3 +297,98 @@ def run_with_session(self): with SmartSession() as session: self.run(session=session) + def make_provenance_tree(self, exposure, session=None, commit=True): + """Use the current configuration of the pipeline and all the objects it has + to generate the provenances for all the processing steps. + This will conclude with the reporting step, which simply has an upstreams + list of provenances to the measuring provenance and to the machine learning score + provenances. From those, a user can recreate the entire tree of provenances. + + Parameters + ---------- + exposure : Exposure + The exposure to use to get the initial provenance. + This provenance should be automatically created by the exposure. + session : SmartSession, optional + The function needs to work with the database to merge existing provenances. + If a session is given, it will use that, otherwise it will open a new session, + which will also close automatically at the end of the function. + commit: bool, optional, default True + By default, the provenances are merged and committed inside this function. + To disable this, set commit=False. This may leave the provenances in a + transient state, and is most likely not what you want. + + Returns + ------- + dict + A dictionary of all the provenances that were created in this function, + keyed according to the different steps in the pipeline. + The provenances are all merged to the session. 
+ """ + with SmartSession(session) as session: + # start by getting the exposure and reference + exposure = session.merge(exposure) # also merges the provenance and code_version + # TODO: need a better way to find the relevant reference PROVENANCE for this exposure + # i.e., we do not look for a valid reference and get its provenance, instead, + # we look for a provenance based on our policy (that can be defined in the subtraction parameters) + # and find a specific provenance id that matches our policy. + # If we later find that no reference with that provenance exists that overlaps our images, + # that will be recorded as an error in the report. + # One way to do this would be to add a RefSet table that has a name (e.g., "standard") and + # a validity time range (which will be removed from Reference), maybe also the instrument. + # That would allow us to use a combination of name+obs_time to find a specific RefSet, + # which has a single reference provenance ID. If you want a custom reference, + # add a new RefSet with a new name. + # This also means that the reference making pipeline MUST use a single set of policies + # to create all the references for a given RefSet... we need to make sure we can actually + # make that happen consistently (e.g., if you change parameters or start mixing instruments + # when you make the references it will create multiple provenances for the same RefSet). + + # for now, use the latest provenance that has to do with references + ref_prov = session.scalars( + sa.select(Provenance).where(Provenance.process == 'reference').order_by(Provenance.created_at.desc()) + ).first() + provs = {'exposure': exposure.provenance} # TODO: does this always work on any exposure? + code_version = exposure.provenance.code_version + is_testing = exposure.provenance.is_testing + + for step in PROCESS_OBJECTS: + if isinstance(PROCESS_OBJECTS[step], dict): + parameters = {} + for key, value in PROCESS_OBJECTS[step].items(): + parameters[key] = getattr(self, value).pars.get_critical_pars() + else: + parameters = getattr(self, PROCESS_OBJECTS[step]).pars.get_critical_pars() + + # some preprocessing parameters (the "preprocessing_steps") doesn't come from the + # config file, but instead comes from the preprocessing itself. 
+ # TODO: fix this as part of issue #147 + if step == 'preprocessing': + if 'preprocessing_steps' not in parameters: + parameters['preprocessing_steps'] = ['overscan', 'linearity', 'flat', 'fringe'] + + # figure out which provenances go into the upstreams for this step + up_steps = UPSTREAM_STEPS[step] + if isinstance(up_steps, str): + up_steps = [up_steps] + upstreams = [] + for upstream in up_steps: + if upstream == 'reference': + upstreams += ref_prov.upstreams + else: + upstreams.append(provs[upstream]) + + provs[step] = Provenance( + code_version=code_version, + process=step, + parameters=parameters, + upstreams=upstreams, + is_testing=is_testing, + ) + + provs[step] = provs[step].merge_concurrent(session=session, commit=commit) + + # if commit: + # session.commit() + + return provs diff --git a/requirements.txt b/requirements.txt index ecd62d6c..7125174d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,12 +5,11 @@ astropy==5.3.4 astroquery==0.4.6 beautifulsoup4==4.12.2 fitsio==0.9.12 -flaky==3.7.0 +flaky==3.8.1 GitPython==3.1.40 h5py==3.10.0 healpy==1.16.6 matplotlib==3.8.2 -mpi4py==3.1.6 numpy==1.26.2 pandas==2.1.3 photutils==1.9.0 diff --git a/tests/conftest.py b/tests/conftest.py index 777d1086..ec1e1fb0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import uuid import shutil import pathlib +import logging import numpy as np @@ -38,6 +39,10 @@ ARCHIVE_PATH = None +# We may want to turn this on only for tests, as it may add a lot of runtime/memory overhead +# ref: https://www.mail-archive.com/python-list@python.org/msg443129.html +# os.environ["SEECHANGE_TRACEMALLOC"] = "1" + # this fixture should be the first thing loaded by the test suite # (session is the pytest session, not the SQLAlchemy session) diff --git a/tests/docker-compose.yaml b/tests/docker-compose.yaml index 8b185bf1..f2e6a938 100644 --- a/tests/docker-compose.yaml +++ b/tests/docker-compose.yaml @@ -98,7 +98,7 @@ services: target: /archive_storage working_dir: /seechange user: ${USERID:-0}:${GROUPID:-0} - entrypoint: "pytest -v /seechange/tests/$TEST_SUBFOLDER" + entrypoint: "pytest -v /seechange/$TEST_SUBFOLDER" runalltests: image: ghcr.io/${GITHUB_REPOSITORY_OWNER:-c3-time-domain}/seechange:${IMGTAG:-tests} diff --git a/tests/fixtures/decam.py b/tests/fixtures/decam.py index 289d7b98..75b070d8 100644 --- a/tests/fixtures/decam.py +++ b/tests/fixtures/decam.py @@ -208,15 +208,10 @@ def decam_exposure(decam_filename, data_dir): exphdrinfo = Instrument.extract_header_info( hdr, [ 'mjd', 'exp_time', 'filter', 'project', 'target' ] ) with SmartSession() as session: - # first try to recover an existing exposure - exposure = session.scalars(sa.select(Exposure).where(Exposure.filepath == filename)).first() - if exposure is None: - exposure = Exposure( filepath=filename, instrument='DECam', **exphdrinfo ) - exposure.save() # save to archive and get an MD5 sum - - exposure.provenance = session.merge(exposure.provenance) - session.add(exposure) - session.commit() + exposure = Exposure( filepath=filename, instrument='DECam', **exphdrinfo ) + exposure.save() # save to archive and get an MD5 sum + + exposure = exposure.merge_concurrent(session) # also commits the session yield exposure diff --git a/tests/fixtures/pipeline_objects.py b/tests/fixtures/pipeline_objects.py index 02d2d6d3..52c6bc5d 100644 --- a/tests/fixtures/pipeline_objects.py +++ b/tests/fixtures/pipeline_objects.py @@ -29,6 +29,7 @@ from pipeline.subtraction import Subtractor from pipeline.cutting import Cutter from 
pipeline.measuring import Measurer +from pipeline.top_level import Pipeline from util.logger import SCLogger @@ -171,7 +172,7 @@ def make_detector(): det.pars.test_parameter = det.pars.add_par( 'test_parameter', 'test_value', str, 'parameter to define unique tests', critical=True ) - det.pars._enforce_no_new_attrs = False + det.pars._enforce_no_new_attrs = True return det @@ -192,7 +193,7 @@ def make_cutter(): cut.pars.test_parameter = cut.pars.add_par( 'test_parameter', 'test_value', str, 'parameter to define unique tests', critical=True ) - cut.pars._enforce_no_new_attrs = False + cut.pars._enforce_no_new_attrs = True return cut @@ -213,7 +214,7 @@ def make_measurer(): meas.pars.test_parameter = meas.pars.add_par( 'test_parameter', 'test_value', str, 'parameter to define unique tests', critical=True ) - meas.pars._enforce_no_new_attrs = False + meas.pars._enforce_no_new_attrs = True return meas @@ -226,8 +227,7 @@ def measurer(measurer_factory): @pytest.fixture(scope='session') -def datastore_factory( - data_dir, +def pipeline_factory( preprocessor_factory, extractor_factory, astrometor_factory, @@ -236,8 +236,30 @@ def datastore_factory( detector_factory, cutter_factory, measurer_factory, - + test_config, ): + def make_pipeline(): + p = Pipeline(**test_config.value('pipeline')) + p.preprocessor = preprocessor_factory() + p.extractor = extractor_factory() + p.astro_cal = astrometor_factory() + p.photo_cal = photometor_factory() + p.subtractor = subtractor_factory() + p.detector = detector_factory() + p.cutter = cutter_factory() + p.measurer = measurer_factory() + return p + + return make_pipeline + + +@pytest.fixture +def pipeline_for_tests(pipeline_factory): + return pipeline_factory() + + +@pytest.fixture(scope='session') +def datastore_factory(data_dir, pipeline_factory): """Provide a function that returns a datastore with all the products based on the given exposure and section ID. 
To use this data store in a test where new data is to be generated, @@ -265,38 +287,11 @@ def make_datastore( if cache_dir is not None and cache_base_name is not None: ds.cache_base_name = os.path.join(cache_dir, cache_base_name) # save this for testing purposes - # allow calling scope to override/augment parameters for any of the processing steps - preprocessor = preprocessor_factory() - preprocessor.pars.override(overrides.get('preprocessing', {})) - preprocessor.pars.augment(augments.get('preprocessing', {})) - - extractor = extractor_factory() - extractor.pars.override(overrides.get('extraction', {})) - extractor.pars.augment(augments.get('extraction', {})) - - astrometor = astrometor_factory() - astrometor.pars.override(overrides.get('astro_cal', {})) - astrometor.pars.augment(augments.get('astro_cal', {})) - - photometor = photometor_factory() - photometor.pars.override(overrides.get('photo_cal', {})) - photometor.pars.augment(augments.get('photo_cal', {})) + p = pipeline_factory() - subtractor = subtractor_factory() - subtractor.pars.override(overrides.get('subtraction', {})) - subtractor.pars.augment(augments.get('subtraction', {})) - - detector = detector_factory() - detector.pars.override(overrides.get('detection', {})) - detector.pars.augment(augments.get('detection', {})) - - cutter = cutter_factory() - cutter.pars.override(overrides.get('cutting', {})) - cutter.pars.augment(augments.get('cutting', {})) - - measurer = measurer_factory() - measurer.pars.override(overrides.get('measurement', {})) - measurer.pars.augment(augments.get('measurement', {})) + # allow calling scope to override/augment parameters for any of the processing steps + p.override_parameters(**overrides) + p.augment_parameters(**augments) with SmartSession(session) as session: code_version = session.merge(code_version) @@ -319,7 +314,7 @@ def make_datastore( # add the preprocessing steps from instrument (TODO: remove this as part of Issue #142) preprocessing_steps = ds.image.instrument_object.preprocessing_steps - prep_pars = preprocessor.pars.get_critical_pars() + prep_pars = p.preprocessor.pars.get_critical_pars() prep_pars['preprocessing_steps'] = preprocessing_steps upstreams = [ds.exposure.provenance] if ds.exposure is not None else [] # images without exposure @@ -351,7 +346,7 @@ def make_datastore( if ds.image is None: # make the preprocessed image SCLogger.debug('making preprocessed image. ') - ds = preprocessor.run(ds) + ds = p.preprocessor.run(ds) ds.image.provenance.is_testing = True if bad_pixel_map is not None: ds.image.flags |= bad_pixel_map @@ -400,7 +395,7 @@ def make_datastore( code_version=code_version, process='extraction', upstreams=[ds.image.provenance], - parameters=extractor.pars.get_critical_pars(), + parameters=p.extractor.pars.get_critical_pars(), is_testing=True, ) prov = session.merge(prov) @@ -461,7 +456,7 @@ def make_datastore( if ds.sources is None or ds.psf is None: # make the source list from the regular image SCLogger.debug('extracting sources. 
') - ds = extractor.run(ds) + ds = p.extractor.run(ds) ds.sources.save() ds.sources.copy_to_cache(cache_dir) ds.psf.save(overwrite=True) @@ -480,7 +475,7 @@ def make_datastore( code_version=code_version, process='astro_cal', upstreams=[ds.sources.provenance], - parameters=astrometor.pars.get_critical_pars(), + parameters=p.astro_cal.pars.get_critical_pars(), is_testing=True, ) prov = session.merge(prov) @@ -508,7 +503,7 @@ def make_datastore( if ds.wcs is None: # make the WCS SCLogger.debug('Running astrometric calibration') - ds = astrometor.run(ds) + ds = p.astro_cal.run(ds) if cache_dir is not None and cache_base_name is not None: # must provide a name because this one isn't a FileOnDiskMixin output_path = ds.wcs.copy_to_cache(cache_dir, cache_name) @@ -525,8 +520,8 @@ def make_datastore( prov = Provenance( code_version=code_version, process='photo_cal', - upstreams=[ds.sources.provenance], - parameters=photometor.pars.get_critical_pars(), + upstreams=[ds.sources.provenance, ds.wcs.provenance], + parameters=p.photo_cal.pars.get_critical_pars(), is_testing=True, ) prov = session.merge(prov) @@ -554,7 +549,7 @@ def make_datastore( if ds.zp is None: # make the zero point SCLogger.debug('Running photometric calibration') - ds = photometor.run(ds) + ds = p.photo_cal.run(ds) if cache_dir is not None and cache_base_name is not None: output_path = ds.zp.copy_to_cache(cache_dir, cache_name) if output_path != cache_path: @@ -584,7 +579,7 @@ def make_datastore( ref.wcs.provenance, ref.zp.provenance, ], - parameters=subtractor.pars.get_critical_pars(), + parameters=p.subtractor.pars.get_critical_pars(), is_testing=True, ) sub_im = Image.from_new_and_ref(ds.image, ref.image) @@ -594,19 +589,110 @@ def make_datastore( if os.path.isfile(os.path.join(cache_dir, cache_name)): SCLogger.debug('loading subtraction image from cache. ') ds.sub_image = Image.copy_from_cache(cache_dir, cache_name) + ds.sub_image.provenance = prov ds.sub_image.upstream_images.append(ref.image) ds.sub_image.ref_image_id = ref.image_id ds.sub_image.new_image = ds.image ds.sub_image.save(verify_md5=False) # make sure it is also saved to archive - if ds.sub_image is None: # no hit in the cache - ds = subtractor.run(ds) + # try to load the aligned images from cache + prov_aligned_ref = Provenance( + code_version=code_version, + parameters={ + 'method': 'swarp', + 'to_index': 'new', + 'max_arcsec_residual': 0.2, + 'crossid_radius': 2.0, + 'max_sources_to_use': 2000, + 'min_frac_matched': 0.1, + 'min_matched': 10, + }, + upstreams=[ + ds.image.provenance, + ds.sources.provenance, # this also includes the PSF's provenance + ds.wcs.provenance, + ds.ref_image.provenance, + ds.ref_image.sources.provenance, + ds.ref_image.wcs.provenance, + ds.ref_image.zp.provenance, + ], + process='alignment', + is_testing=True, + ) + # TODO: can we find a less "hacky" way to do this? 
+ f = ref.image.invent_filepath() + f = f.replace('ComSci', 'Warped') # not sure if this or 'Sci' will be in the filename + f = f.replace('Sci', 'Warped') # in any case, replace it with 'Warped' + f = f[:-6] + prov_aligned_ref.id[:6] # replace the provenance ID + filename_aligned_ref = f + + prov_aligned_new = Provenance( + code_version=code_version, + parameters=prov_aligned_ref.parameters, + upstreams=[ + ds.image.provenance, + ds.sources.provenance, # this also includes the PSF's provenance + ds.wcs.provenance, + ds.zp.provenance, + ], + process='alignment', + is_testing=True, + ) + f = ds.sub_image.new_image.invent_filepath() + f = f.replace('ComSci', 'Warped') + f = f.replace('Sci', 'Warped') + f = f[:-6] + prov_aligned_new.id[:6] + filename_aligned_new = f + + cache_name_ref = filename_aligned_ref + '.fits.json' + cache_name_new = filename_aligned_new + '.fits.json' + if ( + os.path.isfile(os.path.join(cache_dir, cache_name_ref)) and + os.path.isfile(os.path.join(cache_dir, cache_name_new)) + ): + SCLogger.debug('loading aligned reference image from cache. ') + image_aligned_ref = Image.copy_from_cache(cache_dir, cache_name_ref) + image_aligned_ref.provenance = prov_aligned_ref + image_aligned_ref.info['original_image_id'] = ds.ref_image_id + image_aligned_ref.info['original_image_filepath'] = ds.ref_image.filepath + image_aligned_ref.save(verify_md5=False, no_archive=True) + # TODO: should we also load the aligned image's sources, PSF, and ZP? + + SCLogger.debug('loading aligned new image from cache. ') + image_aligned_new = Image.copy_from_cache(cache_dir, cache_name_new) + image_aligned_new.provenance = prov_aligned_new + image_aligned_new.info['original_image_id'] = ds.image_id + image_aligned_new.info['original_image_filepath'] = ds.image.filepath + image_aligned_new.save(verify_md5=False, no_archive=True) + # TODO: should we also load the aligned image's sources, PSF, and ZP? 
+ + if image_aligned_ref.mjd < image_aligned_new.mjd: + ds.sub_image._aligned_images = [image_aligned_ref, image_aligned_new] + else: + ds.sub_image._aligned_images = [image_aligned_new, image_aligned_ref] + + if ds.sub_image is None: # no hit in the cache + ds = p.subtractor.run(ds) + ds.sub_image.save(verify_md5=False) # make sure it is also saved to archive + ds.sub_image.copy_to_cache(cache_dir) + + # make sure that the aligned images get into the cache, too + if ( + 'cache_name_ref' in locals() and + os.path.isfile(os.path.join(cache_dir, cache_name_ref)) and + 'cache_name_new' in locals() and + os.path.isfile(os.path.join(cache_dir, cache_name_new)) + ): + for im in ds.sub_image.aligned_images: + im.copy_to_cache(cache_dir) + + ############ detecting to create a source list ############ prov = Provenance( code_version=code_version, process='detection', upstreams=[ds.sub_image.provenance], - parameters=detector.pars.get_critical_pars(), + parameters=p.detector.pars.get_critical_pars(), is_testing=True, ) cache_name = os.path.join(cache_dir, cache_sub_name + f'.sources_{prov.id[:6]}.npy.json') @@ -618,15 +704,16 @@ def make_datastore( ds.sub_image.sources = ds.detections ds.detections.save(verify_md5=False) else: # cannot find detections on cache - ds = detector.run(ds) + ds = p.detector.run(ds) ds.detections.save(verify_md5=False) ds.detections.copy_to_cache(cache_dir, cache_name) + ############ cutting to create cutouts ############ prov = Provenance( code_version=code_version, process='cutting', upstreams=[ds.detections.provenance], - parameters=cutter.pars.get_critical_pars(), + parameters=p.cutter.pars.get_critical_pars(), is_testing=True, ) cache_name = os.path.join(cache_dir, cache_sub_name + f'.cutouts_{prov.id[:6]}.h5') @@ -638,15 +725,16 @@ def make_datastore( [setattr(c, 'sources', ds.detections) for c in ds.cutouts] Cutouts.save_list(ds.cutouts) # make sure to save to archive as well else: # cannot find cutouts on cache - ds = cutter.run(ds) + ds = p.cutter.run(ds) Cutouts.save_list(ds.cutouts) Cutouts.copy_list_to_cache(ds.cutouts, cache_dir) + ############ measuring to create measurements ############ prov = Provenance( code_version=code_version, process='measuring', upstreams=[ds.cutouts[0].provenance], - parameters=measurer.pars.get_critical_pars(), + parameters=p.measurer.pars.get_critical_pars(), is_testing=True, ) @@ -661,7 +749,7 @@ def make_datastore( [m.associate_object(session) for m in ds.measurements] # create or find an object for each measurement # no need to save list because Measurements is not a FileOnDiskMixin! else: # cannot find measurements on cache - ds = measurer.run(ds) + ds = p.measurer.run(ds) Measurements.copy_list_to_cache(ds.all_measurements, cache_dir, cache_name) # must provide filepath! 
ds.save_and_commit(session=session) diff --git a/tests/models/test_reports.py b/tests/models/test_reports.py new file mode 100644 index 00000000..bb052e41 --- /dev/null +++ b/tests/models/test_reports.py @@ -0,0 +1,151 @@ +import os +import time +import uuid + +from pprint import pprint + +import sqlalchemy as sa + +from pipeline.top_level import PROCESS_OBJECTS + +from models.base import SmartSession +from models.report import Report + +from util.util import parse_bool + + +def test_report_bitflags(decam_exposure, decam_reference, decam_default_calibrators): + report = Report(exposure=decam_exposure, section_id='N1') + + # test that the progress steps flag is working + assert report.progress_steps_bitflag == 0 + assert report.progress_steps == '' + + report.progress_steps = 'preprocessing' + assert report.progress_steps_bitflag == 2 ** 1 + assert report.progress_steps == 'preprocessing' + + report.progress_steps = 'preprocessing, Extraction' + assert report.progress_steps_bitflag == 2 ** 1 + 2 ** 2 + assert report.progress_steps == 'preprocessing, extraction' + + report.append_progress('photo_cal') + assert report.progress_steps_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 4 + assert report.progress_steps == 'preprocessing, extraction, photo_cal' + + report.append_progress('preprocessing') # appending it again makes no difference + assert report.progress_steps_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 4 + assert report.progress_steps == 'preprocessing, extraction, photo_cal' + + report.append_progress('subtraction, cutting') # append two at a time + assert report.progress_steps_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 4 + 2 ** 5 + 2 ** 7 + assert report.progress_steps == 'preprocessing, extraction, photo_cal, subtraction, cutting' + + # test that the products exist flag is working + assert report.products_exist_bitflag == 0 + assert report.products_exist == '' + + report.products_exist = 'image' + assert report.products_exist_bitflag == 2 ** 1 + assert report.products_exist == 'image' + + report.products_exist = 'image, sources' + assert report.products_exist_bitflag == 2 ** 1 + 2 ** 2 + assert report.products_exist == 'image, sources' + + report.append_products_exist('psf') + assert report.products_exist_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 3 + assert report.products_exist == 'image, sources, psf' + + report.append_products_exist('image') # appending it again makes no difference + assert report.products_exist_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 3 + assert report.products_exist == 'image, sources, psf' + + report.append_products_exist('sub_image, detections') # append two at a time + assert report.products_exist_bitflag == 2 ** 1 + 2 ** 2 + 2 ** 3 + 2 ** 7 + 2 ** 8 + assert report.products_exist == 'image, sources, psf, sub_image, detections' + + # test that the products committed flag is working + assert report.products_committed_bitflag == 0 + assert report.products_committed == '' + + report.products_committed = 'sources' + assert report.products_committed_bitflag == 2 ** 2 + assert report.products_committed == 'sources' + + report.products_committed = 'sources, zp' + assert report.products_committed_bitflag == 2 ** 2 + 2 ** 6 + assert report.products_committed == 'sources, zp' + + report.append_products_committed('sub_image') + assert report.products_committed_bitflag == 2 ** 2 + 2 ** 6 + 2 ** 7 + assert report.products_committed == 'sources, zp, sub_image' + + report.append_products_committed('sub_image, detections') # append two at a time + assert report.products_committed_bitflag == 2 ** 2 + 2 ** 6 + 2 ** 7 + 2 
** 8 + assert report.products_committed == 'sources, zp, sub_image, detections' + + report.append_products_committed('sub_image') # appending it again makes no difference + assert report.products_committed_bitflag == 2 ** 2 + 2 ** 6 + 2 ** 7 + 2 ** 8 + assert report.products_committed == 'sources, zp, sub_image, detections' + + +def test_measure_runtime_memory(decam_exposure, decam_reference, pipeline_for_tests, decam_default_calibrators): + # make sure we get a random new provenance, not reuse any of the existing data + p = pipeline_for_tests + p.preprocessor.pars.test_parameter = uuid.uuid4().hex + + t0 = time.perf_counter() + + ds = p.run(decam_exposure, 'N1') + + assert p.preprocessor.has_recalculated + assert p.extractor.has_recalculated + assert p.astro_cal.has_recalculated + assert p.photo_cal.has_recalculated + assert p.subtractor.has_recalculated + assert p.detector.has_recalculated + assert p.cutter.has_recalculated + assert p.measurer.has_recalculated + + measured_time = 0 + peak_memory = 0 + for step in PROCESS_OBJECTS.keys(): # also make sure all the keys are present in both dictionaries + measured_time += ds.runtimes[step] + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + peak_memory = max(peak_memory, ds.memory_usages[step]) + + total_time = time.perf_counter() - t0 + + print(f'total_time: {total_time:.1f}s') + print(f'measured_time: {measured_time:.1f}s') + pprint(ds.runtimes, sort_dicts=False) + assert measured_time > 0.99 * total_time # at least 99% of the time is accounted for + + if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')): + print(f'peak_memory: {peak_memory:.1f}MB') + pprint(ds.memory_usages, sort_dicts=False) + assert 1000.0 < peak_memory < 10000.0 # memory usage is in MB, takes between 1 and 10 GB + + with SmartSession() as session: + rep = session.scalars(sa.select(Report).where(Report.exposure_id == decam_exposure.id)).one() + assert rep is not None + assert rep.success + assert rep.process_runtime == ds.runtimes + assert rep.process_memory == ds.memory_usages + # 'preprocessing, extraction, astro_cal, photo_cal, subtraction, detection, cutting, measuring' + assert rep.progress_steps == ', '.join(PROCESS_OBJECTS.keys()) + assert rep.products_exist == 'image, sources, psf, wcs, zp, sub_image, detections, cutouts, measurements' + assert rep.products_committed == '' # we don't save the data store objects at any point? + assert rep.provenance.upstreams[0].id == ds.measurements[0].provenance.id + assert rep.num_prev_reports == 0 + + +def test_inject_warnings(decam_datastore, decam_reference, pipeline_for_tests, decam_default_calibrators): + pass + + +def test_inject_exceptions(decam_datastore, decam_reference, pipeline_for_tests): + pass + + diff --git a/tests/pipeline/test_astro_cal.py b/tests/pipeline/test_astro_cal.py index 20e4f2e2..37bad569 100644 --- a/tests/pipeline/test_astro_cal.py +++ b/tests/pipeline/test_astro_cal.py @@ -181,3 +181,19 @@ def test_run_scamp( decam_datastore, astrometor ): # TODO : test that it fails when it's supposed to + +def test_warnings_and_exceptions(decam_datastore, astrometor): + astrometor.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + astrometor.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'astro_cal'." 
in str(w.message) for w in record) + + astrometor.pars.inject_warnings = 0 + astrometor.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = astrometor.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'astro_cal'." in str(excinfo.value) + ds.read_exception() diff --git a/tests/pipeline/test_cutting.py b/tests/pipeline/test_cutting.py new file mode 100644 index 00000000..e79b5334 --- /dev/null +++ b/tests/pipeline/test_cutting.py @@ -0,0 +1,18 @@ +import pytest + + +def test_warnings_and_exceptions(decam_datastore, cutter): + cutter.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + cutter.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'cutting'." in str(w.message) for w in record) + + cutter.pars.inject_warnings = 0 + cutter.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = cutter.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'cutting'." in str(excinfo.value) + ds.read_exception() \ No newline at end of file diff --git a/tests/pipeline/test_detection.py b/tests/pipeline/test_detection.py index ba47f643..ed6fcaaf 100644 --- a/tests/pipeline/test_detection.py +++ b/tests/pipeline/test_detection.py @@ -1,4 +1,4 @@ -import os +import pytest import numpy as np import matplotlib.pyplot as plt import scipy.signal @@ -149,3 +149,20 @@ def test_detection_ptf_supernova(detector, ptf_subtraction1, blocking_plots, cac finally: ds.detections.delete_from_disk_and_database() + + +def test_warnings_and_exceptions(decam_datastore, detector): + detector.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + detector.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'detection'." in str(w.message) for w in record) + + detector.pars.inject_warnings = 0 + detector.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = detector.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'detection'." in str(excinfo.value) + ds.read_exception() \ No newline at end of file diff --git a/tests/pipeline/test_extraction.py b/tests/pipeline/test_extraction.py index 79892a5f..3fa443bd 100644 --- a/tests/pipeline/test_extraction.py +++ b/tests/pipeline/test_extraction.py @@ -376,3 +376,20 @@ def test_run_detection_sextractor( decam_datastore, extractor ): finally: ds.delete_everything() + + +def test_warnings_and_exceptions(decam_datastore, extractor): + extractor.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + extractor.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'extraction'." in str(w.message) for w in record) + + extractor.pars.inject_warnings = 0 + extractor.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = extractor.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'extraction'." 
in str(excinfo.value) + ds.read_exception() \ No newline at end of file diff --git a/tests/pipeline/test_measuring.py b/tests/pipeline/test_measuring.py index d7aa5725..b88fd0d1 100644 --- a/tests/pipeline/test_measuring.py +++ b/tests/pipeline/test_measuring.py @@ -215,3 +215,20 @@ def test_measuring(measurer, decam_cutouts): assert m.get_filter_description() == 'Streaked (angle= 25.0 deg)' assert m.background < 1.0 # see TODO above assert m.background_err < 3.0 # TODO: above + + +def test_warnings_and_exceptions(decam_datastore, measurer): + measurer.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + measurer.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'measuring'." in str(w.message) for w in record) + + measurer.pars.inject_exceptions = 1 + measurer.pars.inject_warnings = 0 + with pytest.raises(Exception) as excinfo: + ds = measurer.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'measuring'." in str(excinfo.value) + ds.read_exception() diff --git a/tests/pipeline/test_photo_cal.py b/tests/pipeline/test_photo_cal.py index 7e3cf5aa..ceb5f8eb 100644 --- a/tests/pipeline/test_photo_cal.py +++ b/tests/pipeline/test_photo_cal.py @@ -63,3 +63,20 @@ def test_decam_photo_cal( decam_datastore, photometor, blocking_plots ): 17.323, 21.653, 30.315, 43.307 ], abs=0.01 ) assert ds.zp.aper_cors == pytest.approx( [-0.457, -0.177, -0.028, -0.007, 0.0, 0.003, 0.005, 0.006 ], abs=0.01 ) + + +def test_warnings_and_exceptions(decam_datastore, photometor): + photometor.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + photometor.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'photo_cal'." in str(w.message) for w in record) + + photometor.pars.inject_warnings = 0 + photometor.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = photometor.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'photo_cal'." 
in str(excinfo.value) + ds.read_exception() \ No newline at end of file diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 3d29e99b..091df2d6 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,6 +1,8 @@ import os import pytest import shutil +import datetime + import sqlalchemy as sa import numpy as np @@ -13,6 +15,8 @@ from models.zero_point import ZeroPoint from models.cutouts import Cutouts from models.measurements import Measurements +from models.report import Report + from util.logger import SCLogger from pipeline.top_level import Pipeline @@ -358,6 +362,10 @@ def test_bitflag_propagation(decam_exposure, decam_reference, decam_default_cali # this should be removed after we add datastore failure modes (issue #150) shutil.rmtree(os.path.join(os.path.dirname(exposure.get_fullpath()), '115'), ignore_errors=True) shutil.rmtree(os.path.join(archive.test_folder_path, '115'), ignore_errors=True) + with SmartSession() as session: + ds.exposure.bitflag = 0 + session.merge(ds.exposure) + session.commit() def test_get_upstreams_and_downstreams(decam_exposure, decam_reference, decam_default_calibrators, archive): @@ -480,3 +488,106 @@ def test_datastore_delete_everything(decam_datastore): assert session.scalars( sa.select(Measurements).where(Measurements.id == measurements_list[0].id) ).first() is None + + +def test_provenance_tree(pipeline_for_tests, decam_exposure, decam_datastore, decam_reference): + p = pipeline_for_tests + provs = p.make_provenance_tree(decam_exposure) + assert isinstance(provs, dict) + + t_start = datetime.datetime.utcnow() + ds = p.run(decam_exposure, 'N1') # the data should all be there so this should be quick + t_end = datetime.datetime.utcnow() + + assert ds.image.provenance_id == provs['preprocessing'].id + assert ds.sources.provenance_id == provs['extraction'].id + assert ds.psf.provenance_id == provs['extraction'].id + assert ds.wcs.provenance_id == provs['astro_cal'].id + assert ds.zp.provenance_id == provs['photo_cal'].id + assert ds.sub_image.provenance_id == provs['subtraction'].id + assert ds.detections.provenance_id == provs['detection'].id + assert ds.cutouts[0].provenance_id == provs['cutting'].id + assert ds.measurements[0].provenance_id == provs['measuring'].id + + with SmartSession() as session: + report = session.scalars( + sa.select(Report).where(Report.exposure_id == decam_exposure.id).order_by(Report.start_time.desc()) + ).first() + assert report is not None + assert report.success + assert abs(report.start_time - t_start) < datetime.timedelta(seconds=1) + assert abs(report.finish_time - t_end) < datetime.timedelta(seconds=1) + + +def test_inject_warnings_errors(decam_datastore, decam_reference, pipeline_for_tests): + from pipeline.top_level import PROCESS_OBJECTS + p = pipeline_for_tests + for process, obj in PROCESS_OBJECTS.items(): + # first reset all warnings and errors + for _, obj2 in PROCESS_OBJECTS.items(): + getattr(p, obj2).pars.inject_exceptions = False + getattr(p, obj2).pars.inject_warnings = False + + # set the warning: + getattr(p, obj).pars.inject_warnings = True + + # run the pipeline + ds = p.run(decam_datastore) + expected = f"{process}: Warning injected by pipeline parameters in process '{process}'" + assert expected in ds.report.warnings + + # these are used to find the report later on + exp_id = ds.exposure_id + sec_id = ds.section_id + prov_id = ds.report.provenance_id + + # set the error instead + getattr(p, obj).pars.inject_warnings = False + getattr(p, 
obj).pars.inject_exceptions = True + # run the pipeline again, this time with an exception + + with pytest.raises(RuntimeError, match=f"Exception injected by pipeline parameters in process '{process}'"): + ds = p.run(decam_datastore) + + # fetch the report object + with SmartSession() as session: + reports = session.scalars( + sa.select(Report).where( + Report.exposure_id == exp_id, + Report.section_id == sec_id, + Report.provenance_id == prov_id + ).order_by(Report.start_time.desc()) + ).all() + report = reports[0] # the last report is the one we just generated + assert len(reports) - 1 == report.num_prev_reports + assert not report.success + assert report.error_step == process + assert report.error_type == 'RuntimeError' + assert 'Exception injected by pipeline parameters' in report.error_message + + +def test_multiprocessing_make_provenances_and_exposure(decam_exposure, decam_reference, pipeline_for_tests): + from multiprocessing import SimpleQueue, Process + process_list = [] + + def make_provenances(exposure, pipeline, queue): + provs = pipeline.make_provenance_tree(exposure) + queue.put(provs) + + queue = SimpleQueue() + for i in range(3): # github has 4 CPUs for testing, so 3 sub-processes and 1 main process + p = Process(target=make_provenances, args=(decam_exposure, pipeline_for_tests, queue)) + p.start() + process_list.append(p) + + # also run this on the main process + provs = pipeline_for_tests.make_provenance_tree(decam_exposure) + + for p in process_list: + p.join() + assert not p.exitcode + + # check that the provenances are the same + for _ in process_list: # order is not kept but all outputs should be the same + output_provs = queue.get() + assert output_provs['measuring'].id == provs['measuring'].id diff --git a/tests/pipeline/test_preprocessing.py b/tests/pipeline/test_preprocessing.py index 3e9de665..7cf3b928 100644 --- a/tests/pipeline/test_preprocessing.py +++ b/tests/pipeline/test_preprocessing.py @@ -104,3 +104,19 @@ def test_preprocessing( assert preprocessor._ds.section_id == 'N1' assert set( preprocessor.stepfiles.keys() ) == { 'linearity' } + +def test_warnings_and_exceptions(decam_exposure, preprocessor, decam_default_calibrators, archive): + preprocessor.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + preprocessor.run(decam_exposure, 'N1') + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'preprocessing'." in str(w.message) for w in record) + + preprocessor.pars.inject_warnings = 0 + preprocessor.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = preprocessor.run(decam_exposure, 'N1') + ds.reraise() + assert "Exception injected by pipeline parameters in process 'preprocessing'." in str(excinfo.value) + diff --git a/tests/pipeline/test_subtraction.py b/tests/pipeline/test_subtraction.py index 929cdf48..9ff3faf3 100644 --- a/tests/pipeline/test_subtraction.py +++ b/tests/pipeline/test_subtraction.py @@ -1,3 +1,4 @@ +import pytest import uuid import numpy as np @@ -75,3 +76,20 @@ def test_subtraction_ptf_zogy(ptf_ref, ptf_supernova_images, subtractor): assert abs(mu) < 0.2 # this is not working perfectly, we need to improve the background removal! # assert abs(sigma - 1) < 0.1 # the standard deviation should be close to 1 assert abs(sigma - 1) < 1 # the standard deviation may be also affected by background... 
+ + +def test_warnings_and_exceptions(decam_datastore, decam_reference, subtractor, decam_default_calibrators): + subtractor.pars.inject_warnings = 1 + + with pytest.warns(UserWarning) as record: + subtractor.run(decam_datastore) + assert len(record) > 0 + assert any("Warning injected by pipeline parameters in process 'subtraction'." in str(w.message) for w in record) + + subtractor.pars.inject_warnings = 0 + subtractor.pars.inject_exceptions = 1 + with pytest.raises(Exception) as excinfo: + ds = subtractor.run(decam_datastore) + ds.reraise() + assert "Exception injected by pipeline parameters in process 'subtraction'." in str(excinfo.value) + ds.read_exception() \ No newline at end of file diff --git a/util/util.py b/util/util.py index 4273e1fa..8ba0f9ea 100644 --- a/util/util.py +++ b/util/util.py @@ -376,4 +376,16 @@ def save_fits_image_file(filename, data, header, extname=None, overwrite=True, s else: hdul.writeto(full_name, overwrite=overwrite) - return str( full_name ) \ No newline at end of file + return str( full_name ) + + +def parse_bool(text): + """Check if a string of text that represents a boolean value is True or False.""" + if text is None: + return False + elif text.lower() in ['true', 'yes', '1']: + return True + elif text.lower() in ['false', 'no', '0']: + return False + else: + raise ValueError(f'Cannot parse boolean value from "{text}"')
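# Editor's sketch (not part of the patch): how parse_bool() above is meant to gate the optional
# tracemalloc accounting that the pipeline steps use to fill ds.runtimes / ds.memory_usages.
# SEECHANGE_TRACEMALLOC is the environment variable used in this patch; timed_step and do_work
# are hypothetical names, chosen only for illustration.
import os
import time
import tracemalloc

from util.util import parse_bool   # the helper added in this patch


def timed_step(do_work):
    """Run do_work() and return (runtime in seconds, peak memory in MB or None)."""
    if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')):
        if not tracemalloc.is_tracing():
            tracemalloc.start()      # begin tracing allocations (normally done once, at pipeline start)
        tracemalloc.reset_peak()     # measure the peak from this point onward
    t_start = time.perf_counter()
    do_work()
    runtime = time.perf_counter() - t_start
    peak_mb = None
    if parse_bool(os.getenv('SEECHANGE_TRACEMALLOC')):
        peak_mb = tracemalloc.get_traced_memory()[1] / 1024 ** 2   # (current, peak) -> peak, in MB
    return runtime, peak_mb

# Usage sketch, mirroring how the run() methods above record per-step runtime and peak memory:
#     runtime, peak = timed_step(lambda: preprocessor.run(ds))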