Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-45738: Experiments with extending butler collections API #1053

Merged
merged 26 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d8444aa
Fix test so that it is using a tuple not iterating over characters
timj Aug 14, 2024
212d620
Provide explicit reason when client/server tests are skipped
timj Aug 15, 2024
ac61b4d
Change butler.collections to be a ButlerCollections sequence
timj Aug 13, 2024
4460259
Initial work on extending butler collections API
timj Aug 13, 2024
1ad0ab1
Use new collections API in more scripts
timj Aug 14, 2024
3c3d3b7
Replace queryCollections call in export
timj Aug 15, 2024
077b5e5
Add butler.collections.register and use it
timj Aug 15, 2024
8c7236b
Change collections.query to collections.x_query for "experimental"
timj Aug 15, 2024
5c50057
Add butler.collections.x_remove
timj Aug 15, 2024
970f589
Add hybrid butler collections class for hybrid butler
timj Aug 15, 2024
ebbc097
Use new collections API in test_butler.py
timj Aug 15, 2024
9027b0a
Remove __eq__ from ButlerCollections and fix test
timj Aug 16, 2024
4cc657e
Allow CollectionInfo to be sorted by name
timj Aug 16, 2024
b234be3
Add collections.x_query_info and simplify some usage of x_query
timj Aug 16, 2024
a49c3fa
Deprecate Butler.collection_chains
timj Aug 16, 2024
066bea5
Always include the collection docs in collections.get_info
timj Aug 16, 2024
ff8f771
Default CollectionInfo.parents to None
timj Aug 16, 2024
3d17223
Add dataset_types and governors to CollectionInfo
timj Aug 16, 2024
e6251b8
Use collection summaries to speed up butler query-datasets
timj Aug 16, 2024
eaafc03
Improve the error message with table comparisons.
timj Aug 16, 2024
af98c98
Remove governors from CollectionInfo until we need it
timj Aug 19, 2024
a859c71
Fix some spelling errors
timj Aug 19, 2024
c6d1cb0
Rewrite command-lines to filter dataset types by collection summary
timj Aug 19, 2024
467cc55
Add --collections to query-dataset-types command-line
timj Aug 19, 2024
13ac826
Add news fragment
timj Aug 19, 2024
dc4b3cb
Catch and reraise the IntegrityError in collections.x_remove
timj Aug 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/changes/DM-45738.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
* Added ``--collections`` option to ``butler query-dataset-types`` to allow the resultant dataset types to be constrained by those that are used by specific collections.
* Changed the ``Butler.collections`` property to be a ``ButlerCollections`` instance.
This object can still act as a sequence equivalent to ``ButlerCollections.defaults`` but adds new APIs for querying and manipulating collections.
Any methods with names starting with ``x_`` are deemed to be an experimental API that may change in the future.
2 changes: 2 additions & 0 deletions doc/changes/DM-45738.removal.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The ``Butler.collection_chains`` property is now deprecated.
Please use ``Butler.collections`` instead.
10 changes: 6 additions & 4 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,15 +1422,17 @@ def collection_chains(self) -> ButlerCollections:
"""Object with methods for modifying collection chains
(`~lsst.daf.butler.ButlerCollections`).

Use of this object is preferred over `registry` wherever possible.
Deprecated. Replaced with ``collections`` property.
"""
raise NotImplementedError()

@property
@abstractmethod
def collections(self) -> Sequence[str]:
"""The collections to search by default, in order
(`~collections.abc.Sequence` [ `str` ]).
def collections(self) -> ButlerCollections:
"""Object with methods for modifying and querying collections
(`~lsst.daf.butler.ButlerCollections`).

Use of this object is preferred over `registry` wherever possible.
"""
raise NotImplementedError()

Expand Down
256 changes: 253 additions & 3 deletions python/lsst/daf/butler/_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,67 @@

from __future__ import annotations

__all__ = ("ButlerCollections",)
__all__ = ("ButlerCollections", "CollectionInfo")

from abc import ABC, abstractmethod
from collections.abc import Iterable
from collections.abc import Iterable, Sequence, Set
from typing import Any, overload

from pydantic import BaseModel

class ButlerCollections(ABC):
from ._collection_type import CollectionType


class CollectionInfo(BaseModel):
"""Information about a single Butler collection."""

name: str
"""Name of the collection."""
type: CollectionType
"""Type of the collection."""
doc: str = ""
"""Documentation string associated with this collection."""
children: tuple[str, ...] = tuple()
"""Children of this collection (only if CHAINED)."""
parents: frozenset[str] | None = None
"""Any parents of this collection.

`None` if the parents were not requested.
"""
dataset_types: frozenset[str] | None = None
"""Names of any dataset types associated with datasets in this collection.

`None` if no dataset type information was requested
"""

def __lt__(self, other: Any) -> bool:
"""Compare objects by collection name."""
if not isinstance(other, type(self)):
return NotImplemented

Check warning on line 66 in python/lsst/daf/butler/_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/_butler_collections.py#L66

Added line #L66 was not covered by tests
return self.name < other.name


class ButlerCollections(ABC, Sequence):
"""Methods for working with collections stored in the Butler."""

@overload
def __getitem__(self, index: int) -> str: ...

@overload
def __getitem__(self, index: slice) -> Sequence[str]: ...

def __getitem__(self, index: int | slice) -> str | Sequence[str]:
return self.defaults[index]

def __len__(self) -> int:
return len(self.defaults)

@property
@abstractmethod
def defaults(self) -> Sequence[str]:
"""Collection defaults associated with this butler."""
raise NotImplementedError("Defaults must be implemented by a subclass")

@abstractmethod
def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None:
"""Add children to the end of a CHAINED collection.
Expand Down Expand Up @@ -165,3 +217,201 @@
transactions short.
"""
raise NotImplementedError()

def x_query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
flatten_chains: bool = False,
include_chains: bool | None = None,
) -> Sequence[str]:
"""Query the butler for collections matching an expression.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
One or more collection names or globs to include in the search.
collection_types : `set` [`CollectionType`], `CollectionType` or `None`
Restrict the types of collections to be searched. If `None` all
collection types are searched.
flatten_chains : `bool`, optional
If `True` (`False` is default), recursively yield the child
collections of matching `~CollectionType.CHAINED` collections.
include_chains : `bool` or `None`, optional
If `True`, yield records for matching `~CollectionType.CHAINED`
collections. Default is the opposite of ``flatten_chains``:
include either CHAINED collections or their children, but not both.

Returns
-------
collections : `~collections.abc.Sequence` [ `str` ]
The names of collections that match ``expression``.

Notes
-----
The order in which collections are returned is unspecified, except that
the children of a `~CollectionType.CHAINED` collection are guaranteed
to be in the order in which they are searched. When multiple parent
`~CollectionType.CHAINED` collections match the same criteria, the
order in which the two lists appear is unspecified, and the lists of
children may be incomplete if a child has multiple parents.

The default implementation is a wrapper around `x_query_info`.
"""
collections_info = self.x_query_info(
expression,
collection_types=collection_types,
flatten_chains=flatten_chains,
include_chains=include_chains,
)
return [info.name for info in collections_info]

@abstractmethod
def x_query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
flatten_chains: bool = False,
include_chains: bool | None = None,
include_parents: bool = False,
include_summary: bool = False,
) -> Sequence[CollectionInfo]:
"""Query the butler for collections matching an expression and
return detailed information about those collections.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
One or more collection names or globs to include in the search.
collection_types : `set` [`CollectionType`], `CollectionType` or `None`
Restrict the types of collections to be searched. If `None` all
collection types are searched.
flatten_chains : `bool`, optional
If `True` (`False` is default), recursively yield the child
collections of matching `~CollectionType.CHAINED` collections.
include_chains : `bool` or `None`, optional
If `True`, yield records for matching `~CollectionType.CHAINED`
collections. Default is the opposite of ``flatten_chains``:
include either CHAINED collections or their children, but not both.
include_parents : `bool`, optional
Whether the returned information includes parents.
include_summary : `bool`, optional
Whether the returned information includes dataset type and
governor information for the collections.

Returns
-------
collections : `~collections.abc.Sequence` [ `CollectionInfo` ]
The names of collections that match ``expression``.

Notes
-----
The order in which collections are returned is unspecified, except that
the children of a `~CollectionType.CHAINED` collection are guaranteed
to be in the order in which they are searched. When multiple parent
`~CollectionType.CHAINED` collections match the same criteria, the
order in which the two lists appear is unspecified, and the lists of
children may be incomplete if a child has multiple parents.
"""
raise NotImplementedError()
timj marked this conversation as resolved.
Show resolved Hide resolved

@abstractmethod
def get_info(
self, name: str, include_parents: bool = False, include_summary: bool = False
) -> CollectionInfo:
"""Obtain information for a specific collection.

Parameters
----------
name : `str`
The name of the collection of interest.
include_parents : `bool`, optional
If `True` any parents of this collection will be included.
include_summary : `bool`, optional
If `True` dataset type names and governor dimensions of datasets
stored in this collection will be included in the result.

timj marked this conversation as resolved.
Show resolved Hide resolved
Returns
-------
info : `CollectionInfo`
Information on the requested collection.
"""
raise NotImplementedError()

@abstractmethod
def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool:
"""Add a new collection if one with the given name does not exist.

Parameters
----------
name : `str`
The name of the collection to create.
type : `CollectionType`, optional
Enum value indicating the type of collection to create. Default
is to create a RUN collection.
doc : `str`, optional
Documentation string for the collection.

Returns
-------
registered : `bool`
Boolean indicating whether the collection was already registered
or was created by this call.

Notes
-----
This method cannot be called within transactions, as it needs to be
able to perform its own transaction to be concurrent
"""
raise NotImplementedError()

@abstractmethod
def x_remove(self, name: str) -> None:
"""Remove the given collection from the registry.

**This is an experimental interface that can change at any time.**

Parameters
----------
name : `str`
The name of the collection to remove.
timj marked this conversation as resolved.
Show resolved Hide resolved

Raises
------
lsst.daf.butler.registry.MissingCollectionError
Raised if no collection with the given name exists.
lsst.daf.butler.registry.OrphanedRecordError
Raised if the database rows associated with the collection are
still referenced by some other table, such as a dataset in a
datastore (for `~CollectionType.RUN` collections only) or a
`~CollectionType.CHAINED` collection of which this collection is
a child.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to catch and reraise the SQLAlchemy error in order to have any chance at consistency between DirectButler and RemoteButler.

In fact, this seems like a good place to pull some of the CLI script logic into the Butler itself, by providing options for breaking parent relationships where necessary instead of just raising.


Notes
-----
If this is a `~CollectionType.RUN` collection, all datasets and quanta
in it will removed from the `Registry` database. This requires that
those datasets be removed (or at least trashed) from any datastores
that hold them first.

A collection may not be deleted as long as it is referenced by a
`~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
be deleted or redefined first.
"""
raise NotImplementedError()

def _filter_dataset_types(
self, dataset_types: Iterable[str], collections: Iterable[CollectionInfo]
) -> Iterable[str]:
dataset_types_set = set(dataset_types)
collection_dataset_types: set[str] = set()
for info in collections:
if info.dataset_types is None:
raise RuntimeError("Can only filter by collections if include_summary was True")

Check warning on line 414 in python/lsst/daf/butler/_butler_collections.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/_butler_collections.py#L414

Added line #L414 was not covered by tests
collection_dataset_types.update(info.dataset_types)
dataset_types_set = dataset_types_set.intersection(collection_dataset_types)
return dataset_types_set
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,11 @@ def query_collections(*args: Any, **kwargs: Any) -> None:
"dataset types to return."
)
@verbose_option(help="Include dataset type name, dimensions, and storage class in output.")
@collections_option(
help="Constrain the resulting dataset types by these collections. "
"This constraint does not say that a dataset of this type is definitely present, "
"solely that one may have been present at some point."
)
@options_file_option()
def query_dataset_types(*args: Any, **kwargs: Any) -> None:
"""Get the dataset types in a repository."""
Expand Down
23 changes: 11 additions & 12 deletions python/lsst/daf/butler/direct_butler/_direct_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from collections.abc import Iterable, Iterator, MutableMapping, Sequence
from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast

from deprecated.sphinx import deprecated
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of
from lsst.utils.logging import VERBOSE, getLogger
Expand Down Expand Up @@ -296,7 +297,7 @@ def __reduce__(self) -> tuple:
DirectButler._unpickle,
(
self._config,
self.collections,
self.collections.defaults,
self.run,
dict(self._registry.defaults.dataId.required),
self._registry.isWriteable(),
Expand Down Expand Up @@ -1975,7 +1976,7 @@ def transfer_from(
if registry := getattr(source_butler, "registry", None):
run_doc = registry.getCollectionDocumentation(run)
if not dry_run:
registered = self._registry.registerRun(run, doc=run_doc)
registered = self.collections.register(run, doc=run_doc)
else:
registered = True
handled_collections.add(run)
Expand Down Expand Up @@ -2149,21 +2150,19 @@ def validateConfiguration(
raise ValidationError(";\n".join(messages))

@property
@deprecated(
"Please use 'collections' instead. collection_chains will be removed after v28.",
version="v28",
category=FutureWarning,
)
def collection_chains(self) -> DirectButlerCollections:
"""Object with methods for modifying collection chains."""
return DirectButlerCollections(self._registry)

@property
def collections(self) -> Sequence[str]:
"""The collections to search by default, in order
(`~collections.abc.Sequence` [ `str` ]).

This is an alias for ``self.registry.defaults.collections``. It cannot
be set directly in isolation, but all defaults may be changed together
by assigning a new `RegistryDefaults` instance to
``self.registry.defaults``.
"""
return self._registry.defaults.collections
def collections(self) -> DirectButlerCollections:
"""Object with methods for modifying and inspecting collections."""
return DirectButlerCollections(self._registry)

@property
def run(self) -> str | None:
Expand Down
Loading
Loading