From f86e45a1ce0a2f57498b0991c0638afa7af5b073 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 29 Jan 2025 14:09:22 +0200 Subject: [PATCH 01/14] Remove extra spatial arrow table, include spatial domain info in index column arrow table --- .../src/soma/soma_geometry_column.cc | 6 +- .../src/soma/soma_geometry_dataframe.cc | 4 +- .../src/soma/soma_geometry_dataframe.h | 1 - libtiledbsoma/src/utils/arrow_adapter.cc | 97 ++++++++++++++----- libtiledbsoma/src/utils/arrow_adapter.h | 41 ++++++-- .../test/unit_soma_geometry_dataframe.cc | 6 -- 6 files changed, 111 insertions(+), 44 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index 537d3ecc71..1924107006 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -74,13 +74,13 @@ std::shared_ptr SOMAGeometryColumn::create( throw TileDBSOMAError(std::format( "[SOMAGeometryColumn] " "Unkwown type metadata for `{}`: " - "Expected 'WKB', got {}", + "Expected 'WKB', got '{}'", SOMA_GEOMETRY_COLUMN_NAME, type_metadata)); } for (int64_t j = 0; j < spatial_schema->n_children; ++j) { - dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( ctx, spatial_schema->children[j], spatial_array->children[j], @@ -92,7 +92,7 @@ std::shared_ptr SOMAGeometryColumn::create( } for (int64_t j = 0; j < spatial_schema->n_children; ++j) { - dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( ctx, spatial_schema->children[j], spatial_array->children[j], diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc index ec2a2eb063..05fd4b9ecc 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -31,7 +31,6 @@ void SOMAGeometryDataFrame::create( std::string_view uri, const std::unique_ptr& schema, const ArrowTable& index_columns, - const ArrowTable& spatial_columns, const SOMACoordinateSpace& coordinate_space, std::shared_ptr ctx, PlatformConfig platform_config, @@ -43,8 +42,7 @@ void SOMAGeometryDataFrame::create( index_columns, "SOMAGeometryDataFrame", true, - platform_config, - spatial_columns); + platform_config); auto array = SOMAArray::_create( ctx, diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.h b/libtiledbsoma/src/soma/soma_geometry_dataframe.h index 22a4746a92..d712de5a8a 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.h +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.h @@ -49,7 +49,6 @@ class SOMAGeometryDataFrame : virtual public SOMAArray { std::string_view uri, const std::unique_ptr& schema, const ArrowTable& index_columns, - const ArrowTable& spatial_columns, const SOMACoordinateSpace& coordinate_space, std::shared_ptr ctx, PlatformConfig platform_config = PlatformConfig(), diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 827730fe49..ef896172d9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -879,8 +879,7 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( const ArrowTable& index_column_info, std::string soma_type, bool is_sparse, - PlatformConfig platform_config, - const ArrowTable& spatial_column_info) { + PlatformConfig platform_config) { auto& index_column_array = index_column_info.first; auto& index_column_schema = index_column_info.second; @@ -940,13 +939,12 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( if (strcmp(child->name, index_column_schema->children[i]->name) == 0) { if (strcmp(child->name, SOMA_GEOMETRY_COLUMN_NAME.c_str()) == - 0 && - spatial_column_info.first.get() != nullptr) { + 0) { columns.push_back(SOMAGeometryColumn::create( ctx, child, - spatial_column_info.second.get(), - spatial_column_info.first.get(), + index_column_schema->children[i], + index_column_array->children[i], soma_type, type_metadata, platform_config)); @@ -1053,23 +1051,28 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { - std::vector cdslot; - for (int64_t j = 0; j < spatial_column_info.first->n_children; - ++j) { - cdslot.push_back(ArrowAdapter::get_table_any_column<2>( - spatial_column_info.first->children[j], - spatial_column_info.second->children[j], - 3)); - } - - column->set_current_domain_slot(ndrect, cdslot); - } else { - column->set_current_domain_slot( - ndrect, - get_table_any_column_by_name<2>( - index_column_info, column->name(), 3)); - } + column->set_current_domain_slot( + ndrect, + get_table_any_column_by_name<2>( + index_column_info, column->name(), 3)); + + // if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { + // std::vector cdslot; + // for (int64_t j = 0; j < spatial_column_info.first->n_children; + // ++j) { + // cdslot.push_back(ArrowAdapter::get_table_any_column<2>( + // spatial_column_info.first->children[j], + // spatial_column_info.second->children[j], + // 3)); + // } + + // column->set_current_domain_slot(ndrect, cdslot); + // } else { + // column->set_current_domain_slot( + // ndrect, + // get_table_any_column_by_name<2>( + // index_column_info, column->name(), 3)); + // } } current_domain.set_ndrectangle(ndrect); @@ -1121,6 +1124,54 @@ Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema( return dim; } +Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + std::string soma_type, + std::string_view type_metadata, + std::string prefix, + std::string suffix, + PlatformConfig platform_config) { + if (strcmp(schema->format, "+l") != 0) { + throw TileDBSOMAError( + std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " + "should be of type list.")); + } + + if (schema->n_children != 1) { + throw TileDBSOMAError( + std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " + "should have exactly 1 child")); + } + + auto type = ArrowAdapter::to_tiledb_format( + schema->children[0]->format, type_metadata); + + if (ArrowAdapter::arrow_is_var_length_type(schema->format)) { + type = TILEDB_STRING_ASCII; + } + + auto col_name = prefix + std::string(schema->name) + suffix; + + FilterList filter_list = ArrowAdapter::_create_dim_filter_list( + col_name, platform_config, soma_type, ctx); + + if (array->length != 5) { + throw TileDBSOMAError(std::format( + "ArrowAdapter: unexpected length {} != 5 for name " + "'{}'", + array->length, + col_name)); + } + + const void* buff = array->children[0]->buffers[1]; + auto dim = ArrowAdapter::_create_dim(type, col_name, buff, ctx); + dim.set_filter_list(filter_list); + + return dim; +} + std::pair> ArrowAdapter::tiledb_attribute_from_arrow_schema( std::shared_ptr ctx, diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 29bc0159b2..f94278d4f8 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -408,10 +408,7 @@ class ArrowAdapter { const ArrowTable& index_column_info, std::string soma_type, bool is_sparse = true, - PlatformConfig platform_config = PlatformConfig(), - const ArrowTable& spatial_column_info = { - std::unique_ptr(nullptr), - std::unique_ptr(nullptr)}); + PlatformConfig platform_config = PlatformConfig()); /** * @brief Get a TileDB dimension from an Arrow schema. @@ -428,6 +425,26 @@ class ArrowAdapter { std::string suffix = std::string(), PlatformConfig platform_config = PlatformConfig()); + /** + * @brief Get a TileDB dimension from an Arrow schema. + * + * @remarks This is a list variation which expects a schemaand a data array + * to describe a list instead of a simple columns. Used especialy with + * nested domains where it is described by a struct and each nested + * dimension is described by a list. + * + * @return std::pair The TileDB dimension. + */ + static Dimension tiledb_dimension_from_arrow_schema_ext( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + std::string soma_type, + std::string_view type_metadata, + std::string prefix = std::string(), + std::string suffix = std::string(), + PlatformConfig platform_config = PlatformConfig()); + /** * @brief Get a TileDB attribute with its enumeration from an Arrow schema. * @@ -909,11 +926,19 @@ class ArrowAdapter { ArrowArray* selected_array = arrow_array->children[column_index]; ArrowSchema* selected_schema = arrow_schema->children[column_index]; - // Complex domain - if (selected_array->n_children != 0) { + if (strcmp(selected_schema->format, "+s") == 0) { + // Complex domain like the ones required by `GeometryColumn` expect + // a struct containing lists of values for (int64_t i = 0; i < selected_schema->n_children; ++i) { - ArrowArray* array = selected_array->children[i]; - ArrowSchema* schema = selected_schema->children[i]; + if (strcmp(selected_schema->children[i]->format, "+l") != 0) { + throw std::runtime_error(std::format( + "[ArrowAdapter][get_table_any_column_by_index] Complex " + "column struct should contain list but found '{}'", + selected_schema->children[i]->format)); + } + + ArrowArray* array = selected_array->children[i]->children[0]; + ArrowSchema* schema = selected_schema->children[i]->children[0]; result.push_back( get_table_any_column(array, schema, offset)); diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc index bd2f263337..a78b070aa1 100644 --- a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -70,9 +70,6 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { std::move(schema), ArrowTable( std::move(index_columns.first), std::move(index_columns.second)), - ArrowTable( - std::move(spatial_columns.first), - std::move(spatial_columns.second)), coord_space, ctx, platform_config, @@ -156,9 +153,6 @@ TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { std::move(schema), ArrowTable( std::move(index_columns.first), std::move(index_columns.second)), - ArrowTable( - std::move(spatial_columns.first), - std::move(spatial_columns.second)), coord_space, ctx, platform_config, From 3063481207321ae486855087b7f9b845965afc80 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 29 Jan 2025 14:10:38 +0200 Subject: [PATCH 02/14] Add GeometryDataframe python implementation --- apis/python/setup.py | 1 + apis/python/src/tiledbsoma/__init__.py | 2 + apis/python/src/tiledbsoma/_dataframe.py | 52 +++- .../src/tiledbsoma/_geometry_dataframe.py | 227 ++++++++++++++++-- apis/python/src/tiledbsoma/_tdb_handles.py | 16 ++ apis/python/src/tiledbsoma/pytiledbsoma.cc | 2 + .../src/tiledbsoma/soma_geometry_dataframe.cc | 141 +++++++++++ apis/python/src/tiledbsoma/soma_object.cc | 5 +- libtiledbsoma/src/utils/arrow_adapter.cc | 97 ++------ 9 files changed, 437 insertions(+), 106 deletions(-) create mode 100644 apis/python/src/tiledbsoma/soma_geometry_dataframe.cc diff --git a/apis/python/setup.py b/apis/python/setup.py index 4b6214911f..8f141e9204 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -320,6 +320,7 @@ def run(self): "src/tiledbsoma/soma_object.cc", "src/tiledbsoma/soma_dataframe.cc", "src/tiledbsoma/soma_point_cloud_dataframe.cc", + "src/tiledbsoma/soma_geometry_dataframe.cc", "src/tiledbsoma/soma_dense_ndarray.cc", "src/tiledbsoma/soma_sparse_ndarray.cc", "src/tiledbsoma/soma_group.cc", diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py index 43da8e9c69..410fc7201d 100644 --- a/apis/python/src/tiledbsoma/__init__.py +++ b/apis/python/src/tiledbsoma/__init__.py @@ -179,6 +179,7 @@ from ._measurement import Measurement from ._multiscale_image import MultiscaleImage from ._point_cloud_dataframe import PointCloudDataFrame +from ._geometry_dataframe import GeometryDataFrame from ._query import ExperimentAxisQuery from ._scene import Scene from ._sparse_nd_array import SparseNDArray, SparseNDArrayRead @@ -209,6 +210,7 @@ "DoesNotExistError", "Experiment", "ExperimentAxisQuery", + "GeometryDataFrame", "get_implementation_version", "get_implementation", "get_SOMA_version", diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 7b8794bc7c..291d92de63 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -27,7 +27,7 @@ from . import _arrow_types, _util from . import pytiledbsoma as clib -from ._constants import SOMA_JOINID +from ._constants import SOMA_GEOMETRY, SOMA_JOINID from ._exception import SOMAError, map_exception_for_create from ._read_iters import ManagedQuery, TableReadIter from ._soma_array import SOMAArray @@ -811,9 +811,20 @@ def _canonicalize_schema( # add SOMA_JOINID schema = schema.append(pa.field(SOMA_JOINID, pa.int64())) + if SOMA_GEOMETRY in schema.names: + geometry_type = schema.field(SOMA_GEOMETRY).type + if geometry_type != pa.binary() and geometry_type != pa.large_binary(): + raise ValueError( + f"{SOMA_GEOMETRY} field must be of type Arrow binary or large_binary but is {geometry_type}" + ) + schema.set(schema.get_field_index(SOMA_GEOMETRY), schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"})) + else: + # add SOMA_GEOMETRY + schema = schema.append(pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"})) + # verify no illegal use of soma_ prefix for field_name in schema.names: - if field_name.startswith("soma_") and field_name != SOMA_JOINID: + if field_name.startswith("soma_") and field_name != SOMA_JOINID and field_name != SOMA_GEOMETRY: raise ValueError( f"DataFrame schema may not contain fields with name prefix ``soma_``: got ``{field_name}``" ) @@ -821,7 +832,7 @@ def _canonicalize_schema( # verify that all index_column_names are present in the schema schema_names_set = set(schema.names) for index_column_name in index_column_names: - if index_column_name.startswith("soma_") and index_column_name != SOMA_JOINID: + if index_column_name.startswith("soma_") and index_column_name != SOMA_JOINID and index_column_name != SOMA_GEOMETRY: raise ValueError( f'index_column_name other than "soma_joinid" must not begin with "soma_"; got "{index_column_name}"' ) @@ -873,7 +884,31 @@ def _fill_out_slot_soma_domain( Returns a boolean for whether the underlying datatype's max range was used. """ saturated_range = False - if slot_domain is not None: + if index_column_name == SOMA_GEOMETRY: + # SOMA_GEOMETRY domain should be either a list on None or a list of tuple[float, float] + lo = [] + hi = [] + if isinstance(slot_domain, list): + finfo: NPFInfo = np.finfo(np.float64) + for axis_domain in slot_domain: + if axis_domain is None: + lo.append(finfo.min) + hi.append(finfo.max) + saturated_range = True + elif not isinstance(axis_domain, tuple) or len(axis_domain) != 2: + raise ValueError( + "Axis domain should be a tuple[float, float]" + ) + else: + if np.issubdtype(type(axis_domain[0]), NPFloating) or np.issubdtype(type(axis_domain[1]), NPFloating): + raise ValueError( + "Axis domain should be a tuple[float, float]" + ) + + lo.append(axis_domain[0]) + hi.append(axis_domain[1]) + slot_domain = lo, hi + elif slot_domain is not None: # User-specified; go with it when possible if ( ( @@ -1007,6 +1042,9 @@ def _find_extent_for_domain( if isinstance(dtype, np.dtype) and dtype.itemsize == 1: extent = 1 + if index_column_name == SOMA_GEOMETRY: + return extent + # Core string dims have no extent and no (core) domain. We return "" here # simply so we can pass libtiledbsoma "" for domain and extent, while it # will (and must) ignore these when creating the TileDB schema. @@ -1056,6 +1094,10 @@ def _revise_domain_for_extent( domain: Tuple[Any, Any], extent: Any, saturated_range: bool ) -> Tuple[Any, Any]: if saturated_range: - return (domain[0], domain[1] - extent) + # Handle SOMA_GEOMETRY domain with is tuple[list[float], list[float]] + if isinstance(domain[1], list): + return (domain[0], [dim_max - extent for dim_max in domain[1]],) + else: + return (domain[0], domain[1] - extent) else: return domain diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py index c41a5b72d3..ddf8088817 100644 --- a/apis/python/src/tiledbsoma/_geometry_dataframe.py +++ b/apis/python/src/tiledbsoma/_geometry_dataframe.py @@ -13,18 +13,31 @@ import pyarrow as pa import somacore from somacore import CoordinateSpace, CoordinateTransform, options +from tiledbsoma._tdb_handles import GeometryDataFrameWrapper +from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context +from tiledbsoma.options._tiledb_create_write_options import TileDBCreateOptions from typing_extensions import Self -from ._constants import SPATIAL_DISCLAIMER -from ._dataframe import Domain +from . import _arrow_types, _util +from . import pytiledbsoma as clib +from ._constants import SOMA_COORDINATE_SPACE_METADATA_KEY, SOMA_GEOMETRY, SOMA_JOINID, SPATIAL_DISCLAIMER +from ._dataframe import Domain, _canonicalize_schema, _fill_out_slot_soma_domain, _find_extent_for_domain, _revise_domain_for_extent from ._read_iters import TableReadIter from ._types import OpenTimestamp from .options import SOMATileDBContext +from ._exception import SOMAError, map_exception_for_create +from ._spatial_dataframe import SpatialDataFrame +from ._spatial_util import ( + coordinate_space_from_json, + coordinate_space_to_json, + process_spatial_df_region, +) + _UNBATCHED = options.BatchSize() -class GeometryDataFrame(somacore.GeometryDataFrame): +class GeometryDataFrame(SpatialDataFrame, somacore.GeometryDataFrame): """A specialized SOMA object for storing complex geometries with spatial indexing. The ``GeometryDataFrame`` class is designed to store and manage geometric shapes @@ -35,7 +48,8 @@ class GeometryDataFrame(somacore.GeometryDataFrame): Experimental. """ - __slots__ = () + __slots__ = ("_coord_space",) + _wrapper_type = GeometryDataFrameWrapper # Lifecycle @@ -85,7 +99,176 @@ def create( Experimental. """ warnings.warn(SPATIAL_DISCLAIMER) - raise NotImplementedError() + + # Get coordinate space axis data. + if isinstance(coordinate_space, CoordinateSpace): + axis_names = tuple(axis.name for axis in coordinate_space) + axis_units = tuple(axis.unit for axis in coordinate_space) + else: + axis_names = tuple(coordinate_space) + axis_units = tuple(len(axis_names) * [None]) + + index_column_names = (SOMA_GEOMETRY, SOMA_JOINID,) + + context = _validate_soma_tiledb_context(context) + schema = _canonicalize_schema(schema, index_column_names) + + # SOMA-to-core mappings: + # + # Before the current-domain feature was enabled (possible after core 2.25): + # + # * SOMA domain <-> core domain, AKA "max domain" which is a name we'll use for clarity + # * core current domain did not exist + # + # After the current-domain feature was enabled: + # + # * SOMA max_domain <-> core domain + # * SOMA domain <-> core current domain + # + # As far as the user is concerned, the SOMA-level domain is the only + # thing they see and care about. Before 2.25 support, it was immutable + # (since it was implemented by core domain). After 2.25 support, it is + # mutable/up-resizeable (since it is implemented by core current domain). + + # At this point shift from API terminology "domain" to specifying a soma_ or core_ + # prefix for these variables. This is crucial to avoid developer confusion. + soma_domain = domain + domain = None + + # Check if domain has the right size (number of axis + 1 for SOMA_JOINID) + + if soma_domain is None: + soma_domain = tuple(None for _ in index_column_names) + else: + ndom = len(soma_domain) + nidx = len(index_column_names) + if ndom != nidx: + raise ValueError( + f"if domain is specified, it must have the same length as " + f"index_column_names; got {ndom} != {nidx}" + ) + + mutable_soma_domain = list(soma_domain) + soma_geometry_domain = mutable_soma_domain[index_column_names.index(SOMA_GEOMETRY)] + + if soma_geometry_domain is None: + soma_geometry_domain = [None for _ in axis_names] + elif not isinstance(soma_geometry_domain, list): + raise ValueError(f"'{SOMA_GEOMETRY}' domain should be a list of tuple[float, float]") + elif len(soma_geometry_domain) != len(axis_names): + raise ValueError(f"Dimension mishmatch between '{SOMA_GEOMETRY}' domain and coordinate system") + + mutable_soma_domain[index_column_names.index(SOMA_GEOMETRY)] = soma_geometry_domain + soma_domain = tuple(mutable_soma_domain) + + index_column_schema = [] + index_column_data = {} + + for index_column_name, slot_soma_domain in zip(index_column_names, soma_domain): + pa_field = schema.field(index_column_name) + dtype = _arrow_types.tiledb_type_from_arrow_type( + pa_field.type, is_indexed_column=True + ) + + (slot_core_current_domain, saturated_cd) = _fill_out_slot_soma_domain( + slot_soma_domain, False, index_column_name, pa_field.type, dtype + ) + + if index_column_name == SOMA_GEOMETRY: + (slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain( + [None for _ in axis_names], True, index_column_name, pa_field.type, dtype + ) + else: + (slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain( + None, True, index_column_name, pa_field.type, dtype + ) + + extent = _find_extent_for_domain( + index_column_name, + TileDBCreateOptions.from_platform_config(platform_config), + dtype, + slot_core_current_domain, + ) + + # Necessary to avoid core array-creation error "Reduce domain max by + # 1 tile extent to allow for expansion." + slot_core_current_domain = _revise_domain_for_extent( + slot_core_current_domain, extent, saturated_cd + ) + slot_core_max_domain = _revise_domain_for_extent( + slot_core_max_domain, extent, saturated_md + ) + + # Here is our Arrow data API for communicating schema info between + # Python/R and C++ libtiledbsoma: + # + # [0] core max domain lo + # [1] core max domain hi + # [2] core extent parameter + # If present, these next two signal to use the current-domain feature: + # [3] core current domain lo + # [4] core current domain hi + if index_column_name == SOMA_GEOMETRY: + # SOMA_GEOMETRY has a specific schema + index_column_schema.append(pa.field(pa_field.name, pa.struct({axis:pa.list_(pa.float64()) for axis in axis_names}), nullable=True)) + struct = [] + for idx, axis in enumerate(axis_names): + struct.append((axis, + [ + *[float(max_domain[idx]) for max_domain in slot_core_max_domain], + extent, + *[float(current_domain[idx]) for current_domain in slot_core_current_domain] + ] + )) + + index_column_data[pa_field.name] = [struct, None, None, None, None] + else: + index_column_schema.append(pa_field) + index_column_data[pa_field.name] = [ + *slot_core_max_domain, + extent, + *slot_core_current_domain, + ] + + index_column_info = pa.RecordBatch.from_pydict( + index_column_data, schema=pa.schema(index_column_schema) + ) + + plt_cfg = _util.build_clib_platform_config(platform_config) + timestamp_ms = context._open_timestamp_ms(tiledb_timestamp) + try: + clib.SOMAGeometryDataFrame.create( + uri, + schema=schema, + index_column_info=index_column_info, + axis_names=axis_names, + axis_units=axis_units, + ctx=context.native_context, + platform_config=plt_cfg, + timestamp=(0, timestamp_ms), + ) + except SOMAError as e: + raise map_exception_for_create(e, uri) from None + + return cls( + cls._wrapper_type.open(uri, "w", context, tiledb_timestamp), + _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code", + ) + + def __init__( + self, + handle: GeometryDataFrameWrapper, + **kwargs: Any, + ): + super().__init__(handle, **kwargs) + + # Get and validate coordinate space. + try: + coord_space = self.metadata[SOMA_COORDINATE_SPACE_METADATA_KEY] + except KeyError as ke: + raise SOMAError("Missing coordinate space metadata") from ke + self._coord_space = coordinate_space_from_json(coord_space) + # Data operations @@ -196,24 +379,6 @@ def write( # Metadata operations - @property - def schema(self) -> pa.Schema: - """The schema of the data in this dataframe. - - Lifecycle: - Experimental. - """ - raise NotImplementedError() - - @property - def index_column_names(self) -> Tuple[str, ...]: - """The names of the index (dimension) columns. - - Lifecycle: - Experimental. - """ - raise NotImplementedError() - @property def axis_names(self) -> Tuple[str, ...]: """The names of the axes of the coordinate space the data is defined on. @@ -221,7 +386,7 @@ def axis_names(self) -> Tuple[str, ...]: Lifecycle: Experimental. """ - raise NotImplementedError() + return self._coord_space.axis_names @property def coordinate_space(self) -> CoordinateSpace: @@ -230,7 +395,7 @@ def coordinate_space(self) -> CoordinateSpace: Lifecycle: Experimental. """ - raise NotImplementedError() + return self._coord_space @coordinate_space.setter def coordinate_space(self, value: CoordinateSpace) -> None: @@ -239,7 +404,17 @@ def coordinate_space(self, value: CoordinateSpace) -> None: Lifecycle: Experimental. """ - raise NotImplementedError() + if self._coord_space is not None: + if value.axis_names != self._coord_space.axis_names: + raise ValueError( + f"Cannot change axis names of a geometry dataframe. Existing " + f"axis names are {self._coord_space.axis_names}. New coordinate " + f"space has axis names {value.axis_names}." + ) + self.metadata[SOMA_COORDINATE_SPACE_METADATA_KEY] = coordinate_space_to_json( + value + ) + self._coord_space = value @property def domain(self) -> Tuple[Tuple[Any, Any], ...]: diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 9934a1e5fb..81ecc76fe9 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -45,6 +45,7 @@ clib.SOMAArray, clib.SOMADataFrame, clib.SOMAPointCloudDataFrame, + clib.SOMAGeometryDataFrame, clib.SOMASparseNDArray, clib.SOMADenseNDArray, clib.SOMAGroup, @@ -77,6 +78,7 @@ def open( "somagroup": clib.SOMAGroup.open, "somadataframe": clib.SOMADataFrame.open, "somapointclouddataframe": clib.SOMAPointCloudDataFrame.open, + "somageometrydataframe": clib.SOMAGeometryDataFrame.open, "somadensendarray": clib.SOMADenseNDArray.open, "somasparsendarray": clib.SOMASparseNDArray.open, "somacollection": clib.SOMACollection.open, @@ -105,6 +107,7 @@ def open( _type_to_class = { "somadataframe": DataFrameWrapper, "somapointclouddataframe": PointCloudDataFrameWrapper, + "somageometrydataframe": GeometryDataFrameWrapper, "somadensendarray": DenseNDArrayWrapper, "somasparsendarray": SparseNDArrayWrapper, "somacollection": CollectionWrapper, @@ -638,6 +641,19 @@ def write(self, values: pa.RecordBatch) -> None: self._handle.write(values) +class GeometryDataFrameWrapper(SOMAArrayWrapper[clib.SOMAGeometryDataFrame]): + """Wrapper around a Pybind11 SOMAGeometryDataFrame handle.""" + + _WRAPPED_TYPE = clib.SOMAGeometryDataFrame + + @property + def count(self) -> int: + return int(self._handle.count) + + def write(self, values: pa.RecordBatch) -> None: + self._handle.write(values) + + class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): """Wrapper around a Pybind11 DenseNDArrayWrapper handle.""" diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index 8d5514f7b4..3f3e1a5090 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -21,6 +21,7 @@ void load_soma_object(py::module&); void load_soma_array(py::module&); void load_soma_dataframe(py::module&); void load_soma_point_cloud_dataframe(py::module&); +void load_soma_geometry_dataframe(py::module&); void load_soma_dense_ndarray(py::module&); void load_soma_sparse_ndarray(py::module&); void load_soma_group(py::module&); @@ -166,6 +167,7 @@ PYBIND11_MODULE(pytiledbsoma, m) { load_soma_dense_ndarray(m); load_soma_sparse_ndarray(m); load_soma_point_cloud_dataframe(m); + load_soma_geometry_dataframe(m); load_soma_group(m); load_soma_collection(m); load_query_condition(m); diff --git a/apis/python/src/tiledbsoma/soma_geometry_dataframe.cc b/apis/python/src/tiledbsoma/soma_geometry_dataframe.cc new file mode 100644 index 0000000000..157526697c --- /dev/null +++ b/apis/python/src/tiledbsoma/soma_geometry_dataframe.cc @@ -0,0 +1,141 @@ +/** + * @file soma_point_cloud_dataframe.cc + * + * @section LICENSE + * + * Licensed under the MIT License. + * Copyright (c) TileDB, Inc. and The Chan Zuckerberg Initiative Foundation + * + * @section DESCRIPTION + * + * This file defines the SOMAGeometryDataFrame bindings. + */ + +#include +#include +#include +#include +#include + +#include + +#include "common.h" + +namespace libtiledbsomacpp { + +namespace py = pybind11; +using namespace py::literals; +using namespace tiledbsoma; + +void load_soma_geometry_dataframe(py::module& m) { + py::class_( + m, "SOMAGeometryDataFrame") + + .def_static( + "create", + [](std::string_view uri, + py::object py_schema, + py::object index_column_info, + std::vector axis_names, + std::vector> axis_units, + std::shared_ptr context, + PlatformConfig platform_config, + std::optional> timestamp) { + ArrowSchema schema; + uintptr_t schema_ptr = (uintptr_t)(&schema); + py_schema.attr("_export_to_c")(schema_ptr); + + // Please see + // https://github.com/single-cell-data/TileDB-SOMA/issues/2869 + // for the reasoning here. + // + // TL;DR: + // * The table has an `ArrowSchema`; each of its children + // is also an `ArrowSchema`. + // * Arrow fields are nullable by default in the user API. + // * There is a field-level nullability flag, _and_ users + // can set a "nullable" metadata as well. + // * In the absence of metadata, respect the flag we get. + // * In the present of metdata with "nullable", let that + // override. + + auto metadata = py_schema.attr("metadata"); + if (py::hasattr(metadata, "get")) { + for (int64_t i = 0; i < schema.n_children; ++i) { + auto child = schema.children[i]; + auto val = metadata.attr("get")( + py::str(child->name).attr("encode")("utf-8")); + + if (!val.is(py::none()) && + val.cast() == "nullable") { + child->flags |= ARROW_FLAG_NULLABLE; + } + } + } + + ArrowSchema index_column_schema; + ArrowArray index_column_array; + uintptr_t + index_column_schema_ptr = (uintptr_t)(&index_column_schema); + uintptr_t + index_column_array_ptr = (uintptr_t)(&index_column_array); + index_column_info.attr("_export_to_c")( + index_column_array_ptr, index_column_schema_ptr); + + SOMACoordinateSpace coord_space{axis_names, axis_units}; + + try { + SOMAGeometryDataFrame::create( + uri, + std::make_unique(schema), + ArrowTable( + std::make_unique(index_column_array), + std::make_unique(index_column_schema)), + coord_space, + context, + platform_config, + timestamp); + } catch (const std::out_of_range& e) { + throw py::type_error(e.what()); + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } + schema.release(&schema); + }, + "uri"_a, + py::kw_only(), + "schema"_a, + "index_column_info"_a, + "axis_names"_a, + "axis_units"_a, + "ctx"_a, + "platform_config"_a, + "timestamp"_a = py::none()) + + .def_static( + "open", + py::overload_cast< + std::string_view, + OpenMode, + std::shared_ptr, + std::vector, + ResultOrder, + std::optional>>( + &SOMAGeometryDataFrame::open), + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "column_names"_a = py::tuple(), + "result_order"_a = ResultOrder::automatic, + "timestamp"_a = py::none()) + + .def_static("exists", &SOMAGeometryDataFrame::exists) + .def_property_readonly( + "index_column_names", &SOMAGeometryDataFrame::index_column_names) + .def_property_readonly( + "count", + &SOMAGeometryDataFrame::count, + py::call_guard()); +} +} // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 288a263856..afbbb7ca6e 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -68,9 +68,12 @@ void load_soma_object(py::module& m) { if (soma_obj_type == "somadataframe") return py::cast(dynamic_cast(*soma_obj)); - if (soma_obj_type == "somapointclouddataframe") + else if (soma_obj_type == "somapointclouddataframe") return py::cast( dynamic_cast(*soma_obj)); + else if (soma_obj_type == "somageometrydataframe") + return py::cast( + dynamic_cast(*soma_obj)); else if (soma_obj_type == "somasparsendarray") return py::cast( dynamic_cast(*soma_obj)); diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index ef896172d9..827730fe49 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -879,7 +879,8 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( const ArrowTable& index_column_info, std::string soma_type, bool is_sparse, - PlatformConfig platform_config) { + PlatformConfig platform_config, + const ArrowTable& spatial_column_info) { auto& index_column_array = index_column_info.first; auto& index_column_schema = index_column_info.second; @@ -939,12 +940,13 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( if (strcmp(child->name, index_column_schema->children[i]->name) == 0) { if (strcmp(child->name, SOMA_GEOMETRY_COLUMN_NAME.c_str()) == - 0) { + 0 && + spatial_column_info.first.get() != nullptr) { columns.push_back(SOMAGeometryColumn::create( ctx, child, - index_column_schema->children[i], - index_column_array->children[i], + spatial_column_info.second.get(), + spatial_column_info.first.get(), soma_type, type_metadata, platform_config)); @@ -1051,28 +1053,23 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - column->set_current_domain_slot( - ndrect, - get_table_any_column_by_name<2>( - index_column_info, column->name(), 3)); - - // if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { - // std::vector cdslot; - // for (int64_t j = 0; j < spatial_column_info.first->n_children; - // ++j) { - // cdslot.push_back(ArrowAdapter::get_table_any_column<2>( - // spatial_column_info.first->children[j], - // spatial_column_info.second->children[j], - // 3)); - // } - - // column->set_current_domain_slot(ndrect, cdslot); - // } else { - // column->set_current_domain_slot( - // ndrect, - // get_table_any_column_by_name<2>( - // index_column_info, column->name(), 3)); - // } + if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { + std::vector cdslot; + for (int64_t j = 0; j < spatial_column_info.first->n_children; + ++j) { + cdslot.push_back(ArrowAdapter::get_table_any_column<2>( + spatial_column_info.first->children[j], + spatial_column_info.second->children[j], + 3)); + } + + column->set_current_domain_slot(ndrect, cdslot); + } else { + column->set_current_domain_slot( + ndrect, + get_table_any_column_by_name<2>( + index_column_info, column->name(), 3)); + } } current_domain.set_ndrectangle(ndrect); @@ -1124,54 +1121,6 @@ Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema( return dim; } -Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( - std::shared_ptr ctx, - ArrowSchema* schema, - ArrowArray* array, - std::string soma_type, - std::string_view type_metadata, - std::string prefix, - std::string suffix, - PlatformConfig platform_config) { - if (strcmp(schema->format, "+l") != 0) { - throw TileDBSOMAError( - std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " - "should be of type list.")); - } - - if (schema->n_children != 1) { - throw TileDBSOMAError( - std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " - "should have exactly 1 child")); - } - - auto type = ArrowAdapter::to_tiledb_format( - schema->children[0]->format, type_metadata); - - if (ArrowAdapter::arrow_is_var_length_type(schema->format)) { - type = TILEDB_STRING_ASCII; - } - - auto col_name = prefix + std::string(schema->name) + suffix; - - FilterList filter_list = ArrowAdapter::_create_dim_filter_list( - col_name, platform_config, soma_type, ctx); - - if (array->length != 5) { - throw TileDBSOMAError(std::format( - "ArrowAdapter: unexpected length {} != 5 for name " - "'{}'", - array->length, - col_name)); - } - - const void* buff = array->children[0]->buffers[1]; - auto dim = ArrowAdapter::_create_dim(type, col_name, buff, ctx); - dim.set_filter_list(filter_list); - - return dim; -} - std::pair> ArrowAdapter::tiledb_attribute_from_arrow_schema( std::shared_ptr ctx, From 623e5088aa5c21c8ff2746b1df00df785dff73b7 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 29 Jan 2025 17:23:39 +0200 Subject: [PATCH 03/14] Update schema generation --- libtiledbsoma/src/utils/arrow_adapter.cc | 97 ++++++++++++++++++------ 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 827730fe49..ef896172d9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -879,8 +879,7 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( const ArrowTable& index_column_info, std::string soma_type, bool is_sparse, - PlatformConfig platform_config, - const ArrowTable& spatial_column_info) { + PlatformConfig platform_config) { auto& index_column_array = index_column_info.first; auto& index_column_schema = index_column_info.second; @@ -940,13 +939,12 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( if (strcmp(child->name, index_column_schema->children[i]->name) == 0) { if (strcmp(child->name, SOMA_GEOMETRY_COLUMN_NAME.c_str()) == - 0 && - spatial_column_info.first.get() != nullptr) { + 0) { columns.push_back(SOMAGeometryColumn::create( ctx, child, - spatial_column_info.second.get(), - spatial_column_info.first.get(), + index_column_schema->children[i], + index_column_array->children[i], soma_type, type_metadata, platform_config)); @@ -1053,23 +1051,28 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { - std::vector cdslot; - for (int64_t j = 0; j < spatial_column_info.first->n_children; - ++j) { - cdslot.push_back(ArrowAdapter::get_table_any_column<2>( - spatial_column_info.first->children[j], - spatial_column_info.second->children[j], - 3)); - } - - column->set_current_domain_slot(ndrect, cdslot); - } else { - column->set_current_domain_slot( - ndrect, - get_table_any_column_by_name<2>( - index_column_info, column->name(), 3)); - } + column->set_current_domain_slot( + ndrect, + get_table_any_column_by_name<2>( + index_column_info, column->name(), 3)); + + // if (column->name() == SOMA_GEOMETRY_COLUMN_NAME) { + // std::vector cdslot; + // for (int64_t j = 0; j < spatial_column_info.first->n_children; + // ++j) { + // cdslot.push_back(ArrowAdapter::get_table_any_column<2>( + // spatial_column_info.first->children[j], + // spatial_column_info.second->children[j], + // 3)); + // } + + // column->set_current_domain_slot(ndrect, cdslot); + // } else { + // column->set_current_domain_slot( + // ndrect, + // get_table_any_column_by_name<2>( + // index_column_info, column->name(), 3)); + // } } current_domain.set_ndrectangle(ndrect); @@ -1121,6 +1124,54 @@ Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema( return dim; } +Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + std::string soma_type, + std::string_view type_metadata, + std::string prefix, + std::string suffix, + PlatformConfig platform_config) { + if (strcmp(schema->format, "+l") != 0) { + throw TileDBSOMAError( + std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " + "should be of type list.")); + } + + if (schema->n_children != 1) { + throw TileDBSOMAError( + std::format("[tiledb_dimension_from_arrow_schema_ext] Schema " + "should have exactly 1 child")); + } + + auto type = ArrowAdapter::to_tiledb_format( + schema->children[0]->format, type_metadata); + + if (ArrowAdapter::arrow_is_var_length_type(schema->format)) { + type = TILEDB_STRING_ASCII; + } + + auto col_name = prefix + std::string(schema->name) + suffix; + + FilterList filter_list = ArrowAdapter::_create_dim_filter_list( + col_name, platform_config, soma_type, ctx); + + if (array->length != 5) { + throw TileDBSOMAError(std::format( + "ArrowAdapter: unexpected length {} != 5 for name " + "'{}'", + array->length, + col_name)); + } + + const void* buff = array->children[0]->buffers[1]; + auto dim = ArrowAdapter::_create_dim(type, col_name, buff, ctx); + dim.set_filter_list(filter_list); + + return dim; +} + std::pair> ArrowAdapter::tiledb_attribute_from_arrow_schema( std::shared_ptr ctx, From 033fe33b2b2029ad0e1c3988ba2637263f7bb8f0 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 4 Feb 2025 19:43:12 +0200 Subject: [PATCH 04/14] Pass coordinate space down to geometry column during schema creation, decouple arrow schema between column schema and column domain --- libtiledbsoma/src/soma/soma_array.cc | 39 ++--------- libtiledbsoma/src/soma/soma_array.h | 9 ++- libtiledbsoma/src/soma/soma_attribute.cc | 9 ++- libtiledbsoma/src/soma/soma_attribute.h | 7 +- libtiledbsoma/src/soma/soma_column.cc | 32 +++++++-- libtiledbsoma/src/soma/soma_column.h | 13 ++-- libtiledbsoma/src/soma/soma_dataframe.cc | 2 + libtiledbsoma/src/soma/soma_dense_ndarray.cc | 2 + libtiledbsoma/src/soma/soma_dimension.cc | 47 ++++++++++---- libtiledbsoma/src/soma/soma_dimension.h | 7 +- .../src/soma/soma_geometry_column.cc | 65 ++++++++++++++++--- libtiledbsoma/src/soma/soma_geometry_column.h | 22 +++++-- .../src/soma/soma_geometry_dataframe.cc | 2 + .../src/soma/soma_point_cloud_dataframe.cc | 1 + libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 2 + libtiledbsoma/src/utils/arrow_adapter.cc | 18 ++--- libtiledbsoma/src/utils/arrow_adapter.h | 8 ++- libtiledbsoma/test/unit_soma_array.cc | 2 +- libtiledbsoma/test/unit_soma_column.cc | 55 +++++++--------- 19 files changed, 219 insertions(+), 123 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 9020c311d7..4e8b20071f 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -224,37 +224,9 @@ void SOMAArray::fill_columns() { TILEDB_SOMA_SCHEMA_KEY, uri())); } - - auto soma_schema_extension_raw = get_metadata(TILEDB_SOMA_SCHEMA_KEY) - .value(); - auto data = static_cast( - std::get<2>(soma_schema_extension_raw)); - auto soma_schema_extension = data != nullptr ? - nlohmann::json::parse(std::string( - data, - std::get<1>( - soma_schema_extension_raw))) : - nlohmann::json::object(); - - if (!soma_schema_extension.contains(TILEDB_SOMA_SCHEMA_COL_KEY)) { - throw TileDBSOMAError(std::format( - "[SOMAArray][fill_columns] Missing '{}' key from '{}'", - TILEDB_SOMA_SCHEMA_COL_KEY, - TILEDB_SOMA_SCHEMA_KEY)); - } - - columns_ = SOMAColumn::deserialize( - soma_schema_extension.value( - TILEDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), - *ctx_->tiledb_ctx(), - *arr_); - - } else { - // Non-geometry dataframes have trivially constructible columns and do - // not require a schema - columns_ = SOMAColumn::deserialize( - nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_); } + + columns_ = SOMAColumn::deserialize(*ctx_->tiledb_ctx(), *arr_, metadata_); } const std::string SOMAArray::uri() const { @@ -509,11 +481,12 @@ ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) { for (const auto& column : columns_ | std::views::filter( [](const auto& col) { return col->isIndexColumn(); })) { - arrow_schema->children[child_index] = column->arrow_schema_slot( - *ctx_, *arr_); - arrow_array->children[child_index] = column->arrow_domain_slot( + auto kind_domain_slot = column->arrow_domain_slot( *ctx_, *arr_, which_kind); + arrow_array->children[child_index] = kind_domain_slot.first; + arrow_schema->children[child_index] = kind_domain_slot.second; + ++child_index; } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 57a745334b..bb277d5a35 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -624,8 +624,13 @@ class SOMAArray : public SOMAObject { * @return std::unique_ptr Schema */ std::unique_ptr arrow_schema() const { - return ArrowAdapter::arrow_schema_from_tiledb_array( - ctx_->tiledb_ctx(), arr_); + auto schema = ArrowAdapter::make_arrow_schema_parent(columns_.size()); + + for (size_t i = 0; i < columns_.size(); ++i) { + schema->children[i] = columns_[i]->arrow_schema_slot(*ctx_, *arr_); + } + + return schema; } /** diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index 2c57768abf..2bb048797e 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -15,7 +15,10 @@ namespace tiledbsoma { std::shared_ptr SOMAAttribute::deserialize( - const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, + const Context& ctx, + const Array& array, + const std::map&) { if (!soma_schema.contains(TILEDB_SOMA_SCHEMA_COL_ATTR_KEY)) { throw TileDBSOMAError( "[SOMAAttribute][deserialize] Missing required field " @@ -136,7 +139,7 @@ std::any SOMAAttribute::_core_current_domain_slot(NDRectangle&) const { name())); } -ArrowArray* SOMAAttribute::arrow_domain_slot( +std::pair SOMAAttribute::arrow_domain_slot( const SOMAContext&, Array&, enum Domainish) const { throw TileDBSOMAError(std::format( "[SOMAAttribute][arrow_domain_slot] Column with name {} is not an " @@ -145,7 +148,7 @@ ArrowArray* SOMAAttribute::arrow_domain_slot( } ArrowSchema* SOMAAttribute::arrow_schema_slot( - const SOMAContext& ctx, Array& array) { + const SOMAContext& ctx, Array& array) const { return ArrowAdapter::arrow_schema_from_tiledb_attribute( attribute, *ctx.tiledb_ctx(), array) .release(); diff --git a/libtiledbsoma/src/soma/soma_attribute.h b/libtiledbsoma/src/soma/soma_attribute.h index ef6b6c23f3..6fcbbd88fd 100644 --- a/libtiledbsoma/src/soma/soma_attribute.h +++ b/libtiledbsoma/src/soma/soma_attribute.h @@ -35,7 +35,8 @@ class SOMAAttribute : public SOMAColumn { static std::shared_ptr deserialize( const nlohmann::json& soma_schema, const Context& ctx, - const Array& array); + const Array& array, + const std::map& metadata); /** * Create a ``SOMAAttribute`` shared pointer from an Arrow schema @@ -96,13 +97,13 @@ class SOMAAttribute : public SOMAColumn { return std::vector({enumeration.value()}); } - ArrowArray* arrow_domain_slot( + std::pair arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish kind) const override; ArrowSchema* arrow_schema_slot( - const SOMAContext& ctx, Array& array) override; + const SOMAContext& ctx, Array& array) const override; void serialize(nlohmann::json&) const override; diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index 3062eabf6c..7f41986ff6 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -26,22 +26,46 @@ std::map SOMAColumn::deserialiser_map = { SOMAGeometryColumn::deserialize}}; std::vector> SOMAColumn::deserialize( - const nlohmann::json& soma_schema_columns, const Context& ctx, - const Array& array) { + const Array& array, + const std::map& metadata) { std::vector> columns; + nlohmann::json soma_schema_columns = nlohmann::json::array(); + + if (metadata.contains(TILEDB_SOMA_SCHEMA_KEY)) { + auto soma_schema_extension_raw = metadata.at(TILEDB_SOMA_SCHEMA_KEY); + auto data = static_cast( + std::get<2>(soma_schema_extension_raw)); + auto soma_schema_extension = data != nullptr ? + nlohmann::json::parse(std::string( + data, + std::get<1>( + soma_schema_extension_raw))) : + nlohmann::json::object(); + + if (!soma_schema_extension.contains(TILEDB_SOMA_SCHEMA_COL_KEY)) { + throw TileDBSOMAError(std::format( + "[SOMAArray][fill_columns] Missing '{}' key from '{}'", + TILEDB_SOMA_SCHEMA_COL_KEY, + TILEDB_SOMA_SCHEMA_KEY)); + } + + soma_schema_columns = soma_schema_extension.value( + TILEDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()); + } + if (!soma_schema_columns.empty()) { for (auto& column : soma_schema_columns) { auto type = column[TILEDB_SOMA_SCHEMA_COL_TYPE_KEY] .template get(); - auto col = deserialiser_map[type](column, ctx, array); + auto col = deserialiser_map[type](column, ctx, array, metadata); if (col) { // Deserialized column can be null in case the array is modified // and the column no longer exists. - columns.push_back(deserialiser_map[type](column, ctx, array)); + columns.push_back(col); } } diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 397759f696..4610a32d22 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -40,9 +40,9 @@ class SOMAColumn { //=================================================================== static std::vector> deserialize( - const nlohmann::json& soma_schema, const Context& ctx, - const Array& array); + const Array& array, + const std::map& metadata); //=================================================================== //= public non-static @@ -121,7 +121,7 @@ class SOMAColumn { * @param array * @param which_kind */ - virtual ArrowArray* arrow_domain_slot( + virtual std::pair arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish which_kind) const = 0; @@ -133,7 +133,7 @@ class SOMAColumn { * @param array */ virtual ArrowSchema* arrow_schema_slot( - const SOMAContext& ctx, Array& array) = 0; + const SOMAContext& ctx, Array& array) const = 0; /** * Get the domain kind of the SOMAColumn. @@ -531,7 +531,10 @@ class SOMAColumn { private: typedef std::shared_ptr (*Factory)( - const nlohmann::json&, const Context&, const Array&); + const nlohmann::json&, + const Context&, + const Array&, + const std::map&); static std::map deserialiser_map; }; diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 1a1eff4136..3ab970e130 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -14,6 +14,7 @@ #include "soma_dataframe.h" #include #include "../utils/logger.h" +#include "soma_coordinates.h" namespace tiledbsoma { using namespace tiledb; @@ -34,6 +35,7 @@ void SOMADataFrame::create( ctx->tiledb_ctx(), schema, index_columns, + std::nullopt, "SOMADataFrame", true, platform_config); diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 76ec8444ad..04f534f640 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -11,6 +11,7 @@ * This file defines the SOMADenseNDArray class. */ #include "soma_dense_ndarray.h" +#include "soma_coordinates.h" namespace tiledbsoma { using namespace tiledb; @@ -65,6 +66,7 @@ void SOMADenseNDArray::create( ctx->tiledb_ctx(), schema, index_columns, + {}, "SOMADenseNDArray", false, platform_config); diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index d87fe9028f..2ae71923c2 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -17,7 +17,10 @@ namespace tiledbsoma { std::shared_ptr SOMADimension::deserialize( - const nlohmann::json& soma_schema, const Context&, const Array& array) { + const nlohmann::json& soma_schema, + const Context&, + const Array& array, + const std::map&) { if (!soma_schema.contains(TILEDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMADimension][deserialize] Missing required field " @@ -949,8 +952,10 @@ std::any SOMADimension::_core_current_domain_slot(NDRectangle& ndrect) const { } } -ArrowArray* SOMADimension::arrow_domain_slot( +std::pair SOMADimension::arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish kind) const { + ArrowArray* arrow_array; + switch (domain_type().value()) { case TILEDB_DATETIME_YEAR: case TILEDB_DATETIME_MONTH: @@ -975,39 +980,50 @@ ArrowArray* SOMADimension::arrow_domain_slot( case TILEDB_TIME_FS: case TILEDB_TIME_AS: case TILEDB_INT64: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_UINT64: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_INT32: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_UINT32: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_INT16: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_UINT16: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_INT8: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_UINT8: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_FLOAT64: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_FLOAT32: - return ArrowAdapter::make_arrow_array_child( + arrow_array = ArrowAdapter::make_arrow_array_child( domain_slot(ctx, array, kind)); + break; case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - return ArrowAdapter::make_arrow_array_child_string( + arrow_array = ArrowAdapter::make_arrow_array_child_string( domain_slot(ctx, array, kind)); + break; default: throw TileDBSOMAError(std::format( "[SOMADimension][arrow_domain_slot] dim {} has unhandled " @@ -1016,9 +1032,12 @@ ArrowArray* SOMADimension::arrow_domain_slot( name(), tiledb::impl::type_to_str(domain_type().value()))); } + + return std::make_pair(arrow_array, arrow_schema_slot(ctx, array)); } -ArrowSchema* SOMADimension::arrow_schema_slot(const SOMAContext&, Array&) { +ArrowSchema* SOMADimension::arrow_schema_slot( + const SOMAContext&, Array&) const { return ArrowAdapter::arrow_schema_from_tiledb_dimension(dimension) .release(); } diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h index abf512c8fe..97584c658d 100644 --- a/libtiledbsoma/src/soma/soma_dimension.h +++ b/libtiledbsoma/src/soma/soma_dimension.h @@ -36,7 +36,8 @@ class SOMADimension : public SOMAColumn { static std::shared_ptr deserialize( const nlohmann::json& soma_schema, const Context& ctx, - const Array& array); + const Array& array, + const std::map& metadata); static std::shared_ptr create( std::shared_ptr ctx, @@ -89,13 +90,13 @@ class SOMADimension : public SOMAColumn { return std::nullopt; } - ArrowArray* arrow_domain_slot( + std::pair arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish kind) const override; ArrowSchema* arrow_schema_slot( - const SOMAContext& ctx, Array& array) override; + const SOMAContext& ctx, Array& array) const override; void serialize(nlohmann::json&) const override; diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index 1924107006..f9cc03e14f 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -15,7 +15,10 @@ namespace tiledbsoma { std::shared_ptr SOMAGeometryColumn::deserialize( - const nlohmann::json& soma_schema, const Context&, const Array& array) { + const nlohmann::json& soma_schema, + const Context&, + const Array& array, + const std::map& metadata) { if (!soma_schema.contains(TILEDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMAGeometryColumn][deserialize] Missing required field " @@ -58,7 +61,19 @@ std::shared_ptr SOMAGeometryColumn::deserialize( auto attribute = array.schema().attribute(attribute_names[0]); - return std::make_shared(dimensions, attribute); + if (!metadata.contains(SOMA_COORDINATE_SPACE_KEY)) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn][deserialize] Missing required '{}' " + "metadata key.", + SOMA_COORDINATE_SPACE_KEY)); + } + + auto coordinate_space = std::apply( + SOMACoordinateSpace::from_metadata, + metadata.at(SOMA_COORDINATE_SPACE_KEY)); + + return std::make_shared( + dimensions, attribute, coordinate_space); } std::shared_ptr SOMAGeometryColumn::create( @@ -66,6 +81,7 @@ std::shared_ptr SOMAGeometryColumn::create( ArrowSchema* schema, ArrowSchema* spatial_schema, ArrowArray* spatial_array, + const SOMACoordinateSpace& coordinate_space, const std::string& soma_type, std::string_view type_metadata, PlatformConfig platform_config) { @@ -107,7 +123,7 @@ std::shared_ptr SOMAGeometryColumn::create( ctx, schema, type_metadata, platform_config); return std::make_shared( - SOMAGeometryColumn(dims, attribute.first)); + SOMAGeometryColumn(dims, attribute.first, coordinate_space)); } void SOMAGeometryColumn::_set_dim_points( @@ -464,13 +480,44 @@ std::any SOMAGeometryColumn::_core_current_domain_slot( std::make_pair(min, max)); } -ArrowArray* SOMAGeometryColumn::arrow_domain_slot( +std::pair SOMAGeometryColumn::arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish kind) const { switch (domain_type().value()) { - case TILEDB_FLOAT64: - return ArrowAdapter::make_arrow_array_child_var( - domain_slot>(ctx, array, kind)); - break; + case TILEDB_FLOAT64: { + auto parent_schema = ArrowAdapter::make_arrow_schema_parent( + TDB_DIM_PER_SPATIAL_AXIS, name()); + auto parent_array = ArrowAdapter::make_arrow_array_parent( + TDB_DIM_PER_SPATIAL_AXIS); + + auto kind_domain = domain_slot>( + ctx, array, kind); + + for (size_t i = 0; i < TDB_DIM_PER_SPATIAL_AXIS; ++i) { + // Generate coordinate space axie schema + auto child_schema = static_cast( + malloc(sizeof(ArrowSchema))); + child_schema->format = strdup( + ArrowAdapter::to_arrow_format(TILEDB_FLOAT64).data()); + child_schema->name = strdup( + coordinate_space.axis(i).name.c_str()); + child_schema->metadata = nullptr; + child_schema->flags = 0; + child_schema->n_children = 0; + child_schema->children = nullptr; + child_schema->dictionary = nullptr; + child_schema->release = &ArrowAdapter::release_schema; + child_schema->private_data = nullptr; + + parent_schema->children[i] = child_schema; + + parent_array->children[i] = + ArrowAdapter::make_arrow_array_child(std::vector( + {kind_domain.first[i], kind_domain.second[i]})); + } + + return std::make_pair( + parent_array.release(), parent_schema.release()); + } break; default: throw TileDBSOMAError(std::format( "[SOMAGeometryColumn][arrow_domain_slot] dim {} has unhandled " @@ -482,7 +529,7 @@ ArrowArray* SOMAGeometryColumn::arrow_domain_slot( } ArrowSchema* SOMAGeometryColumn::arrow_schema_slot( - const SOMAContext& ctx, Array& array) { + const SOMAContext& ctx, Array& array) const { return ArrowAdapter::arrow_schema_from_tiledb_attribute( attribute, *ctx.tiledb_ctx(), array) .release(); diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h b/libtiledbsoma/src/soma/soma_geometry_column.h index 1b8a0e7624..b9409aeeea 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.h +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -28,6 +28,7 @@ #include #include "soma_column.h" +#include "soma_coordinates.h" namespace tiledbsoma { @@ -43,20 +44,26 @@ class SOMAGeometryColumn : public SOMAColumn { static std::shared_ptr deserialize( const nlohmann::json& soma_schema, const Context& ctx, - const Array& array); + const Array& array, + const std::map&); static std::shared_ptr create( std::shared_ptr ctx, ArrowSchema* schema, ArrowSchema* spatial_schema, ArrowArray* spatial_array, + const SOMACoordinateSpace& coordinate_space, const std::string& soma_type, std::string_view type_metadata, PlatformConfig platform_config); - SOMAGeometryColumn(std::vector dimensions, Attribute attribute) + SOMAGeometryColumn( + std::vector dimensions, + Attribute attribute, + SOMACoordinateSpace coordinate_space) : dimensions(dimensions) - , attribute(attribute){}; + , attribute(attribute) + , coordinate_space(coordinate_space){}; inline std::string name() const override { return SOMA_GEOMETRY_COLUMN_NAME; @@ -97,16 +104,20 @@ class SOMAGeometryColumn : public SOMAColumn { return std::nullopt; } - ArrowArray* arrow_domain_slot( + std::pair arrow_domain_slot( const SOMAContext& ctx, Array& array, enum Domainish kind) const override; ArrowSchema* arrow_schema_slot( - const SOMAContext& ctx, Array& array) override; + const SOMAContext& ctx, Array& array) const override; void serialize(nlohmann::json&) const override; + inline SOMACoordinateSpace get_coordinate_space() const { + return coordinate_space; + } + protected: void _set_dim_points( const std::unique_ptr& query, @@ -148,6 +159,7 @@ class SOMAGeometryColumn : public SOMAColumn { const size_t TDB_DIM_PER_SPATIAL_AXIS = 2; std::vector dimensions; Attribute attribute; + SOMACoordinateSpace coordinate_space; /** * Compute the usable domain limits. If the array has a current domain then diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc index 05fd4b9ecc..b19612e259 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -16,6 +16,7 @@ #include "../geometry/operators/envelope.h" #include "../geometry/operators/io/write.h" #include "../utils/util.h" +#include "soma_geometry_column.h" #include #include @@ -40,6 +41,7 @@ void SOMAGeometryDataFrame::create( ctx->tiledb_ctx(), schema, index_columns, + std::make_optional(coordinate_space), "SOMAGeometryDataFrame", true, platform_config); diff --git a/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc b/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc index 84c331e903..a028e266ab 100644 --- a/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc @@ -36,6 +36,7 @@ void SOMAPointCloudDataFrame::create( ctx->tiledb_ctx(), schema, index_columns, + std::make_optional(coordinate_space), "SOMAPointCloudDataFrame", true, platform_config); diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index 889aa5ce4f..9705712560 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -12,6 +12,7 @@ */ #include "soma_sparse_ndarray.h" +#include "soma_coordinates.h" namespace tiledbsoma { using namespace tiledb; @@ -66,6 +67,7 @@ void SOMASparseNDArray::create( ctx->tiledb_ctx(), schema, index_columns, + {}, "SOMASparseNDArray", true, platform_config); diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index ef896172d9..5f30371448 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -19,10 +19,7 @@ #include "util.h" #include "../soma/soma_attribute.h" -#include "../soma/soma_dimension.h" -#include "../soma/soma_geometry_column.h" - -#include "../soma/soma_attribute.h" +#include "../soma/soma_coordinates.h" #include "../soma/soma_dimension.h" #include "../soma/soma_geometry_column.h" @@ -479,7 +476,7 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_dimension( } std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_attribute( - Attribute& attribute, const Context& ctx, const Array& tiledb_array) { + const Attribute& attribute, const Context& ctx, const Array& tiledb_array) { std::unique_ptr arrow_schema = std::make_unique(); arrow_schema->format = strdup( ArrowAdapter::to_arrow_format(attribute.type()).data()); @@ -502,9 +499,10 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_attribute( "name {}", arrow_schema->format, arrow_schema->name)); - + // We shouldn;t have to cast constness away. Maybe missing const qualifier + // from AttributeExperimental::get_enumeration_name auto enmr_name = AttributeExperimental::get_enumeration_name( - ctx, attribute); + ctx, const_cast(attribute)); if (enmr_name.has_value()) { auto enmr = ArrayExperimental::get_enumeration( ctx, tiledb_array, attribute.name()); @@ -877,6 +875,7 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, const std::unique_ptr& arrow_schema, const ArrowTable& index_column_info, + const std::optional& coordinate_space, std::string soma_type, bool is_sparse, PlatformConfig platform_config) { @@ -945,6 +944,7 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( child, index_column_schema->children[i], index_column_array->children[i], + coordinate_space.value(), soma_type, type_metadata, platform_config)); @@ -1662,10 +1662,10 @@ std::unique_ptr ArrowAdapter::make_arrow_schema( } std::unique_ptr ArrowAdapter::make_arrow_schema_parent( - size_t num_columns) { + size_t num_columns, std::string_view name) { auto arrow_schema = std::make_unique(); arrow_schema->format = strdup("+s"); // structure, i.e. non-leaf node - arrow_schema->name = strdup("parent"); + arrow_schema->name = strdup(name.data()); arrow_schema->metadata = nullptr; arrow_schema->flags = 0; arrow_schema->n_children = static_cast( diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index f94278d4f8..adbafa17a6 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -58,6 +58,7 @@ using namespace tiledb; using json = nlohmann::json; class ColumnBuffer; +class SOMACoordinateSpace; /** * @brief The ArrowBuffer holds a shared pointer to a ColumnBuffer, which @@ -366,7 +367,9 @@ class ArrowAdapter { * @return ArrowSchema */ static std::unique_ptr arrow_schema_from_tiledb_attribute( - Attribute& attribute, const Context& ctx, const Array& tiledb_array); + const Attribute& attribute, + const Context& ctx, + const Array& tiledb_array); /** * @brief Get members of the TileDB Schema in the form of a @@ -406,6 +409,7 @@ class ArrowAdapter { std::shared_ptr ctx, const std::unique_ptr& arrow_schema, const ArrowTable& index_column_info, + const std::optional& coordinate_space, std::string soma_type, bool is_sparse = true, PlatformConfig platform_config = PlatformConfig()); @@ -515,7 +519,7 @@ class ArrowAdapter { * ArrowSchema. This constructs the parent and not the children. */ static std::unique_ptr make_arrow_schema_parent( - size_t num_columns); + size_t num_columns, std::string_view name = "parent"); /** * @brief Creates a nanoarrow ArrowArray which accommodates diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index efb15f0c22..b5f7e92e4b 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -466,7 +466,7 @@ TEST_CASE("SOMAArray: Write and read back Boolean") { schema.add_attribute(attr); schema.set_allows_dups(true); - SOMAArray::create(ctx, uri, std::move(schema), "NONE", ""); + SOMAArray::create(ctx, uri, std::move(schema), "NONE"); auto soma_array = SOMAArray::open(OpenMode::write, uri, ctx); auto arrow_schema = std::make_unique(); diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 36d526ba5c..4a11b83179 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include "../src/soma/soma_attribute.h" #include "../src/soma/soma_column.h" #include "../src/soma/soma_dimension.h" @@ -342,33 +343,30 @@ TEST_CASE_METHOD( ned_str = ArrowAdapter::get_table_string_column_by_name( non_empty_domain, "mystring"); - std::vector - ned_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_non_empty_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector ned_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_non_empty_domain)); ArrowTable soma_domain = sdf->get_soma_domain(); std::vector dom_str = ArrowAdapter::get_table_string_column_by_name( soma_domain, "mystring"); - std::vector - dom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_current_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector dom_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_current_domain)); ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); std::vector maxdom_str = ArrowAdapter::get_table_string_column_by_name( soma_maxdomain, "mystring"); - std::vector - maxdom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector maxdom_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_domain)); REQUIRE(ned_str == std::vector({"", ""})); @@ -501,33 +499,30 @@ TEST_CASE_METHOD( ned_str = ArrowAdapter::get_table_string_column_by_name( non_empty_domain, "mystring"); - std::vector - ned_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_non_empty_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector ned_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_non_empty_domain)); ArrowTable soma_domain = sdf->get_soma_domain(); std::vector dom_str = ArrowAdapter::get_table_string_column_by_name( soma_domain, "mystring"); - std::vector - dom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_current_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector dom_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_current_domain)); ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); std::vector maxdom_str = ArrowAdapter::get_table_string_column_by_name( soma_maxdomain, "mystring"); - std::vector - maxdom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); + std::vector maxdom_str_col = std::apply( + ArrowAdapter::get_array_string_column, + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_domain)); REQUIRE(ned_str == std::vector({"", ""})); From 7aef7c8f2766a4176a85925be5d2d5559bc59178 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 02:42:37 +0200 Subject: [PATCH 05/14] Change geometry domain representation --- .../src/tiledbsoma/_geometry_dataframe.py | 34 +++++-------------- .../src/soma/soma_geometry_column.cc | 9 +++-- libtiledbsoma/src/utils/arrow_adapter.h | 11 ++---- 3 files changed, 18 insertions(+), 36 deletions(-) diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py index ddf8088817..c0feb2a7a0 100644 --- a/apis/python/src/tiledbsoma/_geometry_dataframe.py +++ b/apis/python/src/tiledbsoma/_geometry_dataframe.py @@ -210,18 +210,14 @@ def create( # [4] core current domain hi if index_column_name == SOMA_GEOMETRY: # SOMA_GEOMETRY has a specific schema - index_column_schema.append(pa.field(pa_field.name, pa.struct({axis:pa.list_(pa.float64()) for axis in axis_names}), nullable=True)) - struct = [] - for idx, axis in enumerate(axis_names): - struct.append((axis, - [ - *[float(max_domain[idx]) for max_domain in slot_core_max_domain], - extent, - *[float(current_domain[idx]) for current_domain in slot_core_current_domain] - ] - )) - - index_column_data[pa_field.name] = [struct, None, None, None, None] + index_column_schema.append(pa.field(pa_field.name, pa.struct({axis:pa.float64() for axis in axis_names}))) + index_column_data[pa_field.name] = [ + [(axis, slot_core_max_domain[0][idx]) for idx, axis in enumerate(axis_names)], + [(axis, slot_core_max_domain[1][idx]) for idx, axis in enumerate(axis_names)], + [(axis, extent) for axis in axis_names], + [(axis, slot_core_current_domain[0][idx]) for idx, axis in enumerate(axis_names)], + [(axis, slot_core_current_domain[0][idx]) for idx, axis in enumerate(axis_names)] + ] else: index_column_schema.append(pa_field) index_column_data[pa_field.name] = [ @@ -414,16 +410,4 @@ def coordinate_space(self, value: CoordinateSpace) -> None: self.metadata[SOMA_COORDINATE_SPACE_METADATA_KEY] = coordinate_space_to_json( value ) - self._coord_space = value - - @property - def domain(self) -> Tuple[Tuple[Any, Any], ...]: - """The allowable range of values in each index column. - - Returns: a tuple of minimum and maximum values, inclusive, - storable on each index column of the dataframe. - - Lifecycle: - Experimental. - """ - raise NotImplementedError() + self._coord_space = value \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index f9cc03e14f..b89471b011 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -96,7 +96,7 @@ std::shared_ptr SOMAGeometryColumn::create( } for (int64_t j = 0; j < spatial_schema->n_children; ++j) { - dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( ctx, spatial_schema->children[j], spatial_array->children[j], @@ -108,7 +108,7 @@ std::shared_ptr SOMAGeometryColumn::create( } for (int64_t j = 0; j < spatial_schema->n_children; ++j) { - dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema_ext( + dims.push_back(ArrowAdapter::tiledb_dimension_from_arrow_schema( ctx, spatial_schema->children[j], spatial_array->children[j], @@ -489,6 +489,11 @@ std::pair SOMAGeometryColumn::arrow_domain_slot( auto parent_array = ArrowAdapter::make_arrow_array_parent( TDB_DIM_PER_SPATIAL_AXIS); + parent_array->length = 2; + parent_array->n_buffers = 1; + parent_array->buffers = (const void**)malloc(sizeof(void*)); + parent_array->buffers[0] = nullptr; + auto kind_domain = domain_slot>( ctx, array, kind); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index adbafa17a6..ccfc671ee3 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -934,15 +934,8 @@ class ArrowAdapter { // Complex domain like the ones required by `GeometryColumn` expect // a struct containing lists of values for (int64_t i = 0; i < selected_schema->n_children; ++i) { - if (strcmp(selected_schema->children[i]->format, "+l") != 0) { - throw std::runtime_error(std::format( - "[ArrowAdapter][get_table_any_column_by_index] Complex " - "column struct should contain list but found '{}'", - selected_schema->children[i]->format)); - } - - ArrowArray* array = selected_array->children[i]->children[0]; - ArrowSchema* schema = selected_schema->children[i]->children[0]; + ArrowArray* array = selected_array->children[i]; + ArrowSchema* schema = selected_schema->children[i]; result.push_back( get_table_any_column(array, schema, offset)); From 30a147ceced82ed73d4a183c235e62387dcebfae Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 15:00:40 +0200 Subject: [PATCH 06/14] Update unit test and C++ geometry index column generation --- libtiledbsoma/test/common.cc | 73 ++++++++++++----- libtiledbsoma/test/common.h | 6 +- .../test/unit_soma_geometry_dataframe.cc | 82 ++++--------------- 3 files changed, 74 insertions(+), 87 deletions(-) diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index 6c75251de5..8176c56022 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -17,7 +17,8 @@ namespace helper { static std::unique_ptr _create_index_cols_info_array( - const std::vector& dim_infos); + const std::vector& dim_infos, + std::optional coordinate_space = std::nullopt); // Core PR: https://github.com/TileDB-Inc/TileDB/pull/5303 bool have_dense_current_domain_support() { @@ -64,7 +65,8 @@ bool have_dense_current_domain_support() { std::pair, ArrowTable> create_arrow_schema_and_index_columns( const std::vector& dim_infos, - const std::vector& attr_infos) { + const std::vector& attr_infos, + std::optional coordinate_space) { int ndim = dim_infos.size(); int nattr = attr_infos.size(); int nfield = ndim + nattr; @@ -84,8 +86,10 @@ create_arrow_schema_and_index_columns( auto arrow_schema = ArrowAdapter::make_arrow_schema( names, tiledb_datatypes); - auto index_cols_info_schema = create_index_cols_info_schema(dim_infos); - auto index_cols_info_array = _create_index_cols_info_array(dim_infos); + auto index_cols_info_schema = create_index_cols_info_schema( + dim_infos, coordinate_space); + auto index_cols_info_array = _create_index_cols_info_array( + dim_infos, coordinate_space); return std::pair( std::move(arrow_schema), @@ -112,7 +116,8 @@ ArrowTable create_column_index_info(const std::vector& dim_infos) { } std::unique_ptr create_index_cols_info_schema( - const std::vector& dim_infos) { + const std::vector& dim_infos, + std::optional coordinate_space) { auto ndim = dim_infos.size(); std::vector names(ndim); @@ -127,18 +132,31 @@ std::unique_ptr create_index_cols_info_schema( auto schema = ArrowAdapter::make_arrow_schema(names, tiledb_datatypes); for (size_t i = 0; i < static_cast(schema->n_children); ++i) { - if (strcmp(schema->children[i]->name, "soma_geometry")) { - nanoarrow::UniqueBuffer buffer; - ArrowMetadataBuilderInit(buffer.get(), nullptr); - ArrowMetadataBuilderAppend( - buffer.get(), - ArrowCharView("dtype"), - ArrowCharView( - dim_infos[i].tiledb_datatype == TILEDB_GEOM_WKB ? "WKB" : - "WKT")); - ArrowSchemaSetMetadata( - schema->children[i], - std::string((char*)buffer->data, buffer->size_bytes).c_str()); + if (strcmp(schema->children[i]->name, "soma_geometry") == 0) { + // Recreate schema for WKB domain (struct of floats) + auto geometry_schema = ArrowAdapter::make_arrow_schema_parent( + coordinate_space->size(), "soma_geometry"); + for (size_t j = 0; j < coordinate_space->size(); ++j) { + ArrowSchema* dim_schema = (ArrowSchema*)malloc( + sizeof(ArrowSchema)); + auto arrow_type_name = ArrowAdapter::tdb_to_arrow_type( + TILEDB_FLOAT64); + dim_schema->name = strdup( + coordinate_space->axis(j).name.c_str()); + dim_schema->format = strdup(arrow_type_name.c_str()); + dim_schema->metadata = nullptr; + dim_schema->flags = 0; + dim_schema->n_children = 0; // leaf node + dim_schema->children = nullptr; // leaf node + dim_schema->dictionary = nullptr; + dim_schema->release = &ArrowAdapter::release_schema; + dim_schema->private_data = nullptr; + + geometry_schema->children[j] = dim_schema; + } + + schema->release(schema->children[i]); + schema->children[i] = geometry_schema.release(); } } @@ -146,7 +164,8 @@ std::unique_ptr create_index_cols_info_schema( } static std::unique_ptr _create_index_cols_info_array( - const std::vector& dim_infos) { + const std::vector& dim_infos, + std::optional coordinate_space) { int ndim = dim_infos.size(); auto index_cols_info_array = ArrowAdapter::make_arrow_array_parent(ndim); @@ -179,7 +198,23 @@ static std::unique_ptr _create_index_cols_info_array( } else if (info.tiledb_datatype == TILEDB_GEOM_WKB) { // No domain can be set for WKB. The domain will be set to the // individual spatial axes. - dim_array = ArrowAdapter::make_arrow_array_child_binary(); + dim_array = ArrowAdapter::make_arrow_array_parent( + coordinate_space->size()) + .release(); + dim_array->n_buffers = 1; + dim_array->buffers = (const void**)malloc(sizeof(void*)); + dim_array->buffers[0] = nullptr; + dim_array->length = 5; + for (size_t j = 0; j < coordinate_space->size(); ++j) { + std::vector dom( + {0, + (double_t)CORE_DOMAIN_MAX, + 1, + 0, + (double_t)info.dim_max}); + dim_array->children[j] = ArrowAdapter::make_arrow_array_child( + dom); + } } else if (info.tiledb_datatype == TILEDB_FLOAT64) { // domain big; current_domain small std::vector dom( diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h index ea9d4ce85b..dcf82416b4 100644 --- a/libtiledbsoma/test/common.h +++ b/libtiledbsoma/test/common.h @@ -68,7 +68,8 @@ struct AttrInfo { std::pair, ArrowTable> create_arrow_schema_and_index_columns( const std::vector& dim_infos, - const std::vector& attr_infos); + const std::vector& attr_infos, + std::optional coordinate_space = std::nullopt); ArrowTable create_column_index_info(const std::vector& dim_infos); @@ -78,7 +79,8 @@ std::string to_arrow_format(tiledb_datatype_t tiledb_datatype); bool have_dense_current_domain_support(); std::unique_ptr create_index_cols_info_schema( - const std::vector& dim_infos); + const std::vector& dim_infos, + std::optional coordinate_space = std::nullopt); } // namespace helper #endif diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc index a78b070aa1..5eed13d4e3 100644 --- a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -40,20 +40,6 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { .string_lo = "N/A", .string_hi = "N/A"})}); - std::vector spatial_dim_infos( - {helper::DimInfo( - {.name = "x", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 200, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "y", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"})}); - std::vector attr_infos({helper::AttrInfo( {.name = "quality", .tiledb_datatype = TILEDB_FLOAT64})}); @@ -62,8 +48,8 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { // Create the geometry dataframe. auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns(dim_infos, attr_infos); - auto spatial_columns = helper::create_column_index_info(spatial_dim_infos); + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos, coord_space); SOMAGeometryDataFrame::create( uri, @@ -95,8 +81,6 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { std::vector expected_index_column_names = { dim_infos[0].name, dim_infos[1].name}; - std::vector expected_spatial_column_names = { - spatial_dim_infos[0].name, spatial_dim_infos[1].name}; REQUIRE(soma_geometry->index_column_names() == expected_index_column_names); REQUIRE(soma_geometry->coordinate_space() == coord_space); REQUIRE(soma_geometry->nnz() == 0); @@ -127,26 +111,12 @@ TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { .string_lo = "N/A", .string_hi = "N/A"})}); - std::vector spatial_dim_infos( - {helper::DimInfo( - {.name = "x", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 200, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "y", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"})}); - std::vector attr_infos({helper::AttrInfo( {.name = "quality", .tiledb_datatype = TILEDB_FLOAT64})}); auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns(dim_infos, attr_infos); - auto spatial_columns = helper::create_column_index_info(spatial_dim_infos); + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos, coord_space); SOMAGeometryDataFrame::create( uri, @@ -257,43 +227,23 @@ TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { while (auto batch = soma_geometry->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at(dim_infos[0].name)->data(); - auto d1span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[0].name + "__min") - ->data(); - auto d2span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[0].name + "__max") - ->data(); - auto d3span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[1].name + "__min") - ->data(); - auto d4span = arrbuf - ->at( - SOMA_GEOMETRY_DIMENSION_PREFIX + - spatial_dim_infos[1].name + "__max") - ->data(); auto wkbs = arrbuf->at(dim_infos[1].name)->binaries(); auto a0span = arrbuf->at(attr_infos[0].name)->data(); CHECK( std::vector({1}) == std::vector(d0span.begin(), d0span.end())); - CHECK( - std::vector({0}) == - std::vector(d1span.begin(), d1span.end())); - CHECK( - std::vector({1}) == - std::vector(d2span.begin(), d2span.end())); - CHECK( - std::vector({0}) == - std::vector(d3span.begin(), d3span.end())); - CHECK( - std::vector({1}) == - std::vector(d4span.begin(), d4span.end())); + // CHECK( + // std::vector({0}) == + // std::vector(d1span.begin(), d1span.end())); + // CHECK( + // std::vector({1}) == + // std::vector(d2span.begin(), d2span.end())); + // CHECK( + // std::vector({0}) == + // std::vector(d3span.begin(), d3span.end())); + // CHECK( + // std::vector({1}) == + // std::vector(d4span.begin(), d4span.end())); CHECK(geometry::to_wkb(polygon) == wkbs[0]); CHECK( std::vector({63}) == From 9080ee3e8a4965852c9459a25da9e4578d8e288b Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 16:39:11 +0200 Subject: [PATCH 07/14] lint fixes --- apis/python/src/tiledbsoma/__init__.py | 2 +- apis/python/src/tiledbsoma/_dataframe.py | 62 +++++++----- .../src/tiledbsoma/_geometry_dataframe.py | 95 +++++++++++++------ apis/python/src/tiledbsoma/_scene.py | 4 +- 4 files changed, 111 insertions(+), 52 deletions(-) diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py index 410fc7201d..bc520b0ba8 100644 --- a/apis/python/src/tiledbsoma/__init__.py +++ b/apis/python/src/tiledbsoma/__init__.py @@ -175,11 +175,11 @@ get_storage_engine, show_package_versions, ) +from ._geometry_dataframe import GeometryDataFrame from ._indexer import IntIndexer, tiledbsoma_build_index from ._measurement import Measurement from ._multiscale_image import MultiscaleImage from ._point_cloud_dataframe import PointCloudDataFrame -from ._geometry_dataframe import GeometryDataFrame from ._query import ExperimentAxisQuery from ._scene import Scene from ._sparse_nd_array import SparseNDArray, SparseNDArrayRead diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 291d92de63..e9b486768a 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -817,14 +817,23 @@ def _canonicalize_schema( raise ValueError( f"{SOMA_GEOMETRY} field must be of type Arrow binary or large_binary but is {geometry_type}" ) - schema.set(schema.get_field_index(SOMA_GEOMETRY), schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"})) + schema.set( + schema.get_field_index(SOMA_GEOMETRY), + schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"}), + ) else: # add SOMA_GEOMETRY - schema = schema.append(pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"})) + schema = schema.append( + pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"}) + ) # verify no illegal use of soma_ prefix for field_name in schema.names: - if field_name.startswith("soma_") and field_name != SOMA_JOINID and field_name != SOMA_GEOMETRY: + if ( + field_name.startswith("soma_") + and field_name != SOMA_JOINID + and field_name != SOMA_GEOMETRY + ): raise ValueError( f"DataFrame schema may not contain fields with name prefix ``soma_``: got ``{field_name}``" ) @@ -832,7 +841,11 @@ def _canonicalize_schema( # verify that all index_column_names are present in the schema schema_names_set = set(schema.names) for index_column_name in index_column_names: - if index_column_name.startswith("soma_") and index_column_name != SOMA_JOINID and index_column_name != SOMA_GEOMETRY: + if ( + index_column_name.startswith("soma_") + and index_column_name != SOMA_JOINID + and index_column_name != SOMA_GEOMETRY + ): raise ValueError( f'index_column_name other than "soma_joinid" must not begin with "soma_"; got "{index_column_name}"' ) @@ -885,29 +898,31 @@ def _fill_out_slot_soma_domain( """ saturated_range = False if index_column_name == SOMA_GEOMETRY: - # SOMA_GEOMETRY domain should be either a list on None or a list of tuple[float, float] - lo = [] - hi = [] + # SOMA_GEOMETRY domain should be either a list of None or a list of tuple[float, float] + axes_lo = [] + axes_hi = [] if isinstance(slot_domain, list): - finfo: NPFInfo = np.finfo(np.float64) + f64info: NPFInfo = np.finfo(np.float64) for axis_domain in slot_domain: if axis_domain is None: - lo.append(finfo.min) - hi.append(finfo.max) + axes_lo.append(f64info.min) + axes_hi.append(f64info.max) saturated_range = True elif not isinstance(axis_domain, tuple) or len(axis_domain) != 2: - raise ValueError( - "Axis domain should be a tuple[float, float]" - ) + raise ValueError("Axis domain should be a tuple[float, float]") else: - if np.issubdtype(type(axis_domain[0]), NPFloating) or np.issubdtype(type(axis_domain[1]), NPFloating): - raise ValueError( - "Axis domain should be a tuple[float, float]" - ) - - lo.append(axis_domain[0]) - hi.append(axis_domain[1]) - slot_domain = lo, hi + if np.issubdtype(type(axis_domain[0]), NPFloating) or np.issubdtype( + type(axis_domain[1]), NPFloating + ): + raise ValueError("Axis domain should be a tuple[float, float]") + + axes_lo.append(axis_domain[0]) + axes_hi.append(axis_domain[1]) + slot_domain = tuple(axes_lo), tuple(axes_hi) + else: + raise ValueError( + f"{SOMA_GEOMETRY} domain should be either a list of None or a list of tuple[float, float]" + ) elif slot_domain is not None: # User-specified; go with it when possible if ( @@ -1096,7 +1111,10 @@ def _revise_domain_for_extent( if saturated_range: # Handle SOMA_GEOMETRY domain with is tuple[list[float], list[float]] if isinstance(domain[1], list): - return (domain[0], [dim_max - extent for dim_max in domain[1]],) + return ( + domain[0], + [dim_max - extent for dim_max in domain[1]], + ) else: return (domain[0], domain[1] - extent) else: diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py index c0feb2a7a0..9c11960ee2 100644 --- a/apis/python/src/tiledbsoma/_geometry_dataframe.py +++ b/apis/python/src/tiledbsoma/_geometry_dataframe.py @@ -13,26 +13,36 @@ import pyarrow as pa import somacore from somacore import CoordinateSpace, CoordinateTransform, options +from typing_extensions import Self + from tiledbsoma._tdb_handles import GeometryDataFrameWrapper from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context from tiledbsoma.options._tiledb_create_write_options import TileDBCreateOptions -from typing_extensions import Self from . import _arrow_types, _util from . import pytiledbsoma as clib -from ._constants import SOMA_COORDINATE_SPACE_METADATA_KEY, SOMA_GEOMETRY, SOMA_JOINID, SPATIAL_DISCLAIMER -from ._dataframe import Domain, _canonicalize_schema, _fill_out_slot_soma_domain, _find_extent_for_domain, _revise_domain_for_extent -from ._read_iters import TableReadIter -from ._types import OpenTimestamp -from .options import SOMATileDBContext - +from ._constants import ( + SOMA_COORDINATE_SPACE_METADATA_KEY, + SOMA_GEOMETRY, + SOMA_JOINID, + SPATIAL_DISCLAIMER, +) +from ._dataframe import ( + Domain, + _canonicalize_schema, + _fill_out_slot_soma_domain, + _find_extent_for_domain, + _revise_domain_for_extent, +) from ._exception import SOMAError, map_exception_for_create +from ._read_iters import TableReadIter from ._spatial_dataframe import SpatialDataFrame from ._spatial_util import ( coordinate_space_from_json, coordinate_space_to_json, - process_spatial_df_region, ) +from ._types import OpenTimestamp +from .options import SOMATileDBContext _UNBATCHED = options.BatchSize() @@ -108,8 +118,11 @@ def create( axis_names = tuple(coordinate_space) axis_units = tuple(len(axis_names) * [None]) - index_column_names = (SOMA_GEOMETRY, SOMA_JOINID,) - + index_column_names = ( + SOMA_GEOMETRY, + SOMA_JOINID, + ) + context = _validate_soma_tiledb_context(context) schema = _canonicalize_schema(schema, index_column_names) @@ -136,7 +149,7 @@ def create( domain = None # Check if domain has the right size (number of axis + 1 for SOMA_JOINID) - + if soma_domain is None: soma_domain = tuple(None for _ in index_column_names) else: @@ -147,20 +160,28 @@ def create( f"if domain is specified, it must have the same length as " f"index_column_names; got {ndom} != {nidx}" ) - + mutable_soma_domain = list(soma_domain) - soma_geometry_domain = mutable_soma_domain[index_column_names.index(SOMA_GEOMETRY)] + soma_geometry_domain = mutable_soma_domain[ + index_column_names.index(SOMA_GEOMETRY) + ] if soma_geometry_domain is None: soma_geometry_domain = [None for _ in axis_names] elif not isinstance(soma_geometry_domain, list): - raise ValueError(f"'{SOMA_GEOMETRY}' domain should be a list of tuple[float, float]") + raise ValueError( + f"'{SOMA_GEOMETRY}' domain should be a list of tuple[float, float]" + ) elif len(soma_geometry_domain) != len(axis_names): - raise ValueError(f"Dimension mishmatch between '{SOMA_GEOMETRY}' domain and coordinate system") - - mutable_soma_domain[index_column_names.index(SOMA_GEOMETRY)] = soma_geometry_domain + raise ValueError( + f"Dimension mishmatch between '{SOMA_GEOMETRY}' domain and coordinate system" + ) + + mutable_soma_domain[index_column_names.index(SOMA_GEOMETRY)] = ( + soma_geometry_domain + ) soma_domain = tuple(mutable_soma_domain) - + index_column_schema = [] index_column_data = {} @@ -176,7 +197,11 @@ def create( if index_column_name == SOMA_GEOMETRY: (slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain( - [None for _ in axis_names], True, index_column_name, pa_field.type, dtype + [None for _ in axis_names], + True, + index_column_name, + pa_field.type, + dtype, ) else: (slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain( @@ -210,13 +235,30 @@ def create( # [4] core current domain hi if index_column_name == SOMA_GEOMETRY: # SOMA_GEOMETRY has a specific schema - index_column_schema.append(pa.field(pa_field.name, pa.struct({axis:pa.float64() for axis in axis_names}))) + index_column_schema.append( + pa.field( + pa_field.name, + pa.struct({axis: pa.float64() for axis in axis_names}), + ) + ) index_column_data[pa_field.name] = [ - [(axis, slot_core_max_domain[0][idx]) for idx, axis in enumerate(axis_names)], - [(axis, slot_core_max_domain[1][idx]) for idx, axis in enumerate(axis_names)], + [ + (axis, slot_core_max_domain[0][idx]) + for idx, axis in enumerate(axis_names) + ], + [ + (axis, slot_core_max_domain[1][idx]) + for idx, axis in enumerate(axis_names) + ], [(axis, extent) for axis in axis_names], - [(axis, slot_core_current_domain[0][idx]) for idx, axis in enumerate(axis_names)], - [(axis, slot_core_current_domain[0][idx]) for idx, axis in enumerate(axis_names)] + [ + (axis, slot_core_current_domain[0][idx]) + for idx, axis in enumerate(axis_names) + ], + [ + (axis, slot_core_current_domain[0][idx]) + for idx, axis in enumerate(axis_names) + ], ] else: index_column_schema.append(pa_field) @@ -250,7 +292,7 @@ def create( cls._wrapper_type.open(uri, "w", context, tiledb_timestamp), _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code", ) - + def __init__( self, handle: GeometryDataFrameWrapper, @@ -265,7 +307,6 @@ def __init__( raise SOMAError("Missing coordinate space metadata") from ke self._coord_space = coordinate_space_from_json(coord_space) - # Data operations def read( @@ -410,4 +451,4 @@ def coordinate_space(self, value: CoordinateSpace) -> None: self.metadata[SOMA_COORDINATE_SPACE_METADATA_KEY] = coordinate_space_to_json( value ) - self._coord_space = value \ No newline at end of file + self._coord_space = value diff --git a/apis/python/src/tiledbsoma/_scene.py b/apis/python/src/tiledbsoma/_scene.py index cc19511aa1..196ce357ac 100644 --- a/apis/python/src/tiledbsoma/_scene.py +++ b/apis/python/src/tiledbsoma/_scene.py @@ -213,7 +213,7 @@ def _set_transform_to_element( # Either set the new coordinate space or check the axes of the current # coordinate space the element is defined on. if coordinate_space is None: - elem_axis_names: Tuple[str, ...] = elem.coordinate_space.axis_names # type: ignore[attr-defined] + elem_axis_names: Tuple[str, ...] = elem.coordinate_space.axis_names if elem_axis_names != transform.output_axes: raise ValueError( f"The name of transform output axes, {transform.output_axes}, do " @@ -221,7 +221,7 @@ def _set_transform_to_element( f" space, {elem_axis_names}." ) else: - elem.coordinate_space = coordinate_space # type: ignore[attr-defined] + elem.coordinate_space = coordinate_space # Set the transform metadata and return the multisclae image. coll.metadata[f"soma_scene_registry_{key}"] = transform_to_json(transform) From 42359ca353a04720adaf3fe1a57d534dc34f4096 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 17:55:16 +0200 Subject: [PATCH 08/14] Add automatic addition of column configurable --- apis/python/src/tiledbsoma/_dataframe.py | 8 +++++--- apis/python/src/tiledbsoma/_geometry_dataframe.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index e9b486768a..9c0d71fad8 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -790,7 +790,9 @@ def write( def _canonicalize_schema( - schema: pa.Schema, index_column_names: Sequence[str] + schema: pa.Schema, + index_column_names: Sequence[str], + required_columns: Sequence[str] = [SOMA_JOINID], ) -> pa.Schema: """Turns an Arrow schema into the canonical version and checks for errors. @@ -807,7 +809,7 @@ def _canonicalize_schema( raise ValueError( f"{SOMA_JOINID} field must be of type Arrow int64 but is {joinid_type}" ) - else: + elif SOMA_JOINID in required_columns: # add SOMA_JOINID schema = schema.append(pa.field(SOMA_JOINID, pa.int64())) @@ -821,7 +823,7 @@ def _canonicalize_schema( schema.get_field_index(SOMA_GEOMETRY), schema.field(SOMA_GEOMETRY).with_metadata({"dtype": "WKB"}), ) - else: + elif SOMA_GEOMETRY in required_columns: # add SOMA_GEOMETRY schema = schema.append( pa.field(SOMA_GEOMETRY, pa.large_binary(), metadata={"dtype": "WKB"}) diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py index 9c11960ee2..4691e6b355 100644 --- a/apis/python/src/tiledbsoma/_geometry_dataframe.py +++ b/apis/python/src/tiledbsoma/_geometry_dataframe.py @@ -124,7 +124,9 @@ def create( ) context = _validate_soma_tiledb_context(context) - schema = _canonicalize_schema(schema, index_column_names) + schema = _canonicalize_schema( + schema, index_column_names, [SOMA_JOINID, SOMA_GEOMETRY] + ) # SOMA-to-core mappings: # From 78203015a53ec8945d0c1943630f37cb1377a5df Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 19:53:00 +0200 Subject: [PATCH 09/14] Fix domain generation --- apis/python/src/tiledbsoma/_dataframe.py | 2 +- apis/python/src/tiledbsoma/_geometry_dataframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 9c0d71fad8..338498f1d8 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -1112,7 +1112,7 @@ def _revise_domain_for_extent( ) -> Tuple[Any, Any]: if saturated_range: # Handle SOMA_GEOMETRY domain with is tuple[list[float], list[float]] - if isinstance(domain[1], list): + if isinstance(domain[1], tuple): return ( domain[0], [dim_max - extent for dim_max in domain[1]], diff --git a/apis/python/src/tiledbsoma/_geometry_dataframe.py b/apis/python/src/tiledbsoma/_geometry_dataframe.py index 4691e6b355..a3d51be6d5 100644 --- a/apis/python/src/tiledbsoma/_geometry_dataframe.py +++ b/apis/python/src/tiledbsoma/_geometry_dataframe.py @@ -258,7 +258,7 @@ def create( for idx, axis in enumerate(axis_names) ], [ - (axis, slot_core_current_domain[0][idx]) + (axis, slot_core_current_domain[1][idx]) for idx, axis in enumerate(axis_names) ], ] From 76a1ffd8f95fad6e19009a857f5eb68cda95e02f Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 19:53:23 +0200 Subject: [PATCH 10/14] Add basic dataframe construction tests --- apis/python/tests/test_geometry_dataframe.py | 57 ++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 apis/python/tests/test_geometry_dataframe.py diff --git a/apis/python/tests/test_geometry_dataframe.py b/apis/python/tests/test_geometry_dataframe.py new file mode 100644 index 0000000000..a20d750240 --- /dev/null +++ b/apis/python/tests/test_geometry_dataframe.py @@ -0,0 +1,57 @@ +from urllib.parse import urljoin + +import numpy as np +import pyarrow as pa +import pytest +import shapely +import typeguard + +import tiledbsoma as soma + +@pytest.mark.parametrize("domain", [None, [[(-1000, 1000), (0, 100)], [0, 100]]]) +def test_geometry_domain(tmp_path, domain): + uri = tmp_path.as_uri() + + asch = pa.schema([("quality", pa.float32())]) + + with soma.GeometryDataFrame.create(uri, schema=asch, domain=domain) as geom: + soma_domain = geom.domain + + assert len(soma_domain) == 2 + + for idx, name in enumerate(geom.index_column_names): + if name == "soma_geometry": + f64info = np.finfo(np.float64) + + assert len(soma_domain[idx][0]) == 2 + assert len(soma_domain[idx][1]) == 2 + + for axis_idx, axis_name in enumerate(geom.coordinate_space.axis_names): + assert soma_domain[idx][0][axis_name] == (f64info.min if domain is None else domain[0][axis_idx][0]) + + for axis_idx, axis_name in enumerate(geom.coordinate_space.axis_names): + assert soma_domain[idx][1][axis_name] == (f64info.max if domain is None else domain[0][axis_idx][1]) + + +def test_geometry_coordinate_space(tmp_path): + uri = tmp_path.as_uri() + + asch = pa.schema([("quality", pa.float32())]) + + with soma.GeometryDataFrame.create(uri, schema=asch) as geom: + assert len(geom.coordinate_space) == 2 + assert geom.coordinate_space.axis_names == ("x", "y") + assert geom.coordinate_space.axes == (soma.Axis(name="x"), soma.Axis(name="y")) + + # Axis names do not match + with pytest.raises(ValueError): + geom.coordinate_space = soma.CoordinateSpace( + [soma.Axis(name="a"), soma.Axis(name="y")] + ) + + geom.coordinate_space = soma.CoordinateSpace( + [soma.Axis(name="x", unit="m"), soma.Axis(name="y", unit="in")] + ) + assert geom.coordinate_space[0] == soma.Axis(name="x", unit="m") + assert geom.coordinate_space[1] == soma.Axis(name="y", unit="in") + From 6059973005f612272812097ab63019099eac039d Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 5 Feb 2025 20:00:13 +0200 Subject: [PATCH 11/14] lint fixes --- apis/python/tests/test_geometry_dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/apis/python/tests/test_geometry_dataframe.py b/apis/python/tests/test_geometry_dataframe.py index a20d750240..eb8dde08a2 100644 --- a/apis/python/tests/test_geometry_dataframe.py +++ b/apis/python/tests/test_geometry_dataframe.py @@ -1,13 +1,10 @@ -from urllib.parse import urljoin - import numpy as np import pyarrow as pa import pytest -import shapely -import typeguard import tiledbsoma as soma + @pytest.mark.parametrize("domain", [None, [[(-1000, 1000), (0, 100)], [0, 100]]]) def test_geometry_domain(tmp_path, domain): uri = tmp_path.as_uri() @@ -27,10 +24,14 @@ def test_geometry_domain(tmp_path, domain): assert len(soma_domain[idx][1]) == 2 for axis_idx, axis_name in enumerate(geom.coordinate_space.axis_names): - assert soma_domain[idx][0][axis_name] == (f64info.min if domain is None else domain[0][axis_idx][0]) + assert soma_domain[idx][0][axis_name] == ( + f64info.min if domain is None else domain[0][axis_idx][0] + ) for axis_idx, axis_name in enumerate(geom.coordinate_space.axis_names): - assert soma_domain[idx][1][axis_name] == (f64info.max if domain is None else domain[0][axis_idx][1]) + assert soma_domain[idx][1][axis_name] == ( + f64info.max if domain is None else domain[0][axis_idx][1] + ) def test_geometry_coordinate_space(tmp_path): @@ -54,4 +55,3 @@ def test_geometry_coordinate_space(tmp_path): ) assert geom.coordinate_space[0] == soma.Axis(name="x", unit="m") assert geom.coordinate_space[1] == soma.Axis(name="y", unit="in") - From ad6341110a90f46d1bdd59fc2c22295085321d15 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Thu, 6 Feb 2025 12:41:05 +0200 Subject: [PATCH 12/14] Properly handle complex domain saturation flag --- apis/python/src/tiledbsoma/_dataframe.py | 31 ++++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 338498f1d8..ac11239bed 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -890,7 +890,7 @@ def _fill_out_slot_soma_domain( index_column_name: str, pa_type: pa.DataType, dtype: Any, -) -> Tuple[Tuple[Any, Any], bool]: +) -> Tuple[Tuple[Any, Any], Union[bool, Tuple[bool, ...]]]: """Helper function for _build_tiledb_schema. Given a user-specified domain for a dimension slot -- which may be ``None``, or a two-tuple of which either element may be ``None`` -- return either what the user specified (if adequate) or @@ -905,11 +905,12 @@ def _fill_out_slot_soma_domain( axes_hi = [] if isinstance(slot_domain, list): f64info: NPFInfo = np.finfo(np.float64) + saturated_multi_range = [] for axis_domain in slot_domain: if axis_domain is None: axes_lo.append(f64info.min) axes_hi.append(f64info.max) - saturated_range = True + saturated_multi_range.append(True) elif not isinstance(axis_domain, tuple) or len(axis_domain) != 2: raise ValueError("Axis domain should be a tuple[float, float]") else: @@ -920,12 +921,16 @@ def _fill_out_slot_soma_domain( axes_lo.append(axis_domain[0]) axes_hi.append(axis_domain[1]) + saturated_multi_range.append(False) slot_domain = tuple(axes_lo), tuple(axes_hi) else: raise ValueError( f"{SOMA_GEOMETRY} domain should be either a list of None or a list of tuple[float, float]" ) - elif slot_domain is not None: + + return (slot_domain, tuple(saturated_multi_range)) + + if slot_domain is not None: # User-specified; go with it when possible if ( ( @@ -1108,16 +1113,26 @@ def _find_extent_for_domain( # extent exceeds max value representable by domain type. Reduce domain max # by 1 tile extent to allow for expansion. def _revise_domain_for_extent( - domain: Tuple[Any, Any], extent: Any, saturated_range: bool + domain: Tuple[Any, Any], extent: Any, saturated_range: Union[bool, Tuple[bool, ...]] ) -> Tuple[Any, Any]: - if saturated_range: + if isinstance(saturated_range, tuple): # Handle SOMA_GEOMETRY domain with is tuple[list[float], list[float]] if isinstance(domain[1], tuple): + if len(saturated_range) != len(domain[1]): + raise ValueError( + "Internal error: Saturatin flag length does not match domain size" + ) + return ( domain[0], - [dim_max - extent for dim_max in domain[1]], + [ + (dim_max - extent) if saturated_range[idx] else dim_max + for idx, dim_max in enumerate(domain[1]) + ], ) - else: - return (domain[0], domain[1] - extent) + + raise ValueError("Expected a complex domain") + elif saturated_range: + return (domain[0], domain[1] - extent) else: return domain From 6bcef5cfbaca6f1a251fa65d80c01231268a78c9 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos <38084549+XanthosXanthopoulos@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:21:48 +0200 Subject: [PATCH 13/14] Apply suggestions from code review Co-authored-by: Julia Dark <24235303+jp-dark@users.noreply.github.com> --- libtiledbsoma/src/soma/soma_dense_ndarray.cc | 2 +- libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 04f534f640..52c0e89d50 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -66,7 +66,7 @@ void SOMADenseNDArray::create( ctx->tiledb_ctx(), schema, index_columns, - {}, + std::nullopt, "SOMADenseNDArray", false, platform_config); diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index 9705712560..4e83a78f2a 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -67,7 +67,7 @@ void SOMASparseNDArray::create( ctx->tiledb_ctx(), schema, index_columns, - {}, + std::nullopt, "SOMASparseNDArray", true, platform_config); From 985276311a76f935ba7a5261cbed728547234879 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 11 Feb 2025 16:44:07 +0200 Subject: [PATCH 14/14] Update unit test --- .../test/unit_soma_geometry_dataframe.cc | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc index 5eed13d4e3..c2d39a863f 100644 --- a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -227,23 +227,31 @@ TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { while (auto batch = soma_geometry->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at(dim_infos[0].name)->data(); + auto d1span = arrbuf->at(SOMA_GEOMETRY_DIMENSION_PREFIX + "x__min") + ->data(); + auto d2span = arrbuf->at(SOMA_GEOMETRY_DIMENSION_PREFIX + "x__max") + ->data(); + auto d3span = arrbuf->at(SOMA_GEOMETRY_DIMENSION_PREFIX + "y__min") + ->data(); + auto d4span = arrbuf->at(SOMA_GEOMETRY_DIMENSION_PREFIX + "y__max") + ->data(); auto wkbs = arrbuf->at(dim_infos[1].name)->binaries(); auto a0span = arrbuf->at(attr_infos[0].name)->data(); CHECK( std::vector({1}) == std::vector(d0span.begin(), d0span.end())); - // CHECK( - // std::vector({0}) == - // std::vector(d1span.begin(), d1span.end())); - // CHECK( - // std::vector({1}) == - // std::vector(d2span.begin(), d2span.end())); - // CHECK( - // std::vector({0}) == - // std::vector(d3span.begin(), d3span.end())); - // CHECK( - // std::vector({1}) == - // std::vector(d4span.begin(), d4span.end())); + CHECK( + std::vector({0}) == + std::vector(d1span.begin(), d1span.end())); + CHECK( + std::vector({1}) == + std::vector(d2span.begin(), d2span.end())); + CHECK( + std::vector({0}) == + std::vector(d3span.begin(), d3span.end())); + CHECK( + std::vector({1}) == + std::vector(d4span.begin(), d4span.end())); CHECK(geometry::to_wkb(polygon) == wkbs[0]); CHECK( std::vector({63}) ==