diff --git a/apis/python/setup.py b/apis/python/setup.py index 3d9fadf7eb..4ad2ced690 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -308,7 +308,7 @@ def run(self): library_dirs=LIB_DIRS, libraries=["tiledbsoma"] + (["tiledb"] if os.name == "nt" else []), extra_link_args=CXX_FLAGS, - extra_compile_args=["-std=c++17" if os.name != "nt" else "/std:c++17"] + extra_compile_args=["-std=c++17" if os.name != "nt" else "/std:c++17", "-g"] + CXX_FLAGS, language="c++", ) diff --git a/apis/python/src/tiledbsoma/_arrow_types.py b/apis/python/src/tiledbsoma/_arrow_types.py index 490a109a0c..4c73f74f02 100644 --- a/apis/python/src/tiledbsoma/_arrow_types.py +++ b/apis/python/src/tiledbsoma/_arrow_types.py @@ -169,7 +169,7 @@ def tiledb_schema_to_arrow( if attr.enum_label is not None: # enumerated if A is None: A = tiledb.open(uri, ctx=ctx) - info = A.enum(name) + info = A.enum(attr.enum_label) arrow_schema_dict[name] = pa.dictionary( index_type=arrow_type_from_tiledb_dtype(attr.dtype), value_type=arrow_type_from_tiledb_dtype( diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py index 48a484be77..f3886b96d6 100644 --- a/apis/python/src/tiledbsoma/_collection.py +++ b/apis/python/src/tiledbsoma/_collection.py @@ -434,13 +434,9 @@ def __getitem__(self, key: str) -> CollectionElementType: context = self.context timestamp = self.tiledb_timestamp_ms - try: - wrapper = _tdb_handles.open(uri, mode, context, timestamp) - entry.soma = _factory.reify_handle(wrapper) - except SOMAError: - entry.soma = _factory._open_internal( - entry.entry.wrapper_type.open, uri, mode, context, timestamp - ) + wrapper = _tdb_handles.open(uri, mode, context, timestamp) + entry.soma = _factory.reify_handle(wrapper) + # Since we just opened this object, we own it and should close it. self._close_stack.enter_context(entry.soma) return cast(CollectionElementType, entry.soma) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index c5fcd71a4d..509c84d4c9 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -6,13 +6,12 @@ """ Implementation of a SOMA DataFrame """ -from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast +from typing import Any, Optional, Sequence, Tuple, Type, Union, cast import numpy as np import pandas as pd import pyarrow as pa import somacore -import tiledb from somacore import options from typing_extensions import Self @@ -122,6 +121,7 @@ class DataFrame(TileDBArray, somacore.DataFrame): it must be ``None``. """ + _wrapper_type = DataFrameWrapper _reader_wrapper_type = DataFrameWrapper @classmethod @@ -209,14 +209,78 @@ def create( """ context = _validate_soma_tiledb_context(context) schema = _canonicalize_schema(schema, index_column_names) - tdb_schema = _build_tiledb_schema( + if domain is None: + domain = tuple(None for _ in index_column_names) + else: + ndom = len(domain) + nidx = len(index_column_names) + if ndom != nidx: + raise ValueError( + f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}" + ) + + domains = [] + extents = [] + for index_column_name, slot_domain in zip(index_column_names, domain): + pa_type = schema.field(index_column_name).type + dtype = _arrow_types.tiledb_type_from_arrow_type( + pa_type, is_indexed_column=True + ) + + slot_domain = _fill_out_slot_domain( + slot_domain, index_column_name, pa_type, dtype + ) + + extent = _find_extent_for_domain( + index_column_name, + TileDBCreateOptions.from_platform_config(platform_config), + dtype, + slot_domain, + ) + + domains.append(pa.array(slot_domain, type=pa_type)) + extents.append(pa.array([extent], type=pa_type)) + + domains = pa.StructArray.from_arrays(domains, names=index_column_names) + extents = pa.StructArray.from_arrays(extents, names=index_column_names) + + plt_cfg = None + if platform_config: + ops = TileDBCreateOptions.from_platform_config(platform_config) + plt_cfg = clib.PlatformConfig() + plt_cfg.dataframe_dim_zstd_level = ops.dataframe_dim_zstd_level + plt_cfg.sparse_nd_array_dim_zstd_level = ops.sparse_nd_array_dim_zstd_level + plt_cfg.write_X_chunked = ops.write_X_chunked + plt_cfg.goal_chunk_nnz = ops.goal_chunk_nnz + plt_cfg.capacity = ops.capacity + if ops.offsets_filters: + plt_cfg.offsets_filters = [ + info["_type"] for info in ops.offsets_filters + ] + if ops.validity_filters: + plt_cfg.validity_filters = [ + info["_type"] for info in ops.validity_filters + ] + plt_cfg.allows_duplicates = ops.allows_duplicates + plt_cfg.tile_order = ops.tile_order + plt_cfg.cell_order = ops.cell_order + plt_cfg.consolidate_and_vacuum = ops.consolidate_and_vacuum + + # TODO add as kw args + timestamp_ms = context._open_timestamp_ms(tiledb_timestamp) + clib.SOMADataFrame.create( + uri, schema, index_column_names, - domain, - TileDBCreateOptions.from_platform_config(platform_config), - context, + domains, + extents, + context.native_context, + plt_cfg, + (0, timestamp_ms), ) - handle = cls._create_internal(uri, tdb_schema, context, tiledb_timestamp) + + handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) + return cls( handle, _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code", @@ -343,20 +407,16 @@ def read( context = handle.context() if platform_config is not None: config = context.tiledb_config.copy() - config.update(platform_config or {}) + config.update(platform_config) context = clib.SOMAContext(config) - ts = None - if handle.timestamp is not None: - ts = (0, handle.timestamp) - sr = clib.SOMADataFrame.open( uri=handle.uri, mode=clib.OpenMode.read, context=context, column_names=column_names or [], result_order=_util.to_clib_result_order(result_order), - timestamp=ts, + timestamp=handle.timestamp and (0, handle.timestamp), ) if value_filter is not None: @@ -400,92 +460,50 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - dim_cols_map: Dict[str, pd.DataFrame] = {} - attr_cols_map: Dict[str, pd.DataFrame] = {} - dim_names_set = self.index_column_names - n = None - - for col_info in values.schema: - name = col_info.name - col = values.column(name).combine_chunks() - n = len(col) - - if self._handle.schema.has_attr(name): - attr = self._handle.schema.attr(name) - - # Add the enumeration values to the TileDB Array from ArrowArray - if attr.enum_label is not None: - if not pa.types.is_dictionary(col_info.type): - raise ValueError( - "Expected dictionary type for enumerated attribute " - f"{name} but saw {col.type}" + target_schema = [] + for i, input_field in enumerate(values.schema): + name = input_field.name + target_field = self.schema.field(name) + + if pa.types.is_dictionary(target_field.type): + if not pa.types.is_dictionary(input_field.type): + raise ValueError(f"{name} requires dictionary entry") + col = values.column(name).combine_chunks() + if pa.types.is_boolean(target_field.type.value_type): + col = col.cast( + pa.dictionary( + target_field.type.index_type, + pa.uint8(), + target_field.type.ordered, ) - - enmr = self._handle.enum(attr.name) - - # get new enumeration values by taking the set difference - # while maintaining ordering - update_vals = np.setdiff1d( - col.dictionary, enmr.values(), assume_unique=True ) + new_enmr = self._handle._handle.extend_enumeration(name, col) + + if pa.types.is_binary( + target_field.type.value_type + ) or pa.types.is_large_binary(target_field.type.value_type): + new_enmr = np.array(new_enmr, "S") + elif pa.types.is_boolean(target_field.type.value_type): + new_enmr = np.array(new_enmr, bool) + + df = pd.Categorical( + col.to_pandas(), + ordered=target_field.type.ordered, + categories=new_enmr, + ) + values = values.set_column( + i, name, pa.DictionaryArray.from_pandas(df, type=target_field.type) + ) - index_capacity_current = len(enmr.values()) + len(update_vals) - index_capacity_max = np.iinfo( - col_info.type.index_type.to_pandas_dtype() - ).max - if index_capacity_max < index_capacity_current: - raise ValueError( - f"Too many enumeration values ({index_capacity_current}) " - "for index type {col_info.type.index_type}" - ) - - # only extend if there are new values - if len(update_vals) != 0: - se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx) - if np.issubdtype(enmr.dtype.type, np.str_): - extend_vals = np.array(update_vals, "U") - elif np.issubdtype(enmr.dtype.type, np.bytes_): - extend_vals = np.array(update_vals, "S") - else: - extend_vals = np.array(update_vals, enmr.dtype) - new_enmr = enmr.extend(extend_vals) - df = pd.Categorical(col.to_pandas(), new_enmr.values()) - col = pa.DictionaryArray.from_pandas(df) - se.extend_enumeration(new_enmr) - se.array_evolve(uri=self.uri) - - cols_map = dim_cols_map if name in dim_names_set else attr_cols_map - schema = self._handle.schema - if pa.types.is_dictionary(col.type): - if ( - name not in dim_names_set - and schema.attr(name).enum_label is not None - ): - cols_map[name] = col.indices.to_pandas() - else: - cols_map[name] = col - + if pa.types.is_boolean(input_field.type): + target_schema.append(target_field.with_type(pa.uint8())) else: - if name not in dim_names_set: - if schema.attr(name).enum_label is not None: - raise ValueError( - f"Categorical column {name} must be presented with categorical data" - ) - - cols_map[name] = col.to_pandas() + target_schema.append(target_field) + values = values.cast(pa.schema(target_schema, values.schema.metadata)) - if n is None: - raise ValueError(f"did not find any column names in {values.schema.names}") + for batch in values.to_batches(): + self._handle.write(batch) - # We need to produce the dim cols in the same order as they're present in the TileDB schema - # (tracked by self.index_column_names). This is important in the multi-index case. Suppose - # the Arrow schema has two index columns in the order "burger" and "meister", and suppose - # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. - # Then the above for-loop over the Arrow schema will find the former ordering, but for the - # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] - dim_cols_tuple = tuple(dim_cols_list) - self._handle.writer[dim_cols_tuple] = attr_cols_map tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config ) @@ -495,7 +513,7 @@ def write( return self def _set_reader_coord( - self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object + self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: object ) -> bool: if coord is None: return True # No constraint; select all in this dimension @@ -569,7 +587,7 @@ def _set_reader_coord_by_py_seq_or_np_array( self, sr: clib.SOMAArray, dim_idx: int, - dim: tiledb.Dim, + dim: pa.Field, coord: object, ) -> bool: if isinstance(coord, np.ndarray): @@ -694,120 +712,6 @@ def _canonicalize_schema( return schema -def _build_tiledb_schema( - schema: pa.Schema, - index_column_names: Sequence[str], - domain: Optional[Sequence[Optional[Tuple[Any, Any]]]], - tiledb_create_options: TileDBCreateOptions, - context: SOMATileDBContext, -) -> tiledb.ArraySchema: - """Converts an Arrow schema into a TileDB ArraySchema for creation.""" - - if domain is None: - domain = tuple(None for _ in index_column_names) - else: - ndom = len(domain) - nidx = len(index_column_names) - if ndom != nidx: - raise ValueError( - f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}" - ) - - dims = [] - for index_column_name, slot_domain in zip(index_column_names, domain): - pa_type = schema.field(index_column_name).type - dtype = _arrow_types.tiledb_type_from_arrow_type( - pa_type, is_indexed_column=True - ) - - slot_domain = _fill_out_slot_domain( - slot_domain, index_column_name, pa_type, dtype - ) - - extent = _find_extent_for_domain( - index_column_name, tiledb_create_options, dtype, slot_domain - ) - - dim = tiledb.Dim( - name=index_column_name, - domain=slot_domain, - tile=extent, - dtype=dtype, - filters=tiledb_create_options.dim_filters_tiledb( - index_column_name, - [ - dict( - _type="ZstdFilter", - level=tiledb_create_options.dataframe_dim_zstd_level, - ) - ], - ), - ) - dims.append(dim) - - dom = tiledb.Domain(dims, ctx=context.tiledb_ctx) - - attrs = [] - enums = [] - metadata = schema.metadata or {} - for pa_attr in schema: - attr_name = pa_attr.name - - if attr_name in index_column_names: - continue - - has_enum = pa.types.is_dictionary(pa_attr.type) - - if has_enum: - enmr_dtype: np.dtype[Any] - vtype = pa_attr.type.value_type - if pa.types.is_large_string(vtype) or pa.types.is_string(vtype): - enmr_dtype = np.dtype("U") - elif pa.types.is_large_binary(vtype) or pa.types.is_binary(vtype): - enmr_dtype = np.dtype("S") - else: - enmr_dtype = np.dtype(vtype.to_pandas_dtype()) - enums.append( - tiledb.Enumeration( - name=attr_name, - ordered=pa_attr.type.ordered, - dtype=enmr_dtype, - ) - ) - - attr = tiledb.Attr( - name=attr_name, - dtype=_arrow_types.tiledb_type_from_arrow_type( - schema.field(attr_name).type - ), - nullable=metadata.get(attr_name.encode("utf-8")) == b"nullable", - filters=tiledb_create_options.attr_filters_tiledb( - attr_name, ["ZstdFilter"] - ), - enum_label=attr_name if has_enum else None, - ctx=context.tiledb_ctx, - ) - attrs.append(attr) - - cell_order, tile_order = tiledb_create_options.cell_tile_orders() - - return tiledb.ArraySchema( - domain=dom, - attrs=attrs, - enums=enums, - sparse=True, - allows_duplicates=tiledb_create_options.allows_duplicates, - offsets_filters=tiledb_create_options.offsets_filters_tiledb(), - validity_filters=tiledb_create_options.validity_filters_tiledb(), - capacity=tiledb_create_options.capacity, - cell_order=cell_order, - # As of TileDB core 2.8.2, we cannot consolidate string-indexed sparse arrays with - # col-major tile order: so we write ``X`` with row-major tile order. - tile_order=tile_order, - ctx=context.tiledb_ctx, - ) - - def _fill_out_slot_domain( slot_domain: Optional[Tuple[Any, Any]], index_column_name: str, @@ -847,7 +751,7 @@ def _fill_out_slot_domain( ) elif isinstance(dtype, str): - slot_domain = None, None + slot_domain = "", "" elif np.issubdtype(dtype, NPInteger): iinfo = np.iinfo(cast(NPInteger, dtype)) slot_domain = iinfo.min, iinfo.max - 1 @@ -912,7 +816,7 @@ def _find_extent_for_domain( extent = 64 if isinstance(dtype, str): - return extent + return "" lo, hi = slot_domain if lo is None or hi is None: diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 4617bd93b0..117387dd8b 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -15,9 +15,10 @@ from typing_extensions import Self from . import _util +from . import pytiledbsoma as clib from ._common_nd_array import NDArray from ._exception import SOMAError -from ._tdb_handles import ArrayWrapper +from ._tdb_handles import DenseNDArrayWrapper from ._util import dense_indices_to_shape from .options._tiledb_create_options import TileDBCreateOptions @@ -72,7 +73,7 @@ class DenseNDArray(NDArray, somacore.DenseNDArray): __slots__ = () - _reader_wrapper_type = ArrayWrapper + _reader_wrapper_type = DenseNDArrayWrapper def read( self, @@ -107,7 +108,7 @@ def read( Lifecycle: Experimental. """ - del partitions, platform_config # Currently unused. + del partitions # Currently unused. self._check_open_read() result_order = somacore.ResultOrder(result_order) @@ -123,13 +124,28 @@ def read( # # The only exception is if the array has been created but no data have been written at # all, in which case the best we can do is use the schema shape. - data_shape = self._handle.schema.shape + handle: clib.DenseNDArrayWrapper = self._handle._handle + + data_shape = handle.shape ned = self.non_empty_domain() if ned is not None: data_shape = tuple(slot[1] + 1 for slot in ned) target_shape = dense_indices_to_shape(coords, data_shape, result_order) - sr = self._soma_reader(result_order=result_order) + context = handle.context() + if platform_config is not None: + config = context.tiledb_config.copy() + config.update(platform_config) + context = clib.SOMAContext(config) + + sr = clib.SOMADenseNDArray.open( + uri=handle.uri, + mode=clib.OpenMode.read, + context=context, + column_names=[], + result_order=_util.to_clib_result_order(result_order), + timestamp=handle.timestamp and (0, handle.timestamp), + ) self._set_reader_coords(sr, coords) diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 8397c3c5f7..4220eeb82a 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -38,7 +38,7 @@ SparseCOOTensorReadIter, TableReadIter, ) -from ._tdb_handles import ArrayWrapper +from ._tdb_handles import SparseNDArrayWrapper from ._types import NTuple from .options._tiledb_create_options import TileDBCreateOptions @@ -95,7 +95,7 @@ class SparseNDArray(NDArray, somacore.SparseNDArray): __slots__ = () - _reader_wrapper_type = ArrayWrapper + _reader_wrapper_type = SparseNDArrayWrapper # Inherited from somacore # * ndim accessor @@ -155,11 +155,27 @@ def read( ``slice(2,None)`` or ``slice(None,4)``. * Negative indexing is unsupported. """ - del batch_size, platform_config # Currently unused. + del batch_size # Currently unused. + handle: clib.SparseNDArrayWrapper = self._handle._handle + self._check_open_read() _util.check_unpartitioned(partitions) - sr = self._soma_reader(schema=self._handle.schema, result_order=result_order) + context = handle.context() + if platform_config is not None: + config = context.tiledb_config.copy() + config.update(platform_config) + context = clib.SOMAContext(config) + + sr = clib.SOMASparseNDArray.open( + uri=handle.uri, + mode=clib.OpenMode.read, + context=context, + column_names=[], + result_order=_util.to_clib_result_order(result_order), + timestamp=handle.timestamp and (0, handle.timestamp), + ) + return SparseNDArrayRead(sr, self, coords) def write( diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 8513d30bdb..2d6aed8baf 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -16,7 +16,6 @@ Dict, Generic, Iterator, - List, Mapping, MutableMapping, Optional, @@ -57,7 +56,7 @@ def open( # if there is not a valid SOMAObject at the given URI, this # returns None soma_object = clib.SOMAObject.open( - uri, open_mode, context.native_context, (0, timestamp_ms) + uri, open_mode, context.native_context, timestamp=(0, timestamp_ms) ) # Avoid creating a TileDB-Py Ctx unless necessary @@ -70,13 +69,16 @@ def open( if not obj_type: raise DoesNotExistError(f"{uri!r} does not exist") - if open_mode == clib.OpenMode.read and obj_type == "SOMADataFrame": + if obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) + if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": + return DenseNDArrayWrapper._from_soma_object(soma_object, context) + if open_mode == clib.OpenMode.read and obj_type == "SOMASparseNDArray": + return SparseNDArrayWrapper._from_soma_object(soma_object, context) if obj_type in ( - "SOMADataFrame", - "SOMASparseNDArray", "SOMADenseNDArray", + "SOMASparseNDArray", "array", ): return ArrayWrapper.open(uri, mode, context, timestamp) @@ -127,7 +129,7 @@ def open( handle._do_initial_reads(auxiliary_reader) else: handle._do_initial_reads(tdb) - except tiledb.TileDBError as tdbe: + except (RuntimeError, tiledb.TileDBError) as tdbe: if is_does_not_exist_error(tdbe): raise DoesNotExistError(f"{uri!r} does not exist") from tdbe raise @@ -319,8 +321,13 @@ def _do_initial_reads(self, reader: tiledb.Group) -> None: } -class DataFrameWrapper(Wrapper[clib.SOMADataFrame]): - """Wrapper around a Pybind11 SOMADataFrame handle.""" +_ArrType = TypeVar("_ArrType", bound=clib.SOMAArray) + + +class SOMAArrayWrapper(Wrapper[_ArrType]): + """Base class for Pybind11 SOMAArrayWrapper handles.""" + + _WRAPPED_TYPE: Type[_ArrType] @classmethod def _opener( @@ -329,18 +336,15 @@ def _opener( mode: options.OpenMode, context: SOMATileDBContext, timestamp: int, - ) -> clib.SOMADataFrame: + ) -> clib.SOMADenseNDArray: open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write - config = {k: str(v) for k, v in context.tiledb_config.items()} - column_names: List[str] = [] - result_order = clib.ResultOrder.automatic - return clib.SOMADataFrame.open( + return cls._WRAPPED_TYPE.open( uri, open_mode, - config, - column_names, - result_order, - (0, timestamp), + context=context.native_context, + column_names=[], + result_order=clib.ResultOrder.automatic, + timestamp=(0, timestamp), ) # Covariant types should normally not be in parameters, but this is for @@ -352,7 +356,7 @@ def _do_initial_reads(self, reader: _RawHdl_co) -> None: # type: ignore[misc] will need to retrieve data from the backing store on setup. """ # non–attrs-managed field - self.metadata = MetadataWrapper(self, dict(reader.meta)) + self.metadata = MetadataWrapper(self, reader.meta()) @property def schema(self) -> pa.Schema: @@ -360,21 +364,17 @@ def schema(self) -> pa.Schema: @property def meta(self) -> "MetadataWrapper": - return MetadataWrapper(self, dict(self._handle.meta)) + return self.metadata @property def ndim(self) -> int: - return len(self._handle.index_column_names) - - @property - def count(self) -> int: - return int(self._handle.count) + return len(self._handle.dimension_names) def _cast_domain( self, domain: Callable[[str, DTypeLike], Tuple[object, object]] ) -> Tuple[Tuple[object, object], ...]: result = [] - for name in self._handle.index_column_names: + for name in self._handle.dimension_names: dtype = self._handle.schema.field(name).type if pa.types.is_timestamp(dtype): np_dtype = np.dtype(dtype.to_pandas_dtype()) @@ -405,12 +405,12 @@ def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]: @property def attr_names(self) -> Tuple[str, ...]: return tuple( - f.name for f in self.schema if f.name not in self._handle.index_column_names + f.name for f in self.schema if f.name not in self._handle.dimension_names ) @property def dim_names(self) -> Tuple[str, ...]: - return tuple(self._handle.index_column_names) + return tuple(self._handle.dimension_names) def enum(self, label: str) -> tiledb.Enumeration: # The DataFrame handle may either be ArrayWrapper or DataFrameWrapper. @@ -419,6 +419,43 @@ def enum(self, label: str) -> tiledb.Enumeration: raise NotImplementedError +class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): + """Wrapper around a Pybind11 SOMADataFrame handle.""" + + _WRAPPED_TYPE = clib.SOMADataFrame + + @property + def count(self) -> int: + return int(self._handle.count) + + def write(self, values: pa.Table) -> None: + self._handle.write(values) + + +class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): + """Wrapper around a Pybind11 DenseNDArrayWrapper handle.""" + + _WRAPPED_TYPE = clib.SOMADenseNDArray + + @property + def shape(self) -> Tuple[int, ...]: + return tuple(self._handle.shape) + + +class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]): + """Wrapper around a Pybind11 SparseNDArrayWrapper handle.""" + + _WRAPPED_TYPE = clib.SOMASparseNDArray + + @property + def shape(self) -> Tuple[int, ...]: + return tuple(self._handle.shape) + + @property + def nnz(self) -> int: + return int(self._handle.nnz) + + class _DictMod(enum.Enum): """State machine to keep track of modifications to a dictionary. @@ -519,12 +556,25 @@ def _write(self) -> None: # There were no changes (e.g., it's a read handle). Do nothing. return # Only try to get the writer if there are changes to be made. - meta = self.owner.writer.meta - for key, mod in self._mods.items(): - if mod in (_DictMod.ADDED, _DictMod.UPDATED): - meta[key] = self.cache[key] - if mod is _DictMod.DELETED: - del meta[key] + if isinstance(self.owner, DataFrameWrapper): + meta = self.owner.meta + for key, mod in self._mods.items(): + if mod in (_DictMod.ADDED, _DictMod.UPDATED): + set_metadata = self.owner._handle.set_metadata + val = self.cache[key] + if isinstance(val, str): + set_metadata(key, np.array([val], "S")) + else: + set_metadata(key, np.array([val])) + if mod is _DictMod.DELETED: + self.owner._handle.delete_metadata(key) + else: + meta = self.owner.writer.meta + for key, mod in self._mods.items(): + if mod in (_DictMod.ADDED, _DictMod.UPDATED): + meta[key] = self.cache[key] + if mod is _DictMod.DELETED: + del meta[key] # Temporary hack: When we flush writes, note that the cache # is back in sync with disk. self._mods.clear() diff --git a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index f0c0d8cadf..779d92911f 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -183,6 +183,7 @@ def _create_internal( the newly-created array, open for writing. """ tiledb.Array.create(uri, schema, ctx=context.tiledb_ctx) + handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) cls._set_create_metadata(handle) return handle diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index a444c41c4a..ad51b62646 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -40,8 +40,13 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): """ _wrapper_type: Type[_WrapperType_co] + """Class variable of the Wrapper class used to open this object type.""" + _reader_wrapper_type: Union[ - Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper] + Type[_WrapperType_co], + Type[_tdb_handles.DataFrameWrapper], + Type[_tdb_handles.DenseNDArrayWrapper], + Type[_tdb_handles.SparseNDArrayWrapper], ] __slots__ = ("_close_stack", "_handle") @@ -128,8 +133,6 @@ def __init__( self._handle = handle self._close_stack.enter_context(self._handle) - """Class variable of the Wrapper class used to open this object type.""" - @property def context(self) -> SOMATileDBContext: return self._handle.context @@ -267,7 +270,7 @@ def exists( if not isinstance(md_type, str): return False return md_type.lower() == cls.soma_type.lower() - except (SOMAError, tiledb.cc.TileDBError): + except (RuntimeError, SOMAError, tiledb.cc.TileDBError): return False @classmethod diff --git a/apis/python/src/tiledbsoma/common.cc b/apis/python/src/tiledbsoma/common.cc index 490c0eb7bb..9f35eec208 100644 --- a/apis/python/src/tiledbsoma/common.cc +++ b/apis/python/src/tiledbsoma/common.cc @@ -140,6 +140,17 @@ tiledb_datatype_t np_to_tdb_dtype(py::dtype type) { TPY_ERROR_LOC("could not handle numpy dtype"); } +bool is_tdb_str(tiledb_datatype_t type) { + switch (type) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + return true; + default: + return false; + } +} + /** * @brief Convert ArrayBuffers to Arrow table. * diff --git a/apis/python/src/tiledbsoma/common.h b/apis/python/src/tiledbsoma/common.h index 33eb27ec53..210b4ccbe5 100644 --- a/apis/python/src/tiledbsoma/common.h +++ b/apis/python/src/tiledbsoma/common.h @@ -34,6 +34,7 @@ namespace tiledbsoma { py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num); tiledb_datatype_t np_to_tdb_dtype(py::dtype type); +bool is_tdb_str(tiledb_datatype_t type); std::optional to_table( std::optional> buffers); diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index c91ac772c3..48e2906ac0 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1295,7 +1295,7 @@ def _write_dataframe_impl( ) try: - soma_df = _factory.open(df_uri, "w", soma_type=DataFrame, context=context) + soma_df = DataFrame.open(df_uri, "w", context=context) except DoesNotExistError: soma_df = DataFrame.create( df_uri, @@ -1987,7 +1987,7 @@ def _write_matrix_to_denseNDArray( def _read_nonempty_domain(arr: TileDBArray) -> Any: try: return arr._handle.non_empty_domain() - except SOMAError: + except (SOMAError, RuntimeError): # This means that we're open in write-only mode. # Reopen the array in read mode. pass diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index f99cf25621..1c980c4166 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -94,6 +94,26 @@ PYBIND11_MODULE(pytiledbsoma, m) { }, "Print TileDB internal statistics. Lifecycle: experimental."); + py::class_(m, "PlatformConfig") + .def(py::init<>()) + .def_readwrite( + "dataframe_dim_zstd_level", + &PlatformConfig::dataframe_dim_zstd_level) + .def_readwrite( + "sparse_nd_array_dim_zstd_level", + &PlatformConfig::sparse_nd_array_dim_zstd_level) + .def_readwrite("write_X_chunked", &PlatformConfig::write_X_chunked) + .def_readwrite("goal_chunk_nnz", &PlatformConfig::goal_chunk_nnz) + .def_readwrite("remote_cap_nbytes", &PlatformConfig::remote_cap_nbytes) + .def_readwrite("capacity", &PlatformConfig::capacity) + .def_readwrite("offsets_filters", &PlatformConfig::offsets_filters) + .def_readwrite("validity_filters", &PlatformConfig::validity_filters) + .def_readwrite("allows_duplicates", &PlatformConfig::allows_duplicates) + .def_readwrite("tile_order", &PlatformConfig::tile_order) + .def_readwrite("cell_order", &PlatformConfig::cell_order) + .def_readwrite( + "consolidate_and_vacuum", &PlatformConfig::consolidate_and_vacuum); + load_soma_context(m); load_soma_object(m); load_soma_array(m); diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 8254aca816..fe3d7e8a6f 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -39,6 +39,120 @@ namespace py = pybind11; using namespace py::literals; using namespace tiledbsoma; +void write(SOMAArray& array, py::handle py_batch) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + py_batch.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); + + auto attributes = array.tiledb_schema()->attributes(); + + for (auto i = 0; i < arrow_schema.n_children; ++i) { + auto sch_ = arrow_schema.children[i]; + auto arr_ = arrow_array.children[i]; + + const void* data; + uint64_t* offsets = nullptr; + uint8_t* validities = (uint8_t*)arr_->buffers[0]; + + if (arr_->n_buffers == 3) { + offsets = (uint64_t*)arr_->buffers[1]; + data = arr_->buffers[2]; + } else { + data = arr_->buffers[1]; + } + + // if (attributes.find(sch_->name) != attributes.end()) { + // auto enmr_name = AttributeExperimental::get_enumeration_name( + // *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); + + // if (enmr_name.has_value()) { + // auto dict = arr_->dictionary; + // if (!dict) { + // array.clear_column_data(); + // throw py::value_error( + // "Saw non-dictionary column passed to enumerated + // type"); + // } + + // const void* enmr_data; + // uint64_t* enmr_offsets = nullptr; + // if (dict->n_buffers == 3) { + // enmr_offsets = (uint64_t*)dict->buffers[1]; + // enmr_data = dict->buffers[2]; + // } else { + // enmr_data = dict->buffers[1]; + // } + + // if (dict->length != 0) { + // array.extend_enumeration( + // sch_->name, dict->length, enmr_data, enmr_offsets); + // } + // } + // } + + auto np = py::module::import("numpy"); + auto table_offset = arr_->offset; + auto data_size = tiledb::impl::type_size( + ArrowAdapter::to_tiledb_format(sch_->format)); + + if (offsets) { + offsets += table_offset; + } + if (validities) { + validities += table_offset; + } + + array.set_column_data( + sch_->name, + arr_->length, + (char*)data + table_offset * data_size, + offsets, + nullptr); + } + + try { + array.write(); + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } +} + +py::dict meta(SOMAArray& array) { + py::dict results; + + for (auto [key, val] : array.get_metadata()) { + auto [tdb_type, value_num, value] = val; + + if (tdb_type == TILEDB_STRING_UTF8 || tdb_type == TILEDB_STRING_ASCII) { + auto py_buf = py::array(py::dtype("|S1"), value_num, value); + auto res = py_buf.attr("tobytes")().attr("decode")("UTF-8"); + results[py::str(key)] = res; + } else { + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + auto res = py::array(value_type, value_num, value).attr("item")(0); + results[py::str(key)] = res; + } + } + return results; +} + +void set_metadata(SOMAArray& sr, const std::string& key, py::array value) { + tiledb_datatype_t value_type = np_to_tdb_dtype(value.dtype()); + + if (is_tdb_str(value_type) && value.size() > 1) + throw py::type_error("array/list of strings not supported"); + + py::buffer_info value_buffer = value.request(); + if (value_buffer.ndim != 1) + throw py::type_error("Only 1D Numpy arrays can be stored as metadata"); + + auto value_num = is_tdb_str(value_type) ? value.nbytes() : value.size(); + sr.set_metadata( + key, value_type, value_num, value_num > 0 ? value.data() : nullptr); +} + py::tuple get_enum(SOMAArray& sr, std::string attr_name) { auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); if (attr_to_enmrs.count(attr_name) == 0) @@ -121,6 +235,14 @@ void load_soma_array(py::module& m) { "platform_config"_a = py::dict(), "timestamp"_a = py::none()) + .def("__enter__", [](SOMAArray& reader) { return reader; }) + .def( + "__exit__", + [](SOMAArray& reader, + py::object exc_type, + py::object exc_value, + py::object traceback) { reader.close(); }) + .def( "set_condition", [](SOMAArray& reader, @@ -513,6 +635,8 @@ void load_soma_array(py::module& m) { return std::nullopt; }) + .def("write", write) + .def("nnz", &SOMAArray::nnz, py::call_guard()) .def_property_readonly("shape", &SOMAArray::shape) @@ -585,6 +709,7 @@ void load_soma_array(py::module& m) { "Unsupported dtype for nonempty domain."); } }) + .def( "domain", [](SOMAArray& reader, std::string name, py::dtype dtype) { @@ -633,37 +758,136 @@ void load_soma_array(py::module& m) { } }) - .def("set_metadata", &SOMAArray::set_metadata) - .def("delete_metadata", &SOMAArray::delete_metadata) + .def_property_readonly("dimension_names", &SOMAArray::dimension_names) + .def( - "get_metadata", - py::overload_cast(&SOMAArray::get_metadata)) - .def_property_readonly( - "meta", - [](SOMAArray& soma_dataframe) -> py::dict { - py::dict results; - - for (auto const& [key, val] : soma_dataframe.get_metadata()) { - tiledb_datatype_t tdb_type = std::get( - val); - uint32_t value_num = std::get(val); - const void* value = std::get(val); - - if (tdb_type == TILEDB_STRING_UTF8) { - results[py::str(key)] = py::str( - std::string((const char*)value, value_num)); - } else if (tdb_type == TILEDB_STRING_ASCII) { - results[py::str(key)] = py::bytes( - std::string((const char*)value, value_num)); - } else { - py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - results[py::str(key)] = py::array( - value_type, value_num, value); + "extend_enumeration", + [](SOMAArray& array, + std::string attr_name, + py::handle py_batch) -> py::array { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + py_batch.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto dict = arrow_array.dictionary; + const void* enmr_data; + uint64_t* enmr_offsets = nullptr; + if (dict->n_buffers == 3) { + enmr_offsets = (uint64_t*)dict->buffers[1]; + enmr_data = dict->buffers[2]; + } else { + enmr_data = dict->buffers[1]; + } + + if (dict->length != 0) { + auto new_enmr = array.extend_enumeration( + attr_name, dict->length, enmr_data, enmr_offsets); + + auto emdr_format = arrow_schema.dictionary->format; + switch (ArrowAdapter::to_tiledb_format(emdr_format)) { + case TILEDB_STRING_ASCII: + case TILEDB_CHAR: + case TILEDB_STRING_UTF8: { + auto result = new_enmr.as_vector(); + return py::array(py::cast(result)); + } + case TILEDB_BOOL: + case TILEDB_INT8: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int8"), + result.size(), + result.data()); + } + case TILEDB_UINT8: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint8"), + result.size(), + result.data()); + } + case TILEDB_INT16: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int16"), + result.size(), + result.data()); + } + case TILEDB_UINT16: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint16"), + result.size(), + result.data()); + } + case TILEDB_INT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int32"), + result.size(), + result.data()); + } + case TILEDB_UINT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint32"), + result.size(), + result.data()); + } + case TILEDB_INT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int64"), + result.size(), + result.data()); + } + case TILEDB_UINT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint64"), + result.size(), + result.data()); + } + case TILEDB_FLOAT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("float32"), + result.size(), + result.data()); + } + case TILEDB_FLOAT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("float64"), + result.size(), + result.data()); + } + default: + throw TileDBSOMAError( + "extend_enumeration: Unsupported dict " + "datatype"); } + + } else { + return py::array(); } - return results; }) + + .def("set_metadata", set_metadata) + + .def("delete_metadata", &SOMAArray::delete_metadata) + + .def( + "get_metadata", + py::overload_cast(&SOMAArray::get_metadata)) + + .def("meta", meta) + .def("has_metadata", &SOMAArray::has_metadata) + .def("metadata_num", &SOMAArray::metadata_num); } } // namespace libtiledbsomacpp \ No newline at end of file diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 507c48dd43..f7f6823450 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -49,6 +49,63 @@ using namespace tiledbsoma; void load_soma_dataframe(py::module& m) { py::class_(m, "SOMADataFrame") + .def_static( + "create", + [](std::string_view uri, + py::object py_schema, + std::vector index_columns_names, + py::object py_domains, + py::object py_extents, + std::shared_ptr context, + std::optional platform_config, + std::optional> timestamp) { + ArrowSchema schema; + uintptr_t schema_ptr = (uintptr_t)(&schema); + py_schema.attr("_export_to_c")(schema_ptr); + + for (int64_t sch_idx = 0; sch_idx < schema.n_children; + ++sch_idx) { + auto child = schema.children[sch_idx]; + auto metadata = py_schema.attr("metadata"); + if (py::hasattr(metadata, "get")) { + auto val = metadata.attr("get")( + py::str(child->name).attr("encode")("utf-8")); + + if (val != py::none() && + val.cast() == "nullable") { + child->flags &= ARROW_FLAG_NULLABLE; + } else { + child->flags &= ~ARROW_FLAG_NULLABLE; + } + } + } + + ArrowArray domains; + uintptr_t domains_ptr = (uintptr_t)(&domains); + py_domains.attr("_export_to_c")(domains_ptr); + + ArrowArray extents; + uintptr_t extents_ptr = (uintptr_t)(&extents); + py_extents.attr("_export_to_c")(extents_ptr); + + try { + SOMADataFrame::create( + uri, + std::make_shared(schema), + ColumnIndexInfo( + index_columns_names, + std::make_shared(domains), + std::make_shared(extents)), + context, + platform_config, + timestamp); + } catch (const std::out_of_range& e) { + throw py::type_error(e.what()); + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } + }) + .def_static( "open", py::overload_cast< diff --git a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc index 3f99121174..06b7b257f0 100644 --- a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc @@ -61,7 +61,7 @@ void load_soma_dense_ndarray(py::module& m) { &SOMADenseNDArray::open), "uri"_a, "mode"_a, - "ctx"_a, + "context"_a, py::kw_only(), "column_names"_a = py::none(), "result_order"_a = ResultOrder::automatic, diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 3bdb3647f3..e9d821d0d2 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -54,28 +54,35 @@ void load_soma_object(py::module& m) { "open", [](std::string_view uri, OpenMode mode, - std::shared_ptr ctx, + std::shared_ptr context, std::optional> timestamp) -> py::object { try { - auto obj = SOMAObject::open(uri, mode, ctx, timestamp); - if (obj->type() == "SOMADataFrame") + auto obj = SOMAObject::open(uri, mode, context, timestamp); + auto soma_type = obj->type(); + if (soma_type == "SOMADataFrame") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMASparseNDArray") + else if (soma_type == "SOMASparseNDArray") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMADenseNDArray") + else if (soma_type == "SOMADenseNDArray") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMACollection") + else if (soma_type == "SOMACollection") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAExperiment") + else if (soma_type == "SOMAExperiment") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAMeasurement") + else if (soma_type == "SOMAMeasurement") return py::cast(dynamic_cast(*obj)); return py::none(); } catch (...) { return py::none(); } - }) + }, + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "timestamp"_a = py::none()) + .def_property_readonly("type", &SOMAObject::type); }; } // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc index d4f1b429ba..a6dce317dd 100644 --- a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc @@ -61,7 +61,7 @@ void load_soma_sparse_ndarray(py::module& m) { &SOMASparseNDArray::open), "uri"_a, "mode"_a, - "ctx"_a, + "context"_a, py::kw_only(), "column_names"_a = py::none(), "result_order"_a = ResultOrder::automatic, diff --git a/apis/python/tests/test_collection.py b/apis/python/tests/test_collection.py index 39a6918ff8..47f5f5a030 100644 --- a/apis/python/tests/test_collection.py +++ b/apis/python/tests/test_collection.py @@ -486,6 +486,7 @@ def test_issue919(tmp_path): expt.add_new_dataframe( "df", schema=schema, index_column_names=["soma_joinid"] ) + assert expt["df"].tiledb_timestamp_ms == 100 with soma.Collection.open(uri, context=context) as c: assert "df" in c["expt"] and "causes_bug" in c["expt"] diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 9d21b08f01..e9d2d92cd2 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -115,6 +115,14 @@ def test_dataframe(tmp_path, arrow_schema): assert sdf.count == 5 assert len(sdf) == 5 + # Ensure read mode uses clib object + with soma.DataFrame.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADataFrame) + + # Ensure write mode uses clib object + with soma.DataFrame.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADataFrame) + def test_dataframe_with_float_dim(tmp_path, arrow_schema): sdf = soma.DataFrame.create( @@ -1151,9 +1159,7 @@ def test_extend_enumerations(tmp_path): with soma.open(str(tmp_path)) as soma_dataframe: df = soma_dataframe.read().concat().to_pandas() for c in df: - # TODO bytes are being set to ascii - requires a fix in tiledb-py - # assert df[c].dtype == pandas_df[c].dtype - assert df[c].dtype.kind == pandas_df[c].dtype.kind + assert df[c].dtype == pandas_df[c].dtype if df[c].dtype == "category": assert df[c].cat.categories.dtype == pandas_df[c].cat.categories.dtype @@ -1328,7 +1334,7 @@ def test_enum_extend_past_numerical_limit(tmp_path): soma.DataFrame.create(uri, schema=schema).close() n_elem = 132 - n_cats = 128 + n_cats = 127 df1 = pd.DataFrame( { "soma_joinid": pd.Series(np.arange(n_elem), dtype=np.int64), @@ -1355,7 +1361,7 @@ def test_enum_extend_past_numerical_limit(tmp_path): # cannot add additional categories as already maxed out earlier tbl = pa.Table.from_pandas(df2, preserve_index=False) - with pytest.raises(ValueError): + with pytest.raises((RuntimeError, soma.SOMAError)): with soma.open(uri, mode="w") as A: A.write(tbl) diff --git a/apis/python/tests/test_dataframe_index_columns.py b/apis/python/tests/test_dataframe_index_columns.py index 3348572cbd..05555f3df9 100644 --- a/apis/python/tests/test_dataframe_index_columns.py +++ b/apis/python/tests/test_dataframe_index_columns.py @@ -1730,55 +1730,55 @@ def test_types_create_errors( "int32-py-list-shaped-out-of-bounds", ["int32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "int16-py-list-shaped-out-of-bounds", ["int16"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "int8-py-list-shaped-out-of-bounds", ["int8"], [[10, 20]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint64-py-list-shaped-out-of-bounds", ["uint64"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint32-py-list-shaped-out-of-bounds", ["uint32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint32-py-list-shaped-out-of-bounds", ["uint32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint8-py-list-shaped-out-of-bounds", ["uint8"], [[10, 20]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "float32-py-list-shaped-out-of-bounds", ["float32"], [[100.0, 200.0]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "float64-py-list-shaped-out-of-bounds", ["float64"], [[100.0, 200.0]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], ], ) diff --git a/apis/python/tests/test_dense_nd_array.py b/apis/python/tests/test_dense_nd_array.py index 47d363058b..665a78fb71 100644 --- a/apis/python/tests/test_dense_nd_array.py +++ b/apis/python/tests/test_dense_nd_array.py @@ -49,6 +49,14 @@ def test_dense_nd_array_create_ok( with tiledb.open(tmp_path.as_posix()) as A: assert not A.schema.sparse + # Ensure read mode uses clib object + with soma.DenseNDArray.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADenseNDArray) + + # Ensure write mode uses Python object + with soma.DenseNDArray.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, tiledb.Array) + @pytest.mark.parametrize("shape", [(10,)]) @pytest.mark.parametrize("element_type", NDARRAY_ARROW_TYPES_NOT_SUPPORTED) diff --git a/apis/python/tests/test_io.py b/apis/python/tests/test_io.py index e696ffe927..61a5861952 100644 --- a/apis/python/tests/test_io.py +++ b/apis/python/tests/test_io.py @@ -171,3 +171,4 @@ def test_write_arrow_table(tmp_path, num_rows, cap_nbytes): with soma.DataFrame.open(uri) as sdf: pdf = sdf.read().concat().to_pandas() assert list(pdf["foo"]) == pydict["foo"] + assert list(pdf["bar"]) == pydict["bar"] diff --git a/apis/python/tests/test_platform_config.py b/apis/python/tests/test_platform_config.py index 680936c1e0..08018f3ba4 100644 --- a/apis/python/tests/test_platform_config.py +++ b/apis/python/tests/test_platform_config.py @@ -26,7 +26,20 @@ def adata(h5ad_file): return anndata.read_h5ad(h5ad_file) +@pytest.mark.skip(reason="No longer return ArraySchema - see note in test") def test_platform_config(adata): + # TODO as we remove usage of TileDB-Py in favor of ArrowSchema, we + # need a new method to get which filters have applied to the column + # rather than grabbing it from the ArraySchema. One consideration + # would be to store TileDB information in JSON format as a field in + # the ArraySchema metadata very similar to how Pandas stores information + # within pa.Schema.pandas_metadata. This could hold not only which + # filters have been applied to the column, but other info that cannot + # be "directly" stored in the ArrowSchema such as whether the column + # is a TileDB attribute or dimension, whether this represent a dense + # or sparse array, etc. This may be as easy as simply copying the + # platform_config by calling pa.Schema.with_metadata(platform_config). + # Set up anndata input path and tiledb-group output path original = adata.copy() with tempfile.TemporaryDirectory() as output_path: diff --git a/apis/python/tests/test_query_condition.py b/apis/python/tests/test_query_condition.py index 3fad4f47fa..53b8c494c6 100644 --- a/apis/python/tests/test_query_condition.py +++ b/apis/python/tests/test_query_condition.py @@ -3,10 +3,8 @@ import os import pytest -import tiledb import tiledbsoma.pytiledbsoma as clib -from tiledbsoma._arrow_types import tiledb_schema_to_arrow from tiledbsoma._exception import SOMAError from tiledbsoma._query_condition import QueryCondition @@ -30,8 +28,7 @@ def pandas_query(uri, condition): def soma_query(uri, condition): qc = QueryCondition(condition) sr = clib.SOMAArray(uri) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -45,7 +42,7 @@ def soma_query(uri, condition): "n_genes > 500", # int 'louvain == "NK cells"', # string "percent_mito > 0.02", # float - "is_b_cell == True", # bool + "is_b_cell == True or is_b_cell == False", # bool # compare_op "n_genes == 480", "n_genes != 480", @@ -74,11 +71,11 @@ def test_query_condition(condition): pandas = pandas_query(uri, condition) soma_arrow = soma_query(uri, condition) assert len(pandas.index) == soma_arrow.num_rows - assert ( - (pandas.reset_index(drop=True) == soma_arrow.to_pandas().reset_index(drop=True)) - .all() - .all() - ) + + for name in pandas: + expected = pandas[name].reset_index(drop=True) + actual = soma_arrow[name].to_pandas().reset_index(drop=True) + assert (expected == actual).all() @pytest.mark.parametrize( @@ -110,8 +107,7 @@ def test_query_condition_select_columns(): sr = clib.SOMAArray(uri, column_names=["n_genes"]) qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -124,10 +120,9 @@ def test_query_condition_all_columns(): condition = "percent_mito > 0.02" qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -140,10 +135,9 @@ def test_query_condition_reset(): condition = "percent_mito > 0.02" qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -155,7 +149,7 @@ def test_query_condition_reset(): condition = "percent_mito < 0.02" qc = QueryCondition(condition) sr.reset(column_names=["percent_mito"]) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() @@ -218,17 +212,16 @@ def test_parsing_error_conditions(malformed_condition): def test_eval_error_conditions(malformed_condition): """Conditions which should not evaluate (but WILL parse)""" uri = os.path.join(SOMA_URI, "obs") - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) qc = QueryCondition(malformed_condition) with pytest.raises(SOMAError): sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) with pytest.raises(SOMAError): # test function directly for codecov - qc.init_query_condition(schema, []) - qc.init_query_condition(schema, ["bad_query_attr"]) + qc.init_query_condition(sr.schema, []) + qc.init_query_condition(sr.schema, ["bad_query_attr"]) if __name__ == "__main__": diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index d31b64582f..cb27dcfddd 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -55,6 +55,14 @@ def test_sparse_nd_array_create_ok( assert a.schema.field(f"soma_dim_{d}").type == pa.int64() assert a.schema.field("soma_data").type == element_type + # Ensure read mode uses clib object + with soma.SparseNDArray.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMASparseNDArray) + + # Ensure write mode uses Python object + with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, tiledb.Array) + @pytest.mark.parametrize("shape", [(10,)]) @pytest.mark.parametrize("element_type", NDARRAY_ARROW_TYPES_NOT_SUPPORTED) diff --git a/libtiledbsoma/src/soma/array_buffers.h b/libtiledbsoma/src/soma/array_buffers.h index 202c306061..3ab142ea6e 100644 --- a/libtiledbsoma/src/soma/array_buffers.h +++ b/libtiledbsoma/src/soma/array_buffers.h @@ -47,7 +47,7 @@ using namespace tiledb; class ArrayBuffers { public: ArrayBuffers() = default; - ArrayBuffers(const ArrayBuffers&) = delete; + ArrayBuffers(const ArrayBuffers&) = default; ArrayBuffers(ArrayBuffers&&) = default; ~ArrayBuffers() = default; diff --git a/libtiledbsoma/src/soma/column_buffer.cc b/libtiledbsoma/src/soma/column_buffer.cc index 00e5e49af3..572378517b 100644 --- a/libtiledbsoma/src/soma/column_buffer.cc +++ b/libtiledbsoma/src/soma/column_buffer.cc @@ -69,7 +69,7 @@ std::shared_ptr ColumnBuffer::create( } return ColumnBuffer::alloc( - schema, + schema.context().config(), name_str, type, is_var, @@ -91,7 +91,13 @@ std::shared_ptr ColumnBuffer::create( } return ColumnBuffer::alloc( - schema, name_str, type, is_var, false, std::nullopt, false); + schema.context().config(), + name_str, + type, + is_var, + false, + std::nullopt, + false); } throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str); @@ -211,7 +217,7 @@ std::string_view ColumnBuffer::string_view(uint64_t index) { //=================================================================== std::shared_ptr ColumnBuffer::alloc( - ArraySchema schema, + Config config, std::string_view name, tiledb_datatype_t type, bool is_var, @@ -221,7 +227,6 @@ std::shared_ptr ColumnBuffer::alloc( // Set number of bytes for the data buffer. Override with a value from // the config if present. auto num_bytes = DEFAULT_ALLOC_BYTES; - auto config = schema.context().config(); if (config.contains(CONFIG_KEY_INIT_BYTES)) { auto value_str = config.get(CONFIG_KEY_INIT_BYTES); try { @@ -235,10 +240,10 @@ std::shared_ptr ColumnBuffer::alloc( } } - bool is_dense = schema.array_type() == TILEDB_DENSE; - if (is_dense) { - // TODO: Handle dense arrays similar to tiledb python module - } + // bool is_dense = schema.array_type() == TILEDB_DENSE; + // if (is_dense) { + // // TODO: Handle dense arrays similar to tiledb python module + // } // For variable length column types, allocate an extra num_bytes to hold // offset values. The number of cells is the set by the size of the diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 4a727f5c37..3f88cfa625 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -38,7 +38,9 @@ #include #include +#include "../utils/arrow_adapter.h" #include "../utils/common.h" +#include "soma_context.h" #include "span/span.hpp" namespace tiledbsoma { @@ -69,28 +71,6 @@ class ColumnBuffer { static std::shared_ptr create( std::shared_ptr array, std::string_view name); - /** - * @brief Create a ColumnBuffer from a schema, column name, and data. - * - * @param array TileDB array - * @param name TileDB dimension or attribute name - * @param data Data to set in buffer - * @return ColumnBuffer - */ - template - static std::shared_ptr create( - std::shared_ptr array, - std::string_view name, - std::vector data) { - auto column_buff = ColumnBuffer::create(array, name); - column_buff->num_cells_ = data.size(); - column_buff->data_.resize(data.size()); - column_buff->data_.assign( - reinterpret_cast(data.data()), - reinterpret_cast(data.data() + data.size())); - return column_buff; - } - /** * @brief Convert a bytemap to a bitmap in place. * @@ -136,6 +116,45 @@ class ColumnBuffer { */ void attach(Query& query); + /** + * @brief Set the ColumnBuffer's data. + * + * @param data pointer to the beginning of the data to write + * @param num_elems the number of elements in the column + */ + void set_data( + uint64_t num_elems, + const void* data, + uint64_t* offsets = nullptr, + uint8_t* validity = nullptr) { + num_cells_ = num_elems; + + if (offsets != nullptr) { + auto num_offsets = num_elems + 1; + offsets_.resize(num_offsets); + offsets_.assign( + (uint64_t*)offsets, (uint64_t*)offsets + num_offsets); + + data_size_ = offsets_[num_offsets - 1]; + data_.resize(data_size_); + data_.assign((std::byte*)data, (std::byte*)data + data_size_); + } else { + data_size_ = num_elems; + data_.resize(num_elems); + data_.assign( + (std::byte*)data, (std::byte*)data + num_elems * type_size_); + } + + if (is_nullable_) { + if (validity != nullptr) { + validity_.assign(validity, validity + num_elems); + } else { + validity_.resize(num_elems); + std::fill(validity_.begin(), validity_.end(), 1); + } + } + } + /** * @brief Size num_cells_ to match the read query results. * @@ -152,6 +171,15 @@ class ColumnBuffer { return num_cells_; } + /** + * @brief Return size of the data buffer. + * + * @return uint64_t + */ + uint64_t data_size() { + return data_size_; + } + /** * @brief Return a view of the ColumnBuffer data. * @@ -342,7 +370,7 @@ class ColumnBuffer { /** * @brief Allocate and return a ColumnBuffer. * - * @param array TileDB array + * @param config TileDB Config * @param name Column name * @param type TileDB datatype * @param is_var True if variable length data @@ -352,7 +380,7 @@ class ColumnBuffer { * @return ColumnBuffer */ static std::shared_ptr alloc( - ArraySchema schema, + Config config, std::string_view name, tiledb_datatype_t type, bool is_var, @@ -370,6 +398,9 @@ class ColumnBuffer { // Data type of the column from the schema. tiledb_datatype_t type_; + // Data size which is calculated different for var vs non-var + uint64_t data_size_; + // Bytes per element. uint64_t type_size_; diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index ca840871ac..b0d18f2f45 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -96,6 +96,128 @@ void ManagedQuery::select_columns( } } +void ManagedQuery::set_column_data( + std::shared_ptr column_buffer) { + auto column_name = std::string(column_buffer->name()); + bool has_attr = schema_->has_attribute(column_name); + bool is_sparse = array_->schema().array_type() == TILEDB_SPARSE; + + if (is_sparse) { + auto data = column_buffer->data(); + query_->set_data_buffer( + column_name, (void*)data.data(), column_buffer->data_size()); + if (column_buffer->is_var()) { + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), offsets.size()); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), validity.size()); + } + } else { + if (has_attr) { + auto data = column_buffer->data(); + query_->set_data_buffer( + column_name, (void*)data.data(), column_buffer->data_size()); + if (column_buffer->is_var()) { + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), offsets.size()); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), validity.size()); + } + } else { + switch (column_buffer->type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_BLOB: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT64: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + default: + break; + } + query_->set_subarray(*subarray_); + } + } +} + void ManagedQuery::setup_read() { // If the query is complete, return so we do not submit it again auto status = query_->query_status(); @@ -150,6 +272,7 @@ void ManagedQuery::setup_read() { void ManagedQuery::submit_write() { query_->submit(); + query_->finalize(); } void ManagedQuery::submit_read() { @@ -237,5 +360,4 @@ void ManagedQuery::check_column_name(const std::string& name) { name)); } } - }; // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index b9a9238ba1..9e1b1d03b2 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -212,98 +212,7 @@ class ManagedQuery { * @param buff Buffer array pointer with elements of the column type. * @param nelements Number of array elements in buffer */ - void set_column_data( - std::string column_name, std::shared_ptr column_buffer) { - if (array_->schema().array_type() == TILEDB_SPARSE || - schema_->has_attribute(column_name)) { - auto data = column_buffer->data(); - query_->set_data_buffer( - column_name, (void*)data.data(), data.size_bytes()); - } else { - switch (column_buffer->type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT64: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - default: - break; - } - query_->set_subarray(*subarray_); - } - } + void set_column_data(std::shared_ptr buffer); /** * @brief Configure query and allocate result buffers for reads. diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 081b8fd4ad..f25ebb6710 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -45,15 +45,36 @@ void SOMAArray::create( std::shared_ptr ctx, std::string_view uri, ArraySchema schema, - std::string soma_type) { + std::string soma_type, + std::optional timestamp) { Array::create(std::string(uri), schema); - auto array = Array(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); - array.put_metadata( - "soma_object_type", + + std::unique_ptr array; + if (timestamp) { + array = std::make_unique( + *ctx->tiledb_ctx(), + std::string(uri), + TILEDB_WRITE, + TemporalPolicy( + TimestampStartEnd, timestamp->first, timestamp->second)); + } else { + array = std::make_unique( + *ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + } + + array->put_metadata( + SOMA_OBJECT_TYPE_KEY, TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); - array.close(); + + array->put_metadata( + ENCODING_VERSION_KEY, + TILEDB_STRING_UTF8, + static_cast(ENCODING_VERSION_VAL.length()), + ENCODING_VERSION_VAL.c_str()); + + array->close(); } std::unique_ptr SOMAArray::open( @@ -64,7 +85,7 @@ std::unique_ptr SOMAArray::open( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) { + std::optional timestamp) { LOG_DEBUG( fmt::format("[SOMAArray] static method 'cfg' opening array '{}'", uri)); return std::make_unique( @@ -86,7 +107,7 @@ std::unique_ptr SOMAArray::open( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) { + std::optional timestamp) { LOG_DEBUG( fmt::format("[SOMAArray] static method 'ctx' opening array '{}'", uri)); return std::make_unique( @@ -112,7 +133,7 @@ SOMAArray::SOMAArray( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) + std::optional timestamp) : uri_(util::rstrip_uri(uri)) , result_order_(result_order) , timestamp_(timestamp) { @@ -130,7 +151,7 @@ SOMAArray::SOMAArray( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) + std::optional timestamp) : uri_(util::rstrip_uri(uri)) , ctx_(ctx) , result_order_(result_order) @@ -141,19 +162,25 @@ SOMAArray::SOMAArray( } void SOMAArray::fill_metadata_cache() { - std::shared_ptr array; if (arr_->query_type() == TILEDB_WRITE) { - array = std::make_shared(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); + meta_cache_arr_ = std::make_shared( + *ctx_->tiledb_ctx(), + uri_, + TILEDB_READ, + TemporalPolicy( + TimestampStartEnd, timestamp()->first, timestamp()->second)); } else { - array = arr_; + meta_cache_arr_ = arr_; } - for (uint64_t idx = 0; idx < array->metadata_num(); ++idx) { + metadata_.clear(); + + for (uint64_t idx = 0; idx < meta_cache_arr_->metadata_num(); ++idx) { std::string key; tiledb_datatype_t value_type; uint32_t value_num; const void* value; - array->get_metadata_from_index( + meta_cache_arr_->get_metadata_from_index( idx, &key, &value_type, &value_num, &value); MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); @@ -169,26 +196,22 @@ std::shared_ptr SOMAArray::ctx() { return ctx_; }; -void SOMAArray::open( - OpenMode mode, std::optional> timestamp) { - auto tdb_mode = mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE; - arr_->open(tdb_mode); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - arr_->set_open_timestamp_start(timestamp->first); - arr_->set_open_timestamp_end(timestamp->second); - arr_->close(); - arr_->open(tdb_mode); - } +void SOMAArray::open(OpenMode mode, std::optional timestamp) { + timestamp_ = timestamp; + + validate(mode, name_, timestamp); reset(column_names(), batch_size_, result_order_); + fill_metadata_cache(); } void SOMAArray::close() { + if (arr_->query_type() == TILEDB_WRITE) + meta_cache_arr_->close(); + // Close the array through the managed query to ensure any pending queries // are completed. mq_->close(); + metadata_.clear(); } void SOMAArray::reset( @@ -254,16 +277,168 @@ std::optional> SOMAArray::read_next() { return mq_->results(); } -void SOMAArray::write(std::shared_ptr buffers) { +Enumeration SOMAArray::extend_enumeration( + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets) { + auto enmr = ArrayExperimental::get_enumeration( + *ctx_->tiledb_ctx(), *arr_, std::string(name)); + + uint64_t max_capacity; + switch (tiledb_schema()->attribute(std::string(name)).type()) { + case TILEDB_INT8: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT8: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT16: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT16: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT32: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT32: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT64: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT64: + max_capacity = std::numeric_limits::max(); + break; + default: + throw TileDBSOMAError( + "Saw invalid enumeration index type when trying to extend " + "enumeration"); + } + + switch (enmr.type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: { + std::vector offsets_v( + (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); + std::string data_v( + (char*)data, (char*)data + offsets_v[offsets_v.size() - 1]); + std::vector enums_in_write; + + for (size_t offset_idx = 0; offset_idx < offsets_v.size() - 1; + ++offset_idx) { + auto beg = offsets_v[offset_idx]; + auto sz = offsets_v[offset_idx + 1] - beg; + enums_in_write.push_back(data_v.substr(beg, sz)); + } + + std::vector extend_values; + auto enums_existing = enmr.as_vector(); + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + // Check that we extend the enumeration values without + // overflowing + auto free_capacity = max_capacity - enums_existing.size(); + if (free_capacity < extend_values.size()) { + throw TileDBSOMAError( + "Cannot extend enumeration; reached maximum capacity"); + } + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + return enmr.extend(extend_values); + } + + return enmr; + } + case TILEDB_BOOL: + case TILEDB_INT8: + return SOMAArray::_extend_value_helper( + (int8_t*)data, num_elems, enmr, max_capacity); + case TILEDB_UINT8: + return SOMAArray::_extend_value_helper( + (uint8_t*)data, num_elems, enmr, max_capacity); + case TILEDB_INT16: + return SOMAArray::_extend_value_helper( + (int16_t*)data, num_elems, enmr, max_capacity); + case TILEDB_UINT16: + return SOMAArray::_extend_value_helper( + (uint16_t*)data, num_elems, enmr, max_capacity); + case TILEDB_INT32: + return SOMAArray::_extend_value_helper( + (int32_t*)data, num_elems, enmr, max_capacity); + case TILEDB_UINT32: + return SOMAArray::_extend_value_helper( + (uint32_t*)data, num_elems, enmr, max_capacity); + case TILEDB_INT64: + return SOMAArray::_extend_value_helper( + (int64_t*)data, num_elems, enmr, max_capacity); + case TILEDB_UINT64: + return SOMAArray::_extend_value_helper( + (uint64_t*)data, num_elems, enmr, max_capacity); + case TILEDB_FLOAT32: + return SOMAArray::_extend_value_helper( + (float*)data, num_elems, enmr, max_capacity); + case TILEDB_FLOAT64: + return SOMAArray::_extend_value_helper( + (double*)data, num_elems, enmr, max_capacity); + default: + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB dict datatype: {} ", + tiledb::impl::type_to_str(enmr.type()))); + } +} + +void SOMAArray::set_column_data( + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets, + uint8_t* validity) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } - for (auto col_name : buffers->names()) { - mq_->set_column_data(col_name, buffers->at(col_name)); - } + // Create the array_buffer_ as necessary + if (array_buffer_ == nullptr) + array_buffer_ = std::make_shared(); + + // Create a ColumnBuffer object instead of passing it in as an argument to + // `set_column_data` because ColumnBuffer::create requires a TileDB Array + // argument which should remain a private member of SOMAArray + auto column = ColumnBuffer::create(arr_, name); + column->set_data(num_elems, data, offsets, validity); + + // Keep the ColumnBuffer alive by attaching it to the ArrayBuffers class + // member. Otherwise, the data held by the ColumnBuffer will be garbage + // collected before it is submitted to the write query + array_buffer_->emplace(std::string(name), column); + + mq_->set_column_data(column); +}; + +void SOMAArray::clear_column_data() { + array_buffer_ = nullptr; +} +void SOMAArray::write() { + if (mq_->query_type() != TILEDB_WRITE) { + throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); + } mq_->submit_write(); + + mq_->reset(); + clear_column_data(); } uint64_t SOMAArray::nnz() { @@ -511,35 +686,41 @@ void SOMAArray::set_metadata( tiledb_datatype_t value_type, uint32_t value_num, const void* value) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be modified."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); arr_->put_metadata(key, value_type, value_num, value); + MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); } void SOMAArray::delete_metadata(const std::string& key) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be deleted."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); + arr_->delete_metadata(key); metadata_.erase(key); } -std::map SOMAArray::get_metadata() { - return metadata_; -} - std::optional SOMAArray::get_metadata(const std::string& key) { - if (metadata_.count(key) == 0) { + if (metadata_.count(key) == 0) return std::nullopt; - } + return metadata_[key]; } +std::map SOMAArray::get_metadata() { + return metadata_; +} + bool SOMAArray::has_metadata(const std::string& key) { return metadata_.count(key) != 0; } @@ -551,26 +732,21 @@ uint64_t SOMAArray::metadata_num() const { void SOMAArray::validate( OpenMode mode, std::string_view name, - std::optional> timestamp) { + std::optional timestamp) { // Validate parameters auto tdb_mode = mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE; try { LOG_DEBUG(fmt::format("[SOMAArray] opening array '{}'", uri_)); - arr_ = std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - arr_->set_open_timestamp_start(timestamp->first); - arr_->set_open_timestamp_end(timestamp->second); - arr_->close(); - arr_->open(tdb_mode); - LOG_DEBUG(fmt::format( - "[SOMAArray] timestamp_start = {}", - arr_->open_timestamp_start())); - LOG_DEBUG(fmt::format( - "[SOMAArray] timestamp_end = {}", arr_->open_timestamp_end())); + arr_ = std::make_shared( + *ctx_->tiledb_ctx(), + uri_, + tdb_mode, + TemporalPolicy( + TimestampStartEnd, timestamp->first, timestamp->second)); + } else { + arr_ = std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); } LOG_TRACE(fmt::format("[SOMAArray] loading enumerations")); ArrayExperimental::load_all_enumerations( @@ -582,7 +758,7 @@ void SOMAArray::validate( } } -std::optional> SOMAArray::timestamp() { +std::optional SOMAArray::timestamp() { return timestamp_; } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index b4c17ee6c4..2e56a2cf64 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -67,7 +67,8 @@ class SOMAArray : public SOMAObject { std::shared_ptr ctx, std::string_view uri, ArraySchema schema, - std::string soma_type); + std::string soma_type, + std::optional timestamp = std::nullopt); /** * @brief Open an array at the specified URI and return SOMAArray @@ -92,7 +93,7 @@ class SOMAArray : public SOMAObject { std::vector column_names = {}, std::string_view batch_size = "auto", ResultOrder result_order = ResultOrder::automatic, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); /** * @brief Open an array at the specified URI and return SOMAArray @@ -117,7 +118,7 @@ class SOMAArray : public SOMAObject { std::vector column_names = {}, std::string_view batch_size = "auto", ResultOrder result_order = ResultOrder::automatic, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); //=================================================================== //= public non-static @@ -143,7 +144,7 @@ class SOMAArray : public SOMAObject { std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); /** * @brief Construct a new SOMAArray object @@ -165,7 +166,7 @@ class SOMAArray : public SOMAObject { std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); SOMAArray(const SOMAArray& other) : uri_(other.uri_) @@ -178,8 +179,11 @@ class SOMAArray : public SOMAObject { , mq_(std::make_unique( other.arr_, other.ctx_->tiledb_ctx(), other.name_)) , arr_(other.arr_) + , meta_cache_arr_(other.meta_cache_arr_) , first_read_next_(other.first_read_next_) - , submitted_(other.submitted_) { + , submitted_(other.submitted_) + , array_buffer_(other.array_buffer_) { + fill_metadata_cache(); } SOMAArray(SOMAArray&&) = default; @@ -212,8 +216,7 @@ class SOMAArray : public SOMAObject { * @param timestamp Timestamp */ void open( - OpenMode mode, - std::optional> timestamp = std::nullopt); + OpenMode mode, std::optional timestamp = std::nullopt); /** * Close the SOMAArray object. @@ -404,18 +407,20 @@ class SOMAArray : public SOMAObject { */ std::optional> read_next(); - /** - * @brief Set the write data for a column. - * - * @param column_name Column name - * @param buff Buffer array pointer with elements of the column type. - * @param nelements Number of array elements in buffer - */ + Enumeration extend_enumeration( + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets); + void set_column_data( - std::string_view column_name, - std::shared_ptr column_buffer) { - mq_->set_column_data(std::string(column_name), column_buffer); - } + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets = nullptr, + uint8_t* validity = nullptr); + + void clear_column_data(); /** * @brief Write ArrayBuffers data to the array. @@ -439,7 +444,7 @@ class SOMAArray : public SOMAObject { * * @param buffers The ArrayBuffers to write to the array */ - void write(std::shared_ptr buffers); + void write(); /** * @brief Check if the query is complete. @@ -509,9 +514,9 @@ class SOMAArray : public SOMAObject { /** * @brief Get the Arrow schema of the array. * - * @return std::unique_ptr Schema + * @return std::shared_ptr Schema */ - std::unique_ptr arrow_schema() const { + std::shared_ptr arrow_schema() const { return ArrowAdapter::arrow_schema_from_tiledb_array( ctx_->tiledb_ctx(), arr_); } @@ -537,7 +542,11 @@ class SOMAArray : public SOMAObject { */ template std::pair non_empty_domain(const std::string& name) { - return arr_->non_empty_domain(name); + try { + return arr_->non_empty_domain(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } } /** @@ -547,7 +556,11 @@ class SOMAArray : public SOMAObject { */ std::pair non_empty_domain_var( const std::string& name) { - return arr_->non_empty_domain_var(name); + try { + return arr_->non_empty_domain_var(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } } /** @@ -683,21 +696,48 @@ class SOMAArray : public SOMAObject { void validate( OpenMode mode, std::string_view name, - std::optional> timestamp); + std::optional timestamp); /** * Return optional timestamp pair SOMAArray was opened with. */ - std::optional> timestamp(); + std::optional timestamp(); private: //=================================================================== //= private non-static //=================================================================== - /** - * Fills the metadata cache upon opening the array. - */ + template + Enumeration _extend_value_helper( + T* data, uint64_t num_elems, Enumeration enmr, uint64_t max_capacity) { + std::vector enums_in_write((T*)data, (T*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), enums_existing.end(), enum_val) == + enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + auto free_capacity = max_capacity - enums_existing.size(); + if (free_capacity < extend_values.size()) { + throw TileDBSOMAError( + "Cannot extend enumeration; reached maximum capacity"); + } + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + return enmr.extend(extend_values); + } + + return enmr; + } + + // Fills the metadata cache upon opening the array. void fill_metadata_cache(); // SOMAArray URI @@ -719,7 +759,7 @@ class SOMAArray : public SOMAObject { std::map metadata_; // Read timestamp range (start, end) - std::optional> timestamp_; + std::optional timestamp_; // Managed query for the array std::unique_ptr mq_; @@ -727,6 +767,11 @@ class SOMAArray : public SOMAObject { // Array associated with mq_ std::shared_ptr arr_; + // Array associated with metadata_. Metadata values need to be accessible in + // write mode as well. We need to keep this read-mode array alive in order + // for the metadata value pointers in the cache to be accessible + std::shared_ptr meta_cache_arr_; + // True if this is the first call to read_next() bool first_read_next_ = true; @@ -735,6 +780,9 @@ class SOMAArray : public SOMAObject { // Unoptimized method for computing nnz() (issue `count_cells` query) uint64_t nnz_slow(); + + // ArrayBuffers to hold ColumnBuffers alive when submitting to write query + std::shared_ptr array_buffer_ = nullptr; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index 9fa9c654fb..dff721027f 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -41,10 +41,11 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMACollection::create( - std::string_view uri, std::shared_ptr ctx) { - SOMAGroup::create(ctx, uri, "SOMACollection"); - return SOMACollection::open(uri, OpenMode::read, ctx); +void SOMACollection::create( + std::string_view uri, + std::shared_ptr ctx, + std::optional timestamp) { + SOMAGroup::create(ctx, uri, "SOMACollection", timestamp); } std::unique_ptr SOMACollection::open( @@ -96,7 +97,9 @@ std::shared_ptr SOMACollection::add_new_collection( std::string_view uri, URIType uri_type, std::shared_ptr ctx) { - std::shared_ptr member = SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx); + std::shared_ptr member = SOMAExperiment::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -107,9 +110,12 @@ std::shared_ptr SOMACollection::add_new_experiment( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - std::shared_ptr member = SOMAExperiment::create( - uri, schema, ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMAExperiment::create(uri, schema, index_columns, ctx, platform_config); + std::shared_ptr member = SOMAExperiment::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -120,9 +126,11 @@ std::shared_ptr SOMACollection::add_new_measurement( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - std::shared_ptr member = SOMAMeasurement::create( - uri, schema, ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns) { + SOMAMeasurement::create(uri, schema, index_columns, ctx); + std::shared_ptr member = SOMAMeasurement::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -133,9 +141,12 @@ std::shared_ptr SOMACollection::add_new_dataframe( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - std::shared_ptr member = SOMADataFrame::create( - uri, schema, ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMADataFrame::create(uri, schema, index_columns, ctx, platform_config); + std::shared_ptr member = SOMADataFrame::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -147,8 +158,9 @@ std::shared_ptr SOMACollection::add_new_dense_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - std::shared_ptr member = SOMADenseNDArray::create( - uri, schema, ctx); + SOMADenseNDArray::create(uri, schema, ctx); + std::shared_ptr member = SOMADenseNDArray::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -160,8 +172,9 @@ std::shared_ptr SOMACollection::add_new_sparse_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - std::shared_ptr member = SOMASparseNDArray::create( - uri, schema, ctx); + SOMASparseNDArray::create(uri, schema, ctx); + std::shared_ptr member = SOMASparseNDArray::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index e869a7d68a..f022752ef5 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -61,8 +61,10 @@ class SOMACollection : public SOMAGroup { * @param ctx TileDB context * @param uri URI to create the SOMACollection */ - static std::unique_ptr create( - std::string_view uri, std::shared_ptr ctx); + static void create( + std::string_view uri, + std::shared_ptr ctx, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and return SOMACollection @@ -155,7 +157,9 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMAMeasurement to the SOMACollection. @@ -170,7 +174,8 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::shared_ptr schema, + ColumnIndexInfo index_columns); /** * Create and add a SOMADataFrame to the SOMACollection. @@ -185,7 +190,9 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMADenseNDArray to the SOMACollection. diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 3fdab76d96..6058b81968 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -39,12 +39,16 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADataFrame::create( +void SOMADataFrame::create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMADataFrame"); - return SOMADataFrame::open(uri, OpenMode::read, ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config, + std::optional> timestamp) { + auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), schema, index_columns, platform_config); + SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); } std::unique_ptr SOMADataFrame::open( @@ -72,7 +76,7 @@ bool SOMADataFrame::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMADataFrame::schema() const { +std::shared_ptr SOMADataFrame::schema() const { return this->arrow_schema(); } diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 1ed21f0b02..6f163d9f2c 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -55,12 +55,14 @@ class SOMADataFrame : public SOMAArray { * @param uri URI to create the SOMADataFrame * @param schema TileDB ArraySchema * @param platform_config Optional config parameter dictionary - * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config = std::nullopt, + std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMADataFrame object at the given URI. @@ -140,9 +142,9 @@ class SOMADataFrame : public SOMAArray { /** * Return the data schema, in the form of a ArrowSchema. * - * @return std::unique_ptr + * @return std::shared_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; /** * Return the index (dimension) column names. diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 6df2e84a51..1bcc1c4efd 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -38,12 +38,12 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADenseNDArray::create( +void SOMADenseNDArray::create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray"); - return SOMADenseNDArray::open(uri, OpenMode::read, ctx); + std::shared_ptr ctx, + std::optional> timestamp) { + SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray", timestamp); } std::unique_ptr SOMADenseNDArray::open( @@ -71,7 +71,7 @@ bool SOMADenseNDArray::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMADenseNDArray::schema() const { +std::shared_ptr SOMADenseNDArray::schema() const { return this->arrow_schema(); } diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index 7a8d41dc87..a55ba630d3 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -57,10 +57,11 @@ class SOMADenseNDArray : public SOMAArray { * @param platform_config Optional config parameter dictionary * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMADenseNDArray object at the given URI. @@ -129,6 +130,11 @@ class SOMADenseNDArray : public SOMAArray { : SOMAArray(other) { } + SOMADenseNDArray() = delete; + SOMADenseNDArray(const SOMADenseNDArray&) = default; + SOMADenseNDArray(SOMADenseNDArray&&) = delete; + ~SOMADenseNDArray() = default; + using SOMAArray::open; /** @@ -145,7 +151,7 @@ class SOMADenseNDArray : public SOMAArray { * * @return std::unique_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index bfdfb417d2..2c42aea68a 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -41,21 +41,38 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAExperiment::create( +void SOMAExperiment::create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx) { + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config, + std::optional timestamp) { std::string exp_uri(uri); - SOMAGroup::create(ctx, exp_uri, "SOMAExperiment"); - SOMADataFrame::create(exp_uri + "/obs", schema, ctx); - SOMACollection::create(exp_uri + "/ms", ctx); + SOMAGroup::create(ctx, exp_uri, "SOMAExperiment", timestamp); + SOMADataFrame::create( + exp_uri + "/obs", + schema, + index_columns, + ctx, + platform_config, + timestamp); + SOMACollection::create(exp_uri + "/ms", ctx, timestamp); - auto group = SOMAGroup::open(OpenMode::write, exp_uri, ctx); + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open( + OpenMode::write, exp_uri, ctx, name, timestamp); group->set(exp_uri + "/obs", URIType::absolute, "obs"); group->set(exp_uri + "/ms", URIType::absolute, "ms"); group->close(); +} - return std::make_unique(OpenMode::read, exp_uri, ctx); +std::unique_ptr SOMAExperiment::open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp) { + return std::make_unique(mode, uri, ctx, timestamp); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index ff991a6a07..14b918a269 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -54,10 +54,29 @@ class SOMAExperiment : public SOMACollection { * @param schema TileDB ArraySchema * @param platform_config Optional config parameter dictionary */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config = std::nullopt, + std::optional timestamp = std::nullopt); + + /** + * @brief Open a group at the specified URI and return SOMAExperiment + * object. + * + * @param uri URI of the array + * @param mode read or write + * @param ctx TileDB context + * @param timestamp Optional pair indicating timestamp start and end + * @return std::shared_ptr SOMAExperiment + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); //=================================================================== //= public non-static @@ -80,6 +99,8 @@ class SOMAExperiment : public SOMACollection { SOMAExperiment(SOMAExperiment&&) = default; ~SOMAExperiment() = default; + using SOMACollection::open; + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/soma/soma_group.cc b/libtiledbsoma/src/soma/soma_group.cc index 4d354d6543..1db839be42 100644 --- a/libtiledbsoma/src/soma/soma_group.cc +++ b/libtiledbsoma/src/soma/soma_group.cc @@ -44,14 +44,27 @@ using namespace tiledb; void SOMAGroup::create( std::shared_ptr ctx, std::string_view uri, - std::string soma_type) { + std::string soma_type, + std::optional timestamp) { Group::create(*ctx->tiledb_ctx(), std::string(uri)); - auto group = Group(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + auto group = Group( + *ctx->tiledb_ctx(), + std::string(uri), + TILEDB_WRITE, + _set_timestamp(ctx, timestamp)); + group.put_metadata( - "soma_object_type", + SOMA_OBJECT_TYPE_KEY, TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); + + group.put_metadata( + ENCODING_VERSION_KEY, + TILEDB_STRING_UTF8, + static_cast(ENCODING_VERSION_VAL.length()), + ENCODING_VERSION_VAL.c_str()); + group.close(); } @@ -60,7 +73,7 @@ std::unique_ptr SOMAGroup::open( std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp) { + std::optional timestamp) { return std::make_unique(mode, uri, ctx, name, timestamp); } @@ -73,74 +86,57 @@ SOMAGroup::SOMAGroup( std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp) + std::optional timestamp) : ctx_(ctx) , uri_(util::rstrip_uri(uri)) , name_(name) { - auto cfg = ctx_->tiledb_ctx()->config(); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - cfg["sm.group.timestamp_start"] = timestamp->first; - cfg["sm.group.timestamp_end"] = timestamp->second; - } group_ = std::make_unique( *ctx_->tiledb_ctx(), std::string(uri), mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE, - cfg); - + _set_timestamp(ctx, timestamp)); fill_caches(); } void SOMAGroup::fill_caches() { - std::shared_ptr grp; if (group_->query_type() == TILEDB_WRITE) { - grp = std::make_shared(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); + cache_group_ = std::make_shared( + *ctx_->tiledb_ctx(), uri_, TILEDB_READ); } else { - grp = group_; + cache_group_ = group_; } - for (uint64_t idx = 0; idx < grp->metadata_num(); ++idx) { + for (uint64_t idx = 0; idx < cache_group_->metadata_num(); ++idx) { std::string key; tiledb_datatype_t value_type; uint32_t value_num; const void* value; - grp->get_metadata_from_index( + cache_group_->get_metadata_from_index( idx, &key, &value_type, &value_num, &value); MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); } - for (uint64_t i = 0; i < grp->member_count(); ++i) { - auto mem = grp->member(i); + for (uint64_t i = 0; i < cache_group_->member_count(); ++i) { + auto mem = cache_group_->member(i); member_to_uri_[mem.name().value()] = mem.uri(); } - - if (group_->query_type() == TILEDB_WRITE) { - grp->close(); - } } void SOMAGroup::open( - OpenMode query_type, - std::optional> timestamp) { - auto cfg = ctx_->tiledb_ctx()->config(); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - cfg["sm.group.timestamp_start"] = timestamp->first; - cfg["sm.group.timestamp_end"] = timestamp->second; - } - group_->set_config(cfg); + OpenMode query_type, std::optional timestamp) { + timestamp_ = timestamp; + group_->set_config(_set_timestamp(ctx_, timestamp)); group_->open(query_type == OpenMode::read ? TILEDB_READ : TILEDB_WRITE); + fill_caches(); } void SOMAGroup::close() { + if (group_->query_type() == TILEDB_WRITE) + cache_group_->close(); group_->close(); + metadata_.clear(); } const std::string SOMAGroup::uri() const { @@ -195,36 +191,41 @@ void SOMAGroup::set_metadata( tiledb_datatype_t value_type, uint32_t value_num, const void* value) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be modified."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); group_->put_metadata(key, value_type, value_num, value); + MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); } void SOMAGroup::delete_metadata(const std::string& key) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be deleted."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); group_->delete_metadata(key); metadata_.erase(key); } -std::map SOMAGroup::get_metadata() { - return metadata_; -} - std::optional SOMAGroup::get_metadata(const std::string& key) { - if (metadata_.count(key) == 0) { + if (metadata_.count(key) == 0) return std::nullopt; - } + return metadata_[key]; } +std::map SOMAGroup::get_metadata() { + return metadata_; +} + bool SOMAGroup::has_metadata(const std::string& key) { return metadata_.count(key) != 0; } @@ -233,4 +234,17 @@ uint64_t SOMAGroup::metadata_num() const { return metadata_.size(); } +Config SOMAGroup::_set_timestamp( + std::shared_ptr ctx, std::optional timestamp) { + auto cfg = ctx->tiledb_ctx()->config(); + if (timestamp) { + if (timestamp->first > timestamp->second) { + throw std::invalid_argument("timestamp start > end"); + } + cfg["sm.group.timestamp_start"] = timestamp->first; + cfg["sm.group.timestamp_end"] = timestamp->second; + } + return cfg; +} + } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_group.h b/libtiledbsoma/src/soma/soma_group.h index 27c3c5010f..d90225ae70 100644 --- a/libtiledbsoma/src/soma/soma_group.h +++ b/libtiledbsoma/src/soma/soma_group.h @@ -61,7 +61,8 @@ class SOMAGroup : public SOMAObject { static void create( std::shared_ptr ctx, std::string_view uri, - std::string soma_type); + std::string soma_type, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and return SOMAGroup @@ -79,7 +80,7 @@ class SOMAGroup : public SOMAObject { std::string_view uri, std::shared_ptr ctx, std::string_view name = "unnamed", - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); //=================================================================== //= public non-static @@ -99,7 +100,7 @@ class SOMAGroup : public SOMAObject { std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); SOMAGroup() = delete; SOMAGroup(const SOMAGroup&) = default; @@ -113,8 +114,7 @@ class SOMAGroup : public SOMAObject { * @param timestamp Optional pair indicating timestamp start and end */ void open( - OpenMode mode, - std::optional> timestamp = std::nullopt); + OpenMode mode, std::optional timestamp = std::nullopt); /** * Close the SOMAGroup object. @@ -251,9 +251,16 @@ class SOMAGroup : public SOMAObject { * @return MetadataValue (std::tuple) */ - std::map get_metadata(); std::optional get_metadata(const std::string& key); + /** + * Get a mapping of all metadata keys with its associated value datatype, + * number of values, and value in binary form. + * + * @return std::map + */ + std::map get_metadata(); + /** * Check if the key exists in metadata from an open group. The group must * be opened in READ mode, otherwise the function will error out. @@ -275,6 +282,14 @@ class SOMAGroup : public SOMAObject { //= private non-static //=================================================================== + /** + * Helper function to set the pass in timestamp in the config associated + * with the SOMAContext passed in + */ + static Config _set_timestamp( + std::shared_ptr ctx, + std::optional timestamp); + /** * Fills the metadata and member-to-uri caches upon opening the array. */ @@ -288,13 +303,21 @@ class SOMAGroup : public SOMAObject { // Name displayed in log messages std::string name_; - + // // TileDBGroup associated with the SOMAGroup std::shared_ptr group_; // Metadata cache std::map metadata_; + // Read timestamp range (start, end) + std::optional timestamp_; + + // Group associated with metadata_. Metadata values need to be accessible in + // write mode as well. We need to keep this read-mode array alive in order + // for the metadata value pointers in the cache to be accessible + std::shared_ptr cache_group_; + // Member-to-URI cache std::map member_to_uri_; }; diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 80c44f11ae..d6605e7383 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -41,21 +41,31 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAMeasurement::create( +void SOMAMeasurement::create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx) { + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config, + std::optional timestamp) { std::string exp_uri(uri); - SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement"); - SOMADataFrame::create(exp_uri + "/var", schema, ctx); - SOMACollection::create(exp_uri + "/X", ctx); - SOMACollection::create(exp_uri + "/obsm", ctx); - SOMACollection::create(exp_uri + "/obsp", ctx); - SOMACollection::create(exp_uri + "/varm", ctx); - SOMACollection::create(exp_uri + "/varp", ctx); + SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement", timestamp); + SOMADataFrame::create( + exp_uri + "/var", + schema, + index_columns, + ctx, + platform_config, + timestamp); + SOMACollection::create(exp_uri + "/X", ctx, timestamp); + SOMACollection::create(exp_uri + "/obsm", ctx, timestamp); + SOMACollection::create(exp_uri + "/obsp", ctx, timestamp); + SOMACollection::create(exp_uri + "/varm", ctx, timestamp); + SOMACollection::create(exp_uri + "/varp", ctx, timestamp); - auto group = SOMAGroup::open(OpenMode::write, uri, ctx); + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open(OpenMode::write, uri, ctx, name, timestamp); group->set(exp_uri + "/var", URIType::absolute, "var"); group->set(exp_uri + "/X", URIType::absolute, "X"); group->set(exp_uri + "/obsm", URIType::absolute, "obsm"); @@ -63,7 +73,13 @@ std::unique_ptr SOMAMeasurement::create( group->set(exp_uri + "/varm", URIType::absolute, "varm"); group->set(exp_uri + "/varp", URIType::absolute, "varp"); group->close(); +} - return std::make_unique(OpenMode::read, uri, ctx); +std::unique_ptr SOMAMeasurement::open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp) { + return std::make_unique(mode, uri, ctx, timestamp); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index cfaf950549..591c057751 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -55,10 +55,29 @@ class SOMAMeasurement : public SOMACollection { * @param schema TileDB ArraySchema * @param ctx TileDB context */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr schema, + ColumnIndexInfo index_columns, + std::shared_ptr ctx, + std::optional platform_config = std::nullopt, + std::optional timestamp = std::nullopt); + + /** + * @brief Open a group at the specified URI and return SOMAMeasurement + * object. + * + * @param uri URI of the array + * @param mode read or write + * @param ctx TileDB context + * @param timestamp Optional pair indicating timestamp start and end + * @return std::shared_ptr SOMAMeasurement + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); //=================================================================== //= public non-static @@ -80,6 +99,8 @@ class SOMAMeasurement : public SOMACollection { SOMAMeasurement(SOMAMeasurement&&) = default; ~SOMAMeasurement() = default; + using SOMACollection::open; + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index c7fa2defa9..36614a6ecb 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -20,10 +20,11 @@ std::unique_ptr SOMAObject::open( std::shared_ptr ctx, std::optional> timestamp) { auto obj = tiledb::Object::object(*ctx->tiledb_ctx(), std::string(uri)); + auto name = std::string(std::filesystem::path(uri).filename()); if (obj.type() == tiledb::Object::Type::Array) { auto array_ = SOMAArray::open( - mode, uri, ctx, "", {}, "auto", ResultOrder::automatic, timestamp); + mode, uri, ctx, "", {}, name, ResultOrder::automatic, timestamp); if (!array_->type().has_value()) throw TileDBSOMAError("SOMAArray has no type info"); @@ -38,7 +39,7 @@ std::unique_ptr SOMAObject::open( throw TileDBSOMAError("Saw invalid SOMAArray type"); } } else if (obj.type() == tiledb::Object::Type::Group) { - auto group_ = SOMAGroup::open(mode, uri, ctx, "", timestamp); + auto group_ = SOMAGroup::open(mode, uri, ctx, name, timestamp); if (!group_->type().has_value()) throw TileDBSOMAError("SOMAGroup has no type info"); diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index 1259c528e8..f7b0cd4ecd 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -39,12 +39,12 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMASparseNDArray::create( +void SOMASparseNDArray::create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray"); - return SOMASparseNDArray::open(uri, OpenMode::read, ctx); + std::shared_ptr ctx, + std::optional> timestamp) { + SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray", timestamp); } std::unique_ptr SOMASparseNDArray::open( @@ -72,7 +72,7 @@ bool SOMASparseNDArray::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMASparseNDArray::schema() const { +std::shared_ptr SOMASparseNDArray::schema() const { return this->arrow_schema(); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index 1842a88cb2..4bd9cddee7 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -57,10 +57,11 @@ class SOMASparseNDArray : public SOMAArray { * @param platform_config Optional config parameter dictionary * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMASparseNDArray object at the given URI. @@ -129,6 +130,11 @@ class SOMASparseNDArray : public SOMAArray { : SOMAArray(other) { } + SOMASparseNDArray() = delete; + SOMASparseNDArray(const SOMASparseNDArray&) = default; + SOMASparseNDArray(SOMASparseNDArray&&) = delete; + ~SOMASparseNDArray() = default; + using SOMAArray::open; /** @@ -143,9 +149,9 @@ class SOMASparseNDArray : public SOMAArray { /** * Return the data schema, in the form of an ArrowSchema. * - * @return std::unique_ptr + * @return std::shared_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4739bd0ed2..bde9dce3fe 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -101,13 +101,13 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { array->release = nullptr; } -std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( +std::shared_ptr ArrowAdapter::arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array) { auto tiledb_schema = tiledb_array->schema(); auto ndim = tiledb_schema.domain().ndim(); auto nattr = tiledb_schema.attribute_num(); - std::unique_ptr arrow_schema = std::make_unique(); + std::shared_ptr arrow_schema = std::make_shared(); arrow_schema->format = "+s"; arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; @@ -186,7 +186,8 @@ std::pair ArrowAdapter::_get_data_and_length( } case TILEDB_INT8: { auto data = enmr.as_vector(); - return std::pair(_fill_data_buffer(data, dst), data.size()); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); } case TILEDB_UINT8: { auto data = enmr.as_vector(); @@ -240,10 +241,140 @@ std::pair ArrowAdapter::_get_data_and_length( } } -std::pair, std::unique_ptr> -ArrowAdapter::to_arrow(std::shared_ptr column) { - std::unique_ptr schema = std::make_unique(); - std::unique_ptr array = std::make_unique(); +ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( + std::shared_ptr ctx, + std::shared_ptr arrow_schema, + ColumnIndexInfo index_column_info, + std::optional platform_config) { + auto [index_column_names, domains, extents] = index_column_info; + + ArraySchema schema(*ctx, TILEDB_SPARSE); + Domain domain(*ctx); + + if (platform_config) { + std::map convert_filter = { + {"GzipFilter", TILEDB_FILTER_GZIP}, + {"ZstdFilter", TILEDB_FILTER_ZSTD}, + {"LZ4Filter", TILEDB_FILTER_LZ4}, + {"Bzip2Filter", TILEDB_FILTER_BZIP2}, + {"RleFilter", TILEDB_FILTER_RLE}, + {"DeltaFilter", TILEDB_FILTER_DELTA}, + {"DoubleDeltaFilter", TILEDB_FILTER_DOUBLE_DELTA}, + {"BitWidthReductionFilter", TILEDB_FILTER_BIT_WIDTH_REDUCTION}, + {"BitShuffleFilter", TILEDB_FILTER_BITSHUFFLE}, + {"ByteShuffleFilter", TILEDB_FILTER_BYTESHUFFLE}, + {"PositiveDeltaFilter", TILEDB_FILTER_POSITIVE_DELTA}, + {"ChecksumMD5Filter", TILEDB_FILTER_CHECKSUM_MD5}, + {"ChecksumSHA256Filter", TILEDB_FILTER_CHECKSUM_SHA256}, + {"DictionaryFilter", TILEDB_FILTER_DICTIONARY}, + {"FloatScaleFilter", TILEDB_FILTER_SCALE_FLOAT}, + {"XORFilter", TILEDB_FILTER_XOR}, + {"WebpFilter", TILEDB_FILTER_WEBP}, + {"NoOpFilter", TILEDB_FILTER_NONE}, + }; + + schema.set_capacity(platform_config->capacity); + + if (platform_config->offsets_filters.size() != 0) { + FilterList offset_filter_list(*ctx); + for (auto offset : platform_config->offsets_filters) { + offset_filter_list.add_filter( + Filter(*ctx, convert_filter[offset])); + } + schema.set_offsets_filter_list(offset_filter_list); + } + + if (platform_config->validity_filters.size() != 0) { + FilterList validity_filter_list(*ctx); + for (auto validity : platform_config->validity_filters) { + validity_filter_list.add_filter( + Filter(*ctx, convert_filter[validity])); + } + schema.set_validity_filter_list(validity_filter_list); + } + + schema.set_allows_dups(platform_config->allows_duplicates); + + if (platform_config->tile_order) + schema.set_tile_order( + platform_config->tile_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + + if (platform_config->cell_order) + schema.set_cell_order( + platform_config->cell_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + } + + std::map dims; + + for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { + auto child = arrow_schema->children[sch_idx]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + + auto idx_col_begin = index_column_names.begin(); + auto idx_col_end = index_column_names.end(); + auto idx_col_it = std::find(idx_col_begin, idx_col_end, child->name); + + if (idx_col_it != idx_col_end) { + auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); + if (ArrowAdapter::_isvar(child->format)) { + type = TILEDB_STRING_ASCII; + } + + auto dim = Dimension::create( + *ctx, + child->name, + type, + type == TILEDB_STRING_ASCII ? + nullptr : + domains->children[idx_col_idx]->buffers[1], + type == TILEDB_STRING_ASCII ? + nullptr : + extents->children[idx_col_idx]->buffers[1]); + + dims.insert({dim.name(), dim}); + } else { + Attribute attr(*ctx, child->name, type); + + if (child->flags & ARROW_FLAG_NULLABLE) { + attr.set_nullable(true); + } + + if (ArrowAdapter::_isvar(child->format)) { + attr.set_cell_val_num(TILEDB_VAR_NUM); + } + + if (child->dictionary != nullptr) { + auto enmr_format = child->dictionary->format; + auto enmr_type = ArrowAdapter::to_tiledb_format(enmr_format); + auto enmr = Enumeration::create_empty( + *ctx, + child->name, + enmr_type, + ArrowAdapter::_isvar(enmr_format) ? TILEDB_VAR_NUM : 1, + child->flags & ARROW_FLAG_DICTIONARY_ORDERED); + ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr); + AttributeExperimental::set_enumeration_name( + *ctx, attr, child->name); + } + + schema.add_attribute(attr); + } + } + + for (auto column_name : index_column_names) + domain.add_dimension(dims.at(column_name)); + schema.set_domain(domain); + + schema.check(); + + return schema; +} + +ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { + std::shared_ptr schema = std::make_shared(); + std::shared_ptr array = std::make_shared(); schema->format = to_arrow_format(column->type()).data(); schema->name = column->name().data(); @@ -329,7 +460,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { dict_sch->release = &release_schema; dict_sch->private_data = nullptr; - const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; + const int n_buf = ArrowAdapter::_isvar(dict_sch->format) ? 3 : 2; dict_arr->null_count = 0; dict_arr->offset = 0; dict_arr->n_buffers = n_buf; @@ -353,7 +484,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { // returns std::optional where std::nullopt indicates the // column does not contain enumerated values. if (enmr->type() == TILEDB_STRING_ASCII or - enmr->type() == TILEDB_STRING_UTF8) { + enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) { auto dict_vec = enmr->as_vector(); column->convert_enumeration(); dict_arr->buffers[1] = column->enum_offsets().data(); @@ -370,64 +501,63 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { array->dictionary = dict_arr; } - return std::pair(std::move(array), std::move(schema)); + return ArrowTable(array, schema); +} + +bool ArrowAdapter::_isvar(const char* format) { + if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) || + (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) { + return true; + } + return false; } std::string_view ArrowAdapter::to_arrow_format( - tiledb_datatype_t datatype, bool use_large) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return use_large ? "U" : "u"; // large because TileDB - // uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return use_large ? "Z" : "z"; // large because TileDB - // uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; + tiledb_datatype_t tiledb_dtype, bool use_large) { + auto u = use_large ? "U" : "u"; + auto z = use_large ? "Z" : "z"; + std::map _to_arrow_format_map = { + {TILEDB_STRING_ASCII, u}, {TILEDB_CHAR, z}, + {TILEDB_STRING_UTF8, u}, {TILEDB_BLOB, z}, + {TILEDB_INT8, "c"}, {TILEDB_UINT8, "C"}, + {TILEDB_INT16, "s"}, {TILEDB_UINT16, "S"}, + {TILEDB_INT32, "i"}, {TILEDB_UINT32, "I"}, + {TILEDB_INT64, "l"}, {TILEDB_UINT64, "L"}, + {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, + {TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"}, + {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, + {TILEDB_DATETIME_NS, "tsn:"}, + }; + + try { + return _to_arrow_format_map.at(tiledb_dtype); + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported TileDB type: {} ", + tiledb::impl::type_to_str(tiledb_dtype))); + } +} + +tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { + std::map _to_tiledb_format_map = { + {"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8}, + {"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR}, + {"c", TILEDB_INT8}, {"C", TILEDB_UINT8}, + {"s", TILEDB_INT16}, {"S", TILEDB_UINT16}, + {"i", TILEDB_INT32}, {"I", TILEDB_UINT32}, + {"l", TILEDB_INT64}, {"L", TILEDB_UINT64}, + {"f", TILEDB_FLOAT32}, {"g", TILEDB_FLOAT64}, + {"b", TILEDB_BOOL}, {"tss:", TILEDB_DATETIME_SEC}, + {"tsm:", TILEDB_DATETIME_MS}, {"tsu:", TILEDB_DATETIME_US}, + {"tsn:", TILEDB_DATETIME_NS}, + }; + + try { + return _to_tiledb_format_map.at(arrow_dtype); + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype)); } - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); } -} // namespace tiledbsoma \ No newline at end of file +} // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index a210aca77c..e22c75b32c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -32,6 +32,32 @@ struct ArrowBuffer { std::shared_ptr buffer_; }; +using ArrowTable = + std::pair, std::shared_ptr>; + +using ColumnIndexInfo = std::tuple< + std::vector, // name of column + std::shared_ptr, // domain + std::shared_ptr // tile extent + >; + +class PlatformConfig { + public: + uint64_t dataframe_dim_zstd_level = 3; + uint64_t sparse_nd_array_dim_zstd_level = 3; + bool write_X_chunked = true; + uint64_t goal_chunk_nnz = 100000000; + uint64_t remote_cap_nbytes = 2400000000; + uint64_t capacity = 100000; + std::vector offsets_filters = { + "DoubleDeltaFilter", "BitWidthReductionFilter", "ZstdFilter"}; + std::vector validity_filters; + bool allows_duplicates = false; + std::optional tile_order = std::nullopt; + std::optional cell_order = std::nullopt; + bool consolidate_and_vacuum = false; +}; + class ArrowAdapter { public: static void release_schema(struct ArrowSchema* schema); @@ -40,15 +66,30 @@ class ArrowAdapter { /** * @brief Convert ColumnBuffer to an Arrow array. * - * @return std::pair, - * std::unique_ptr> + * @return std::pair, + * std::shared_ptr> */ - static std::pair, std::unique_ptr> - to_arrow(std::shared_ptr column); + static ArrowTable to_arrow(std::shared_ptr column); - static std::unique_ptr arrow_schema_from_tiledb_array( + /** + * @brief Create an ArrowSchema from TileDB Array + * + * @return std::shared_ptr + */ + static std::shared_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); + /** + * @brief Create a TileDB ArraySchema from ArrowSchema + * + * @return tiledb::ArraySchema + */ + static ArraySchema tiledb_schema_from_arrow_schema( + std::shared_ptr ctx, + std::shared_ptr arrow_schema, + ColumnIndexInfo index_column_info, + std::optional platform_config); + /** * @brief Get Arrow format string from TileDB datatype. * @@ -58,6 +99,14 @@ class ArrowAdapter { static std::string_view to_arrow_format( tiledb_datatype_t datatype, bool use_large = true); + /** + * @brief Get TileDB datatype from Arrow format string. + * + * @param datatype TileDB datatype. + * @return std::string_view Arrow format string. + */ + static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype); + private: static std::pair _get_data_and_length( Enumeration& enmr, const void* dst); @@ -69,6 +118,11 @@ class ArrowAdapter { std::memcpy((void*)dst, src.data(), sz); return dst; } + + static std::optional> _get_dim_info( + std::string_view dim_name, ArrowTable index_columns); + + static bool _isvar(const char* format); }; }; // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index 3928b04675..87b41d5cd0 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -39,9 +39,15 @@ namespace tiledbsoma { +const std::string SOMA_OBJECT_TYPE_KEY = "soma_object_type"; +const std::string ENCODING_VERSION_KEY = "soma_encoding_version"; +const std::string ENCODING_VERSION_VAL = "1"; + using MetadataValue = std::tuple; enum MetadataInfo { dtype = 0, num, value }; +using TimestampRange = std::pair; + class TileDBSOMAError : public std::runtime_error { public: explicit TileDBSOMAError(const char* m) diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index b58a8952bc..4efbc890b1 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -27,6 +27,8 @@ find_package(Catch_EP REQUIRED) add_executable(unit_soma $ + common.cc + common.h unit_column_buffer.cc unit_managed_query.cc unit_soma_array.cc diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc new file mode 100644 index 0000000000..f497908dd6 --- /dev/null +++ b/libtiledbsoma/test/common.cc @@ -0,0 +1,133 @@ +/** + * @file common.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. + */ + +#include "common.h" + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates) { + // Create schema + ArraySchema schema(ctx, TILEDB_SPARSE); + + auto dim = Dimension::create(ctx, "d0", {0, 1000}); + + Domain domain(ctx); + domain.add_dimension(dim); + schema.set_domain(domain); + + auto attr = Attribute::create(ctx, "a0"); + schema.add_attribute(attr); + schema.set_allows_dups(allow_duplicates); + schema.check(); + + return schema; +} + +std::pair, ColumnIndexInfo> create_arrow_schema() { + // Create ArrowSchema + auto arrow_schema = std::make_shared(); + arrow_schema->format = "+s"; + arrow_schema->n_children = 2; + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->children = new ArrowSchema*[arrow_schema->n_children]; + + ArrowSchema* dim = nullptr; + dim = arrow_schema->children[0] = new ArrowSchema; + dim->format = "l"; + dim->name = "d0"; + dim->n_children = 0; + dim->dictionary = nullptr; + dim->release = &ArrowAdapter::release_schema; + + ArrowSchema* attr = nullptr; + attr = arrow_schema->children[1] = new ArrowSchema; + attr->format = "l"; + attr->name = "a0"; + attr->n_children = 0; + attr->dictionary = nullptr; + attr->release = &ArrowAdapter::release_schema; + + // Create array for index columns + std::vector index_column_names = {"d0"}; + + auto domains = std::make_shared(); + domains->length = 0; + domains->null_count = 0; + domains->offset = 0; + domains->n_buffers = 0; + domains->buffers = nullptr; + domains->n_children = 2; + domains->release = &ArrowAdapter::release_array; + domains->children = new ArrowArray*[1]; + + auto d0_domain = domains->children[0] = new ArrowArray; + d0_domain->length = 2; + d0_domain->null_count = 0; + d0_domain->offset = 0; + d0_domain->n_buffers = 2; + d0_domain->release = &ArrowAdapter::release_array; + d0_domain->buffers = new const void*[2]; + d0_domain->buffers[0] = nullptr; + d0_domain->buffers[1] = malloc(sizeof(int64_t) * 2); + d0_domain->n_children = 0; + int64_t dom[] = {0, 1000}; + std::memcpy((void*)d0_domain->buffers[1], &dom, sizeof(int64_t) * 2); + + auto tiles = std::make_shared(); + tiles->length = 0; + tiles->null_count = 0; + tiles->offset = 0; + tiles->n_buffers = 0; + tiles->buffers = nullptr; + tiles->n_children = 2; + tiles->release = &ArrowAdapter::release_array; + tiles->children = new ArrowArray*[1]; + + ArrowArray* d0_tile = tiles->children[0] = new ArrowArray; + d0_tile->length = 1; + d0_tile->null_count = 0; + d0_tile->offset = 0; + d0_tile->n_buffers = 2; + d0_tile->release = &ArrowAdapter::release_array; + d0_tile->buffers = new const void*[2]; + d0_tile->buffers[0] = nullptr; + d0_tile->buffers[1] = malloc(sizeof(int64_t)); + d0_tile->n_children = 0; + int64_t tile = 1; + std::memcpy((void*)d0_tile->buffers[1], &tile, sizeof(int64_t)); + + ColumnIndexInfo index_columns_info = std::tuple( + index_column_names, domains, tiles); + + return std::pair(arrow_schema, index_columns_info); +} +} // namespace helper \ No newline at end of file diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h new file mode 100644 index 0000000000..16ce7e4bbd --- /dev/null +++ b/libtiledbsoma/test/common.h @@ -0,0 +1,66 @@ +/** + * @file common.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. + */ + +#ifndef UNIT_TEST_COMMON_H +#define UNIT_TEST_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "utils/util.h" + +using namespace tiledb; +using namespace tiledbsoma; +using namespace Catch::Matchers; + +#ifndef TILEDBSOMA_SOURCE_ROOT +#define TILEDBSOMA_SOURCE_ROOT "not_defined" +#endif + +static const std::string src_path = TILEDBSOMA_SOURCE_ROOT; + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates = false); +std::pair, ColumnIndexInfo> create_arrow_schema(); +} // namespace helper +#endif \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index aae178f0e5..baaa0492d5 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -86,7 +86,7 @@ std::tuple create_array( schema.check(); // Create array - SOMAArray::create(ctx, uri, schema, "NONE"); + SOMAArray::create(ctx, uri, schema, "NONE", TimestampRange(0, 2)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -125,7 +125,7 @@ std::tuple, std::vector> write_array( {}, "auto", ResultOrder::automatic, - std::pair(timestamp + i, timestamp + i)); + TimestampRange(timestamp + i, timestamp + i)); std::vector d0(num_cells_per_fragment); for (int j = 0; j < num_cells_per_fragment; j++) { @@ -138,14 +138,10 @@ std::tuple, std::vector> write_array( } std::vector a0(num_cells_per_fragment, frag_num); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); - // Write data to array - soma_array->write(array_buffer); + soma_array->set_column_data("a0", a0.size(), a0.data()); + soma_array->set_column_data("d0", d0.size(), d0.data()); + soma_array->write(); soma_array->close(); } @@ -218,8 +214,7 @@ TEST_CASE("SOMAArray: nnz") { {}, "auto", ResultOrder::automatic, - std::pair( - timestamp, timestamp + num_fragments - 1)); + TimestampRange(timestamp, timestamp + num_fragments - 1)); uint64_t nnz = soma_array->nnz(); REQUIRE(nnz == expected_nnz); @@ -283,7 +278,7 @@ TEST_CASE("SOMAArray: nnz with timestamp") { uri, ctx, num_cells_per_fragment, num_fragments, overlap, 40); // Get total cell num at timestamp (0, 20) - std::pair timestamp{0, 20}; + TimestampRange timestamp{0, 20}; auto soma_array = SOMAArray::open( OpenMode::read, uri, @@ -364,7 +359,6 @@ TEST_CASE("SOMAArray: nnz with consolidation") { TEST_CASE("SOMAArray: metadata") { auto ctx = std::make_shared(); - std::string base_uri = "mem://unit-test-array"; const auto& [uri, expected_nnz] = create_array(base_uri, ctx); @@ -376,35 +370,51 @@ TEST_CASE("SOMAArray: metadata") { {}, "auto", ResultOrder::automatic, - std::pair(1, 1)); + TimestampRange(1, 1)); + int32_t val = 100; soma_array->set_metadata("md", TILEDB_INT32, 1, &val); soma_array->close(); - soma_array->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_array->metadata_num() == 2); - REQUIRE(soma_array->has_metadata("soma_object_type") == true); - REQUIRE(soma_array->has_metadata("md") == true); - + // Read metadata + soma_array->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_array->metadata_num() == 3); + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(soma_array->has_metadata("md")); auto mdval = soma_array->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_array->close(); - soma_array->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_array->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_array->metadata_num() == 2); + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(!soma_array->has_metadata("md")); + soma_array->close(); + // Metadata should also be retrievable in write mode + soma_array->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_array->metadata_num() == 3); + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(soma_array->has_metadata("md")); mdval = soma_array->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_array->delete_metadata("md"); mdval = soma_array->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_array->close(); - soma_array->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_array->has_metadata("md") == false); - REQUIRE(soma_array->metadata_num() == 1); - soma_array->close(); + // Confirm delete in read mode + soma_array->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_array->has_metadata("md")); + REQUIRE(soma_array->metadata_num() == 2); } TEST_CASE("SOMAArray: Test buffer size") { diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index e873c86c9c..914e263c70 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,58 +30,14 @@ * This file manages unit tests for the SOMACollection class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema( - Context& ctx, bool sparse = false, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, sparse ? TILEDB_SPARSE : TILEDB_DENSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMACollection: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection-basic"; - auto soma_collection = SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx); + auto soma_collection = SOMACollection::open(uri, OpenMode::read, ctx); REQUIRE(soma_collection->uri() == uri); REQUIRE(soma_collection->ctx() == ctx); REQUIRE(soma_collection->type() == "SOMACollection"); @@ -94,7 +50,8 @@ TEST_CASE("SOMACollection: add SOMASparseNDArray") { std::string sub_uri = "mem://unit-test-add-sparse-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [arrow_schema, index_columns] = helper::create_arrow_schema(); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), true); std::map expected_map{ {"sparse_ndarray", sub_uri}}; @@ -123,7 +80,7 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") { std::string sub_uri = "mem://unit-test-add-dense-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"dense_ndarray", sub_uri}}; @@ -150,13 +107,13 @@ TEST_CASE("SOMACollection: add SOMADataFrame") { std::string sub_uri = "mem://unit-test-add-dataframe/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"dataframe", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_dataframe = soma_collection->add_new_dataframe( - "dataframe", sub_uri, URIType::absolute, ctx, schema); + "dataframe", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_dataframe->uri() == sub_uri); REQUIRE(soma_dataframe->ctx() == ctx); @@ -178,7 +135,7 @@ TEST_CASE("SOMACollection: add SOMACollection") { std::string sub_uri = "mem://unit-test-add-collection/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"subcollection", sub_uri}}; @@ -202,13 +159,13 @@ TEST_CASE("SOMACollection: add SOMAExperiment") { std::string sub_uri = "mem://unit-test-add-experiment/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"experiment", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_experiment = soma_collection->add_new_experiment( - "experiment", sub_uri, URIType::absolute, ctx, schema); + "experiment", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_experiment->uri() == sub_uri); REQUIRE(soma_experiment->ctx() == ctx); @@ -227,13 +184,13 @@ TEST_CASE("SOMACollection: add SOMAMeasurement") { std::string sub_uri = "mem://unit-test-add-measurement/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"measurement", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_measurement = soma_collection->add_new_measurement( - "measurement", sub_uri, URIType::absolute, ctx, schema); + "measurement", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_measurement->uri() == sub_uri); REQUIRE(soma_measurement->ctx() == ctx); @@ -250,110 +207,161 @@ TEST_CASE("SOMACollection: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; - SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx, TimestampRange(0, 2)); auto soma_collection = SOMACollection::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_collection->set_metadata("md", TILEDB_INT32, 1, &val); soma_collection->close(); - soma_collection->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_collection->metadata_num() == 2); - REQUIRE(soma_collection->has_metadata("soma_object_type") == true); - REQUIRE(soma_collection->has_metadata("md") == true); - + // Read metadata + soma_collection->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_collection->metadata_num() == 3); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(soma_collection->has_metadata("md")); auto mdval = soma_collection->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_collection->close(); - soma_collection->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_collection->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_collection->metadata_num() == 2); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(!soma_collection->has_metadata("md")); + soma_collection->close(); + // Metadata should also be retrievable in write mode + soma_collection->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_collection->metadata_num() == 3); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(soma_collection->has_metadata("md")); mdval = soma_collection->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_collection->delete_metadata("md"); mdval = soma_collection->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_collection->close(); - soma_collection->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_collection->has_metadata("md") == false); - REQUIRE(soma_collection->metadata_num() == 1); - soma_collection->close(); + // Confirm delete in read mode + soma_collection->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_collection->has_metadata("md")); + REQUIRE(soma_collection->metadata_num() == 2); } TEST_CASE("SOMAExperiment: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-experiment"; - SOMAExperiment::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMAExperiment::create( + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); auto soma_experiment = SOMAExperiment::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_experiment->set_metadata("md", TILEDB_INT32, 1, &val); soma_experiment->close(); - soma_experiment->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_experiment->metadata_num() == 2); - REQUIRE(soma_experiment->has_metadata("soma_object_type") == true); - REQUIRE(soma_experiment->has_metadata("md") == true); - + // Read metadata + soma_experiment->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_experiment->metadata_num() == 3); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(soma_experiment->has_metadata("md")); auto mdval = soma_experiment->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_experiment->close(); - soma_experiment->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_experiment->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_experiment->metadata_num() == 2); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(!soma_experiment->has_metadata("md")); + soma_experiment->close(); + // Metadata should also be retrievable in write mode + soma_experiment->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_experiment->metadata_num() == 3); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(soma_experiment->has_metadata("md")); mdval = soma_experiment->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_experiment->delete_metadata("md"); mdval = soma_experiment->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_experiment->close(); - soma_experiment->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_experiment->has_metadata("md") == false); - REQUIRE(soma_experiment->metadata_num() == 1); - soma_experiment->close(); + // Confirm delete in read mode + soma_experiment->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_experiment->has_metadata("md")); + REQUIRE(soma_experiment->metadata_num() == 2); } TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-measurement"; - SOMAMeasurement::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMAMeasurement::create( + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); + auto soma_measurement = SOMAMeasurement::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_measurement->set_metadata("md", TILEDB_INT32, 1, &val); soma_measurement->close(); - soma_measurement->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_measurement->metadata_num() == 2); - REQUIRE(soma_measurement->has_metadata("soma_object_type") == true); - REQUIRE(soma_measurement->has_metadata("md") == true); - + // Read metadata + soma_measurement->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_measurement->metadata_num() == 3); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(soma_measurement->has_metadata("md")); auto mdval = soma_measurement->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_measurement->close(); - soma_measurement->open( - OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_measurement->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_measurement->metadata_num() == 2); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(!soma_measurement->has_metadata("md")); + soma_measurement->close(); + // Metadata should also be retrievable in write mode + soma_measurement->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_measurement->metadata_num() == 3); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(soma_measurement->has_metadata("md")); mdval = soma_measurement->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_measurement->delete_metadata("md"); mdval = soma_measurement->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_measurement->close(); - soma_measurement->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_measurement->has_metadata("md") == false); - REQUIRE(soma_measurement->metadata_num() == 1); - soma_measurement->close(); + // Confirm delete in read mode + soma_measurement->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_measurement->has_metadata("md")); + REQUIRE(soma_measurement->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 9822caeeb8..39bceda87c 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,57 +30,14 @@ * This file manages unit tests for the SOMADataFrame class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema(Context& ctx, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, TILEDB_SPARSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMADataFrame: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-basic"; - SOMADataFrame::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMADataFrame::create(uri, schema, index_columns, ctx); auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); REQUIRE(soma_dataframe->uri() == uri); @@ -97,14 +54,10 @@ TEST_CASE("SOMADataFrame: basic") { d0[j] = j; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->write(array_buffer); + soma_dataframe->set_column_data("a0", a0.size(), a0.data()); + soma_dataframe->set_column_data("d0", d0.size(), d0.data()); + soma_dataframe->write(); soma_dataframe->close(); soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); @@ -125,42 +78,60 @@ TEST_CASE("SOMADataFrame: basic") { TEST_CASE("SOMADataFrame: metadata") { auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-collection"; - SOMADataFrame::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMADataFrame::create( + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); + auto soma_dataframe = SOMADataFrame::open( uri, OpenMode::write, ctx, {}, ResultOrder::automatic, - std::pair(1, 1)); + TimestampRange(1, 1)); + int32_t val = 100; soma_dataframe->set_metadata("md", TILEDB_INT32, 1, &val); soma_dataframe->close(); - soma_dataframe->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_dataframe->metadata_num() == 2); - REQUIRE(soma_dataframe->has_metadata("soma_object_type") == true); - REQUIRE(soma_dataframe->has_metadata("md") == true); - + // Read metadata + soma_dataframe->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_dataframe->metadata_num() == 3); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(soma_dataframe->has_metadata("md")); auto mdval = soma_dataframe->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_dataframe->close(); - soma_dataframe->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_dataframe->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_dataframe->metadata_num() == 2); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(!soma_dataframe->has_metadata("md")); + soma_dataframe->close(); + // Metadata should also be retrievable in write mode + soma_dataframe->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_dataframe->metadata_num() == 3); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(soma_dataframe->has_metadata("md")); mdval = soma_dataframe->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_dataframe->delete_metadata("md"); mdval = soma_dataframe->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_dataframe->close(); - soma_dataframe->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_dataframe->has_metadata("md") == false); - REQUIRE(soma_dataframe->metadata_num() == 1); - soma_dataframe->close(); + // Confirm delete in read mode + soma_dataframe->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_dataframe->has_metadata("md")); + REQUIRE(soma_dataframe->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index decb0c5400..97f6677034 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -97,14 +97,10 @@ TEST_CASE("SOMADenseNDArray: basic") { std::vector d0{1, 10}; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); - soma_dense->open(OpenMode::write); - soma_dense->write(array_buffer); + soma_dense->set_column_data("a0", a0.size(), a0.data()); + soma_dense->set_column_data("d0", d0.size(), d0.data()); + soma_dense->write(); soma_dense->close(); soma_dense->open(OpenMode::read); @@ -124,7 +120,8 @@ TEST_CASE("SOMADenseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray"; - SOMADenseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMADenseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_dense = SOMADenseNDArray::open( uri, OpenMode::write, @@ -132,32 +129,48 @@ TEST_CASE("SOMADenseNDArray: metadata") { {}, ResultOrder::automatic, std::pair(1, 1)); + int32_t val = 100; soma_dense->set_metadata("md", TILEDB_INT32, 1, &val); soma_dense->close(); - soma_dense->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_dense->metadata_num() == 2); - REQUIRE(soma_dense->has_metadata("soma_object_type") == true); - REQUIRE(soma_dense->has_metadata("md") == true); - + // Read metadata + soma_dense->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_dense->metadata_num() == 3); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(soma_dense->has_metadata("md")); auto mdval = soma_dense->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_dense->close(); - soma_dense->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_dense->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_dense->metadata_num() == 2); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(!soma_dense->has_metadata("md")); + soma_dense->close(); + // Metadata should also be retrievable in write mode + soma_dense->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_dense->metadata_num() == 3); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(soma_dense->has_metadata("md")); mdval = soma_dense->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_dense->delete_metadata("md"); mdval = soma_dense->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_dense->close(); - soma_dense->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_dense->has_metadata("md") == false); - REQUIRE(soma_dense->metadata_num() == 1); - soma_dense->close(); + // Confirm delete in read mode + soma_dense->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_dense->has_metadata("md")); + REQUIRE(soma_dense->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_group.cc b/libtiledbsoma/test/unit_soma_group.cc index a45430b375..c04030037b 100644 --- a/libtiledbsoma/test/unit_soma_group.cc +++ b/libtiledbsoma/test/unit_soma_group.cc @@ -156,11 +156,7 @@ TEST_CASE("SOMAGroup: basic") { "mem://sub-array", *ctx->tiledb_ctx()); auto soma_group = SOMAGroup::open( - OpenMode::write, - uri_main_group, - ctx, - "metadata", - std::pair(0, 1)); + OpenMode::write, uri_main_group, ctx, "metadata", TimestampRange(0, 1)); soma_group->set(uri_sub_group, URIType::absolute, "subgroup"); soma_group->set(uri_sub_array, URIType::absolute, "subarray"); soma_group->close(); @@ -168,7 +164,7 @@ TEST_CASE("SOMAGroup: basic") { std::map expected_map{ {"subgroup", uri_sub_group}, {"subarray", uri_sub_array}}; - soma_group->open(OpenMode::read, std::pair(0, 2)); + soma_group->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(soma_group->ctx() == ctx); REQUIRE(soma_group->uri() == uri_main_group); REQUIRE(soma_group->count() == 2); @@ -177,12 +173,12 @@ TEST_CASE("SOMAGroup: basic") { REQUIRE(soma_group->get("subarray").type() == Object::Type::Array); soma_group->close(); - soma_group->open(OpenMode::write, std::pair(0, 3)); + soma_group->open(OpenMode::write, TimestampRange(0, 3)); REQUIRE(expected_map == soma_group->member_to_uri_mapping()); soma_group->del("subgroup"); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(0, 4)); + soma_group->open(OpenMode::read, TimestampRange(0, 4)); REQUIRE(soma_group->count() == 1); REQUIRE(soma_group->has("subgroup") == false); REQUIRE(soma_group->has("subarray") == true); @@ -193,39 +189,50 @@ TEST_CASE("SOMAGroup: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-group"; - SOMAGroup::create(ctx, uri, "NONE"); + SOMAGroup::create(ctx, uri, "NONE", TimestampRange(0, 2)); auto soma_group = SOMAGroup::open( - OpenMode::write, - uri, - ctx, - "metadata", - std::pair(1, 1)); + OpenMode::write, uri, ctx, "metadata", TimestampRange(1, 1)); int32_t val = 100; soma_group->set_metadata("md", TILEDB_INT32, 1, &val); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_group->metadata_num() == 2); - REQUIRE(soma_group->has_metadata("soma_object_type") == true); - REQUIRE(soma_group->has_metadata("md") == true); - + // Read metadata + soma_group->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_group->metadata_num() == 3); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(soma_group->has_metadata("md")); auto mdval = soma_group->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_group->close(); - soma_group->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_group->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_group->metadata_num() == 2); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(!soma_group->has_metadata("md")); + soma_group->close(); + // Metadata should also be retrievable in write mode + soma_group->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_group->metadata_num() == 3); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(soma_group->has_metadata("md")); mdval = soma_group->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_group->delete_metadata("md"); mdval = soma_group->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_group->has_metadata("md") == false); - REQUIRE(soma_group->metadata_num() == 1); - soma_group->close(); + // Confirm delete in read mode + soma_group->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_group->has_metadata("md")); + REQUIRE(soma_group->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 246db3692c..b8ac0a6075 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -80,7 +80,8 @@ TEST_CASE("SOMASparseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMASparseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::read, ctx); REQUIRE(soma_sparse->uri() == uri); @@ -99,14 +100,10 @@ TEST_CASE("SOMASparseNDArray: basic") { d0[j] = j; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); - soma_sparse->open(OpenMode::write); - soma_sparse->write(array_buffer); + soma_sparse->set_column_data("a0", a0.size(), a0.data()); + soma_sparse->set_column_data("d0", d0.size(), d0.data()); + soma_sparse->write(); soma_sparse->close(); soma_sparse->open(OpenMode::read); @@ -124,7 +121,8 @@ TEST_CASE("SOMASparseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMASparseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open( uri, OpenMode::write, @@ -132,32 +130,48 @@ TEST_CASE("SOMASparseNDArray: metadata") { {}, ResultOrder::automatic, std::pair(1, 1)); + int32_t val = 100; soma_sparse->set_metadata("md", TILEDB_INT32, 1, &val); soma_sparse->close(); - soma_sparse->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_sparse->metadata_num() == 2); - REQUIRE(soma_sparse->has_metadata("soma_object_type") == true); - REQUIRE(soma_sparse->has_metadata("md") == true); - + // Read metadata + soma_sparse->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_sparse->metadata_num() == 3); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(soma_sparse->has_metadata("md")); auto mdval = soma_sparse->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_sparse->close(); - soma_sparse->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_sparse->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_sparse->metadata_num() == 2); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(!soma_sparse->has_metadata("md")); + soma_sparse->close(); + // Metadata should also be retrievable in write mode + soma_sparse->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_sparse->metadata_num() == 3); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(soma_sparse->has_metadata("md")); mdval = soma_sparse->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_sparse->delete_metadata("md"); mdval = soma_sparse->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_sparse->close(); - soma_sparse->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_sparse->has_metadata("md") == false); - REQUIRE(soma_sparse->metadata_num() == 1); - soma_sparse->close(); + // Confirm delete in read mode + soma_sparse->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_sparse->has_metadata("md")); + REQUIRE(soma_sparse->metadata_num() == 2); } \ No newline at end of file