From 0f512d48560e2c57d5eda5eb686d411245dd7e75 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 12 Feb 2024 22:29:18 -0600 Subject: [PATCH 01/70] [python] Use bindings for `DenseNDArray` readpath --- apis/python/src/tiledbsoma/_arrow_types.py | 2 +- apis/python/src/tiledbsoma/_dense_nd_array.py | 29 ++++-- apis/python/src/tiledbsoma/_tdb_handles.py | 89 ++++++++++++++----- apis/python/src/tiledbsoma/_tiledb_object.py | 3 +- apis/python/tests/test_dataframe.py | 8 ++ apis/python/tests/test_dense_nd_array.py | 8 ++ libtiledbsoma/src/soma/soma_dense_ndarray.h | 5 ++ 7 files changed, 113 insertions(+), 31 deletions(-) diff --git a/apis/python/src/tiledbsoma/_arrow_types.py b/apis/python/src/tiledbsoma/_arrow_types.py index 490a109a0c..4c73f74f02 100644 --- a/apis/python/src/tiledbsoma/_arrow_types.py +++ b/apis/python/src/tiledbsoma/_arrow_types.py @@ -169,7 +169,7 @@ def tiledb_schema_to_arrow( if attr.enum_label is not None: # enumerated if A is None: A = tiledb.open(uri, ctx=ctx) - info = A.enum(name) + info = A.enum(attr.enum_label) arrow_schema_dict[name] = pa.dictionary( index_type=arrow_type_from_tiledb_dtype(attr.dtype), value_type=arrow_type_from_tiledb_dtype( diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 4617bd93b0..d4aefff69f 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -15,9 +15,10 @@ from typing_extensions import Self from . import _util +from . 
import pytiledbsoma as clib from ._common_nd_array import NDArray from ._exception import SOMAError -from ._tdb_handles import ArrayWrapper +from ._tdb_handles import DenseNDArrayWrapper from ._util import dense_indices_to_shape from .options._tiledb_create_options import TileDBCreateOptions @@ -72,7 +73,7 @@ class DenseNDArray(NDArray, somacore.DenseNDArray): __slots__ = () - _reader_wrapper_type = ArrayWrapper + _reader_wrapper_type = DenseNDArrayWrapper def read( self, @@ -107,7 +108,7 @@ def read( Lifecycle: Experimental. """ - del partitions, platform_config # Currently unused. + del partitions # Currently unused. self._check_open_read() result_order = somacore.ResultOrder(result_order) @@ -123,13 +124,31 @@ def read( # # The only exception is if the array has been created but no data have been written at # all, in which case the best we can do is use the schema shape. - data_shape = self._handle.schema.shape + handle: clib.DenseNDArrayWrapper = self._handle._handle + + data_shape = handle.shape ned = self.non_empty_domain() if ned is not None: data_shape = tuple(slot[1] + 1 for slot in ned) target_shape = dense_indices_to_shape(coords, data_shape, result_order) - sr = self._soma_reader(result_order=result_order) + config = handle.config().copy() + config.update(platform_config or {}) + + ts = None + if handle.timestamp is not None: + ts = (0, handle.timestamp) + + sr = clib.SOMADenseNDArray.open( + uri=handle.uri, + mode=clib.OpenMode.read, + platform_config=config, + column_names=[], + result_order=_util.to_clib_result_order(result_order), + timestamp=ts, + ) + + # sr = self._soma_reader(result_order=result_order) self._set_reader_coords(sr, coords) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 8513d30bdb..0d2c2ac99f 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -319,8 +319,8 @@ def _do_initial_reads(self, reader: tiledb.Group) -> None: 
} -class DataFrameWrapper(Wrapper[clib.SOMADataFrame]): - """Wrapper around a Pybind11 SOMADataFrame handle.""" +class SOMAArrayWrapper(Wrapper[clib.SOMAArray]): + """Base class for Pybind11 SOMAArrayWrapper handles.""" @classmethod def _opener( @@ -329,19 +329,8 @@ def _opener( mode: options.OpenMode, context: SOMATileDBContext, timestamp: int, - ) -> clib.SOMADataFrame: - open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write - config = {k: str(v) for k, v in context.tiledb_config.items()} - column_names: List[str] = [] - result_order = clib.ResultOrder.automatic - return clib.SOMADataFrame.open( - uri, - open_mode, - config, - column_names, - result_order, - (0, timestamp), - ) + ) -> clib.SOMAArray: + raise NotImplementedError # Covariant types should normally not be in parameters, but this is for # internal use only so it's OK. @@ -364,17 +353,13 @@ def meta(self) -> "MetadataWrapper": @property def ndim(self) -> int: - return len(self._handle.index_column_names) - - @property - def count(self) -> int: - return int(self._handle.count) + return len(self._handle.dimension_names) def _cast_domain( self, domain: Callable[[str, DTypeLike], Tuple[object, object]] ) -> Tuple[Tuple[object, object], ...]: result = [] - for name in self._handle.index_column_names: + for name in self._handle.dimension_names: dtype = self._handle.schema.field(name).type if pa.types.is_timestamp(dtype): np_dtype = np.dtype(dtype.to_pandas_dtype()) @@ -405,12 +390,12 @@ def non_empty_domain(self) -> Tuple[Tuple[object, object], ...]: @property def attr_names(self) -> Tuple[str, ...]: return tuple( - f.name for f in self.schema if f.name not in self._handle.index_column_names + f.name for f in self.schema if f.name not in self._handle.dimension_names ) @property def dim_names(self) -> Tuple[str, ...]: - return tuple(self._handle.index_column_names) + return tuple(self._handle.dimension_names) def enum(self, label: str) -> tiledb.Enumeration: # The DataFrame handle may either 
be ArrayWrapper or DataFrameWrapper. @@ -419,6 +404,64 @@ def enum(self, label: str) -> tiledb.Enumeration: raise NotImplementedError +class DataFrameWrapper(SOMAArrayWrapper, Wrapper[clib.SOMADataFrame]): + """Wrapper around a Pybind11 SOMADataFrame handle.""" + + @classmethod + def _opener( + cls, + uri: str, + mode: options.OpenMode, + context: SOMATileDBContext, + timestamp: int, + ) -> clib.SOMADataFrame: + open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write + config = {k: str(v) for k, v in context.tiledb_config.items()} + column_names: List[str] = [] + result_order = clib.ResultOrder.automatic + return clib.SOMADataFrame.open( + uri, + open_mode, + config, + column_names, + result_order, + (0, timestamp), + ) + + @property + def count(self) -> int: + return int(self._handle.count) + + +class DenseNDArrayWrapper(SOMAArrayWrapper, Wrapper[clib.SOMADenseNDArray]): + """Wrapper around a Pybind11 DenseNDArrayWrapper handle.""" + + @classmethod + def _opener( + cls, + uri: str, + mode: options.OpenMode, + context: SOMATileDBContext, + timestamp: int, + ) -> clib.SOMADenseNDArray: + open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write + config = {k: str(v) for k, v in context.tiledb_config.items()} + column_names: List[str] = [] + result_order = clib.ResultOrder.automatic + return clib.SOMADenseNDArray.open( + uri, + open_mode, + config, + column_names, + result_order, + (0, timestamp), + ) + + @property + def shape(self) -> Tuple[int, ...]: + return tuple(self._handle.shape) + + class _DictMod(enum.Enum): """State machine to keep track of modifications to a dictionary. diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index a444c41c4a..288cd03bc3 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -39,6 +39,7 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): Experimental. 
""" + """Class variable of the Wrapper class used to open this object type.""" _wrapper_type: Type[_WrapperType_co] _reader_wrapper_type: Union[ Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper] @@ -128,8 +129,6 @@ def __init__( self._handle = handle self._close_stack.enter_context(self._handle) - """Class variable of the Wrapper class used to open this object type.""" - @property def context(self) -> SOMATileDBContext: return self._handle.context diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 9d21b08f01..23059bfee0 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -115,6 +115,14 @@ def test_dataframe(tmp_path, arrow_schema): assert sdf.count == 5 assert len(sdf) == 5 + # Ensure read mode uses clib object + with soma.DataFrame.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADataFrame) + + # Ensure write mode uses Python object + with soma.DataFrame.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, tiledb.Array) + def test_dataframe_with_float_dim(tmp_path, arrow_schema): sdf = soma.DataFrame.create( diff --git a/apis/python/tests/test_dense_nd_array.py b/apis/python/tests/test_dense_nd_array.py index 47d363058b..665a78fb71 100644 --- a/apis/python/tests/test_dense_nd_array.py +++ b/apis/python/tests/test_dense_nd_array.py @@ -49,6 +49,14 @@ def test_dense_nd_array_create_ok( with tiledb.open(tmp_path.as_posix()) as A: assert not A.schema.sparse + # Ensure read mode uses clib object + with soma.DenseNDArray.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADenseNDArray) + + # Ensure write mode uses Python object + with soma.DenseNDArray.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, tiledb.Array) + @pytest.mark.parametrize("shape", [(10,)]) @pytest.mark.parametrize("element_type", NDARRAY_ARROW_TYPES_NOT_SUPPORTED) 
diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index 7a8d41dc87..39f2b5d18b 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -129,6 +129,11 @@ class SOMADenseNDArray : public SOMAArray { : SOMAArray(other) { } + SOMADenseNDArray() = delete; + SOMADenseNDArray(const SOMADenseNDArray&) = default; + SOMADenseNDArray(SOMADenseNDArray&&) = delete; + ~SOMADenseNDArray() = default; + using SOMAArray::open; /** From 6af74c544c58534538cadf43c26afa52dcd0f9f1 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 11:31:39 -0600 Subject: [PATCH 02/70] Update `DenseNDArray` with new `SOMAContext` --- apis/python/src/tiledbsoma/_dense_nd_array.py | 11 ++++--- apis/python/src/tiledbsoma/_tdb_handles.py | 4 ++- .../src/tiledbsoma/soma_dense_ndarray.cc | 33 +++++++++---------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index d4aefff69f..5b77d581ee 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -132,8 +132,11 @@ def read( data_shape = tuple(slot[1] + 1 for slot in ned) target_shape = dense_indices_to_shape(coords, data_shape, result_order) - config = handle.config().copy() - config.update(platform_config or {}) + context = handle.context() + if platform_config is not None: + config = context.tiledb_config.copy() + config.update(platform_config or {}) + context = clib.SOMAContext(config) ts = None if handle.timestamp is not None: @@ -142,14 +145,12 @@ def read( sr = clib.SOMADenseNDArray.open( uri=handle.uri, mode=clib.OpenMode.read, - platform_config=config, + context=context, column_names=[], result_order=_util.to_clib_result_order(result_order), timestamp=ts, ) - # sr = self._soma_reader(result_order=result_order) - self._set_reader_coords(sr, coords) arrow_tables = [] diff 
--git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 0d2c2ac99f..24c4c2992a 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -72,11 +72,13 @@ def open( if open_mode == clib.OpenMode.read and obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) + elif open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": + return DenseNDArrayWrapper._from_soma_object(soma_object, context) if obj_type in ( "SOMADataFrame", - "SOMASparseNDArray", "SOMADenseNDArray", + "SOMASparseNDArray", "array", ): return ArrayWrapper.open(uri, mode, context, timestamp) diff --git a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc index 3f99121174..a912d19232 100644 --- a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc @@ -49,23 +49,22 @@ using namespace tiledbsoma; void load_soma_dense_ndarray(py::module& m) { py::class_(m, "SOMADenseNDArray") - .def_static( - "open", - py::overload_cast< - std::string_view, - OpenMode, - std::shared_ptr, - std::vector, - ResultOrder, - std::optional>>( - &SOMADenseNDArray::open), - "uri"_a, - "mode"_a, - "ctx"_a, - py::kw_only(), - "column_names"_a = py::none(), - "result_order"_a = ResultOrder::automatic, - "timestamp"_a = py::none()) + .def_static( + "open", + py::overload_cast< + std::string_view, + OpenMode, + std::shared_ptr, + std::vector, + ResultOrder, + std::optional>>(&SOMADenseNDArray::open), + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "column_names"_a = py::none(), + "result_order"_a = ResultOrder::automatic, + "timestamp"_a = py::none()) .def_static("exists", &SOMADenseNDArray::exists); } From 7ee29edc9b8a0bc5430518ff0b3c6844f271f91f Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 13:59:33 -0600 Subject: [PATCH 03/70] Add documentation for _opener virtual 
method --- apis/python/src/tiledbsoma/_tdb_handles.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 24c4c2992a..c55e36a99d 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -332,6 +332,9 @@ def _opener( context: SOMATileDBContext, timestamp: int, ) -> clib.SOMAArray: + # Virtual method to implement _opener in derived classes + # SOMADataFrameWrapper, SOMASparseNDArrayWrapper, and + # SOMADenseNDArrayWrapper raise NotImplementedError # Covariant types should normally not be in parameters, but this is for From 3412ff5608a4dfd3ad562ac5d041915adddb9eb2 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 14:01:36 -0600 Subject: [PATCH 04/70] Correct documentation --- apis/python/src/tiledbsoma/_tdb_handles.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index c55e36a99d..0439b198a1 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -332,9 +332,9 @@ def _opener( context: SOMATileDBContext, timestamp: int, ) -> clib.SOMAArray: - # Virtual method to implement _opener in derived classes - # SOMADataFrameWrapper, SOMASparseNDArrayWrapper, and - # SOMADenseNDArrayWrapper + # Ensure SOMADataFrameWrapper, SOMASparseNDArrayWrapper, and + # SOMADenseNDArrayWrapper have _opener implemented to open the correct + # clib object (clib.DataFrame.open, etc) raise NotImplementedError # Covariant types should normally not be in parameters, but this is for From 1fd5a5b377265094fd1de9fedf4b1548ff4f03b8 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 15:25:53 -0600 Subject: [PATCH 05/70] Pass in as kw args; replace virtual method --- apis/python/src/tiledbsoma/_tdb_handles.py | 71 ++++++-------------- 
apis/python/src/tiledbsoma/_tiledb_object.py | 4 +- apis/python/src/tiledbsoma/soma_object.cc | 63 +++++++++-------- 3 files changed, 58 insertions(+), 80 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 0439b198a1..714a22485a 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -16,7 +16,6 @@ Dict, Generic, Iterator, - List, Mapping, MutableMapping, Optional, @@ -57,7 +56,7 @@ def open( # if there is not a valid SOMAObject at the given URI, this # returns None soma_object = clib.SOMAObject.open( - uri, open_mode, context.native_context, (0, timestamp_ms) + uri, open_mode, context.native_context, timestamp=(0, timestamp_ms) ) # Avoid creating a TileDB-Py Ctx unless necessary @@ -321,9 +320,14 @@ def _do_initial_reads(self, reader: tiledb.Group) -> None: } -class SOMAArrayWrapper(Wrapper[clib.SOMAArray]): +_ArrType = TypeVar("_ArrType", bound=clib.SOMAArray) + + +class SOMAArrayWrapper(Wrapper[_ArrType]): """Base class for Pybind11 SOMAArrayWrapper handles.""" + _WRAPPED_TYPE: Type[_ArrType] + @classmethod def _opener( cls, @@ -331,11 +335,16 @@ def _opener( mode: options.OpenMode, context: SOMATileDBContext, timestamp: int, - ) -> clib.SOMAArray: - # Ensure SOMADataFrameWrapper, SOMASparseNDArrayWrapper, and - # SOMADenseNDArrayWrapper have _opener implemented to open the correct - # clib object (clib.DataFrame.open, etc) - raise NotImplementedError + ) -> clib.SOMADenseNDArray: + open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write + return cls._WRAPPED_TYPE.open( + uri, + open_mode, + context=context.native_context, + column_names=[], + result_order=clib.ResultOrder.automatic, + timestamp=(0, timestamp), + ) # Covariant types should normally not be in parameters, but this is for # internal use only so it's OK. 
@@ -409,58 +418,20 @@ def enum(self, label: str) -> tiledb.Enumeration: raise NotImplementedError -class DataFrameWrapper(SOMAArrayWrapper, Wrapper[clib.SOMADataFrame]): +class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): """Wrapper around a Pybind11 SOMADataFrame handle.""" - @classmethod - def _opener( - cls, - uri: str, - mode: options.OpenMode, - context: SOMATileDBContext, - timestamp: int, - ) -> clib.SOMADataFrame: - open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write - config = {k: str(v) for k, v in context.tiledb_config.items()} - column_names: List[str] = [] - result_order = clib.ResultOrder.automatic - return clib.SOMADataFrame.open( - uri, - open_mode, - config, - column_names, - result_order, - (0, timestamp), - ) + _WRAPPED_TYPE = clib.SOMADataFrame @property def count(self) -> int: return int(self._handle.count) -class DenseNDArrayWrapper(SOMAArrayWrapper, Wrapper[clib.SOMADenseNDArray]): +class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): """Wrapper around a Pybind11 DenseNDArrayWrapper handle.""" - @classmethod - def _opener( - cls, - uri: str, - mode: options.OpenMode, - context: SOMATileDBContext, - timestamp: int, - ) -> clib.SOMADenseNDArray: - open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write - config = {k: str(v) for k, v in context.tiledb_config.items()} - column_names: List[str] = [] - result_order = clib.ResultOrder.automatic - return clib.SOMADenseNDArray.open( - uri, - open_mode, - config, - column_names, - result_order, - (0, timestamp), - ) + _WRAPPED_TYPE = clib.SOMADenseNDArray @property def shape(self) -> Tuple[int, ...]: diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index 288cd03bc3..42a7c78ba3 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -42,7 +42,9 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): """Class variable of 
the Wrapper class used to open this object type.""" _wrapper_type: Type[_WrapperType_co] _reader_wrapper_type: Union[ - Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper] + Type[_WrapperType_co], + Type[_tdb_handles.DataFrameWrapper], + Type[_tdb_handles.DenseNDArrayWrapper], ] __slots__ = ("_close_stack", "_handle") diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 3bdb3647f3..17b750e960 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -50,32 +50,37 @@ using namespace tiledbsoma; void load_soma_object(py::module& m) { py::class_(m, "SOMAObject") - .def_static( - "open", - [](std::string_view uri, - OpenMode mode, - std::shared_ptr ctx, - std::optional> timestamp) - -> py::object { - try { - auto obj = SOMAObject::open(uri, mode, ctx, timestamp); - if (obj->type() == "SOMADataFrame") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMASparseNDArray") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMADenseNDArray") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMACollection") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAExperiment") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAMeasurement") - return py::cast(dynamic_cast(*obj)); - return py::none(); - } catch (...) 
{ - return py::none(); - } - }) - .def_property_readonly("type", &SOMAObject::type); -}; -} // namespace libtiledbsomacpp + .def_static("open", [](std::string_view uri, + OpenMode mode, + std::shared_ptr context, + std::optional> timestamp) -> py::object { + try{ + auto obj = SOMAObject::open(uri, mode, context, timestamp); + if (obj->type() == "SOMADataFrame") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMASparseNDArray") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMADenseNDArray") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMACollection") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMAExperiment") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMAMeasurement") + return py::cast(dynamic_cast(*obj)); + return py::none(); + }catch(...){ + return py::none(); + } + }, + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "timestamp"_a = py::none()) + + .def_property_readonly("type", &SOMAObject::type); + + }; +} + From 7185a5efddd91ba44d81d4ad54b976c8209c2bcd Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 15:47:47 -0600 Subject: [PATCH 06/70] Run formatting --- apis/python/src/tiledbsoma/soma_array.cc | 45 +++++++++++++ .../src/tiledbsoma/soma_dense_ndarray.cc | 33 ++++----- apis/python/src/tiledbsoma/soma_object.cc | 67 ++++++++++--------- 3 files changed, 96 insertions(+), 49 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 8254aca816..4ea10e6575 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -186,15 +186,23 @@ void load_soma_array(py::module& m) { "batch_size"_a = "auto", "result_order"_a = ResultOrder::automatic) + .def( + "reopen", + py::overload_cast< + OpenMode, + std::optional>>(&SOMAArray::open)) + .def( "reopen", py::overload_cast< OpenMode, std::optional>>(&SOMAArray::open)) .def("close", 
&SOMAArray::close) + .def_property_readonly( "closed", [](SOMAArray& reader) -> bool { return not reader.is_open(); }) + .def_property_readonly( "mode", [](SOMAArray& reader) { @@ -585,6 +593,7 @@ void load_soma_array(py::module& m) { "Unsupported dtype for nonempty domain."); } }) + .def( "domain", [](SOMAArray& reader, std::string name, py::dtype dtype) { @@ -633,6 +642,42 @@ void load_soma_array(py::module& m) { } }) + .def_property_readonly("dimension_names", &SOMAArray::dimension_names) + + .def("set_metadata", &SOMAArray::set_metadata) + + .def("delete_metadata", &SOMAArray::delete_metadata) + + .def( + "get_metadata", + py::overload_cast(&SOMAArray::get_metadata)) + + .def_property_readonly( + "meta", + [](SOMAArray& soma_dataframe) -> py::dict { + py::dict results; + + for (auto const& [key, val] : soma_dataframe.get_metadata()) { + tiledb_datatype_t tdb_type = std::get( + val); + uint32_t value_num = std::get(val); + const void* value = std::get(val); + + if (tdb_type == TILEDB_STRING_UTF8) { + results[py::str(key)] = py::str( + std::string((const char*)value, value_num)); + } else if (tdb_type == TILEDB_STRING_ASCII) { + results[py::str(key)] = py::bytes( + std::string((const char*)value, value_num)); + } else { + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + results[py::str(key)] = py::array( + value_type, value_num, value); + } + } + return results; + }) + .def("set_metadata", &SOMAArray::set_metadata) .def("delete_metadata", &SOMAArray::delete_metadata) .def( diff --git a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc index a912d19232..06b7b257f0 100644 --- a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc @@ -49,22 +49,23 @@ using namespace tiledbsoma; void load_soma_dense_ndarray(py::module& m) { py::class_(m, "SOMADenseNDArray") - .def_static( - "open", - py::overload_cast< - std::string_view, - OpenMode, - std::shared_ptr, - 
std::vector, - ResultOrder, - std::optional>>(&SOMADenseNDArray::open), - "uri"_a, - "mode"_a, - "context"_a, - py::kw_only(), - "column_names"_a = py::none(), - "result_order"_a = ResultOrder::automatic, - "timestamp"_a = py::none()) + .def_static( + "open", + py::overload_cast< + std::string_view, + OpenMode, + std::shared_ptr, + std::vector, + ResultOrder, + std::optional>>( + &SOMADenseNDArray::open), + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "column_names"_a = py::none(), + "result_order"_a = ResultOrder::automatic, + "timestamp"_a = py::none()) .def_static("exists", &SOMADenseNDArray::exists); } diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 17b750e960..5ab4c1140e 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -50,37 +50,38 @@ using namespace tiledbsoma; void load_soma_object(py::module& m) { py::class_(m, "SOMAObject") - .def_static("open", [](std::string_view uri, - OpenMode mode, - std::shared_ptr context, - std::optional> timestamp) -> py::object { - try{ - auto obj = SOMAObject::open(uri, mode, context, timestamp); - if (obj->type() == "SOMADataFrame") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMASparseNDArray") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMADenseNDArray") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMACollection") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAExperiment") - return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAMeasurement") - return py::cast(dynamic_cast(*obj)); - return py::none(); - }catch(...){ - return py::none(); - } - }, - "uri"_a, - "mode"_a, - "context"_a, - py::kw_only(), - "timestamp"_a = py::none()) - - .def_property_readonly("type", &SOMAObject::type); - - }; -} + .def_static( + "open", + [](std::string_view uri, + OpenMode mode, + std::shared_ptr context, + 
std::optional> timestamp) + -> py::object { + try { + auto obj = SOMAObject::open(uri, mode, context, timestamp); + if (obj->type() == "SOMADataFrame") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMASparseNDArray") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMADenseNDArray") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMACollection") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMAExperiment") + return py::cast(dynamic_cast(*obj)); + else if (obj->type() == "SOMAMeasurement") + return py::cast(dynamic_cast(*obj)); + return py::none(); + } catch (...) { + return py::none(); + } + }, + "uri"_a, + "mode"_a, + "context"_a, + py::kw_only(), + "timestamp"_a = py::none()) + .def_property_readonly("type", &SOMAObject::type); +}; +} // namespace libtiledbsomacpp From 614d88638788035514d23a7ab3a7a178db874cee Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 16:15:06 -0600 Subject: [PATCH 07/70] Correct soma_array.cc post-merge --- apis/python/src/tiledbsoma/soma_array.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 4ea10e6575..73c5462dc2 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -186,23 +186,15 @@ void load_soma_array(py::module& m) { "batch_size"_a = "auto", "result_order"_a = ResultOrder::automatic) - .def( - "reopen", - py::overload_cast< - OpenMode, - std::optional>>(&SOMAArray::open)) - .def( "reopen", py::overload_cast< OpenMode, std::optional>>(&SOMAArray::open)) .def("close", &SOMAArray::close) - .def_property_readonly( "closed", [](SOMAArray& reader) -> bool { return not reader.is_open(); }) - .def_property_readonly( "mode", [](SOMAArray& reader) { @@ -679,10 +671,13 @@ void load_soma_array(py::module& m) { }) .def("set_metadata", &SOMAArray::set_metadata) + 
.def("delete_metadata", &SOMAArray::delete_metadata) + .def( "get_metadata", py::overload_cast(&SOMAArray::get_metadata)) + .def_property_readonly( "meta", [](SOMAArray& soma_dataframe) -> py::dict { @@ -709,6 +704,7 @@ void load_soma_array(py::module& m) { return results; }) .def("has_metadata", &SOMAArray::has_metadata) + .def("metadata_num", &SOMAArray::metadata_num); } } // namespace libtiledbsomacpp \ No newline at end of file From 6f12efdf2731e1acd23afc7b330ad7c62430006e Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 18:08:12 -0600 Subject: [PATCH 08/70] Changes according to review --- apis/python/src/tiledbsoma/_dataframe.py | 8 ++------ apis/python/src/tiledbsoma/_dense_nd_array.py | 8 ++------ apis/python/src/tiledbsoma/_tdb_handles.py | 2 +- apis/python/src/tiledbsoma/_tiledb_object.py | 2 +- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index c5fcd71a4d..4a1c480f1a 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -343,20 +343,16 @@ def read( context = handle.context() if platform_config is not None: config = context.tiledb_config.copy() - config.update(platform_config or {}) + config.update(platform_config) context = clib.SOMAContext(config) - ts = None - if handle.timestamp is not None: - ts = (0, handle.timestamp) - sr = clib.SOMADataFrame.open( uri=handle.uri, mode=clib.OpenMode.read, context=context, column_names=column_names or [], result_order=_util.to_clib_result_order(result_order), - timestamp=ts, + timestamp=handle.timestamp and (0, handle.timestamp), ) if value_filter is not None: diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index 5b77d581ee..117387dd8b 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -135,20 +135,16 @@ def read( context = 
handle.context() if platform_config is not None: config = context.tiledb_config.copy() - config.update(platform_config or {}) + config.update(platform_config) context = clib.SOMAContext(config) - ts = None - if handle.timestamp is not None: - ts = (0, handle.timestamp) - sr = clib.SOMADenseNDArray.open( uri=handle.uri, mode=clib.OpenMode.read, context=context, column_names=[], result_order=_util.to_clib_result_order(result_order), - timestamp=ts, + timestamp=handle.timestamp and (0, handle.timestamp), ) self._set_reader_coords(sr, coords) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 714a22485a..8f7697b7a7 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -71,7 +71,7 @@ def open( if open_mode == clib.OpenMode.read and obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) - elif open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": + if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": return DenseNDArrayWrapper._from_soma_object(soma_object, context) if obj_type in ( diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index 42a7c78ba3..37801df851 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -39,13 +39,13 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): Experimental. 
""" - """Class variable of the Wrapper class used to open this object type.""" _wrapper_type: Type[_WrapperType_co] _reader_wrapper_type: Union[ Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper], Type[_tdb_handles.DenseNDArrayWrapper], ] + """Class variables of the Wrapper class used to open this object type.""" __slots__ = ("_close_stack", "_handle") From a8c383658cd27c23463072c76bd9dedb60be6877 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 23 Feb 2024 18:10:23 -0600 Subject: [PATCH 09/70] Move under _wrapper_type --- apis/python/src/tiledbsoma/_tiledb_object.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index 37801df851..325ede9afc 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -40,12 +40,13 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): """ _wrapper_type: Type[_WrapperType_co] + """Class variable of the Wrapper class used to open this object type.""" + _reader_wrapper_type: Union[ Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper], Type[_tdb_handles.DenseNDArrayWrapper], ] - """Class variables of the Wrapper class used to open this object type.""" __slots__ = ("_close_stack", "_handle") From 7a0c6b205ac55d76cedab0d578248771aa3e47bf Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 12 Feb 2024 22:29:18 -0600 Subject: [PATCH 10/70] [python] Use bindings for `SparseNDArray` readpath --- .../python/src/tiledbsoma/_sparse_nd_array.py | 24 +++++++++++++++---- apis/python/src/tiledbsoma/_tdb_handles.py | 16 +++++++++++++ apis/python/src/tiledbsoma/_tiledb_object.py | 1 + .../src/tiledbsoma/soma_sparse_ndarray.cc | 2 +- apis/python/tests/test_platform_config.py | 13 ++++++++++ apis/python/tests/test_sparse_nd_array.py | 8 +++++++ libtiledbsoma/src/soma/soma_sparse_ndarray.h | 5 ++++ 7 files changed, 64 insertions(+), 5 deletions(-) diff 
--git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index 8397c3c5f7..4220eeb82a 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -38,7 +38,7 @@ SparseCOOTensorReadIter, TableReadIter, ) -from ._tdb_handles import ArrayWrapper +from ._tdb_handles import SparseNDArrayWrapper from ._types import NTuple from .options._tiledb_create_options import TileDBCreateOptions @@ -95,7 +95,7 @@ class SparseNDArray(NDArray, somacore.SparseNDArray): __slots__ = () - _reader_wrapper_type = ArrayWrapper + _reader_wrapper_type = SparseNDArrayWrapper # Inherited from somacore # * ndim accessor @@ -155,11 +155,27 @@ def read( ``slice(2,None)`` or ``slice(None,4)``. * Negative indexing is unsupported. """ - del batch_size, platform_config # Currently unused. + del batch_size # Currently unused. + handle: clib.SparseNDArrayWrapper = self._handle._handle + self._check_open_read() _util.check_unpartitioned(partitions) - sr = self._soma_reader(schema=self._handle.schema, result_order=result_order) + context = handle.context() + if platform_config is not None: + config = context.tiledb_config.copy() + config.update(platform_config) + context = clib.SOMAContext(config) + + sr = clib.SOMASparseNDArray.open( + uri=handle.uri, + mode=clib.OpenMode.read, + context=context, + column_names=[], + result_order=_util.to_clib_result_order(result_order), + timestamp=handle.timestamp and (0, handle.timestamp), + ) + return SparseNDArrayRead(sr, self, coords) def write( diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 8f7697b7a7..65ae378eea 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -73,6 +73,8 @@ def open( return DataFrameWrapper._from_soma_object(soma_object, context) if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": return 
DenseNDArrayWrapper._from_soma_object(soma_object, context) + if open_mode == clib.OpenMode.read and obj_type == "SOMASparseNDArray": + return SparseNDArrayWrapper._from_soma_object(soma_object, context) if obj_type in ( "SOMADataFrame", @@ -438,6 +440,20 @@ def shape(self) -> Tuple[int, ...]: return tuple(self._handle.shape) +class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]): + """Wrapper around a Pybind11 SOMASparseNDArray handle.""" + + _WRAPPED_TYPE = clib.SOMASparseNDArray + + @property + def shape(self) -> Tuple[int, ...]: + return tuple(self._handle.shape) + + @property + def nnz(self) -> int: + return int(self._handle.nnz) + + class _DictMod(enum.Enum): """State machine to keep track of modifications to a dictionary. diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index 325ede9afc..cc9f863bf5 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -46,6 +46,7 @@ class TileDBObject(somacore.SOMAObject, Generic[_WrapperType_co]): Type[_WrapperType_co], Type[_tdb_handles.DataFrameWrapper], Type[_tdb_handles.DenseNDArrayWrapper], + Type[_tdb_handles.SparseNDArrayWrapper], ] __slots__ = ("_close_stack", "_handle") diff --git a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc index d4f1b429ba..a6dce317dd 100644 --- a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc @@ -61,7 +61,7 @@ void load_soma_sparse_ndarray(py::module& m) { &SOMASparseNDArray::open), "uri"_a, "mode"_a, - "ctx"_a, + "context"_a, py::kw_only(), "column_names"_a = py::none(), "result_order"_a = ResultOrder::automatic, diff --git a/apis/python/tests/test_platform_config.py b/apis/python/tests/test_platform_config.py index 680936c1e0..08018f3ba4 100644 --- a/apis/python/tests/test_platform_config.py +++ b/apis/python/tests/test_platform_config.py @@ -26,7
+26,20 @@ def adata(h5ad_file): return anndata.read_h5ad(h5ad_file) +@pytest.mark.skip(reason="No longer return ArraySchema - see note in test") def test_platform_config(adata): + # TODO as we remove usage of TileDB-Py in favor of ArrowSchema, we + # need a new method to get which filters have been applied to the column + # rather than grabbing it from the ArraySchema. One consideration + # would be to store TileDB information in JSON format as a field in + # the ArraySchema metadata very similar to how Pandas stores information + # within pa.Schema.pandas_metadata. This could hold not only which + # filters have been applied to the column, but other info that cannot + # be "directly" stored in the ArrowSchema such as whether the column + # is a TileDB attribute or dimension, whether this represents a dense + # or sparse array, etc. This may be as easy as simply copying the + # platform_config by calling pa.Schema.with_metadata(platform_config). + # Set up anndata input path and tiledb-group output path original = adata.copy() with tempfile.TemporaryDirectory() as output_path: diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index d31b64582f..cb27dcfddd 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -55,6 +55,14 @@ def test_sparse_nd_array_create_ok( assert a.schema.field(f"soma_dim_{d}").type == pa.int64() assert a.schema.field("soma_data").type == element_type + # Ensure read mode uses clib object + with soma.SparseNDArray.open(tmp_path.as_posix(), "r") as A: + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMASparseNDArray) + + # Ensure write mode uses Python object + with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as A: + assert isinstance(A._handle._handle, tiledb.Array) + @pytest.mark.parametrize("shape", [(10,)]) @pytest.mark.parametrize("element_type", NDARRAY_ARROW_TYPES_NOT_SUPPORTED) diff --git
a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index 1842a88cb2..f201ff7614 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -129,6 +129,11 @@ class SOMASparseNDArray : public SOMAArray { : SOMAArray(other) { } + SOMASparseNDArray() = delete; + SOMASparseNDArray(const SOMASparseNDArray&) = default; + SOMASparseNDArray(SOMASparseNDArray&&) = delete; + ~SOMASparseNDArray() = default; + using SOMAArray::open; /** From 71e13f58e28c178185ab2438afdf2fc4c1300f9b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 22 Feb 2024 14:36:06 -0600 Subject: [PATCH 11/70] WIP export to c --- apis/python/src/tiledbsoma/_dataframe.py | 5 +++-- apis/python/src/tiledbsoma/_tdb_handles.py | 2 +- apis/python/src/tiledbsoma/soma_array.cc | 15 +++++++++++++++ libtiledbsoma/src/soma/array_buffers.h | 2 +- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4a1c480f1a..9a25dbc042 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -122,6 +122,7 @@ class DataFrame(TileDBArray, somacore.DataFrame): it must be ``None``. 
""" + _wrapper_type = DataFrameWrapper _reader_wrapper_type = DataFrameWrapper @classmethod @@ -491,7 +492,7 @@ def write( return self def _set_reader_coord( - self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object + self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Table, coord: object ) -> bool: if coord is None: return True # No constraint; select all in this dimension @@ -565,7 +566,7 @@ def _set_reader_coord_by_py_seq_or_np_array( self, sr: clib.SOMAArray, dim_idx: int, - dim: tiledb.Dim, + dim: pa.Table, coord: object, ) -> bool: if isinstance(coord, np.ndarray): diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 65ae378eea..7d1066584e 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -69,7 +69,7 @@ def open( if not obj_type: raise DoesNotExistError(f"{uri!r} does not exist") - if open_mode == clib.OpenMode.read and obj_type == "SOMADataFrame": + if obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": return DenseNDArrayWrapper._from_soma_object(soma_object, context) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 73c5462dc2..adb908efb5 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -513,6 +513,21 @@ void load_soma_array(py::module& m) { return std::nullopt; }) + .def( + "write", + [](SOMAArray& array, py::handle c_array) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + c_array.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); + + array.write( + std::shared_ptr( + reinterpret_cast(arrow_array_ptr) + )); + }) + .def("nnz", &SOMAArray::nnz, py::call_guard()) 
.def_property_readonly("shape", &SOMAArray::shape) diff --git a/libtiledbsoma/src/soma/array_buffers.h b/libtiledbsoma/src/soma/array_buffers.h index 202c306061..3ab142ea6e 100644 --- a/libtiledbsoma/src/soma/array_buffers.h +++ b/libtiledbsoma/src/soma/array_buffers.h @@ -47,7 +47,7 @@ using namespace tiledb; class ArrayBuffers { public: ArrayBuffers() = default; - ArrayBuffers(const ArrayBuffers&) = delete; + ArrayBuffers(const ArrayBuffers&) = default; ArrayBuffers(ArrayBuffers&&) = default; ~ArrayBuffers() = default; From 86c2dec72349aaa862ed4447960ed7cf08d532a3 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sat, 24 Feb 2024 09:17:51 -0600 Subject: [PATCH 12/70] WIP arrow type to tiledb type converter --- apis/python/src/tiledbsoma/_tdb_handles.py | 1 - apis/python/src/tiledbsoma/soma_array.cc | 10 +- libtiledbsoma/src/utils/arrow_adapter.cc | 134 +++++++++++++-------- libtiledbsoma/src/utils/arrow_adapter.h | 24 ++++ 4 files changed, 109 insertions(+), 60 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 7d1066584e..64633e6b9c 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -77,7 +77,6 @@ def open( return SparseNDArrayWrapper._from_soma_object(soma_object, context) if obj_type in ( - "SOMADataFrame", "SOMADenseNDArray", "SOMASparseNDArray", "array", diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index adb908efb5..2e5d8913b7 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -514,7 +514,7 @@ void load_soma_array(py::module& m) { }) .def( - "write", + "write", [](SOMAArray& array, py::handle c_array) { ArrowSchema arrow_schema; ArrowArray arrow_array; @@ -522,11 +522,9 @@ void load_soma_array(py::module& m) { uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); c_array.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); - 
array.write( - std::shared_ptr( - reinterpret_cast(arrow_array_ptr) - )); - }) + array.write(std::shared_ptr( + reinterpret_cast(arrow_array_ptr))); + }) .def("nnz", &SOMAArray::nnz, py::call_guard()) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4739bd0ed2..0c9c1052c2 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -240,6 +240,39 @@ std::pair ArrowAdapter::_get_data_and_length( } } +ArraySchema tiledb_schema_from_arrow_schema( + Context context, + std::shared_ptr arrow_schema, + std::vector index_column_names, + bool sparse) { + ArraySchema schema(context, sparse ? TILEDB_SPARSE : TILEDB_DENSE); + Domain domain(context); + + for (int64_t i = 0; i < arrow_schema->n_children; ++i) { + ArrowSchema* child = arrow_schema->children[i]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + + bool is_dim = std::find( + index_column_names.begin(), + index_column_names.end(), + child->name) != index_column_names.end(); + + if (is_dim) { + domain.add_dimension(Dimension::create( + context, child->name, type, dim_domain, tile_extent)); + } + // else { + // schema.add_attribute(Attribute::create(context, child->name)); + // } + } + + schema.set_domain(domain); + + schema.check(); + + return schema; +} + std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); @@ -374,60 +407,55 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { } std::string_view ArrowAdapter::to_arrow_format( - tiledb_datatype_t datatype, bool use_large) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return use_large ? "U" : "u"; // large because TileDB - // uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return use_large ? 
"Z" : "z"; // large because TileDB - // uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; + tiledb_datatype_t tiledb_dtype, bool use_large) { + auto u = use_large ? "U" : "u"; + auto z = use_large ? "Z" : "z"; + std::map _to_arrow_format_map = { + {TILEDB_STRING_ASCII, u}, {TILEDB_CHAR, z}, + {TILEDB_STRING_UTF8, u}, {TILEDB_BLOB, z}, + {TILEDB_INT8, "c"}, {TILEDB_UINT8, "C"}, + {TILEDB_INT16, "s"}, {TILEDB_UINT16, "S"}, + {TILEDB_INT32, "i"}, {TILEDB_UINT32, "I"}, + {TILEDB_INT64, "l"}, {TILEDB_UINT64, "L"}, + {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, + {TILEDB_BOOL, "b"}, {TILEDB_TIME_SEC, "tts"}, + {TILEDB_TIME_MS, "ttm"}, {TILEDB_TIME_US, "ttu"}, + {TILEDB_TIME_NS, "ttn"}, {TILEDB_DATETIME_SEC, "tss:"}, + {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, + {TILEDB_DATETIME_NS, "tsn:"}, + }; + + try { + return _to_arrow_format_map.at(tiledb_dtype); + } catch (const std::out_of_range& err) { + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB datatype: {} ", + tiledb::impl::type_to_str(tiledb_dtype))); + } +} + +tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype) { + std::map _to_tiledb_format_map = { + {"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8}, + {"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR}, + {"c", 
TILEDB_INT8}, {"C", TILEDB_UINT8}, + {"s", TILEDB_INT16}, {"S", TILEDB_UINT16}, + {"i", TILEDB_INT32}, {"I", TILEDB_UINT32}, + {"l", TILEDB_INT64}, {"L", TILEDB_UINT64}, + {"f", TILEDB_FLOAT32}, {"g", TILEDB_FLOAT64}, + {"b", TILEDB_BOOL}, {"tts", TILEDB_TIME_SEC}, + {"ttm", TILEDB_TIME_MS}, {"ttu", TILEDB_TIME_US}, + {"ttn", TILEDB_TIME_NS}, {"tss:", TILEDB_DATETIME_SEC}, + {"tsm:", TILEDB_DATETIME_MS}, {"tsu:", TILEDB_DATETIME_US}, + {"tsn:", TILEDB_DATETIME_NS}, + }; + + try { + return _to_tiledb_format_map.at(arrow_dtype); + } catch (const std::out_of_range& err) { + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported arrow datatype: {} ", arrow_dtype)); } - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); } } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index a210aca77c..cf1fecbeb3 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -46,9 +46,25 @@ class ArrowAdapter { static std::pair, std::unique_ptr> to_arrow(std::shared_ptr column); + /** + * @brief Create an ArrowSchema from TileDB Array + * + * @return std::unique_ptr + */ static std::unique_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); + /** + * @brief Create a TileDB ArraySchema from ArrowSchema + * + * @return tiledb::ArraySchema + */ + static ArraySchema tiledb_schema_from_arrow_schema( + Context context, + std::shared_ptr arrow_schema, + std::vector index_column_names, + bool sparse = true); + /** * @brief Get Arrow format string from TileDB datatype. * @@ -58,6 +74,14 @@ class ArrowAdapter { static std::string_view to_arrow_format( tiledb_datatype_t datatype, bool use_large = true); + /** + * @brief Get TileDB datatype from Arrow format string. + * + * @param arrow_dtype Arrow format string.
+ * @return tiledb_datatype_t TileDB datatype. + */ + static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype); + private: static std::pair _get_data_and_length( Enumeration& enmr, const void* dst); From 9b39e9e7bf758cfe408893bf675db85d869467e3 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sat, 24 Feb 2024 10:43:29 -0600 Subject: [PATCH 13/70] WIP create dimensions and attrs --- libtiledbsoma/src/utils/arrow_adapter.cc | 50 ++++++++++++++---------- libtiledbsoma/src/utils/arrow_adapter.h | 10 +++-- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 0c9c1052c2..880f07268e 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -186,7 +186,8 @@ std::pair ArrowAdapter::_get_data_and_length( } case TILEDB_INT8: { auto data = enmr.as_vector(); - return std::pair(_fill_data_buffer(data, dst), data.size()); + return std::pair( + ArrowAdapter::_fill_data_buffer(data, dst), data.size()); } case TILEDB_UINT8: { auto data = enmr.as_vector(); @@ -240,30 +241,23 @@ std::pair ArrowAdapter::_get_data_and_length( } } -ArraySchema tiledb_schema_from_arrow_schema( - Context context, - std::shared_ptr arrow_schema, - std::vector index_column_names, - bool sparse) { - ArraySchema schema(context, sparse ?
TILEDB_SPARSE : TILEDB_DENSE); - Domain domain(context); +ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( + Context ctx, ArrowSchema& arrow_schema, ArrowTable index_columns) { + ArraySchema schema(ctx, TILEDB_SPARSE); + Domain domain(ctx); - for (int64_t i = 0; i < arrow_schema->n_children; ++i) { - ArrowSchema* child = arrow_schema->children[i]; + for (int64_t i = 0; i < arrow_schema.n_children; ++i) { + ArrowSchema* child = arrow_schema.children[i]; auto type = ArrowAdapter::to_tiledb_format(child->format); + auto dim_info = ArrowAdapter::_get_dim_info(child->name, index_columns); - bool is_dim = std::find( - index_column_names.begin(), - index_column_names.end(), - child->name) != index_column_names.end(); - - if (is_dim) { + if (dim_info.has_value()) { + auto& [dim_dom, extent] = *dim_info; domain.add_dimension(Dimension::create( - context, child->name, type, dim_domain, tile_extent)); + ctx, child->name, type, dim_dom, extent)); + } else { + schema.add_attribute(Attribute(ctx, child->name, type)); } - // else { - // schema.add_attribute(Attribute::create(context, child->name)); - // } } schema.set_domain(domain); @@ -273,6 +267,22 @@ ArraySchema tiledb_schema_from_arrow_schema( return schema; } +std::optional> ArrowAdapter::_get_dim_info( + std::string_view dim_name, ArrowTable index_columns) { + auto index_columns_array = index_columns.first; + auto index_columns_schema = index_columns.second; + + for (int64_t i = 0; i < index_columns_array.n_children; ++i) { + if (dim_name == index_columns_schema.children[i]->name) { + auto dim_info = index_columns_array.children[i]->children; + auto domain = dim_info[0]->buffers[1]; + auto extent = dim_info[1]->buffers[1]; + return std::make_pair(domain, extent); + } + } + return std::nullopt; +} + std::pair, std::unique_ptr> ArrowAdapter::to_arrow(std::shared_ptr column) { std::unique_ptr schema = std::make_unique(); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h 
index cf1fecbeb3..c8b9c7757d 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -32,6 +32,8 @@ struct ArrowBuffer { std::shared_ptr buffer_; }; +using ArrowTable = std::pair; + class ArrowAdapter { public: static void release_schema(struct ArrowSchema* schema); @@ -60,10 +62,7 @@ class ArrowAdapter { * @return tiledb::ArraySchema */ static ArraySchema tiledb_schema_from_arrow_schema( - Context context, - std::shared_ptr arrow_schema, - std::vector index_column_names, - bool sparse = true); + Context ctx, ArrowSchema& arrow_schema, ArrowTable index_columns); /** * @brief Get Arrow format string from TileDB datatype. @@ -93,6 +92,9 @@ class ArrowAdapter { std::memcpy((void*)dst, src.data(), sz); return dst; } + + static std::optional> _get_dim_info( + std::string_view dim_name, ArrowTable index_columns); }; }; // namespace tiledbsoma From c0506c318f261ce6bc5e8136bde82bdf58bc61aa Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sat, 24 Feb 2024 10:59:10 -0600 Subject: [PATCH 14/70] WIP change SOMADataFrame to take ArrowSchema --- libtiledbsoma/src/soma/soma_collection.cc | 15 +++++++++------ libtiledbsoma/src/soma/soma_collection.h | 9 ++++++--- libtiledbsoma/src/soma/soma_dataframe.cc | 7 +++++-- libtiledbsoma/src/soma/soma_dataframe.h | 3 ++- libtiledbsoma/src/soma/soma_experiment.cc | 5 +++-- libtiledbsoma/src/soma/soma_experiment.h | 3 ++- libtiledbsoma/src/soma/soma_measurement.cc | 5 +++-- libtiledbsoma/src/soma/soma_measurement.h | 3 ++- libtiledbsoma/src/utils/arrow_adapter.cc | 14 ++++++++------ libtiledbsoma/src/utils/arrow_adapter.h | 4 +++- 10 files changed, 43 insertions(+), 25 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index 9fa9c654fb..dcec2953a3 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -107,9 +107,10 @@ std::shared_ptr SOMACollection::add_new_experiment( 
std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { + ArrowSchema& schema, + ArrowTable index_columns) { std::shared_ptr member = SOMAExperiment::create( - uri, schema, ctx); + uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -120,9 +121,10 @@ std::shared_ptr SOMACollection::add_new_measurement( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { + ArrowSchema& schema, + ArrowTable index_columns) { std::shared_ptr member = SOMAMeasurement::create( - uri, schema, ctx); + uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -133,9 +135,10 @@ std::shared_ptr SOMACollection::add_new_dataframe( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { + ArrowSchema& schema, + ArrowTable index_columns) { std::shared_ptr member = SOMADataFrame::create( - uri, schema, ctx); + uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index e869a7d68a..5f9049637e 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -155,7 +155,8 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + ArrowSchema& schema, + ArrowTable index_columns); /** * Create and add a SOMAMeasurement to the SOMACollection. @@ -170,7 +171,8 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + ArrowSchema& schema, + ArrowTable index_columns); /** * Create and add a SOMADataFrame to the SOMACollection. 
@@ -185,7 +187,8 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + ArrowSchema& schema, + ArrowTable index_columns); /** * Create and add a SOMADenseNDArray to the SOMACollection. diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 3fdab76d96..b9a9b46c97 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -41,9 +41,12 @@ using namespace tiledb; std::unique_ptr SOMADataFrame::create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMADataFrame"); + auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), schema, index_columns); + SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame"); return SOMADataFrame::open(uri, OpenMode::read, ctx); } diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 1ed21f0b02..fbb9fe055f 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -59,7 +59,8 @@ class SOMADataFrame : public SOMAArray { */ static std::unique_ptr create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx); /** diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index bfdfb417d2..a508aeb398 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -43,12 +43,13 @@ using namespace tiledb; std::unique_ptr SOMAExperiment::create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx) { std::string exp_uri(uri); SOMAGroup::create(ctx, exp_uri, "SOMAExperiment"); - SOMADataFrame::create(exp_uri + "/obs", schema, ctx); + 
SOMADataFrame::create(exp_uri + "/obs", schema, index_columns, ctx); SOMACollection::create(exp_uri + "/ms", ctx); auto group = SOMAGroup::open(OpenMode::write, exp_uri, ctx); diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index ff991a6a07..b8b776d2e2 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -56,7 +56,8 @@ class SOMAExperiment : public SOMACollection { */ static std::unique_ptr create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx); //=================================================================== diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 80c44f11ae..f394c35530 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -43,12 +43,13 @@ using namespace tiledb; std::unique_ptr SOMAMeasurement::create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx) { std::string exp_uri(uri); SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement"); - SOMADataFrame::create(exp_uri + "/var", schema, ctx); + SOMADataFrame::create(exp_uri + "/var", schema, index_columns, ctx); SOMACollection::create(exp_uri + "/X", ctx); SOMACollection::create(exp_uri + "/obsm", ctx); SOMACollection::create(exp_uri + "/obsp", ctx); diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index cfaf950549..7739b90946 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -57,7 +57,8 @@ class SOMAMeasurement : public SOMACollection { */ static std::unique_ptr create( std::string_view uri, - ArraySchema schema, + ArrowSchema& schema, + ArrowTable index_columns, std::shared_ptr ctx); //=================================================================== diff 
--git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 880f07268e..aa235bf5bd 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -242,9 +242,11 @@ std::pair ArrowAdapter::_get_data_and_length( } ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( - Context ctx, ArrowSchema& arrow_schema, ArrowTable index_columns) { - ArraySchema schema(ctx, TILEDB_SPARSE); - Domain domain(ctx); + std::shared_ptr ctx, + ArrowSchema& arrow_schema, + ArrowTable index_columns) { + ArraySchema schema(*ctx, TILEDB_SPARSE); + Domain domain(*ctx); for (int64_t i = 0; i < arrow_schema.n_children; ++i) { ArrowSchema* child = arrow_schema.children[i]; @@ -253,10 +255,10 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( if (dim_info.has_value()) { auto& [dim_dom, extent] = *dim_info; - domain.add_dimension(Dimension::create( - ctx, child->name, type, dim_dom, extent)); + domain.add_dimension( + Dimension::create(*ctx, child->name, type, dim_dom, extent)); } else { - schema.add_attribute(Attribute(ctx, child->name, type)); + schema.add_attribute(Attribute(*ctx, child->name, type)); } } diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index c8b9c7757d..01684149ca 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -62,7 +62,9 @@ class ArrowAdapter { * @return tiledb::ArraySchema */ static ArraySchema tiledb_schema_from_arrow_schema( - Context ctx, ArrowSchema& arrow_schema, ArrowTable index_columns); + std::shared_ptr ctx, + ArrowSchema& arrow_schema, + ArrowTable index_columns); /** * @brief Get Arrow format string from TileDB datatype. 
From e484bf387b79c9a30b77f5a2680901cab1ac64b2 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sat, 24 Feb 2024 16:44:30 -0600 Subject: [PATCH 15/70] Use ArrowSchema instead of TileDB Schema to create --- libtiledbsoma/src/soma/soma_array.h | 4 +- libtiledbsoma/src/soma/soma_collection.cc | 6 +- libtiledbsoma/src/soma/soma_collection.h | 6 +- libtiledbsoma/src/soma/soma_dataframe.cc | 4 +- libtiledbsoma/src/soma/soma_dataframe.h | 6 +- libtiledbsoma/src/soma/soma_dense_ndarray.cc | 2 +- libtiledbsoma/src/soma/soma_dense_ndarray.h | 2 +- libtiledbsoma/src/soma/soma_experiment.cc | 2 +- libtiledbsoma/src/soma/soma_experiment.h | 2 +- libtiledbsoma/src/soma/soma_measurement.cc | 2 +- libtiledbsoma/src/soma/soma_measurement.h | 2 +- libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 2 +- libtiledbsoma/src/soma/soma_sparse_ndarray.h | 4 +- libtiledbsoma/src/utils/arrow_adapter.cc | 27 ++++--- libtiledbsoma/src/utils/arrow_adapter.h | 16 ++-- libtiledbsoma/test/CMakeLists.txt | 2 + libtiledbsoma/test/unit_soma_collection.cc | 74 ++++--------------- libtiledbsoma/test/unit_soma_dataframe.cc | 54 ++------------ 18 files changed, 67 insertions(+), 150 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index b4c17ee6c4..09ee4bbc09 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -509,9 +509,9 @@ class SOMAArray : public SOMAObject { /** * @brief Get the Arrow schema of the array. 
* - * @return std::unique_ptr Schema + * @return std::shared_ptr Schema */ - std::unique_ptr arrow_schema() const { + std::shared_ptr arrow_schema() const { return ArrowAdapter::arrow_schema_from_tiledb_array( ctx_->tiledb_ctx(), arr_); } diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index dcec2953a3..4ace686229 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -107,7 +107,7 @@ std::shared_ptr SOMACollection::add_new_experiment( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns) { std::shared_ptr member = SOMAExperiment::create( uri, schema, index_columns, ctx); @@ -121,7 +121,7 @@ std::shared_ptr SOMACollection::add_new_measurement( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns) { std::shared_ptr member = SOMAMeasurement::create( uri, schema, index_columns, ctx); @@ -135,7 +135,7 @@ std::shared_ptr SOMACollection::add_new_dataframe( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns) { std::shared_ptr member = SOMADataFrame::create( uri, schema, index_columns, ctx); diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 5f9049637e..5699486266 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -155,7 +155,7 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns); /** @@ -171,7 +171,7 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns); /** @@ -187,7 +187,7 
@@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns); /** diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index b9a9b46c97..d917b8ea4e 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -41,7 +41,7 @@ using namespace tiledb; std::unique_ptr SOMADataFrame::create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx) { auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( @@ -75,7 +75,7 @@ bool SOMADataFrame::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMADataFrame::schema() const { +std::shared_ptr SOMADataFrame::schema() const { return this->arrow_schema(); } diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index fbb9fe055f..c8db15e667 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -59,7 +59,7 @@ class SOMADataFrame : public SOMAArray { */ static std::unique_ptr create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx); @@ -141,9 +141,9 @@ class SOMADataFrame : public SOMAArray { /** * Return the data schema, in the form of a ArrowSchema. * - * @return std::unique_ptr + * @return std::shared_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; /** * Return the index (dimension) column names. 
diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 6df2e84a51..057f52084e 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -71,7 +71,7 @@ bool SOMADenseNDArray::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMADenseNDArray::schema() const { +std::shared_ptr SOMADenseNDArray::schema() const { return this->arrow_schema(); } diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index 39f2b5d18b..cd12a23be9 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -150,7 +150,7 @@ class SOMADenseNDArray : public SOMAArray { * * @return std::unique_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index a508aeb398..8990f198cc 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -43,7 +43,7 @@ using namespace tiledb; std::unique_ptr SOMAExperiment::create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx) { std::string exp_uri(uri); diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index b8b776d2e2..9800794631 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -56,7 +56,7 @@ class SOMAExperiment : public SOMACollection { */ static std::unique_ptr create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx); diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 
f394c35530..4bbb17655c 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -43,7 +43,7 @@ using namespace tiledb; std::unique_ptr SOMAMeasurement::create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx) { std::string exp_uri(uri); diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index 7739b90946..a93ca6c6c2 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -57,7 +57,7 @@ class SOMAMeasurement : public SOMACollection { */ static std::unique_ptr create( std::string_view uri, - ArrowSchema& schema, + std::shared_ptr schema, ArrowTable index_columns, std::shared_ptr ctx); diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index 1259c528e8..e0ce770ee6 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -72,7 +72,7 @@ bool SOMASparseNDArray::exists(std::string_view uri) { //= public non-static //=================================================================== -std::unique_ptr SOMASparseNDArray::schema() const { +std::shared_ptr SOMASparseNDArray::schema() const { return this->arrow_schema(); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index f201ff7614..a18519e807 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -148,9 +148,9 @@ class SOMASparseNDArray : public SOMAArray { /** * Return the data schema, in the form of an ArrowSchema. 
* - * @return std::unique_ptr + * @return std::shared_ptr */ - std::unique_ptr schema() const; + std::shared_ptr schema() const; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index aa235bf5bd..b2447c182a 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -101,13 +101,13 @@ void ArrowAdapter::release_array(struct ArrowArray* array) { array->release = nullptr; } -std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( +std::shared_ptr ArrowAdapter::arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array) { auto tiledb_schema = tiledb_array->schema(); auto ndim = tiledb_schema.domain().ndim(); auto nattr = tiledb_schema.attribute_num(); - std::unique_ptr arrow_schema = std::make_unique(); + std::shared_ptr arrow_schema = std::make_shared(); arrow_schema->format = "+s"; arrow_schema->n_children = ndim + nattr; arrow_schema->release = &ArrowAdapter::release_schema; @@ -243,13 +243,13 @@ std::pair ArrowAdapter::_get_data_and_length( ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, - ArrowSchema& arrow_schema, + std::shared_ptr arrow_schema, ArrowTable index_columns) { ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); - for (int64_t i = 0; i < arrow_schema.n_children; ++i) { - ArrowSchema* child = arrow_schema.children[i]; + for (int64_t i = 0; i < arrow_schema->n_children; ++i) { + ArrowSchema* child = arrow_schema->children[i]; auto type = ArrowAdapter::to_tiledb_format(child->format); auto dim_info = ArrowAdapter::_get_dim_info(child->name, index_columns); @@ -274,9 +274,9 @@ std::optional> ArrowAdapter::_get_dim_info( auto index_columns_array = index_columns.first; auto index_columns_schema = index_columns.second; - for (int64_t i = 0; i < index_columns_array.n_children; ++i) { - if (dim_name == index_columns_schema.children[i]->name) { - auto dim_info = 
index_columns_array.children[i]->children; + for (int64_t i = 0; i < index_columns_array->n_children; ++i) { + if (dim_name == index_columns_schema->children[i]->name) { + auto dim_info = index_columns_array->children[i]->children; auto domain = dim_info[0]->buffers[1]; auto extent = dim_info[1]->buffers[1]; return std::make_pair(domain, extent); @@ -285,10 +285,9 @@ std::optional> ArrowAdapter::_get_dim_info( return std::nullopt; } -std::pair, std::unique_ptr> -ArrowAdapter::to_arrow(std::shared_ptr column) { - std::unique_ptr schema = std::make_unique(); - std::unique_ptr array = std::make_unique(); +ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { + std::shared_ptr schema = std::make_shared(); + std::shared_ptr array = std::make_shared(); schema->format = to_arrow_format(column->type()).data(); schema->name = column->name().data(); @@ -415,7 +414,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { array->dictionary = dict_arr; } - return std::pair(std::move(array), std::move(schema)); + return std::pair(array, schema); } std::string_view ArrowAdapter::to_arrow_format( @@ -446,7 +445,7 @@ std::string_view ArrowAdapter::to_arrow_format( } } -tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype) { +tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { std::map _to_tiledb_format_map = { {"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8}, {"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR}, diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 01684149ca..60593da1b8 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -32,7 +32,8 @@ struct ArrowBuffer { std::shared_ptr buffer_; }; -using ArrowTable = std::pair; +using ArrowTable = + std::pair, std::shared_ptr>; class ArrowAdapter { public: @@ -42,18 +43,17 @@ class ArrowAdapter { /** * @brief Convert ColumnBuffer to an Arrow array. 
* - * @return std::pair, - * std::unique_ptr> + * @return std::pair, + * std::shared_ptr> */ - static std::pair, std::unique_ptr> - to_arrow(std::shared_ptr column); + static ArrowTable to_arrow(std::shared_ptr column); /** * @brief Create an ArrowSchema from TileDB Array * - * @return std::unique_ptr + * @return std::shared_ptr */ - static std::unique_ptr arrow_schema_from_tiledb_array( + static std::shared_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); /** @@ -63,7 +63,7 @@ class ArrowAdapter { */ static ArraySchema tiledb_schema_from_arrow_schema( std::shared_ptr ctx, - ArrowSchema& arrow_schema, + std::shared_ptr arrow_schema, ArrowTable index_columns); /** diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index b58a8952bc..4efbc890b1 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -27,6 +27,8 @@ find_package(Catch_EP REQUIRED) add_executable(unit_soma $ + common.cc + common.h unit_column_buffer.cc unit_managed_query.cc unit_soma_array.cc diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index e873c86c9c..3132a5bd33 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,52 +30,7 @@ * This file manages unit tests for the SOMACollection class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema( - Context& ctx, bool sparse = false, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, sparse ? TILEDB_SPARSE : TILEDB_DENSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMACollection: basic") { auto ctx = std::make_shared(); @@ -94,7 +49,8 @@ TEST_CASE("SOMACollection: add SOMASparseNDArray") { std::string sub_uri = "mem://unit-test-add-sparse-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [arrow_schema, index_columns] = helper::create_arrow_schema(); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), true); std::map expected_map{ {"sparse_ndarray", sub_uri}}; @@ -123,7 +79,7 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") { std::string sub_uri = "mem://unit-test-add-dense-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"dense_ndarray", 
sub_uri}}; @@ -150,13 +106,13 @@ TEST_CASE("SOMACollection: add SOMADataFrame") { std::string sub_uri = "mem://unit-test-add-dataframe/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"dataframe", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_dataframe = soma_collection->add_new_dataframe( - "dataframe", sub_uri, URIType::absolute, ctx, schema); + "dataframe", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_dataframe->uri() == sub_uri); REQUIRE(soma_dataframe->ctx() == ctx); @@ -178,7 +134,7 @@ TEST_CASE("SOMACollection: add SOMACollection") { std::string sub_uri = "mem://unit-test-add-collection/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"subcollection", sub_uri}}; @@ -202,13 +158,13 @@ TEST_CASE("SOMACollection: add SOMAExperiment") { std::string sub_uri = "mem://unit-test-add-experiment/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"experiment", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_experiment = soma_collection->add_new_experiment( - "experiment", sub_uri, URIType::absolute, ctx, schema); + "experiment", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_experiment->uri() == sub_uri); REQUIRE(soma_experiment->ctx() == ctx); @@ -227,13 +183,13 @@ TEST_CASE("SOMACollection: add SOMAMeasurement") { std::string sub_uri = "mem://unit-test-add-measurement/sub"; 
SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"measurement", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_measurement = soma_collection->add_new_measurement( - "measurement", sub_uri, URIType::absolute, ctx, schema); + "measurement", sub_uri, URIType::absolute, ctx, schema, index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_measurement->uri() == sub_uri); REQUIRE(soma_measurement->ctx() == ctx); @@ -287,7 +243,8 @@ TEST_CASE("SOMAExperiment: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-experiment"; - SOMAExperiment::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMAExperiment::create(uri, schema, index_columns, ctx); auto soma_experiment = SOMAExperiment::open( uri, OpenMode::write, ctx, std::pair(1, 1)); int32_t val = 100; @@ -324,7 +281,8 @@ TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-measurement"; - SOMAMeasurement::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMAMeasurement::create(uri, schema, index_columns, ctx); auto soma_measurement = SOMAMeasurement::open( uri, OpenMode::write, ctx, std::pair(1, 1)); int32_t val = 100; diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 9822caeeb8..b2310ea8c1 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,57 +30,14 @@ * This file manages unit tests for the SOMADataFrame class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema(Context& ctx, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, TILEDB_SPARSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMADataFrame: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-basic"; - SOMADataFrame::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMADataFrame::create(uri, schema, index_columns, ctx); auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); REQUIRE(soma_dataframe->uri() == uri); @@ -127,7 +84,8 @@ TEST_CASE("SOMADataFrame: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; - SOMADataFrame::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMADataFrame::create(uri, schema, index_columns, ctx); auto soma_dataframe = SOMADataFrame::open( uri, OpenMode::write, From f581fada2734e13ac3ac09dde2e148dcce685bd2 Mon Sep 17 00:00:00 
2001 From: Vivian Nguyen Date: Sun, 25 Feb 2024 13:51:56 -0600 Subject: [PATCH 16/70] WIP attach --- apis/python/src/tiledbsoma/_dataframe.py | 168 +++++++++--------- apis/python/src/tiledbsoma/_tdb_handles.py | 3 + apis/python/src/tiledbsoma/soma_array.cc | 37 +++- libtiledbsoma/src/soma/column_buffer.cc | 52 +++++- libtiledbsoma/src/soma/column_buffer.h | 39 ++-- libtiledbsoma/src/soma/managed_query.h | 7 +- libtiledbsoma/src/soma/soma_array.cc | 1 + libtiledbsoma/src/utils/arrow_adapter.cc | 2 +- libtiledbsoma/test/unit_soma_array.cc | 8 +- libtiledbsoma/test/unit_soma_dataframe.cc | 8 +- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 8 +- .../test/unit_soma_sparse_ndarray.cc | 8 +- 12 files changed, 219 insertions(+), 122 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 9a25dbc042..3ab0df7111 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -397,82 +397,82 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - dim_cols_map: Dict[str, pd.DataFrame] = {} - attr_cols_map: Dict[str, pd.DataFrame] = {} - dim_names_set = self.index_column_names - n = None - - for col_info in values.schema: - name = col_info.name - col = values.column(name).combine_chunks() - n = len(col) - - if self._handle.schema.has_attr(name): - attr = self._handle.schema.attr(name) - - # Add the enumeration values to the TileDB Array from ArrowArray - if attr.enum_label is not None: - if not pa.types.is_dictionary(col_info.type): - raise ValueError( - "Expected dictionary type for enumerated attribute " - f"{name} but saw {col.type}" - ) - - enmr = self._handle.enum(attr.name) - - # get new enumeration values by taking the set difference - # while maintaining ordering - update_vals = np.setdiff1d( - col.dictionary, enmr.values(), assume_unique=True - ) - - index_capacity_current = len(enmr.values()) + len(update_vals) - index_capacity_max = np.iinfo( - 
col_info.type.index_type.to_pandas_dtype() - ).max - if index_capacity_max < index_capacity_current: - raise ValueError( - f"Too many enumeration values ({index_capacity_current}) " - "for index type {col_info.type.index_type}" - ) - - # only extend if there are new values - if len(update_vals) != 0: - se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx) - if np.issubdtype(enmr.dtype.type, np.str_): - extend_vals = np.array(update_vals, "U") - elif np.issubdtype(enmr.dtype.type, np.bytes_): - extend_vals = np.array(update_vals, "S") - else: - extend_vals = np.array(update_vals, enmr.dtype) - new_enmr = enmr.extend(extend_vals) - df = pd.Categorical(col.to_pandas(), new_enmr.values()) - col = pa.DictionaryArray.from_pandas(df) - se.extend_enumeration(new_enmr) - se.array_evolve(uri=self.uri) - - cols_map = dim_cols_map if name in dim_names_set else attr_cols_map - schema = self._handle.schema - if pa.types.is_dictionary(col.type): - if ( - name not in dim_names_set - and schema.attr(name).enum_label is not None - ): - cols_map[name] = col.indices.to_pandas() - else: - cols_map[name] = col - - else: - if name not in dim_names_set: - if schema.attr(name).enum_label is not None: - raise ValueError( - f"Categorical column {name} must be presented with categorical data" - ) - - cols_map[name] = col.to_pandas() - - if n is None: - raise ValueError(f"did not find any column names in {values.schema.names}") + # dim_cols_map: Dict[str, pd.DataFrame] = {} + # attr_cols_map: Dict[str, pd.DataFrame] = {} + # dim_names_set = self.index_column_names + # n = None + + # for col_info in values.schema: + # name = col_info.name + # col = values.column(name).combine_chunks() + # n = len(col) + + # if self._handle.schema.has_attr(name): + # attr = self._handle.schema.attr(name) + + # # Add the enumeration values to the TileDB Array from ArrowArray + # if attr.enum_label is not None: + # if not pa.types.is_dictionary(col_info.type): + # raise ValueError( + # "Expected dictionary 
type for enumerated attribute " + # f"{name} but saw {col.type}" + # ) + + # enmr = self._handle.enum(attr.name) + + # # get new enumeration values by taking the set difference + # # while maintaining ordering + # update_vals = np.setdiff1d( + # col.dictionary, enmr.values(), assume_unique=True + # ) + + # index_capacity_current = len(enmr.values()) + len(update_vals) + # index_capacity_max = np.iinfo( + # col_info.type.index_type.to_pandas_dtype() + # ).max + # if index_capacity_max < index_capacity_current: + # raise ValueError( + # f"Too many enumeration values ({index_capacity_current}) " + # "for index type {col_info.type.index_type}" + # ) + + # # only extend if there are new values + # if len(update_vals) != 0: + # se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx) + # if np.issubdtype(enmr.dtype.type, np.str_): + # extend_vals = np.array(update_vals, "U") + # elif np.issubdtype(enmr.dtype.type, np.bytes_): + # extend_vals = np.array(update_vals, "S") + # else: + # extend_vals = np.array(update_vals, enmr.dtype) + # new_enmr = enmr.extend(extend_vals) + # df = pd.Categorical(col.to_pandas(), new_enmr.values()) + # col = pa.DictionaryArray.from_pandas(df) + # se.extend_enumeration(new_enmr) + # se.array_evolve(uri=self.uri) + + # cols_map = dim_cols_map if name in dim_names_set else attr_cols_map + # schema = self._handle.schema + # if pa.types.is_dictionary(col.type): + # if ( + # name not in dim_names_set + # and schema.attr(name).enum_label is not None + # ): + # cols_map[name] = col.indices.to_pandas() + # else: + # cols_map[name] = col + + # else: + # if name not in dim_names_set: + # if schema.attr(name).enum_label is not None: + # raise ValueError( + # f"Categorical column {name} must be presented with categorical data" + # ) + + # cols_map[name] = col.to_pandas() + + # if n is None: + # raise ValueError(f"did not find any column names in {values.schema.names}") # We need to produce the dim cols in the same order as they're present in the TileDB 
schema # (tracked by self.index_column_names). This is important in the multi-index case. Suppose @@ -480,9 +480,15 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] - dim_cols_tuple = tuple(dim_cols_list) - self._handle.writer[dim_cols_tuple] = attr_cols_map + print("SCHEMAAAAAAAAAAAAAAA") + print(values.schema) + print("BATCHESSSSSSSSSSSSSS ") + for batch in values.to_batches(): + self._handle.write(batch) + print("DONEEEEEEEEE") + # dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] + # dim_cols_tuple = tuple(dim_cols_list) + # self._handle.writer[dim_cols_tuple] = attr_cols_map tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config ) @@ -492,7 +498,7 @@ def write( return self def _set_reader_coord( - self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Table, coord: object + self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object ) -> bool: if coord is None: return True # No constraint; select all in this dimension @@ -566,7 +572,7 @@ def _set_reader_coord_by_py_seq_or_np_array( self, sr: clib.SOMAArray, dim_idx: int, - dim: pa.Table, + dim: tiledb.Dim, coord: object, ) -> bool: if isinstance(coord, np.ndarray): diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 64633e6b9c..00a2e69fac 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -428,6 +428,9 @@ class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): def count(self) -> int: return int(self._handle.count) + def write(self, values: pa.Table) -> None: + self._handle.write(values) + class 
DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): """Wrapper around a Pybind11 DenseNDArrayWrapper handle.""" diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 2e5d8913b7..896a2537f1 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -121,6 +121,14 @@ void load_soma_array(py::module& m) { "platform_config"_a = py::dict(), "timestamp"_a = py::none()) + .def("__enter__", [](SOMAArray& reader) { return reader; }) + .def( + "__exit__", + [](SOMAArray& reader, + void* exc_type, + void* exc_value, + void* traceback) { reader.close(); }) + .def( "set_condition", [](SOMAArray& reader, @@ -515,15 +523,34 @@ void load_soma_array(py::module& m) { .def( "write", - [](SOMAArray& array, py::handle c_array) { + [](SOMAArray& array, py::handle py_batch) { ArrowSchema arrow_schema; ArrowArray arrow_array; uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); - c_array.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); - - array.write(std::shared_ptr( - reinterpret_cast(arrow_array_ptr))); + py_batch.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto buffers = std::make_shared(); + for (auto i = 0; i < arrow_schema.n_children; ++i) { + std::cout << arrow_schema.children[i]->name << std::endl; + auto arr = std::make_shared( + *arrow_array.children[i]); + auto sch = std::make_shared( + *arrow_schema.children[i]); + + auto name = arrow_schema.children[i]->name; + auto dim_names = array.dimension_names(); + bool is_dim = std::find( + dim_names.begin(), + dim_names.end(), + name) != dim_names.end(); + auto column = ColumnBuffer::create( + ArrowTable(arr, sch), array.ctx(), is_dim); + + buffers->emplace(name, column); + } + array.write(buffers); }) .def("nnz", &SOMAArray::nnz, py::call_guard()) diff --git a/libtiledbsoma/src/soma/column_buffer.cc b/libtiledbsoma/src/soma/column_buffer.cc 
index 00e5e49af3..a6bde15bf7 100644 --- a/libtiledbsoma/src/soma/column_buffer.cc +++ b/libtiledbsoma/src/soma/column_buffer.cc @@ -69,7 +69,7 @@ std::shared_ptr ColumnBuffer::create( } return ColumnBuffer::alloc( - schema, + schema.context().config(), name_str, type, is_var, @@ -91,12 +91,49 @@ std::shared_ptr ColumnBuffer::create( } return ColumnBuffer::alloc( - schema, name_str, type, is_var, false, std::nullopt, false); + schema.context().config(), + name_str, + type, + is_var, + false, + std::nullopt, + false); } throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str); } +std::shared_ptr ColumnBuffer::create( + ArrowTable arrow_table, + std::shared_ptr context, + bool is_column_index) { + auto [arrow_array, arrow_schema] = arrow_table; + + auto cfg = context->tiledb_ctx()->config(); + auto name = arrow_schema->name; + auto type = ArrowAdapter::to_tiledb_format(arrow_schema->format); + + if (is_column_index) { + bool is_var = type == TILEDB_STRING_ASCII || type == TILEDB_STRING_UTF8; + return ColumnBuffer::alloc( + cfg, + name, + type, + is_var, + false, // is_nullable + std::nullopt, // enumeration + false // is_ordered + ); + } else { + bool is_var = type == TILEDB_STRING_ASCII || type == TILEDB_STRING_UTF8; + bool is_nullable = false; + std::optional enumeration = std::nullopt; + bool is_ordered = false; + return ColumnBuffer::alloc( + cfg, name, type, is_var, is_nullable, enumeration, is_ordered); + } +} + void ColumnBuffer::to_bitmap(tcb::span bytemap) { int i_dst = 0; for (unsigned int i_src = 0; i_src < bytemap.size(); i_src++) { @@ -211,7 +248,7 @@ std::string_view ColumnBuffer::string_view(uint64_t index) { //=================================================================== std::shared_ptr ColumnBuffer::alloc( - ArraySchema schema, + Config config, std::string_view name, tiledb_datatype_t type, bool is_var, @@ -221,7 +258,6 @@ std::shared_ptr ColumnBuffer::alloc( // Set number of bytes for the data buffer. 
Override with a value from // the config if present. auto num_bytes = DEFAULT_ALLOC_BYTES; - auto config = schema.context().config(); if (config.contains(CONFIG_KEY_INIT_BYTES)) { auto value_str = config.get(CONFIG_KEY_INIT_BYTES); try { @@ -235,10 +271,10 @@ std::shared_ptr ColumnBuffer::alloc( } } - bool is_dense = schema.array_type() == TILEDB_DENSE; - if (is_dense) { - // TODO: Handle dense arrays similar to tiledb python module - } + // bool is_dense = schema.array_type() == TILEDB_DENSE; + // if (is_dense) { + // // TODO: Handle dense arrays similar to tiledb python module + // } // For variable length column types, allocate an extra num_bytes to hold // offset values. The number of cells is the set by the size of the diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 4a727f5c37..7882c889fd 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -38,7 +38,9 @@ #include #include +#include "../utils/arrow_adapter.h" #include "../utils/common.h" +#include "soma_context.h" #include "span/span.hpp" namespace tiledbsoma { @@ -70,26 +72,17 @@ class ColumnBuffer { std::shared_ptr array, std::string_view name); /** - * @brief Create a ColumnBuffer from a schema, column name, and data. + * @brief Create a ColumnBuffer from an ArrowSchema and ArrowArray that + * represents a single column. 
* * @param array TileDB array * @param name TileDB dimension or attribute name - * @param data Data to set in buffer * @return ColumnBuffer */ - template static std::shared_ptr create( - std::shared_ptr array, - std::string_view name, - std::vector data) { - auto column_buff = ColumnBuffer::create(array, name); - column_buff->num_cells_ = data.size(); - column_buff->data_.resize(data.size()); - column_buff->data_.assign( - reinterpret_cast(data.data()), - reinterpret_cast(data.data() + data.size())); - return column_buff; - } + ArrowTable arrow_table, + std::shared_ptr context, + bool is_column_index); /** * @brief Convert a bytemap to a bitmap in place. @@ -136,6 +129,20 @@ class ColumnBuffer { */ void attach(Query& query); + /** + * @brief Set the ColumnBuffer's data. + * + * @param std::vector data to write + */ + template + void set_data(std::vector data){ + this->num_cells_ = data.size(); + this->data_.resize(data.size()); + this->data_.assign( + reinterpret_cast(data.data()), + reinterpret_cast(data.data() + data.size())); + } + /** * @brief Size num_cells_ to match the read query results. * @@ -342,7 +349,7 @@ class ColumnBuffer { /** * @brief Allocate and return a ColumnBuffer. 
* - * @param array TileDB array + * @param config TileDB Config * @param name Column name * @param type TileDB datatype * @param is_var True if variable length data @@ -352,7 +359,7 @@ class ColumnBuffer { * @return ColumnBuffer */ static std::shared_ptr alloc( - ArraySchema schema, + Config config, std::string_view name, tiledb_datatype_t type, bool is_var, diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index b9a9238ba1..56fda17b93 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -216,9 +216,10 @@ class ManagedQuery { std::string column_name, std::shared_ptr column_buffer) { if (array_->schema().array_type() == TILEDB_SPARSE || schema_->has_attribute(column_name)) { - auto data = column_buffer->data(); - query_->set_data_buffer( - column_name, (void*)data.data(), data.size_bytes()); + // auto data = column_buffer->data(); + // query_->set_data_buffer( + // column_name, (void*)data.data(), data.size_bytes()); + column_buffer->attach(*query_); } else { switch (column_buffer->type()) { case TILEDB_STRING_ASCII: diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 081b8fd4ad..49092197b0 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -259,6 +259,7 @@ void SOMAArray::write(std::shared_ptr buffers) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } + // TODO create a ManagedQuery::setup_write? 
for (auto col_name : buffers->names()) { mq_->set_column_data(col_name, buffers->at(col_name)); } diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index b2447c182a..59eddab109 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -414,7 +414,7 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { array->dictionary = dict_arr; } - return std::pair(array, schema); + return ArrowTable(array, schema); } std::string_view ArrowAdapter::to_arrow_format( diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index aae178f0e5..cf546d987b 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -141,8 +141,12 @@ std::tuple, std::vector> write_array( auto array_buffer = std::make_shared(); auto tdb_arr = std::make_shared( *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); + auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); + col_a0->set_data(a0); + col_d0->set_data(d0); + array_buffer->emplace("a0", col_a0); + array_buffer->emplace("d0", col_d0); // Write data to array soma_array->write(array_buffer); diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index b2310ea8c1..3256d83f3d 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -57,8 +57,12 @@ TEST_CASE("SOMADataFrame: basic") { auto array_buffer = std::make_shared(); auto tdb_arr = std::make_shared( *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); + auto col_d0 = 
ColumnBuffer::create(tdb_arr, "d0"); + col_a0->set_data(a0); + col_d0->set_data(d0); + array_buffer->emplace("a0", col_a0); + array_buffer->emplace("d0", col_d0); soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); soma_dataframe->write(array_buffer); diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index decb0c5400..d425046d49 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -100,8 +100,12 @@ TEST_CASE("SOMADenseNDArray: basic") { auto array_buffer = std::make_shared(); auto tdb_arr = std::make_shared( *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); + auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); + col_a0->set_data(a0); + col_d0->set_data(d0); + array_buffer->emplace("a0", col_a0); + array_buffer->emplace("d0", col_d0); soma_dense->open(OpenMode::write); soma_dense->write(array_buffer); diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 246db3692c..bc5b999f3f 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -102,8 +102,12 @@ TEST_CASE("SOMASparseNDArray: basic") { auto array_buffer = std::make_shared(); auto tdb_arr = std::make_shared( *ctx->tiledb_ctx(), uri, TILEDB_READ); - array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); - array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); + auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); + col_a0->set_data(a0); + col_d0->set_data(d0); + array_buffer->emplace("a0", col_a0); + array_buffer->emplace("d0", col_d0); soma_sparse->open(OpenMode::write); soma_sparse->write(array_buffer); 
From 84b80062516e1426c9b9c051c24173f3deab7286 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sun, 25 Feb 2024 15:04:31 -0600 Subject: [PATCH 17/70] WIP replace with setup_write --- apis/python/src/tiledbsoma/soma_array.cc | 20 ++--- libtiledbsoma/src/soma/column_buffer.cc | 31 ------- libtiledbsoma/src/soma/column_buffer.h | 22 ++--- libtiledbsoma/src/soma/managed_query.cc | 108 +++++++++++++++++++++++ libtiledbsoma/src/soma/managed_query.h | 94 +------------------- libtiledbsoma/src/soma/soma_array.cc | 5 +- libtiledbsoma/src/soma/soma_array.h | 13 --- libtiledbsoma/test/unit_soma_array.cc | 4 +- 8 files changed, 126 insertions(+), 171 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 896a2537f1..338d29ea4a 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -533,20 +533,12 @@ void load_soma_array(py::module& m) { auto buffers = std::make_shared(); for (auto i = 0; i < arrow_schema.n_children; ++i) { - std::cout << arrow_schema.children[i]->name << std::endl; - auto arr = std::make_shared( - *arrow_array.children[i]); - auto sch = std::make_shared( - *arrow_schema.children[i]); - - auto name = arrow_schema.children[i]->name; - auto dim_names = array.dimension_names(); - bool is_dim = std::find( - dim_names.begin(), - dim_names.end(), - name) != dim_names.end(); - auto column = ColumnBuffer::create( - ArrowTable(arr, sch), array.ctx(), is_dim); + auto child = arrow_array.children[i]; + + std::cout << child->name << std::endl; + + auto column = ColumnBuffer::create(array.arr_, child->name); + column->write(child->buffers[1], child->length); buffers->emplace(name, column); } diff --git a/libtiledbsoma/src/soma/column_buffer.cc b/libtiledbsoma/src/soma/column_buffer.cc index a6bde15bf7..572378517b 100644 --- a/libtiledbsoma/src/soma/column_buffer.cc +++ b/libtiledbsoma/src/soma/column_buffer.cc @@ -103,37 +103,6 @@ std::shared_ptr 
ColumnBuffer::create( throw TileDBSOMAError("[ColumnBuffer] Column name not found: " + name_str); } -std::shared_ptr ColumnBuffer::create( - ArrowTable arrow_table, - std::shared_ptr context, - bool is_column_index) { - auto [arrow_array, arrow_schema] = arrow_table; - - auto cfg = context->tiledb_ctx()->config(); - auto name = arrow_schema->name; - auto type = ArrowAdapter::to_tiledb_format(arrow_schema->format); - - if (is_column_index) { - bool is_var = type == TILEDB_STRING_ASCII || type == TILEDB_STRING_UTF8; - return ColumnBuffer::alloc( - cfg, - name, - type, - is_var, - false, // is_nullable - std::nullopt, // enumeration - false // is_ordered - ); - } else { - bool is_var = type == TILEDB_STRING_ASCII || type == TILEDB_STRING_UTF8; - bool is_nullable = false; - std::optional enumeration = std::nullopt; - bool is_ordered = false; - return ColumnBuffer::alloc( - cfg, name, type, is_var, is_nullable, enumeration, is_ordered); - } -} - void ColumnBuffer::to_bitmap(tcb::span bytemap) { int i_dst = 0; for (unsigned int i_src = 0; i_src < bytemap.size(); i_src++) { diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 7882c889fd..cd595b9289 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -71,19 +71,6 @@ class ColumnBuffer { static std::shared_ptr create( std::shared_ptr array, std::string_view name); - /** - * @brief Create a ColumnBuffer from an ArrowSchema and ArrowArray that - * represents a single column. - * - * @param array TileDB array - * @param name TileDB dimension or attribute name - * @return ColumnBuffer - */ - static std::shared_ptr create( - ArrowTable arrow_table, - std::shared_ptr context, - bool is_column_index); - /** * @brief Convert a bytemap to a bitmap in place. 
* @@ -135,7 +122,7 @@ class ColumnBuffer { * @param std::vector data to write */ template - void set_data(std::vector data){ + void set_data(std::vector data) { this->num_cells_ = data.size(); this->data_.resize(data.size()); this->data_.assign( @@ -143,6 +130,13 @@ class ColumnBuffer { reinterpret_cast(data.data() + data.size())); } + void set_data(const void* data, uint64_t num_elems) { + this->num_cells_ = num_elems; + this->data_.resize(num_elems); + this->data_.assign( + (std::byte*)data, (std::byte*)data + num_elems * type_size_); + } + /** * @brief Size num_cells_ to match the read query results. * diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index ca840871ac..5c0d84d6fd 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -96,6 +96,114 @@ void ManagedQuery::select_columns( } } +void ManagedQuery::setup_write(std::shared_ptr buffers) { + for (auto column_name : buffers->names()) { + auto column_buffer = buffers->at(column_name); + if (array_->schema().array_type() == TILEDB_SPARSE || + schema_->has_attribute(column_name)) { + auto data = column_buffer->data(); + query_->set_data_buffer( + column_name, (void*)data.data(), data.size_bytes()); + if (column_buffer->is_var()) { + // Remove one offset for TileDB, which checks that the + // offsets and validity buffers are the same size + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), column_buffer->size() - 1); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), column_buffer->size()); + } + // column_buffer->attach(*query_); + } else { + switch (column_buffer->type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_BLOB: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; 
+ case TILEDB_FLOAT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT64: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + default: + break; + } + query_->set_subarray(*subarray_); + } + } +} + void ManagedQuery::setup_read() { // If the query is complete, return so we do not submit it again auto status = query_->query_status(); diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index 56fda17b93..d0fe89b705 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -212,99 +212,7 @@ class ManagedQuery { * @param buff Buffer array pointer with elements of the 
column type. * @param nelements Number of array elements in buffer */ - void set_column_data( - std::string column_name, std::shared_ptr column_buffer) { - if (array_->schema().array_type() == TILEDB_SPARSE || - schema_->has_attribute(column_name)) { - // auto data = column_buffer->data(); - // query_->set_data_buffer( - // column_name, (void*)data.data(), data.size_bytes()); - column_buffer->attach(*query_); - } else { - switch (column_buffer->type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT64: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - 
case TILEDB_DATETIME_NS: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - default: - break; - } - query_->set_subarray(*subarray_); - } - } + void setup_write(std::shared_ptr buffers); /** * @brief Configure query and allocate result buffers for reads. diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 49092197b0..68439db0ef 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -259,10 +259,7 @@ void SOMAArray::write(std::shared_ptr buffers) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } - // TODO create a ManagedQuery::setup_write? - for (auto col_name : buffers->names()) { - mq_->set_column_data(col_name, buffers->at(col_name)); - } + mq_->setup_write(buffers); mq_->submit_write(); } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 09ee4bbc09..47a58c5580 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -404,19 +404,6 @@ class SOMAArray : public SOMAObject { */ std::optional> read_next(); - /** - * @brief Set the write data for a column. - * - * @param column_name Column name - * @param buff Buffer array pointer with elements of the column type. - * @param nelements Number of array elements in buffer - */ - void set_column_data( - std::string_view column_name, - std::shared_ptr column_buffer) { - mq_->set_column_data(std::string(column_name), column_buffer); - } - /** * @brief Write ArrayBuffers data to the array. 
* diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index cf546d987b..9e69f3cf32 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -143,8 +143,8 @@ std::tuple, std::vector> write_array( *ctx->tiledb_ctx(), uri, TILEDB_READ); auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); - col_a0->set_data(a0); - col_d0->set_data(d0); + col_a0->set_data(a0.data(), a0.size()); + col_d0->set_data(d0.data(), d0.size()); array_buffer->emplace("a0", col_a0); array_buffer->emplace("d0", col_d0); From 82593a0b2bbcf94bca651c716fb64d27e8a44d0f Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sun, 25 Feb 2024 19:10:25 -0600 Subject: [PATCH 18/70] WIP create domain / extents --- apis/python/src/tiledbsoma/_dataframe.py | 50 ++++++++++++++++---- apis/python/src/tiledbsoma/_tiledb_array.py | 1 + apis/python/src/tiledbsoma/soma_array.cc | 15 ++---- apis/python/src/tiledbsoma/soma_dataframe.cc | 28 +++++++++++ 4 files changed, 76 insertions(+), 18 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 3ab0df7111..92d11aa93a 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -6,10 +6,9 @@ """ Implementation of a SOMA DataFrame """ -from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast +from typing import Any, Optional, Sequence, Tuple, Type, Union, cast import numpy as np -import pandas as pd import pyarrow as pa import somacore import tiledb @@ -210,14 +209,49 @@ def create( """ context = _validate_soma_tiledb_context(context) schema = _canonicalize_schema(schema, index_column_names) - tdb_schema = _build_tiledb_schema( + if domain is None: + domain = tuple(None for _ in index_column_names) + else: + ndom = len(domain) + nidx = len(index_column_names) + if ndom != nidx: + raise ValueError( + f"if domain is specified, it 
must have the same length as index_column_names; got {ndom} != {nidx}" + ) + + index_columns_info = [] + for index_column_name, slot_domain in zip(index_column_names, domain): + pa_type = schema.field(index_column_name).type + dtype = _arrow_types.tiledb_type_from_arrow_type( + pa_type, is_indexed_column=True + ) + + slot_domain = _fill_out_slot_domain( + slot_domain, index_column_name, pa_type, dtype + ) + + extent = _find_extent_for_domain( + index_column_name, + TileDBCreateOptions.from_platform_config(platform_config), + dtype, + slot_domain, + ) + + index_columns_info.append( + pa.chunked_array( + [pa.array(slot_domain, pa_type), pa.array([extent], pa_type)] + ) + ) + + handle = clib.SOMADataFrame.create( + uri, schema, - index_column_names, - domain, - TileDBCreateOptions.from_platform_config(platform_config), - context, + pa.Table.from_arrays(index_columns_info, ["column_info"]), + context.native_context, ) - handle = cls._create_internal(uri, tdb_schema, context, tiledb_timestamp) + + handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) + DataFrame._set_create_metadata(handle) return cls( handle, _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code", diff --git a/apis/python/src/tiledbsoma/_tiledb_array.py b/apis/python/src/tiledbsoma/_tiledb_array.py index f0c0d8cadf..779d92911f 100644 --- a/apis/python/src/tiledbsoma/_tiledb_array.py +++ b/apis/python/src/tiledbsoma/_tiledb_array.py @@ -183,6 +183,7 @@ def _create_internal( the newly-created array, open for writing. 
""" tiledb.Array.create(uri, schema, ctx=context.tiledb_ctx) + handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) cls._set_create_metadata(handle) return handle diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 338d29ea4a..5afaa1cfb9 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -531,18 +531,13 @@ void load_soma_array(py::module& m) { py_batch.attr("_export_to_c")( arrow_array_ptr, arrow_schema_ptr); - auto buffers = std::make_shared(); for (auto i = 0; i < arrow_schema.n_children; ++i) { - auto child = arrow_array.children[i]; - - std::cout << child->name << std::endl; - - auto column = ColumnBuffer::create(array.arr_, child->name); - column->write(child->buffers[1], child->length); - - buffers->emplace(name, column); + array.set_column_data( + arrow_schema.children[i]->name, + arrow_array.children[i]->buffers[1], + arrow_array.children[i]->length); } - array.write(buffers); + array.write(); }) .def("nnz", &SOMAArray::nnz, py::call_guard()) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 507c48dd43..ce809e3026 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -49,6 +49,34 @@ using namespace tiledbsoma; void load_soma_dataframe(py::module& m) { py::class_(m, "SOMADataFrame") + .def_static( + "create", + [](std::string_view uri, + py::object py_schema, + py::object py_index_columns, + std::shared_ptr context) { + ArrowSchema schema; + uintptr_t schema_ptr = (uintptr_t)(&schema); + py_schema.attr("_export_to_c")(schema_ptr); + + ArrowSchema index_columns_schema; + ArrowArray index_columns_array; + uintptr_t index_columns_schema_ptr = + (uintptr_t)(&index_columns_schema); + uintptr_t + index_columns_array_ptr = (uintptr_t)(&index_columns_array); + py_index_columns.attr("_export_to_c")( + index_columns_array_ptr, 
index_columns_schema_ptr); + + // return SOMADataFrame::create( + // uri, + // std::make_shared(schema), + // ArrowTable( + // std::make_shared(index_columns_schema), + // std::make_shared(index_columns_array)), + // context); + }) + .def_static( "open", py::overload_cast< From 13ab9a4fdce80611c1a097bb4318c631b74ab529 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sun, 25 Feb 2024 19:46:43 -0600 Subject: [PATCH 19/70] Use ColumnIndexInfo instead --- libtiledbsoma/src/soma/column_buffer.h | 12 +- libtiledbsoma/src/soma/managed_query.cc | 207 +++++++++--------- libtiledbsoma/src/soma/managed_query.h | 2 +- libtiledbsoma/src/soma/soma_array.cc | 28 ++- libtiledbsoma/src/soma/soma_array.h | 8 +- libtiledbsoma/src/soma/soma_collection.cc | 6 +- libtiledbsoma/src/soma/soma_collection.h | 6 +- libtiledbsoma/src/soma/soma_dataframe.cc | 2 +- libtiledbsoma/src/soma/soma_dataframe.h | 2 +- libtiledbsoma/src/soma/soma_experiment.cc | 2 +- libtiledbsoma/src/soma/soma_experiment.h | 2 +- libtiledbsoma/src/soma/soma_measurement.cc | 2 +- libtiledbsoma/src/soma/soma_measurement.h | 2 +- libtiledbsoma/src/utils/arrow_adapter.cc | 52 ++--- libtiledbsoma/src/utils/arrow_adapter.h | 8 +- libtiledbsoma/test/unit_soma_array.cc | 14 +- libtiledbsoma/test/unit_soma_dataframe.cc | 14 +- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 14 +- .../test/unit_soma_sparse_ndarray.cc | 14 +- 19 files changed, 194 insertions(+), 203 deletions(-) diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index cd595b9289..a11138edde 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -119,17 +119,9 @@ class ColumnBuffer { /** * @brief Set the ColumnBuffer's data. 
* - * @param std::vector data to write + * @param data pointer to the beginning of the data to write + * @param num_elems the number of elements in the column */ - template - void set_data(std::vector data) { - this->num_cells_ = data.size(); - this->data_.resize(data.size()); - this->data_.assign( - reinterpret_cast(data.data()), - reinterpret_cast(data.data() + data.size())); - } - void set_data(const void* data, uint64_t num_elems) { this->num_cells_ = num_elems; this->data_.resize(num_elems); diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 5c0d84d6fd..dd40bef0d9 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -96,111 +96,110 @@ void ManagedQuery::select_columns( } } -void ManagedQuery::setup_write(std::shared_ptr buffers) { - for (auto column_name : buffers->names()) { - auto column_buffer = buffers->at(column_name); - if (array_->schema().array_type() == TILEDB_SPARSE || - schema_->has_attribute(column_name)) { - auto data = column_buffer->data(); - query_->set_data_buffer( - column_name, (void*)data.data(), data.size_bytes()); - if (column_buffer->is_var()) { - // Remove one offset for TileDB, which checks that the - // offsets and validity buffers are the same size - auto offsets = column_buffer->offsets(); - query_->set_offsets_buffer( - column_name, offsets.data(), column_buffer->size() - 1); - } - if (column_buffer->is_nullable()) { - auto validity = column_buffer->validity(); - query_->set_validity_buffer( - column_name, validity.data(), column_buffer->size()); - } - // column_buffer->attach(*query_); - } else { - switch (column_buffer->type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - 
column_buffer->data()[1]); - break; - case TILEDB_FLOAT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT64: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - default: - break; - } - query_->set_subarray(*subarray_); +void ManagedQuery::set_column_data( + std::shared_ptr column_buffer) { + auto column_name = std::string(column_buffer->name()); + if (array_->schema().array_type() == TILEDB_SPARSE || + schema_->has_attribute(column_name)) { + auto data = column_buffer->data(); + query_->set_data_buffer( + column_name, (void*)data.data(), data.size_bytes()); + if (column_buffer->is_var()) { + // Remove one offset for TileDB, which checks that the + // offsets and validity buffers are the same size + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + 
column_name, offsets.data(), column_buffer->size() - 1); } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), column_buffer->size()); + } + // column_buffer->attach(*query_); + } else { + switch (column_buffer->type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_BLOB: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT64: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + default: + 
break; + } + query_->set_subarray(*subarray_); } } diff --git a/libtiledbsoma/src/soma/managed_query.h b/libtiledbsoma/src/soma/managed_query.h index d0fe89b705..9e1b1d03b2 100644 --- a/libtiledbsoma/src/soma/managed_query.h +++ b/libtiledbsoma/src/soma/managed_query.h @@ -212,7 +212,7 @@ class ManagedQuery { * @param buff Buffer array pointer with elements of the column type. * @param nelements Number of array elements in buffer */ - void setup_write(std::shared_ptr buffers); + void set_column_data(std::shared_ptr buffer); /** * @brief Configure query and allocate result buffers for reads. diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 68439db0ef..d2ee3c8ed7 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -254,14 +254,38 @@ std::optional> SOMAArray::read_next() { return mq_->results(); } -void SOMAArray::write(std::shared_ptr buffers) { +void SOMAArray::set_column_data( + std::string_view name, const void* data, uint64_t num_elems) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } - mq_->setup_write(buffers); + // Create the array_buffer_ as necessary + if (array_buffer_ == nullptr) + array_buffer_ = std::make_shared(); + + // Create a ColumnBuffer object instead of passing it in as an argument to + // `set_column_data` because ColumnBuffer::create requires a TileDB Array + // argument which should remain a private member of SOMAArray + auto column = ColumnBuffer::create(arr_, name); + column->set_data(data, num_elems); + + // Keep the ColumnBuffer alive by attaching it to the ArrayBuffers class + // member. 
Otherwise, the data held by the ColumnBuffer will be garbage + // collected before it is submitted to the write query + array_buffer_->emplace(std::string(name), column); + + mq_->set_column_data(column); +}; + +void SOMAArray::write() { + if (mq_->query_type() != TILEDB_WRITE) { + throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); + } mq_->submit_write(); + + array_buffer_ = nullptr; } uint64_t SOMAArray::nnz() { diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 47a58c5580..acc26b4a8c 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -404,6 +404,9 @@ class SOMAArray : public SOMAObject { */ std::optional> read_next(); + void set_column_data( + std::string_view name, const void* data, uint64_t num_elems); + /** * @brief Write ArrayBuffers data to the array. * @@ -426,7 +429,7 @@ class SOMAArray : public SOMAObject { * * @param buffers The ArrayBuffers to write to the array */ - void write(std::shared_ptr buffers); + void write(); /** * @brief Check if the query is complete. 
@@ -722,6 +725,9 @@ class SOMAArray : public SOMAObject { // Unoptimized method for computing nnz() (issue `count_cells` query) uint64_t nnz_slow(); + + // ArrayBuffers to hold ColumnBuffers alive when submitting to write query + std::shared_ptr array_buffer_ = nullptr; }; } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index 4ace686229..c6c6e3c23a 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -108,7 +108,7 @@ std::shared_ptr SOMACollection::add_new_experiment( URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns) { + ColumnIndexInfo index_columns) { std::shared_ptr member = SOMAExperiment::create( uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -122,7 +122,7 @@ std::shared_ptr SOMACollection::add_new_measurement( URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns) { + ColumnIndexInfo index_columns) { std::shared_ptr member = SOMAMeasurement::create( uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -136,7 +136,7 @@ std::shared_ptr SOMACollection::add_new_dataframe( URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns) { + ColumnIndexInfo index_columns) { std::shared_ptr member = SOMADataFrame::create( uri, schema, index_columns, ctx); this->set(std::string(uri), uri_type, std::string(key)); diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 5699486266..14b308ba11 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -156,7 +156,7 @@ class SOMACollection : public SOMAGroup { URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns); + ColumnIndexInfo index_columns); /** * Create and add a 
SOMAMeasurement to the SOMACollection. @@ -172,7 +172,7 @@ class SOMACollection : public SOMAGroup { URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns); + ColumnIndexInfo index_columns); /** * Create and add a SOMADataFrame to the SOMACollection. @@ -188,7 +188,7 @@ class SOMACollection : public SOMAGroup { URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ArrowTable index_columns); + ColumnIndexInfo index_columns); /** * Create and add a SOMADenseNDArray to the SOMACollection. diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index d917b8ea4e..a97711d845 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -42,7 +42,7 @@ using namespace tiledb; std::unique_ptr SOMADataFrame::create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx) { auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( ctx->tiledb_ctx(), schema, index_columns); diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index c8db15e667..bfd3fcb6c0 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -60,7 +60,7 @@ class SOMADataFrame : public SOMAArray { static std::unique_ptr create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx); /** diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index 8990f198cc..0d4ff5dab5 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -44,7 +44,7 @@ using namespace tiledb; std::unique_ptr SOMAExperiment::create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx) { std::string 
exp_uri(uri); diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index 9800794631..33c4c79b22 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -57,7 +57,7 @@ class SOMAExperiment : public SOMACollection { static std::unique_ptr create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx); //=================================================================== diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 4bbb17655c..d2628cfad5 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -44,7 +44,7 @@ using namespace tiledb; std::unique_ptr SOMAMeasurement::create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx) { std::string exp_uri(uri); diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index a93ca6c6c2..5a44f72d1d 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -58,7 +58,7 @@ class SOMAMeasurement : public SOMACollection { static std::unique_ptr create( std::string_view uri, std::shared_ptr schema, - ArrowTable index_columns, + ColumnIndexInfo index_columns, std::shared_ptr ctx); //=================================================================== diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 59eddab109..833c235a8f 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -244,21 +244,33 @@ std::pair ArrowAdapter::_get_data_and_length( ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, std::shared_ptr arrow_schema, - ArrowTable index_columns) { + ColumnIndexInfo 
index_column_info) { + auto [index_column_names, domains, extents] = index_column_info; + ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); - for (int64_t i = 0; i < arrow_schema->n_children; ++i) { - ArrowSchema* child = arrow_schema->children[i]; - auto type = ArrowAdapter::to_tiledb_format(child->format); - auto dim_info = ArrowAdapter::_get_dim_info(child->name, index_columns); - - if (dim_info.has_value()) { - auto& [dim_dom, extent] = *dim_info; - domain.add_dimension( - Dimension::create(*ctx, child->name, type, dim_dom, extent)); - } else { - schema.add_attribute(Attribute(*ctx, child->name, type)); + for (size_t col_idx = 0; col_idx < index_column_names.size(); ++col_idx) { + for (int64_t schema_idx = 0; schema_idx < arrow_schema->n_children; + ++schema_idx) { + auto child = arrow_schema->children[schema_idx]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + if (child->name == index_column_names[col_idx]) { + auto dim = Dimension::create( + *ctx, + child->name, + type, + domains->children[col_idx]->buffers[1], + extents->children[col_idx]->buffers[1]); + + domain.add_dimension(dim); + } else { + auto attr = Attribute(*ctx, child->name, type); + if (child->flags | ARROW_FLAG_NULLABLE) { + attr.set_nullable(true); + } + schema.add_attribute(attr); + } } } @@ -269,22 +281,6 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( return schema; } -std::optional> ArrowAdapter::_get_dim_info( - std::string_view dim_name, ArrowTable index_columns) { - auto index_columns_array = index_columns.first; - auto index_columns_schema = index_columns.second; - - for (int64_t i = 0; i < index_columns_array->n_children; ++i) { - if (dim_name == index_columns_schema->children[i]->name) { - auto dim_info = index_columns_array->children[i]->children; - auto domain = dim_info[0]->buffers[1]; - auto extent = dim_info[1]->buffers[1]; - return std::make_pair(domain, extent); - } - } - return std::nullopt; -} - ArrowTable 
ArrowAdapter::to_arrow(std::shared_ptr column) { std::shared_ptr schema = std::make_shared(); std::shared_ptr array = std::make_shared(); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 60593da1b8..ed164d3ad9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -35,6 +35,12 @@ struct ArrowBuffer { using ArrowTable = std::pair, std::shared_ptr>; +using ColumnIndexInfo = std::tuple< + std::vector, // name of column + std::shared_ptr, // domain + std::shared_ptr // tile extent + >; + class ArrowAdapter { public: static void release_schema(struct ArrowSchema* schema); @@ -64,7 +70,7 @@ class ArrowAdapter { static ArraySchema tiledb_schema_from_arrow_schema( std::shared_ptr ctx, std::shared_ptr arrow_schema, - ArrowTable index_columns); + ColumnIndexInfo index_column_info); /** * @brief Get Arrow format string from TileDB datatype. diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 9e69f3cf32..455f9c07bd 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -138,18 +138,10 @@ std::tuple, std::vector> write_array( } std::vector a0(num_cells_per_fragment, frag_num); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); - auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); - col_a0->set_data(a0.data(), a0.size()); - col_d0->set_data(d0.data(), d0.size()); - array_buffer->emplace("a0", col_a0); - array_buffer->emplace("d0", col_d0); - // Write data to array - soma_array->write(array_buffer); + soma_array->set_column_data("a0", a0.data(), a0.size()); + soma_array->set_column_data("d0", d0.data(), d0.size()); + soma_array->write(); soma_array->close(); } diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 
3256d83f3d..f9c86fce1a 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -54,18 +54,10 @@ TEST_CASE("SOMADataFrame: basic") { d0[j] = j; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); - auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); - col_a0->set_data(a0); - col_d0->set_data(d0); - array_buffer->emplace("a0", col_a0); - array_buffer->emplace("d0", col_d0); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->write(array_buffer); + soma_dataframe->set_column_data("a0", a0.data(), a0.size()); + soma_dataframe->set_column_data("d0", d0.data(), d0.size()); + soma_dataframe->write(); soma_dataframe->close(); soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index d425046d49..8ed91fad59 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -97,18 +97,10 @@ TEST_CASE("SOMADenseNDArray: basic") { std::vector d0{1, 10}; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); - auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); - col_a0->set_data(a0); - col_d0->set_data(d0); - array_buffer->emplace("a0", col_a0); - array_buffer->emplace("d0", col_d0); - soma_dense->open(OpenMode::write); - soma_dense->write(array_buffer); + soma_dense->set_column_data("a0", a0.data(), a0.size()); + soma_dense->set_column_data("d0", d0.data(), d0.size()); + soma_dense->write(); soma_dense->close(); soma_dense->open(OpenMode::read); diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 
bc5b999f3f..032e176605 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -99,18 +99,10 @@ TEST_CASE("SOMASparseNDArray: basic") { d0[j] = j; std::vector a0(10, 1); - auto array_buffer = std::make_shared(); - auto tdb_arr = std::make_shared( - *ctx->tiledb_ctx(), uri, TILEDB_READ); - auto col_a0 = ColumnBuffer::create(tdb_arr, "a0"); - auto col_d0 = ColumnBuffer::create(tdb_arr, "d0"); - col_a0->set_data(a0); - col_d0->set_data(d0); - array_buffer->emplace("a0", col_a0); - array_buffer->emplace("d0", col_d0); - soma_sparse->open(OpenMode::write); - soma_sparse->write(array_buffer); + soma_sparse->set_column_data("a0", a0.data(), a0.size()); + soma_sparse->set_column_data("d0", d0.data(), d0.size()); + soma_sparse->write(); soma_sparse->close(); soma_sparse->open(OpenMode::read); From 7a32937962892a333bd03fbd209e9a2612bb3b2b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Sun, 25 Feb 2024 20:26:23 -0600 Subject: [PATCH 20/70] Create clib.SOMADataFrame --- apis/python/src/tiledbsoma/_dataframe.py | 20 +++++------- apis/python/src/tiledbsoma/soma_dataframe.cc | 34 +++++++++++--------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 92d11aa93a..b9569343e0 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -219,7 +219,8 @@ def create( f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}" ) - index_columns_info = [] + domains = [] + extents = [] for index_column_name, slot_domain in zip(index_column_names, domain): pa_type = schema.field(index_column_name).type dtype = _arrow_types.tiledb_type_from_arrow_type( @@ -237,21 +238,20 @@ def create( slot_domain, ) - index_columns_info.append( - pa.chunked_array( - [pa.array(slot_domain, pa_type), pa.array([extent], pa_type)] - ) - ) + 
domains.append(slot_domain) + extents.append([extent]) handle = clib.SOMADataFrame.create( uri, schema, - pa.Table.from_arrays(index_columns_info, ["column_info"]), + index_column_names, + pa.array(domains), + pa.array(extents), context.native_context, ) handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) - DataFrame._set_create_metadata(handle) + return cls( handle, _dont_call_this_use_create_or_open_instead="tiledbsoma-internal-code", @@ -514,12 +514,8 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - print("SCHEMAAAAAAAAAAAAAAA") - print(values.schema) - print("BATCHESSSSSSSSSSSSSS ") for batch in values.to_batches(): self._handle.write(batch) - print("DONEEEEEEEEE") # dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] # dim_cols_tuple = tuple(dim_cols_list) # self._handle.writer[dim_cols_tuple] = attr_cols_map diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index ce809e3026..9d14837415 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -53,28 +53,30 @@ void load_soma_dataframe(py::module& m) { "create", [](std::string_view uri, py::object py_schema, - py::object py_index_columns, + std::vector index_columns_names, + py::object py_domains, + py::object py_extents, std::shared_ptr context) { ArrowSchema schema; uintptr_t schema_ptr = (uintptr_t)(&schema); py_schema.attr("_export_to_c")(schema_ptr); - ArrowSchema index_columns_schema; - ArrowArray index_columns_array; - uintptr_t index_columns_schema_ptr = - (uintptr_t)(&index_columns_schema); - uintptr_t - index_columns_array_ptr = (uintptr_t)(&index_columns_array); - py_index_columns.attr("_export_to_c")( - index_columns_array_ptr, 
index_columns_schema_ptr); + ArrowArray domains; + uintptr_t domains_ptr = (uintptr_t)(&domains); + py_domains.attr("_export_to_c")(domains_ptr); - // return SOMADataFrame::create( - // uri, - // std::make_shared(schema), - // ArrowTable( - // std::make_shared(index_columns_schema), - // std::make_shared(index_columns_array)), - // context); + ArrowArray extents; + uintptr_t extents_ptr = (uintptr_t)(&extents); + py_extents.attr("_export_to_c")(extents_ptr); + + return SOMADataFrame::create( + uri, + std::make_shared(schema), + ColumnIndexInfo( + index_columns_names, + std::make_shared(domains), + std::make_shared(extents)), + context); }) .def_static( From eab765227b789721396ca98f3cda91687adb8f9c Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 26 Feb 2024 08:50:02 -0600 Subject: [PATCH 21/70] WIP metadata issue unpacking values --- apis/python/src/tiledbsoma/_tdb_handles.py | 9 ++- apis/python/src/tiledbsoma/soma_array.cc | 82 ++++++++-------------- 2 files changed, 34 insertions(+), 57 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 00a2e69fac..67cff012e9 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -356,7 +356,7 @@ def _do_initial_reads(self, reader: _RawHdl_co) -> None: # type: ignore[misc] will need to retrieve data from the backing store on setup. """ # non–attrs-managed field - self.metadata = MetadataWrapper(self, dict(reader.meta)) + self.metadata = MetadataWrapper(self, reader.meta()) @property def schema(self) -> pa.Schema: @@ -364,7 +364,7 @@ def schema(self) -> pa.Schema: @property def meta(self) -> "MetadataWrapper": - return MetadataWrapper(self, dict(self._handle.meta)) + return MetadataWrapper(self, self._handle.meta()) @property def ndim(self) -> int: @@ -556,7 +556,10 @@ def _write(self) -> None: # There were no changes (e.g., it's a read handle). Do nothing. 
return # Only try to get the writer if there are changes to be made. - meta = self.owner.writer.meta + if isinstance(self.owner, DataFrameWrapper): + meta = self.owner.meta + else: + meta = self.owner.writer.meta for key, mod in self._mods.items(): if mod in (_DictMod.ADDED, _DictMod.UPDATED): meta[key] = self.cache[key] diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 5afaa1cfb9..fac4f5df31 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -125,9 +125,9 @@ void load_soma_array(py::module& m) { .def( "__exit__", [](SOMAArray& reader, - void* exc_type, - void* exc_value, - void* traceback) { reader.close(); }) + py::object exc_type, + py::object exc_value, + py::object traceback) { reader.close(); }) .def( "set_condition", @@ -663,15 +663,15 @@ void load_soma_array(py::module& m) { .def_property_readonly("dimension_names", &SOMAArray::dimension_names) - .def("set_metadata", &SOMAArray::set_metadata) + // .def("set_metadata", &SOMAArray::set_metadata) - .def("delete_metadata", &SOMAArray::delete_metadata) + // .def("delete_metadata", &SOMAArray::delete_metadata) - .def( - "get_metadata", - py::overload_cast(&SOMAArray::get_metadata)) + // .def( + // "get_metadata", + // py::overload_cast(&SOMAArray::get_metadata)) - .def_property_readonly( + .def( "meta", [](SOMAArray& soma_dataframe) -> py::dict { py::dict results; @@ -682,56 +682,30 @@ void load_soma_array(py::module& m) { uint32_t value_num = std::get(val); const void* value = std::get(val); - if (tdb_type == TILEDB_STRING_UTF8) { - results[py::str(key)] = py::str( - std::string((const char*)value, value_num)); - } else if (tdb_type == TILEDB_STRING_ASCII) { - results[py::str(key)] = py::bytes( - std::string((const char*)value, value_num)); - } else { - py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - results[py::str(key)] = py::array( - value_type, value_num, value); - } + std::cout << key << 
std::endl; + + // if (tdb_type == TILEDB_STRING_UTF8) { + // results[py::str(key)] = py::str( + // std::string((const char*)value, value_num)); + // } else if (tdb_type == TILEDB_STRING_ASCII) { + // results[py::str(key)] = py::bytes( + // std::string((const char*)value, value_num)); + // } else { + // py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + // results[py::str(key)] = py::array( + // value_type, value_num, value); + // } } return results; }) - .def("set_metadata", &SOMAArray::set_metadata) - - .def("delete_metadata", &SOMAArray::delete_metadata) - - .def( - "get_metadata", - py::overload_cast(&SOMAArray::get_metadata)) - - .def_property_readonly( - "meta", - [](SOMAArray& soma_dataframe) -> py::dict { - py::dict results; - - for (auto const& [key, val] : soma_dataframe.get_metadata()) { - tiledb_datatype_t tdb_type = std::get( - val); - uint32_t value_num = std::get(val); - const void* value = std::get(val); + // .def( + // "get_metadata", + // py::overload_cast(&SOMAArray::get_metadata)) - if (tdb_type == TILEDB_STRING_UTF8) { - results[py::str(key)] = py::str( - std::string((const char*)value, value_num)); - } else if (tdb_type == TILEDB_STRING_ASCII) { - results[py::str(key)] = py::bytes( - std::string((const char*)value, value_num)); - } else { - py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - results[py::str(key)] = py::array( - value_type, value_num, value); - } - } - return results; - }) - .def("has_metadata", &SOMAArray::has_metadata) + // .def("has_metadata", &SOMAArray::has_metadata) - .def("metadata_num", &SOMAArray::metadata_num); + // .def("metadata_num", &SOMAArray::metadata_num) + ; } } // namespace libtiledbsomacpp \ No newline at end of file From 3297b700e7d01bb1d17d33a34d73df32fb191647 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 26 Feb 2024 13:05:47 -0600 Subject: [PATCH 22/70] Fix metadata --- apis/python/src/tiledbsoma/_dataframe.py | 1 + apis/python/src/tiledbsoma/_tdb_handles.py | 2 +- 
apis/python/src/tiledbsoma/soma_array.cc | 57 ++++++++++------------ libtiledbsoma/src/soma/soma_array.cc | 11 ++++- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index b9569343e0..be37c34583 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -514,6 +514,7 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. + for batch in values.to_batches(): self._handle.write(batch) # dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 67cff012e9..11ea27c80a 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -364,7 +364,7 @@ def schema(self) -> pa.Schema: @property def meta(self) -> "MetadataWrapper": - return MetadataWrapper(self, self._handle.meta()) + return self.metadata @property def ndim(self) -> int: diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index fac4f5df31..43df7270a2 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -663,49 +663,44 @@ void load_soma_array(py::module& m) { .def_property_readonly("dimension_names", &SOMAArray::dimension_names) - // .def("set_metadata", &SOMAArray::set_metadata) + .def("set_metadata", &SOMAArray::set_metadata) - // .def("delete_metadata", &SOMAArray::delete_metadata) + .def("delete_metadata", &SOMAArray::delete_metadata) - // .def( - // "get_metadata", - // py::overload_cast(&SOMAArray::get_metadata)) + .def( + "get_metadata", + py::overload_cast(&SOMAArray::get_metadata)) .def( 
"meta", - [](SOMAArray& soma_dataframe) -> py::dict { + [](SOMAArray& array) -> py::dict { py::dict results; + auto np_join = py::module::import("numpy").attr("char").attr( + "join"); + + for (auto const& [key, val] : array.get_metadata()) { + auto [tdb_type, value_num, value] = *(array.get_metadata(key)); - for (auto const& [key, val] : soma_dataframe.get_metadata()) { - tiledb_datatype_t tdb_type = std::get( - val); - uint32_t value_num = std::get(val); - const void* value = std::get(val); - - std::cout << key << std::endl; - - // if (tdb_type == TILEDB_STRING_UTF8) { - // results[py::str(key)] = py::str( - // std::string((const char*)value, value_num)); - // } else if (tdb_type == TILEDB_STRING_ASCII) { - // results[py::str(key)] = py::bytes( - // std::string((const char*)value, value_num)); - // } else { - // py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - // results[py::str(key)] = py::array( - // value_type, value_num, value); - // } + if (tdb_type == TILEDB_STRING_ASCII) { + auto py_buf = py::array(py::dtype("|S1"), value_num, value); + } else if (tdb_type == TILEDB_STRING_UTF8) { + auto py_buf = py::array(py::dtype("|S1"), value_num, value); + results[py::str(key)] = py_buf.attr("tobytes")().attr("decode")("UTF-8"); + } else { + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + results[py::str(key)] = py::array( + value_type, value_num, value)[0]; + } } return results; }) - // .def( - // "get_metadata", - // py::overload_cast(&SOMAArray::get_metadata)) + .def( + "get_metadata", + py::overload_cast(&SOMAArray::get_metadata)) - // .def("has_metadata", &SOMAArray::has_metadata) + .def("has_metadata", &SOMAArray::has_metadata) - // .def("metadata_num", &SOMAArray::metadata_num) - ; + .def("metadata_num", &SOMAArray::metadata_num); } } // namespace libtiledbsomacpp \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index d2ee3c8ed7..7629c9ac5c 100644 --- 
a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -48,11 +48,20 @@ void SOMAArray::create( std::string soma_type) { Array::create(std::string(uri), schema); auto array = Array(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + array.put_metadata( "soma_object_type", TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); + + std::string encoding_version = "1"; + array.put_metadata( + "soma_encoding_version", + TILEDB_STRING_UTF8, + static_cast(encoding_version.length()), + encoding_version.c_str()); + array.close(); } @@ -536,8 +545,8 @@ void SOMAArray::set_metadata( if (key.compare("soma_object_type") == 0) { throw TileDBSOMAError("soma_object_type cannot be modified."); } - arr_->put_metadata(key, value_type, value_num, value); + MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); From 1e68d6d16fefd93ce2515c86e5b90fdc10da3b74 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 26 Feb 2024 13:54:38 -0600 Subject: [PATCH 23/70] WIP set_data takes in offsets and validity --- apis/python/src/tiledbsoma/soma_array.cc | 39 ++++++++++++++++++------ libtiledbsoma/src/soma/column_buffer.h | 12 +++++++- libtiledbsoma/src/soma/soma_array.cc | 10 ++++-- libtiledbsoma/src/soma/soma_array.h | 6 +++- 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 43df7270a2..eb0c64c3ce 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -532,10 +532,29 @@ void load_soma_array(py::module& m) { arrow_array_ptr, arrow_schema_ptr); for (auto i = 0; i < arrow_schema.n_children; ++i) { + auto child_arr = arrow_array.children[i]; + auto len = child_arr->length; + std::optional> offsets = std::nullopt; + std::optional> validity = std::nullopt; + + // if (child_arr->n_buffers == 3) { + // auto offsets_ptr = 
(uint64_t*)child_arr->buffers[3]; + // offsets = std::vector( + // offsets_ptr, offsets_ptr + len); + // } + + // if (child_arr->null_count != 0) { + // auto validity_ptr = (uint8_t*)child_arr->buffers[0]; + // validity = std::vector( + // validity_ptr, validity_ptr + len); + // } + array.set_column_data( arrow_schema.children[i]->name, - arrow_array.children[i]->buffers[1], - arrow_array.children[i]->length); + child_arr->buffers[1], + len, + offsets, + validity); } array.write(); }) @@ -679,13 +698,15 @@ void load_soma_array(py::module& m) { "join"); for (auto const& [key, val] : array.get_metadata()) { - auto [tdb_type, value_num, value] = *(array.get_metadata(key)); - - if (tdb_type == TILEDB_STRING_ASCII) { - auto py_buf = py::array(py::dtype("|S1"), value_num, value); - } else if (tdb_type == TILEDB_STRING_UTF8) { - auto py_buf = py::array(py::dtype("|S1"), value_num, value); - results[py::str(key)] = py_buf.attr("tobytes")().attr("decode")("UTF-8"); + auto [tdb_type, value_num, value] = *( + array.get_metadata(key)); + + if (tdb_type == TILEDB_STRING_UTF8 | + tdb_type == TILEDB_STRING_ASCII) { + auto py_buf = py::array( + py::dtype("|S1"), value_num, value); + results[py::str(key)] = py_buf.attr("tobytes")().attr( + "decode")("UTF-8"); } else { py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); results[py::str(key)] = py::array( diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index a11138edde..7a6a345603 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -122,11 +122,21 @@ class ColumnBuffer { * @param data pointer to the beginning of the data to write * @param num_elems the number of elements in the column */ - void set_data(const void* data, uint64_t num_elems) { + void set_data( + const void* data, + uint64_t num_elems, + std::optional> offsets = std::nullopt, + std::optional> validity = std::nullopt) { this->num_cells_ = num_elems; 
this->data_.resize(num_elems); this->data_.assign( (std::byte*)data, (std::byte*)data + num_elems * type_size_); + + if (offsets.has_value()) + offsets_ = *offsets; + + if (validity.has_value()) + validity_ = *validity; } /** diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 7629c9ac5c..8eb6b6f067 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -54,7 +54,7 @@ void SOMAArray::create( TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); - + std::string encoding_version = "1"; array.put_metadata( "soma_encoding_version", @@ -264,7 +264,11 @@ std::optional> SOMAArray::read_next() { } void SOMAArray::set_column_data( - std::string_view name, const void* data, uint64_t num_elems) { + std::string_view name, + const void* data, + uint64_t num_elems, + std::optional> offsets, + std::optional> validity) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } @@ -277,7 +281,7 @@ void SOMAArray::set_column_data( // `set_column_data` because ColumnBuffer::create requires a TileDB Array // argument which should remain a private member of SOMAArray auto column = ColumnBuffer::create(arr_, name); - column->set_data(data, num_elems); + column->set_data(data, num_elems, offsets, validity); // Keep the ColumnBuffer alive by attaching it to the ArrayBuffers class // member. 
Otherwise, the data held by the ColumnBuffer will be garbage diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index acc26b4a8c..78330c004a 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -405,7 +405,11 @@ class SOMAArray : public SOMAObject { std::optional> read_next(); void set_column_data( - std::string_view name, const void* data, uint64_t num_elems); + std::string_view name, + const void* data, + uint64_t num_elems, + std::optional> offsets = std::nullopt, + std::optional> validity = std::nullopt); /** * @brief Write ArrayBuffers data to the array. From 77412e690b039451be0688ab51c847044e9f643a Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 26 Feb 2024 17:24:43 -0600 Subject: [PATCH 24/70] WIP write with passed in offsets and validities --- apis/python/src/tiledbsoma/soma_array.cc | 58 ++++++++++++++++-------- libtiledbsoma/src/soma/soma_array.cc | 1 + libtiledbsoma/src/utils/arrow_adapter.cc | 8 +++- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index eb0c64c3ce..855ee94923 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -532,29 +532,49 @@ void load_soma_array(py::module& m) { arrow_array_ptr, arrow_schema_ptr); for (auto i = 0; i < arrow_schema.n_children; ++i) { - auto child_arr = arrow_array.children[i]; - auto len = child_arr->length; + auto sch_ = arrow_schema.children[i]; + auto arr_ = arrow_array.children[i]; + + const void* data; std::optional> offsets = std::nullopt; - std::optional> validity = std::nullopt; + std::optional> + validities = std::nullopt; + + if (arr_->null_count != 0) { + auto validities_ptr = (uint8_t*)arr_->buffers[0]; + validities = std::vector( + validities_ptr, validities_ptr + arr_->length); + } + + if (arr_->n_buffers == 3) { + std::vector arrow_offsets; + + if (strcmp("u", 
sch_->format) == 0 | + strcmp("s", sch_->format) == 0) { + auto offsets_ptr = (uint32_t*)arr_->buffers[1]; + arrow_offsets = std::vector( + offsets_ptr, offsets_ptr + arr_->length + 1); + } else { + auto offsets_ptr = (uint64_t*)arr_->buffers[1]; + arrow_offsets = std::vector( + offsets_ptr, offsets_ptr + arr_->length + 1); + } - // if (child_arr->n_buffers == 3) { - // auto offsets_ptr = (uint64_t*)child_arr->buffers[3]; - // offsets = std::vector( - // offsets_ptr, offsets_ptr + len); - // } + std::vector offsets_; + offsets_.reserve(arr_->length); + for (size_t i = 0; i < arrow_offsets.size() - 1; ++i) { + offsets_[i] = arrow_offsets[i + 1] - + arrow_offsets[i]; + } - // if (child_arr->null_count != 0) { - // auto validity_ptr = (uint8_t*)child_arr->buffers[0]; - // validity = std::vector( - // validity_ptr, validity_ptr + len); - // } + offsets = offsets_; + data = arr_->buffers[2]; + } else { + data = arr_->buffers[1]; + } array.set_column_data( - arrow_schema.children[i]->name, - child_arr->buffers[1], - len, - offsets, - validity); + sch_->name, data, arr_->length, offsets, validities); } array.write(); }) @@ -701,7 +721,7 @@ void load_soma_array(py::module& m) { auto [tdb_type, value_num, value] = *( array.get_metadata(key)); - if (tdb_type == TILEDB_STRING_UTF8 | + if (tdb_type == TILEDB_STRING_UTF8 || tdb_type == TILEDB_STRING_ASCII) { auto py_buf = py::array( py::dtype("|S1"), value_num, value); diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 8eb6b6f067..fb10d2df3c 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -298,6 +298,7 @@ void SOMAArray::write() { mq_->submit_write(); + mq_->reset(); array_buffer_ = nullptr; } diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 833c235a8f..3bca98c97d 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -266,9 +266,13 @@ 
ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( domain.add_dimension(dim); } else { auto attr = Attribute(*ctx, child->name, type); - if (child->flags | ARROW_FLAG_NULLABLE) { + + if (child->flags | ARROW_FLAG_NULLABLE) attr.set_nullable(true); - } + + if (strcmp(child->format, "U")) + attr.set_cell_val_num(TILEDB_VAR_NUM); + schema.add_attribute(attr); } } From 974f0fa09121ba72d09eb07d73afbb6361c094a8 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 26 Feb 2024 21:56:50 -0600 Subject: [PATCH 25/70] WIP handle vars correctly --- apis/python/src/tiledbsoma/_dataframe.py | 7 +++- apis/python/src/tiledbsoma/soma_array.cc | 33 +++---------------- apis/python/tests/test_basic_anndata_io.py | 1 + libtiledbsoma/src/soma/column_buffer.h | 19 ++++++----- libtiledbsoma/src/soma/managed_query.cc | 16 +++++++-- libtiledbsoma/src/soma/soma_array.cc | 8 ++--- libtiledbsoma/src/soma/soma_array.h | 6 ++-- libtiledbsoma/src/utils/arrow_adapter.cc | 14 +++++--- libtiledbsoma/test/unit_soma_array.cc | 4 +-- libtiledbsoma/test/unit_soma_dataframe.cc | 4 +-- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 4 +-- .../test/unit_soma_sparse_ndarray.cc | 4 +-- 12 files changed, 59 insertions(+), 61 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index be37c34583..d9b96e9276 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -369,6 +369,8 @@ def read( Lifecycle: Experimental. """ + print("STARTING READ PATH") + del batch_size # Currently unused. _util.check_unpartitioned(partitions) self._check_open_read() @@ -394,6 +396,8 @@ def read( sr.set_condition(QueryCondition(value_filter), handle.schema) self._set_reader_coords(sr, coords) + + print(sr.schema) # # TODO: batch_size return TableReadIter(sr) @@ -514,8 +518,9 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. 
# Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - + print("STARTING WRITE PATH") for batch in values.to_batches(): + print(batch) self._handle.write(batch) # dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] # dim_cols_tuple = tuple(dim_cols_list) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 855ee94923..b4d8f2781f 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -536,45 +536,22 @@ void load_soma_array(py::module& m) { auto arr_ = arrow_array.children[i]; const void* data; - std::optional> offsets = std::nullopt; - std::optional> - validities = std::nullopt; + uint64_t* offsets = nullptr; + uint8_t* validities = nullptr; if (arr_->null_count != 0) { - auto validities_ptr = (uint8_t*)arr_->buffers[0]; - validities = std::vector( - validities_ptr, validities_ptr + arr_->length); + validities = (uint8_t*)arr_->buffers[0]; } if (arr_->n_buffers == 3) { - std::vector arrow_offsets; - - if (strcmp("u", sch_->format) == 0 | - strcmp("s", sch_->format) == 0) { - auto offsets_ptr = (uint32_t*)arr_->buffers[1]; - arrow_offsets = std::vector( - offsets_ptr, offsets_ptr + arr_->length + 1); - } else { - auto offsets_ptr = (uint64_t*)arr_->buffers[1]; - arrow_offsets = std::vector( - offsets_ptr, offsets_ptr + arr_->length + 1); - } - - std::vector offsets_; - offsets_.reserve(arr_->length); - for (size_t i = 0; i < arrow_offsets.size() - 1; ++i) { - offsets_[i] = arrow_offsets[i + 1] - - arrow_offsets[i]; - } - - offsets = offsets_; + offsets = (uint64_t*)arr_->buffers[1]; data = arr_->buffers[2]; } else { data = arr_->buffers[1]; } array.set_column_data( - sch_->name, data, arr_->length, offsets, validities); + sch_->name, arr_->length, data, offsets, validities); } array.write(); }) diff --git 
a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 12d28498cf..5e72f2829a 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -147,6 +147,7 @@ def test_import_anndata(adata, ingest_modes, X_kind): # Check obs obs = exp.obs.read().concat().to_pandas() + print(obs) assert sorted(obs.columns.to_list()) == sorted( orig.obs_keys() + ["soma_joinid", "obs_id"] ) diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 7a6a345603..e4fa069e97 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -123,20 +123,21 @@ class ColumnBuffer { * @param num_elems the number of elements in the column */ void set_data( - const void* data, uint64_t num_elems, - std::optional> offsets = std::nullopt, - std::optional> validity = std::nullopt) { + const void* data, + uint64_t* offsets = nullptr, + uint8_t* validity = nullptr) { this->num_cells_ = num_elems; this->data_.resize(num_elems); - this->data_.assign( - (std::byte*)data, (std::byte*)data + num_elems * type_size_); + this->data_.assign((std::byte*)data, (std::byte*)data + num_elems); - if (offsets.has_value()) - offsets_ = *offsets; + if (offsets != nullptr) { + offsets_.assign((uint32_t*)offsets, (uint32_t*)offsets + num_elems); + } - if (validity.has_value()) - validity_ = *validity; + if (validity != nullptr) { + validity_.assign(validity, validity + num_elems); + } } /** diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index dd40bef0d9..0594c1d875 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -103,18 +103,28 @@ void ManagedQuery::set_column_data( schema_->has_attribute(column_name)) { auto data = column_buffer->data(); query_->set_data_buffer( - column_name, (void*)data.data(), data.size_bytes()); + column_name, + (void*)data.data(), + 
column_buffer->is_var() ? + column_buffer->offsets()[column_buffer->offsets().size() - 1] : + data.size_bytes()); if (column_buffer->is_var()) { // Remove one offset for TileDB, which checks that the // offsets and validity buffers are the same size auto offsets = column_buffer->offsets(); query_->set_offsets_buffer( - column_name, offsets.data(), column_buffer->size() - 1); + column_name, offsets.data(), offsets.size()); + + std::cout << "offset in ManagedQuery::set_column_data for " + << column_name << std::endl; + for (auto os : offsets) + std::cout << os << " "; + std::cout << std::endl; } if (column_buffer->is_nullable()) { auto validity = column_buffer->validity(); query_->set_validity_buffer( - column_name, validity.data(), column_buffer->size()); + column_name, validity.data(), validity.size()); } // column_buffer->attach(*query_); } else { diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index fb10d2df3c..e7434cb6eb 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -265,10 +265,10 @@ std::optional> SOMAArray::read_next() { void SOMAArray::set_column_data( std::string_view name, - const void* data, uint64_t num_elems, - std::optional> offsets, - std::optional> validity) { + const void* data, + uint64_t* offsets, + uint8_t* validity) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } @@ -281,7 +281,7 @@ void SOMAArray::set_column_data( // `set_column_data` because ColumnBuffer::create requires a TileDB Array // argument which should remain a private member of SOMAArray auto column = ColumnBuffer::create(arr_, name); - column->set_data(data, num_elems, offsets, validity); + column->set_data(num_elems, data, offsets, validity); // Keep the ColumnBuffer alive by attaching it to the ArrayBuffers class // member. 
Otherwise, the data held by the ColumnBuffer will be garbage diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 78330c004a..418fb10321 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -406,10 +406,10 @@ class SOMAArray : public SOMAObject { void set_column_data( std::string_view name, - const void* data, uint64_t num_elems, - std::optional> offsets = std::nullopt, - std::optional> validity = std::nullopt); + const void* data, + uint64_t* offsets = nullptr, + uint8_t* validity = nullptr); /** * @brief Write ArrayBuffers data to the array. diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 3bca98c97d..8499a6fc96 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -265,14 +265,18 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( domain.add_dimension(dim); } else { - auto attr = Attribute(*ctx, child->name, type); + Attribute attr(*ctx, child->name, type); - if (child->flags | ARROW_FLAG_NULLABLE) - attr.set_nullable(true); + // if (child->flags & ARROW_FLAG_NULLABLE) { + // attr.set_nullable(true); + // } - if (strcmp(child->format, "U")) + if ((strcmp(child->format, "U") == 0) | + (strcmp(child->format, "Z") == 0) | + (strcmp(child->format, "u") == 0) | + (strcmp(child->format, "z") == 0)) { attr.set_cell_val_num(TILEDB_VAR_NUM); - + } schema.add_attribute(attr); } } diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 455f9c07bd..3dcddf6832 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -139,8 +139,8 @@ std::tuple, std::vector> write_array( std::vector a0(num_cells_per_fragment, frag_num); // Write data to array - soma_array->set_column_data("a0", a0.data(), a0.size()); - soma_array->set_column_data("d0", d0.data(), d0.size()); + soma_array->set_column_data("a0", a0.size(), 
a0.data()); + soma_array->set_column_data("d0", d0.size(), d0.data()); soma_array->write(); soma_array->close(); } diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index f9c86fce1a..4fba0021a6 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -55,8 +55,8 @@ TEST_CASE("SOMADataFrame: basic") { std::vector a0(10, 1); soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->set_column_data("a0", a0.data(), a0.size()); - soma_dataframe->set_column_data("d0", d0.data(), d0.size()); + soma_dataframe->set_column_data("a0", a0.size(), a0.data()); + soma_dataframe->set_column_data("d0", d0.size(), d0.data()); soma_dataframe->write(); soma_dataframe->close(); diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index 8ed91fad59..bd0dd1a1a9 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -98,8 +98,8 @@ TEST_CASE("SOMADenseNDArray: basic") { std::vector a0(10, 1); soma_dense->open(OpenMode::write); - soma_dense->set_column_data("a0", a0.data(), a0.size()); - soma_dense->set_column_data("d0", d0.data(), d0.size()); + soma_dense->set_column_data("a0", a0.size(), a0.data()); + soma_dense->set_column_data("d0", d0.size(), d0.data()); soma_dense->write(); soma_dense->close(); diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 032e176605..24af5dc936 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -100,8 +100,8 @@ TEST_CASE("SOMASparseNDArray: basic") { std::vector a0(10, 1); soma_sparse->open(OpenMode::write); - soma_sparse->set_column_data("a0", a0.data(), a0.size()); - soma_sparse->set_column_data("d0", d0.data(), d0.size()); + soma_sparse->set_column_data("a0", a0.size(), a0.data()); + 
soma_sparse->set_column_data("d0", d0.size(), d0.data()); soma_sparse->write(); soma_sparse->close(); From 212955b54402e666430b8c866ddfeb3f984b89ee Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 27 Feb 2024 16:26:34 -0600 Subject: [PATCH 26/70] WIP fix segfaults from var-sized writes and metadata --- apis/python/setup.py | 2 +- apis/python/src/tiledbsoma/_dataframe.py | 10 +- apis/python/src/tiledbsoma/soma_array.cc | 116 ++++++------ apis/python/tests/test_basic_anndata_io.py | 1 - libtiledbsoma/src/soma/column_buffer.h | 17 +- libtiledbsoma/src/soma/managed_query.cc | 204 +++++++++++---------- libtiledbsoma/src/soma/soma_array.cc | 83 +++++---- libtiledbsoma/src/soma/soma_array.h | 9 - libtiledbsoma/src/soma/soma_group.cc | 40 ++-- libtiledbsoma/src/soma/soma_group.h | 9 +- libtiledbsoma/src/utils/arrow_adapter.cc | 58 +++--- 11 files changed, 292 insertions(+), 257 deletions(-) diff --git a/apis/python/setup.py b/apis/python/setup.py index 3d9fadf7eb..4ad2ced690 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -308,7 +308,7 @@ def run(self): library_dirs=LIB_DIRS, libraries=["tiledbsoma"] + (["tiledb"] if os.name == "nt" else []), extra_link_args=CXX_FLAGS, - extra_compile_args=["-std=c++17" if os.name != "nt" else "/std:c++17"] + extra_compile_args=["-std=c++17" if os.name != "nt" else "/std:c++17", "-g"] + CXX_FLAGS, language="c++", ) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index d9b96e9276..3fba1426e8 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -241,6 +241,7 @@ def create( domains.append(slot_domain) extents.append([extent]) + # TODO add as kw args handle = clib.SOMADataFrame.create( uri, schema, @@ -369,8 +370,6 @@ def read( Lifecycle: Experimental. """ - print("STARTING READ PATH") - del batch_size # Currently unused. 
_util.check_unpartitioned(partitions) self._check_open_read() @@ -396,8 +395,6 @@ def read( sr.set_condition(QueryCondition(value_filter), handle.schema) self._set_reader_coords(sr, coords) - - print(sr.schema) # # TODO: batch_size return TableReadIter(sr) @@ -518,13 +515,8 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - print("STARTING WRITE PATH") for batch in values.to_batches(): - print(batch) self._handle.write(batch) - # dim_cols_list = [dim_cols_map[name] for name in self.index_column_names] - # dim_cols_tuple = tuple(dim_cols_list) - # self._handle.writer[dim_cols_tuple] = attr_cols_map tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config ) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index b4d8f2781f..7e4f318d2f 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -39,6 +39,57 @@ namespace py = pybind11; using namespace py::literals; using namespace tiledbsoma; +void write(SOMAArray& array, py::handle py_batch) { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + py_batch.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); + + for (auto i = 0; i < arrow_schema.n_children; ++i) { + auto sch_ = arrow_schema.children[i]; + auto arr_ = arrow_array.children[i]; + + const void* data; + uint64_t* offsets = nullptr; + uint8_t* validities = nullptr; + + if (arr_->null_count != 0) { + validities = (uint8_t*)arr_->buffers[0]; + } + + if (arr_->n_buffers == 3) { + offsets = (uint64_t*)arr_->buffers[1]; + data = arr_->buffers[2]; + } else { + data = arr_->buffers[1]; + } + + 
array.set_column_data( + sch_->name, arr_->length, data, offsets, validities); + } + array.write(); +} + +py::dict meta(SOMAArray& array){ + py::dict results; + + for (auto [key, val] : array.get_metadata()) { + auto [tdb_type, value_num, value] = val; + + if (tdb_type == TILEDB_STRING_UTF8 || tdb_type == TILEDB_STRING_ASCII) { + auto py_buf = py::array(py::dtype("|S1"), value_num, value); + auto res = py_buf.attr("tobytes")().attr("decode")("UTF-8"); + results[py::str(key)] = res; + } else { + py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); + auto res = py::array(value_type, value_num, value).attr("item")(0);; + results[py::str(key)] = res; + } + } + return results; +} + py::tuple get_enum(SOMAArray& sr, std::string attr_name) { auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); if (attr_to_enmrs.count(attr_name) == 0) @@ -521,40 +572,7 @@ void load_soma_array(py::module& m) { return std::nullopt; }) - .def( - "write", - [](SOMAArray& array, py::handle py_batch) { - ArrowSchema arrow_schema; - ArrowArray arrow_array; - uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); - uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); - py_batch.attr("_export_to_c")( - arrow_array_ptr, arrow_schema_ptr); - - for (auto i = 0; i < arrow_schema.n_children; ++i) { - auto sch_ = arrow_schema.children[i]; - auto arr_ = arrow_array.children[i]; - - const void* data; - uint64_t* offsets = nullptr; - uint8_t* validities = nullptr; - - if (arr_->null_count != 0) { - validities = (uint8_t*)arr_->buffers[0]; - } - - if (arr_->n_buffers == 3) { - offsets = (uint64_t*)arr_->buffers[1]; - data = arr_->buffers[2]; - } else { - data = arr_->buffers[1]; - } - - array.set_column_data( - sch_->name, arr_->length, data, offsets, validities); - } - array.write(); - }) + .def("write", write) .def("nnz", &SOMAArray::nnz, py::call_guard()) @@ -687,35 +705,7 @@ void load_soma_array(py::module& m) { "get_metadata", py::overload_cast(&SOMAArray::get_metadata)) - .def( - "meta", - 
[](SOMAArray& array) -> py::dict { - py::dict results; - auto np_join = py::module::import("numpy").attr("char").attr( - "join"); - - for (auto const& [key, val] : array.get_metadata()) { - auto [tdb_type, value_num, value] = *( - array.get_metadata(key)); - - if (tdb_type == TILEDB_STRING_UTF8 || - tdb_type == TILEDB_STRING_ASCII) { - auto py_buf = py::array( - py::dtype("|S1"), value_num, value); - results[py::str(key)] = py_buf.attr("tobytes")().attr( - "decode")("UTF-8"); - } else { - py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - results[py::str(key)] = py::array( - value_type, value_num, value)[0]; - } - } - return results; - }) - - .def( - "get_metadata", - py::overload_cast(&SOMAArray::get_metadata)) + .def("meta", meta) .def("has_metadata", &SOMAArray::has_metadata) diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 5e72f2829a..12d28498cf 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -147,7 +147,6 @@ def test_import_anndata(adata, ingest_modes, X_kind): # Check obs obs = exp.obs.read().concat().to_pandas() - print(obs) assert sorted(obs.columns.to_list()) == sorted( orig.obs_keys() + ["soma_joinid", "obs_id"] ) diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index e4fa069e97..c65bbb7c99 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -127,12 +127,21 @@ class ColumnBuffer { const void* data, uint64_t* offsets = nullptr, uint8_t* validity = nullptr) { - this->num_cells_ = num_elems; - this->data_.resize(num_elems); - this->data_.assign((std::byte*)data, (std::byte*)data + num_elems); + num_cells_ = num_elems; if (offsets != nullptr) { - offsets_.assign((uint32_t*)offsets, (uint32_t*)offsets + num_elems); + // TODO this can be either a unit32_t or uint64_t pointer + offsets_.resize(num_elems + 1); + offsets_.assign( + (uint32_t*)offsets, 
(uint32_t*)offsets + num_elems + 1); + + data_.resize(offsets_[num_elems + 1]); + data_.assign( + (std::byte*)data, (std::byte*)data + offsets_[num_elems]); + } else { + data_.resize(num_elems); + data_.assign( + (std::byte*)data, (std::byte*)data + num_elems * type_size_); } if (validity != nullptr) { diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 0594c1d875..8b094d93d8 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -99,117 +99,136 @@ void ManagedQuery::select_columns( void ManagedQuery::set_column_data( std::shared_ptr column_buffer) { auto column_name = std::string(column_buffer->name()); - if (array_->schema().array_type() == TILEDB_SPARSE || - schema_->has_attribute(column_name)) { + bool has_attr = schema_->has_attribute(column_name); + bool is_sparse = array_->schema().array_type() == TILEDB_SPARSE; + + if (is_sparse) { auto data = column_buffer->data(); query_->set_data_buffer( column_name, (void*)data.data(), column_buffer->is_var() ? 
- column_buffer->offsets()[column_buffer->offsets().size() - 1] : + column_buffer->offsets()[column_buffer->offsets().size()] : data.size_bytes()); + if (column_buffer->is_var()) { // Remove one offset for TileDB, which checks that the // offsets and validity buffers are the same size auto offsets = column_buffer->offsets(); query_->set_offsets_buffer( - column_name, offsets.data(), offsets.size()); - - std::cout << "offset in ManagedQuery::set_column_data for " - << column_name << std::endl; - for (auto os : offsets) - std::cout << os << " "; - std::cout << std::endl; + column_name, offsets.data(), offsets.size() - 1); } if (column_buffer->is_nullable()) { auto validity = column_buffer->validity(); query_->set_validity_buffer( column_name, validity.data(), validity.size()); } - // column_buffer->attach(*query_); } else { - switch (column_buffer->type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_FLOAT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT8: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT16: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT32: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT32: - 
subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_UINT64: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - case TILEDB_INT64: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - subarray_->add_range( - column_name, - column_buffer->data()[0], - column_buffer->data()[1]); - break; - default: - break; + if (has_attr) { + auto data = column_buffer->data(); + query_->set_data_buffer( + column_name, + (void*)data.data(), + column_buffer->is_var() ? + column_buffer + ->offsets()[column_buffer->offsets().size() - 1] : + data.size_bytes()); + if (column_buffer->is_var()) { + // Remove one offset for TileDB, which checks that the + // offsets and validity buffers are the same size + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), offsets.size()); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), validity.size()); + } + } else { + switch (column_buffer->type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + case TILEDB_BLOB: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_FLOAT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT8: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + 
break; + case TILEDB_UINT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT16: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT32: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_UINT64: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + case TILEDB_INT64: + case TILEDB_TIME_SEC: + case TILEDB_TIME_MS: + case TILEDB_TIME_US: + case TILEDB_TIME_NS: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + subarray_->add_range( + column_name, + column_buffer->data()[0], + column_buffer->data()[1]); + break; + default: + break; + } + query_->set_subarray(*subarray_); } - query_->set_subarray(*subarray_); } } @@ -354,5 +373,4 @@ void ManagedQuery::check_column_name(const std::string& name) { name)); } } - }; // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e7434cb6eb..0a340834c3 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -128,7 +128,6 @@ SOMAArray::SOMAArray( ctx_ = std::make_shared(platform_config); validate(mode, name, timestamp); reset(column_names, batch_size, result_order); - fill_metadata_cache(); } SOMAArray::SOMAArray( @@ -146,29 +145,29 @@ SOMAArray::SOMAArray( , timestamp_(timestamp) { validate(mode, name, timestamp); reset(column_names, batch_size, result_order); - fill_metadata_cache(); } -void SOMAArray::fill_metadata_cache() { - std::shared_ptr array; - if (arr_->query_type() == TILEDB_WRITE) { - array = std::make_shared(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); - } else { - array = 
arr_; - } - - for (uint64_t idx = 0; idx < array->metadata_num(); ++idx) { - std::string key; - tiledb_datatype_t value_type; - uint32_t value_num; - const void* value; - array->get_metadata_from_index( - idx, &key, &value_type, &value_num, &value); - MetadataValue mdval(value_type, value_num, value); - std::pair mdpair(key, mdval); - metadata_.insert(mdpair); - } -} +// void SOMAArray::fill_metadata_cache() { +// std::shared_ptr array; +// if (arr_->query_type() == TILEDB_WRITE) { +// array = std::make_shared(*ctx_->tiledb_ctx(), uri_, +// TILEDB_READ); +// } else { +// array = arr_; +// } + +// for (uint64_t idx = 0; idx < array->metadata_num(); ++idx) { +// std::string key; +// tiledb_datatype_t value_type; +// uint32_t value_num; +// const void* value; +// array->get_metadata_from_index( +// idx, &key, &value_type, &value_num, &value); +// MetadataValue mdval(value_type, value_num, value); +// std::pair mdpair(key, mdval); +// metadata_.insert(mdpair); +// } +// } const std::string SOMAArray::uri() const { return uri_; @@ -551,10 +550,6 @@ void SOMAArray::set_metadata( throw TileDBSOMAError("soma_object_type cannot be modified."); } arr_->put_metadata(key, value_type, value_num, value); - - MetadataValue mdval(value_type, value_num, value); - std::pair mdpair(key, mdval); - metadata_.insert(mdpair); } void SOMAArray::delete_metadata(const std::string& key) { @@ -562,26 +557,44 @@ void SOMAArray::delete_metadata(const std::string& key) { throw TileDBSOMAError("soma_object_type cannot be deleted."); } arr_->delete_metadata(key); - metadata_.erase(key); -} - -std::map SOMAArray::get_metadata() { - return metadata_; } std::optional SOMAArray::get_metadata(const std::string& key) { - if (metadata_.count(key) == 0) { + tiledb_datatype_t value_type; + uint32_t value_num; + const void* value; + + arr_->get_metadata(key, &value_type, &value_num, &value); + + if (value == nullptr) return std::nullopt; + + return MetadataValue(value_type, value_num, value); +} + +std::map 
SOMAArray::get_metadata() { + std::map meta; + + std::string key; + tiledb_datatype_t value_type; + uint32_t value_num; + const void* value; + + for (uint64_t idx = 0; idx < arr_->metadata_num(); ++idx) { + arr_->get_metadata_from_index( + idx, &key, &value_type, &value_num, &value); + meta[key] = MetadataValue(value_type, value_num, value); } - return metadata_[key]; + + return meta; } bool SOMAArray::has_metadata(const std::string& key) { - return metadata_.count(key) != 0; + return get_metadata(key) == std::nullopt; } uint64_t SOMAArray::metadata_num() const { - return metadata_.size(); + return arr_->metadata_num(); } void SOMAArray::validate( diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 418fb10321..7011198ada 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -173,7 +173,6 @@ class SOMAArray : public SOMAObject { , ctx_(other.ctx_) , batch_size_(other.batch_size_) , result_order_(other.result_order_) - , metadata_(other.metadata_) , timestamp_(other.timestamp_) , mq_(std::make_unique( other.arr_, other.ctx_->tiledb_ctx(), other.name_)) @@ -689,11 +688,6 @@ class SOMAArray : public SOMAObject { //= private non-static //=================================================================== - /** - * Fills the metadata cache upon opening the array. 
- */ - void fill_metadata_cache(); - // SOMAArray URI std::string uri_; @@ -709,9 +703,6 @@ class SOMAArray : public SOMAObject { // Result order ResultOrder result_order_; - // Metadata cache - std::map metadata_; - // Read timestamp range (start, end) std::optional> timestamp_; diff --git a/libtiledbsoma/src/soma/soma_group.cc b/libtiledbsoma/src/soma/soma_group.cc index 4d354d6543..a8e4a643c4 100644 --- a/libtiledbsoma/src/soma/soma_group.cc +++ b/libtiledbsoma/src/soma/soma_group.cc @@ -200,37 +200,51 @@ void SOMAGroup::set_metadata( } group_->put_metadata(key, value_type, value_num, value); - MetadataValue mdval(value_type, value_num, value); - std::pair mdpair(key, mdval); - metadata_.insert(mdpair); } void SOMAGroup::delete_metadata(const std::string& key) { if (key.compare("soma_object_type") == 0) { throw TileDBSOMAError("soma_object_type cannot be deleted."); } - group_->delete_metadata(key); - metadata_.erase(key); -} - -std::map SOMAGroup::get_metadata() { - return metadata_; } std::optional SOMAGroup::get_metadata(const std::string& key) { - if (metadata_.count(key) == 0) { + tiledb_datatype_t value_type; + uint32_t value_num; + const void* value; + + group_->get_metadata(key, &value_type, &value_num, &value); + + if (value == nullptr) return std::nullopt; + + return MetadataValue(value_type, value_num, value); +} + +std::map SOMAGroup::get_metadata() { + std::map meta; + + std::string key; + tiledb_datatype_t value_type; + uint32_t value_num; + const void* value; + + for (uint64_t idx = 0; idx < group_->metadata_num(); ++idx) { + group_->get_metadata_from_index( + idx, &key, &value_type, &value_num, &value); + meta[key] = MetadataValue(value_type, value_num, value); } - return metadata_[key]; + + return meta; } bool SOMAGroup::has_metadata(const std::string& key) { - return metadata_.count(key) != 0; + return get_metadata(key) == std::nullopt; } uint64_t SOMAGroup::metadata_num() const { - return metadata_.size(); + return group_->metadata_num(); } } 
// namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_group.h b/libtiledbsoma/src/soma/soma_group.h index 27c3c5010f..e7aebeb637 100644 --- a/libtiledbsoma/src/soma/soma_group.h +++ b/libtiledbsoma/src/soma/soma_group.h @@ -251,9 +251,16 @@ class SOMAGroup : public SOMAObject { * @return MetadataValue (std::tuple) */ - std::map get_metadata(); std::optional get_metadata(const std::string& key); + /** + * Get a mapping of all metadata keys with its associated value datatype, + * number of values, and value in binary form. + * + * @return std::map + */ + std::map get_metadata(); + /** * Check if the key exists in metadata from an open group. The group must * be opened in READ mode, otherwise the function will error out. diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 8499a6fc96..60be571c46 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -250,35 +250,37 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); - for (size_t col_idx = 0; col_idx < index_column_names.size(); ++col_idx) { - for (int64_t schema_idx = 0; schema_idx < arrow_schema->n_children; - ++schema_idx) { - auto child = arrow_schema->children[schema_idx]; - auto type = ArrowAdapter::to_tiledb_format(child->format); - if (child->name == index_column_names[col_idx]) { - auto dim = Dimension::create( - *ctx, - child->name, - type, - domains->children[col_idx]->buffers[1], - extents->children[col_idx]->buffers[1]); - - domain.add_dimension(dim); - } else { - Attribute attr(*ctx, child->name, type); - - // if (child->flags & ARROW_FLAG_NULLABLE) { - // attr.set_nullable(true); - // } - - if ((strcmp(child->format, "U") == 0) | - (strcmp(child->format, "Z") == 0) | - (strcmp(child->format, "u") == 0) | - (strcmp(child->format, "z") == 0)) { - attr.set_cell_val_num(TILEDB_VAR_NUM); - } - 
schema.add_attribute(attr); + for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { + auto child = arrow_schema->children[sch_idx]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + + auto idx_col_begin = index_column_names.begin(); + auto idx_col_end = index_column_names.end(); + auto idx_col_it = std::find(idx_col_begin, idx_col_end, child->name); + if (idx_col_it != idx_col_end) { + auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); + auto dim = Dimension::create( + *ctx, + child->name, + type, + domains->children[idx_col_idx]->buffers[1], + extents->children[idx_col_idx]->buffers[1]); + + domain.add_dimension(dim); + } else { + Attribute attr(*ctx, child->name, type); + + // if (child->flags & ARROW_FLAG_NULLABLE) { + // attr.set_nullable(true); + // } + + if ((strcmp(child->format, "U") == 0) | + (strcmp(child->format, "Z") == 0) | + (strcmp(child->format, "u") == 0) | + (strcmp(child->format, "z") == 0)) { + attr.set_cell_val_num(TILEDB_VAR_NUM); } + schema.add_attribute(attr); } } From 004a06ad803a76a4f3753c37e15913b3ee6a3742 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 28 Feb 2024 06:28:31 -0600 Subject: [PATCH 27/70] WIP create methods should be void --- apis/python/src/tiledbsoma/_dataframe.py | 2 +- apis/python/src/tiledbsoma/soma_array.cc | 5 +-- apis/python/src/tiledbsoma/soma_dataframe.cc | 2 +- libtiledbsoma/src/soma/soma_collection.cc | 32 +++++++++++-------- libtiledbsoma/src/soma/soma_collection.h | 3 +- libtiledbsoma/src/soma/soma_dataframe.cc | 3 +- libtiledbsoma/src/soma/soma_dataframe.h | 3 +- libtiledbsoma/src/soma/soma_dense_ndarray.cc | 3 +- libtiledbsoma/src/soma/soma_dense_ndarray.h | 2 +- libtiledbsoma/src/soma/soma_experiment.cc | 10 ++++-- libtiledbsoma/src/soma/soma_experiment.h | 18 ++++++++++- libtiledbsoma/src/soma/soma_measurement.cc | 10 ++++-- libtiledbsoma/src/soma/soma_measurement.h | 18 ++++++++++- libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 3 +- 
libtiledbsoma/src/soma/soma_sparse_ndarray.h | 2 +- libtiledbsoma/test/unit_soma_collection.cc | 22 ++++++++----- 16 files changed, 95 insertions(+), 43 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 3fba1426e8..f8e5ffa000 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -242,7 +242,7 @@ def create( extents.append([extent]) # TODO add as kw args - handle = clib.SOMADataFrame.create( + clib.SOMADataFrame.create( uri, schema, index_column_names, diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 7e4f318d2f..f4f3572657 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -71,7 +71,7 @@ void write(SOMAArray& array, py::handle py_batch) { array.write(); } -py::dict meta(SOMAArray& array){ +py::dict meta(SOMAArray& array) { py::dict results; for (auto [key, val] : array.get_metadata()) { @@ -83,7 +83,8 @@ py::dict meta(SOMAArray& array){ results[py::str(key)] = res; } else { py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); - auto res = py::array(value_type, value_num, value).attr("item")(0);; + auto res = py::array(value_type, value_num, value).attr("item")(0); + ; results[py::str(key)] = res; } } diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 9d14837415..24ab5076a1 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -69,7 +69,7 @@ void load_soma_dataframe(py::module& m) { uintptr_t extents_ptr = (uintptr_t)(&extents); py_extents.attr("_export_to_c")(extents_ptr); - return SOMADataFrame::create( + SOMADataFrame::create( uri, std::make_shared(schema), ColumnIndexInfo( diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index c6c6e3c23a..c86234f7e5 100644 --- 
a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -41,10 +41,9 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMACollection::create( +void SOMACollection::create( std::string_view uri, std::shared_ptr ctx) { SOMAGroup::create(ctx, uri, "SOMACollection"); - return SOMACollection::open(uri, OpenMode::read, ctx); } std::unique_ptr SOMACollection::open( @@ -96,7 +95,9 @@ std::shared_ptr SOMACollection::add_new_collection( std::string_view uri, URIType uri_type, std::shared_ptr ctx) { - std::shared_ptr member = SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx); + std::shared_ptr member = SOMAExperiment::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -109,8 +110,9 @@ std::shared_ptr SOMACollection::add_new_experiment( std::shared_ptr ctx, std::shared_ptr schema, ColumnIndexInfo index_columns) { - std::shared_ptr member = SOMAExperiment::create( - uri, schema, index_columns, ctx); + SOMAExperiment::create(uri, schema, index_columns, ctx); + std::shared_ptr member = SOMAExperiment::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -123,8 +125,9 @@ std::shared_ptr SOMACollection::add_new_measurement( std::shared_ptr ctx, std::shared_ptr schema, ColumnIndexInfo index_columns) { - std::shared_ptr member = SOMAMeasurement::create( - uri, schema, index_columns, ctx); + SOMAMeasurement::create(uri, schema, index_columns, ctx); + std::shared_ptr member = SOMAMeasurement::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -137,8 +140,9 @@ std::shared_ptr SOMACollection::add_new_dataframe( std::shared_ptr ctx, std::shared_ptr schema, 
ColumnIndexInfo index_columns) { - std::shared_ptr member = SOMADataFrame::create( - uri, schema, index_columns, ctx); + SOMADataFrame::create(uri, schema, index_columns, ctx); + std::shared_ptr member = SOMADataFrame::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -150,8 +154,9 @@ std::shared_ptr SOMACollection::add_new_dense_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - std::shared_ptr member = SOMADenseNDArray::create( - uri, schema, ctx); + SOMADenseNDArray::create(uri, schema, ctx); + std::shared_ptr member = SOMADenseNDArray::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; @@ -163,8 +168,9 @@ std::shared_ptr SOMACollection::add_new_sparse_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - std::shared_ptr member = SOMASparseNDArray::create( - uri, schema, ctx); + SOMASparseNDArray::create(uri, schema, ctx); + std::shared_ptr member = SOMASparseNDArray::open( + uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); children_[std::string(key)] = member; return member; diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 14b308ba11..57b2d139f1 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -61,8 +61,7 @@ class SOMACollection : public SOMAGroup { * @param ctx TileDB context * @param uri URI to create the SOMACollection */ - static std::unique_ptr create( - std::string_view uri, std::shared_ptr ctx); + static void create(std::string_view uri, std::shared_ptr ctx); /** * @brief Open a group at the specified URI and return SOMACollection diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index a97711d845..0086223b43 100644 --- 
a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -39,7 +39,7 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADataFrame::create( +void SOMADataFrame::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, @@ -47,7 +47,6 @@ std::unique_ptr SOMADataFrame::create( auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( ctx->tiledb_ctx(), schema, index_columns); SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame"); - return SOMADataFrame::open(uri, OpenMode::read, ctx); } std::unique_ptr SOMADataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index bfd3fcb6c0..6124d32a32 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -55,9 +55,8 @@ class SOMADataFrame : public SOMAArray { * @param uri URI to create the SOMADataFrame * @param schema TileDB ArraySchema * @param platform_config Optional config parameter dictionary - * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 057f52084e..a3f43abac4 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -38,12 +38,11 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADenseNDArray::create( +void SOMADenseNDArray::create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx) { SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray"); - return SOMADenseNDArray::open(uri, OpenMode::read, ctx); } std::unique_ptr SOMADenseNDArray::open( diff --git 
a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index cd12a23be9..f072bdac6c 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -57,7 +57,7 @@ class SOMADenseNDArray : public SOMAArray { * @param platform_config Optional config parameter dictionary * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx); diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index 0d4ff5dab5..643f171abe 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -41,7 +41,7 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAExperiment::create( +void SOMAExperiment::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, @@ -56,7 +56,13 @@ std::unique_ptr SOMAExperiment::create( group->set(exp_uri + "/obs", URIType::absolute, "obs"); group->set(exp_uri + "/ms", URIType::absolute, "ms"); group->close(); +} - return std::make_unique(OpenMode::read, exp_uri, ctx); +std::unique_ptr SOMAExperiment::open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp) { + return std::make_unique(mode, uri, ctx, timestamp); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index 33c4c79b22..5f6bc7094b 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -54,12 +54,28 @@ class SOMAExperiment : public SOMACollection { * @param schema TileDB ArraySchema * @param platform_config Optional config parameter dictionary */ - static std::unique_ptr create( + static void create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo 
index_columns, std::shared_ptr ctx); + /** + * @brief Open a group at the specified URI and return SOMAExperiment + * object. + * + * @param uri URI of the array + * @param mode read or write + * @param ctx TileDB context + * @param timestamp Optional pair indicating timestamp start and end + * @return std::shared_ptr SOMAExperiment + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); + //=================================================================== //= public non-static //=================================================================== diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index d2628cfad5..a3aa242d19 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -41,7 +41,7 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAMeasurement::create( +void SOMAMeasurement::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, @@ -64,7 +64,13 @@ std::unique_ptr SOMAMeasurement::create( group->set(exp_uri + "/varm", URIType::absolute, "varm"); group->set(exp_uri + "/varp", URIType::absolute, "varp"); group->close(); +} - return std::make_unique(OpenMode::read, uri, ctx); +std::unique_ptr SOMAMeasurement::open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp) { + return std::make_unique(mode, uri, ctx, timestamp); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index 5a44f72d1d..8530ac0bd7 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -55,12 +55,28 @@ class SOMAMeasurement : public SOMACollection { * @param schema TileDB ArraySchema * @param ctx TileDB context */ - static 
std::unique_ptr create( + static void create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx); + /** + * @brief Open a group at the specified URI and return SOMAMeasurement + * object. + * + * @param uri URI of the array + * @param mode read or write + * @param ctx TileDB context + * @param timestamp Optional pair indicating timestamp start and end + * @return std::shared_ptr SOMAMeasurement + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); + //=================================================================== //= public non-static //=================================================================== diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index e0ce770ee6..f211bda77b 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -39,12 +39,11 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMASparseNDArray::create( +void SOMASparseNDArray::create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx) { SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray"); - return SOMASparseNDArray::open(uri, OpenMode::read, ctx); } std::unique_ptr SOMASparseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index a18519e807..43627231fd 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -57,7 +57,7 @@ class SOMASparseNDArray : public SOMAArray { * @param platform_config Optional config parameter dictionary * @return std::shared_ptr opened in read mode */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx); diff --git 
a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index 3132a5bd33..ae1b619041 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -36,7 +36,8 @@ TEST_CASE("SOMACollection: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection-basic"; - auto soma_collection = SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx); + auto soma_collection = SOMACollection::open(uri, OpenMode::read, ctx); REQUIRE(soma_collection->uri() == uri); REQUIRE(soma_collection->ctx() == ctx); REQUIRE(soma_collection->type() == "SOMACollection"); @@ -251,7 +252,8 @@ TEST_CASE("SOMAExperiment: metadata") { soma_experiment->set_metadata("md", TILEDB_INT32, 1, &val); soma_experiment->close(); - soma_experiment->open(OpenMode::read, std::pair(1, 1)); + soma_experiment = SOMAExperiment::open( + uri, OpenMode::read, ctx, std::pair(1, 1)); REQUIRE(soma_experiment->metadata_num() == 2); REQUIRE(soma_experiment->has_metadata("soma_object_type") == true); REQUIRE(soma_experiment->has_metadata("md") == true); @@ -262,7 +264,8 @@ TEST_CASE("SOMAExperiment: metadata") { REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_experiment->close(); - soma_experiment->open(OpenMode::write, std::pair(2, 2)); + soma_experiment = SOMAExperiment::open( + uri, OpenMode::write, ctx, std::pair(2, 2)); // Metadata should also be retrievable in write mode mdval = soma_experiment->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); @@ -271,7 +274,8 @@ TEST_CASE("SOMAExperiment: metadata") { REQUIRE(!mdval.has_value()); soma_experiment->close(); - soma_experiment->open(OpenMode::read, std::pair(3, 3)); + soma_experiment = SOMAExperiment::open( + uri, OpenMode::read, ctx, std::pair(3, 3)); REQUIRE(soma_experiment->has_metadata("md") == false); REQUIRE(soma_experiment->metadata_num() == 1); soma_experiment->close(); @@ -289,7 +293,8 @@ 
TEST_CASE("SOMAMeasurement: metadata") { soma_measurement->set_metadata("md", TILEDB_INT32, 1, &val); soma_measurement->close(); - soma_measurement->open(OpenMode::read, std::pair(1, 1)); + soma_measurement = SOMAMeasurement::open( + uri, OpenMode::read, ctx, std::pair(1, 1)); REQUIRE(soma_measurement->metadata_num() == 2); REQUIRE(soma_measurement->has_metadata("soma_object_type") == true); REQUIRE(soma_measurement->has_metadata("md") == true); @@ -300,8 +305,8 @@ TEST_CASE("SOMAMeasurement: metadata") { REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_measurement->close(); - soma_measurement->open( - OpenMode::write, std::pair(2, 2)); + soma_measurement = SOMAMeasurement::open( + uri, OpenMode::write, ctx, std::pair(2, 2)); // Metadata should also be retrievable in write mode mdval = soma_measurement->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); @@ -310,7 +315,8 @@ TEST_CASE("SOMAMeasurement: metadata") { REQUIRE(!mdval.has_value()); soma_measurement->close(); - soma_measurement->open(OpenMode::read, std::pair(3, 3)); + soma_measurement = SOMAMeasurement::open( + uri, OpenMode::read, ctx, std::pair(3, 3)); REQUIRE(soma_measurement->has_metadata("md") == false); REQUIRE(soma_measurement->metadata_num() == 1); soma_measurement->close(); From 10a0d320853c6780928dbdbbcb41c10d5fede8f1 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 28 Feb 2024 14:59:39 -0600 Subject: [PATCH 28/70] WIP fix errors related to span indexing past length for offset --- apis/python/src/tiledbsoma/_dataframe.py | 2 ++ libtiledbsoma/src/soma/column_buffer.h | 24 +++++++++++++++++++----- libtiledbsoma/src/soma/managed_query.cc | 15 +++------------ 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index f8e5ffa000..9e1a336b24 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -515,7 +515,9 @@ def write( 
# the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. + print("WRITER") for batch in values.to_batches(): + print(batch) self._handle.write(batch) tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index c65bbb7c99..2a12dfa7ba 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -131,14 +131,16 @@ class ColumnBuffer { if (offsets != nullptr) { // TODO this can be either a unit32_t or uint64_t pointer - offsets_.resize(num_elems + 1); + auto num_offsets = num_elems + 1; + offsets_.resize(num_offsets); offsets_.assign( - (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); + (uint32_t*)offsets, (uint32_t*)offsets + num_offsets); - data_.resize(offsets_[num_elems + 1]); - data_.assign( - (std::byte*)data, (std::byte*)data + offsets_[num_elems]); + data_size_ = offsets_[num_offsets - 1]; + data_.resize(data_size_); + data_.assign((std::byte*)data, (std::byte*)data + data_size_); } else { + data_size_ = num_elems; data_.resize(num_elems); data_.assign( (std::byte*)data, (std::byte*)data + num_elems * type_size_); @@ -165,6 +167,15 @@ class ColumnBuffer { return num_cells_; } + /** + * @brief Return size of the data buffer. + * + * @return uint64_t + */ + uint64_t data_size() { + return data_size_; + } + /** * @brief Return a view of the ColumnBuffer data. * @@ -383,6 +394,9 @@ class ColumnBuffer { // Data type of the column from the schema. tiledb_datatype_t type_; + // Data size which is calculated different for var vs non-var + uint64_t data_size_; + // Bytes per element. 
uint64_t type_size_; diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 8b094d93d8..9164f7fa80 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -105,11 +105,7 @@ void ManagedQuery::set_column_data( if (is_sparse) { auto data = column_buffer->data(); query_->set_data_buffer( - column_name, - (void*)data.data(), - column_buffer->is_var() ? - column_buffer->offsets()[column_buffer->offsets().size()] : - data.size_bytes()); + column_name, (void*)data.data(), column_buffer->data_size()); if (column_buffer->is_var()) { // Remove one offset for TileDB, which checks that the @@ -127,18 +123,13 @@ void ManagedQuery::set_column_data( if (has_attr) { auto data = column_buffer->data(); query_->set_data_buffer( - column_name, - (void*)data.data(), - column_buffer->is_var() ? - column_buffer - ->offsets()[column_buffer->offsets().size() - 1] : - data.size_bytes()); + column_name, (void*)data.data(), column_buffer->data_size()); if (column_buffer->is_var()) { // Remove one offset for TileDB, which checks that the // offsets and validity buffers are the same size auto offsets = column_buffer->offsets(); query_->set_offsets_buffer( - column_name, offsets.data(), offsets.size()); + column_name, offsets.data(), offsets.size() - 1); } if (column_buffer->is_nullable()) { auto validity = column_buffer->validity(); From 9a8b096b786865a37a721f4c6421f1de9af71b12 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 28 Feb 2024 19:53:52 -0600 Subject: [PATCH 29/70] WIP add timestamps to create functions --- apis/python/src/tiledbsoma/_dataframe.py | 2 -- libtiledbsoma/src/soma/soma_array.cc | 19 +++++++++++++------ libtiledbsoma/src/soma/soma_array.h | 3 ++- libtiledbsoma/src/soma/soma_dataframe.cc | 5 +++-- libtiledbsoma/src/soma/soma_dataframe.h | 3 ++- libtiledbsoma/src/soma/soma_dense_ndarray.cc | 5 +++-- libtiledbsoma/src/soma/soma_dense_ndarray.h | 3 ++- 
libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 5 +++-- libtiledbsoma/src/soma/soma_sparse_ndarray.h | 3 ++- libtiledbsoma/test/unit_soma_array.cc | 9 +++++---- 10 files changed, 35 insertions(+), 22 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 9e1a336b24..f8e5ffa000 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -515,9 +515,7 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - print("WRITER") for batch in values.to_batches(): - print(batch) self._handle.write(batch) tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 0a340834c3..a37f55a312 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -45,24 +45,31 @@ void SOMAArray::create( std::shared_ptr ctx, std::string_view uri, ArraySchema schema, - std::string soma_type) { + std::string soma_type, + std::optional> timestamp) { Array::create(std::string(uri), schema); - auto array = Array(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); - array.put_metadata( + std::shared_ptr array; + if(timestamp){ + array = std::make_shared(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE, TemporalPolicy(TimestampStartEnd, timestamp->first, timestamp->second)); + }else{ + array = std::make_shared(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + } + + array->put_metadata( "soma_object_type", TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); std::string encoding_version = "1"; - array.put_metadata( + array->put_metadata( "soma_encoding_version", TILEDB_STRING_UTF8, 
static_cast(encoding_version.length()), encoding_version.c_str()); - array.close(); + array->close(); } std::unique_ptr SOMAArray::open( @@ -590,7 +597,7 @@ std::map SOMAArray::get_metadata() { } bool SOMAArray::has_metadata(const std::string& key) { - return get_metadata(key) == std::nullopt; + return get_metadata(key) != std::nullopt; } uint64_t SOMAArray::metadata_num() const { diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 7011198ada..f2f59448a6 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -67,7 +67,8 @@ class SOMAArray : public SOMAObject { std::shared_ptr ctx, std::string_view uri, ArraySchema schema, - std::string soma_type); + std::string soma_type, + std::optional> timestamp = std::nullopt); /** * @brief Open an array at the specified URI and return SOMAArray diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 0086223b43..6a085ab1a7 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -43,10 +43,11 @@ void SOMADataFrame::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx) { + std::shared_ptr ctx, + std::optional> timestamp) { auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( ctx->tiledb_ctx(), schema, index_columns); - SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame"); + SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); } std::unique_ptr SOMADataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 6124d32a32..62dd425fd9 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -60,7 +60,8 @@ class SOMADataFrame : public SOMAArray { std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx); + std::shared_ptr ctx, + 
std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMADataFrame object at the given URI. diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index a3f43abac4..1bcc1c4efd 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -41,8 +41,9 @@ using namespace tiledb; void SOMADenseNDArray::create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray"); + std::shared_ptr ctx, + std::optional> timestamp) { + SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray", timestamp); } std::unique_ptr SOMADenseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index f072bdac6c..a55ba630d3 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -60,7 +60,8 @@ class SOMADenseNDArray : public SOMAArray { static void create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMADenseNDArray object at the given URI. 
diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index f211bda77b..f7b0cd4ecd 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -42,8 +42,9 @@ using namespace tiledb; void SOMASparseNDArray::create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx) { - SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray"); + std::shared_ptr ctx, + std::optional> timestamp) { + SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray", timestamp); } std::unique_ptr SOMASparseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index 43627231fd..4bd9cddee7 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -60,7 +60,8 @@ class SOMASparseNDArray : public SOMAArray { static void create( std::string_view uri, ArraySchema schema, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional> timestamp = std::nullopt); /** * @brief Open and return a SOMASparseNDArray object at the given URI. 
diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 3dcddf6832..92778b27cc 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -86,7 +86,7 @@ std::tuple create_array( schema.check(); // Create array - SOMAArray::create(ctx, uri, schema, "NONE"); + SOMAArray::create(ctx, uri, schema, "NONE", std::pair(1, 1)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -377,9 +377,10 @@ TEST_CASE("SOMAArray: metadata") { soma_array->set_metadata("md", TILEDB_INT32, 1, &val); soma_array->close(); - soma_array->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_array->metadata_num() == 2); + soma_array->open(OpenMode::read, std::pair(0, 1)); + REQUIRE(soma_array->metadata_num() == 3); REQUIRE(soma_array->has_metadata("soma_object_type") == true); + REQUIRE(soma_array->has_metadata("soma_encoding_version") == true); REQUIRE(soma_array->has_metadata("md") == true); auto mdval = soma_array->get_metadata("md"); @@ -397,7 +398,7 @@ TEST_CASE("SOMAArray: metadata") { REQUIRE(!mdval.has_value()); soma_array->close(); - soma_array->open(OpenMode::read, std::pair(3, 3)); + soma_array->open(OpenMode::read, std::pair(0, 2)); REQUIRE(soma_array->has_metadata("md") == false); REQUIRE(soma_array->metadata_num() == 1); soma_array->close(); From 15d649d0123ddf657ab8df330585866aa2274432 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 29 Feb 2024 14:26:41 -0600 Subject: [PATCH 30/70] WIP correct metadata for soma array --- libtiledbsoma/src/soma/soma_array.cc | 179 +++++++++++++++----------- libtiledbsoma/src/soma/soma_array.h | 12 ++ libtiledbsoma/src/soma/soma_group.cc | 10 +- libtiledbsoma/src/utils/common.h | 4 + libtiledbsoma/test/unit_soma_array.cc | 37 ++++-- 5 files changed, 153 insertions(+), 89 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index a37f55a312..f6643d4817 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ 
b/libtiledbsoma/src/soma/soma_array.cc @@ -50,24 +50,29 @@ void SOMAArray::create( Array::create(std::string(uri), schema); std::shared_ptr array; - if(timestamp){ - array = std::make_shared(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE, TemporalPolicy(TimestampStartEnd, timestamp->first, timestamp->second)); - }else{ - array = std::make_shared(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + if (timestamp) { + array = std::make_shared( + *ctx->tiledb_ctx(), + std::string(uri), + TILEDB_WRITE, + TemporalPolicy( + TimestampStartEnd, timestamp->first, timestamp->second)); + } else { + array = std::make_shared( + *ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); } array->put_metadata( - "soma_object_type", + SOMA_OBJECT_TYPE_KEY, TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); - std::string encoding_version = "1"; array->put_metadata( - "soma_encoding_version", + ENCODING_VERSION_KEY, TILEDB_STRING_UTF8, - static_cast(encoding_version.length()), - encoding_version.c_str()); + static_cast(ENCODING_VERSION_VAL.length()), + ENCODING_VERSION_VAL.c_str()); array->close(); } @@ -135,6 +140,7 @@ SOMAArray::SOMAArray( ctx_ = std::make_shared(platform_config); validate(mode, name, timestamp); reset(column_names, batch_size, result_order); + fill_metadata_cache(); } SOMAArray::SOMAArray( @@ -152,29 +158,33 @@ SOMAArray::SOMAArray( , timestamp_(timestamp) { validate(mode, name, timestamp); reset(column_names, batch_size, result_order); + fill_metadata_cache(); } -// void SOMAArray::fill_metadata_cache() { -// std::shared_ptr array; -// if (arr_->query_type() == TILEDB_WRITE) { -// array = std::make_shared(*ctx_->tiledb_ctx(), uri_, -// TILEDB_READ); -// } else { -// array = arr_; -// } - -// for (uint64_t idx = 0; idx < array->metadata_num(); ++idx) { -// std::string key; -// tiledb_datatype_t value_type; -// uint32_t value_num; -// const void* value; -// array->get_metadata_from_index( -// idx, &key, &value_type, &value_num, &value); -// 
MetadataValue mdval(value_type, value_num, value); -// std::pair mdpair(key, mdval); -// metadata_.insert(mdpair); -// } -// } +void SOMAArray::fill_metadata_cache() { + if (arr_->query_type() == TILEDB_WRITE) { + meta_cache_arr_ = std::make_shared( + *ctx_->tiledb_ctx(), + uri_, + TILEDB_READ, + TemporalPolicy( + TimestampStartEnd, timestamp()->first, timestamp()->second)); + } else { + meta_cache_arr_ = arr_; + } + + for (uint64_t idx = 0; idx < meta_cache_arr_->metadata_num(); ++idx) { + std::string key; + tiledb_datatype_t value_type; + uint32_t value_num; + const void* value; + meta_cache_arr_->get_metadata_from_index( + idx, &key, &value_type, &value_num, &value); + MetadataValue mdval(value_type, value_num, value); + std::pair mdpair(key, mdval); + metadata_.insert(mdpair); + } +} const std::string SOMAArray::uri() const { return uri_; @@ -186,24 +196,30 @@ std::shared_ptr SOMAArray::ctx() { void SOMAArray::open( OpenMode mode, std::optional> timestamp) { - auto tdb_mode = mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE; - arr_->open(tdb_mode); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - arr_->set_open_timestamp_start(timestamp->first); - arr_->set_open_timestamp_end(timestamp->second); - arr_->close(); - arr_->open(tdb_mode); - } + // auto tdb_mode = mode == OpenMode::read ? 
TILEDB_READ : TILEDB_WRITE; + // arr_->open(tdb_mode); + // if (timestamp) { + // if (timestamp->first > timestamp->second) { + // throw std::invalid_argument("timestamp start > end"); + // } + // arr_->set_open_timestamp_start(timestamp->first); + // arr_->set_open_timestamp_end(timestamp->second); + // arr_->close(); + // arr_->open(tdb_mode); + // } + timestamp_ = timestamp; + + validate(mode, name_, timestamp); reset(column_names(), batch_size_, result_order_); + fill_metadata_cache(); } void SOMAArray::close() { // Close the array through the managed query to ensure any pending queries // are completed. mq_->close(); + meta_cache_arr_->close(); + metadata_.clear(); } void SOMAArray::reset( @@ -553,55 +569,67 @@ void SOMAArray::set_metadata( tiledb_datatype_t value_type, uint32_t value_num, const void* value) { - if (key.compare("soma_object_type") == 0) { + if (key.compare("soma_object_type") == 0) throw TileDBSOMAError("soma_object_type cannot be modified."); - } + arr_->put_metadata(key, value_type, value_num, value); + MetadataValue mdval(value_type, value_num, value); + std::pair mdpair(key, mdval); + metadata_.insert(mdpair); } void SOMAArray::delete_metadata(const std::string& key) { - if (key.compare("soma_object_type") == 0) { + if (key.compare("soma_object_type") == 0) throw TileDBSOMAError("soma_object_type cannot be deleted."); - } + arr_->delete_metadata(key); + metadata_.erase(key); } std::optional SOMAArray::get_metadata(const std::string& key) { - tiledb_datatype_t value_type; - uint32_t value_num; - const void* value; + if (metadata_.count(key) == 0) + return std::nullopt; - arr_->get_metadata(key, &value_type, &value_num, &value); + return metadata_[key]; - if (value == nullptr) - return std::nullopt; + // tiledb_datatype_t value_type; + // uint32_t value_num; + // const void* value; + + // arr_->get_metadata(key, &value_type, &value_num, &value); - return MetadataValue(value_type, value_num, value); + // if (value == nullptr) + // return 
std::nullopt; + + // return MetadataValue(value_type, value_num, value); } std::map SOMAArray::get_metadata() { - std::map meta; + return metadata_; + // std::map meta; - std::string key; - tiledb_datatype_t value_type; - uint32_t value_num; - const void* value; + // std::string key; + // tiledb_datatype_t value_type; + // uint32_t value_num; + // const void* value; - for (uint64_t idx = 0; idx < arr_->metadata_num(); ++idx) { - arr_->get_metadata_from_index( - idx, &key, &value_type, &value_num, &value); - meta[key] = MetadataValue(value_type, value_num, value); - } + // for (uint64_t idx = 0; idx < arr_->metadata_num(); ++idx) { + // arr_->get_metadata_from_index( + // idx, &key, &value_type, &value_num, &value); + // meta[key] = MetadataValue(value_type, value_num, value); + // } - return meta; + // return meta; } bool SOMAArray::has_metadata(const std::string& key) { - return get_metadata(key) != std::nullopt; + return metadata_.count(key) != 0; + // return get_metadata(key) != std::nullopt; } uint64_t SOMAArray::metadata_num() const { - return arr_->metadata_num(); + return metadata_.size(); + // return arr_->metadata_num(); } void SOMAArray::validate( @@ -613,20 +641,15 @@ void SOMAArray::validate( try { LOG_DEBUG(fmt::format("[SOMAArray] opening array '{}'", uri_)); - arr_ = std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - arr_->set_open_timestamp_start(timestamp->first); - arr_->set_open_timestamp_end(timestamp->second); - arr_->close(); - arr_->open(tdb_mode); - LOG_DEBUG(fmt::format( - "[SOMAArray] timestamp_start = {}", - arr_->open_timestamp_start())); - LOG_DEBUG(fmt::format( - "[SOMAArray] timestamp_end = {}", arr_->open_timestamp_end())); + arr_ = std::make_shared( + *ctx_->tiledb_ctx(), + uri_, + tdb_mode, + TemporalPolicy( + TimestampStartEnd, timestamp->first, timestamp->second)); + } else { + arr_ = 
std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); } LOG_TRACE(fmt::format("[SOMAArray] loading enumerations")); ArrayExperimental::load_all_enumerations( diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index f2f59448a6..a7d5dfec57 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -174,6 +174,7 @@ class SOMAArray : public SOMAObject { , ctx_(other.ctx_) , batch_size_(other.batch_size_) , result_order_(other.result_order_) + , metadata_(other.metadata_) , timestamp_(other.timestamp_) , mq_(std::make_unique( other.arr_, other.ctx_->tiledb_ctx(), other.name_)) @@ -689,6 +690,9 @@ class SOMAArray : public SOMAObject { //= private non-static //=================================================================== + // Fills the metadata cache upon opening the array. + void fill_metadata_cache(); + // SOMAArray URI std::string uri_; @@ -704,6 +708,9 @@ class SOMAArray : public SOMAObject { // Result order ResultOrder result_order_; + // Metadata cache + std::map metadata_; + // Read timestamp range (start, end) std::optional> timestamp_; @@ -713,6 +720,11 @@ class SOMAArray : public SOMAObject { // Array associated with mq_ std::shared_ptr arr_; + // Array associated with metadata_. Metadata values need to be accessible in + // write mode as well. 
We need to keep this read-mode array alive in order + // for the metadata value pointers in the cache to be accessible + std::shared_ptr meta_cache_arr_; + // True if this is the first call to read_next() bool first_read_next_ = true; diff --git a/libtiledbsoma/src/soma/soma_group.cc b/libtiledbsoma/src/soma/soma_group.cc index a8e4a643c4..ea872d920e 100644 --- a/libtiledbsoma/src/soma/soma_group.cc +++ b/libtiledbsoma/src/soma/soma_group.cc @@ -47,11 +47,19 @@ void SOMAGroup::create( std::string soma_type) { Group::create(*ctx->tiledb_ctx(), std::string(uri)); auto group = Group(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + group.put_metadata( - "soma_object_type", + SOMA_OBJECT_TYPE_KEY, TILEDB_STRING_UTF8, static_cast(soma_type.length()), soma_type.c_str()); + + group.put_metadata( + ENCODING_VERSION_KEY, + TILEDB_STRING_UTF8, + static_cast(ENCODING_VERSION_VAL.length()), + ENCODING_VERSION_VAL.c_str()); + group.close(); } diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index 3928b04675..8c3ff11799 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -39,6 +39,10 @@ namespace tiledbsoma { +const std::string SOMA_OBJECT_TYPE_KEY = "soma_object_type"; +const std::string ENCODING_VERSION_KEY = "soma_encoding_version"; +const std::string ENCODING_VERSION_VAL = "1"; + using MetadataValue = std::tuple; enum MetadataInfo { dtype = 0, num, value }; diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 92778b27cc..547ca7b902 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -86,7 +86,8 @@ std::tuple create_array( schema.check(); // Create array - SOMAArray::create(ctx, uri, schema, "NONE", std::pair(1, 1)); + SOMAArray::create( + ctx, uri, schema, "NONE", std::pair(0, 2)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -362,8 +363,10 @@ TEST_CASE("SOMAArray: metadata") { auto ctx = 
std::make_shared(); std::string base_uri = "mem://unit-test-array"; + const auto& [uri, expected_nnz] = create_array(base_uri, ctx); + // Write md at (1, 1) auto soma_array = SOMAArray::open( OpenMode::write, uri, @@ -377,31 +380,45 @@ TEST_CASE("SOMAArray: metadata") { soma_array->set_metadata("md", TILEDB_INT32, 1, &val); soma_array->close(); - soma_array->open(OpenMode::read, std::pair(0, 1)); + // Read metadata + soma_array->open(OpenMode::read, std::pair(0, 2)); REQUIRE(soma_array->metadata_num() == 3); - REQUIRE(soma_array->has_metadata("soma_object_type") == true); - REQUIRE(soma_array->has_metadata("soma_encoding_version") == true); - REQUIRE(soma_array->has_metadata("md") == true); - + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(soma_array->has_metadata("md")); auto mdval = soma_array->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_array->close(); - soma_array->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_array->open(OpenMode::read, std::pair(2, 2)); + REQUIRE(soma_array->metadata_num() == 2); + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(!soma_array->has_metadata("md")); + soma_array->close(); + // Metadata should also be retrievable in write mode + soma_array->open(OpenMode::write, std::pair(0, 2)); + REQUIRE(soma_array->metadata_num() == 3); + REQUIRE(soma_array->has_metadata("soma_object_type")); + REQUIRE(soma_array->has_metadata("soma_encoding_version")); + REQUIRE(soma_array->has_metadata("md")); mdval = soma_array->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_array->delete_metadata("md"); mdval = soma_array->get_metadata("md"); 
REQUIRE(!mdval.has_value()); soma_array->close(); + // Confirm delete in read mode soma_array->open(OpenMode::read, std::pair(0, 2)); - REQUIRE(soma_array->has_metadata("md") == false); - REQUIRE(soma_array->metadata_num() == 1); - soma_array->close(); + REQUIRE(!soma_array->has_metadata("md")); + REQUIRE(soma_array->metadata_num() == 2); } TEST_CASE("SOMAArray: Test buffer size") { From d6c8c72fee011aa6e67741b1ab739963d0f5257b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 29 Feb 2024 17:37:02 -0600 Subject: [PATCH 31/70] [WIP] Refactor metadata --- libtiledbsoma/src/soma/soma_array.cc | 80 ++++------- libtiledbsoma/src/soma/soma_array.h | 19 ++- libtiledbsoma/src/soma/soma_collection.cc | 6 +- libtiledbsoma/src/soma/soma_collection.h | 5 +- libtiledbsoma/src/soma/soma_experiment.cc | 14 +- libtiledbsoma/src/soma/soma_experiment.h | 5 +- libtiledbsoma/src/soma/soma_group.cc | 124 ++++++++---------- libtiledbsoma/src/soma/soma_group.h | 28 +++- libtiledbsoma/src/soma/soma_measurement.cc | 21 +-- libtiledbsoma/src/soma/soma_measurement.h | 5 +- libtiledbsoma/src/soma/soma_object.cc | 5 +- libtiledbsoma/src/utils/common.h | 2 + libtiledbsoma/test/unit_soma_array.cc | 24 ++-- libtiledbsoma/test/unit_soma_collection.cc | 122 +++++++++++------ libtiledbsoma/test/unit_soma_dataframe.cc | 43 ++++-- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 38 ++++-- libtiledbsoma/test/unit_soma_group.cc | 55 ++++---- .../test/unit_soma_sparse_ndarray.cc | 40 ++++-- 18 files changed, 364 insertions(+), 272 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index f6643d4817..74b4eb19d1 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -46,19 +46,19 @@ void SOMAArray::create( std::string_view uri, ArraySchema schema, std::string soma_type, - std::optional> timestamp) { + std::optional timestamp) { Array::create(std::string(uri), schema); - std::shared_ptr array; + std::unique_ptr 
array; if (timestamp) { - array = std::make_shared( + array = std::make_unique( *ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE, TemporalPolicy( TimestampStartEnd, timestamp->first, timestamp->second)); } else { - array = std::make_shared( + array = std::make_unique( *ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); } @@ -85,7 +85,7 @@ std::unique_ptr SOMAArray::open( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) { + std::optional timestamp) { LOG_DEBUG( fmt::format("[SOMAArray] static method 'cfg' opening array '{}'", uri)); return std::make_unique( @@ -107,7 +107,7 @@ std::unique_ptr SOMAArray::open( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) { + std::optional timestamp) { LOG_DEBUG( fmt::format("[SOMAArray] static method 'ctx' opening array '{}'", uri)); return std::make_unique( @@ -133,7 +133,7 @@ SOMAArray::SOMAArray( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) + std::optional timestamp) : uri_(util::rstrip_uri(uri)) , result_order_(result_order) , timestamp_(timestamp) { @@ -151,7 +151,7 @@ SOMAArray::SOMAArray( std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp) + std::optional timestamp) : uri_(util::rstrip_uri(uri)) , ctx_(ctx) , result_order_(result_order) @@ -194,19 +194,7 @@ std::shared_ptr SOMAArray::ctx() { return ctx_; }; -void SOMAArray::open( - OpenMode mode, std::optional> timestamp) { - // auto tdb_mode = mode == OpenMode::read ? 
TILEDB_READ : TILEDB_WRITE; - // arr_->open(tdb_mode); - // if (timestamp) { - // if (timestamp->first > timestamp->second) { - // throw std::invalid_argument("timestamp start > end"); - // } - // arr_->set_open_timestamp_start(timestamp->first); - // arr_->set_open_timestamp_end(timestamp->second); - // arr_->close(); - // arr_->open(tdb_mode); - // } +void SOMAArray::open(OpenMode mode, std::optional timestamp) { timestamp_ = timestamp; validate(mode, name_, timestamp); @@ -215,10 +203,12 @@ void SOMAArray::open( } void SOMAArray::close() { + if (arr_->query_type() == TILEDB_WRITE) + meta_cache_arr_->close(); + // Close the array through the managed query to ensure any pending queries // are completed. mq_->close(); - meta_cache_arr_->close(); metadata_.clear(); } @@ -569,18 +559,25 @@ void SOMAArray::set_metadata( tiledb_datatype_t value_type, uint32_t value_num, const void* value) { - if (key.compare("soma_object_type") == 0) - throw TileDBSOMAError("soma_object_type cannot be modified."); + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); arr_->put_metadata(key, value_type, value_num, value); + MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); } void SOMAArray::delete_metadata(const std::string& key) { - if (key.compare("soma_object_type") == 0) - throw TileDBSOMAError("soma_object_type cannot be deleted."); + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); arr_->delete_metadata(key); metadata_.erase(key); @@ -591,51 +588,24 @@ std::optional SOMAArray::get_metadata(const std::string& key) { return std::nullopt; return metadata_[key]; - - 
// tiledb_datatype_t value_type; - // uint32_t value_num; - // const void* value; - - // arr_->get_metadata(key, &value_type, &value_num, &value); - - // if (value == nullptr) - // return std::nullopt; - - // return MetadataValue(value_type, value_num, value); } std::map SOMAArray::get_metadata() { return metadata_; - // std::map meta; - - // std::string key; - // tiledb_datatype_t value_type; - // uint32_t value_num; - // const void* value; - - // for (uint64_t idx = 0; idx < arr_->metadata_num(); ++idx) { - // arr_->get_metadata_from_index( - // idx, &key, &value_type, &value_num, &value); - // meta[key] = MetadataValue(value_type, value_num, value); - // } - - // return meta; } bool SOMAArray::has_metadata(const std::string& key) { return metadata_.count(key) != 0; - // return get_metadata(key) != std::nullopt; } uint64_t SOMAArray::metadata_num() const { return metadata_.size(); - // return arr_->metadata_num(); } void SOMAArray::validate( OpenMode mode, std::string_view name, - std::optional> timestamp) { + std::optional timestamp) { // Validate parameters auto tdb_mode = mode == OpenMode::read ? 
TILEDB_READ : TILEDB_WRITE; @@ -661,7 +631,7 @@ void SOMAArray::validate( } } -std::optional> SOMAArray::timestamp() { +std::optional SOMAArray::timestamp() { return timestamp_; } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index a7d5dfec57..d28051d8fc 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -68,7 +68,7 @@ class SOMAArray : public SOMAObject { std::string_view uri, ArraySchema schema, std::string soma_type, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); /** * @brief Open an array at the specified URI and return SOMAArray @@ -93,7 +93,7 @@ class SOMAArray : public SOMAObject { std::vector column_names = {}, std::string_view batch_size = "auto", ResultOrder result_order = ResultOrder::automatic, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); /** * @brief Open an array at the specified URI and return SOMAArray @@ -118,7 +118,7 @@ class SOMAArray : public SOMAObject { std::vector column_names = {}, std::string_view batch_size = "auto", ResultOrder result_order = ResultOrder::automatic, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); //=================================================================== //= public non-static @@ -144,7 +144,7 @@ class SOMAArray : public SOMAObject { std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); /** * @brief Construct a new SOMAArray object @@ -166,7 +166,7 @@ class SOMAArray : public SOMAObject { std::vector column_names, std::string_view batch_size, ResultOrder result_order, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); SOMAArray(const SOMAArray& other) : uri_(other.uri_) @@ -213,8 +213,7 @@ class SOMAArray : public SOMAObject { * @param timestamp Timestamp */ void 
open( - OpenMode mode, - std::optional> timestamp = std::nullopt); + OpenMode mode, std::optional timestamp = std::nullopt); /** * Close the SOMAArray object. @@ -678,12 +677,12 @@ class SOMAArray : public SOMAObject { void validate( OpenMode mode, std::string_view name, - std::optional> timestamp); + std::optional timestamp); /** * Return optional timestamp pair SOMAArray was opened with. */ - std::optional> timestamp(); + std::optional timestamp(); private: //=================================================================== @@ -712,7 +711,7 @@ class SOMAArray : public SOMAObject { std::map metadata_; // Read timestamp range (start, end) - std::optional> timestamp_; + std::optional timestamp_; // Managed query for the array std::unique_ptr mq_; diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index c86234f7e5..e3a9b27528 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -42,8 +42,10 @@ using namespace tiledb; //=================================================================== void SOMACollection::create( - std::string_view uri, std::shared_ptr ctx) { - SOMAGroup::create(ctx, uri, "SOMACollection"); + std::string_view uri, + std::shared_ptr ctx, + std::optional timestamp) { + SOMAGroup::create(ctx, uri, "SOMACollection", timestamp); } std::unique_ptr SOMACollection::open( diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 57b2d139f1..d6b770f30c 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -61,7 +61,10 @@ class SOMACollection : public SOMAGroup { * @param ctx TileDB context * @param uri URI to create the SOMACollection */ - static void create(std::string_view uri, std::shared_ptr ctx); + static void create( + std::string_view uri, + std::shared_ptr ctx, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and 
return SOMACollection diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index 643f171abe..f6cd186920 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -45,14 +45,18 @@ void SOMAExperiment::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx) { + std::shared_ptr ctx, + std::optional timestamp) { std::string exp_uri(uri); - SOMAGroup::create(ctx, exp_uri, "SOMAExperiment"); - SOMADataFrame::create(exp_uri + "/obs", schema, index_columns, ctx); - SOMACollection::create(exp_uri + "/ms", ctx); + SOMAGroup::create(ctx, exp_uri, "SOMAExperiment", timestamp); + SOMADataFrame::create( + exp_uri + "/obs", schema, index_columns, ctx, timestamp); + SOMACollection::create(exp_uri + "/ms", ctx, timestamp); - auto group = SOMAGroup::open(OpenMode::write, exp_uri, ctx); + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open( + OpenMode::write, exp_uri, ctx, name, timestamp); group->set(exp_uri + "/obs", URIType::absolute, "obs"); group->set(exp_uri + "/ms", URIType::absolute, "ms"); group->close(); diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index 5f6bc7094b..c382238ed6 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -58,7 +58,8 @@ class SOMAExperiment : public SOMACollection { std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and return SOMAExperiment @@ -97,6 +98,8 @@ class SOMAExperiment : public SOMACollection { SOMAExperiment(SOMAExperiment&&) = default; ~SOMAExperiment() = default; + using SOMACollection::open; + private: //=================================================================== //= private 
non-static diff --git a/libtiledbsoma/src/soma/soma_group.cc b/libtiledbsoma/src/soma/soma_group.cc index ea872d920e..1db839be42 100644 --- a/libtiledbsoma/src/soma/soma_group.cc +++ b/libtiledbsoma/src/soma/soma_group.cc @@ -44,9 +44,14 @@ using namespace tiledb; void SOMAGroup::create( std::shared_ptr ctx, std::string_view uri, - std::string soma_type) { + std::string soma_type, + std::optional timestamp) { Group::create(*ctx->tiledb_ctx(), std::string(uri)); - auto group = Group(*ctx->tiledb_ctx(), std::string(uri), TILEDB_WRITE); + auto group = Group( + *ctx->tiledb_ctx(), + std::string(uri), + TILEDB_WRITE, + _set_timestamp(ctx, timestamp)); group.put_metadata( SOMA_OBJECT_TYPE_KEY, @@ -68,7 +73,7 @@ std::unique_ptr SOMAGroup::open( std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp) { + std::optional timestamp) { return std::make_unique(mode, uri, ctx, name, timestamp); } @@ -81,74 +86,57 @@ SOMAGroup::SOMAGroup( std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp) + std::optional timestamp) : ctx_(ctx) , uri_(util::rstrip_uri(uri)) , name_(name) { - auto cfg = ctx_->tiledb_ctx()->config(); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - cfg["sm.group.timestamp_start"] = timestamp->first; - cfg["sm.group.timestamp_end"] = timestamp->second; - } group_ = std::make_unique( *ctx_->tiledb_ctx(), std::string(uri), mode == OpenMode::read ? 
TILEDB_READ : TILEDB_WRITE, - cfg); - + _set_timestamp(ctx, timestamp)); fill_caches(); } void SOMAGroup::fill_caches() { - std::shared_ptr grp; if (group_->query_type() == TILEDB_WRITE) { - grp = std::make_shared(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); + cache_group_ = std::make_shared( + *ctx_->tiledb_ctx(), uri_, TILEDB_READ); } else { - grp = group_; + cache_group_ = group_; } - for (uint64_t idx = 0; idx < grp->metadata_num(); ++idx) { + for (uint64_t idx = 0; idx < cache_group_->metadata_num(); ++idx) { std::string key; tiledb_datatype_t value_type; uint32_t value_num; const void* value; - grp->get_metadata_from_index( + cache_group_->get_metadata_from_index( idx, &key, &value_type, &value_num, &value); MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); } - for (uint64_t i = 0; i < grp->member_count(); ++i) { - auto mem = grp->member(i); + for (uint64_t i = 0; i < cache_group_->member_count(); ++i) { + auto mem = cache_group_->member(i); member_to_uri_[mem.name().value()] = mem.uri(); } - - if (group_->query_type() == TILEDB_WRITE) { - grp->close(); - } } void SOMAGroup::open( - OpenMode query_type, - std::optional> timestamp) { - auto cfg = ctx_->tiledb_ctx()->config(); - if (timestamp) { - if (timestamp->first > timestamp->second) { - throw std::invalid_argument("timestamp start > end"); - } - cfg["sm.group.timestamp_start"] = timestamp->first; - cfg["sm.group.timestamp_end"] = timestamp->second; - } - group_->set_config(cfg); + OpenMode query_type, std::optional timestamp) { + timestamp_ = timestamp; + group_->set_config(_set_timestamp(ctx_, timestamp)); group_->open(query_type == OpenMode::read ? 
TILEDB_READ : TILEDB_WRITE); + fill_caches(); } void SOMAGroup::close() { + if (group_->query_type() == TILEDB_WRITE) + cache_group_->close(); group_->close(); + metadata_.clear(); } const std::string SOMAGroup::uri() const { @@ -203,56 +191,60 @@ void SOMAGroup::set_metadata( tiledb_datatype_t value_type, uint32_t value_num, const void* value) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be modified."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); group_->put_metadata(key, value_type, value_num, value); + + MetadataValue mdval(value_type, value_num, value); + std::pair mdpair(key, mdval); + metadata_.insert(mdpair); } void SOMAGroup::delete_metadata(const std::string& key) { - if (key.compare("soma_object_type") == 0) { - throw TileDBSOMAError("soma_object_type cannot be deleted."); - } + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + + if (key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); + group_->delete_metadata(key); + metadata_.erase(key); } std::optional SOMAGroup::get_metadata(const std::string& key) { - tiledb_datatype_t value_type; - uint32_t value_num; - const void* value; - - group_->get_metadata(key, &value_type, &value_num, &value); - - if (value == nullptr) + if (metadata_.count(key) == 0) return std::nullopt; - return MetadataValue(value_type, value_num, value); + return metadata_[key]; } std::map SOMAGroup::get_metadata() { - std::map meta; - - std::string key; - tiledb_datatype_t value_type; - uint32_t value_num; - const void* value; - - for (uint64_t idx = 0; idx < group_->metadata_num(); ++idx) { - group_->get_metadata_from_index( - idx, &key, &value_type, &value_num, 
&value); - meta[key] = MetadataValue(value_type, value_num, value); - } - - return meta; + return metadata_; } bool SOMAGroup::has_metadata(const std::string& key) { - return get_metadata(key) == std::nullopt; + return metadata_.count(key) != 0; } uint64_t SOMAGroup::metadata_num() const { - return group_->metadata_num(); + return metadata_.size(); +} + +Config SOMAGroup::_set_timestamp( + std::shared_ptr ctx, std::optional timestamp) { + auto cfg = ctx->tiledb_ctx()->config(); + if (timestamp) { + if (timestamp->first > timestamp->second) { + throw std::invalid_argument("timestamp start > end"); + } + cfg["sm.group.timestamp_start"] = timestamp->first; + cfg["sm.group.timestamp_end"] = timestamp->second; + } + return cfg; } } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_group.h b/libtiledbsoma/src/soma/soma_group.h index e7aebeb637..d90225ae70 100644 --- a/libtiledbsoma/src/soma/soma_group.h +++ b/libtiledbsoma/src/soma/soma_group.h @@ -61,7 +61,8 @@ class SOMAGroup : public SOMAObject { static void create( std::shared_ptr ctx, std::string_view uri, - std::string soma_type); + std::string soma_type, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and return SOMAGroup @@ -79,7 +80,7 @@ class SOMAGroup : public SOMAObject { std::string_view uri, std::shared_ptr ctx, std::string_view name = "unnamed", - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); //=================================================================== //= public non-static @@ -99,7 +100,7 @@ class SOMAGroup : public SOMAObject { std::string_view uri, std::shared_ptr ctx, std::string_view name, - std::optional> timestamp = std::nullopt); + std::optional timestamp = std::nullopt); SOMAGroup() = delete; SOMAGroup(const SOMAGroup&) = default; @@ -113,8 +114,7 @@ class SOMAGroup : public SOMAObject { * @param timestamp Optional pair indicating timestamp start and end */ void 
open( - OpenMode mode, - std::optional> timestamp = std::nullopt); + OpenMode mode, std::optional timestamp = std::nullopt); /** * Close the SOMAGroup object. @@ -282,6 +282,14 @@ class SOMAGroup : public SOMAObject { //= private non-static //=================================================================== + /** + * Helper function to set the pass in timestamp in the config associated + * with the SOMAContext passed in + */ + static Config _set_timestamp( + std::shared_ptr ctx, + std::optional timestamp); + /** * Fills the metadata and member-to-uri caches upon opening the array. */ @@ -295,13 +303,21 @@ class SOMAGroup : public SOMAObject { // Name displayed in log messages std::string name_; - + // // TileDBGroup associated with the SOMAGroup std::shared_ptr group_; // Metadata cache std::map metadata_; + // Read timestamp range (start, end) + std::optional timestamp_; + + // Group associated with metadata_. Metadata values need to be accessible in + // write mode as well. We need to keep this read-mode array alive in order + // for the metadata value pointers in the cache to be accessible + std::shared_ptr cache_group_; + // Member-to-URI cache std::map member_to_uri_; }; diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index a3aa242d19..4f462485d4 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -45,18 +45,21 @@ void SOMAMeasurement::create( std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx) { + std::shared_ptr ctx, + std::optional timestamp) { std::string exp_uri(uri); - SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement"); - SOMADataFrame::create(exp_uri + "/var", schema, index_columns, ctx); - SOMACollection::create(exp_uri + "/X", ctx); - SOMACollection::create(exp_uri + "/obsm", ctx); - SOMACollection::create(exp_uri + "/obsp", ctx); - SOMACollection::create(exp_uri + "/varm", ctx); - 
SOMACollection::create(exp_uri + "/varp", ctx); + SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement", timestamp); + SOMADataFrame::create( + exp_uri + "/var", schema, index_columns, ctx, timestamp); + SOMACollection::create(exp_uri + "/X", ctx, timestamp); + SOMACollection::create(exp_uri + "/obsm", ctx, timestamp); + SOMACollection::create(exp_uri + "/obsp", ctx, timestamp); + SOMACollection::create(exp_uri + "/varm", ctx, timestamp); + SOMACollection::create(exp_uri + "/varp", ctx, timestamp); - auto group = SOMAGroup::open(OpenMode::write, uri, ctx); + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open(OpenMode::write, uri, ctx, name, timestamp); group->set(exp_uri + "/var", URIType::absolute, "var"); group->set(exp_uri + "/X", URIType::absolute, "X"); group->set(exp_uri + "/obsm", URIType::absolute, "obsm"); diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index 8530ac0bd7..df2791bd77 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -59,7 +59,8 @@ class SOMAMeasurement : public SOMACollection { std::string_view uri, std::shared_ptr schema, ColumnIndexInfo index_columns, - std::shared_ptr ctx); + std::shared_ptr ctx, + std::optional timestamp = std::nullopt); /** * @brief Open a group at the specified URI and return SOMAMeasurement @@ -97,6 +98,8 @@ class SOMAMeasurement : public SOMACollection { SOMAMeasurement(SOMAMeasurement&&) = default; ~SOMAMeasurement() = default; + using SOMACollection::open; + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index c7fa2defa9..36614a6ecb 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -20,10 +20,11 @@ std::unique_ptr SOMAObject::open( std::shared_ptr ctx, std::optional> timestamp) { 
auto obj = tiledb::Object::object(*ctx->tiledb_ctx(), std::string(uri)); + auto name = std::string(std::filesystem::path(uri).filename()); if (obj.type() == tiledb::Object::Type::Array) { auto array_ = SOMAArray::open( - mode, uri, ctx, "", {}, "auto", ResultOrder::automatic, timestamp); + mode, uri, ctx, "", {}, name, ResultOrder::automatic, timestamp); if (!array_->type().has_value()) throw TileDBSOMAError("SOMAArray has no type info"); @@ -38,7 +39,7 @@ std::unique_ptr SOMAObject::open( throw TileDBSOMAError("Saw invalid SOMAArray type"); } } else if (obj.type() == tiledb::Object::Type::Group) { - auto group_ = SOMAGroup::open(mode, uri, ctx, "", timestamp); + auto group_ = SOMAGroup::open(mode, uri, ctx, name, timestamp); if (!group_->type().has_value()) throw TileDBSOMAError("SOMAGroup has no type info"); diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index 8c3ff11799..87b41d5cd0 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -46,6 +46,8 @@ const std::string ENCODING_VERSION_VAL = "1"; using MetadataValue = std::tuple; enum MetadataInfo { dtype = 0, num, value }; +using TimestampRange = std::pair; + class TileDBSOMAError : public std::runtime_error { public: explicit TileDBSOMAError(const char* m) diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 547ca7b902..baaa0492d5 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -86,8 +86,7 @@ std::tuple create_array( schema.check(); // Create array - SOMAArray::create( - ctx, uri, schema, "NONE", std::pair(0, 2)); + SOMAArray::create(ctx, uri, schema, "NONE", TimestampRange(0, 2)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -126,7 +125,7 @@ std::tuple, std::vector> write_array( {}, "auto", ResultOrder::automatic, - std::pair(timestamp + i, timestamp + i)); + TimestampRange(timestamp + i, timestamp + i)); std::vector 
d0(num_cells_per_fragment); for (int j = 0; j < num_cells_per_fragment; j++) { @@ -215,8 +214,7 @@ TEST_CASE("SOMAArray: nnz") { {}, "auto", ResultOrder::automatic, - std::pair( - timestamp, timestamp + num_fragments - 1)); + TimestampRange(timestamp, timestamp + num_fragments - 1)); uint64_t nnz = soma_array->nnz(); REQUIRE(nnz == expected_nnz); @@ -280,7 +278,7 @@ TEST_CASE("SOMAArray: nnz with timestamp") { uri, ctx, num_cells_per_fragment, num_fragments, overlap, 40); // Get total cell num at timestamp (0, 20) - std::pair timestamp{0, 20}; + TimestampRange timestamp{0, 20}; auto soma_array = SOMAArray::open( OpenMode::read, uri, @@ -361,12 +359,9 @@ TEST_CASE("SOMAArray: nnz with consolidation") { TEST_CASE("SOMAArray: metadata") { auto ctx = std::make_shared(); - std::string base_uri = "mem://unit-test-array"; - const auto& [uri, expected_nnz] = create_array(base_uri, ctx); - // Write md at (1, 1) auto soma_array = SOMAArray::open( OpenMode::write, uri, @@ -375,13 +370,14 @@ TEST_CASE("SOMAArray: metadata") { {}, "auto", ResultOrder::automatic, - std::pair(1, 1)); + TimestampRange(1, 1)); + int32_t val = 100; soma_array->set_metadata("md", TILEDB_INT32, 1, &val); soma_array->close(); // Read metadata - soma_array->open(OpenMode::read, std::pair(0, 2)); + soma_array->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(soma_array->metadata_num() == 3); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); @@ -393,7 +389,7 @@ TEST_CASE("SOMAArray: metadata") { soma_array->close(); // md should not be available at (2, 2) - soma_array->open(OpenMode::read, std::pair(2, 2)); + soma_array->open(OpenMode::read, TimestampRange(2, 2)); REQUIRE(soma_array->metadata_num() == 2); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); @@ -401,7 +397,7 @@ TEST_CASE("SOMAArray: metadata") { soma_array->close(); // Metadata should also be retrievable in 
write mode - soma_array->open(OpenMode::write, std::pair(0, 2)); + soma_array->open(OpenMode::write, TimestampRange(0, 2)); REQUIRE(soma_array->metadata_num() == 3); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); @@ -416,7 +412,7 @@ TEST_CASE("SOMAArray: metadata") { soma_array->close(); // Confirm delete in read mode - soma_array->open(OpenMode::read, std::pair(0, 2)); + soma_array->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(!soma_array->has_metadata("md")); REQUIRE(soma_array->metadata_num() == 2); } diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index ae1b619041..6107df2a67 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -207,37 +207,53 @@ TEST_CASE("SOMACollection: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; - SOMACollection::create(uri, ctx); + SOMACollection::create(uri, ctx, TimestampRange(0, 2)); auto soma_collection = SOMACollection::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_collection->set_metadata("md", TILEDB_INT32, 1, &val); soma_collection->close(); - soma_collection->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_collection->metadata_num() == 2); - REQUIRE(soma_collection->has_metadata("soma_object_type") == true); - REQUIRE(soma_collection->has_metadata("md") == true); - + // Read metadata + soma_collection->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_collection->metadata_num() == 3); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(soma_collection->has_metadata("md")); auto mdval = soma_collection->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_collection->close(); 
- soma_collection->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_collection->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_collection->metadata_num() == 2); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(!soma_collection->has_metadata("md")); + soma_collection->close(); + // Metadata should also be retrievable in write mode + soma_collection->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_collection->metadata_num() == 3); + REQUIRE(soma_collection->has_metadata("soma_object_type")); + REQUIRE(soma_collection->has_metadata("soma_encoding_version")); + REQUIRE(soma_collection->has_metadata("md")); mdval = soma_collection->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_collection->delete_metadata("md"); mdval = soma_collection->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_collection->close(); - soma_collection->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_collection->has_metadata("md") == false); - REQUIRE(soma_collection->metadata_num() == 1); - soma_collection->close(); + // Confirm delete in read mode + soma_collection->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_collection->has_metadata("md")); + REQUIRE(soma_collection->metadata_num() == 2); } TEST_CASE("SOMAExperiment: metadata") { @@ -245,79 +261,105 @@ TEST_CASE("SOMAExperiment: metadata") { std::string uri = "mem://unit-test-experiment"; auto [schema, index_columns] = helper::create_arrow_schema(); - SOMAExperiment::create(uri, schema, index_columns, ctx); + SOMAExperiment::create(uri, schema, index_columns, ctx, TimestampRange(0, 2)); auto soma_experiment = SOMAExperiment::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_experiment->set_metadata("md", TILEDB_INT32, 1, &val); 
soma_experiment->close(); - soma_experiment = SOMAExperiment::open( - uri, OpenMode::read, ctx, std::pair(1, 1)); - REQUIRE(soma_experiment->metadata_num() == 2); - REQUIRE(soma_experiment->has_metadata("soma_object_type") == true); - REQUIRE(soma_experiment->has_metadata("md") == true); - + // Read metadata + soma_experiment->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_experiment->metadata_num() == 3); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(soma_experiment->has_metadata("md")); auto mdval = soma_experiment->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_experiment->close(); - soma_experiment = SOMAExperiment::open( - uri, OpenMode::write, ctx, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_experiment->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_experiment->metadata_num() == 2); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(!soma_experiment->has_metadata("md")); + soma_experiment->close(); + // Metadata should also be retrievable in write mode + soma_experiment->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_experiment->metadata_num() == 3); + REQUIRE(soma_experiment->has_metadata("soma_object_type")); + REQUIRE(soma_experiment->has_metadata("soma_encoding_version")); + REQUIRE(soma_experiment->has_metadata("md")); mdval = soma_experiment->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_experiment->delete_metadata("md"); mdval = soma_experiment->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_experiment->close(); - soma_experiment = SOMAExperiment::open( - uri, OpenMode::read, ctx, std::pair(3, 
3)); - REQUIRE(soma_experiment->has_metadata("md") == false); - REQUIRE(soma_experiment->metadata_num() == 1); - soma_experiment->close(); + // Confirm delete in read mode + soma_experiment->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_experiment->has_metadata("md")); + REQUIRE(soma_experiment->metadata_num() == 2); } TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-measurement"; auto [schema, index_columns] = helper::create_arrow_schema(); - SOMAMeasurement::create(uri, schema, index_columns, ctx); + SOMAMeasurement::create(uri, schema, index_columns, ctx, TimestampRange(0, 2)); + auto soma_measurement = SOMAMeasurement::open( uri, OpenMode::write, ctx, std::pair(1, 1)); + int32_t val = 100; soma_measurement->set_metadata("md", TILEDB_INT32, 1, &val); soma_measurement->close(); - soma_measurement = SOMAMeasurement::open( - uri, OpenMode::read, ctx, std::pair(1, 1)); - REQUIRE(soma_measurement->metadata_num() == 2); - REQUIRE(soma_measurement->has_metadata("soma_object_type") == true); - REQUIRE(soma_measurement->has_metadata("md") == true); - + // Read metadata + soma_measurement->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_measurement->metadata_num() == 3); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(soma_measurement->has_metadata("md")); auto mdval = soma_measurement->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_measurement->close(); - soma_measurement = SOMAMeasurement::open( - uri, OpenMode::write, ctx, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_measurement->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_measurement->metadata_num() == 2); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + 
REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(!soma_measurement->has_metadata("md")); + soma_measurement->close(); + // Metadata should also be retrievable in write mode + soma_measurement->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_measurement->metadata_num() == 3); + REQUIRE(soma_measurement->has_metadata("soma_object_type")); + REQUIRE(soma_measurement->has_metadata("soma_encoding_version")); + REQUIRE(soma_measurement->has_metadata("md")); mdval = soma_measurement->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_measurement->delete_metadata("md"); mdval = soma_measurement->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_measurement->close(); - soma_measurement = SOMAMeasurement::open( - uri, OpenMode::read, ctx, std::pair(3, 3)); - REQUIRE(soma_measurement->has_metadata("md") == false); - REQUIRE(soma_measurement->metadata_num() == 1); - soma_measurement->close(); + // Confirm delete in read mode + soma_measurement->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_measurement->has_metadata("md")); + REQUIRE(soma_measurement->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 4fba0021a6..98d63c0041 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -78,43 +78,60 @@ TEST_CASE("SOMADataFrame: basic") { TEST_CASE("SOMADataFrame: metadata") { auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-collection"; auto [schema, index_columns] = helper::create_arrow_schema(); - SOMADataFrame::create(uri, schema, index_columns, ctx); + SOMADataFrame::create( + uri, schema, index_columns, ctx, TimestampRange(0, 2)); + auto soma_dataframe = SOMADataFrame::open( uri, OpenMode::write, ctx, {}, ResultOrder::automatic, - std::pair(1, 1)); 
+ TimestampRange(1, 1)); + int32_t val = 100; soma_dataframe->set_metadata("md", TILEDB_INT32, 1, &val); soma_dataframe->close(); - soma_dataframe->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_dataframe->metadata_num() == 2); - REQUIRE(soma_dataframe->has_metadata("soma_object_type") == true); - REQUIRE(soma_dataframe->has_metadata("md") == true); - + // Read metadata + soma_dataframe->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_dataframe->metadata_num() == 3); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(soma_dataframe->has_metadata("md")); auto mdval = soma_dataframe->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_dataframe->close(); - soma_dataframe->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_dataframe->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_dataframe->metadata_num() == 2); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(!soma_dataframe->has_metadata("md")); + soma_dataframe->close(); + // Metadata should also be retrievable in write mode + soma_dataframe->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_dataframe->metadata_num() == 3); + REQUIRE(soma_dataframe->has_metadata("soma_object_type")); + REQUIRE(soma_dataframe->has_metadata("soma_encoding_version")); + REQUIRE(soma_dataframe->has_metadata("md")); mdval = soma_dataframe->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_dataframe->delete_metadata("md"); mdval = soma_dataframe->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_dataframe->close(); - soma_dataframe->open(OpenMode::read, std::pair(3, 3)); - 
REQUIRE(soma_dataframe->has_metadata("md") == false); - REQUIRE(soma_dataframe->metadata_num() == 1); - soma_dataframe->close(); + // Confirm delete in read mode + soma_dataframe->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_dataframe->has_metadata("md")); + REQUIRE(soma_dataframe->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index bd0dd1a1a9..567c934ac9 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -120,7 +120,7 @@ TEST_CASE("SOMADenseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray"; - SOMADenseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMADenseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_dense = SOMADenseNDArray::open( uri, OpenMode::write, @@ -128,32 +128,48 @@ TEST_CASE("SOMADenseNDArray: metadata") { {}, ResultOrder::automatic, std::pair(1, 1)); + int32_t val = 100; soma_dense->set_metadata("md", TILEDB_INT32, 1, &val); soma_dense->close(); - soma_dense->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_dense->metadata_num() == 2); - REQUIRE(soma_dense->has_metadata("soma_object_type") == true); - REQUIRE(soma_dense->has_metadata("md") == true); - + // Read metadata + soma_dense->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_dense->metadata_num() == 3); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(soma_dense->has_metadata("md")); auto mdval = soma_dense->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_dense->close(); - soma_dense->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + 
soma_dense->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_dense->metadata_num() == 2); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(!soma_dense->has_metadata("md")); + soma_dense->close(); + // Metadata should also be retrievable in write mode + soma_dense->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_dense->metadata_num() == 3); + REQUIRE(soma_dense->has_metadata("soma_object_type")); + REQUIRE(soma_dense->has_metadata("soma_encoding_version")); + REQUIRE(soma_dense->has_metadata("md")); mdval = soma_dense->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_dense->delete_metadata("md"); mdval = soma_dense->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_dense->close(); - soma_dense->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_dense->has_metadata("md") == false); - REQUIRE(soma_dense->metadata_num() == 1); - soma_dense->close(); + // Confirm delete in read mode + soma_dense->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_dense->has_metadata("md")); + REQUIRE(soma_dense->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_group.cc b/libtiledbsoma/test/unit_soma_group.cc index a45430b375..c04030037b 100644 --- a/libtiledbsoma/test/unit_soma_group.cc +++ b/libtiledbsoma/test/unit_soma_group.cc @@ -156,11 +156,7 @@ TEST_CASE("SOMAGroup: basic") { "mem://sub-array", *ctx->tiledb_ctx()); auto soma_group = SOMAGroup::open( - OpenMode::write, - uri_main_group, - ctx, - "metadata", - std::pair(0, 1)); + OpenMode::write, uri_main_group, ctx, "metadata", TimestampRange(0, 1)); soma_group->set(uri_sub_group, URIType::absolute, "subgroup"); soma_group->set(uri_sub_array, URIType::absolute, "subarray"); soma_group->close(); @@ -168,7 +164,7 @@ TEST_CASE("SOMAGroup: basic") { std::map 
expected_map{ {"subgroup", uri_sub_group}, {"subarray", uri_sub_array}}; - soma_group->open(OpenMode::read, std::pair(0, 2)); + soma_group->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(soma_group->ctx() == ctx); REQUIRE(soma_group->uri() == uri_main_group); REQUIRE(soma_group->count() == 2); @@ -177,12 +173,12 @@ TEST_CASE("SOMAGroup: basic") { REQUIRE(soma_group->get("subarray").type() == Object::Type::Array); soma_group->close(); - soma_group->open(OpenMode::write, std::pair(0, 3)); + soma_group->open(OpenMode::write, TimestampRange(0, 3)); REQUIRE(expected_map == soma_group->member_to_uri_mapping()); soma_group->del("subgroup"); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(0, 4)); + soma_group->open(OpenMode::read, TimestampRange(0, 4)); REQUIRE(soma_group->count() == 1); REQUIRE(soma_group->has("subgroup") == false); REQUIRE(soma_group->has("subarray") == true); @@ -193,39 +189,50 @@ TEST_CASE("SOMAGroup: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-group"; - SOMAGroup::create(ctx, uri, "NONE"); + SOMAGroup::create(ctx, uri, "NONE", TimestampRange(0, 2)); auto soma_group = SOMAGroup::open( - OpenMode::write, - uri, - ctx, - "metadata", - std::pair(1, 1)); + OpenMode::write, uri, ctx, "metadata", TimestampRange(1, 1)); int32_t val = 100; soma_group->set_metadata("md", TILEDB_INT32, 1, &val); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_group->metadata_num() == 2); - REQUIRE(soma_group->has_metadata("soma_object_type") == true); - REQUIRE(soma_group->has_metadata("md") == true); - + // Read metadata + soma_group->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_group->metadata_num() == 3); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(soma_group->has_metadata("md")); auto mdval = soma_group->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); 
REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_group->close(); - soma_group->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_group->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_group->metadata_num() == 2); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(!soma_group->has_metadata("md")); + soma_group->close(); + // Metadata should also be retrievable in write mode + soma_group->open(OpenMode::write, TimestampRange(0, 2)); + REQUIRE(soma_group->metadata_num() == 3); + REQUIRE(soma_group->has_metadata("soma_object_type")); + REQUIRE(soma_group->has_metadata("soma_encoding_version")); + REQUIRE(soma_group->has_metadata("md")); mdval = soma_group->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_group->delete_metadata("md"); mdval = soma_group->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_group->close(); - soma_group->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_group->has_metadata("md") == false); - REQUIRE(soma_group->metadata_num() == 1); - soma_group->close(); + // Confirm delete in read mode + soma_group->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_group->has_metadata("md")); + REQUIRE(soma_group->metadata_num() == 2); } \ No newline at end of file diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 24af5dc936..a52c4769f9 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -80,7 +80,7 @@ TEST_CASE("SOMASparseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMASparseNDArray::create(uri, 
create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::read, ctx); REQUIRE(soma_sparse->uri() == uri); @@ -120,7 +120,7 @@ TEST_CASE("SOMASparseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open( uri, OpenMode::write, @@ -128,32 +128,48 @@ TEST_CASE("SOMASparseNDArray: metadata") { {}, ResultOrder::automatic, std::pair(1, 1)); + int32_t val = 100; soma_sparse->set_metadata("md", TILEDB_INT32, 1, &val); soma_sparse->close(); - soma_sparse->open(OpenMode::read, std::pair(1, 1)); - REQUIRE(soma_sparse->metadata_num() == 2); - REQUIRE(soma_sparse->has_metadata("soma_object_type") == true); - REQUIRE(soma_sparse->has_metadata("md") == true); - + // Read metadata + soma_sparse->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(soma_sparse->metadata_num() == 3); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(soma_sparse->has_metadata("md")); auto mdval = soma_sparse->get_metadata("md"); REQUIRE(std::get(*mdval) == TILEDB_INT32); REQUIRE(std::get(*mdval) == 1); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); soma_sparse->close(); - soma_sparse->open(OpenMode::write, std::pair(2, 2)); + // md should not be available at (2, 2) + soma_sparse->open(OpenMode::read, TimestampRange(2, 2)); + REQUIRE(soma_sparse->metadata_num() == 2); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(!soma_sparse->has_metadata("md")); + soma_sparse->close(); + // Metadata should also be retrievable in write mode + soma_sparse->open(OpenMode::write, TimestampRange(0, 2)); + 
REQUIRE(soma_sparse->metadata_num() == 3); + REQUIRE(soma_sparse->has_metadata("soma_object_type")); + REQUIRE(soma_sparse->has_metadata("soma_encoding_version")); + REQUIRE(soma_sparse->has_metadata("md")); mdval = soma_sparse->get_metadata("md"); REQUIRE(*((const int32_t*)std::get(*mdval)) == 100); + + // Delete and have it reflected when reading metadata while in write mode soma_sparse->delete_metadata("md"); mdval = soma_sparse->get_metadata("md"); REQUIRE(!mdval.has_value()); soma_sparse->close(); - soma_sparse->open(OpenMode::read, std::pair(3, 3)); - REQUIRE(soma_sparse->has_metadata("md") == false); - REQUIRE(soma_sparse->metadata_num() == 1); - soma_sparse->close(); + // Confirm delete in read mode + soma_sparse->open(OpenMode::read, TimestampRange(0, 2)); + REQUIRE(!soma_sparse->has_metadata("md")); + REQUIRE(soma_sparse->metadata_num() == 2); } \ No newline at end of file From 6f5ec4b8c9393b325b96fc6a7d07fc4f8dc865cc Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 6 Mar 2024 19:46:43 -0700 Subject: [PATCH 32/70] WIP handle _read_nonempty_domain from c++ --- apis/python/src/tiledbsoma/io/ingest.py | 4 ++-- libtiledbsoma/src/soma/soma_array.cc | 2 +- libtiledbsoma/test/unit_soma_collection.cc | 8 +++++--- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 3 ++- libtiledbsoma/test/unit_soma_sparse_ndarray.cc | 6 ++++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index c91ac772c3..48e2906ac0 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1295,7 +1295,7 @@ def _write_dataframe_impl( ) try: - soma_df = _factory.open(df_uri, "w", soma_type=DataFrame, context=context) + soma_df = DataFrame.open(df_uri, "w", context=context) except DoesNotExistError: soma_df = DataFrame.create( df_uri, @@ -1987,7 +1987,7 @@ def _write_matrix_to_denseNDArray( def _read_nonempty_domain(arr: TileDBArray) -> Any: try: return 
arr._handle.non_empty_domain() - except SOMAError: + except (SOMAError, RuntimeError): # This means that we're open in write-only mode. # Reopen the array in read mode. pass diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 74b4eb19d1..82b471b0a6 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -205,7 +205,7 @@ void SOMAArray::open(OpenMode mode, std::optional timestamp) { void SOMAArray::close() { if (arr_->query_type() == TILEDB_WRITE) meta_cache_arr_->close(); - + // Close the array through the managed query to ensure any pending queries // are completed. mq_->close(); diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index 6107df2a67..c4f0d2d9fa 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -261,7 +261,8 @@ TEST_CASE("SOMAExperiment: metadata") { std::string uri = "mem://unit-test-experiment"; auto [schema, index_columns] = helper::create_arrow_schema(); - SOMAExperiment::create(uri, schema, index_columns, ctx, TimestampRange(0, 2)); + SOMAExperiment::create( + uri, schema, index_columns, ctx, TimestampRange(0, 2)); auto soma_experiment = SOMAExperiment::open( uri, OpenMode::write, ctx, std::pair(1, 1)); @@ -314,8 +315,9 @@ TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-measurement"; auto [schema, index_columns] = helper::create_arrow_schema(); - SOMAMeasurement::create(uri, schema, index_columns, ctx, TimestampRange(0, 2)); - + SOMAMeasurement::create( + uri, schema, index_columns, ctx, TimestampRange(0, 2)); + auto soma_measurement = SOMAMeasurement::open( uri, OpenMode::write, ctx, std::pair(1, 1)); diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index 567c934ac9..97f6677034 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ 
b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -120,7 +120,8 @@ TEST_CASE("SOMADenseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray"; - SOMADenseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + SOMADenseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_dense = SOMADenseNDArray::open( uri, OpenMode::write, diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index a52c4769f9..b8ac0a6075 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -80,7 +80,8 @@ TEST_CASE("SOMASparseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + SOMASparseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::read, ctx); REQUIRE(soma_sparse->uri() == uri); @@ -120,7 +121,8 @@ TEST_CASE("SOMASparseNDArray: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray"; - SOMASparseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + SOMASparseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); auto soma_sparse = SOMASparseNDArray::open( uri, OpenMode::write, From 9bff4467afa7dd61fb626eb55b587868f3cb8e2e Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 6 Mar 2024 23:03:08 -0700 Subject: [PATCH 33/70] WIP handle nullable attrs --- apis/python/src/tiledbsoma/soma_dataframe.cc | 17 +++++++++++++++++ libtiledbsoma/src/utils/arrow_adapter.cc | 7 ++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc 
b/apis/python/src/tiledbsoma/soma_dataframe.cc index 24ab5076a1..2e42ece237 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -61,6 +61,23 @@ void load_soma_dataframe(py::module& m) { uintptr_t schema_ptr = (uintptr_t)(&schema); py_schema.attr("_export_to_c")(schema_ptr); + for (int64_t sch_idx = 0; sch_idx < schema.n_children; + ++sch_idx) { + auto child = schema.children[sch_idx]; + auto metadata = py_schema.attr("metadata"); + if (py::hasattr(metadata, "get")) { + auto val = metadata.attr("get")( + py::str(child->name).attr("encode")("utf-8")); + + if (val != py::none() && + val.cast() == "nullable") { + child->flags = ARROW_FLAG_NULLABLE; + } else { + child->flags = 0; + } + } + } + ArrowArray domains; uintptr_t domains_ptr = (uintptr_t)(&domains); py_domains.attr("_export_to_c")(domains_ptr); diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 60be571c46..711e67384d 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -257,6 +257,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( auto idx_col_begin = index_column_names.begin(); auto idx_col_end = index_column_names.end(); auto idx_col_it = std::find(idx_col_begin, idx_col_end, child->name); + if (idx_col_it != idx_col_end) { auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); auto dim = Dimension::create( @@ -270,9 +271,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( } else { Attribute attr(*ctx, child->name, type); - // if (child->flags & ARROW_FLAG_NULLABLE) { - // attr.set_nullable(true); - // } + if (child->flags & ARROW_FLAG_NULLABLE) { + attr.set_nullable(true); + } if ((strcmp(child->format, "U") == 0) | (strcmp(child->format, "Z") == 0) | From 6ae926d0378be3f851a1f1c0c7a396943530726b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 6 Mar 2024 23:58:37 -0700 Subject: [PATCH 34/70] WIP fill validity 
buffer if nullptr --- apis/python/src/tiledbsoma/soma_array.cc | 6 +----- apis/python/tests/test_dataframe.py | 5 +++-- libtiledbsoma/src/soma/column_buffer.h | 9 +++++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index f4f3572657..06723c229e 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -52,11 +52,7 @@ void write(SOMAArray& array, py::handle py_batch) { const void* data; uint64_t* offsets = nullptr; - uint8_t* validities = nullptr; - - if (arr_->null_count != 0) { - validities = (uint8_t*)arr_->buffers[0]; - } + uint8_t* validities = (uint8_t*)arr_->buffers[0]; if (arr_->n_buffers == 3) { offsets = (uint64_t*)arr_->buffers[1]; diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 23059bfee0..4174c02dda 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -88,6 +88,7 @@ def test_dataframe(tmp_path, arrow_schema): # Read all table = sdf.read().concat() + print(table) assert table.num_rows == 5 assert table.num_columns == 5 assert [e.as_py() for e in list(table["soma_joinid"])] == pydict["soma_joinid"] @@ -119,9 +120,9 @@ def test_dataframe(tmp_path, arrow_schema): with soma.DataFrame.open(tmp_path.as_posix(), "r") as A: assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADataFrame) - # Ensure write mode uses Python object + # Ensure write mode uses clib object with soma.DataFrame.open(tmp_path.as_posix(), "w") as A: - assert isinstance(A._handle._handle, tiledb.Array) + assert isinstance(A._handle._handle, soma.pytiledbsoma.SOMADataFrame) def test_dataframe_with_float_dim(tmp_path, arrow_schema): diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 2a12dfa7ba..41b8a92dda 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ 
-146,8 +146,13 @@ class ColumnBuffer { (std::byte*)data, (std::byte*)data + num_elems * type_size_); } - if (validity != nullptr) { - validity_.assign(validity, validity + num_elems); + if (is_nullable_) { + if (validity != nullptr) { + validity_.assign(validity, validity + num_elems); + } else { + validity_.resize(num_elems); + std::fill(validity_.begin(), validity_.end(), 1); + } } } From 23a81f2f0e7bde164b6f5fd46dbfcaa1d3888a5f Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 7 Mar 2024 11:07:56 -0700 Subject: [PATCH 35/70] WIP add enumerations to ArraySchema --- apis/python/src/tiledbsoma/soma_dataframe.cc | 4 +-- apis/python/tests/test_dataframe.py | 1 + libtiledbsoma/src/soma/managed_query.cc | 2 +- libtiledbsoma/src/utils/arrow_adapter.cc | 30 +++++++++++++++++--- libtiledbsoma/src/utils/arrow_adapter.h | 2 ++ 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 2e42ece237..ba08ba4882 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -71,9 +71,9 @@ void load_soma_dataframe(py::module& m) { if (val != py::none() && val.cast() == "nullable") { - child->flags = ARROW_FLAG_NULLABLE; + child->flags &= ARROW_FLAG_NULLABLE; } else { - child->flags = 0; + child->flags &= ~ARROW_FLAG_NULLABLE; } } } diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 4174c02dda..2d5c0a693a 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -154,6 +154,7 @@ def test_dataframe_with_enumeration(tmp_path): with soma.DataFrame.open(tmp_path.as_posix()) as sdf: df = sdf.read().concat() + print(df) np.testing.assert_array_equal(df["foo"].chunk(0).dictionary, enums["enmr1"]) np.testing.assert_array_equal(df["bar"].chunk(0).dictionary, enums["enmr2"]) diff --git a/libtiledbsoma/src/soma/managed_query.cc 
b/libtiledbsoma/src/soma/managed_query.cc index 9164f7fa80..6616d21789 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -276,7 +276,7 @@ void ManagedQuery::setup_read() { } void ManagedQuery::submit_write() { - query_->submit(); + Status status = query_->submit(); } void ManagedQuery::submit_read() { diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 711e67384d..a2cdcf3059 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -275,12 +275,24 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( attr.set_nullable(true); } - if ((strcmp(child->format, "U") == 0) | - (strcmp(child->format, "Z") == 0) | - (strcmp(child->format, "u") == 0) | - (strcmp(child->format, "z") == 0)) { + if (ArrowAdapter::_isvar(child->format)) { attr.set_cell_val_num(TILEDB_VAR_NUM); } + + if (child->dictionary != nullptr) { + auto enmr_format = child->dictionary->format; + auto enmr_type = ArrowAdapter::to_tiledb_format(enmr_format); + auto enmr = Enumeration::create_empty( + *ctx, + child->name, + enmr_type, + ArrowAdapter::_isvar(enmr_format) ? 
TILEDB_VAR_NUM : 1, + child->flags & ARROW_FLAG_DICTIONARY_ORDERED); + ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr); + AttributeExperimental::set_enumeration_name( + *ctx, attr, child->name); + } + schema.add_attribute(attr); } } @@ -289,6 +301,8 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( schema.check(); + schema.dump(); + return schema; } @@ -424,6 +438,14 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { return ArrowTable(array, schema); } +bool ArrowAdapter::_isvar(const char* format) { + if ((strcmp(format, "U") == 0) | (strcmp(format, "Z") == 0) | + (strcmp(format, "u") == 0) | (strcmp(format, "z") == 0)) { + return true; + } + return false; +} + std::string_view ArrowAdapter::to_arrow_format( tiledb_datatype_t tiledb_dtype, bool use_large) { auto u = use_large ? "U" : "u"; diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index ed164d3ad9..a3c09b7110 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -103,6 +103,8 @@ class ArrowAdapter { static std::optional> _get_dim_info( std::string_view dim_name, ArrowTable index_columns); + + static bool _isvar(const char* format); }; }; // namespace tiledbsoma From d4d88e0c3d8c53c4777f8a92e5dd4224a9bcfb16 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 7 Mar 2024 14:27:04 -0700 Subject: [PATCH 36/70] WIP check that column to write is enum --- apis/python/src/tiledbsoma/soma_array.cc | 14 +++++++++++++- libtiledbsoma/src/soma/managed_query.cc | 2 +- libtiledbsoma/src/soma/soma_array.cc | 4 ++++ libtiledbsoma/src/soma/soma_array.h | 2 ++ libtiledbsoma/src/utils/arrow_adapter.cc | 2 -- 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 06723c229e..393cd7525e 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -46,6 +46,8 
@@ void write(SOMAArray& array, py::handle py_batch) { uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); py_batch.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr); + auto attributes = array.tiledb_schema()->attributes(); + for (auto i = 0; i < arrow_schema.n_children; ++i) { auto sch_ = arrow_schema.children[i]; auto arr_ = arrow_array.children[i]; @@ -61,6 +63,17 @@ void write(SOMAArray& array, py::handle py_batch) { data = arr_->buffers[1]; } + if (attributes.find(sch_->name) != attributes.end()) { + auto enmr_name = AttributeExperimental::get_enumeration_name( + *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); + + if (enmr_name.has_value() && !sch_->dictionary) { + array.clear_column_data(); + throw py::value_error( + "Saw non-dictionary column passed to enumerated type"); + } + } + array.set_column_data( sch_->name, arr_->length, data, offsets, validities); } @@ -80,7 +93,6 @@ py::dict meta(SOMAArray& array) { } else { py::dtype value_type = tdb_to_np_dtype(tdb_type, 1); auto res = py::array(value_type, value_num, value).attr("item")(0); - ; results[py::str(key)] = res; } } diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 6616d21789..9164f7fa80 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -276,7 +276,7 @@ void ManagedQuery::setup_read() { } void ManagedQuery::submit_write() { - Status status = query_->submit(); + query_->submit(); } void ManagedQuery::submit_read() { diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 82b471b0a6..0f4022a4f1 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -303,6 +303,10 @@ void SOMAArray::set_column_data( mq_->set_column_data(column); }; +void SOMAArray::clear_column_data() { + array_buffer_ = nullptr; +} + void SOMAArray::write() { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array 
must be opened in write mode"); diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index d28051d8fc..d0c341f23b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -411,6 +411,8 @@ class SOMAArray : public SOMAObject { uint64_t* offsets = nullptr, uint8_t* validity = nullptr); + void clear_column_data(); + /** * @brief Write ArrayBuffers data to the array. * diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index a2cdcf3059..338896b669 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -301,8 +301,6 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( schema.check(); - schema.dump(); - return schema; } From 27f2614cc04a3aeb7d0031da7edb838e05c914c9 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 7 Mar 2024 22:42:12 -0700 Subject: [PATCH 37/70] WIP extend enumerations on write --- apis/python/src/tiledbsoma/soma_array.cc | 14 +++++++++++++- libtiledbsoma/src/soma/soma_array.cc | 23 +++++++++++++++++++++++ libtiledbsoma/src/soma/soma_array.h | 6 ++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 393cd7525e..a22a05098b 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -67,11 +67,23 @@ void write(SOMAArray& array, py::handle py_batch) { auto enmr_name = AttributeExperimental::get_enumeration_name( *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); - if (enmr_name.has_value() && !sch_->dictionary) { + auto dict = arr_->dictionary; + if (enmr_name.has_value() && !dict) { array.clear_column_data(); throw py::value_error( "Saw non-dictionary column passed to enumerated type"); } + + const void* enmr_data; + uint64_t* enmr_offsets = nullptr; + if (dict->n_buffers == 3) { + enmr_offsets = (uint64_t*)dict->buffers[1]; + 
enmr_data = dict->buffers[2]; + } else { + enmr_data = dict->buffers[1]; + } + array.extend_enumeration( + sch_->name, dict->length, enmr_data, enmr_offsets); } array.set_column_data( diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 0f4022a4f1..a50b7e53e8 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -275,6 +275,29 @@ std::optional> SOMAArray::read_next() { return mq_->results(); } +void SOMAArray::extend_enumeration( + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets) { + auto ctx = *ctx_->tiledb_ctx(); + + auto enmr = ArrayExperimental::get_enumeration( + ctx, *arr_, std::string(name)); + + // TODO can be uint64_t + std::vector offset_v( + (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); + uint64_t data_size = offset_v[num_elems]; + + auto new_enmr = enmr.extend( + data, data_size, offset_v.data(), num_elems * sizeof(uint64_t)); + + ArraySchemaEvolution se(ctx); + se.extend_enumeration(new_enmr); + se.array_evolve(uri_); +} + void SOMAArray::set_column_data( std::string_view name, uint64_t num_elems, diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index d0c341f23b..e4b044840c 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -404,6 +404,12 @@ class SOMAArray : public SOMAObject { */ std::optional> read_next(); + void extend_enumeration( + std::string_view name, + uint64_t num_elems, + const void* data, + uint64_t* offsets); + void set_column_data( std::string_view name, uint64_t num_elems, From f3166474d7931d2c6ce5b2b60922afd5d2786969 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 7 Mar 2024 23:01:37 -0700 Subject: [PATCH 38/70] WIP only extend enmr when present --- apis/python/src/tiledbsoma/soma_array.cc | 32 +++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git 
a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index a22a05098b..1266027289 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -67,23 +67,25 @@ void write(SOMAArray& array, py::handle py_batch) { auto enmr_name = AttributeExperimental::get_enumeration_name( *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); - auto dict = arr_->dictionary; - if (enmr_name.has_value() && !dict) { - array.clear_column_data(); - throw py::value_error( - "Saw non-dictionary column passed to enumerated type"); - } + if(enmr_name.has_value()){ + auto dict = arr_->dictionary; + if (!dict) { + array.clear_column_data(); + throw py::value_error( + "Saw non-dictionary column passed to enumerated type"); + } - const void* enmr_data; - uint64_t* enmr_offsets = nullptr; - if (dict->n_buffers == 3) { - enmr_offsets = (uint64_t*)dict->buffers[1]; - enmr_data = dict->buffers[2]; - } else { - enmr_data = dict->buffers[1]; + const void* enmr_data; + uint64_t* enmr_offsets = nullptr; + if (dict->n_buffers == 3) { + enmr_offsets = (uint64_t*)dict->buffers[1]; + enmr_data = dict->buffers[2]; + } else { + enmr_data = dict->buffers[1]; + } + array.extend_enumeration( + sch_->name, dict->length, enmr_data, enmr_offsets); } - array.extend_enumeration( - sch_->name, dict->length, enmr_data, enmr_offsets); } array.set_column_data( From f8a2d4a8c597a3c5d16dad23e69ae53e2791097b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 8 Mar 2024 11:14:40 -0700 Subject: [PATCH 39/70] WIP do not extend if no values present --- apis/python/src/tiledbsoma/soma_array.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 1266027289..b9a4967ac8 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -67,7 +67,7 @@ void write(SOMAArray& array, py::handle py_batch) { 
auto enmr_name = AttributeExperimental::get_enumeration_name( *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); - if(enmr_name.has_value()){ + if (enmr_name.has_value()) { auto dict = arr_->dictionary; if (!dict) { array.clear_column_data(); @@ -83,8 +83,11 @@ void write(SOMAArray& array, py::handle py_batch) { } else { enmr_data = dict->buffers[1]; } - array.extend_enumeration( - sch_->name, dict->length, enmr_data, enmr_offsets); + + if (dict->length != 0) { + array.extend_enumeration( + sch_->name, dict->length, enmr_data, enmr_offsets); + } } } From a3d39c2e5871024c5f930f7c20334fac00cdaab3 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 11 Mar 2024 09:46:08 -0500 Subject: [PATCH 40/70] WIP --- apis/python/src/tiledbsoma/_dataframe.py | 12 ++++++++++-- libtiledbsoma/src/utils/arrow_adapter.cc | 13 +++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index f8e5ffa000..17d185bc02 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -240,6 +240,14 @@ def create( domains.append(slot_domain) extents.append([extent]) + + print(index_column_names) + print(domains) + print(extents) + + print(index_column_names) + print(pa.array(domains)) + print(pa.array(extents)) # TODO add as kw args clib.SOMADataFrame.create( @@ -878,7 +886,7 @@ def _fill_out_slot_domain( ) elif isinstance(dtype, str): - slot_domain = None, None + slot_domain = "", "" elif np.issubdtype(dtype, NPInteger): iinfo = np.iinfo(cast(NPInteger, dtype)) slot_domain = iinfo.min, iinfo.max - 1 @@ -943,7 +951,7 @@ def _find_extent_for_domain( extent = 64 if isinstance(dtype, str): - return extent + return "" lo, hi = slot_domain if lo is None or hi is None: diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 338896b669..e3e241aa10 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ 
b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -260,12 +260,21 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( if (idx_col_it != idx_col_end) { auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); + if ((strcmp(child->format, "U") == 0) | + (strcmp(child->format, "u") == 0)) { + type = TILEDB_STRING_ASCII; + } + auto dim = Dimension::create( *ctx, child->name, type, - domains->children[idx_col_idx]->buffers[1], - extents->children[idx_col_idx]->buffers[1]); + type == TILEDB_STRING_ASCII ? + nullptr : + domains->children[idx_col_idx]->buffers[1], + type == TILEDB_STRING_ASCII ? + nullptr : + extents->children[idx_col_idx]->buffers[1]); domain.add_dimension(dim); } else { From b88d580dfe837ad346295e913be19e921706e276 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 11 Mar 2024 21:20:06 -0500 Subject: [PATCH 41/70] Add common unit test file --- libtiledbsoma/test/common.cc | 133 +++++++++++++++++++++++++++++++++++ libtiledbsoma/test/common.h | 66 +++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 libtiledbsoma/test/common.cc create mode 100644 libtiledbsoma/test/common.h diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc new file mode 100644 index 0000000000..f497908dd6 --- /dev/null +++ b/libtiledbsoma/test/common.cc @@ -0,0 +1,133 @@ +/** + * @file common.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. 
+ */ + +#include "common.h" + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates) { + // Create schema + ArraySchema schema(ctx, TILEDB_SPARSE); + + auto dim = Dimension::create(ctx, "d0", {0, 1000}); + + Domain domain(ctx); + domain.add_dimension(dim); + schema.set_domain(domain); + + auto attr = Attribute::create(ctx, "a0"); + schema.add_attribute(attr); + schema.set_allows_dups(allow_duplicates); + schema.check(); + + return schema; +} + +std::pair, ColumnIndexInfo> create_arrow_schema() { + // Create ArrowSchema + auto arrow_schema = std::make_shared(); + arrow_schema->format = "+s"; + arrow_schema->n_children = 2; + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->children = new ArrowSchema*[arrow_schema->n_children]; + + ArrowSchema* dim = nullptr; + dim = arrow_schema->children[0] = new ArrowSchema; + dim->format = "l"; + dim->name = "d0"; + dim->n_children = 0; + dim->dictionary = nullptr; + dim->release = &ArrowAdapter::release_schema; + + ArrowSchema* attr = nullptr; + attr = arrow_schema->children[1] = new ArrowSchema; + attr->format = "l"; + attr->name = "a0"; + attr->n_children = 0; + attr->dictionary = nullptr; + attr->release = &ArrowAdapter::release_schema; + + // Create array for index columns + std::vector index_column_names = {"d0"}; + + auto domains = std::make_shared(); + domains->length = 0; + domains->null_count = 0; + domains->offset = 0; + domains->n_buffers = 0; + domains->buffers = nullptr; + domains->n_children = 2; + domains->release = &ArrowAdapter::release_array; + domains->children = new ArrowArray*[1]; + + auto d0_domain = domains->children[0] = new ArrowArray; + d0_domain->length = 2; + d0_domain->null_count = 0; + d0_domain->offset = 0; + d0_domain->n_buffers = 2; + d0_domain->release = &ArrowAdapter::release_array; + d0_domain->buffers = new const void*[2]; + d0_domain->buffers[0] = nullptr; + d0_domain->buffers[1] = malloc(sizeof(int64_t) 
* 2); + d0_domain->n_children = 0; + int64_t dom[] = {0, 1000}; + std::memcpy((void*)d0_domain->buffers[1], &dom, sizeof(int64_t) * 2); + + auto tiles = std::make_shared(); + tiles->length = 0; + tiles->null_count = 0; + tiles->offset = 0; + tiles->n_buffers = 0; + tiles->buffers = nullptr; + tiles->n_children = 2; + tiles->release = &ArrowAdapter::release_array; + tiles->children = new ArrowArray*[1]; + + ArrowArray* d0_tile = tiles->children[0] = new ArrowArray; + d0_tile->length = 1; + d0_tile->null_count = 0; + d0_tile->offset = 0; + d0_tile->n_buffers = 2; + d0_tile->release = &ArrowAdapter::release_array; + d0_tile->buffers = new const void*[2]; + d0_tile->buffers[0] = nullptr; + d0_tile->buffers[1] = malloc(sizeof(int64_t)); + d0_tile->n_children = 0; + int64_t tile = 1; + std::memcpy((void*)d0_tile->buffers[1], &tile, sizeof(int64_t)); + + ColumnIndexInfo index_columns_info = std::tuple( + index_column_names, domains, tiles); + + return std::pair(arrow_schema, index_columns_info); +} +} // namespace helper \ No newline at end of file diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h new file mode 100644 index 0000000000..16ce7e4bbd --- /dev/null +++ b/libtiledbsoma/test/common.h @@ -0,0 +1,66 @@ +/** + * @file common.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. + */ + +#ifndef UNIT_TEST_COMMON_H +#define UNIT_TEST_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "utils/util.h" + +using namespace tiledb; +using namespace tiledbsoma; +using namespace Catch::Matchers; + +#ifndef TILEDBSOMA_SOURCE_ROOT +#define TILEDBSOMA_SOURCE_ROOT "not_defined" +#endif + +static const std::string src_path = TILEDBSOMA_SOURCE_ROOT; + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates = false); +std::pair, ColumnIndexInfo> create_arrow_schema(); +} // namespace helper +#endif \ No newline at end of file From 68aa70a7ce7f98f729bfdcf89d6caf8fd36cf956 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 12 Mar 2024 11:26:00 -0500 Subject: [PATCH 42/70] WIP pass domain and extents as StructArray --- apis/python/src/tiledbsoma/_dataframe.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 17d185bc02..56aaf8417f 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -238,24 +238,19 @@ def create( slot_domain, ) - domains.append(slot_domain) - extents.append([extent]) - - print(index_column_names) - print(domains) - print(extents) + 
domains.append(pa.array(slot_domain)) + extents.append(pa.array([extent])) - print(index_column_names) - print(pa.array(domains)) - print(pa.array(extents)) + domains = pa.StructArray.from_arrays(domains, names=index_column_names) + extents = pa.StructArray.from_arrays(extents, names=index_column_names) # TODO add as kw args clib.SOMADataFrame.create( uri, schema, index_column_names, - pa.array(domains), - pa.array(extents), + domains, + extents, context.native_context, ) From 7277a0f24f161d906e349a84b2cd5266674093f6 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 12 Mar 2024 14:23:04 -0500 Subject: [PATCH 43/70] WIP correct offset buffer --- libtiledbsoma/src/soma/managed_query.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 9164f7fa80..d6834b9528 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -108,11 +108,9 @@ void ManagedQuery::set_column_data( column_name, (void*)data.data(), column_buffer->data_size()); if (column_buffer->is_var()) { - // Remove one offset for TileDB, which checks that the - // offsets and validity buffers are the same size auto offsets = column_buffer->offsets(); query_->set_offsets_buffer( - column_name, offsets.data(), offsets.size() - 1); + column_name, offsets.data(), offsets.size()); } if (column_buffer->is_nullable()) { auto validity = column_buffer->validity(); From 42bb7549df606b3fbd313c44f6498488901859c9 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 12 Mar 2024 14:36:18 -0500 Subject: [PATCH 44/70] Throw TileDBErrors as TileDBSOMAErrors --- libtiledbsoma/src/soma/soma_array.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index e4b044840c..f66a2e2d13 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ 
-539,7 +539,11 @@ class SOMAArray : public SOMAObject { */ template std::pair non_empty_domain(const std::string& name) { - return arr_->non_empty_domain(name); + try { + return arr_->non_empty_domain(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } } /** @@ -549,7 +553,11 @@ class SOMAArray : public SOMAObject { */ std::pair non_empty_domain_var( const std::string& name) { - return arr_->non_empty_domain_var(name); + try { + return arr_->non_empty_domain_var(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } } /** From ed839056cb68930a5621d138c8cca4922aee851a Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 12 Mar 2024 14:49:06 -0500 Subject: [PATCH 45/70] WIP use ASCII for string dims --- libtiledbsoma/src/utils/arrow_adapter.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index e3e241aa10..3bf80e2ab6 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -260,8 +260,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( if (idx_col_it != idx_col_end) { auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); - if ((strcmp(child->format, "U") == 0) | - (strcmp(child->format, "u") == 0)) { + if (ArrowAdapter::_isvar(child->format)) { type = TILEDB_STRING_ASCII; } From eebb40dabbc40285e764001c2e37b297dc0c3ef5 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 12 Mar 2024 16:04:32 -0500 Subject: [PATCH 46/70] Handle extending enumerations for non-var attrs --- libtiledbsoma/src/soma/soma_array.cc | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index a50b7e53e8..a46f7f1580 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -285,16 +285,20 @@ void 
SOMAArray::extend_enumeration( auto enmr = ArrayExperimental::get_enumeration( ctx, *arr_, std::string(name)); - // TODO can be uint64_t - std::vector offset_v( - (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); - uint64_t data_size = offset_v[num_elems]; - - auto new_enmr = enmr.extend( - data, data_size, offset_v.data(), num_elems * sizeof(uint64_t)); - ArraySchemaEvolution se(ctx); - se.extend_enumeration(new_enmr); + if (offsets != nullptr) { + // TODO can be uint64_t + std::vector offset_v( + (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); + auto data_sz = offset_v[num_elems]; + auto offset_ptr = offset_v.data(); + auto offset_sz = num_elems * sizeof(uint64_t); + se.extend_enumeration( + enmr.extend(data, data_sz, offset_ptr, offset_sz)); + } else { + auto data_sz = num_elems * tiledb::impl::type_size(enmr.type()); + se.extend_enumeration(enmr.extend(data, data_sz, nullptr, 0)); + } se.array_evolve(uri_); } From 5ad82930fe125e7c114ecc5e684b28230ab7a6db Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 13 Mar 2024 11:07:04 -0500 Subject: [PATCH 47/70] Unsupport arrow types should throw TypeError --- apis/python/src/tiledbsoma/soma_dataframe.cc | 22 +++++++++++------- apis/python/tests/test_dataframe.py | 1 + libtiledbsoma/src/utils/arrow_adapter.cc | 24 +++++++------------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index ba08ba4882..2f50c116e9 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -86,14 +86,20 @@ void load_soma_dataframe(py::module& m) { uintptr_t extents_ptr = (uintptr_t)(&extents); py_extents.attr("_export_to_c")(extents_ptr); - SOMADataFrame::create( - uri, - std::make_shared(schema), - ColumnIndexInfo( - index_columns_names, - std::make_shared(domains), - std::make_shared(extents)), - context); + try{ + SOMADataFrame::create( + uri, + 
std::make_shared(schema), + ColumnIndexInfo( + index_columns_names, + std::make_shared(domains), + std::make_shared(extents)), + context); + }catch(const std::out_of_range& e){ + throw py::type_error(e.what()); + }catch(const std::exception& e){ + TPY_ERROR_LOC(e.what()); + } }) .def_static( diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 2d5c0a693a..82defc2dda 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1160,6 +1160,7 @@ def test_extend_enumerations(tmp_path): with soma.open(str(tmp_path)) as soma_dataframe: df = soma_dataframe.read().concat().to_pandas() + print(df) for c in df: # TODO bytes are being set to ascii - requires a fix in tiledb-py # assert df[c].dtype == pandas_df[c].dtype diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 3bf80e2ab6..77bf7bace3 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -464,18 +464,14 @@ std::string_view ArrowAdapter::to_arrow_format( {TILEDB_INT32, "i"}, {TILEDB_UINT32, "I"}, {TILEDB_INT64, "l"}, {TILEDB_UINT64, "L"}, {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, - {TILEDB_BOOL, "b"}, {TILEDB_TIME_SEC, "tts"}, - {TILEDB_TIME_MS, "ttm"}, {TILEDB_TIME_US, "ttu"}, - {TILEDB_TIME_NS, "ttn"}, {TILEDB_DATETIME_SEC, "tss:"}, - {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, - {TILEDB_DATETIME_NS, "tsn:"}, + {TILEDB_BOOL, "b"}, }; try { return _to_arrow_format_map.at(tiledb_dtype); - } catch (const std::out_of_range& err) { - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported TileDB type: {} ", tiledb::impl::type_to_str(tiledb_dtype))); } } @@ -489,18 +485,14 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { {"i", TILEDB_INT32}, {"I", 
TILEDB_UINT32}, {"l", TILEDB_INT64}, {"L", TILEDB_UINT64}, {"f", TILEDB_FLOAT32}, {"g", TILEDB_FLOAT64}, - {"b", TILEDB_BOOL}, {"tts", TILEDB_TIME_SEC}, - {"ttm", TILEDB_TIME_MS}, {"ttu", TILEDB_TIME_US}, - {"ttn", TILEDB_TIME_NS}, {"tss:", TILEDB_DATETIME_SEC}, - {"tsm:", TILEDB_DATETIME_MS}, {"tsu:", TILEDB_DATETIME_US}, - {"tsn:", TILEDB_DATETIME_NS}, + {"b", TILEDB_BOOL}, }; try { return _to_tiledb_format_map.at(arrow_dtype); - } catch (const std::out_of_range& err) { - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported arrow datatype: {} ", arrow_dtype)); + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype)); } } From 4fc8f9ab620cacd451eed04c2f47347bf5b94370 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 13 Mar 2024 12:27:22 -0500 Subject: [PATCH 48/70] Cast domains and extents to correct types --- apis/python/src/tiledbsoma/_dataframe.py | 4 ++-- apis/python/src/tiledbsoma/soma_dataframe.cc | 6 +++--- libtiledbsoma/src/utils/arrow_adapter.cc | 8 ++++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 56aaf8417f..f174e0c114 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -238,8 +238,8 @@ def create( slot_domain, ) - domains.append(pa.array(slot_domain)) - extents.append(pa.array([extent])) + domains.append(pa.array(slot_domain, type=pa_type)) + extents.append(pa.array([extent], type=pa_type)) domains = pa.StructArray.from_arrays(domains, names=index_column_names) extents = pa.StructArray.from_arrays(extents, names=index_column_names) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 2f50c116e9..45b565e402 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -86,7 +86,7 @@ void 
load_soma_dataframe(py::module& m) { uintptr_t extents_ptr = (uintptr_t)(&extents); py_extents.attr("_export_to_c")(extents_ptr); - try{ + try { SOMADataFrame::create( uri, std::make_shared(schema), @@ -95,9 +95,9 @@ void load_soma_dataframe(py::module& m) { std::make_shared(domains), std::make_shared(extents)), context); - }catch(const std::out_of_range& e){ + } catch (const std::out_of_range& e) { throw py::type_error(e.what()); - }catch(const std::exception& e){ + } catch (const std::exception& e) { TPY_ERROR_LOC(e.what()); } }) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 77bf7bace3..834475077c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -464,7 +464,9 @@ std::string_view ArrowAdapter::to_arrow_format( {TILEDB_INT32, "i"}, {TILEDB_UINT32, "I"}, {TILEDB_INT64, "l"}, {TILEDB_UINT64, "L"}, {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, - {TILEDB_BOOL, "b"}, + {TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"}, + {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, + {TILEDB_DATETIME_NS, "tsn:"}, }; try { @@ -485,7 +487,9 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { {"i", TILEDB_INT32}, {"I", TILEDB_UINT32}, {"l", TILEDB_INT64}, {"L", TILEDB_UINT64}, {"f", TILEDB_FLOAT32}, {"g", TILEDB_FLOAT64}, - {"b", TILEDB_BOOL}, + {"b", TILEDB_BOOL}, {"tss:", TILEDB_DATETIME_SEC}, + {"tsm:", TILEDB_DATETIME_MS}, {"tsu:", TILEDB_DATETIME_US}, + {"tsn:", TILEDB_DATETIME_NS}, }; try { From 84fb0f0827b89892487af9b792c94398873788ad Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 14 Mar 2024 12:16:56 -0500 Subject: [PATCH 49/70] WIP --- apis/python/src/tiledbsoma/_dataframe.py | 3 ++- libtiledbsoma/src/soma/managed_query.cc | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 
f174e0c114..71a1e93391 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -253,7 +253,7 @@ def create( extents, context.native_context, ) - + handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) return cls( @@ -518,6 +518,7 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. + values = values.cast(self.schema) for batch in values.to_batches(): self._handle.write(batch) tiledb_create_options = TileDBCreateOptions.from_platform_config( diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index d6834b9528..eb808ca9d6 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -104,8 +104,27 @@ void ManagedQuery::set_column_data( if (is_sparse) { auto data = column_buffer->data(); + // if (column_buffer->type() == TILEDB_BOOL) { + // std::cout << column_name << std::endl; + // std::cout << column_buffer->data_size() << std::endl; + // std::vector bool_output; + // auto bool_data = column_buffer->data(); + // for (size_t i = 0; i * 8 < column_buffer->data_size(); ++i) { + // for (size_t j = 0; j < 8; ++j) { + // std::cout << ((bool_data[i] >> j) & 0x01); + // bool_output.push_back((bool_data[i] >> j) & 0x01); + // } + // } + // std::cout << std::endl; + + // query_->set_data_buffer( + // column_name, + // (void*)bool_output.data(), + // column_buffer->data_size()); + // } else { query_->set_data_buffer( column_name, (void*)data.data(), column_buffer->data_size()); + // } if (column_buffer->is_var()) { auto offsets = column_buffer->offsets(); From 09b08d758ad1376fae5c50c25a410ca95ee399a6 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 14 Mar 2024 13:38:24 -0500 Subject: [PATCH 50/70] Correctly 
throw SOMAError --- apis/python/src/tiledbsoma/_dataframe.py | 12 +++++++++++- apis/python/src/tiledbsoma/soma_array.cc | 6 +++++- .../tests/test_dataframe_index_columns.py | 18 +++++++++--------- libtiledbsoma/src/soma/column_buffer.h | 3 +-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 71a1e93391..4c59d8a07a 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -518,9 +518,19 @@ def write( # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. # Then the above for-loop over the Arrow schema will find the former ordering, but for the # ``writer[dims] = attrs`` below we must have dims with the latter ordering. - values = values.cast(self.schema) + # values = values.cast(self.schema) + # target_schema = pa.schema(self.schema.field(f.name) for f in values.schema) + # values = values.cast(target_schema) + # print(values) + + target_schema = [] + for field in values.schema: + target_schema.append(self.schema.field(field.name)) + values = values.cast(pa.schema(target_schema, values.schema.metadata)) + for batch in values.to_batches(): self._handle.write(batch) + tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config ) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index b9a4967ac8..357c710d4e 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -94,7 +94,11 @@ void write(SOMAArray& array, py::handle py_batch) { array.set_column_data( sch_->name, arr_->length, data, offsets, validities); } - array.write(); + try { + array.write(); + } catch (const std::exception& e) { + TPY_ERROR_LOC(e.what()); + } } py::dict meta(SOMAArray& array) { diff --git a/apis/python/tests/test_dataframe_index_columns.py b/apis/python/tests/test_dataframe_index_columns.py index 
3348572cbd..05555f3df9 100644 --- a/apis/python/tests/test_dataframe_index_columns.py +++ b/apis/python/tests/test_dataframe_index_columns.py @@ -1730,55 +1730,55 @@ def test_types_create_errors( "int32-py-list-shaped-out-of-bounds", ["int32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "int16-py-list-shaped-out-of-bounds", ["int16"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "int8-py-list-shaped-out-of-bounds", ["int8"], [[10, 20]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint64-py-list-shaped-out-of-bounds", ["uint64"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint32-py-list-shaped-out-of-bounds", ["uint32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint32-py-list-shaped-out-of-bounds", ["uint32"], [[100, 200]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "uint8-py-list-shaped-out-of-bounds", ["uint8"], [[10, 20]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "float32-py-list-shaped-out-of-bounds", ["float32"], [[100.0, 200.0]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], [ "float64-py-list-shaped-out-of-bounds", ["float64"], [[100.0, 200.0]], - tiledb.cc.TileDBError, + soma._exception.SOMAError, ], ], ) diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 41b8a92dda..3f88cfa625 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -130,11 +130,10 @@ class ColumnBuffer { num_cells_ = num_elems; if (offsets != nullptr) { - // TODO this can be either a unit32_t or uint64_t pointer auto num_offsets = num_elems + 1; offsets_.resize(num_offsets); offsets_.assign( - (uint32_t*)offsets, (uint32_t*)offsets + num_offsets); + (uint64_t*)offsets, (uint64_t*)offsets + num_offsets); data_size_ = offsets_[num_offsets - 1]; data_.resize(data_size_); From 11e6a0553d6424fbd9aed4267e61b519cedf98c9 Mon Sep 
17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 18 Mar 2024 09:31:21 -0500 Subject: [PATCH 51/70] WIP correctly update metadata values --- apis/python/src/tiledbsoma/_tdb_handles.py | 20 +++++++++++++++----- apis/python/src/tiledbsoma/common.cc | 11 +++++++++++ apis/python/src/tiledbsoma/common.h | 1 + apis/python/src/tiledbsoma/soma_array.cc | 17 ++++++++++++++++- 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 11ea27c80a..1193f5a88a 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -558,13 +558,23 @@ def _write(self) -> None: # Only try to get the writer if there are changes to be made. if isinstance(self.owner, DataFrameWrapper): meta = self.owner.meta + for key, mod in self._mods.items(): + if mod in (_DictMod.ADDED, _DictMod.UPDATED): + set_metadata = self.owner._handle.set_metadata + val = self.cache[key] + if isinstance(val, str): + set_metadata(key, np.array([val], "S")) + else: + set_metadata(key, np.array([val])) + if mod is _DictMod.DELETED: + self.owner._handle.delete_metadata() else: meta = self.owner.writer.meta - for key, mod in self._mods.items(): - if mod in (_DictMod.ADDED, _DictMod.UPDATED): - meta[key] = self.cache[key] - if mod is _DictMod.DELETED: - del meta[key] + for key, mod in self._mods.items(): + if mod in (_DictMod.ADDED, _DictMod.UPDATED): + meta[key] = self.cache[key] + if mod is _DictMod.DELETED: + del meta[key] # Temporary hack: When we flush writes, note that the cache # is back in sync with disk. 
self._mods.clear() diff --git a/apis/python/src/tiledbsoma/common.cc b/apis/python/src/tiledbsoma/common.cc index 490c0eb7bb..32521771ee 100644 --- a/apis/python/src/tiledbsoma/common.cc +++ b/apis/python/src/tiledbsoma/common.cc @@ -140,6 +140,17 @@ tiledb_datatype_t np_to_tdb_dtype(py::dtype type) { TPY_ERROR_LOC("could not handle numpy dtype"); } +bool is_tdb_str(tiledb_datatype_t type) { + switch (type) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + return true; + default: + return false; + } +} + /** * @brief Convert ArrayBuffers to Arrow table. * diff --git a/apis/python/src/tiledbsoma/common.h b/apis/python/src/tiledbsoma/common.h index 33eb27ec53..210b4ccbe5 100644 --- a/apis/python/src/tiledbsoma/common.h +++ b/apis/python/src/tiledbsoma/common.h @@ -34,6 +34,7 @@ namespace tiledbsoma { py::dtype tdb_to_np_dtype(tiledb_datatype_t type, uint32_t cell_val_num); tiledb_datatype_t np_to_tdb_dtype(py::dtype type); +bool is_tdb_str(tiledb_datatype_t type); std::optional to_table( std::optional> buffers); diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 357c710d4e..f2c063a553 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -120,6 +120,21 @@ py::dict meta(SOMAArray& array) { return results; } +void set_metadata(SOMAArray& sr, const std::string &key, py::array value) { + tiledb_datatype_t value_type = np_to_tdb_dtype(value.dtype()); + + if (is_tdb_str(value_type) && value.size() > 1) + throw py::type_error("array/list of strings not supported"); + + py::buffer_info value_buffer = value.request(); + if (value_buffer.ndim != 1) + throw py::type_error("Only 1D Numpy arrays can be stored as metadata"); + + auto value_num = is_tdb_str(value_type) ? value.nbytes() : value.size(); + sr.set_metadata(key, value_type, value_num, + value_num > 0 ? 
value.data() : nullptr); +} + py::tuple get_enum(SOMAArray& sr, std::string attr_name) { auto attr_to_enmrs = sr.get_attr_to_enum_mapping(); if (attr_to_enmrs.count(attr_name) == 0) @@ -727,7 +742,7 @@ void load_soma_array(py::module& m) { .def_property_readonly("dimension_names", &SOMAArray::dimension_names) - .def("set_metadata", &SOMAArray::set_metadata) + .def("set_metadata", set_metadata) .def("delete_metadata", &SOMAArray::delete_metadata) From a139025e78d3224bbf7dc99c4b4d5887c701ebcd Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 18 Mar 2024 21:22:59 -0500 Subject: [PATCH 52/70] WIP cast pyarrow boolean to uint8 when writing to tiledb array --- apis/python/src/tiledbsoma/_dataframe.py | 8 +++-- apis/python/src/tiledbsoma/common.cc | 16 ++++----- apis/python/src/tiledbsoma/soma_array.cc | 6 ++-- apis/python/tests/test_query_condition.py | 39 ++++++++++------------ libtiledbsoma/src/soma/managed_query.cc | 40 +++++------------------ 5 files changed, 43 insertions(+), 66 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4c59d8a07a..d6e1827a99 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -524,8 +524,12 @@ def write( # print(values) target_schema = [] - for field in values.schema: - target_schema.append(self.schema.field(field.name)) + for input_field in values.schema: + target_field = self.schema.field(input_field.name) + if pa.types.is_boolean(input_field.type): + target_schema.append(target_field.with_type(pa.uint8())) + else: + target_schema.append(target_field) values = values.cast(pa.schema(target_schema, values.schema.metadata)) for batch in values.to_batches(): diff --git a/apis/python/src/tiledbsoma/common.cc b/apis/python/src/tiledbsoma/common.cc index 32521771ee..9f35eec208 100644 --- a/apis/python/src/tiledbsoma/common.cc +++ b/apis/python/src/tiledbsoma/common.cc @@ -141,14 +141,14 @@ tiledb_datatype_t 
np_to_tdb_dtype(py::dtype type) { } bool is_tdb_str(tiledb_datatype_t type) { - switch (type) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - return true; - default: - return false; - } + switch (type) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + return true; + default: + return false; + } } /** diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index f2c063a553..c81c945a20 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -120,7 +120,7 @@ py::dict meta(SOMAArray& array) { return results; } -void set_metadata(SOMAArray& sr, const std::string &key, py::array value) { +void set_metadata(SOMAArray& sr, const std::string& key, py::array value) { tiledb_datatype_t value_type = np_to_tdb_dtype(value.dtype()); if (is_tdb_str(value_type) && value.size() > 1) @@ -131,8 +131,8 @@ void set_metadata(SOMAArray& sr, const std::string &key, py::array value) { throw py::type_error("Only 1D Numpy arrays can be stored as metadata"); auto value_num = is_tdb_str(value_type) ? value.nbytes() : value.size(); - sr.set_metadata(key, value_type, value_num, - value_num > 0 ? value.data() : nullptr); + sr.set_metadata( + key, value_type, value_num, value_num > 0 ? 
value.data() : nullptr); } py::tuple get_enum(SOMAArray& sr, std::string attr_name) { diff --git a/apis/python/tests/test_query_condition.py b/apis/python/tests/test_query_condition.py index 3fad4f47fa..8f99923ac2 100644 --- a/apis/python/tests/test_query_condition.py +++ b/apis/python/tests/test_query_condition.py @@ -3,10 +3,8 @@ import os import pytest -import tiledb import tiledbsoma.pytiledbsoma as clib -from tiledbsoma._arrow_types import tiledb_schema_to_arrow from tiledbsoma._exception import SOMAError from tiledbsoma._query_condition import QueryCondition @@ -30,11 +28,10 @@ def pandas_query(uri, condition): def soma_query(uri, condition): qc = QueryCondition(condition) sr = clib.SOMAArray(uri) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() - + return arrow_table @@ -45,7 +42,7 @@ def soma_query(uri, condition): "n_genes > 500", # int 'louvain == "NK cells"', # string "percent_mito > 0.02", # float - "is_b_cell == True", # bool + "is_b_cell == True or is_b_cell == False", # bool # compare_op "n_genes == 480", "n_genes != 480", @@ -74,11 +71,13 @@ def test_query_condition(condition): pandas = pandas_query(uri, condition) soma_arrow = soma_query(uri, condition) assert len(pandas.index) == soma_arrow.num_rows - assert ( - (pandas.reset_index(drop=True) == soma_arrow.to_pandas().reset_index(drop=True)) - .all() - .all() - ) + + for name in pandas: + expected = pandas[name].reset_index(drop=True) + actual = soma_arrow[name].to_pandas().reset_index(drop=True) + print(expected,) + print(actual) + assert (expected == actual).all() @pytest.mark.parametrize( @@ -110,8 +109,7 @@ def test_query_condition_select_columns(): sr = clib.SOMAArray(uri, column_names=["n_genes"]) qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) - sr.set_condition(qc, 
schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -124,10 +122,9 @@ def test_query_condition_all_columns(): condition = "percent_mito > 0.02" qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -140,10 +137,9 @@ def test_query_condition_reset(): condition = "percent_mito > 0.02" qc = QueryCondition(condition) - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() @@ -155,7 +151,7 @@ def test_query_condition_reset(): condition = "percent_mito < 0.02" qc = QueryCondition(condition) sr.reset(column_names=["percent_mito"]) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() @@ -218,17 +214,16 @@ def test_parsing_error_conditions(malformed_condition): def test_eval_error_conditions(malformed_condition): """Conditions which should not evaluate (but WILL parse)""" uri = os.path.join(SOMA_URI, "obs") - schema = tiledb_schema_to_arrow(tiledb.open(uri).schema, uri, tiledb.default_ctx()) qc = QueryCondition(malformed_condition) with pytest.raises(SOMAError): sr = clib.SOMAArray(uri) - sr.set_condition(qc, schema) + sr.set_condition(qc, sr.schema) with pytest.raises(SOMAError): # test function directly for codecov - qc.init_query_condition(schema, []) - qc.init_query_condition(schema, ["bad_query_attr"]) + qc.init_query_condition(sr.schema, []) + qc.init_query_condition(sr.schema, ["bad_query_attr"]) if __name__ == "__main__": diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index eb808ca9d6..271f6aa747 100644 --- 
a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -104,28 +104,8 @@ void ManagedQuery::set_column_data( if (is_sparse) { auto data = column_buffer->data(); - // if (column_buffer->type() == TILEDB_BOOL) { - // std::cout << column_name << std::endl; - // std::cout << column_buffer->data_size() << std::endl; - // std::vector bool_output; - // auto bool_data = column_buffer->data(); - // for (size_t i = 0; i * 8 < column_buffer->data_size(); ++i) { - // for (size_t j = 0; j < 8; ++j) { - // std::cout << ((bool_data[i] >> j) & 0x01); - // bool_output.push_back((bool_data[i] >> j) & 0x01); - // } - // } - // std::cout << std::endl; - - // query_->set_data_buffer( - // column_name, - // (void*)bool_output.data(), - // column_buffer->data_size()); - // } else { query_->set_data_buffer( column_name, (void*)data.data(), column_buffer->data_size()); - // } - if (column_buffer->is_var()) { auto offsets = column_buffer->offsets(); query_->set_offsets_buffer( @@ -142,17 +122,15 @@ void ManagedQuery::set_column_data( query_->set_data_buffer( column_name, (void*)data.data(), column_buffer->data_size()); if (column_buffer->is_var()) { - // Remove one offset for TileDB, which checks that the - // offsets and validity buffers are the same size - auto offsets = column_buffer->offsets(); - query_->set_offsets_buffer( - column_name, offsets.data(), offsets.size() - 1); - } - if (column_buffer->is_nullable()) { - auto validity = column_buffer->validity(); - query_->set_validity_buffer( - column_name, validity.data(), validity.size()); - } + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), offsets.size()); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), validity.size()); + } } else { switch (column_buffer->type()) { case TILEDB_STRING_ASCII: From 52569558af8862f7932fd8b50cbb8ff1294af11c Mon Sep 
17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 19 Mar 2024 13:00:19 -0500 Subject: [PATCH 53/70] WIP fix existing enum error; fix byte display issue --- apis/python/src/tiledbsoma/_dataframe.py | 4 + apis/python/tests/test_dataframe.py | 6 +- apis/python/tests/test_query_condition.py | 2 - libtiledbsoma/src/soma/managed_query.cc | 18 +- libtiledbsoma/src/soma/soma_array.cc | 265 ++++++++++++++++++++-- libtiledbsoma/src/utils/arrow_adapter.cc | 4 +- 6 files changed, 267 insertions(+), 32 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index d6e1827a99..73d7d2e6a1 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -526,6 +526,10 @@ def write( target_schema = [] for input_field in values.schema: target_field = self.schema.field(input_field.name) + + if pa.types.is_dictionary(target_field.type) and not pa.types.is_dictionary(input_field.type): + raise ValueError(f"{input_field.name} requires dictionary entry") + if pa.types.is_boolean(input_field.type): target_schema.append(target_field.with_type(pa.uint8())) else: diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 82defc2dda..7caafbb62a 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -154,7 +154,6 @@ def test_dataframe_with_enumeration(tmp_path): with soma.DataFrame.open(tmp_path.as_posix()) as sdf: df = sdf.read().concat() - print(df) np.testing.assert_array_equal(df["foo"].chunk(0).dictionary, enums["enmr1"]) np.testing.assert_array_equal(df["bar"].chunk(0).dictionary, enums["enmr2"]) @@ -1160,11 +1159,8 @@ def test_extend_enumerations(tmp_path): with soma.open(str(tmp_path)) as soma_dataframe: df = soma_dataframe.read().concat().to_pandas() - print(df) for c in df: - # TODO bytes are being set to ascii - requires a fix in tiledb-py - # assert df[c].dtype == pandas_df[c].dtype - assert df[c].dtype.kind == 
pandas_df[c].dtype.kind + assert df[c].dtype == pandas_df[c].dtype if df[c].dtype == "category": assert df[c].cat.categories.dtype == pandas_df[c].cat.categories.dtype diff --git a/apis/python/tests/test_query_condition.py b/apis/python/tests/test_query_condition.py index 8f99923ac2..7504743ecf 100644 --- a/apis/python/tests/test_query_condition.py +++ b/apis/python/tests/test_query_condition.py @@ -75,8 +75,6 @@ def test_query_condition(condition): for name in pandas: expected = pandas[name].reset_index(drop=True) actual = soma_arrow[name].to_pandas().reset_index(drop=True) - print(expected,) - print(actual) assert (expected == actual).all() diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 271f6aa747..3c47c6449a 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -122,15 +122,15 @@ void ManagedQuery::set_column_data( query_->set_data_buffer( column_name, (void*)data.data(), column_buffer->data_size()); if (column_buffer->is_var()) { - auto offsets = column_buffer->offsets(); - query_->set_offsets_buffer( - column_name, offsets.data(), offsets.size()); - } - if (column_buffer->is_nullable()) { - auto validity = column_buffer->validity(); - query_->set_validity_buffer( - column_name, validity.data(), validity.size()); - } + auto offsets = column_buffer->offsets(); + query_->set_offsets_buffer( + column_name, offsets.data(), offsets.size()); + } + if (column_buffer->is_nullable()) { + auto validity = column_buffer->validity(); + query_->set_validity_buffer( + column_name, validity.data(), validity.size()); + } } else { switch (column_buffer->type()) { case TILEDB_STRING_ASCII: diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index a46f7f1580..4ffb8326b7 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -285,21 +285,258 @@ void SOMAArray::extend_enumeration( auto enmr = 
ArrayExperimental::get_enumeration( ctx, *arr_, std::string(name)); - ArraySchemaEvolution se(ctx); - if (offsets != nullptr) { - // TODO can be uint64_t - std::vector offset_v( - (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); - auto data_sz = offset_v[num_elems]; - auto offset_ptr = offset_v.data(); - auto offset_sz = num_elems * sizeof(uint64_t); - se.extend_enumeration( - enmr.extend(data, data_sz, offset_ptr, offset_sz)); - } else { - auto data_sz = num_elems * tiledb::impl::type_size(enmr.type()); - se.extend_enumeration(enmr.extend(data, data_sz, nullptr, 0)); + switch (enmr.type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: { + std::vector offsets_v( + (uint32_t*)offsets, (uint32_t*)offsets + num_elems + 1); + std::string data_v( + (char*)data, (char*)data + offsets_v[offsets_v.size() - 1]); + std::vector enums_in_write; + + for (size_t offset_idx = 0; offset_idx < offsets_v.size() - 1; + ++offset_idx) { + auto beg = offsets_v[offset_idx]; + auto sz = offsets_v[offset_idx + 1] - beg; + enums_in_write.push_back(data_v.substr(beg, sz)); + } + + std::vector extend_values; + auto enums_existing = enmr.as_vector(); + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_BOOL: + case TILEDB_INT8: { + std::vector enums_in_write( + (int8_t*)data, (int8_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + 
se.extend_enumeration(enmr.extend( + (void*)extend_values.data(), num_elems, nullptr, 0)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_UINT8: { + std::vector enums_in_write( + (uint8_t*)data, (uint8_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_INT16: { + std::vector enums_in_write( + (int16_t*)data, (int16_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_UINT16: { + std::vector enums_in_write( + (uint16_t*)data, (uint16_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_INT32: { + std::vector enums_in_write( + (int32_t*)data, (int32_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + 
enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_UINT32: { + std::vector enums_in_write( + (uint32_t*)data, (uint32_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_INT64: { + std::vector enums_in_write( + (int64_t*)data, (int64_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_UINT64: { + std::vector enums_in_write( + (uint64_t*)data, (uint64_t*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_FLOAT32: { + std::vector enums_in_write( + (float*)data, (float*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector 
extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + case TILEDB_FLOAT64: { + std::vector enums_in_write( + (double*)data, (double*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), + enums_existing.end(), + enum_val) == enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(ctx); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + break; + } + default: + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: Unsupported TileDB dict datatype: {} ", + tiledb::impl::type_to_str(enmr.type()))); } - se.array_evolve(uri_); } void SOMAArray::set_column_data( diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 834475077c..f3d67075cd 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -400,7 +400,7 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { dict_sch->release = &release_schema; dict_sch->private_data = nullptr; - const int n_buf = strcmp(dict_sch->format, "u") == 0 ? 3 : 2; + const int n_buf = ArrowAdapter::_isvar(dict_sch->format) ? 3 : 2; dict_arr->null_count = 0; dict_arr->offset = 0; dict_arr->n_buffers = n_buf; @@ -424,7 +424,7 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { // returns std::optional where std::nullopt indicates the // column does not contain enumerated values. 
if (enmr->type() == TILEDB_STRING_ASCII or - enmr->type() == TILEDB_STRING_UTF8) { + enmr->type() == TILEDB_STRING_UTF8 or enmr->type() == TILEDB_CHAR) { auto dict_vec = enmr->as_vector(); column->convert_enumeration(); dict_arr->buffers[1] = column->enum_offsets().data(); From 77fb92a8a7ed95d6965b463d4225f66ca4ed5085 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 19 Mar 2024 16:59:42 -0500 Subject: [PATCH 54/70] Order dimensions in index column name order --- apis/python/src/tiledbsoma/_dataframe.py | 7 +++++++ apis/python/tests/test_dataframe.py | 1 - libtiledbsoma/src/utils/arrow_adapter.cc | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 73d7d2e6a1..01cc311960 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -243,6 +243,8 @@ def create( domains = pa.StructArray.from_arrays(domains, names=index_column_names) extents = pa.StructArray.from_arrays(extents, names=index_column_names) + + print(index_column_names) # TODO add as kw args clib.SOMADataFrame.create( @@ -396,6 +398,11 @@ def read( if value_filter is not None: sr.set_condition(QueryCondition(value_filter), handle.schema) + + print("sr.schema:") + print(sr.schema) + print("coords:") + print(coords) self._set_reader_coords(sr, coords) diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 7caafbb62a..e1070a26fe 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -88,7 +88,6 @@ def test_dataframe(tmp_path, arrow_schema): # Read all table = sdf.read().concat() - print(table) assert table.num_rows == 5 assert table.num_columns == 5 assert [e.as_py() for e in list(table["soma_joinid"])] == pydict["soma_joinid"] diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index f3d67075cd..2e703f2c87 100644 --- 
a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -250,6 +250,8 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); + std::map dims; + for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { auto child = arrow_schema->children[sch_idx]; auto type = ArrowAdapter::to_tiledb_format(child->format); @@ -275,7 +277,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( nullptr : extents->children[idx_col_idx]->buffers[1]); - domain.add_dimension(dim); + dims.insert({dim.name(), dim}); } else { Attribute attr(*ctx, child->name, type); @@ -305,6 +307,8 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( } } + for (auto column_name : index_column_names) + domain.add_dimension(dims.at(column_name)); schema.set_domain(domain); schema.check(); From 774f66a730b5d3cfff0f086311f5eb65df958bc0 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 21 Mar 2024 13:00:47 -0500 Subject: [PATCH 55/70] Clean up enum extend code --- apis/python/src/tiledbsoma/_collection.py | 10 +- apis/python/src/tiledbsoma/_tdb_handles.py | 4 +- apis/python/tests/test_update_dataframes.py | 17 +- libtiledbsoma/src/soma/soma_array.cc | 211 +++----------------- libtiledbsoma/src/soma/soma_array.h | 23 ++- libtiledbsoma/src/soma/soma_object.cc | 4 + 6 files changed, 65 insertions(+), 204 deletions(-) diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py index 48a484be77..e110374d3d 100644 --- a/apis/python/src/tiledbsoma/_collection.py +++ b/apis/python/src/tiledbsoma/_collection.py @@ -434,13 +434,9 @@ def __getitem__(self, key: str) -> CollectionElementType: context = self.context timestamp = self.tiledb_timestamp_ms - try: - wrapper = _tdb_handles.open(uri, mode, context, timestamp) - entry.soma = _factory.reify_handle(wrapper) - except SOMAError: - entry.soma = _factory._open_internal( - 
entry.entry.wrapper_type.open, uri, mode, context, timestamp - ) + wrapper = _tdb_handles.open(uri, mode, context, timestamp) + entry.soma = _factory.reify_handle(wrapper) + # Since we just opened this object, we own it and should close it. self._close_stack.enter_context(entry.soma) return cast(CollectionElementType, entry.soma) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 1193f5a88a..7eea80eebd 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -47,7 +47,7 @@ def open( uri: str, mode: options.OpenMode, context: SOMATileDBContext, - timestamp: Optional[OpenTimestamp], + timestamp: Optional[OpenTimestamp] ) -> "Wrapper[RawHandle]": """Determine whether the URI is an array or group, and open it.""" open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write @@ -68,7 +68,7 @@ def open( if not obj_type: raise DoesNotExistError(f"{uri!r} does not exist") - + if obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": diff --git a/apis/python/tests/test_update_dataframes.py b/apis/python/tests/test_update_dataframes.py index 0325a15167..da632ff748 100644 --- a/apis/python/tests/test_update_dataframes.py +++ b/apis/python/tests/test_update_dataframes.py @@ -48,17 +48,18 @@ def test_no_change(adata, readback): new_var = adata.var with tiledbsoma.Experiment.open(output_path, "w") as exp: + print("WRITE BEFORE") tiledbsoma.io.update_obs(exp, new_obs) - tiledbsoma.io.update_var(exp, new_var, "RNA") - assert anndata_dataframe_unmodified(original.obs, adata.obs) - assert anndata_dataframe_unmodified(original.var, adata.var) + # tiledbsoma.io.update_var(exp, new_var, "RNA") + # assert anndata_dataframe_unmodified(original.obs, adata.obs) + # assert anndata_dataframe_unmodified(original.var, adata.var) - with 
tiledbsoma.Experiment.open(output_path) as exp: - o2 = exp.obs.schema - v2 = exp.ms["RNA"].var.schema + # with tiledbsoma.Experiment.open(output_path) as exp: + # o2 = exp.obs.schema + # v2 = exp.ms["RNA"].var.schema - assert o1 == o2 - assert v1 == v2 + # assert o1 == o2 + # assert v1 == v2 @pytest.mark.parametrize("readback", [False, True]) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 4ffb8326b7..061c9e2e43 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -163,6 +163,7 @@ SOMAArray::SOMAArray( void SOMAArray::fill_metadata_cache() { if (arr_->query_type() == TILEDB_WRITE) { + std::cout << "in write path" << std::endl; meta_cache_arr_ = std::make_shared( *ctx_->tiledb_ctx(), uri_, @@ -183,6 +184,9 @@ void SOMAArray::fill_metadata_cache() { MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); + std::cout << "fill_metadata_cache" << std::endl; + std::cout << key << " " << std::string((const char*)value, value_num) + << std::endl; } } @@ -280,10 +284,8 @@ void SOMAArray::extend_enumeration( uint64_t num_elems, const void* data, uint64_t* offsets) { - auto ctx = *ctx_->tiledb_ctx(); - auto enmr = ArrayExperimental::get_enumeration( - ctx, *arr_, std::string(name)); + *ctx_->tiledb_ctx(), *arr_, std::string(name)); switch (enmr.type()) { case TILEDB_STRING_ASCII: @@ -314,7 +316,7 @@ void SOMAArray::extend_enumeration( } if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); } @@ -322,214 +324,43 @@ void SOMAArray::extend_enumeration( } case TILEDB_BOOL: case TILEDB_INT8: { - std::vector enums_in_write( - (int8_t*)data, (int8_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - 
enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend( - (void*)extend_values.data(), num_elems, nullptr, 0)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((int8_t*)data, num_elems, enmr); break; } case TILEDB_UINT8: { - std::vector enums_in_write( - (uint8_t*)data, (uint8_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((uint8_t*)data, num_elems, enmr); break; } case TILEDB_INT16: { - std::vector enums_in_write( - (int16_t*)data, (int16_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((int16_t*)data, num_elems, enmr); break; } case TILEDB_UINT16: { - std::vector enums_in_write( - (uint16_t*)data, (uint16_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - 
ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((uint16_t*)data, num_elems, enmr); break; } case TILEDB_INT32: { - std::vector enums_in_write( - (int32_t*)data, (int32_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((int32_t*)data, num_elems, enmr); break; } case TILEDB_UINT32: { - std::vector enums_in_write( - (uint32_t*)data, (uint32_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((uint32_t*)data, num_elems, enmr); break; } case TILEDB_INT64: { - std::vector enums_in_write( - (int64_t*)data, (int64_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((int64_t*)data, num_elems, enmr); break; } case TILEDB_UINT64: { - std::vector 
enums_in_write( - (uint64_t*)data, (uint64_t*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((uint64_t*)data, num_elems, enmr); break; } case TILEDB_FLOAT32: { - std::vector enums_in_write( - (float*)data, (float*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((float*)data, num_elems, enmr); break; } case TILEDB_FLOAT64: { - std::vector enums_in_write( - (double*)data, (double*)data + num_elems); - auto enums_existing = enmr.as_vector(); - std::vector extend_values; - for (auto enum_val : enums_in_write) { - if (std::find( - enums_existing.begin(), - enums_existing.end(), - enum_val) == enums_existing.end()) { - extend_values.push_back(enum_val); - } - } - - if (extend_values.size() != 0) { - ArraySchemaEvolution se(ctx); - se.extend_enumeration(enmr.extend(extend_values)); - se.array_evolve(uri_); - } + SOMAArray::_extend_value_helper((double*)data, num_elems, enmr); break; } default: @@ -856,6 +687,16 @@ std::optional SOMAArray::get_metadata(const std::string& key) { return std::nullopt; return metadata_[key]; + + // tiledb_datatype_t value_type; + // uint32_t value_num; + // const void* value; + + // meta_cache_arr_->get_metadata(key, 
&value_type, &value_num, &value); + // std::cout << "get_metadata: " << std::string((const char*)value, + // value_num) + // << std::endl; + // return MetadataValue(value_type, value_num, value); } std::map SOMAArray::get_metadata() { diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index f66a2e2d13..1c1c3915c7 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -179,8 +179,7 @@ class SOMAArray : public SOMAObject { , mq_(std::make_unique( other.arr_, other.ctx_->tiledb_ctx(), other.name_)) , arr_(other.arr_) - , first_read_next_(other.first_read_next_) - , submitted_(other.submitted_) { + , first_read_next_(other.first_read_next_) { } SOMAArray(SOMAArray&&) = default; @@ -705,6 +704,26 @@ class SOMAArray : public SOMAObject { //= private non-static //=================================================================== + template + void _extend_value_helper(T* data, uint64_t num_elems, Enumeration enmr) { + std::vector enums_in_write((T*)data, (T*)data + num_elems); + auto enums_existing = enmr.as_vector(); + std::vector extend_values; + for (auto enum_val : enums_in_write) { + if (std::find( + enums_existing.begin(), enums_existing.end(), enum_val) == + enums_existing.end()) { + extend_values.push_back(enum_val); + } + } + + if (extend_values.size() != 0) { + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); + se.extend_enumeration(enmr.extend(extend_values)); + se.array_evolve(uri_); + } + } + // Fills the metadata cache upon opening the array. 
void fill_metadata_cache(); diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index 36614a6ecb..57e9068173 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -29,6 +29,8 @@ std::unique_ptr SOMAObject::open( if (!array_->type().has_value()) throw TileDBSOMAError("SOMAArray has no type info"); + std::cout << "in soma object open" << std::endl; + if (array_->type() == "SOMADataFrame") { return std::make_unique(*array_); } else if (array_->type() == "SOMASparseNDArray") { @@ -68,6 +70,8 @@ const std::optional SOMAObject::type() { *soma_object_type); uint32_t sz = std::get(*soma_object_type); + std::cout << "soma object type is " << std::string(dtype, sz) << std::endl; + return std::string(dtype, sz); } From d46ed004f4dc713f15f4183e6d6aa5f3d225b296 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 21 Mar 2024 14:21:34 -0500 Subject: [PATCH 56/70] Fix more issues with metadata in write mode --- apis/python/src/tiledbsoma/soma_object.cc | 13 +++++++------ apis/python/tests/test_update_dataframes.py | 17 ++++++++--------- libtiledbsoma/src/soma/soma_array.cc | 16 ++-------------- libtiledbsoma/src/soma/soma_array.h | 6 +++++- libtiledbsoma/src/soma/soma_object.cc | 4 ---- 5 files changed, 22 insertions(+), 34 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_object.cc b/apis/python/src/tiledbsoma/soma_object.cc index 5ab4c1140e..e9d821d0d2 100644 --- a/apis/python/src/tiledbsoma/soma_object.cc +++ b/apis/python/src/tiledbsoma/soma_object.cc @@ -59,17 +59,18 @@ void load_soma_object(py::module& m) { -> py::object { try { auto obj = SOMAObject::open(uri, mode, context, timestamp); - if (obj->type() == "SOMADataFrame") + auto soma_type = obj->type(); + if (soma_type == "SOMADataFrame") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMASparseNDArray") + else if (soma_type == "SOMASparseNDArray") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == 
"SOMADenseNDArray") + else if (soma_type == "SOMADenseNDArray") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMACollection") + else if (soma_type == "SOMACollection") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAExperiment") + else if (soma_type == "SOMAExperiment") return py::cast(dynamic_cast(*obj)); - else if (obj->type() == "SOMAMeasurement") + else if (soma_type == "SOMAMeasurement") return py::cast(dynamic_cast(*obj)); return py::none(); } catch (...) { diff --git a/apis/python/tests/test_update_dataframes.py b/apis/python/tests/test_update_dataframes.py index da632ff748..0325a15167 100644 --- a/apis/python/tests/test_update_dataframes.py +++ b/apis/python/tests/test_update_dataframes.py @@ -48,18 +48,17 @@ def test_no_change(adata, readback): new_var = adata.var with tiledbsoma.Experiment.open(output_path, "w") as exp: - print("WRITE BEFORE") tiledbsoma.io.update_obs(exp, new_obs) - # tiledbsoma.io.update_var(exp, new_var, "RNA") - # assert anndata_dataframe_unmodified(original.obs, adata.obs) - # assert anndata_dataframe_unmodified(original.var, adata.var) + tiledbsoma.io.update_var(exp, new_var, "RNA") + assert anndata_dataframe_unmodified(original.obs, adata.obs) + assert anndata_dataframe_unmodified(original.var, adata.var) - # with tiledbsoma.Experiment.open(output_path) as exp: - # o2 = exp.obs.schema - # v2 = exp.ms["RNA"].var.schema + with tiledbsoma.Experiment.open(output_path) as exp: + o2 = exp.obs.schema + v2 = exp.ms["RNA"].var.schema - # assert o1 == o2 - # assert v1 == v2 + assert o1 == o2 + assert v1 == v2 @pytest.mark.parametrize("readback", [False, True]) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 061c9e2e43..e1c7dafe2a 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -163,7 +163,6 @@ SOMAArray::SOMAArray( void SOMAArray::fill_metadata_cache() { if (arr_->query_type() == TILEDB_WRITE) { - std::cout << 
"in write path" << std::endl; meta_cache_arr_ = std::make_shared( *ctx_->tiledb_ctx(), uri_, @@ -174,6 +173,8 @@ void SOMAArray::fill_metadata_cache() { meta_cache_arr_ = arr_; } + metadata_.clear(); + for (uint64_t idx = 0; idx < meta_cache_arr_->metadata_num(); ++idx) { std::string key; tiledb_datatype_t value_type; @@ -184,9 +185,6 @@ void SOMAArray::fill_metadata_cache() { MetadataValue mdval(value_type, value_num, value); std::pair mdpair(key, mdval); metadata_.insert(mdpair); - std::cout << "fill_metadata_cache" << std::endl; - std::cout << key << " " << std::string((const char*)value, value_num) - << std::endl; } } @@ -687,16 +685,6 @@ std::optional SOMAArray::get_metadata(const std::string& key) { return std::nullopt; return metadata_[key]; - - // tiledb_datatype_t value_type; - // uint32_t value_num; - // const void* value; - - // meta_cache_arr_->get_metadata(key, &value_type, &value_num, &value); - // std::cout << "get_metadata: " << std::string((const char*)value, - // value_num) - // << std::endl; - // return MetadataValue(value_type, value_num, value); } std::map SOMAArray::get_metadata() { diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 1c1c3915c7..105a9ceff2 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -179,7 +179,11 @@ class SOMAArray : public SOMAObject { , mq_(std::make_unique( other.arr_, other.ctx_->tiledb_ctx(), other.name_)) , arr_(other.arr_) - , first_read_next_(other.first_read_next_) { + , meta_cache_arr_(other.meta_cache_arr_) + , first_read_next_(other.first_read_next_) + , submitted_(other.submitted_) + , array_buffer_(other.array_buffer_) { + fill_metadata_cache(); } SOMAArray(SOMAArray&&) = default; diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index 57e9068173..36614a6ecb 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -29,8 +29,6 @@ 
std::unique_ptr SOMAObject::open( if (!array_->type().has_value()) throw TileDBSOMAError("SOMAArray has no type info"); - std::cout << "in soma object open" << std::endl; - if (array_->type() == "SOMADataFrame") { return std::make_unique(*array_); } else if (array_->type() == "SOMASparseNDArray") { @@ -70,8 +68,6 @@ const std::optional SOMAObject::type() { *soma_object_type); uint32_t sz = std::get(*soma_object_type); - std::cout << "soma object type is " << std::string(dtype, sz) << std::endl; - return std::string(dtype, sz); } From 14e4846ae07fb43b8448a829ee4c187562bde390 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 21 Mar 2024 14:24:12 -0500 Subject: [PATCH 57/70] Correct metadata delete --- apis/python/src/tiledbsoma/_tdb_handles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 7eea80eebd..fb635edaa3 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -567,7 +567,7 @@ def _write(self) -> None: else: set_metadata(key, np.array([val])) if mod is _DictMod.DELETED: - self.owner._handle.delete_metadata() + self.owner._handle.delete_metadata(key) else: meta = self.owner.writer.meta for key, mod in self._mods.items(): From 0b029d7094b214347dc3f791680f4e61d8e77b63 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 25 Mar 2024 09:38:41 -0500 Subject: [PATCH 58/70] Read in config --- apis/python/src/tiledbsoma/_dataframe.py | 9 ++------- libtiledbsoma/src/soma/soma_dataframe.cc | 4 +++- libtiledbsoma/src/utils/arrow_adapter.cc | 8 +++++++- libtiledbsoma/src/utils/arrow_adapter.h | 6 +++++- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 01cc311960..37a59fb050 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -244,8 +244,8 @@ def 
create( domains = pa.StructArray.from_arrays(domains, names=index_column_names) extents = pa.StructArray.from_arrays(extents, names=index_column_names) - print(index_column_names) - + print(platform_config) + # TODO add as kw args clib.SOMADataFrame.create( uri, @@ -398,11 +398,6 @@ def read( if value_filter is not None: sr.set_condition(QueryCondition(value_filter), handle.schema) - - print("sr.schema:") - print(sr.schema) - print("coords:") - print(coords) self._set_reader_coords(sr, coords) diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 6a085ab1a7..9118ed5f12 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -45,8 +45,10 @@ void SOMADataFrame::create( ColumnIndexInfo index_columns, std::shared_ptr ctx, std::optional> timestamp) { + PlatformConfig platform_cfg = { + {"tiledb", {{"create", {{"allow_duplicates", false}}}}}}; auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), schema, index_columns); + ctx->tiledb_ctx(), schema, index_columns, platform_cfg); SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); } diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 2e703f2c87..ef343969bb 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -244,9 +244,15 @@ std::pair ArrowAdapter::_get_data_and_length( ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, std::shared_ptr arrow_schema, - ColumnIndexInfo index_column_info) { + ColumnIndexInfo index_column_info, + PlatformConfig platform_config) { auto [index_column_names, domains, extents] = index_column_info; + std::cout << (platform_config["tiledb"]["create"]["allows_duplicates"] ? 
+ "yes" : + "No") + << std::endl; + ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index a3c09b7110..35785c3d0b 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -41,6 +41,9 @@ using ColumnIndexInfo = std::tuple< std::shared_ptr // tile extent >; +using PlatformConfig = + std::map>>; + class ArrowAdapter { public: static void release_schema(struct ArrowSchema* schema); @@ -70,7 +73,8 @@ class ArrowAdapter { static ArraySchema tiledb_schema_from_arrow_schema( std::shared_ptr ctx, std::shared_ptr arrow_schema, - ColumnIndexInfo index_column_info); + ColumnIndexInfo index_column_info, + PlatformConfig platform_config); /** * @brief Get Arrow format string from TileDB datatype. From 5417c17e47f3838b03b9c7388ac5000ead5ffde8 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 26 Mar 2024 10:15:50 -0500 Subject: [PATCH 59/70] WIP add platform config in C++ --- apis/python/src/tiledbsoma/_dataframe.py | 19 +++++- apis/python/src/tiledbsoma/pytiledbsoma.cc | 20 +++++++ apis/python/src/tiledbsoma/soma_dataframe.cc | 6 +- libtiledbsoma/src/soma/soma_collection.cc | 10 ++-- libtiledbsoma/src/soma/soma_collection.h | 6 +- libtiledbsoma/src/soma/soma_dataframe.cc | 5 +- libtiledbsoma/src/soma/soma_dataframe.h | 1 + libtiledbsoma/src/soma/soma_experiment.cc | 8 ++- libtiledbsoma/src/soma/soma_experiment.h | 1 + libtiledbsoma/src/soma/soma_measurement.cc | 8 ++- libtiledbsoma/src/soma/soma_measurement.h | 1 + libtiledbsoma/src/utils/arrow_adapter.cc | 62 ++++++++++++++++++-- libtiledbsoma/src/utils/arrow_adapter.h | 20 ++++++- libtiledbsoma/test/unit_soma_collection.cc | 4 +- libtiledbsoma/test/unit_soma_dataframe.cc | 2 +- 15 files changed, 147 insertions(+), 26 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 37a59fb050..4692d2f665 100644 --- 
a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -244,7 +244,23 @@ def create( domains = pa.StructArray.from_arrays(domains, names=index_column_names) extents = pa.StructArray.from_arrays(extents, names=index_column_names) - print(platform_config) + plt_cfg = None + if platform_config: + ops = TileDBCreateOptions.from_platform_config(platform_config) + plt_cfg = clib.PlatformConfig() + plt_cfg.dataframe_dim_zstd_level = ops.dataframe_dim_zstd_level + plt_cfg.sparse_nd_array_dim_zstd_level = ops.sparse_nd_array_dim_zstd_level + plt_cfg.write_X_chunked = ops.write_X_chunked + plt_cfg.goal_chunk_nnz = ops.goal_chunk_nnz + plt_cfg.capacity = ops.capacity + if ops.offsets_filters: + plt_cfg.offsets_filters = [info["_type"] for info in ops.offsets_filters] + if ops.validity_filters: + plt_cfg.validity_filters = [info["_type"] for info in ops.validity_filters] + plt_cfg.allows_duplicates = ops.allows_duplicates + plt_cfg.tile_order = ops.tile_order + plt_cfg.cell_order = ops.cell_order + plt_cfg.consolidate_and_vacuum = ops.consolidate_and_vacuum # TODO add as kw args clib.SOMADataFrame.create( @@ -254,6 +270,7 @@ def create( domains, extents, context.native_context, + plt_cfg, ) handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index f99cf25621..1c980c4166 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -94,6 +94,26 @@ PYBIND11_MODULE(pytiledbsoma, m) { }, "Print TileDB internal statistics. 
Lifecycle: experimental."); + py::class_(m, "PlatformConfig") + .def(py::init<>()) + .def_readwrite( + "dataframe_dim_zstd_level", + &PlatformConfig::dataframe_dim_zstd_level) + .def_readwrite( + "sparse_nd_array_dim_zstd_level", + &PlatformConfig::sparse_nd_array_dim_zstd_level) + .def_readwrite("write_X_chunked", &PlatformConfig::write_X_chunked) + .def_readwrite("goal_chunk_nnz", &PlatformConfig::goal_chunk_nnz) + .def_readwrite("remote_cap_nbytes", &PlatformConfig::remote_cap_nbytes) + .def_readwrite("capacity", &PlatformConfig::capacity) + .def_readwrite("offsets_filters", &PlatformConfig::offsets_filters) + .def_readwrite("validity_filters", &PlatformConfig::validity_filters) + .def_readwrite("allows_duplicates", &PlatformConfig::allows_duplicates) + .def_readwrite("tile_order", &PlatformConfig::tile_order) + .def_readwrite("cell_order", &PlatformConfig::cell_order) + .def_readwrite( + "consolidate_and_vacuum", &PlatformConfig::consolidate_and_vacuum); + load_soma_context(m); load_soma_object(m); load_soma_array(m); diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 45b565e402..7b7b6895ab 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -56,7 +56,8 @@ void load_soma_dataframe(py::module& m) { std::vector index_columns_names, py::object py_domains, py::object py_extents, - std::shared_ptr context) { + std::shared_ptr context, + std::optional platform_config) { ArrowSchema schema; uintptr_t schema_ptr = (uintptr_t)(&schema); py_schema.attr("_export_to_c")(schema_ptr); @@ -94,7 +95,8 @@ void load_soma_dataframe(py::module& m) { index_columns_names, std::make_shared(domains), std::make_shared(extents)), - context); + context, + platform_config); } catch (const std::out_of_range& e) { throw py::type_error(e.what()); } catch (const std::exception& e) { diff --git a/libtiledbsoma/src/soma/soma_collection.cc 
b/libtiledbsoma/src/soma/soma_collection.cc index e3a9b27528..dff721027f 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -111,8 +111,9 @@ std::shared_ptr SOMACollection::add_new_experiment( URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ColumnIndexInfo index_columns) { - SOMAExperiment::create(uri, schema, index_columns, ctx); + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMAExperiment::create(uri, schema, index_columns, ctx, platform_config); std::shared_ptr member = SOMAExperiment::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -141,8 +142,9 @@ std::shared_ptr SOMACollection::add_new_dataframe( URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ColumnIndexInfo index_columns) { - SOMADataFrame::create(uri, schema, index_columns, ctx); + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMADataFrame::create(uri, schema, index_columns, ctx, platform_config); std::shared_ptr member = SOMADataFrame::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index d6b770f30c..f022752ef5 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -158,7 +158,8 @@ class SOMACollection : public SOMAGroup { URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ColumnIndexInfo index_columns); + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMAMeasurement to the SOMACollection. 
@@ -190,7 +191,8 @@ class SOMACollection : public SOMAGroup { URIType uri_type, std::shared_ptr ctx, std::shared_ptr schema, - ColumnIndexInfo index_columns); + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMADenseNDArray to the SOMACollection. diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 9118ed5f12..6058b81968 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -44,11 +44,10 @@ void SOMADataFrame::create( std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional> timestamp) { - PlatformConfig platform_cfg = { - {"tiledb", {{"create", {{"allow_duplicates", false}}}}}}; auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), schema, index_columns, platform_cfg); + ctx->tiledb_ctx(), schema, index_columns, platform_config); SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); } diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 62dd425fd9..6f163d9f2c 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -61,6 +61,7 @@ class SOMADataFrame : public SOMAArray { std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional> timestamp = std::nullopt); /** diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index f6cd186920..2c42aea68a 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -46,12 +46,18 @@ void SOMAExperiment::create( std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional timestamp) { std::string exp_uri(uri); SOMAGroup::create(ctx, exp_uri, 
"SOMAExperiment", timestamp); SOMADataFrame::create( - exp_uri + "/obs", schema, index_columns, ctx, timestamp); + exp_uri + "/obs", + schema, + index_columns, + ctx, + platform_config, + timestamp); SOMACollection::create(exp_uri + "/ms", ctx, timestamp); auto name = std::string(std::filesystem::path(uri).filename()); diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index c382238ed6..14b918a269 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -59,6 +59,7 @@ class SOMAExperiment : public SOMACollection { std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional timestamp = std::nullopt); /** diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 4f462485d4..d6605e7383 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -46,12 +46,18 @@ void SOMAMeasurement::create( std::shared_ptr schema, ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional timestamp) { std::string exp_uri(uri); SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement", timestamp); SOMADataFrame::create( - exp_uri + "/var", schema, index_columns, ctx, timestamp); + exp_uri + "/var", + schema, + index_columns, + ctx, + platform_config, + timestamp); SOMACollection::create(exp_uri + "/X", ctx, timestamp); SOMACollection::create(exp_uri + "/obsm", ctx, timestamp); SOMACollection::create(exp_uri + "/obsp", ctx, timestamp); diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index df2791bd77..591c057751 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -60,6 +60,7 @@ class SOMAMeasurement : public SOMACollection { std::shared_ptr schema, ColumnIndexInfo index_columns, 
std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional timestamp = std::nullopt); /** diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index ef343969bb..41602aba9c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -245,17 +245,67 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, std::shared_ptr arrow_schema, ColumnIndexInfo index_column_info, - PlatformConfig platform_config) { + std::optional platform_config) { auto [index_column_names, domains, extents] = index_column_info; - std::cout << (platform_config["tiledb"]["create"]["allows_duplicates"] ? - "yes" : - "No") - << std::endl; - ArraySchema schema(*ctx, TILEDB_SPARSE); Domain domain(*ctx); + if (platform_config) { + std::map convert_filter = { + {"GzipFilter", TILEDB_FILTER_GZIP}, + {"ZstdFilter", TILEDB_FILTER_ZSTD}, + {"LZ4Filter", TILEDB_FILTER_LZ4}, + {"Bzip2Filter", TILEDB_FILTER_BZIP2}, + {"RleFilter", TILEDB_FILTER_RLE}, + {"DeltaFilter", TILEDB_FILTER_DELTA}, + {"DoubleDeltaFilter", TILEDB_FILTER_DOUBLE_DELTA}, + {"BitWidthReductionFilter", TILEDB_FILTER_BIT_WIDTH_REDUCTION}, + {"BitShuffleFilter", TILEDB_FILTER_BITSHUFFLE}, + {"ByteShuffleFilter", TILEDB_FILTER_BYTESHUFFLE}, + {"PositiveDeltaFilter", TILEDB_FILTER_POSITIVE_DELTA}, + {"ChecksumMD5Filter", TILEDB_FILTER_CHECKSUM_MD5}, + {"ChecksumSHA256Filter", TILEDB_FILTER_CHECKSUM_SHA256}, + {"DictionaryFilter", TILEDB_FILTER_DICTIONARY}, + {"FloatScaleFilter", TILEDB_FILTER_SCALE_FLOAT}, + {"XORFilter", TILEDB_FILTER_XOR}, + {"WebpFilter", TILEDB_FILTER_WEBP}, + {"NoOpFilter", TILEDB_FILTER_NONE}, + }; + + schema.set_capacity(platform_config->capacity); + + if (platform_config->offsets_filters.size() != 0) { + FilterList offset_filter_list(*ctx); + for (auto offset : platform_config->offsets_filters) { + offset_filter_list.add_filter( + Filter(*ctx, convert_filter[offset])); + } + 
schema.set_offsets_filter_list(offset_filter_list); + } + + if (platform_config->validity_filters.size() != 0) { + FilterList validity_filter_list(*ctx); + for (auto validity : platform_config->validity_filters) { + validity_filter_list.add_filter( + Filter(*ctx, convert_filter[validity])); + } + schema.set_validity_filter_list(validity_filter_list); + } + + schema.set_allows_dups(platform_config->allows_duplicates); + + if (platform_config->tile_order) + schema.set_tile_order( + platform_config->tile_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + + if (platform_config->cell_order) + schema.set_cell_order( + platform_config->cell_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + } + std::map dims; for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 35785c3d0b..e22c75b32c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -41,8 +41,22 @@ using ColumnIndexInfo = std::tuple< std::shared_ptr // tile extent >; -using PlatformConfig = - std::map>>; +class PlatformConfig { + public: + uint64_t dataframe_dim_zstd_level = 3; + uint64_t sparse_nd_array_dim_zstd_level = 3; + bool write_X_chunked = true; + uint64_t goal_chunk_nnz = 100000000; + uint64_t remote_cap_nbytes = 2400000000; + uint64_t capacity = 100000; + std::vector offsets_filters = { + "DoubleDeltaFilter", "BitWidthReductionFilter", "ZstdFilter"}; + std::vector validity_filters; + bool allows_duplicates = false; + std::optional tile_order = std::nullopt; + std::optional cell_order = std::nullopt; + bool consolidate_and_vacuum = false; +}; class ArrowAdapter { public: @@ -74,7 +88,7 @@ class ArrowAdapter { std::shared_ptr ctx, std::shared_ptr arrow_schema, ColumnIndexInfo index_column_info, - PlatformConfig platform_config); + std::optional platform_config); /** * @brief Get Arrow format string from TileDB datatype. 
diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index c4f0d2d9fa..914e263c70 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -262,7 +262,7 @@ TEST_CASE("SOMAExperiment: metadata") { std::string uri = "mem://unit-test-experiment"; auto [schema, index_columns] = helper::create_arrow_schema(); SOMAExperiment::create( - uri, schema, index_columns, ctx, TimestampRange(0, 2)); + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); auto soma_experiment = SOMAExperiment::open( uri, OpenMode::write, ctx, std::pair(1, 1)); @@ -316,7 +316,7 @@ TEST_CASE("SOMAMeasurement: metadata") { std::string uri = "mem://unit-test-measurement"; auto [schema, index_columns] = helper::create_arrow_schema(); SOMAMeasurement::create( - uri, schema, index_columns, ctx, TimestampRange(0, 2)); + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); auto soma_measurement = SOMAMeasurement::open( uri, OpenMode::write, ctx, std::pair(1, 1)); diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 98d63c0041..39bceda87c 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -81,7 +81,7 @@ TEST_CASE("SOMADataFrame: metadata") { std::string uri = "mem://unit-test-collection"; auto [schema, index_columns] = helper::create_arrow_schema(); SOMADataFrame::create( - uri, schema, index_columns, ctx, TimestampRange(0, 2)); + uri, schema, index_columns, ctx, std::nullopt, TimestampRange(0, 2)); auto soma_dataframe = SOMADataFrame::open( uri, From 2bb57f1bf7730ed062695dd6b9f5d0b9bf5f6270 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 26 Mar 2024 11:53:07 -0500 Subject: [PATCH 60/70] WIP --- apis/python/src/tiledbsoma/_dataframe.py | 91 +----------------------- libtiledbsoma/src/soma/managed_query.cc | 1 + 2 files changed, 4 insertions(+), 88 deletions(-) diff 
--git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4692d2f665..c3bdf053c6 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -454,94 +454,6 @@ def write( """ _util.check_type("values", values, (pa.Table,)) - # dim_cols_map: Dict[str, pd.DataFrame] = {} - # attr_cols_map: Dict[str, pd.DataFrame] = {} - # dim_names_set = self.index_column_names - # n = None - - # for col_info in values.schema: - # name = col_info.name - # col = values.column(name).combine_chunks() - # n = len(col) - - # if self._handle.schema.has_attr(name): - # attr = self._handle.schema.attr(name) - - # # Add the enumeration values to the TileDB Array from ArrowArray - # if attr.enum_label is not None: - # if not pa.types.is_dictionary(col_info.type): - # raise ValueError( - # "Expected dictionary type for enumerated attribute " - # f"{name} but saw {col.type}" - # ) - - # enmr = self._handle.enum(attr.name) - - # # get new enumeration values by taking the set difference - # # while maintaining ordering - # update_vals = np.setdiff1d( - # col.dictionary, enmr.values(), assume_unique=True - # ) - - # index_capacity_current = len(enmr.values()) + len(update_vals) - # index_capacity_max = np.iinfo( - # col_info.type.index_type.to_pandas_dtype() - # ).max - # if index_capacity_max < index_capacity_current: - # raise ValueError( - # f"Too many enumeration values ({index_capacity_current}) " - # "for index type {col_info.type.index_type}" - # ) - - # # only extend if there are new values - # if len(update_vals) != 0: - # se = tiledb.ArraySchemaEvolution(self.context.tiledb_ctx) - # if np.issubdtype(enmr.dtype.type, np.str_): - # extend_vals = np.array(update_vals, "U") - # elif np.issubdtype(enmr.dtype.type, np.bytes_): - # extend_vals = np.array(update_vals, "S") - # else: - # extend_vals = np.array(update_vals, enmr.dtype) - # new_enmr = enmr.extend(extend_vals) - # df = pd.Categorical(col.to_pandas(), 
new_enmr.values()) - # col = pa.DictionaryArray.from_pandas(df) - # se.extend_enumeration(new_enmr) - # se.array_evolve(uri=self.uri) - - # cols_map = dim_cols_map if name in dim_names_set else attr_cols_map - # schema = self._handle.schema - # if pa.types.is_dictionary(col.type): - # if ( - # name not in dim_names_set - # and schema.attr(name).enum_label is not None - # ): - # cols_map[name] = col.indices.to_pandas() - # else: - # cols_map[name] = col - - # else: - # if name not in dim_names_set: - # if schema.attr(name).enum_label is not None: - # raise ValueError( - # f"Categorical column {name} must be presented with categorical data" - # ) - - # cols_map[name] = col.to_pandas() - - # if n is None: - # raise ValueError(f"did not find any column names in {values.schema.names}") - - # We need to produce the dim cols in the same order as they're present in the TileDB schema - # (tracked by self.index_column_names). This is important in the multi-index case. Suppose - # the Arrow schema has two index columns in the order "burger" and "meister", and suppose - # the user set index_column_names = ["meister", "burger"] when creating the TileDB schema. - # Then the above for-loop over the Arrow schema will find the former ordering, but for the - # ``writer[dims] = attrs`` below we must have dims with the latter ordering. 
- # values = values.cast(self.schema) - # target_schema = pa.schema(self.schema.field(f.name) for f in values.schema) - # values = values.cast(target_schema) - # print(values) - target_schema = [] for input_field in values.schema: target_field = self.schema.field(input_field.name) @@ -555,6 +467,9 @@ def write( target_schema.append(target_field) values = values.cast(pa.schema(target_schema, values.schema.metadata)) + print("HELLLLOOOOOOOOOOOOOOOOO") + print() + for batch in values.to_batches(): self._handle.write(batch) diff --git a/libtiledbsoma/src/soma/managed_query.cc b/libtiledbsoma/src/soma/managed_query.cc index 3c47c6449a..b0d18f2f45 100644 --- a/libtiledbsoma/src/soma/managed_query.cc +++ b/libtiledbsoma/src/soma/managed_query.cc @@ -272,6 +272,7 @@ void ManagedQuery::setup_read() { void ManagedQuery::submit_write() { query_->submit(); + query_->finalize(); } void ManagedQuery::submit_read() { From 67a0b54617714552ece6d6e8e3d72a9e33fa3c79 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 27 Mar 2024 16:54:40 -0500 Subject: [PATCH 61/70] Fix writes when using slice of arrow table --- apis/python/src/tiledbsoma/_dataframe.py | 8 ++++---- apis/python/src/tiledbsoma/io/ingest.py | 1 - apis/python/src/tiledbsoma/soma_array.cc | 18 +++++++++++++++++- apis/python/tests/test_io.py | 1 + libtiledbsoma/src/soma/soma_array.cc | 3 +-- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index c3bdf053c6..a5e1dacb55 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -455,6 +455,8 @@ def write( _util.check_type("values", values, (pa.Table,)) target_schema = [] + print("self.schema") + print(self.schema) for input_field in values.schema: target_field = self.schema.field(input_field.name) @@ -465,11 +467,9 @@ def write( target_schema.append(target_field.with_type(pa.uint8())) else: target_schema.append(target_field) + # 
print(values) values = values.cast(pa.schema(target_schema, values.schema.metadata)) - - print("HELLLLOOOOOOOOOOOOOOOOO") - print() - + for batch in values.to_batches(): self._handle.write(batch) diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 48e2906ac0..a05a9385be 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1219,7 +1219,6 @@ def _write_arrow_table( ) handle.write(arrow_table) - def _write_dataframe( df_uri: str, df: pd.DataFrame, diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index c81c945a20..f81364b53d 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -91,9 +91,25 @@ void write(SOMAArray& array, py::handle py_batch) { } } + auto np = py::module::import("numpy"); + auto table_offset = arr_->offset; + auto data_size = tiledb::impl::type_size(ArrowAdapter::to_tiledb_format(sch_->format)); + + if(offsets){ + offsets += table_offset; + } + if(validities){ + validities += table_offset; + } + array.set_column_data( - sch_->name, arr_->length, data, offsets, validities); + sch_->name, + arr_->length, + (char*)data + table_offset * data_size, + offsets, + nullptr); } + try { array.write(); } catch (const std::exception& e) { diff --git a/apis/python/tests/test_io.py b/apis/python/tests/test_io.py index e696ffe927..61a5861952 100644 --- a/apis/python/tests/test_io.py +++ b/apis/python/tests/test_io.py @@ -171,3 +171,4 @@ def test_write_arrow_table(tmp_path, num_rows, cap_nbytes): with soma.DataFrame.open(uri) as sdf: pdf = sdf.read().concat().to_pandas() assert list(pdf["foo"]) == pydict["foo"] + assert list(pdf["bar"]) == pydict["bar"] diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e1c7dafe2a..e0c6b5a58b 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -404,11 
+404,10 @@ void SOMAArray::write() { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); } - mq_->submit_write(); mq_->reset(); - array_buffer_ = nullptr; + // array_buffer_ = nullptr; } uint64_t SOMAArray::nnz() { From 6ce288ae1532c4b34ec8c50e394e5fcffeed0a7b Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 28 Mar 2024 14:17:38 -0500 Subject: [PATCH 62/70] WIP wrong: do not use capacity but actual max --- apis/python/src/tiledbsoma/_dataframe.py | 3 -- apis/python/src/tiledbsoma/soma_array.cc | 7 +-- libtiledbsoma/src/soma/soma_array.cc | 55 +++++++++++++++++++----- libtiledbsoma/src/soma/soma_array.h | 27 +++++++++++- 4 files changed, 75 insertions(+), 17 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index a5e1dacb55..d2d8219274 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -455,8 +455,6 @@ def write( _util.check_type("values", values, (pa.Table,)) target_schema = [] - print("self.schema") - print(self.schema) for input_field in values.schema: target_field = self.schema.field(input_field.name) @@ -467,7 +465,6 @@ def write( target_schema.append(target_field.with_type(pa.uint8())) else: target_schema.append(target_field) - # print(values) values = values.cast(pa.schema(target_schema, values.schema.metadata)) for batch in values.to_batches(): diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index f81364b53d..1d587ecde7 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -93,12 +93,13 @@ void write(SOMAArray& array, py::handle py_batch) { auto np = py::module::import("numpy"); auto table_offset = arr_->offset; - auto data_size = tiledb::impl::type_size(ArrowAdapter::to_tiledb_format(sch_->format)); + auto data_size = tiledb::impl::type_size( + 
ArrowAdapter::to_tiledb_format(sch_->format)); - if(offsets){ + if (offsets) { offsets += table_offset; } - if(validities){ + if (validities) { validities += table_offset; } diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e0c6b5a58b..5552c41e34 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -285,6 +285,9 @@ void SOMAArray::extend_enumeration( auto enmr = ArrayExperimental::get_enumeration( *ctx_->tiledb_ctx(), *arr_, std::string(name)); + auto index_dtype_width = + tiledb_schema()->attribute(std::string(name)).type(); + switch (enmr.type()) { case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: @@ -314,6 +317,28 @@ void SOMAArray::extend_enumeration( } if (extend_values.size() != 0) { + // Check that we extend the enumeration values without + // overflowing + uint64_t max_capacity; + if (index_dtype_width == 1) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 2) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 4) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 8) { + max_capacity = std::numeric_limits::max(); + } else { + throw TileDBSOMAError( + "Saw invalid size for integer when extending enums"); + } + + auto free_capacity = max_capacity - enums_existing.size(); + if (free_capacity < extend_values.size()) { + throw TileDBSOMAError( + "Cannot extend enumeration; reached maximum capacity"); + } + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); @@ -322,43 +347,53 @@ void SOMAArray::extend_enumeration( } case TILEDB_BOOL: case TILEDB_INT8: { - SOMAArray::_extend_value_helper((int8_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (int8_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_UINT8: { - SOMAArray::_extend_value_helper((uint8_t*)data, num_elems, enmr); + 
SOMAArray::_extend_value_helper( + (uint8_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_INT16: { - SOMAArray::_extend_value_helper((int16_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (int16_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_UINT16: { - SOMAArray::_extend_value_helper((uint16_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (uint16_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_INT32: { - SOMAArray::_extend_value_helper((int32_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (int32_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_UINT32: { - SOMAArray::_extend_value_helper((uint32_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (uint32_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_INT64: { - SOMAArray::_extend_value_helper((int64_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (int64_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_UINT64: { - SOMAArray::_extend_value_helper((uint64_t*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (uint64_t*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_FLOAT32: { - SOMAArray::_extend_value_helper((float*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (float*)data, num_elems, enmr, index_dtype_width); break; } case TILEDB_FLOAT64: { - SOMAArray::_extend_value_helper((double*)data, num_elems, enmr); + SOMAArray::_extend_value_helper( + (double*)data, num_elems, enmr, index_dtype_width); break; } default: diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 105a9ceff2..f7646b6486 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -709,7 +709,11 @@ class SOMAArray : public SOMAObject { //=================================================================== template - void _extend_value_helper(T* data, 
uint64_t num_elems, Enumeration enmr) { + void _extend_value_helper( + T* data, + uint64_t num_elems, + Enumeration enmr, + uint64_t index_dtype_width) { std::vector enums_in_write((T*)data, (T*)data + num_elems); auto enums_existing = enmr.as_vector(); std::vector extend_values; @@ -722,6 +726,27 @@ class SOMAArray : public SOMAObject { } if (extend_values.size() != 0) { + // Check that we extend the enumeration values without overflowing + uint64_t max_capacity; + if (index_dtype_width == 1) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 2) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 4) { + max_capacity = std::numeric_limits::max(); + } else if (index_dtype_width == 8) { + max_capacity = std::numeric_limits::max(); + } else { + throw TileDBSOMAError( + "Saw invalid size for integer when extending enums"); + } + + auto free_capacity = max_capacity - enums_existing.size(); + if (free_capacity < extend_values.size()) { + throw TileDBSOMAError( + "Cannot extend enumeration; reached maximum capacity"); + } + ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); From f80464488d9c0b3efbc9fa3e4e5256b20fcbfabd Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 28 Mar 2024 23:36:59 -0500 Subject: [PATCH 63/70] WIP do not extend enumerations past limit for index dtype --- apis/python/tests/test_dataframe.py | 4 +- libtiledbsoma/src/soma/soma_array.cc | 67 +++++++++++++++++----------- libtiledbsoma/src/soma/soma_array.h | 20 +-------- 3 files changed, 44 insertions(+), 47 deletions(-) diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index e1070a26fe..dbc027087f 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1334,7 +1334,7 @@ def test_enum_extend_past_numerical_limit(tmp_path): soma.DataFrame.create(uri, schema=schema).close() n_elem = 132 - n_cats = 
128 + n_cats = 127 df1 = pd.DataFrame( { "soma_joinid": pd.Series(np.arange(n_elem), dtype=np.int64), @@ -1361,7 +1361,7 @@ def test_enum_extend_past_numerical_limit(tmp_path): # cannot add additional categories as already maxed out earlier tbl = pa.Table.from_pandas(df2, preserve_index=False) - with pytest.raises(ValueError): + with pytest.raises(soma.SOMAError): with soma.open(uri, mode="w") as A: A.write(tbl) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 5552c41e34..e8b43b2c05 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -285,8 +285,37 @@ void SOMAArray::extend_enumeration( auto enmr = ArrayExperimental::get_enumeration( *ctx_->tiledb_ctx(), *arr_, std::string(name)); - auto index_dtype_width = - tiledb_schema()->attribute(std::string(name)).type(); + uint64_t max_capacity; + switch (tiledb_schema()->attribute(std::string(name)).type()) { + case TILEDB_INT8: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT8: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT16: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT16: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT32: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT32: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_INT64: + max_capacity = std::numeric_limits::max(); + break; + case TILEDB_UINT64: + max_capacity = std::numeric_limits::max(); + break; + default: + throw TileDBSOMAError( + "Saw invalid enumeration index type when trying to extend " + "enumeration"); + } switch (enmr.type()) { case TILEDB_STRING_ASCII: @@ -319,20 +348,6 @@ void SOMAArray::extend_enumeration( if (extend_values.size() != 0) { // Check that we extend the enumeration values without // overflowing - uint64_t max_capacity; - if (index_dtype_width == 1) { - max_capacity = std::numeric_limits::max(); - } else if 
(index_dtype_width == 2) { - max_capacity = std::numeric_limits::max(); - } else if (index_dtype_width == 4) { - max_capacity = std::numeric_limits::max(); - } else if (index_dtype_width == 8) { - max_capacity = std::numeric_limits::max(); - } else { - throw TileDBSOMAError( - "Saw invalid size for integer when extending enums"); - } - auto free_capacity = max_capacity - enums_existing.size(); if (free_capacity < extend_values.size()) { throw TileDBSOMAError( @@ -348,52 +363,52 @@ void SOMAArray::extend_enumeration( case TILEDB_BOOL: case TILEDB_INT8: { SOMAArray::_extend_value_helper( - (int8_t*)data, num_elems, enmr, index_dtype_width); + (int8_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_UINT8: { SOMAArray::_extend_value_helper( - (uint8_t*)data, num_elems, enmr, index_dtype_width); + (uint8_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_INT16: { SOMAArray::_extend_value_helper( - (int16_t*)data, num_elems, enmr, index_dtype_width); + (int16_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_UINT16: { SOMAArray::_extend_value_helper( - (uint16_t*)data, num_elems, enmr, index_dtype_width); + (uint16_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_INT32: { SOMAArray::_extend_value_helper( - (int32_t*)data, num_elems, enmr, index_dtype_width); + (int32_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_UINT32: { SOMAArray::_extend_value_helper( - (uint32_t*)data, num_elems, enmr, index_dtype_width); + (uint32_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_INT64: { SOMAArray::_extend_value_helper( - (int64_t*)data, num_elems, enmr, index_dtype_width); + (int64_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_UINT64: { SOMAArray::_extend_value_helper( - (uint64_t*)data, num_elems, enmr, index_dtype_width); + (uint64_t*)data, num_elems, enmr, max_capacity); break; } case TILEDB_FLOAT32: { SOMAArray::_extend_value_helper( - (float*)data, num_elems, enmr, index_dtype_width); 
+ (float*)data, num_elems, enmr, max_capacity); break; } case TILEDB_FLOAT64: { SOMAArray::_extend_value_helper( - (double*)data, num_elems, enmr, index_dtype_width); + (double*)data, num_elems, enmr, max_capacity); break; } default: diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index f7646b6486..19ac4166ae 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -710,10 +710,7 @@ class SOMAArray : public SOMAObject { template void _extend_value_helper( - T* data, - uint64_t num_elems, - Enumeration enmr, - uint64_t index_dtype_width) { + T* data, uint64_t num_elems, Enumeration enmr, uint64_t max_capacity) { std::vector enums_in_write((T*)data, (T*)data + num_elems); auto enums_existing = enmr.as_vector(); std::vector extend_values; @@ -726,21 +723,6 @@ class SOMAArray : public SOMAObject { } if (extend_values.size() != 0) { - // Check that we extend the enumeration values without overflowing - uint64_t max_capacity; - if (index_dtype_width == 1) { - max_capacity = std::numeric_limits::max(); - } else if (index_dtype_width == 2) { - max_capacity = std::numeric_limits::max(); - } else if (index_dtype_width == 4) { - max_capacity = std::numeric_limits::max(); - } else if (index_dtype_width == 8) { - max_capacity = std::numeric_limits::max(); - } else { - throw TileDBSOMAError( - "Saw invalid size for integer when extending enums"); - } - auto free_capacity = max_capacity - enums_existing.size(); if (free_capacity < extend_values.size()) { throw TileDBSOMAError( From 482c0839de253f6f771581382515ae3d35f0173c Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 29 Mar 2024 10:14:30 -0500 Subject: [PATCH 64/70] WIP clears buffers after running --- libtiledbsoma/src/soma/soma_array.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e8b43b2c05..e169b40bac 100644 --- 
a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -457,7 +457,7 @@ void SOMAArray::write() { mq_->submit_write(); mq_->reset(); - // array_buffer_ = nullptr; + clear_column_data(); } uint64_t SOMAArray::nnz() { From 6f1f07cab27d3fdf522b346bafb7d6f10850c041 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 1 Apr 2024 11:00:16 -0500 Subject: [PATCH 65/70] WIP --- apis/python/src/tiledbsoma/_dataframe.py | 11 ++- apis/python/src/tiledbsoma/soma_array.cc | 120 ++++++++++++++++++----- libtiledbsoma/src/soma/soma_array.cc | 66 +++++-------- libtiledbsoma/src/soma/soma_array.h | 7 +- 4 files changed, 129 insertions(+), 75 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index d2d8219274..4abe91dc63 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -458,8 +458,15 @@ def write( for input_field in values.schema: target_field = self.schema.field(input_field.name) - if pa.types.is_dictionary(target_field.type) and not pa.types.is_dictionary(input_field.type): - raise ValueError(f"{input_field.name} requires dictionary entry") + if pa.types.is_dictionary(target_field.type): + if not pa.types.is_dictionary(input_field.type): + raise ValueError(f"{input_field.name} requires dictionary entry") + # extend enums in array schema as necessary + # get evolved enums + col = values.column(input_field.name).combine_chunks() + new_enums = self._handle._handle.extend_enumeration(col) + print(new_enums) + # cast that in table if pa.types.is_boolean(input_field.type): target_schema.append(target_field.with_type(pa.uint8())) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 1d587ecde7..2de72d0385 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -63,33 +63,34 @@ void write(SOMAArray& array, py::handle py_batch) { data = 
arr_->buffers[1]; } - if (attributes.find(sch_->name) != attributes.end()) { - auto enmr_name = AttributeExperimental::get_enumeration_name( - *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); - - if (enmr_name.has_value()) { - auto dict = arr_->dictionary; - if (!dict) { - array.clear_column_data(); - throw py::value_error( - "Saw non-dictionary column passed to enumerated type"); - } - - const void* enmr_data; - uint64_t* enmr_offsets = nullptr; - if (dict->n_buffers == 3) { - enmr_offsets = (uint64_t*)dict->buffers[1]; - enmr_data = dict->buffers[2]; - } else { - enmr_data = dict->buffers[1]; - } - - if (dict->length != 0) { - array.extend_enumeration( - sch_->name, dict->length, enmr_data, enmr_offsets); - } - } - } + // if (attributes.find(sch_->name) != attributes.end()) { + // auto enmr_name = AttributeExperimental::get_enumeration_name( + // *array.ctx()->tiledb_ctx(), attributes.at(sch_->name)); + + // if (enmr_name.has_value()) { + // auto dict = arr_->dictionary; + // if (!dict) { + // array.clear_column_data(); + // throw py::value_error( + // "Saw non-dictionary column passed to enumerated + // type"); + // } + + // const void* enmr_data; + // uint64_t* enmr_offsets = nullptr; + // if (dict->n_buffers == 3) { + // enmr_offsets = (uint64_t*)dict->buffers[1]; + // enmr_data = dict->buffers[2]; + // } else { + // enmr_data = dict->buffers[1]; + // } + + // if (dict->length != 0) { + // array.extend_enumeration( + // sch_->name, dict->length, enmr_data, enmr_offsets); + // } + // } + // } auto np = py::module::import("numpy"); auto table_offset = arr_->offset; @@ -759,6 +760,71 @@ void load_soma_array(py::module& m) { .def_property_readonly("dimension_names", &SOMAArray::dimension_names) + .def( + "extend_enumeration", + [](SOMAArray& array, py::handle py_batch) -> py::object { + ArrowSchema arrow_schema; + ArrowArray arrow_array; + uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); + uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array); + 
py_batch.attr("_export_to_c")( + arrow_array_ptr, arrow_schema_ptr); + + auto dict = arrow_array.dictionary; + const void* enmr_data; + uint64_t* enmr_offsets = nullptr; + if (dict->n_buffers == 3) { + enmr_offsets = (uint64_t*)dict->buffers[1]; + enmr_data = dict->buffers[2]; + } else { + enmr_data = dict->buffers[1]; + } + + if (dict->length != 0) { + auto new_enmr = array.extend_enumeration( + arrow_schema.name, + dict->length, + enmr_data, + enmr_offsets); + + auto emdr_format = arrow_schema.dictionary->format; + switch (ArrowAdapter::to_tiledb_format(emdr_format)) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_CHAR: + return py::cast(new_enmr.as_vector()); + case TILEDB_BOOL: + case TILEDB_INT8: + return py::cast(new_enmr.as_vector()); + case TILEDB_UINT8: + return py::cast(new_enmr.as_vector()); + case TILEDB_INT16: + return py::cast(new_enmr.as_vector()); + case TILEDB_UINT16: + return py::cast(new_enmr.as_vector()); + case TILEDB_INT32: + return py::cast(new_enmr.as_vector()); + case TILEDB_UINT32: + return py::cast(new_enmr.as_vector()); + case TILEDB_INT64: + return py::cast(new_enmr.as_vector()); + case TILEDB_UINT64: + return py::cast(new_enmr.as_vector()); + case TILEDB_FLOAT32: + return py::cast(new_enmr.as_vector()); + case TILEDB_FLOAT64: + return py::cast(new_enmr.as_vector()); + default: + throw TileDBSOMAError( + "extend_enumeration: Unsupported dict " + "datatype"); + } + + } else { + return py::cast(std::vector()); + } + }) + .def("set_metadata", set_metadata) .def("delete_metadata", &SOMAArray::delete_metadata) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index e169b40bac..5cbdafab4a 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -277,7 +277,7 @@ std::optional> SOMAArray::read_next() { return mq_->results(); } -void SOMAArray::extend_enumeration( +Enumeration SOMAArray::extend_enumeration( std::string_view name, uint64_t 
num_elems, const void* data, @@ -353,64 +353,44 @@ void SOMAArray::extend_enumeration( throw TileDBSOMAError( "Cannot extend enumeration; reached maximum capacity"); } - ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); } - break; + + return enmr.extend(extend_values); } case TILEDB_BOOL: - case TILEDB_INT8: { - SOMAArray::_extend_value_helper( + case TILEDB_INT8: + return SOMAArray::_extend_value_helper( (int8_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_UINT8: { - SOMAArray::_extend_value_helper( + case TILEDB_UINT8: + return SOMAArray::_extend_value_helper( (uint8_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_INT16: { - SOMAArray::_extend_value_helper( + case TILEDB_INT16: + return SOMAArray::_extend_value_helper( (int16_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_UINT16: { - SOMAArray::_extend_value_helper( + case TILEDB_UINT16: + return SOMAArray::_extend_value_helper( (uint16_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_INT32: { - SOMAArray::_extend_value_helper( + case TILEDB_INT32: + return SOMAArray::_extend_value_helper( (int32_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_UINT32: { - SOMAArray::_extend_value_helper( + case TILEDB_UINT32: + return SOMAArray::_extend_value_helper( (uint32_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_INT64: { - SOMAArray::_extend_value_helper( + case TILEDB_INT64: + return SOMAArray::_extend_value_helper( (int64_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_UINT64: { - SOMAArray::_extend_value_helper( + case TILEDB_UINT64: + return SOMAArray::_extend_value_helper( (uint64_t*)data, num_elems, enmr, max_capacity); - break; - } - case TILEDB_FLOAT32: { - SOMAArray::_extend_value_helper( + case TILEDB_FLOAT32: + return SOMAArray::_extend_value_helper( (float*)data, num_elems, enmr, max_capacity); - 
break; - } - case TILEDB_FLOAT64: { - SOMAArray::_extend_value_helper( + case TILEDB_FLOAT64: + return SOMAArray::_extend_value_helper( (double*)data, num_elems, enmr, max_capacity); - break; - } default: throw TileDBSOMAError(fmt::format( "ArrowAdapter: Unsupported TileDB dict datatype: {} ", diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 19ac4166ae..76e914c8c9 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -407,7 +407,7 @@ class SOMAArray : public SOMAObject { */ std::optional> read_next(); - void extend_enumeration( + Enumeration extend_enumeration( std::string_view name, uint64_t num_elems, const void* data, @@ -709,7 +709,7 @@ class SOMAArray : public SOMAObject { //=================================================================== template - void _extend_value_helper( + Enumeration _extend_value_helper( T* data, uint64_t num_elems, Enumeration enmr, uint64_t max_capacity) { std::vector enums_in_write((T*)data, (T*)data + num_elems); auto enums_existing = enmr.as_vector(); @@ -728,11 +728,12 @@ class SOMAArray : public SOMAObject { throw TileDBSOMAError( "Cannot extend enumeration; reached maximum capacity"); } - ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); } + + return enmr.extend(extend_values); } // Fills the metadata cache upon opening the array. 
From 42435376d7d06f07bbd0fbf26bcf7663d5c58e5f Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Mon, 1 Apr 2024 23:06:51 -0500 Subject: [PATCH 66/70] WIP update enumeration index values when extending --- apis/python/src/tiledbsoma/_collection.py | 2 +- apis/python/src/tiledbsoma/_dataframe.py | 46 ++++++--- apis/python/src/tiledbsoma/_tdb_handles.py | 4 +- apis/python/src/tiledbsoma/io/ingest.py | 1 + apis/python/src/tiledbsoma/soma_array.cc | 107 +++++++++++++++------ apis/python/tests/test_query_condition.py | 2 +- libtiledbsoma/src/soma/soma_array.cc | 3 +- libtiledbsoma/src/soma/soma_array.h | 3 +- 8 files changed, 120 insertions(+), 48 deletions(-) diff --git a/apis/python/src/tiledbsoma/_collection.py b/apis/python/src/tiledbsoma/_collection.py index e110374d3d..f3886b96d6 100644 --- a/apis/python/src/tiledbsoma/_collection.py +++ b/apis/python/src/tiledbsoma/_collection.py @@ -436,7 +436,7 @@ def __getitem__(self, key: str) -> CollectionElementType: wrapper = _tdb_handles.open(uri, mode, context, timestamp) entry.soma = _factory.reify_handle(wrapper) - + # Since we just opened this object, we own it and should close it. 
self._close_stack.enter_context(entry.soma) return cast(CollectionElementType, entry.soma) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 4abe91dc63..6c3f605064 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -9,6 +9,7 @@ from typing import Any, Optional, Sequence, Tuple, Type, Union, cast import numpy as np +import pandas as pd import pyarrow as pa import somacore import tiledb @@ -243,7 +244,7 @@ def create( domains = pa.StructArray.from_arrays(domains, names=index_column_names) extents = pa.StructArray.from_arrays(extents, names=index_column_names) - + plt_cfg = None if platform_config: ops = TileDBCreateOptions.from_platform_config(platform_config) @@ -254,14 +255,18 @@ def create( plt_cfg.goal_chunk_nnz = ops.goal_chunk_nnz plt_cfg.capacity = ops.capacity if ops.offsets_filters: - plt_cfg.offsets_filters = [info["_type"] for info in ops.offsets_filters] + plt_cfg.offsets_filters = [ + info["_type"] for info in ops.offsets_filters + ] if ops.validity_filters: - plt_cfg.validity_filters = [info["_type"] for info in ops.validity_filters] + plt_cfg.validity_filters = [ + info["_type"] for info in ops.validity_filters + ] plt_cfg.allows_duplicates = ops.allows_duplicates plt_cfg.tile_order = ops.tile_order plt_cfg.cell_order = ops.cell_order plt_cfg.consolidate_and_vacuum = ops.consolidate_and_vacuum - + # TODO add as kw args clib.SOMADataFrame.create( uri, @@ -455,18 +460,31 @@ def write( _util.check_type("values", values, (pa.Table,)) target_schema = [] - for input_field in values.schema: - target_field = self.schema.field(input_field.name) + for i, input_field in enumerate(values.schema): + name = input_field.name + target_field = self.schema.field(name) if pa.types.is_dictionary(target_field.type): if not pa.types.is_dictionary(input_field.type): - raise ValueError(f"{input_field.name} requires dictionary entry") - # extend enums in array schema as 
necessary - # get evolved enums - col = values.column(input_field.name).combine_chunks() - new_enums = self._handle._handle.extend_enumeration(col) - print(new_enums) - # cast that in table + raise ValueError(f"{name} requires dictionary entry") + col = values.column(name).combine_chunks() + new_enmr = self._handle._handle.extend_enumeration(name, col) + + if pa.types.is_binary( + target_field.type.value_type + ) or pa.types.is_large_binary(target_field.type.value_type): + new_enmr = np.array(new_enmr, "S") + elif pa.types.is_boolean(target_field.type.value_type): + new_enmr = np.array(new_enmr, bool) + + df = pd.Categorical( + col.to_pandas(), + ordered=target_field.type.ordered, + categories=new_enmr, + ) + values = values.set_column( + i, name, pa.DictionaryArray.from_pandas(df, type=target_field.type) + ) if pa.types.is_boolean(input_field.type): target_schema.append(target_field.with_type(pa.uint8())) @@ -476,7 +494,7 @@ def write( for batch in values.to_batches(): self._handle.write(batch) - + tiledb_create_options = TileDBCreateOptions.from_platform_config( platform_config ) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index fb635edaa3..5968391bea 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -47,7 +47,7 @@ def open( uri: str, mode: options.OpenMode, context: SOMATileDBContext, - timestamp: Optional[OpenTimestamp] + timestamp: Optional[OpenTimestamp], ) -> "Wrapper[RawHandle]": """Determine whether the URI is an array or group, and open it.""" open_mode = clib.OpenMode.read if mode == "r" else clib.OpenMode.write @@ -68,7 +68,7 @@ def open( if not obj_type: raise DoesNotExistError(f"{uri!r} does not exist") - + if obj_type == "SOMADataFrame": return DataFrameWrapper._from_soma_object(soma_object, context) if open_mode == clib.OpenMode.read and obj_type == "SOMADenseNDArray": diff --git a/apis/python/src/tiledbsoma/io/ingest.py 
b/apis/python/src/tiledbsoma/io/ingest.py index a05a9385be..48e2906ac0 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -1219,6 +1219,7 @@ def _write_arrow_table( ) handle.write(arrow_table) + def _write_dataframe( df_uri: str, df: pd.DataFrame, diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index 2de72d0385..fe3d7e8a6f 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -762,7 +762,9 @@ void load_soma_array(py::module& m) { .def( "extend_enumeration", - [](SOMAArray& array, py::handle py_batch) -> py::object { + [](SOMAArray& array, + std::string attr_name, + py::handle py_batch) -> py::array { ArrowSchema arrow_schema; ArrowArray arrow_array; uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema); @@ -782,38 +784,87 @@ void load_soma_array(py::module& m) { if (dict->length != 0) { auto new_enmr = array.extend_enumeration( - arrow_schema.name, - dict->length, - enmr_data, - enmr_offsets); + attr_name, dict->length, enmr_data, enmr_offsets); auto emdr_format = arrow_schema.dictionary->format; switch (ArrowAdapter::to_tiledb_format(emdr_format)) { case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: case TILEDB_CHAR: - return py::cast(new_enmr.as_vector()); + case TILEDB_STRING_UTF8: { + auto result = new_enmr.as_vector(); + return py::array(py::cast(result)); + } case TILEDB_BOOL: - case TILEDB_INT8: - return py::cast(new_enmr.as_vector()); - case TILEDB_UINT8: - return py::cast(new_enmr.as_vector()); - case TILEDB_INT16: - return py::cast(new_enmr.as_vector()); - case TILEDB_UINT16: - return py::cast(new_enmr.as_vector()); - case TILEDB_INT32: - return py::cast(new_enmr.as_vector()); - case TILEDB_UINT32: - return py::cast(new_enmr.as_vector()); - case TILEDB_INT64: - return py::cast(new_enmr.as_vector()); - case TILEDB_UINT64: - return py::cast(new_enmr.as_vector()); - case TILEDB_FLOAT32: - return 
py::cast(new_enmr.as_vector()); - case TILEDB_FLOAT64: - return py::cast(new_enmr.as_vector()); + case TILEDB_INT8: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int8"), + result.size(), + result.data()); + } + case TILEDB_UINT8: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint8"), + result.size(), + result.data()); + } + case TILEDB_INT16: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int16"), + result.size(), + result.data()); + } + case TILEDB_UINT16: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint16"), + result.size(), + result.data()); + } + case TILEDB_INT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int32"), + result.size(), + result.data()); + } + case TILEDB_UINT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint32"), + result.size(), + result.data()); + } + case TILEDB_INT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("int64"), + result.size(), + result.data()); + } + case TILEDB_UINT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("uint64"), + result.size(), + result.data()); + } + case TILEDB_FLOAT32: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("float32"), + result.size(), + result.data()); + } + case TILEDB_FLOAT64: { + auto result = new_enmr.as_vector(); + return py::array( + py::dtype("float64"), + result.size(), + result.data()); + } default: throw TileDBSOMAError( "extend_enumeration: Unsupported dict " @@ -821,7 +872,7 @@ void load_soma_array(py::module& m) { } } else { - return py::cast(std::vector()); + return py::array(); } }) diff --git a/apis/python/tests/test_query_condition.py b/apis/python/tests/test_query_condition.py index 7504743ecf..53b8c494c6 100644 --- a/apis/python/tests/test_query_condition.py +++ b/apis/python/tests/test_query_condition.py @@ -31,7 +31,7 @@ def soma_query(uri, 
condition): sr.set_condition(qc, sr.schema) arrow_table = sr.read_next() assert sr.results_complete() - + return arrow_table diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 5cbdafab4a..f25ebb6710 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -356,9 +356,10 @@ Enumeration SOMAArray::extend_enumeration( ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); + return enmr.extend(extend_values); } - return enmr.extend(extend_values); + return enmr; } case TILEDB_BOOL: case TILEDB_INT8: diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 76e914c8c9..2e56a2cf64 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -731,9 +731,10 @@ class SOMAArray : public SOMAObject { ArraySchemaEvolution se(*ctx_->tiledb_ctx()); se.extend_enumeration(enmr.extend(extend_values)); se.array_evolve(uri_); + return enmr.extend(extend_values); } - return enmr.extend(extend_values); + return enmr; } // Fills the metadata cache upon opening the array. 
From fc35aba5f71d4c865f98ef65259e0fd1d4e80113 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 2 Apr 2024 22:46:27 -0500 Subject: [PATCH 67/70] Bind the dataframe.create with the timestamp --- apis/python/src/tiledbsoma/_dataframe.py | 2 ++ apis/python/src/tiledbsoma/soma_dataframe.cc | 6 ++++-- apis/python/tests/test_collection.py | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 6c3f605064..2aa8cd34d4 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -268,6 +268,7 @@ def create( plt_cfg.consolidate_and_vacuum = ops.consolidate_and_vacuum # TODO add as kw args + timestamp_ms = context._open_timestamp_ms(tiledb_timestamp) clib.SOMADataFrame.create( uri, schema, @@ -276,6 +277,7 @@ def create( extents, context.native_context, plt_cfg, + (0, timestamp_ms) ) handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 7b7b6895ab..f7f6823450 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -57,7 +57,8 @@ void load_soma_dataframe(py::module& m) { py::object py_domains, py::object py_extents, std::shared_ptr context, - std::optional platform_config) { + std::optional platform_config, + std::optional> timestamp) { ArrowSchema schema; uintptr_t schema_ptr = (uintptr_t)(&schema); py_schema.attr("_export_to_c")(schema_ptr); @@ -96,7 +97,8 @@ void load_soma_dataframe(py::module& m) { std::make_shared(domains), std::make_shared(extents)), context, - platform_config); + platform_config, + timestamp); } catch (const std::out_of_range& e) { throw py::type_error(e.what()); } catch (const std::exception& e) { diff --git a/apis/python/tests/test_collection.py b/apis/python/tests/test_collection.py index 39a6918ff8..75c792aa9e 100644 --- 
a/apis/python/tests/test_collection.py +++ b/apis/python/tests/test_collection.py @@ -486,6 +486,7 @@ def test_issue919(tmp_path): expt.add_new_dataframe( "df", schema=schema, index_column_names=["soma_joinid"] ) + assert expt["df"].tiledb_timestamp_ms == 100 with soma.Collection.open(uri, context=context) as c: assert "df" in c["expt"] and "causes_bug" in c["expt"] @@ -518,4 +519,4 @@ def test_context_timestamp(tmp_path: pathlib.Path): assert coll.tiledb_timestamp_ms == 234 sub_1 = coll["sub_1"] assert sub_1.tiledb_timestamp_ms == 234 - assert sub_1["sub_sub"].tiledb_timestamp_ms == 234 + assert sub_1["sub_sub"].tiledb_timestamp_ms == 234 \ No newline at end of file From 6637b82469f00096ba630b4abcab5c3e067c2772 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 4 Apr 2024 23:03:07 -0500 Subject: [PATCH 68/70] WIP fix several errors for macos --- apis/python/devtools/ingestor | 2 +- apis/python/src/tiledbsoma/_dataframe.py | 119 +---------------------- apis/python/tests/test_dataframe.py | 2 +- libtiledbsoma/src/utils/arrow_adapter.cc | 6 +- 4 files changed, 7 insertions(+), 122 deletions(-) diff --git a/apis/python/devtools/ingestor b/apis/python/devtools/ingestor index 26ab40b8ee..8da4b63560 100755 --- a/apis/python/devtools/ingestor +++ b/apis/python/devtools/ingestor @@ -16,7 +16,6 @@ import os import sys from typing import Optional -import tiledb from somacore import options import tiledbsoma @@ -26,6 +25,7 @@ import tiledbsoma._util import tiledbsoma.io import tiledbsoma.logging from tiledbsoma.options import SOMATileDBContext +import tiledb # ================================================================ diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 2aa8cd34d4..9d6b784c77 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -12,7 +12,6 @@ import pandas as pd import pyarrow as pa import somacore -import tiledb from somacore import options from 
typing_extensions import Self @@ -506,7 +505,7 @@ def write( return self def _set_reader_coord( - self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object + self, sr: clib.SOMAArray, dim_idx: int, dim: pa.Field, coord: object ) -> bool: if coord is None: return True # No constraint; select all in this dimension @@ -580,7 +579,7 @@ def _set_reader_coord_by_py_seq_or_np_array( self, sr: clib.SOMAArray, dim_idx: int, - dim: tiledb.Dim, + dim: pa.Field, coord: object, ) -> bool: if isinstance(coord, np.ndarray): @@ -705,120 +704,6 @@ def _canonicalize_schema( return schema -def _build_tiledb_schema( - schema: pa.Schema, - index_column_names: Sequence[str], - domain: Optional[Sequence[Optional[Tuple[Any, Any]]]], - tiledb_create_options: TileDBCreateOptions, - context: SOMATileDBContext, -) -> tiledb.ArraySchema: - """Converts an Arrow schema into a TileDB ArraySchema for creation.""" - - if domain is None: - domain = tuple(None for _ in index_column_names) - else: - ndom = len(domain) - nidx = len(index_column_names) - if ndom != nidx: - raise ValueError( - f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}" - ) - - dims = [] - for index_column_name, slot_domain in zip(index_column_names, domain): - pa_type = schema.field(index_column_name).type - dtype = _arrow_types.tiledb_type_from_arrow_type( - pa_type, is_indexed_column=True - ) - - slot_domain = _fill_out_slot_domain( - slot_domain, index_column_name, pa_type, dtype - ) - - extent = _find_extent_for_domain( - index_column_name, tiledb_create_options, dtype, slot_domain - ) - - dim = tiledb.Dim( - name=index_column_name, - domain=slot_domain, - tile=extent, - dtype=dtype, - filters=tiledb_create_options.dim_filters_tiledb( - index_column_name, - [ - dict( - _type="ZstdFilter", - level=tiledb_create_options.dataframe_dim_zstd_level, - ) - ], - ), - ) - dims.append(dim) - - dom = tiledb.Domain(dims, ctx=context.tiledb_ctx) - - attrs = [] - enums = [] - 
metadata = schema.metadata or {} - for pa_attr in schema: - attr_name = pa_attr.name - - if attr_name in index_column_names: - continue - - has_enum = pa.types.is_dictionary(pa_attr.type) - - if has_enum: - enmr_dtype: np.dtype[Any] - vtype = pa_attr.type.value_type - if pa.types.is_large_string(vtype) or pa.types.is_string(vtype): - enmr_dtype = np.dtype("U") - elif pa.types.is_large_binary(vtype) or pa.types.is_binary(vtype): - enmr_dtype = np.dtype("S") - else: - enmr_dtype = np.dtype(vtype.to_pandas_dtype()) - enums.append( - tiledb.Enumeration( - name=attr_name, - ordered=pa_attr.type.ordered, - dtype=enmr_dtype, - ) - ) - - attr = tiledb.Attr( - name=attr_name, - dtype=_arrow_types.tiledb_type_from_arrow_type( - schema.field(attr_name).type - ), - nullable=metadata.get(attr_name.encode("utf-8")) == b"nullable", - filters=tiledb_create_options.attr_filters_tiledb( - attr_name, ["ZstdFilter"] - ), - enum_label=attr_name if has_enum else None, - ctx=context.tiledb_ctx, - ) - attrs.append(attr) - - cell_order, tile_order = tiledb_create_options.cell_tile_orders() - - return tiledb.ArraySchema( - domain=dom, - attrs=attrs, - enums=enums, - sparse=True, - allows_duplicates=tiledb_create_options.allows_duplicates, - offsets_filters=tiledb_create_options.offsets_filters_tiledb(), - validity_filters=tiledb_create_options.validity_filters_tiledb(), - capacity=tiledb_create_options.capacity, - cell_order=cell_order, - # As of TileDB core 2.8.2, we cannot consolidate string-indexed sparse arrays with - # col-major tile order: so we write ``X`` with row-major tile order. 
- tile_order=tile_order, - ctx=context.tiledb_ctx, - ) - - def _fill_out_slot_domain( slot_domain: Optional[Tuple[Any, Any]], index_column_name: str, diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index dbc027087f..e9d2d92cd2 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1361,7 +1361,7 @@ def test_enum_extend_past_numerical_limit(tmp_path): # cannot add additional categories as already maxed out earlier tbl = pa.Table.from_pandas(df2, preserve_index=False) - with pytest.raises(soma.SOMAError): + with pytest.raises((RuntimeError, soma.SOMAError)): with soma.open(uri, mode="w") as A: A.write(tbl) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 41602aba9c..bde9dce3fe 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -505,8 +505,8 @@ ArrowTable ArrowAdapter::to_arrow(std::shared_ptr column) { } bool ArrowAdapter::_isvar(const char* format) { - if ((strcmp(format, "U") == 0) | (strcmp(format, "Z") == 0) | - (strcmp(format, "u") == 0) | (strcmp(format, "z") == 0)) { + if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) || + (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) { return true; } return false; @@ -560,4 +560,4 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { } } -} // namespace tiledbsoma \ No newline at end of file +} // namespace tiledbsoma From ef8d1ce2fac0c881ba384e1c1278d27d1d80c1e9 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 5 Apr 2024 13:15:58 -0500 Subject: [PATCH 69/70] Correct Boolean value writes for enum values --- apis/python/devtools/ingestor | 2 +- apis/python/src/tiledbsoma/_dataframe.py | 14 +++++++++++--- apis/python/tests/test_collection.py | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/apis/python/devtools/ingestor b/apis/python/devtools/ingestor index 
8da4b63560..26ab40b8ee 100755 --- a/apis/python/devtools/ingestor +++ b/apis/python/devtools/ingestor @@ -16,6 +16,7 @@ import os import sys from typing import Optional +import tiledb from somacore import options import tiledbsoma @@ -25,7 +26,6 @@ import tiledbsoma._util import tiledbsoma.io import tiledbsoma.logging from tiledbsoma.options import SOMATileDBContext -import tiledb # ================================================================ diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 9d6b784c77..509c84d4c9 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -276,7 +276,7 @@ def create( extents, context.native_context, plt_cfg, - (0, timestamp_ms) + (0, timestamp_ms), ) handle = cls._wrapper_type.open(uri, "w", context, tiledb_timestamp) @@ -469,15 +469,23 @@ def write( if not pa.types.is_dictionary(input_field.type): raise ValueError(f"{name} requires dictionary entry") col = values.column(name).combine_chunks() + if pa.types.is_boolean(target_field.type.value_type): + col = col.cast( + pa.dictionary( + target_field.type.index_type, + pa.uint8(), + target_field.type.ordered, + ) + ) new_enmr = self._handle._handle.extend_enumeration(name, col) - + if pa.types.is_binary( target_field.type.value_type ) or pa.types.is_large_binary(target_field.type.value_type): new_enmr = np.array(new_enmr, "S") elif pa.types.is_boolean(target_field.type.value_type): new_enmr = np.array(new_enmr, bool) - + df = pd.Categorical( col.to_pandas(), ordered=target_field.type.ordered, diff --git a/apis/python/tests/test_collection.py b/apis/python/tests/test_collection.py index 75c792aa9e..47f5f5a030 100644 --- a/apis/python/tests/test_collection.py +++ b/apis/python/tests/test_collection.py @@ -519,4 +519,4 @@ def test_context_timestamp(tmp_path: pathlib.Path): assert coll.tiledb_timestamp_ms == 234 sub_1 = coll["sub_1"] assert sub_1.tiledb_timestamp_ms == 234 - assert 
sub_1["sub_sub"].tiledb_timestamp_ms == 234 \ No newline at end of file + assert sub_1["sub_sub"].tiledb_timestamp_ms == 234 From 031f9a9fbf2ed0305449240af2e5b82e150bdb00 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 5 Apr 2024 15:35:30 -0500 Subject: [PATCH 70/70] Add in missing RuntimeError --- apis/python/src/tiledbsoma/_tdb_handles.py | 2 +- apis/python/src/tiledbsoma/_tiledb_object.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 5968391bea..2d6aed8baf 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -129,7 +129,7 @@ def open( handle._do_initial_reads(auxiliary_reader) else: handle._do_initial_reads(tdb) - except tiledb.TileDBError as tdbe: + except (RuntimeError, tiledb.TileDBError) as tdbe: if is_does_not_exist_error(tdbe): raise DoesNotExistError(f"{uri!r} does not exist") from tdbe raise diff --git a/apis/python/src/tiledbsoma/_tiledb_object.py b/apis/python/src/tiledbsoma/_tiledb_object.py index cc9f863bf5..ad51b62646 100644 --- a/apis/python/src/tiledbsoma/_tiledb_object.py +++ b/apis/python/src/tiledbsoma/_tiledb_object.py @@ -270,7 +270,7 @@ def exists( if not isinstance(md_type, str): return False return md_type.lower() == cls.soma_type.lower() - except (SOMAError, tiledb.cc.TileDBError): + except (RuntimeError, SOMAError, tiledb.cc.TileDBError): return False @classmethod