From ecc87b76b8671bb79032d9d53257ddf80074c877 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Sat, 7 Dec 2024 13:07:00 +0200 Subject: [PATCH 1/6] Fill SOMAColumn info on array open --- libtiledbsoma/src/soma/soma_array.cc | 85 ++++++++++++++++++++++++++++ libtiledbsoma/src/soma/soma_array.h | 5 ++ 2 files changed, 90 insertions(+) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 337b1c86da..ce9b82a2c1 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -33,8 +33,12 @@ #include #include "../utils/logger.h" #include "../utils/util.h" +#include "soma_attribute.h" +#include "soma_dimension.h" +#include "soma_geometry_column.h" #include +#include namespace tiledbsoma { using namespace tiledb; @@ -143,6 +147,7 @@ SOMAArray::SOMAArray( validate(mode, name, timestamp); reset(column_names, batch_size, result_order); fill_metadata_cache(); + fill_columns(); } SOMAArray::SOMAArray( @@ -161,6 +166,7 @@ SOMAArray::SOMAArray( validate(mode, name, timestamp); reset(column_names, batch_size, result_order); fill_metadata_cache(); + fill_columns(); } SOMAArray::SOMAArray( @@ -177,6 +183,7 @@ SOMAArray::SOMAArray( , schema_(std::make_shared(arr->schema())) { reset({}, batch_size_, result_order_); fill_metadata_cache(); + fill_columns(); } void SOMAArray::fill_metadata_cache() { @@ -220,6 +227,7 @@ void SOMAArray::open(OpenMode mode, std::optional timestamp) { validate(mode, name_, timestamp); reset(column_names(), batch_size_, result_order_); fill_metadata_cache(); + fill_columns(); } std::unique_ptr SOMAArray::reopen( @@ -1656,4 +1664,81 @@ void SOMAArray::_check_dims_are_int64() { } } +void SOMAArray::fill_columns() { + columns_.clear(); + std::deque> tdb_columns; + + for (std::size_t i = 0; i < arr_->schema().domain().ndim(); ++i) { + tdb_columns.push_back(arr_->schema().domain().dimension(i)); + } + + // We need the correct order of attributes + for (std::size_t i = 0; i < 
arr_->schema().attribute_num(); ++i) { + tdb_columns.push_back(arr_->schema().attribute(i)); + } + + while (!tdb_columns.empty()) { + std::visit( + [&](auto&& arg) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + columns_.push_back(std::make_shared(arg)); + tdb_columns.pop_front(); + } else if constexpr (std::is_same_v) { + if (arg.name().rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) == + 0) { + std::vector dims; + for (std::size_t i = 0; i < tdb_columns.size(); ++i) { + if (std::holds_alternative( + tdb_columns[i]) && + std::get(tdb_columns[i]) + .name() + .rfind( + SOMA_GEOMETRY_DIMENSION_PREFIX, + 0) == 0) { + dims.push_back( + std::get(tdb_columns[i])); + } + } + + // Internal columns are all sequentially stored so we + // can remove them all by once + tdb_columns.erase( + tdb_columns.begin(), + tdb_columns.begin() + dims.size()); + + auto attr = std::find_if( + tdb_columns.begin(), + tdb_columns.end(), + [&](auto& col) { + if (std::holds_alternative(col) && + std::get(col).name().compare( + SOMA_GEOMETRY_COLUMN_NAME) == 0) { + return true; + } + return false; + }); + + if (attr == tdb_columns.end()) { + throw TileDBSOMAError(std::format( + "[SOMAArray] Missing required attribute {} for " + "SOMAGeometryColumn", + SOMA_GEOMETRY_COLUMN_NAME)); + } + + columns_.push_back(std::make_shared( + dims, std::get(*attr))); + tdb_columns.erase(attr); + } else { + // Vanilla dimension + columns_.push_back( + std::make_shared(arg)); + tdb_columns.pop_front(); + } + } + }, + tdb_columns.front()); + } +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index e1a5676bf1..66a647c10f 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -45,6 +45,7 @@ #include "enums.h" #include "logger_public.h" #include "managed_query.h" +#include "soma_column.h" #include "soma_object.h" // ================================================================ @@ -227,6 +228,7 @@ class 
SOMAArray : public SOMAObject { , first_read_next_(other.first_read_next_) , submitted_(other.submitted_) { fill_metadata_cache(); + fill_columns(); } SOMAArray( @@ -1526,6 +1528,7 @@ class SOMAArray : public SOMAObject { std::optional _maybe_soma_joinid_tiledb_domain(); void fill_metadata_cache(); + void fill_columns(); // SOMAArray URI std::string uri_; @@ -1566,6 +1569,8 @@ class SOMAArray : public SOMAObject { // be accessible std::shared_ptr meta_cache_arr_; + std::vector> columns_; + // True if this is the first call to read_next() bool first_read_next_ = true; From a0482a5f49e37fe5da4e695ee964ff596447d1dc Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Sun, 8 Dec 2024 18:13:41 +0200 Subject: [PATCH 2/6] MIgrate domain access methods to use SOMAColumns --- libtiledbsoma/src/soma/soma_array.cc | 142 ++++++---------------- libtiledbsoma/src/soma/soma_array.h | 147 +++-------------------- libtiledbsoma/src/utils/arrow_adapter.cc | 25 ++++ libtiledbsoma/src/utils/arrow_adapter.h | 10 ++ 4 files changed, 88 insertions(+), 236 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index ce9b82a2c1..09ea80fbd3 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -460,115 +460,19 @@ std::optional SOMAArray::timestamp() { // The domainish enum simply lets us re-use code which is common across // core domain, core current domain, and core non-empty domain. 
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) { - int array_ndim = this->ndim(); - auto dimensions = tiledb_schema()->domain().dimensions(); - - // Create the schema for the info we return - std::vector names(array_ndim); - std::vector tiledb_datatypes(array_ndim); - - for (int i = 0; i < (int)array_ndim; i++) { - const Dimension& core_dim = dimensions[i]; - names[i] = core_dim.name(); - tiledb_datatypes[i] = core_dim.type(); - } - - auto arrow_schema = ArrowAdapter::make_arrow_schema( - names, tiledb_datatypes); + int array_ndim = std::count_if( + columns_.begin(), columns_.end(), [](const auto& col) { + return col->isIndexColumn(); + }); - // Create the data for the info we return + auto arrow_schema = ArrowAdapter::make_arrow_schema_parent(array_ndim); auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim); - for (int i = 0; i < array_ndim; i++) { - auto core_dim = dimensions[i]; - auto core_type_code = core_dim.type(); - - ArrowArray* child = nullptr; - - switch (core_type_code) { - case TILEDB_INT64: - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_TIME_HR: - case TILEDB_TIME_MIN: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_TIME_PS: - case TILEDB_TIME_FS: - case TILEDB_TIME_AS: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT64: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - 
break; - case TILEDB_UINT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT16: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT16: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT8: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT8: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - - case TILEDB_FLOAT64: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_FLOAT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - - case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: - child = ArrowAdapter::make_arrow_array_child_string( - _core_domainish_slot_string(core_dim.name(), which_kind)); - break; - - default: - throw TileDBSOMAError(std::format( - "SOMAArray::_get_core_domainish:dim {} has unhandled type " - "{}", - core_dim.name(), - tiledb::impl::type_to_str(core_type_code))); - } - arrow_array->children[i] = child; + for (int64_t i = 0; i < array_ndim; ++i) { + arrow_schema->children[i] = columns_[i]->arrow_schema_slot( + *ctx_, *arr_); + arrow_array->children[i] = columns_[i]->arrow_domain_slot( + *ctx_, *arr_, which_kind); } return ArrowTable(std::move(arrow_array), std::move(arrow_schema)); @@ -1741,4 +1645,30 @@ void SOMAArray::fill_columns() { } } +std::shared_ptr SOMAArray::get_column(std::string_view name) const { + auto result = std::find_if(columns_.begin(), columns_.end(), [&](auto col) { + return col->name() == name; + }); + + if (result == columns_.end()) { + throw TileDBSOMAError(std::format( + 
"[SOMAArray] internal coding error: No column named {} found", + name)); + } + + return *result; +} + +std::shared_ptr SOMAArray::get_column(std::size_t index) const { + if (index >= columns_.size()) { + throw TileDBSOMAError(std::format( + "[SOMAArray] internal coding error: Column index outside of range. " + "Requested {}, but {} exist.", + index, + columns_.size())); + } + + return columns_[index]; +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 66a647c10f..8604acb27b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -354,7 +354,7 @@ class SOMAArray : public SOMAObject { */ template void set_dim_point(const std::string& dim, const T& point) { - mq_->select_point(dim, point); + get_column(dim)->set_dim_point(mq_, *ctx_, point); } /** @@ -400,10 +400,10 @@ class SOMAArray : public SOMAObject { start + partition_size - 1, points.size())); - mq_->select_points( - dim, std::span{&points[start], partition_size}); + get_column(dim)->set_dim_points( + mq_, *ctx_, std::span{&points[start], partition_size}); } else { - mq_->select_points(dim, points); + get_column(dim)->set_dim_points(mq_, *ctx_, points); } } @@ -421,7 +421,7 @@ class SOMAArray : public SOMAObject { LOG_DEBUG( "[SOMAArray] set_dim_points: sizeof(T)=" + std::to_string(sizeof(T))); - mq_->select_points(dim, points); + get_column(dim)->set_dim_points(mq_, *ctx_, std::span(points)); } /** @@ -436,7 +436,7 @@ class SOMAArray : public SOMAObject { template void set_dim_ranges( const std::string& dim, const std::vector>& ranges) { - mq_->select_ranges(dim, ranges); + get_column(dim)->set_dim_ranges(mq_, *ctx_, ranges); } /** @@ -461,7 +461,9 @@ class SOMAArray : public SOMAObject { */ void select_columns( const std::vector& names, bool if_not_empty = false) { - mq_->select_columns(names, if_not_empty); + for (const std::string& name : names) { + get_column(name)->select_columns(mq_, 
if_not_empty); + } } /** @@ -610,7 +612,7 @@ class SOMAArray : public SOMAObject { * * @return size_t Total number of cells read */ - size_t total_num_cells() { + std::size_t total_num_cells() { return mq_->total_num_cells(); } @@ -763,11 +765,7 @@ class SOMAArray : public SOMAObject { */ template std::pair non_empty_domain_slot(const std::string& name) const { - try { - return arr_->non_empty_domain(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } + return get_column(name)->non_empty_domain_slot(*arr_); } /** @@ -805,20 +803,6 @@ class SOMAArray : public SOMAObject { } } - /** - * Retrieves the non-empty domain from the array on the given dimension. - * This is the union of the non-empty domains of the array fragments. - * Applicable only to var-sized dimensions. - */ - std::pair non_empty_domain_slot_var( - const std::string& name) const { - try { - return arr_->non_empty_domain_var(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - } - /** * Exposed for testing purposes within this library. * Not for use by Python/R. 
@@ -855,63 +839,7 @@ class SOMAArray : public SOMAObject { */ template std::pair _core_current_domain_slot(const std::string& name) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::soma_domain_slot: template-specialization " - "failure."); - } - CurrentDomain current_domain = _get_current_domain(); - if (current_domain.is_empty()) { - throw TileDBSOMAError( - "_core_current_domain_slot: internal coding error"); - } - if (current_domain.type() != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError( - "_core_current_domain_slot: found non-rectangle type"); - } - NDRectangle ndrect = current_domain.ndrectangle(); - - // Convert from two-element array (core API) to pair (tiledbsoma API) - std::array arr = ndrect.range(name); - return std::pair(arr[0], arr[1]); - } - - std::pair _core_current_domain_slot_string( - const std::string& name) const { - CurrentDomain current_domain = _get_current_domain(); - if (current_domain.is_empty()) { - throw TileDBSOMAError( - "_core_current_domain_slot: internal coding error"); - } - if (current_domain.type() != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError( - "_core_current_domain_slot: found non-rectangle type"); - } - NDRectangle ndrect = current_domain.ndrectangle(); - - // Convert from two-element array (core API) to pair (tiledbsoma API) - std::array arr = ndrect.range(name); - - // Here is an intersection of a few oddities: - // - // * Core domain for string dims must be a nullptr pair; it cannot be - // anything else. - // * TileDB-Py shows this by using an empty-string pair, which we - // imitate. - // * Core current domain for string dims must _not_ be a nullptr pair. - // * In TileDB-SOMA, unless the user specifies otherwise, we use "" for - // min and "\x7f" for max. (We could use "\x7f" but that causes - // display problems in Python.) - // - // To work with all these factors, if the current domain is the default - // "" to "\7f", return an empty-string pair just as we do for domain. 
- // (There was some pre-1.15 software using "\xff" and it's super-cheap - // to check for that as well.) - if (arr[0] == "" && (arr[1] == "\x7f" || arr[1] == "\xff")) { - return std::pair("", ""); - } else { - return std::pair(arr[0], arr[1]); - } + return get_column(name)->core_current_domain_slot(*ctx_, *arr_); } /** @@ -930,22 +858,9 @@ class SOMAArray : public SOMAObject { */ template std::pair _core_domain_slot(const std::string& name) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::_core_domain_slot: template-specialization " - "failure."); - } return schema_->domain().dimension(name).domain(); } - std::pair _core_domain_slot_string( - const std::string&) const { - // Core domain for string dims is always a nullptr pair at the C++ - // level. We follow the convention started by TileDB-Py which is to - // report these as an empty-string pair. - return std::pair("", ""); - } - /** * Returns the SOMA domain at the given dimension. * @@ -1039,39 +954,7 @@ class SOMAArray : public SOMAObject { template std::pair _core_domainish_slot( const std::string& name, enum Domainish which_kind) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::_core_domainish_slot: template-specialization " - "failure."); - } - switch (which_kind) { - case Domainish::kind_core_domain: - return _core_domain_slot(name); - case Domainish::kind_core_current_domain: - return _core_current_domain_slot(name); - case Domainish::kind_non_empty_domain: - return non_empty_domain_slot(name); - default: - throw std::runtime_error( - "internal coding error in SOMAArray::_core_domainish_slot: " - "unknown kind"); - } - } - - std::pair _core_domainish_slot_string( - const std::string& name, enum Domainish which_kind) const { - switch (which_kind) { - case Domainish::kind_core_domain: - return _core_domain_slot_string(name); - case Domainish::kind_core_current_domain: - return _core_current_domain_slot_string(name); - case Domainish::kind_non_empty_domain: 
- return non_empty_domain_slot_var(name); - default: - throw std::runtime_error( - "internal coding error in " - "SOMAArray::_core_domainish_slot_string: unknown kind"); - } + return get_column(name)->domain_slot(*ctx_, *arr_, which_kind); } /** @@ -1294,6 +1177,10 @@ class SOMAArray : public SOMAObject { _set_domain_helper(newdomain, false, function_name_for_messages); } + std::shared_ptr get_column(std::string_view name) const; + + std::shared_ptr get_column(std::size_t index) const; + protected: // See top-of-file notes regarding methods for SOMADataFrame being // defined in this file. diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 2553ca5f6b..6155492594 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -1730,6 +1730,31 @@ std::unique_ptr ArrowAdapter::make_arrow_schema( return arrow_schema; } +std::unique_ptr ArrowAdapter::make_arrow_schema_parent( + int num_columns) { + auto arrow_schema = std::make_unique(); + arrow_schema->format = "+s"; // structure, i.e. 
non-leaf node + arrow_schema->name = strdup("parent"); + arrow_schema->metadata = nullptr; + arrow_schema->flags = 0; + arrow_schema->n_children = num_columns; // non-leaf node + arrow_schema->children = (ArrowSchema**)malloc( + arrow_schema->n_children * sizeof(ArrowSchema*)); + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->private_data = nullptr; + + for (int i = 0; i < num_columns; i++) { + arrow_schema->children[i] = nullptr; + } + + LOG_DEBUG(std::format( + "[ArrowAdapter] make_arrow_schema n_children {}", + arrow_schema->n_children)); + + return arrow_schema; +} + std::unique_ptr ArrowAdapter::make_arrow_array_parent( int num_columns) { auto arrow_array = std::make_unique(); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 7992624e2f..7c8cf91b6f 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -357,6 +357,16 @@ class ArrowAdapter { const std::vector& names, const std::vector& tiledb_datatypes); + /** + * @brief Creates a nanoarrow ArrowSchema which accommodates + * a varying number of columns. + * + * Note that the parents and children in nanoarrow are both of type + * ArrowSchema. This constructs the parent and not the children. + */ + static std::unique_ptr make_arrow_schema_parent( + int num_columns); + /** * @brief Creates a nanoarrow ArrowArray which accommodates * a varying number of columns. 
From c4a4246ec4aaea91aa169a8827c6dfd6d60fd1bf Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Mon, 9 Dec 2024 18:28:20 +0200 Subject: [PATCH 3/6] Add optional non empty domain method --- libtiledbsoma/src/soma/soma_attribute.cc | 8 + libtiledbsoma/src/soma/soma_attribute.h | 3 + libtiledbsoma/src/soma/soma_column.h | 27 ++- libtiledbsoma/src/soma/soma_dimension.cc | 196 ++++++++++++++++++ libtiledbsoma/src/soma/soma_dimension.h | 3 + .../src/soma/soma_geometry_column.cc | 46 ++++ libtiledbsoma/src/soma/soma_geometry_column.h | 3 + 7 files changed, 284 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index deaa083ef2..94bcf722a4 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -95,6 +95,14 @@ std::any SOMAAttribute::_non_empty_domain_slot(Array&) const { name())); } +std::any SOMAAttribute::_non_empty_domain_slot_opt( + const SOMAContext&, Array&) const { + throw TileDBSOMAError(std::format( + "[SOMAAttribute][_non_empty_domain_slot_opt] Column with name {} is not an " + "index column", + name())); +} + std::any SOMAAttribute::_core_current_domain_slot( const SOMAContext&, Array&) const { throw TileDBSOMAError(std::format( diff --git a/libtiledbsoma/src/soma/soma_attribute.h b/libtiledbsoma/src/soma/soma_attribute.h index 77db963c0f..5deac03b3f 100644 --- a/libtiledbsoma/src/soma/soma_attribute.h +++ b/libtiledbsoma/src/soma/soma_attribute.h @@ -140,6 +140,9 @@ class SOMAAttribute : public virtual SOMAColumn { virtual std::any _non_empty_domain_slot(Array& array) const override; + virtual std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + virtual std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const override; diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 4b2f513819..0528a349e5 100644 ---
a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -403,8 +403,8 @@ class SOMAColumn { /** * Retrieves the non-empty domain from the array. This is the union of the - * non-empty domains of the array fragments. Returns (0, 0) for empty - * domains. + * non-empty domains of the array fragments. Returns (0, 0) or ("", "") for + * empty domains. */ template <typename T> std::pair<T, T> non_empty_domain_slot(Array& array) const { @@ -420,6 +420,26 @@ class SOMAColumn { } } + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. Returns std::nullopt for + * empty domains. + */ + template <typename T> + std::optional<std::pair<T, T>> non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + try { + return std::any_cast<std::optional<std::pair<T, T>>>( + _non_empty_domain_slot_opt(ctx, array)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][non_empty_domain_slot_opt] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + /** * Returns the core current domain of this column.
* @@ -496,6 +516,9 @@ class SOMAColumn { virtual std::any _non_empty_domain_slot(Array& array) const = 0; + virtual std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const = 0; + virtual std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const = 0; diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index 1996811f02..bf21fa0b0e 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -533,6 +533,202 @@ std::any SOMADimension::_non_empty_domain_slot(Array& array) const { } } +std::any SOMADimension::_non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + int32_t is_empty; + + switch (dimension.type()) { + case TILEDB_STRING_ASCII: + case TILEDB_STRING_UTF8: + case TILEDB_GEOM_WKT: { + void* var_start; + void* var_end; + uint64_t size_start, size_end; + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_size_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + &size_start, + &size_end, + &is_empty)); + + if (is_empty) { + return std::make_any< + std::optional>>( + std::nullopt); + } + + var_start = malloc(size_start); + var_end = malloc(size_end); + + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + var_start, + var_end, + &is_empty)); + + auto ned = std::make_pair( + std::string((char*)var_start, size_start), + std::string((char*)var_end, size_end)); + free(var_start); + free(var_end); + + return std::make_any< + std::optional>>(ned); + } + } + + void* fixed_ned = malloc(16); + ctx.tiledb_ctx()->handle_error(tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + fixed_ned, + &is_empty)); + + if (is_empty) { + // We free buffer here and return later the 
correctly typed optional + free(fixed_ned); + } + + switch (dimension.type()) { + case TILEDB_UINT8: { + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint8_t*)fixed_ned)[0], ((uint8_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + } + case TILEDB_UINT16: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint16_t*)fixed_ned)[0], ((uint16_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_UINT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint32_t*)fixed_ned)[0], ((uint32_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_UINT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint64_t*)fixed_ned)[0], ((uint64_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_INT8: + if (is_empty) { + return std::make_any>>( + std::nullopt); + } else { + auto data = std::make_pair( + ((int8_t*)fixed_ned)[0], ((int8_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any>>( + data); + } + case TILEDB_INT16: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int16_t*)fixed_ned)[0], ((int16_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_INT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int32_t*)fixed_ned)[0], ((int32_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: 
+ case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int64_t*)fixed_ned)[0], ((int64_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_FLOAT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((float_t*)fixed_ned)[0], ((float_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_FLOAT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((double_t*)fixed_ned)[0], ((double_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + default: + throw TileDBSOMAError(std::format( + "[SOMADimension][_non_empty_domain_slot] Unknown " + "dimension " + "type {}", + impl::type_to_str(dimension.type()))); + } +} + std::any SOMADimension::_core_current_domain_slot( const SOMAContext& ctx, Array& array) const { CurrentDomain diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h index 4c14e85d59..19f65f6611 100644 --- a/libtiledbsoma/src/soma/soma_dimension.h +++ b/libtiledbsoma/src/soma/soma_dimension.h @@ -98,6 +98,9 @@ class SOMADimension : public virtual SOMAColumn { virtual std::any _non_empty_domain_slot(Array& array) const override; + virtual std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + virtual std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const override; diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc 
b/libtiledbsoma/src/soma/soma_geometry_column.cc index e7b7925486..7dceb04ece 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -359,6 +359,52 @@ std::any SOMAGeometryColumn::_non_empty_domain_slot(Array& array) const { std::make_pair(min, max)); } +std::any SOMAGeometryColumn::_non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + std::vector<double_t> min, max; + size_t dimensionality = dimensions.size() / 2; + int32_t is_empty; + double_t fixed_ned[2]; + + for (size_t i = 0; i < dimensionality; ++i) { + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimensions[i].name().c_str(), // Min dimension + fixed_ned, + &is_empty)); + + if (is_empty) { + return std::make_any<std::optional< + std::pair<std::vector<double_t>, std::vector<double_t>>>>( + std::nullopt); + } + + min.push_back(fixed_ned[0]); + + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimensions[i].name().c_str(), // Max dimension; NOTE(review): same index as the min lookup above -- confirm against the min/max layout of `dimensions` + fixed_ned, + &is_empty)); + + if (is_empty) { + return std::make_any<std::optional< + std::pair<std::vector<double_t>, std::vector<double_t>>>>( + std::nullopt); + } + + max.push_back(fixed_ned[1]); + } + + return std::make_any< + std::optional<std::pair<std::vector<double_t>, std::vector<double_t>>>>( + std::make_pair(min, max)); +} + std::any SOMAGeometryColumn::_core_current_domain_slot( const SOMAContext& ctx, Array& array) const { CurrentDomain diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h b/libtiledbsoma/src/soma/soma_geometry_column.h index ada667a239..509b7cb8e2 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.h +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -138,6 +138,9 @@ class SOMAGeometryColumn : public virtual SOMAColumn { virtual std::any _non_empty_domain_slot(Array& array) const override; + virtual std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + virtual std::any _core_current_domain_slot(
const SOMAContext& ctx, Array& array) const override; From 67e73d7e7169de5b617e59df27393a04e7c1571e Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Mon, 9 Dec 2024 18:29:06 +0200 Subject: [PATCH 4/6] Replace optional non empty domain with the SOMAColumn implementation, update python bindings --- apis/python/src/tiledbsoma/soma_array.cc | 6 ++++-- libtiledbsoma/src/soma/soma_array.h | 26 +----------------------- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index c38960e548..18abda9ca8 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -776,7 +776,8 @@ void load_soma_array(py::module& m) { array.non_empty_domain_slot(name)); case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - return py::cast(array.non_empty_domain_slot_var(name)); + return py::cast( + array.non_empty_domain_slot(name)); default: throw TileDBSOMAError( "Unsupported dtype for nonempty domain."); @@ -832,7 +833,8 @@ void load_soma_array(py::module& m) { array.non_empty_domain_slot_opt(name)); case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - return py::cast(array.non_empty_domain_slot_var(name)); + return py::cast( + array.non_empty_domain_slot_opt(name)); default: throw TileDBSOMAError( "Unsupported dtype for nonempty domain."); diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 8604acb27b..473226518c 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -776,31 +776,7 @@ class SOMAArray : public SOMAObject { template std::optional> non_empty_domain_slot_opt( const std::string& name) const { - try { - int32_t is_empty; - T ned[2]; - - // TODO currently we need to use the TileDB C API in order to check - // if the domain is empty or not. The C++ API returns (0, 0) - // currently which could also represent a single point at coordinate - // 0. 
Replace this when the C++ API supports correct checking for - // empty domains - ctx_->tiledb_ctx()->handle_error( - tiledb_array_get_non_empty_domain_from_name( - ctx_->tiledb_ctx()->ptr().get(), - arr_->ptr().get(), - name.c_str(), - &ned, - &is_empty)); - - if (is_empty == 1) { - return std::nullopt; - } else { - return std::make_pair(ned[0], ned[1]); - } - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } + return get_column(name)->non_empty_domain_slot_opt(*ctx_, *arr_); } /** From f8455c40f15e4eff7b9e3c1a60e28a1dbc6eb94f Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 10 Dec 2024 12:45:44 +0200 Subject: [PATCH 5/6] Add template-specialization guards --- libtiledbsoma/src/soma/soma_column.cc | 29 ++++++++++++++++++++++++++- libtiledbsoma/src/soma/soma_column.h | 16 +++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index 95a1cfecc1..6d43968878 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -52,9 +52,36 @@ SOMAColumn::core_current_domain_slot( if (current_domain.first == "" && (current_domain.second == "\x7f" || current_domain.second == "\xff")) { return std::pair("", ""); + } else { + throw TileDBSOMAError(std::format( + "[SOMAColumn][core_current_domain_slot] unexpected current " + "domain returnd ({}, {})", + current_domain.first, + current_domain.second)); } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } +} + +template <> +std::pair +SOMAColumn::core_current_domain_slot(NDRectangle& ndrect) const { + try { + std::pair + current_domain = std::any_cast>( + _core_current_domain_slot(ndrect)); - return current_domain; + if (current_domain.first == "" && (current_domain.second == "\x7f" || + current_domain.second == "\xff")) { + return std::pair("", ""); + } else { + throw TileDBSOMAError(std::format( + 
"[SOMAColumn][core_current_domain_slot] unexpected current " + "domain returnd ({}, {})", + current_domain.first, + current_domain.second)); + } } catch (const std::exception& e) { throw TileDBSOMAError(e.what()); } diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 0528a349e5..9951c94348 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -390,6 +390,12 @@ class SOMAColumn { */ template std::pair core_domain_slot() const { + if (std::is_same_v) { + throw std::runtime_error( + "SOMAArray::soma_domain_slot: template-specialization " + "failure."); + } + try { return std::any_cast>(_core_domain_slot()); } catch (const std::exception& e) { @@ -457,6 +463,12 @@ class SOMAColumn { template std::pair core_current_domain_slot( const SOMAContext& ctx, Array& array) const { + if (std::is_same_v) { + throw std::runtime_error( + "SOMAArray::soma_domain_slot: template-specialization " + "failure."); + } + try { return std::any_cast>( _core_current_domain_slot(ctx, array)); @@ -534,5 +546,9 @@ std::pair SOMAColumn::core_current_domain_slot( const SOMAContext& ctx, Array& array) const; +template <> +std::pair +SOMAColumn::core_current_domain_slot(NDRectangle& ndrect) const; + } // namespace tiledbsoma #endif \ No newline at end of file From e746f3ba08abe4843c0d8b1e037b6c02411a04a5 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 10 Dec 2024 13:11:31 +0200 Subject: [PATCH 6/6] Remove unsupported dimension datatypes --- libtiledbsoma/src/soma/soma_dimension.cc | 110 +++++++++-------------- 1 file changed, 41 insertions(+), 69 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index bf21fa0b0e..54c16545ae 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -86,8 +86,6 @@ void SOMADimension::_set_dim_points( break; case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - case 
TILEDB_CHAR: - case TILEDB_BLOB: query->select_points( dimension.name(), std::any_cast>(points)); @@ -178,10 +176,6 @@ void SOMADimension::_set_dim_ranges( break; case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: query->select_ranges( dimension.name(), std::any_cast>>( @@ -258,11 +252,7 @@ void SOMADimension::_set_current_domain_slot( rectangle.set_range(dimension.name(), dom[0], dom[1]); } break; case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_STRING_UTF8: { auto dom = std::any_cast>(domain[0]); if (dom[0] == "" && dom[1] == "") { rectangle.set_range(dimension.name(), "", "\x7f"); @@ -397,11 +387,7 @@ std::pair SOMADimension::_can_set_current_domain_slot( return comparator( std::any_cast>(new_domain[0])); case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_STRING_UTF8: { auto dom = std::any_cast>(new_domain[0]); if (dom[0] != "" || dom[1] != "") { return std::pair( @@ -519,10 +505,6 @@ std::any SOMADimension::_non_empty_domain_slot(Array& array) const { array.non_empty_domain(dimension.name())); case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - case TILEDB_BLOB: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: return std::make_any>( array.non_empty_domain_var(dimension.name())); default: @@ -537,49 +519,46 @@ std::any SOMADimension::_non_empty_domain_slot_opt( const SOMAContext& ctx, Array& array) const { int32_t is_empty; - switch (dimension.type()) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_GEOM_WKT: { - void* var_start; - void* var_end; - uint64_t size_start, size_end; - ctx.tiledb_ctx()->handle_error( - tiledb_array_get_non_empty_domain_var_size_from_name( - ctx.tiledb_ctx()->ptr().get(), - array.ptr().get(), - 
dimension.name().c_str(), - &size_start, - &size_end, - &is_empty)); - - if (is_empty) { - return std::make_any< - std::optional>>( - std::nullopt); - } - - var_start = malloc(size_start); - var_end = malloc(size_end); - - ctx.tiledb_ctx()->handle_error( - tiledb_array_get_non_empty_domain_var_from_name( - ctx.tiledb_ctx()->ptr().get(), - array.ptr().get(), - dimension.name().c_str(), - var_start, - var_end, - &is_empty)); - - auto ned = std::make_pair( - std::string((char*)var_start, size_start), - std::string((char*)var_end, size_end)); - free(var_start); - free(var_end); - + if (dimension.type() == TILEDB_STRING_ASCII || + dimension.type() == TILEDB_STRING_UTF8) { + void* var_start; + void* var_end; + uint64_t size_start, size_end; + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_size_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + &size_start, + &size_end, + &is_empty)); + + if (is_empty) { return std::make_any< - std::optional>>(ned); + std::optional>>( + std::nullopt); } + + var_start = malloc(size_start); + var_end = malloc(size_end); + + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + var_start, + var_end, + &is_empty)); + + auto ned = std::make_pair( + std::string((char*)var_start, size_start), + std::string((char*)var_end, size_end)); + free(var_start); + free(var_end); + + return std::make_any< + std::optional>>(ned); } void* fixed_ned = malloc(16); @@ -815,11 +794,7 @@ std::any SOMADimension::_core_current_domain_slot(NDRectangle& ndrect) const { std::make_pair(domain[0], domain[1])); } case TILEDB_STRING_UTF8: - case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_STRING_ASCII: { std::array domain = ndrect.range( dimension.name()); return std::make_any>( @@ -889,9 +864,6 @@ ArrowArray* 
SOMADimension::arrow_domain_slot( domain_slot(ctx, array, kind)); case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: return ArrowAdapter::make_arrow_array_child_string( domain_slot(ctx, array, kind)); default: