Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Integrate SOMAColumn: Update domain accessors inside SOMAArray, part 2 #3407

Open
wants to merge 6 commits into
base: xan/sc-59427/soma-column-arrow-integration
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions apis/python/src/tiledbsoma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,8 @@ void load_soma_array(py::module& m) {
array.non_empty_domain_slot<float>(name));
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
return py::cast(array.non_empty_domain_slot_var(name));
return py::cast(
array.non_empty_domain_slot<std::string>(name));
default:
throw TileDBSOMAError(
"Unsupported dtype for nonempty domain.");
Expand Down Expand Up @@ -832,7 +833,8 @@ void load_soma_array(py::module& m) {
array.non_empty_domain_slot_opt<float>(name));
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
return py::cast(array.non_empty_domain_slot_var(name));
return py::cast(
array.non_empty_domain_slot_opt<std::string>(name));
default:
throw TileDBSOMAError(
"Unsupported dtype for nonempty domain.");
Expand Down
227 changes: 121 additions & 106 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@
#include <tiledb/array_experimental.h>
#include "../utils/logger.h"
#include "../utils/util.h"
#include "soma_attribute.h"
#include "soma_dimension.h"
#include "soma_geometry_column.h"

#include <format>
#include <variant>

namespace tiledbsoma {
using namespace tiledb;
Expand Down Expand Up @@ -143,6 +147,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache();
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -161,6 +166,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache();
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -177,6 +183,7 @@ SOMAArray::SOMAArray(
, schema_(std::make_shared<ArraySchema>(arr->schema())) {
reset({}, batch_size_, result_order_);
fill_metadata_cache();
fill_columns();
}

void SOMAArray::fill_metadata_cache() {
Expand Down Expand Up @@ -220,6 +227,7 @@ void SOMAArray::open(OpenMode mode, std::optional<TimestampRange> timestamp) {
validate(mode, name_, timestamp);
reset(column_names(), batch_size_, result_order_);
fill_metadata_cache();
fill_columns();
}

std::unique_ptr<SOMAArray> SOMAArray::reopen(
Expand Down Expand Up @@ -452,115 +460,19 @@ std::optional<TimestampRange> SOMAArray::timestamp() {
// The domainish enum simply lets us re-use code which is common across
// core domain, core current domain, and core non-empty domain.
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {
int array_ndim = this->ndim();
auto dimensions = tiledb_schema()->domain().dimensions();

// Create the schema for the info we return
std::vector<std::string> names(array_ndim);
std::vector<tiledb_datatype_t> tiledb_datatypes(array_ndim);

for (int i = 0; i < (int)array_ndim; i++) {
const Dimension& core_dim = dimensions[i];
names[i] = core_dim.name();
tiledb_datatypes[i] = core_dim.type();
}

auto arrow_schema = ArrowAdapter::make_arrow_schema(
names, tiledb_datatypes);
int array_ndim = std::count_if(
columns_.begin(), columns_.end(), [](const auto& col) {
return col->isIndexColumn();
});

// Create the data for the info we return
auto arrow_schema = ArrowAdapter::make_arrow_schema_parent(array_ndim);
auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim);

for (int i = 0; i < array_ndim; i++) {
auto core_dim = dimensions[i];
auto core_type_code = core_dim.type();

ArrowArray* child = nullptr;

switch (core_type_code) {
case TILEDB_INT64:
case TILEDB_DATETIME_YEAR:
case TILEDB_DATETIME_MONTH:
case TILEDB_DATETIME_WEEK:
case TILEDB_DATETIME_DAY:
case TILEDB_DATETIME_HR:
case TILEDB_DATETIME_MIN:
case TILEDB_DATETIME_SEC:
case TILEDB_DATETIME_MS:
case TILEDB_DATETIME_US:
case TILEDB_DATETIME_NS:
case TILEDB_DATETIME_PS:
case TILEDB_DATETIME_FS:
case TILEDB_DATETIME_AS:
case TILEDB_TIME_HR:
case TILEDB_TIME_MIN:
case TILEDB_TIME_SEC:
case TILEDB_TIME_MS:
case TILEDB_TIME_US:
case TILEDB_TIME_NS:
case TILEDB_TIME_PS:
case TILEDB_TIME_FS:
case TILEDB_TIME_AS:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<int64_t>(core_dim.name(), which_kind));
break;
case TILEDB_UINT64:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<uint64_t>(
core_dim.name(), which_kind));
break;
case TILEDB_INT32:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<int32_t>(core_dim.name(), which_kind));
break;
case TILEDB_UINT32:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<uint32_t>(
core_dim.name(), which_kind));
break;
case TILEDB_INT16:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<int16_t>(core_dim.name(), which_kind));
break;
case TILEDB_UINT16:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<uint16_t>(
core_dim.name(), which_kind));
break;
case TILEDB_INT8:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<int8_t>(core_dim.name(), which_kind));
break;
case TILEDB_UINT8:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<uint8_t>(core_dim.name(), which_kind));
break;

case TILEDB_FLOAT64:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<double>(core_dim.name(), which_kind));
break;
case TILEDB_FLOAT32:
child = ArrowAdapter::make_arrow_array_child(
_core_domainish_slot<float>(core_dim.name(), which_kind));
break;

case TILEDB_STRING_ASCII:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
child = ArrowAdapter::make_arrow_array_child_string(
_core_domainish_slot_string(core_dim.name(), which_kind));
break;

default:
throw TileDBSOMAError(std::format(
"SOMAArray::_get_core_domainish:dim {} has unhandled type "
"{}",
core_dim.name(),
tiledb::impl::type_to_str(core_type_code)));
}
arrow_array->children[i] = child;
for (int64_t i = 0; i < array_ndim; ++i) {
arrow_schema->children[i] = columns_[i]->arrow_schema_slot(
*ctx_, *arr_);
arrow_array->children[i] = columns_[i]->arrow_domain_slot(
*ctx_, *arr_, which_kind);
}

return ArrowTable(std::move(arrow_array), std::move(arrow_schema));
Expand Down Expand Up @@ -1656,4 +1568,107 @@ void SOMAArray::_check_dims_are_int64() {
}
}

void SOMAArray::fill_columns() {
columns_.clear();
std::deque<std::variant<Dimension, Attribute>> tdb_columns;

for (std::size_t i = 0; i < arr_->schema().domain().ndim(); ++i) {
tdb_columns.push_back(arr_->schema().domain().dimension(i));
}

// We need the correct order of attributes
for (std::size_t i = 0; i < arr_->schema().attribute_num(); ++i) {
tdb_columns.push_back(arr_->schema().attribute(i));
}

while (!tdb_columns.empty()) {
std::visit(
[&](auto&& arg) {
using T = std::decay_t<decltype(arg)>;
if constexpr (std::is_same_v<T, Attribute>) {
columns_.push_back(std::make_shared<SOMAAttribute>(arg));
tdb_columns.pop_front();
} else if constexpr (std::is_same_v<T, Dimension>) {
if (arg.name().rfind(SOMA_GEOMETRY_DIMENSION_PREFIX, 0) ==
0) {
std::vector<Dimension> dims;
for (std::size_t i = 0; i < tdb_columns.size(); ++i) {
if (std::holds_alternative<Dimension>(
tdb_columns[i]) &&
std::get<Dimension>(tdb_columns[i])
.name()
.rfind(
SOMA_GEOMETRY_DIMENSION_PREFIX,
0) == 0) {
dims.push_back(
std::get<Dimension>(tdb_columns[i]));
}
}

// Internal columns are all sequentially stored so we
// can remove them all by once
tdb_columns.erase(
tdb_columns.begin(),
tdb_columns.begin() + dims.size());

auto attr = std::find_if(
tdb_columns.begin(),
tdb_columns.end(),
[&](auto& col) {
if (std::holds_alternative<Attribute>(col) &&
std::get<Attribute>(col).name().compare(
SOMA_GEOMETRY_COLUMN_NAME) == 0) {
return true;
}
return false;
});

if (attr == tdb_columns.end()) {
throw TileDBSOMAError(std::format(
"[SOMAArray] Missing required attribute {} for "
"SOMAGeometryColumn",
SOMA_GEOMETRY_COLUMN_NAME));
}

columns_.push_back(std::make_shared<SOMAGeometryColumn>(
dims, std::get<Attribute>(*attr)));
tdb_columns.erase(attr);
} else {
// Vanilla dimension
columns_.push_back(
std::make_shared<SOMADimension>(arg));
tdb_columns.pop_front();
}
}
},
tdb_columns.front());
}
}

std::shared_ptr<SOMAColumn> SOMAArray::get_column(std::string_view name) const {
auto result = std::find_if(columns_.begin(), columns_.end(), [&](auto col) {
return col->name() == name;
});

if (result == columns_.end()) {
throw TileDBSOMAError(std::format(
"[SOMAArray] internal coding error: No column named {} found",
name));
}

return *result;
}

std::shared_ptr<SOMAColumn> SOMAArray::get_column(std::size_t index) const {
if (index >= columns_.size()) {
throw TileDBSOMAError(std::format(
"[SOMAArray] internal coding error: Column index outside of range. "
"Requested {}, but {} exist.",
index,
columns_.size()));
}

return columns_[index];
}

} // namespace tiledbsoma
Loading