Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenv committed Apr 1, 2024
1 parent 482c083 commit 6f1f07c
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 75 deletions.
11 changes: 9 additions & 2 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,15 @@ def write(
for input_field in values.schema:
target_field = self.schema.field(input_field.name)

if pa.types.is_dictionary(target_field.type) and not pa.types.is_dictionary(input_field.type):
raise ValueError(f"{input_field.name} requires dictionary entry")
if pa.types.is_dictionary(target_field.type):
if not pa.types.is_dictionary(input_field.type):
raise ValueError(f"{input_field.name} requires dictionary entry")
# extend enums in array schema as necessary
# get evolved enums
col = values.column(input_field.name).combine_chunks()
new_enums = self._handle._handle.extend_enumeration(col)
print(new_enums)
# cast that in table

if pa.types.is_boolean(input_field.type):
target_schema.append(target_field.with_type(pa.uint8()))
Expand Down
120 changes: 93 additions & 27 deletions apis/python/src/tiledbsoma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,33 +63,34 @@ void write(SOMAArray& array, py::handle py_batch) {
data = arr_->buffers[1];
}

if (attributes.find(sch_->name) != attributes.end()) {
auto enmr_name = AttributeExperimental::get_enumeration_name(
*array.ctx()->tiledb_ctx(), attributes.at(sch_->name));

if (enmr_name.has_value()) {
auto dict = arr_->dictionary;
if (!dict) {
array.clear_column_data();
throw py::value_error(
"Saw non-dictionary column passed to enumerated type");
}

const void* enmr_data;
uint64_t* enmr_offsets = nullptr;
if (dict->n_buffers == 3) {
enmr_offsets = (uint64_t*)dict->buffers[1];
enmr_data = dict->buffers[2];
} else {
enmr_data = dict->buffers[1];
}

if (dict->length != 0) {
array.extend_enumeration(
sch_->name, dict->length, enmr_data, enmr_offsets);
}
}
}
// if (attributes.find(sch_->name) != attributes.end()) {
// auto enmr_name = AttributeExperimental::get_enumeration_name(
// *array.ctx()->tiledb_ctx(), attributes.at(sch_->name));

// if (enmr_name.has_value()) {
// auto dict = arr_->dictionary;
// if (!dict) {
// array.clear_column_data();
// throw py::value_error(
// "Saw non-dictionary column passed to enumerated
// type");
// }

// const void* enmr_data;
// uint64_t* enmr_offsets = nullptr;
// if (dict->n_buffers == 3) {
// enmr_offsets = (uint64_t*)dict->buffers[1];
// enmr_data = dict->buffers[2];
// } else {
// enmr_data = dict->buffers[1];
// }

// if (dict->length != 0) {
// array.extend_enumeration(
// sch_->name, dict->length, enmr_data, enmr_offsets);
// }
// }
// }

auto np = py::module::import("numpy");
auto table_offset = arr_->offset;
Expand Down Expand Up @@ -759,6 +760,71 @@ void load_soma_array(py::module& m) {

.def_property_readonly("dimension_names", &SOMAArray::dimension_names)

.def(
    "extend_enumeration",
    // Python binding: extends the enumeration attached to a column with the
    // dictionary values carried by a pyarrow dictionary array and returns
    // the resulting enumeration values as a Python list.
    //
    // py_batch must expose pyarrow's `_export_to_c(array_ptr, schema_ptr)`.
    // The exported schema's `name` selects the column whose enumeration is
    // extended.
    //
    // Raises ValueError (py::value_error) when the exported array carries no
    // dictionary, and TileDBSOMAError when the dictionary value type is not
    // one of the handled TileDB datatypes.
    [](SOMAArray& array, py::handle py_batch) -> py::object {
        // Zero-initialise so that `release` is null until the export
        // actually fills the structs; the guard below relies on that.
        ArrowSchema arrow_schema{};
        ArrowArray arrow_array{};

        // The Arrow C data interface makes the consumer responsible for
        // invoking the release callbacks. Doing it from a destructor also
        // covers every throwing path below.
        struct ArrowExportGuard {
            ArrowSchema& schema;
            ArrowArray& array;
            ~ArrowExportGuard() {
                if (array.release != nullptr) {
                    array.release(&array);
                }
                if (schema.release != nullptr) {
                    schema.release(&schema);
                }
            }
        } guard{arrow_schema, arrow_array};

        const auto arrow_schema_ptr = reinterpret_cast<uintptr_t>(
            &arrow_schema);
        const auto arrow_array_ptr = reinterpret_cast<uintptr_t>(
            &arrow_array);
        py_batch.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr);

        // A non-dictionary array exports a null dictionary pointer; reject
        // it explicitly instead of dereferencing it.
        const ArrowArray* dict = arrow_array.dictionary;
        if (dict == nullptr || arrow_schema.dictionary == nullptr) {
            throw py::value_error(
                "Saw non-dictionary column passed to enumerated type");
        }

        // Nothing to add: hand back an empty list.
        if (dict->length == 0) {
            return py::cast(std::vector<std::string>());
        }

        // Three buffers means a variable-length layout (validity, offsets,
        // data); otherwise the values live in buffer 1.
        const void* enmr_data = nullptr;
        uint64_t* enmr_offsets = nullptr;
        if (dict->n_buffers == 3) {
            enmr_offsets = static_cast<uint64_t*>(
                const_cast<void*>(dict->buffers[1]));
            enmr_data = dict->buffers[2];
        } else {
            enmr_data = dict->buffers[1];
        }

        auto new_enmr = array.extend_enumeration(
            arrow_schema.name, dict->length, enmr_data, enmr_offsets);

        // Convert the enumeration values using the dictionary's value type.
        const auto enmr_format = arrow_schema.dictionary->format;
        switch (ArrowAdapter::to_tiledb_format(enmr_format)) {
            case TILEDB_STRING_ASCII:
            case TILEDB_STRING_UTF8:
            case TILEDB_CHAR:
                return py::cast(new_enmr.as_vector<std::string>());
            case TILEDB_BOOL:
            case TILEDB_INT8:
                return py::cast(new_enmr.as_vector<int8_t>());
            case TILEDB_UINT8:
                return py::cast(new_enmr.as_vector<uint8_t>());
            case TILEDB_INT16:
                return py::cast(new_enmr.as_vector<int16_t>());
            case TILEDB_UINT16:
                return py::cast(new_enmr.as_vector<uint16_t>());
            case TILEDB_INT32:
                return py::cast(new_enmr.as_vector<int32_t>());
            case TILEDB_UINT32:
                return py::cast(new_enmr.as_vector<uint32_t>());
            case TILEDB_INT64:
                return py::cast(new_enmr.as_vector<int64_t>());
            case TILEDB_UINT64:
                return py::cast(new_enmr.as_vector<uint64_t>());
            case TILEDB_FLOAT32:
                return py::cast(new_enmr.as_vector<float>());
            case TILEDB_FLOAT64:
                return py::cast(new_enmr.as_vector<double>());
            default:
                throw TileDBSOMAError(
                    "extend_enumeration: Unsupported dict "
                    "datatype");
        }
    })

.def("set_metadata", set_metadata)

.def("delete_metadata", &SOMAArray::delete_metadata)
Expand Down
66 changes: 23 additions & 43 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ std::optional<std::shared_ptr<ArrayBuffers>> SOMAArray::read_next() {
return mq_->results();
}

void SOMAArray::extend_enumeration(
Enumeration SOMAArray::extend_enumeration(
std::string_view name,
uint64_t num_elems,
const void* data,
Expand Down Expand Up @@ -353,64 +353,44 @@ void SOMAArray::extend_enumeration(
throw TileDBSOMAError(
"Cannot extend enumeration; reached maximum capacity");
}

ArraySchemaEvolution se(*ctx_->tiledb_ctx());
se.extend_enumeration(enmr.extend(extend_values));
se.array_evolve(uri_);
}
break;

return enmr.extend(extend_values);
}
case TILEDB_BOOL:
case TILEDB_INT8: {
SOMAArray::_extend_value_helper(
case TILEDB_INT8:
return SOMAArray::_extend_value_helper(
(int8_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_UINT8: {
SOMAArray::_extend_value_helper(
case TILEDB_UINT8:
return SOMAArray::_extend_value_helper(
(uint8_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_INT16: {
SOMAArray::_extend_value_helper(
case TILEDB_INT16:
return SOMAArray::_extend_value_helper(
(int16_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_UINT16: {
SOMAArray::_extend_value_helper(
case TILEDB_UINT16:
return SOMAArray::_extend_value_helper(
(uint16_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_INT32: {
SOMAArray::_extend_value_helper(
case TILEDB_INT32:
return SOMAArray::_extend_value_helper(
(int32_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_UINT32: {
SOMAArray::_extend_value_helper(
case TILEDB_UINT32:
return SOMAArray::_extend_value_helper(
(uint32_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_INT64: {
SOMAArray::_extend_value_helper(
case TILEDB_INT64:
return SOMAArray::_extend_value_helper(
(int64_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_UINT64: {
SOMAArray::_extend_value_helper(
case TILEDB_UINT64:
return SOMAArray::_extend_value_helper(
(uint64_t*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_FLOAT32: {
SOMAArray::_extend_value_helper(
case TILEDB_FLOAT32:
return SOMAArray::_extend_value_helper(
(float*)data, num_elems, enmr, max_capacity);
break;
}
case TILEDB_FLOAT64: {
SOMAArray::_extend_value_helper(
case TILEDB_FLOAT64:
return SOMAArray::_extend_value_helper(
(double*)data, num_elems, enmr, max_capacity);
break;
}
default:
throw TileDBSOMAError(fmt::format(
"ArrowAdapter: Unsupported TileDB dict datatype: {} ",
Expand Down
7 changes: 4 additions & 3 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class SOMAArray : public SOMAObject {
*/
std::optional<std::shared_ptr<ArrayBuffers>> read_next();

void extend_enumeration(
Enumeration extend_enumeration(
std::string_view name,
uint64_t num_elems,
const void* data,
Expand Down Expand Up @@ -709,7 +709,7 @@ class SOMAArray : public SOMAObject {
//===================================================================

template <typename T>
void _extend_value_helper(
Enumeration _extend_value_helper(
T* data, uint64_t num_elems, Enumeration enmr, uint64_t max_capacity) {
std::vector<T> enums_in_write((T*)data, (T*)data + num_elems);
auto enums_existing = enmr.as_vector<T>();
Expand All @@ -728,11 +728,12 @@ class SOMAArray : public SOMAObject {
throw TileDBSOMAError(
"Cannot extend enumeration; reached maximum capacity");
}

ArraySchemaEvolution se(*ctx_->tiledb_ctx());
se.extend_enumeration(enmr.extend(extend_values));
se.array_evolve(uri_);
}

return enmr.extend(extend_values);
}

// Fills the metadata cache upon opening the array.
Expand Down

0 comments on commit 6f1f07c

Please sign in to comment.