diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 7d0e379731..65049be95b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022-2023 TileDB, Inc. + * @copyright Copyright (c) 2022-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -102,8 +102,7 @@ class SOMAArray : public SOMAObject { * * @param mode read or write * @param uri URI of the array - * @param name Name of the array - * @param platform_config Config parameter dictionary + * @param ctx SOMAContext * @param column_names Columns to read * @param batch_size Read batch size * @param result_order Read result order: automatic (default), rowmajor, @@ -152,8 +151,8 @@ class SOMAArray : public SOMAObject { * * @param mode read or write * @param uri URI of the array + * @param ctx SOMAContext * @param name name of the array - * @param platform_config Config parameter dictionary * @param column_names Columns to read * @param batch_size Batch size * @param result_order Result order diff --git a/libtiledbsoma/src/soma/soma_collection.cc b/libtiledbsoma/src/soma/soma_collection.cc index 5bf74c62bb..2c6541b642 100644 --- a/libtiledbsoma/src/soma/soma_collection.cc +++ b/libtiledbsoma/src/soma/soma_collection.cc @@ -41,12 +41,11 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMACollection::create( +void SOMACollection::create( std::string_view uri, std::shared_ptr ctx, std::optional timestamp) { - auto soma_group = SOMAGroup::create(ctx, uri, "SOMACollection", timestamp); - return std::make_unique(*soma_group); + SOMAGroup::create(ctx, uri, "SOMACollection", timestamp); } std::unique_ptr SOMACollection::open( @@ -111,8 +110,11 @@ std::shared_ptr 
SOMACollection::add_new_experiment( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - SOMAExperiment::create(uri, schema, ctx); + std::unique_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMAExperiment::create( + uri, std::move(schema), index_columns, ctx, platform_config); std::shared_ptr member = SOMAExperiment::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -125,8 +127,9 @@ std::shared_ptr SOMACollection::add_new_measurement( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - SOMAMeasurement::create(uri, schema, ctx); + std::unique_ptr schema, + ColumnIndexInfo index_columns) { + SOMAMeasurement::create(uri, std::move(schema), index_columns, ctx); std::shared_ptr member = SOMAMeasurement::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -139,8 +142,11 @@ std::shared_ptr SOMACollection::add_new_dataframe( std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema) { - SOMADataFrame::create(uri, schema, ctx); + std::unique_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config) { + SOMADataFrame::create( + uri, std::move(schema), index_columns, ctx, platform_config); std::shared_ptr member = SOMADataFrame::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -154,7 +160,7 @@ std::shared_ptr SOMACollection::add_new_dense_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - SOMADenseNDArray::create(uri, schema, ctx); + SOMADenseNDArray::create(uri, std::move(schema), ctx); std::shared_ptr member = SOMADenseNDArray::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); @@ -168,7 +174,7 @@ std::shared_ptr SOMACollection::add_new_sparse_ndarray( URIType uri_type, std::shared_ptr ctx, ArraySchema schema) { - 
SOMASparseNDArray::create(uri, schema, ctx); + SOMASparseNDArray::create(uri, std::move(schema), ctx); std::shared_ptr member = SOMASparseNDArray::open( uri, OpenMode::read, ctx); this->set(std::string(uri), uri_type, std::string(key)); diff --git a/libtiledbsoma/src/soma/soma_collection.h b/libtiledbsoma/src/soma/soma_collection.h index 58fbf418b7..47564b67f3 100644 --- a/libtiledbsoma/src/soma/soma_collection.h +++ b/libtiledbsoma/src/soma/soma_collection.h @@ -61,7 +61,7 @@ class SOMACollection : public SOMAGroup { * @param ctx TileDB context * @param uri URI to create the SOMACollection */ - static std::unique_ptr create( + static void create( std::string_view uri, std::shared_ptr ctx, std::optional timestamp = std::nullopt); @@ -157,7 +157,9 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::unique_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMAMeasurement to the SOMACollection. @@ -172,7 +174,8 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::unique_ptr schema, + ColumnIndexInfo index_columns); /** * Create and add a SOMADataFrame to the SOMACollection. @@ -187,7 +190,9 @@ class SOMACollection : public SOMAGroup { std::string_view uri, URIType uri_type, std::shared_ptr ctx, - ArraySchema schema); + std::unique_ptr schema, + ColumnIndexInfo index_columns, + std::optional platform_config = std::nullopt); /** * Create and add a SOMADenseNDArray to the SOMACollection. 
diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 441288f134..56de8b193b 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -39,14 +39,16 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADataFrame::create( +void SOMADataFrame::create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional timestamp) { - auto soma_array = SOMAArray::create( - ctx, uri, schema, "SOMADataFrame", timestamp); - return std::make_unique(*soma_array); + auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), std::move(schema), index_columns, platform_config); + SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); } std::unique_ptr SOMADataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 8583f16fb2..504779913f 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2023-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -52,33 +52,36 @@ class SOMADataFrame : public SOMAArray { /** * @brief Create a SOMADataFrame object at the given URI. 
* - * @param uri URI to create the SOMAArray - * @param schema TileDB ArraySchema + * @param uri URI to create the SOMADataFrame + * @param schema Arrow schema + * @param index_columns The index column names with associated domains + * and tile extents per dimension * @param ctx SOMAContext - * @param timestamp Optional pair indicating timestamp start and end - * @return std::unique_ptr + * @param platform_config Optional config parameter dictionary + * @param timestamp Optional the timestamp range to write SOMA metadata info */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional timestamp = std::nullopt); /** * @brief Open and return a SOMADataFrame object at the given URI. * - * @param mode read or write * @param uri URI to create the SOMADataFrame + * @param mode read or write + * @param ctx SOMAContext * @param column_names A list of column names to use as user-defined index * columns (e.g., ``['cell_type', 'tissue_type']``). All named columns must * exist in the schema, and at least one index column name is required. - * @param platform_config Platform-specific options used to create this - * DataFrame * @param result_order Read result order: automatic (default), rowmajor, or * colmajor * @param timestamp If specified, overrides the default timestamp used to * open this object. If unset, uses the timestamp provided by the context. - * @return std::unique_ptr + * @return std::unique_ptr SOMADataFrame */ static std::unique_ptr open( std::string_view uri, diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index b82f8d3ace..ca53fbfbee 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -29,7 +29,6 @@ * * This file defines the SOMADenseNDArray class. 
*/ - #include "soma_dense_ndarray.h" namespace tiledbsoma { @@ -39,14 +38,12 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMADenseNDArray::create( +void SOMADenseNDArray::create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx, std::optional timestamp) { - auto soma_array = SOMAArray::create( - ctx, uri, schema, "SOMADenseNDArray", timestamp); - return std::make_unique(*soma_array); + SOMAArray::create(ctx, uri, schema, "SOMADenseNDArray", timestamp); } std::unique_ptr SOMADenseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.h b/libtiledbsoma/src/soma/soma_dense_ndarray.h index 47a13f7bdb..a1997019a0 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.h +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2023-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -52,13 +52,12 @@ class SOMADenseNDArray : public SOMAArray { /** * @brief Create a SOMADenseNDArray object at the given URI. * - * @param uri URI to create the SOMAArray - * @param schema TileDB ArraySchema + * @param uri URI to create the SOMADenseNDArray + * @param schema TileDB ArraySchema * @param ctx SOMAContext - * @param timestamp Optional pair indicating timestamp start and end - * @return std::unique_ptr + * @param timestamp Optional timestamp range to write SOMA metadata info */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx, @@ -67,18 +66,17 @@ class SOMADenseNDArray : public SOMAArray { /** * @brief Open and return a SOMADenseNDArray object at the given URI. 
* - * @param mode read or write * @param uri URI to create the SOMADenseNDArray + * @param mode read or write + * @param ctx SOMAContext * @param column_names A list of column names to use as user-defined index * columns (e.g., ``['cell_type', 'tissue_type']``). All named columns must * exist in the schema, and at least one index column name is required. - * @param platform_config Platform-specific options used to create this - * SOMADenseNDArray - * @param timestamp If specified, overrides the default timestamp used to - * open this object. If unset, uses the timestamp provided by the context. * @param result_order Read result order: automatic (default), rowmajor, or * colmajor - * @return std::shared_ptr + * @param timestamp If specified, overrides the default timestamp used to + * open this object. If unset, uses the timestamp provided by the context. + * @return std::unique_ptr SOMADenseNDArray */ static std::unique_ptr open( std::string_view uri, @@ -105,6 +103,7 @@ class SOMADenseNDArray : public SOMAArray { * @param mode read or write * @param uri URI of the array * @param ctx TileDB context + * @param column_names Columns to read * @param result_order Read result order: automatic (default), rowmajor, or * colmajor * @param timestamp Timestamp diff --git a/libtiledbsoma/src/soma/soma_experiment.cc b/libtiledbsoma/src/soma/soma_experiment.cc index b2bb3fa5ed..370c79419b 100644 --- a/libtiledbsoma/src/soma/soma_experiment.cc +++ b/libtiledbsoma/src/soma/soma_experiment.cc @@ -41,19 +41,31 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAExperiment::create( +void SOMAExperiment::create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional timestamp) { std::string exp_uri(uri); - auto soma_group = SOMAGroup::create(ctx, uri, "SOMAExperiment", timestamp); - 
SOMADataFrame::create(exp_uri + "/obs", schema, ctx, timestamp); + SOMAGroup::create(ctx, exp_uri, "SOMAExperiment", timestamp); + SOMADataFrame::create( + exp_uri + "/obs", + std::move(schema), + index_columns, + ctx, + platform_config, + timestamp); SOMACollection::create(exp_uri + "/ms", ctx, timestamp); - soma_group->set(exp_uri + "/obs", URIType::absolute, "obs"); - soma_group->set(exp_uri + "/ms", URIType::absolute, "ms"); - return std::make_unique(*soma_group); + + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open( + OpenMode::write, exp_uri, ctx, name, timestamp); + group->set(exp_uri + "/obs", URIType::absolute, "obs"); + group->set(exp_uri + "/ms", URIType::absolute, "ms"); + group->close(); } std::unique_ptr SOMAExperiment::open( diff --git a/libtiledbsoma/src/soma/soma_experiment.h b/libtiledbsoma/src/soma/soma_experiment.h index 9303d42b73..43e75eac0c 100644 --- a/libtiledbsoma/src/soma/soma_experiment.h +++ b/libtiledbsoma/src/soma/soma_experiment.h @@ -54,10 +54,12 @@ class SOMAExperiment : public SOMACollection { * @param schema TileDB ArraySchema * @param platform_config Optional config parameter dictionary */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional timestamp = std::nullopt); /** diff --git a/libtiledbsoma/src/soma/soma_measurement.cc b/libtiledbsoma/src/soma/soma_measurement.cc index 38c892cc5a..dc043bc1fe 100644 --- a/libtiledbsoma/src/soma/soma_measurement.cc +++ b/libtiledbsoma/src/soma/soma_measurement.cc @@ -41,29 +41,38 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMAMeasurement::create( +void SOMAMeasurement::create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo 
index_columns, std::shared_ptr ctx, + std::optional platform_config, std::optional timestamp) { std::string exp_uri(uri); - auto soma_group = SOMAGroup::create( - ctx, exp_uri, "SOMAMeasurement", timestamp); - SOMADataFrame::create(exp_uri + "/var", schema, ctx, timestamp); + SOMAGroup::create(ctx, exp_uri, "SOMAMeasurement", timestamp); + SOMADataFrame::create( + exp_uri + "/var", + std::move(schema), + index_columns, + ctx, + platform_config, + timestamp); SOMACollection::create(exp_uri + "/X", ctx, timestamp); SOMACollection::create(exp_uri + "/obsm", ctx, timestamp); SOMACollection::create(exp_uri + "/obsp", ctx, timestamp); SOMACollection::create(exp_uri + "/varm", ctx, timestamp); SOMACollection::create(exp_uri + "/varp", ctx, timestamp); - soma_group->set(exp_uri + "/var", URIType::absolute, "var"); - soma_group->set(exp_uri + "/X", URIType::absolute, "X"); - soma_group->set(exp_uri + "/obsm", URIType::absolute, "obsm"); - soma_group->set(exp_uri + "/obsp", URIType::absolute, "obsp"); - soma_group->set(exp_uri + "/varm", URIType::absolute, "varm"); - soma_group->set(exp_uri + "/varp", URIType::absolute, "varp"); - return std::make_unique(*soma_group); + auto name = std::string(std::filesystem::path(uri).filename()); + auto group = SOMAGroup::open(OpenMode::write, uri, ctx, name, timestamp); + group->set(exp_uri + "/var", URIType::absolute, "var"); + group->set(exp_uri + "/X", URIType::absolute, "X"); + group->set(exp_uri + "/obsm", URIType::absolute, "obsm"); + group->set(exp_uri + "/obsp", URIType::absolute, "obsp"); + group->set(exp_uri + "/varm", URIType::absolute, "varm"); + group->set(exp_uri + "/varp", URIType::absolute, "varp"); + group->close(); } std::unique_ptr SOMAMeasurement::open( diff --git a/libtiledbsoma/src/soma/soma_measurement.h b/libtiledbsoma/src/soma/soma_measurement.h index dbfe3b2505..0d60702723 100644 --- a/libtiledbsoma/src/soma/soma_measurement.h +++ b/libtiledbsoma/src/soma/soma_measurement.h @@ -55,10 +55,12 @@ class 
SOMAMeasurement : public SOMACollection { * @param schema TileDB ArraySchema * @param ctx TileDB context */ - static std::unique_ptr create( + static void create( std::string_view uri, - ArraySchema schema, + std::unique_ptr schema, + ColumnIndexInfo index_columns, std::shared_ptr ctx, + std::optional platform_config = std::nullopt, std::optional timestamp = std::nullopt); /** diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index 4c2828b355..7d51b40d69 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -18,7 +18,7 @@ std::unique_ptr SOMAObject::open( std::string_view uri, OpenMode mode, std::shared_ptr ctx, - std::optional> timestamp, + std::optional timestamp, std::optional soma_type) { if (soma_type == std::nullopt) { auto tiledb_type = Object::object(*ctx->tiledb_ctx(), std::string(uri)) diff --git a/libtiledbsoma/src/soma/soma_object.h b/libtiledbsoma/src/soma/soma_object.h index 048796fc84..b2067696ef 100644 --- a/libtiledbsoma/src/soma/soma_object.h +++ b/libtiledbsoma/src/soma/soma_object.h @@ -56,7 +56,7 @@ class SOMAObject { std::string_view uri, OpenMode mode, std::shared_ptr ctx, - std::optional> timestamp = std::nullopt, + std::optional timestamp = std::nullopt, std::optional soma_type = std::nullopt); /** diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index ca3d91fc61..73358dff5a 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -39,14 +39,12 @@ using namespace tiledb; //= public static //=================================================================== -std::unique_ptr SOMASparseNDArray::create( +void SOMASparseNDArray::create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx, std::optional timestamp) { - auto soma_array = SOMAArray::create( - ctx, uri, schema, "SOMASparseNDArray", timestamp); - return 
std::make_unique(*soma_array); + SOMAArray::create(ctx, uri, schema, "SOMASparseNDArray", timestamp); } std::unique_ptr SOMASparseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.h b/libtiledbsoma/src/soma/soma_sparse_ndarray.h index 4500b8870e..db3b251a53 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.h +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.h @@ -52,13 +52,12 @@ class SOMASparseNDArray : public SOMAArray { /** * @brief Create a SOMASparseNDArray object at the given URI. * - * @param uri URI to create the SOMAArray - * @param schema TileDB ArraySchema + * @param uri URI to create the SOMASparseNDArray + * @param schema TileDB ArraySchema * @param ctx SOMAContext - * @param timestamp Optional pair indicating timestamp start and end - * @return std::unique_ptr + * @param timestamp Optional timestamp range to write SOMA metadata info */ - static std::unique_ptr create( + static void create( std::string_view uri, ArraySchema schema, std::shared_ptr ctx, @@ -67,18 +66,17 @@ class SOMASparseNDArray : public SOMAArray { /** * @brief Open and return a SOMASparseNDArray object at the given URI. * - * @param mode read or write * @param uri URI to create the SOMASparseNDArray + * @param mode read or write + * @param ctx SOMAContext * @param column_names A list of column names to use as user-defined index * columns (e.g., ``['cell_type', 'tissue_type']``). All named columns must * exist in the schema, and at least one index column name is required. - * @param platform_config Platform-specific options used to create this - * SOMASparseNDArray * @param result_order Read result order: automatic (default), rowmajor, or * colmajor * @param timestamp If specified, overrides the default timestamp used to * open this object. If unset, uses the timestamp provided by the context. 
- * @return std::unique_ptr + * @return std::unique_ptr SOMASparseNDArray */ static std::unique_ptr open( std::string_view uri, @@ -105,6 +103,7 @@ class SOMASparseNDArray : public SOMAArray { * @param mode read or write * @param uri URI of the array * @param ctx TileDB context + * @param column_names Columns to read * @param result_order Read result order: automatic (default), rowmajor, or * colmajor * @param timestamp Timestamp @@ -139,7 +138,7 @@ class SOMASparseNDArray : public SOMAArray { using SOMAArray::open; /** - * Return whether the NDArray is sparse. + * Return whether the SOMASparseNDArray is sparse. * * @return true */ diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 13f66069e1..e4f5d2dc17 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -221,6 +221,137 @@ std::unique_ptr ArrowAdapter::arrow_schema_from_tiledb_array( return arrow_schema; } +ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( + std::shared_ptr ctx, + std::unique_ptr arrow_schema, + ColumnIndexInfo index_column_info, + std::optional platform_config) { + auto [index_column_names, domains, extents] = index_column_info; + + ArraySchema schema(*ctx, TILEDB_SPARSE); + Domain domain(*ctx); + + if (platform_config) { + std::map convert_filter = { + {"GzipFilter", TILEDB_FILTER_GZIP}, + {"ZstdFilter", TILEDB_FILTER_ZSTD}, + {"LZ4Filter", TILEDB_FILTER_LZ4}, + {"Bzip2Filter", TILEDB_FILTER_BZIP2}, + {"RleFilter", TILEDB_FILTER_RLE}, + {"DeltaFilter", TILEDB_FILTER_DELTA}, + {"DoubleDeltaFilter", TILEDB_FILTER_DOUBLE_DELTA}, + {"BitWidthReductionFilter", TILEDB_FILTER_BIT_WIDTH_REDUCTION}, + {"BitShuffleFilter", TILEDB_FILTER_BITSHUFFLE}, + {"ByteShuffleFilter", TILEDB_FILTER_BYTESHUFFLE}, + {"PositiveDeltaFilter", TILEDB_FILTER_POSITIVE_DELTA}, + {"ChecksumMD5Filter", TILEDB_FILTER_CHECKSUM_MD5}, + {"ChecksumSHA256Filter", TILEDB_FILTER_CHECKSUM_SHA256}, + {"DictionaryFilter", 
TILEDB_FILTER_DICTIONARY}, + {"FloatScaleFilter", TILEDB_FILTER_SCALE_FLOAT}, + {"XORFilter", TILEDB_FILTER_XOR}, + {"WebpFilter", TILEDB_FILTER_WEBP}, + {"NoOpFilter", TILEDB_FILTER_NONE}, + }; + + schema.set_capacity(platform_config->capacity); + + if (platform_config->offsets_filters.size() != 0) { + FilterList offset_filter_list(*ctx); + for (auto offset : platform_config->offsets_filters) { + offset_filter_list.add_filter( + Filter(*ctx, convert_filter[offset])); + } + schema.set_offsets_filter_list(offset_filter_list); + } + + if (platform_config->validity_filters.size() != 0) { + FilterList validity_filter_list(*ctx); + for (auto validity : platform_config->validity_filters) { + validity_filter_list.add_filter( + Filter(*ctx, convert_filter[validity])); + } + schema.set_validity_filter_list(validity_filter_list); + } + + schema.set_allows_dups(platform_config->allows_duplicates); + + if (platform_config->tile_order) + schema.set_tile_order( + platform_config->tile_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + + if (platform_config->cell_order) + schema.set_cell_order( + platform_config->cell_order == "row" ? TILEDB_ROW_MAJOR : + TILEDB_COL_MAJOR); + } + + std::map dims; + + for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { + auto child = arrow_schema->children[sch_idx]; + auto type = ArrowAdapter::to_tiledb_format(child->format); + + auto idx_col_begin = index_column_names.begin(); + auto idx_col_end = index_column_names.end(); + auto idx_col_it = std::find(idx_col_begin, idx_col_end, child->name); + + if (idx_col_it != idx_col_end) { + auto idx_col_idx = std::distance(idx_col_begin, idx_col_it); + if (ArrowAdapter::_isvar(child->format)) { + type = TILEDB_STRING_ASCII; + } + + auto dim = Dimension::create( + *ctx, + child->name, + type, + type == TILEDB_STRING_ASCII ? + nullptr : + domains->children[idx_col_idx]->buffers[1], + type == TILEDB_STRING_ASCII ? 
+ nullptr : + extents->children[idx_col_idx]->buffers[1]); + + dims.insert({dim.name(), dim}); + } else { + Attribute attr(*ctx, child->name, type); + + if (child->flags & ARROW_FLAG_NULLABLE) { + attr.set_nullable(true); + } + + if (ArrowAdapter::_isvar(child->format)) { + attr.set_cell_val_num(TILEDB_VAR_NUM); + } + + if (child->dictionary != nullptr) { + auto enmr_format = child->dictionary->format; + auto enmr_type = ArrowAdapter::to_tiledb_format(enmr_format); + auto enmr = Enumeration::create_empty( + *ctx, + child->name, + enmr_type, + ArrowAdapter::_isvar(enmr_format) ? TILEDB_VAR_NUM : 1, + child->flags & ARROW_FLAG_DICTIONARY_ORDERED); + ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr); + AttributeExperimental::set_enumeration_name( + *ctx, attr, child->name); + } + + schema.add_attribute(attr); + } + } + + for (auto column_name : index_column_names) + domain.add_dimension(dims.at(column_name)); + schema.set_domain(domain); + + schema.check(); + + return schema; +} + std::pair ArrowAdapter::_get_data_and_length( Enumeration& enmr, const void* dst) { switch (enmr.type()) { @@ -473,63 +604,60 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { return std::pair(std::move(array), std::move(schema)); } +bool ArrowAdapter::_isvar(const char* format) { + if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) || + (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) { + return true; + } + return false; +} + std::string_view ArrowAdapter::to_arrow_format( - tiledb_datatype_t datatype, bool use_large) { - switch (datatype) { - case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - return use_large ? "U" : "u"; // large because TileDB - // uses 64bit offsets - case TILEDB_CHAR: - case TILEDB_BLOB: - return use_large ? 
"Z" : "z"; // large because TileDB - // uses 64bit offsets - case TILEDB_BOOL: - return "b"; - case TILEDB_INT32: - return "i"; - case TILEDB_INT64: - return "l"; - case TILEDB_FLOAT32: - return "f"; - case TILEDB_FLOAT64: - return "g"; - case TILEDB_INT8: - return "c"; - case TILEDB_UINT8: - return "C"; - case TILEDB_INT16: - return "s"; - case TILEDB_UINT16: - return "S"; - case TILEDB_UINT32: - return "I"; - case TILEDB_UINT64: - return "L"; - case TILEDB_TIME_SEC: - return "tts"; - case TILEDB_TIME_MS: - return "ttm"; - case TILEDB_TIME_US: - return "ttu"; - case TILEDB_TIME_NS: - return "ttn"; - case TILEDB_DATETIME_DAY: - return "tdD"; - case TILEDB_DATETIME_SEC: - return "tss:"; - case TILEDB_DATETIME_MS: - return "tsm:"; - case TILEDB_DATETIME_US: - return "tsu:"; - case TILEDB_DATETIME_NS: - return "tsn:"; - default: - break; + tiledb_datatype_t tiledb_dtype, bool use_large) { + auto u = use_large ? "U" : "u"; + auto z = use_large ? "Z" : "z"; + std::map _to_arrow_format_map = { + {TILEDB_STRING_ASCII, u}, {TILEDB_CHAR, z}, + {TILEDB_STRING_UTF8, u}, {TILEDB_BLOB, z}, + {TILEDB_INT8, "c"}, {TILEDB_UINT8, "C"}, + {TILEDB_INT16, "s"}, {TILEDB_UINT16, "S"}, + {TILEDB_INT32, "i"}, {TILEDB_UINT32, "I"}, + {TILEDB_INT64, "l"}, {TILEDB_UINT64, "L"}, + {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, + {TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"}, + {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, + {TILEDB_DATETIME_NS, "tsn:"}, + }; + + try { + return _to_arrow_format_map.at(tiledb_dtype); + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported TileDB type: {} ", + tiledb::impl::type_to_str(tiledb_dtype))); + } +} + +tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { + std::map _to_tiledb_format_map = { + {"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8}, + {"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR}, + {"c", TILEDB_INT8}, {"C", TILEDB_UINT8}, + {"s", TILEDB_INT16}, 
{"S", TILEDB_UINT16}, + {"i", TILEDB_INT32}, {"I", TILEDB_UINT32}, + {"l", TILEDB_INT64}, {"L", TILEDB_UINT64}, + {"f", TILEDB_FLOAT32}, {"g", TILEDB_FLOAT64}, + {"b", TILEDB_BOOL}, {"tss:", TILEDB_DATETIME_SEC}, + {"tsm:", TILEDB_DATETIME_MS}, {"tsu:", TILEDB_DATETIME_US}, + {"tsn:", TILEDB_DATETIME_NS}, + }; + + try { + return _to_tiledb_format_map.at(arrow_dtype); + } catch (const std::out_of_range& e) { + throw std::out_of_range(fmt::format( + "ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype)); } - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: Unsupported TileDB datatype: {} ", - tiledb::impl::type_to_str(datatype))); } // FIXME: Add more types, maybe make it a map diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 818f5cc370..367c34a8cd 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -31,6 +31,32 @@ struct ArrowBuffer { std::shared_ptr buffer_; }; +using ArrowTable = + std::pair, std::shared_ptr>; + +using ColumnIndexInfo = std::tuple< + std::vector, // name of column + std::shared_ptr, // domain + std::shared_ptr // tile extent + >; + +class PlatformConfig { + public: + uint64_t dataframe_dim_zstd_level = 3; + uint64_t sparse_nd_array_dim_zstd_level = 3; + bool write_X_chunked = true; + uint64_t goal_chunk_nnz = 100000000; + uint64_t remote_cap_nbytes = 2400000000; + uint64_t capacity = 100000; + std::vector offsets_filters = { + "DoubleDeltaFilter", "BitWidthReductionFilter", "ZstdFilter"}; + std::vector validity_filters; + bool allows_duplicates = false; + std::optional tile_order = std::nullopt; + std::optional cell_order = std::nullopt; + bool consolidate_and_vacuum = false; +}; + class ArrowAdapter { public: static void release_schema(struct ArrowSchema* schema); @@ -47,17 +73,41 @@ class ArrowAdapter { static std::pair, std::unique_ptr> to_arrow(std::shared_ptr column); + /** + * @brief Create an ArrowSchema from TileDB 
Schema + * + * 
@return ArrowSchema + */ static std::unique_ptr arrow_schema_from_tiledb_array( std::shared_ptr ctx, std::shared_ptr tiledb_array); + /** + * @brief Create a TileDB ArraySchema from ArrowSchema + * + * @return tiledb::ArraySchema + */ + static ArraySchema tiledb_schema_from_arrow_schema( + std::shared_ptr ctx, + std::unique_ptr arrow_schema, + ColumnIndexInfo index_column_info, + std::optional platform_config); + /** * @brief Get Arrow format string from TileDB datatype. * - * @param datatype TileDB datatype. + * @param tiledb_dtype TileDB datatype. * @return std::string_view Arrow format string. */ static std::string_view to_arrow_format( - tiledb_datatype_t datatype, bool use_large = true); + tiledb_datatype_t tiledb_dtype, bool use_large = true); + + /** + * @brief Get TileDB datatype from Arrow format string. + * + * @param arrow_dtype Arrow format string. + * @return tiledb_datatype_t TileDB datatype. + */ + static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype); static enum ArrowType to_nanoarrow_type(std::string_view sv); @@ -72,6 +122,11 @@ class ArrowAdapter { std::memcpy((void*)dst, src.data(), sz); return dst; } + + static std::optional> _get_dim_info( + std::string_view dim_name, ArrowTable index_columns); + + static bool _isvar(const char* format); }; }; // namespace tiledbsoma diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index b58a8952bc..4efbc890b1 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -27,6 +27,8 @@ find_package(Catch_EP REQUIRED) add_executable(unit_soma $ + common.cc + common.h unit_column_buffer.cc unit_managed_query.cc unit_soma_array.cc diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc new file mode 100644 index 0000000000..7826564708 --- /dev/null +++ b/libtiledbsoma/test/common.cc @@ -0,0 +1,134 @@ +/** + * @file common.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, 
Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. 
+ */ + +#include "common.h" + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates) { + // Create a sparse schema with one dimension (d0) and one attribute (a0) + ArraySchema schema(ctx, TILEDB_SPARSE); + + auto dim = Dimension::create(ctx, "d0", {0, 1000}); + + Domain domain(ctx); + domain.add_dimension(dim); + schema.set_domain(domain); + + auto attr = Attribute::create(ctx, "a0"); + schema.add_attribute(attr); + schema.set_allows_dups(allow_duplicates); + schema.check(); + + return schema; +} + +std::pair, ColumnIndexInfo> create_arrow_schema() { + // Create ArrowSchema: a struct with two int64 children, d0 and a0 + auto arrow_schema = std::make_unique(); + arrow_schema->format = "+s"; + arrow_schema->n_children = 2; + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->children = new ArrowSchema*[arrow_schema->n_children]; + + ArrowSchema* dim = nullptr; + dim = arrow_schema->children[0] = new ArrowSchema(); + dim->format = "l"; + dim->name = "d0"; + dim->n_children = 0; + dim->dictionary = nullptr; + dim->release = &ArrowAdapter::release_schema; + + ArrowSchema* attr = nullptr; + attr = arrow_schema->children[1] = new ArrowSchema(); + attr->format = "l"; + attr->name = "a0"; + attr->n_children = 0; + attr->flags = 0; + attr->dictionary = nullptr; + attr->release = &ArrowAdapter::release_schema; + + // Create domain and tile-extent arrays for the single index column d0; n_children must equal the number of child slots allocated + std::vector index_column_names = {"d0"}; + + auto domains = std::make_shared(); + domains->length = 0; + domains->null_count = 0; + domains->offset = 0; + domains->n_buffers = 0; + domains->buffers = nullptr; + domains->n_children = 1; + domains->release = &ArrowAdapter::release_array; + domains->children = new ArrowArray*[1]; + + auto d0_domain = domains->children[0] = new ArrowArray(); + d0_domain->length = 2; + d0_domain->null_count = 0; + d0_domain->offset = 0; + d0_domain->n_buffers = 2; + d0_domain->release = &ArrowAdapter::release_array; + d0_domain->buffers = new const void*[2]; + d0_domain->buffers[0] = nullptr; + d0_domain->buffers[1] = 
malloc(sizeof(int64_t) * 2); + d0_domain->n_children = 0; + int64_t dom[] = {0, 1000}; + std::memcpy((void*)d0_domain->buffers[1], &dom, sizeof(int64_t) * 2); + + auto tiles = std::make_shared(); + tiles->length = 0; + tiles->null_count = 0; + tiles->offset = 0; + tiles->n_buffers = 0; + tiles->buffers = nullptr; + tiles->n_children = 1; + tiles->release = &ArrowAdapter::release_array; + tiles->children = new ArrowArray*[1]; + + ArrowArray* d0_tile = tiles->children[0] = new ArrowArray(); + d0_tile->length = 1; + d0_tile->null_count = 0; + d0_tile->offset = 0; + d0_tile->n_buffers = 2; + d0_tile->release = &ArrowAdapter::release_array; + d0_tile->buffers = new const void*[2]; + d0_tile->buffers[0] = nullptr; + d0_tile->buffers[1] = malloc(sizeof(int64_t)); + d0_tile->n_children = 0; + int64_t tile = 1; + std::memcpy((void*)d0_tile->buffers[1], &tile, sizeof(int64_t)); + + ColumnIndexInfo index_columns_info = std::tuple( + index_column_names, domains, tiles); + + return std::pair(std::move(arrow_schema), index_columns_info); +} +} // namespace helper \ No newline at end of file diff --git a/libtiledbsoma/test/common.h b/libtiledbsoma/test/common.h new file mode 100644 index 0000000000..d157cdce38 --- /dev/null +++ b/libtiledbsoma/test/common.h @@ -0,0 +1,66 @@ +/** + * @file common.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages common headers and helper classes for the unit test files. 
+ */ + +#ifndef UNIT_TEST_COMMON_H +#define UNIT_TEST_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "utils/util.h" + +using namespace tiledb; +using namespace tiledbsoma; +using namespace Catch::Matchers; + +#ifndef TILEDBSOMA_SOURCE_ROOT +#define TILEDBSOMA_SOURCE_ROOT "not_defined" +#endif + +static const std::string src_path = TILEDBSOMA_SOURCE_ROOT; + +namespace helper { +ArraySchema create_schema(Context& ctx, bool allow_duplicates = false); +std::pair, ColumnIndexInfo> create_arrow_schema(); +} // namespace helper +#endif \ No newline at end of file diff --git a/libtiledbsoma/test/test_indexer.cc b/libtiledbsoma/test/test_indexer.cc index 4f507940b6..c99d13ff61 100644 --- a/libtiledbsoma/test/test_indexer.cc +++ b/libtiledbsoma/test/test_indexer.cc @@ -31,10 +31,8 @@ */ #include -#include #include #include -#include #include #include #include @@ -59,9 +57,8 @@ bool run_test(int id, std::vector keys, std::vector lookups) { try { std::vector indexer_results; indexer_results.resize(lookups.size()); - auto context = std::make_shared(); - tiledbsoma::IntIndexer indexer(context); + tiledbsoma::IntIndexer indexer; indexer.map_locations(keys); auto* hash = kh_init(m64); int ret; diff --git a/libtiledbsoma/test/unit_column_buffer.cc b/libtiledbsoma/test/unit_column_buffer.cc index e9eb6ba4fc..79b04308dc 100644 --- a/libtiledbsoma/test/unit_column_buffer.cc +++ b/libtiledbsoma/test/unit_column_buffer.cc @@ -74,7 +74,7 @@ static std::shared_ptr create_array( attr.set_cell_val_num(TILEDB_VAR_NUM); schema.add_attribute(attr); - Array::create(uri, schema); + Array::create(uri, std::move(schema)); return std::make_shared(ctx, uri, TILEDB_READ); } diff --git a/libtiledbsoma/test/unit_managed_query.cc b/libtiledbsoma/test/unit_managed_query.cc index 6797c2d5eb..ee708533c5 100644 --- a/libtiledbsoma/test/unit_managed_query.cc +++ b/libtiledbsoma/test/unit_managed_query.cc 
@@ -81,7 +81,7 @@ auto create_array(const std::string& uri, Context& ctx) { schema.check(); // Create array and open for writing - Array::create(uri, schema); + Array::create(uri, std::move(schema)); Array array(ctx, uri, TILEDB_WRITE); std::vector d0 = { diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index 610bc1f0bb..f27a71680f 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -86,7 +86,8 @@ std::tuple create_array( schema.check(); // Create array - SOMAArray::create(ctx, uri, schema, "NONE", TimestampRange(0, 2)); + SOMAArray::create( + ctx, uri, std::move(schema), "NONE", TimestampRange(0, 2)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -146,7 +147,6 @@ std::tuple, std::vector> write_array( // Write data to array soma_array->write(array_buffer); - soma_array->close(); } // Read from TileDB Array to get expected data @@ -463,7 +463,7 @@ TEST_CASE("SOMAArray: Enumeration") { *ctx->tiledb_ctx(), attr, "rbg"); schema.add_attribute(attr); - Array::create(uri, schema); + Array::create(uri, std::move(schema)); auto soma_array = SOMAArray::open(OpenMode::read, uri, ctx); auto attr_to_enum = soma_array->get_attr_to_enum_mapping(); diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index 38336e1b27..b6551b57f0 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,52 +30,7 @@ * This file manages unit tests for the SOMACollection class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema( - Context& ctx, bool sparse = false, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, sparse ? TILEDB_SPARSE : TILEDB_DENSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMACollection: basic") { auto ctx = std::make_shared(); @@ -95,7 +50,8 @@ TEST_CASE("SOMACollection: add SOMASparseNDArray") { std::string sub_uri = "mem://unit-test-add-sparse-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [arrow_schema, index_columns] = helper::create_arrow_schema(); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), true); std::map expected_map{ {"sparse_ndarray", sub_uri}}; @@ -124,7 +80,7 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") { std::string sub_uri = "mem://unit-test-add-dense-ndarray/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"dense_ndarray", 
sub_uri}}; @@ -151,13 +107,18 @@ TEST_CASE("SOMACollection: add SOMADataFrame") { std::string sub_uri = "mem://unit-test-add-dataframe/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), true); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"dataframe", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_dataframe = soma_collection->add_new_dataframe( - "dataframe", sub_uri, URIType::absolute, ctx, schema); + "dataframe", + sub_uri, + URIType::absolute, + ctx, + std::move(schema), + index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_dataframe->uri() == sub_uri); REQUIRE(soma_dataframe->ctx() == ctx); @@ -179,7 +140,7 @@ TEST_CASE("SOMACollection: add SOMACollection") { std::string sub_uri = "mem://unit-test-add-collection/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto schema = helper::create_schema(*ctx->tiledb_ctx(), false); std::map expected_map{{"subcollection", sub_uri}}; @@ -203,13 +164,18 @@ TEST_CASE("SOMACollection: add SOMAExperiment") { std::string sub_uri = "mem://unit-test-add-experiment/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"experiment", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_experiment = soma_collection->add_new_experiment( - "experiment", sub_uri, URIType::absolute, ctx, schema); + "experiment", + sub_uri, + URIType::absolute, + ctx, + std::move(schema), + index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_experiment->uri() == sub_uri); REQUIRE(soma_experiment->ctx() == ctx); @@ -228,13 +194,18 @@ TEST_CASE("SOMACollection: add SOMAMeasurement") { std::string sub_uri 
= "mem://unit-test-add-measurement/sub"; SOMACollection::create(base_uri, ctx); - auto schema = create_schema(*ctx->tiledb_ctx(), false); + auto [schema, index_columns] = helper::create_arrow_schema(); std::map expected_map{{"measurement", sub_uri}}; auto soma_collection = SOMACollection::open(base_uri, OpenMode::write, ctx); auto soma_measurement = soma_collection->add_new_measurement( - "measurement", sub_uri, URIType::absolute, ctx, schema); + "measurement", + sub_uri, + URIType::absolute, + ctx, + std::move(schema), + index_columns); REQUIRE(soma_collection->member_to_uri_mapping() == expected_map); REQUIRE(soma_measurement->uri() == sub_uri); REQUIRE(soma_measurement->ctx() == ctx); @@ -253,7 +224,7 @@ TEST_CASE("SOMACollection: metadata") { std::string uri = "mem://unit-test-collection"; SOMACollection::create(uri, ctx, TimestampRange(0, 2)); auto soma_collection = SOMACollection::open( - uri, OpenMode::write, ctx, TimestampRange(1, 1)); + uri, OpenMode::write, ctx, std::pair(1, 1)); int32_t val = 100; soma_collection->set_metadata("md", TILEDB_INT32, 1, &val); @@ -304,10 +275,16 @@ TEST_CASE("SOMAExperiment: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-experiment"; + auto [schema, index_columns] = helper::create_arrow_schema(); SOMAExperiment::create( - uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + uri, + std::move(schema), + index_columns, + ctx, + std::nullopt, + TimestampRange(0, 2)); auto soma_experiment = SOMAExperiment::open( - uri, OpenMode::write, ctx, TimestampRange(1, 1)); + uri, OpenMode::write, ctx, std::pair(1, 1)); int32_t val = 100; soma_experiment->set_metadata("md", TILEDB_INT32, 1, &val); @@ -357,10 +334,17 @@ TEST_CASE("SOMAExperiment: metadata") { TEST_CASE("SOMAMeasurement: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-measurement"; + auto [schema, index_columns] = helper::create_arrow_schema(); SOMAMeasurement::create( - uri, 
create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + uri, + std::move(schema), + index_columns, + ctx, + std::nullopt, + TimestampRange(0, 2)); + auto soma_measurement = SOMAMeasurement::open( - uri, OpenMode::write, ctx, TimestampRange(1, 1)); + uri, OpenMode::write, ctx, std::pair(1, 1)); int32_t val = 100; soma_measurement->set_metadata("md", TILEDB_INT32, 1, &val); diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 9f50de5807..bd21e0c7b9 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022 TileDB, Inc. + * @copyright Copyright (c) 2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,58 +30,24 @@ * This file manages unit tests for the SOMADataFrame class */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "utils/util.h" - -using namespace tiledb; -using namespace tiledbsoma; -using namespace Catch::Matchers; - -#ifndef TILEDBSOMA_SOURCE_ROOT -#define TILEDBSOMA_SOURCE_ROOT "not_defined" -#endif - -const std::string src_path = TILEDBSOMA_SOURCE_ROOT; - -namespace { -ArraySchema create_schema(Context& ctx, bool allow_duplicates = false) { - // Create schema - ArraySchema schema(ctx, TILEDB_SPARSE); - - auto dim = Dimension::create(ctx, "d0", {0, 1000}); - - Domain domain(ctx); - domain.add_dimension(dim); - schema.set_domain(domain); - - auto attr = Attribute::create(ctx, "a0"); - schema.add_attribute(attr); - schema.set_allows_dups(allow_duplicates); - schema.check(); - - return schema; -} -}; // namespace +#include "common.h" TEST_CASE("SOMADataFrame: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dataframe-basic"; - auto 
soma_dataframe = SOMADataFrame::create( - uri, create_schema(*ctx->tiledb_ctx()), ctx); + auto [schema, index_columns] = helper::create_arrow_schema(); + SOMADataFrame::create(uri, std::move(schema), index_columns, ctx); + + auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); + REQUIRE(soma_dataframe->uri() == uri); + REQUIRE(soma_dataframe->ctx() == ctx); + REQUIRE(soma_dataframe->type() == "SOMADataFrame"); + std::vector expected_index_column_names = {"d0"}; + REQUIRE( + soma_dataframe->index_column_names() == expected_index_column_names); + REQUIRE(soma_dataframe->count() == 0); + soma_dataframe->close(); std::vector d0(10); for (int j = 0; j < 10; j++) @@ -94,13 +60,11 @@ TEST_CASE("SOMADataFrame: basic") { array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); soma_dataframe->write(array_buffer); soma_dataframe->close(); - soma_dataframe->open(OpenMode::read); - REQUIRE(soma_dataframe->uri() == uri); - REQUIRE(soma_dataframe->ctx() == ctx); - REQUIRE(soma_dataframe->type() == "SOMADataFrame"); + soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); while (auto batch = soma_dataframe->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at("d0")->data(); @@ -119,8 +83,14 @@ TEST_CASE("SOMADataFrame: basic") { TEST_CASE("SOMADataFrame: metadata") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-collection"; + auto [schema, index_columns] = helper::create_arrow_schema(); SOMADataFrame::create( - uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + uri, + std::move(schema), + index_columns, + ctx, + std::nullopt, + TimestampRange(0, 2)); auto soma_dataframe = SOMADataFrame::open( uri, diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index e2e9c50e28..bbac4f47b9 100644 --- 
a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -58,8 +58,8 @@ const std::string src_path = TILEDBSOMA_SOURCE_ROOT; namespace { ArraySchema create_schema(Context& ctx, bool allow_duplicates = false) { - // SOMADenseNDArray is actually a TILEDB_SPARSE under the hood - ArraySchema schema(ctx, TILEDB_SPARSE); + // Create schema + ArraySchema schema(ctx, TILEDB_DENSE); auto dim = Dimension::create(ctx, "d0", {0, 1000}); @@ -80,12 +80,21 @@ TEST_CASE("SOMADenseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-dense-ndarray-basic"; - auto soma_dense = SOMADenseNDArray::create( - uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMADenseNDArray::create(uri, create_schema(*ctx->tiledb_ctx()), ctx); - std::vector d0(10); - for (int j = 0; j < 10; j++) - d0[j] = j; + auto soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); + REQUIRE(soma_dense->uri() == uri); + REQUIRE(soma_dense->ctx() == ctx); + REQUIRE(soma_dense->type() == "SOMADenseNDArray"); + REQUIRE(soma_dense->is_sparse() == false); + auto schema = soma_dense->tiledb_schema(); + REQUIRE(schema->has_attribute("a0")); + REQUIRE(schema->domain().has_dimension("d0")); + REQUIRE(soma_dense->ndim() == 1); + REQUIRE(soma_dense->shape() == std::vector{1001}); + soma_dense->close(); + + std::vector d0{1, 10}; std::vector a0(10, 1); auto array_buffer = std::make_shared(); @@ -94,18 +103,18 @@ TEST_CASE("SOMADenseNDArray: basic") { array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); array_buffer->emplace("d0", ColumnBuffer::create(tdb_arr, "d0", d0)); + soma_dense->open(OpenMode::write); soma_dense->write(array_buffer); soma_dense->close(); soma_dense->open(OpenMode::read); - REQUIRE(soma_dense->uri() == uri); - REQUIRE(soma_dense->ctx() == ctx); - REQUIRE(soma_dense->type() == "SOMADenseNDArray"); while (auto batch = soma_dense->read_next()) { auto arrbuf = batch.value(); auto d0span = 
arrbuf->at("d0")->data(); auto a0span = arrbuf->at("a0")->data(); - REQUIRE(d0 == std::vector(d0span.begin(), d0span.end())); + REQUIRE( + std::vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} == + std::vector(d0span.begin(), d0span.end())); REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); } soma_dense->close(); diff --git a/libtiledbsoma/test/unit_soma_group.cc b/libtiledbsoma/test/unit_soma_group.cc index c04030037b..8e2454313a 100644 --- a/libtiledbsoma/test/unit_soma_group.cc +++ b/libtiledbsoma/test/unit_soma_group.cc @@ -91,7 +91,7 @@ std::tuple create_array( schema.check(); // Create array - Array::create(uri, schema); + Array::create(uri, std::move(schema)); } // Open array for writing diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index b9a37d5dc4..de457197d2 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -80,8 +80,20 @@ TEST_CASE("SOMASparseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; - auto soma_sparse = SOMASparseNDArray::create( - uri, create_schema(*ctx->tiledb_ctx()), ctx); + SOMASparseNDArray::create( + uri, create_schema(*ctx->tiledb_ctx()), ctx, TimestampRange(0, 2)); + + auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::read, ctx); + REQUIRE(soma_sparse->uri() == uri); + REQUIRE(soma_sparse->ctx() == ctx); + REQUIRE(soma_sparse->type() == "SOMASparseNDArray"); + REQUIRE(soma_sparse->is_sparse() == true); + auto schema = soma_sparse->tiledb_schema(); + REQUIRE(schema->has_attribute("a0")); + REQUIRE(schema->domain().has_dimension("d0")); + REQUIRE(soma_sparse->ndim() == 1); + REQUIRE(soma_sparse->nnz() == 0); + soma_sparse->close(); std::vector d0(10); for (int j = 0; j < 10; j++) @@ -94,13 +106,11 @@ TEST_CASE("SOMASparseNDArray: basic") { array_buffer->emplace("a0", ColumnBuffer::create(tdb_arr, "a0", a0)); array_buffer->emplace("d0", 
ColumnBuffer::create(tdb_arr, "d0", d0)); + soma_sparse->open(OpenMode::write); soma_sparse->write(array_buffer); soma_sparse->close(); soma_sparse->open(OpenMode::read); - REQUIRE(soma_sparse->uri() == uri); - REQUIRE(soma_sparse->ctx() == ctx); - REQUIRE(soma_sparse->type() == "SOMASparseNDArray"); while (auto batch = soma_sparse->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at("d0")->data();